In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np

# Parameters
n_epochs = 100
batch_size = 128
hidden_size = 128
seq_length = 70
max_seq_length = 110 # I have too many data points and not enough RAM so filtering out sequences that are too long

In [36]:
# Parameters: max length
filename = "all_arxiv_titles.txt"
raw_text = open(filename, 'r', encoding='utf-8').read().lower()
chars = sorted(list(set(raw_text)))

raw_text = raw_text.splitlines()

titles = []

def special_characters(s):
  special_characters = ["\\", "^", "!", "*", "/"]
  for c in special_characters:
    if c in s:
      return True
    return False

# Cleaning up data: Filtering out titles with special characters for easier learning and because I have too many datapoints
for line in raw_text:
  if len(line) > seq_length and len(line) < max_seq_length and not special_characters(line):
    titles.append(line)

In [None]:
#Data visualization: See the longest
print("We have " + str(len(titles)) + " in total")
lengths = [len(s) for s in titles]
max_index = np.argmax(lengths)
maxlen, minlen = max(lengths), min(lengths)
print(titles[max_index], max_index, len(titles[max_index]))

#vocab size
num_chars = len(chars)

total variation distance between sdes with stable noise and brownian motion with applications to poisson pdes 39 109


In [None]:
# Constructing Dataset Substrings
char_to_int = dict((c,i) for i,c in enumerate(chars))

dataX = []
dataY = []

# The input is a bunch of substrings of each title at a fixed length
# The output/label is the next character
for title in titles:
  for i in range(0, len(title) - seq_length):
      dataX.append(title[i: i + seq_length])
      dataY.append(title[i + seq_length])


In [None]:
# Dataset Preparation: Constructing the tensors

# First dimension is the number of datapoints
# Second dimension: each datapoint has seq_length timesteps i.e. characters
# Third dimension: Each character is one hot encoded, so it's a vector of len(chars)
x = np.zeros((len(dataX), seq_length, len(chars)), dtype=float)
y = np.zeros((len(dataX), len(chars)), dtype=float)

for i, title_prefix in enumerate(dataX): #i-th data point, i.e. i-th title
    for t, char in enumerate(title_prefix): #tth character in the title
        x[i, t, char_to_int[char]] = 1
    y[i, char_to_int[dataY[i]]] = 1

x = torch.tensor(x).to(torch.float32)
y = torch.tensor(y).to(torch.float32)

In [None]:

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create DataLoaders
train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
test_loader = data.DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)


In [None]:
print(X_train.shape)

torch.Size([69194, 70, 62])


In [37]:
class GRUModel(nn.Module):
    def __init__(self, hidden_size, maxlen, num_chars):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size=num_chars, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_chars)

    def forward(self, x):
        out, _ = self.gru(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [61]:
# Initialize model, optimizer, and loss function
model = GRUModel(hidden_size, seq_length, num_chars)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(reduction="sum")

pretrained = True
if pretrained:
  try:
      checkpoint = torch.load("gru.pth")
      print(checkpoint)
      model.load_state_dict(checkpoint[0])
      char_to_int = checkpoint[1]
      print("Loaded pre-trained model")
  except FileNotFoundError:
      print("No pre-trained model found, training from scratch")


[OrderedDict([('gru.weight_ih_l0', tensor([[-0.0775, -0.0241, -0.0691,  ..., -0.0652, -0.0764, -0.0978],
        [-0.0634, -0.7764,  0.0173,  ..., -1.8852,  0.0607,  0.4673],
        [ 0.0684, -0.7741,  0.0130,  ...,  0.7599,  0.0452, -0.2084],
        ...,
        [ 0.0553,  0.4371,  0.0355,  ...,  1.0812,  0.0123,  0.4369],
        [ 0.0798,  0.0936, -0.0465,  ..., -0.8510, -0.0583, -0.4165],
        [-0.0590,  0.0244, -0.0711,  ...,  0.2479, -0.0270, -0.2188]])), ('gru.weight_hh_l0', tensor([[-1.0715, -0.4643,  0.2535,  ..., -0.2104,  0.0946, -0.2188],
        [-0.1557, -0.5505, -0.1002,  ...,  0.1397, -0.3605,  0.0515],
        [-0.3736, -0.0759, -0.1117,  ..., -0.1969, -0.0260,  0.0476],
        ...,
        [ 0.1859,  0.2638,  0.4203,  ..., -0.2255,  0.5151, -0.2171],
        [ 0.0764, -0.2539,  0.3958,  ...,  0.3364,  0.1704,  0.1478],
        [-0.3593,  0.2668, -0.0479,  ..., -0.0795, -0.3739,  0.0870]])), ('gru.bias_ih_l0', tensor([ 2.7191e-01, -1.3049e-03, -1.7128e-01,  2.132

In [None]:
best_model = None
best_loss = np.inf

# Training loop
for epoch in tqdm(range(n_epochs)):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        # X_batch = (X_batch.reshape(batch_size, seq_length, -1))
        # print(X_batch.shape, batch_size)
        y_pred = model(X_batch) #forward pass
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            test_loss += loss.item()

    test_loss /= len(test_loader.dataset)

    if test_loss < best_loss:
        best_loss = test_loss
        best_model = model.state_dict()

    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

# Save the best model and char_to_int dictionary
torch.save([best_model, char_to_int], "single-char_finetune.pth")


max leng 70


  1%|          | 1/100 [01:25<2:20:42, 85.28s/it]

Epoch 1: Train Loss: 2.7461, Test Loss: 2.4225


  2%|▏         | 2/100 [02:55<2:24:08, 88.25s/it]

Epoch 2: Train Loss: 2.2935, Test Loss: 2.1983


  3%|▎         | 3/100 [04:31<2:28:26, 91.82s/it]

Epoch 3: Train Loss: 2.0750, Test Loss: 2.0126


  4%|▍         | 4/100 [06:09<2:30:36, 94.13s/it]

Epoch 4: Train Loss: 1.8862, Test Loss: 1.8558


  5%|▌         | 5/100 [07:42<2:28:21, 93.70s/it]

Epoch 5: Train Loss: 1.7197, Test Loss: 1.7228


  6%|▌         | 6/100 [09:15<2:26:20, 93.41s/it]

Epoch 6: Train Loss: 1.5776, Test Loss: 1.6315


  7%|▋         | 7/100 [10:41<2:21:12, 91.10s/it]

Epoch 7: Train Loss: 1.4580, Test Loss: 1.5332


  8%|▊         | 8/100 [12:05<2:16:04, 88.75s/it]

Epoch 8: Train Loss: 1.3544, Test Loss: 1.4735


  9%|▉         | 9/100 [13:31<2:13:31, 88.04s/it]

Epoch 9: Train Loss: 1.2629, Test Loss: 1.4072


 10%|█         | 10/100 [14:57<2:11:09, 87.44s/it]

Epoch 10: Train Loss: 1.1772, Test Loss: 1.3597


 11%|█         | 11/100 [16:22<2:08:26, 86.59s/it]

Epoch 11: Train Loss: 1.1005, Test Loss: 1.3069


 12%|█▏        | 12/100 [17:48<2:06:50, 86.48s/it]

Epoch 12: Train Loss: 1.0293, Test Loss: 1.2683


 13%|█▎        | 13/100 [19:14<2:04:53, 86.14s/it]

Epoch 13: Train Loss: 0.9660, Test Loss: 1.2262


 14%|█▍        | 14/100 [20:38<2:02:48, 85.68s/it]

Epoch 14: Train Loss: 0.9084, Test Loss: 1.2007


 15%|█▌        | 15/100 [22:04<2:01:15, 85.59s/it]

Epoch 15: Train Loss: 0.8568, Test Loss: 1.1717


 16%|█▌        | 16/100 [23:28<1:59:22, 85.26s/it]

Epoch 16: Train Loss: 0.8122, Test Loss: 1.1490


 17%|█▋        | 17/100 [24:54<1:58:21, 85.55s/it]

Epoch 17: Train Loss: 0.7734, Test Loss: 1.1261


 18%|█▊        | 18/100 [26:20<1:56:47, 85.46s/it]

Epoch 18: Train Loss: 0.7376, Test Loss: 1.1091


 19%|█▉        | 19/100 [27:44<1:54:54, 85.11s/it]

Epoch 19: Train Loss: 0.7049, Test Loss: 1.0924


 20%|██        | 20/100 [29:06<1:52:24, 84.30s/it]

Epoch 20: Train Loss: 0.6783, Test Loss: 1.0806


 21%|██        | 21/100 [30:30<1:50:48, 84.16s/it]

Epoch 21: Train Loss: 0.6533, Test Loss: 1.0611


 22%|██▏       | 22/100 [31:56<1:50:04, 84.68s/it]

Epoch 22: Train Loss: 0.6314, Test Loss: 1.0503


 23%|██▎       | 23/100 [33:29<1:51:55, 87.21s/it]

Epoch 23: Train Loss: 0.6095, Test Loss: 1.0357


 24%|██▍       | 24/100 [34:51<1:48:30, 85.66s/it]

Epoch 24: Train Loss: 0.5931, Test Loss: 1.0284


 25%|██▌       | 25/100 [36:15<1:46:33, 85.24s/it]

Epoch 25: Train Loss: 0.5691, Test Loss: 1.0355


 26%|██▌       | 26/100 [37:41<1:45:21, 85.43s/it]

Epoch 26: Train Loss: 0.5593, Test Loss: 1.0228


 27%|██▋       | 27/100 [39:06<1:43:32, 85.10s/it]

Epoch 27: Train Loss: 0.5468, Test Loss: 1.0131


 28%|██▊       | 28/100 [40:32<1:42:43, 85.60s/it]

Epoch 28: Train Loss: 0.5255, Test Loss: 0.9979


 29%|██▉       | 29/100 [41:58<1:41:11, 85.51s/it]

Epoch 29: Train Loss: 0.5174, Test Loss: 1.0073


 30%|███       | 30/100 [43:25<1:40:21, 86.02s/it]

Epoch 30: Train Loss: 0.5034, Test Loss: 0.9859


 31%|███       | 31/100 [44:47<1:37:43, 84.98s/it]

Epoch 31: Train Loss: 0.4924, Test Loss: 0.9790


 32%|███▏      | 32/100 [46:10<1:35:28, 84.24s/it]

Epoch 32: Train Loss: 0.4776, Test Loss: 0.9724


 33%|███▎      | 33/100 [47:33<1:33:34, 83.80s/it]

Epoch 33: Train Loss: 0.4671, Test Loss: 0.9828


 34%|███▍      | 34/100 [48:58<1:32:36, 84.19s/it]

Epoch 34: Train Loss: 0.4582, Test Loss: 0.9685


 35%|███▌      | 35/100 [50:20<1:30:36, 83.64s/it]

Epoch 35: Train Loss: 0.4464, Test Loss: 0.9734


 36%|███▌      | 36/100 [51:42<1:28:42, 83.17s/it]

Epoch 36: Train Loss: 0.4387, Test Loss: 0.9560


 37%|███▋      | 37/100 [53:10<1:28:48, 84.57s/it]

Epoch 37: Train Loss: 0.4326, Test Loss: 0.9723


 38%|███▊      | 38/100 [54:35<1:27:30, 84.68s/it]

Epoch 38: Train Loss: 0.4353, Test Loss: 0.9466


 39%|███▉      | 39/100 [56:00<1:26:03, 84.65s/it]

Epoch 39: Train Loss: 0.4168, Test Loss: 0.9485


 40%|████      | 40/100 [57:26<1:25:04, 85.07s/it]

Epoch 40: Train Loss: 0.4092, Test Loss: 0.9511


 41%|████      | 41/100 [58:49<1:23:07, 84.54s/it]

Epoch 41: Train Loss: 0.4007, Test Loss: 0.9550


 42%|████▏     | 42/100 [1:00:11<1:21:02, 83.84s/it]

Epoch 42: Train Loss: 0.4022, Test Loss: 0.9732


 43%|████▎     | 43/100 [1:01:34<1:19:29, 83.68s/it]

Epoch 43: Train Loss: 0.4031, Test Loss: 0.9512


 44%|████▍     | 44/100 [1:02:58<1:18:07, 83.71s/it]

Epoch 44: Train Loss: 0.3826, Test Loss: 0.9446


 45%|████▌     | 45/100 [1:04:21<1:16:25, 83.37s/it]

Epoch 45: Train Loss: 0.3764, Test Loss: 0.9503


 46%|████▌     | 46/100 [1:05:44<1:15:04, 83.42s/it]

Epoch 46: Train Loss: 0.3710, Test Loss: 0.9538


 47%|████▋     | 47/100 [1:07:07<1:13:37, 83.35s/it]

Epoch 47: Train Loss: 0.3797, Test Loss: 0.9401


 48%|████▊     | 48/100 [1:08:32<1:12:35, 83.76s/it]

Epoch 48: Train Loss: 0.3582, Test Loss: 0.9249


 49%|████▉     | 49/100 [1:09:55<1:10:59, 83.52s/it]

Epoch 49: Train Loss: 0.3654, Test Loss: 0.9281


 50%|█████     | 50/100 [1:11:19<1:09:44, 83.68s/it]

Epoch 50: Train Loss: 0.3580, Test Loss: 0.9315


 51%|█████     | 51/100 [1:12:45<1:08:55, 84.40s/it]

Epoch 51: Train Loss: 0.3493, Test Loss: 0.9503


 52%|█████▏    | 52/100 [1:14:10<1:07:30, 84.38s/it]

Epoch 52: Train Loss: 0.3454, Test Loss: 0.9308


 53%|█████▎    | 53/100 [1:15:37<1:06:42, 85.15s/it]

Epoch 53: Train Loss: 0.3338, Test Loss: 0.9394


 54%|█████▍    | 54/100 [1:17:03<1:05:39, 85.64s/it]

Epoch 54: Train Loss: 0.3377, Test Loss: 0.9824


 55%|█████▌    | 55/100 [1:18:29<1:04:15, 85.67s/it]

Epoch 55: Train Loss: 0.3373, Test Loss: 0.9284


 56%|█████▌    | 56/100 [1:19:57<1:03:14, 86.24s/it]

Epoch 56: Train Loss: 0.3240, Test Loss: 0.9181


 57%|█████▋    | 57/100 [1:21:25<1:02:10, 86.75s/it]

Epoch 57: Train Loss: 0.3228, Test Loss: 0.9407


 58%|█████▊    | 58/100 [1:22:52<1:00:47, 86.86s/it]

Epoch 58: Train Loss: 0.3195, Test Loss: 0.9169


 59%|█████▉    | 59/100 [1:24:16<58:53, 86.19s/it]  

Epoch 59: Train Loss: 0.3104, Test Loss: 0.9184


 60%|██████    | 60/100 [1:25:44<57:44, 86.61s/it]

Epoch 60: Train Loss: 0.3117, Test Loss: 0.9134


 61%|██████    | 61/100 [1:27:09<56:01, 86.19s/it]

Epoch 61: Train Loss: 0.3072, Test Loss: 0.9331


 62%|██████▏   | 62/100 [1:28:36<54:39, 86.31s/it]

Epoch 62: Train Loss: 0.3074, Test Loss: 0.9421


 63%|██████▎   | 63/100 [1:30:03<53:21, 86.52s/it]

Epoch 63: Train Loss: 0.3036, Test Loss: 0.9024


 64%|██████▍   | 64/100 [1:31:30<52:00, 86.69s/it]

Epoch 64: Train Loss: 0.2915, Test Loss: 0.9163


 65%|██████▌   | 65/100 [1:32:55<50:22, 86.36s/it]

Epoch 65: Train Loss: 0.2871, Test Loss: 0.9251


 66%|██████▌   | 66/100 [1:34:23<49:06, 86.65s/it]

Epoch 66: Train Loss: 0.3117, Test Loss: 0.9185


 67%|██████▋   | 67/100 [1:35:47<47:18, 86.02s/it]

Epoch 67: Train Loss: 0.2819, Test Loss: 0.9051


 68%|██████▊   | 68/100 [1:37:11<45:34, 85.45s/it]

Epoch 68: Train Loss: 0.2863, Test Loss: 0.9390


 69%|██████▉   | 69/100 [1:38:36<44:04, 85.31s/it]

Epoch 69: Train Loss: 0.2920, Test Loss: 0.9177


 70%|███████   | 70/100 [1:40:24<46:00, 92.02s/it]

Epoch 70: Train Loss: 0.2787, Test Loss: 0.9247


 71%|███████   | 71/100 [1:42:02<45:18, 93.75s/it]

Epoch 71: Train Loss: 0.2641, Test Loss: 0.9253


 72%|███████▏  | 72/100 [1:43:28<42:41, 91.49s/it]

Epoch 72: Train Loss: 0.2703, Test Loss: 0.9451


 73%|███████▎  | 73/100 [1:44:54<40:21, 89.68s/it]

Epoch 73: Train Loss: 0.2837, Test Loss: 0.9306


 74%|███████▍  | 74/100 [1:46:22<38:40, 89.24s/it]

Epoch 74: Train Loss: 0.2607, Test Loss: 0.9295


 75%|███████▌  | 75/100 [1:47:46<36:30, 87.63s/it]

Epoch 75: Train Loss: 0.2586, Test Loss: 0.9318


 76%|███████▌  | 76/100 [1:49:18<35:37, 89.07s/it]

Epoch 76: Train Loss: 0.2676, Test Loss: 0.9303


 77%|███████▋  | 77/100 [1:50:55<35:00, 91.34s/it]

Epoch 77: Train Loss: 0.2637, Test Loss: 0.9305


 78%|███████▊  | 78/100 [1:52:33<34:15, 93.42s/it]

Epoch 78: Train Loss: 0.2645, Test Loss: 0.9222


 79%|███████▉  | 79/100 [1:54:10<33:06, 94.58s/it]

Epoch 79: Train Loss: 0.2560, Test Loss: 0.9323


 80%|████████  | 80/100 [1:55:37<30:42, 92.12s/it]

Epoch 80: Train Loss: 0.2565, Test Loss: 0.9422


 81%|████████  | 81/100 [1:57:02<28:29, 89.96s/it]

Epoch 81: Train Loss: 0.2591, Test Loss: 0.9261


 82%|████████▏ | 82/100 [1:58:25<26:21, 87.87s/it]

Epoch 82: Train Loss: 0.2504, Test Loss: 0.9314


 83%|████████▎ | 83/100 [1:59:49<24:36, 86.83s/it]

Epoch 83: Train Loss: 0.2421, Test Loss: 0.8974


 84%|████████▍ | 84/100 [2:01:14<23:02, 86.41s/it]

Epoch 84: Train Loss: 0.2283, Test Loss: 0.9457


 85%|████████▌ | 85/100 [2:02:43<21:45, 87.02s/it]

Epoch 85: Train Loss: 0.2474, Test Loss: 0.9412


 86%|████████▌ | 86/100 [2:04:08<20:10, 86.47s/it]

Epoch 86: Train Loss: 0.2472, Test Loss: 0.9249


 87%|████████▋ | 87/100 [2:05:33<18:39, 86.10s/it]

Epoch 87: Train Loss: 0.2429, Test Loss: 0.9360


 88%|████████▊ | 88/100 [2:06:57<17:06, 85.52s/it]

Epoch 88: Train Loss: 0.2396, Test Loss: 0.9380


 89%|████████▉ | 89/100 [2:08:19<15:28, 84.40s/it]

Epoch 89: Train Loss: 0.2384, Test Loss: 0.9340


 90%|█████████ | 90/100 [2:09:44<14:03, 84.37s/it]

Epoch 90: Train Loss: 0.2256, Test Loss: 0.9373


 91%|█████████ | 91/100 [2:11:07<12:36, 84.00s/it]

Epoch 91: Train Loss: 0.2332, Test Loss: 0.9260


 92%|█████████▏| 92/100 [2:12:31<11:12, 84.08s/it]

Epoch 92: Train Loss: 0.2348, Test Loss: 0.9542


 93%|█████████▎| 93/100 [2:13:56<09:50, 84.34s/it]

Epoch 93: Train Loss: 0.2304, Test Loss: 0.9508


 94%|█████████▍| 94/100 [2:15:17<08:20, 83.45s/it]

Epoch 94: Train Loss: 0.2247, Test Loss: 0.9394


 95%|█████████▌| 95/100 [2:16:40<06:56, 83.24s/it]

Epoch 95: Train Loss: 0.2350, Test Loss: 0.9739


 96%|█████████▌| 96/100 [2:18:03<05:32, 83.18s/it]

Epoch 96: Train Loss: 0.2300, Test Loss: 0.9564


 97%|█████████▋| 97/100 [2:19:28<04:11, 83.73s/it]

Epoch 97: Train Loss: 0.2152, Test Loss: 0.9209


 98%|█████████▊| 98/100 [2:20:54<02:49, 84.51s/it]

Epoch 98: Train Loss: 0.2263, Test Loss: 0.9470


 99%|█████████▉| 99/100 [2:22:19<01:24, 84.67s/it]

Epoch 99: Train Loss: 0.2155, Test Loss: 0.9638


100%|██████████| 100/100 [2:23:40<00:00, 86.21s/it]

Epoch 100: Train Loss: 0.2210, Test Loss: 0.9381





In [None]:
torch.save([best_model, char_to_int], "single-char_finetune.pth")


In [79]:
def generate_sequence(model, seed, n_chars, char_to_int, int_to_char):
    model.eval()
    with torch.no_grad():
        x = np.zeros((1, len(seed), len(char_to_int)), dtype=float)
        for t, char in enumerate(seed):
            x[0, t, char_to_int[char]] = 1.0
        x = torch.tensor(x, dtype=torch.float32)

        hidden = None
        generated_text = seed

        for _ in range(n_chars):
            output = model(x)
            output = torch.softmax(output, dim=1)
            char_index = torch.multinomial(output, 1).item()
            char = int_to_char[char_index]
            generated_text += char

            # Prepare input for next character generation
            x = np.zeros((1, 1, len(char_to_int)), dtype=float)
            x[0, 0, char_index] = 1.0
            x = torch.tensor(x, dtype=torch.float32)

    return generated_text


int_to_char = dict((i,c) for i,c in enumerate(chars))

# Generate a sequence
seed = 'Parametri'.lower()
x = np.zeros((1, len(seed), len(chars)), dtype=float)
for t, char in enumerate(seed): #tth character in the title
     x[0, t, char_to_int[char]] = 1.0
x = torch.tensor(x, dtype=torch.float32)

# print(x)
n_chars = 80  # Number of characters to generate
generated_text = generate_sequence(model, seed, n_chars, char_to_int, int_to_char)
print(generated_text)


parametrich ph jed x p ph dinin k7 win pedininin d dy wininin dininin ph-n ph pth 2 majed
