In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# Check if CUDA is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device,torch.cuda.device_count()

# Parameters
n_epochs = 100
batch_size = 128
hidden_size = 128 #hidden dimension representation of each token

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Section 1: Loading the Data
- opens the file
- makes a list of all the titles without special characters

In [5]:
filename = "/Users/sophieliu/git_desktop/rizzGPT/cs_titles.txt"
data = open(filename, 'r', encoding='utf-8').read().lower()
data = data.splitlines()

titles = []

def special_characters(s):
  special_characters = ["\\", "^", "!", "*", "/", "-", "_", "~"]
  for c in special_characters:
    if c in s:
      return True
    return False

for line in data:
  if not special_characters(line):
    titles.append(line)
print("before embeddng", len(titles))
 


before embeddng 1384


### Subsection 1.1: Get the length for the max tokenized sequence

In [6]:
max_tokens = 0
index = 0
for i, example in enumerate(titles):
    test = len(tokenizer(example)["input_ids"])
    
    if test > max_tokens:
        max_tokens = test
        index = i

print(titles[index])

global solver based on the sperner-lemma and mazurkewicz-knaster-kuratowski-lemma based proof of the brouwer fixed-point theorem


### Subsection 1.2: Generate padded and tokenized prefix/suffix split

In [16]:
prefixes = []
suffixes = []

for title in titles[:10]:
    tokenized = tokenizer(title, truncation=True, padding=False, return_tensors='pt') 
    # tokenized is of shape [1,n], and we want a 1D vector
    input_ids = tokenized["input_ids"].squeeze()


    # Generate prefixes with padding to max length
    for i in range(5, len(input_ids)):
        prefix = input_ids[:i]
        suffix = input_ids[i:]

        #add padding
        prefix_padded = torch.nn.functional.pad(prefix, (0, max_tokens - len(prefix)), value=tokenizer.pad_token_id)
        suffix_padded = torch.nn.functional.pad(suffix, (0, max_tokens - len(suffix)), value=tokenizer.pad_token_id)
        
 
        prefixes.append(prefix_padded)
        suffixes.append(suffix_padded)

In [17]:
#Intuition check to make sure that all the sequences are of max_tokens length
print(prefixes[0].shape, suffixes[0].shape, max_tokens)

torch.Size([40]) torch.Size([40]) 40


### Section 1.3 Embed the prefixes and turn suffixes into vocab_size id vectors

In [18]:
#embed the prefixes
with torch.no_grad():
    #the Bert model outputs a bunch of hidden states for each token
    embedded_prefixes = [bert_model(input_ids=p.unsqueeze(0)).last_hidden_state for p in prefixes]
    
#prepare the suffix as a vector of logits
suffix_logits = []
vocab_size = tokenizer.vocab_size

for suffix in suffixes:
    logits = torch.zeros((max_tokens, vocab_size))
    #set the jth token which corresponds to the jth vector in the logit to be the token id
    for j, token_id in enumerate(suffix):
        logits[j, token_id] = 1.0 

    suffix_logits.append(logits)

In [19]:
embedded_prefixes_tensor = torch.cat(embedded_prefixes)
logits_suffixes_tensor = torch.stack(suffix_logits) 
#i forgot how stack and cat are different, but I got lucky and it works so.....

# This should (batch, max length, embed length) and (batch, max length, vocab size)
print(embedded_prefixes_tensor.shape, logits_suffixes_tensor.shape)

torch.Size([99, 40, 768]) torch.Size([99, 40, 30522])


### Section 1.4: Generate Train and Test Split
- create training data by iterating through the embedded titles

In [20]:
import torch.utils.data as data

dataX = embedded_prefixes_tensor
dataY = logits_suffixes_tensor

X_train, X_test, y_train, y_test = train_test_split(dataX, dataY, test_size=0.2, random_state=42)
train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
test_loader = data.DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)

## Section 2: Define the Model

In [21]:
class GRUModel(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        out, _ = self.gru(x)
        # out = out[:, -1, :]
        out = self.fc(out)
        return out

# Initialize model, optimizer, and loss function
model = GRUModel(hidden_size, bert_model.config.hidden_size, tokenizer.vocab_size)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(reduction="sum")

# Check if multiple GPUs are available
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)  # Wrap the model for parallel processing

grubert = model.to(device)

In [22]:
# Check the model to make sure the dimensions work out
from torchvision import models
from torchsummary import summary

print("\t", model)

print("Begin torch summary: ")

summary(model, (35, 768))

#I can't get summary to tell me the shape if I pass in a batched input but, as you can 
# see it outputs features of the same size as the bert vocab size

	 GRUModel(
  (gru): GRU(768, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=30522, bias=True)
)
Begin torch summary: 
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
               GRU-1  [[-1, 35, 128], [-1, 2, 128]]               0
            Linear-2            [-1, 35, 30522]       3,937,338
Total params: 3,937,338
Trainable params: 3,937,338
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.10
Forward/backward pass size (MB): 0.60
Params size (MB): 15.02
Estimated Total Size (MB): 15.72
----------------------------------------------------------------


## Section 3: Start Training

In [23]:
# Training loop
for epoch in tqdm(range(n_epochs)):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        y_pred = model(X_batch) # forward pass
        # print(y_pred.shape, y_batch.shape)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            test_loss += loss.item()

    test_loss /= len(test_loader.dataset)

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

# Save the best model and char_to_int dictionary
torch.save(model.state_dict(), "gru_finetune.pth")    

  0%|          | 0/100 [00:00<?, ?it/s]

  5%|▌         | 5/100 [00:07<02:05,  1.32s/it]

Epoch 5: Train Loss: 143.7288, Test Loss: 143.5130


 10%|█         | 10/100 [00:13<01:52,  1.25s/it]

Epoch 10: Train Loss: 141.5754, Test Loss: 141.5658


 15%|█▌        | 15/100 [00:21<02:03,  1.45s/it]

Epoch 15: Train Loss: 139.8530, Test Loss: 139.8501


 20%|██        | 20/100 [00:27<01:37,  1.22s/it]

Epoch 20: Train Loss: 138.4459, Test Loss: 138.5006


 25%|██▌       | 25/100 [00:33<01:29,  1.20s/it]

Epoch 25: Train Loss: 137.3410, Test Loss: 137.4655


 30%|███       | 30/100 [00:39<01:26,  1.23s/it]

Epoch 30: Train Loss: 136.3555, Test Loss: 136.4854


 35%|███▌      | 35/100 [00:45<01:24,  1.30s/it]

Epoch 35: Train Loss: 135.3839, Test Loss: 135.5014


 40%|████      | 40/100 [00:52<01:17,  1.29s/it]

Epoch 40: Train Loss: 134.4483, Test Loss: 134.5080


 45%|████▌     | 45/100 [00:58<01:11,  1.30s/it]

Epoch 45: Train Loss: 133.6013, Test Loss: 133.5860


 50%|█████     | 50/100 [01:05<01:06,  1.32s/it]

Epoch 50: Train Loss: 132.8414, Test Loss: 132.8604


 55%|█████▌    | 55/100 [01:11<00:56,  1.25s/it]

Epoch 55: Train Loss: 132.1280, Test Loss: 132.1878


 60%|██████    | 60/100 [01:17<00:48,  1.22s/it]

Epoch 60: Train Loss: 131.4292, Test Loss: 131.5656


 65%|██████▌   | 65/100 [01:23<00:41,  1.20s/it]

Epoch 65: Train Loss: 130.8575, Test Loss: 131.0280


 70%|███████   | 70/100 [01:29<00:36,  1.20s/it]

Epoch 70: Train Loss: 130.2931, Test Loss: 130.6382


 75%|███████▌  | 75/100 [01:35<00:29,  1.19s/it]

Epoch 75: Train Loss: 129.7371, Test Loss: 130.2868


 80%|████████  | 80/100 [01:41<00:23,  1.19s/it]

Epoch 80: Train Loss: 129.3074, Test Loss: 130.0517


 85%|████████▌ | 85/100 [01:47<00:17,  1.18s/it]

Epoch 85: Train Loss: 128.9335, Test Loss: 129.8866


 90%|█████████ | 90/100 [01:53<00:12,  1.22s/it]

Epoch 90: Train Loss: 128.5391, Test Loss: 129.7570


 95%|█████████▌| 95/100 [01:59<00:05,  1.19s/it]

Epoch 95: Train Loss: 128.1995, Test Loss: 129.7416


100%|██████████| 100/100 [02:05<00:00,  1.26s/it]

Epoch 100: Train Loss: 127.8126, Test Loss: 129.7445





In [26]:
import random

def predict_completion(model, prefix):
    model.eval()

    tokenized = tokenizer(prefix, truncation=True, padding=False, return_tensors='pt') 
    # tokenized is of shape [1,n], and we want a 1D vector
    input_ids = tokenized["input_ids"].squeeze()
    embedded = bert_model(input_ids.unsqueeze(0)).last_hidden_state

    with torch.no_grad():
        
        logits = model(embedded)
        probs = torch.softmax(logits[0], dim=-1)
        max_indices = torch.argmax(probs, dim=-1)

        suffix_tokens = tokenizer.convert_ids_to_tokens(max_indices.tolist())
        
        # Join tokens into the generated suffix
        suffix_text = tokenizer.convert_tokens_to_string(suffix_tokens)

        return suffix_text


seed = "From "

print(seed + predict_completion(grubert, seed))


From neuralcamp hat


In [27]:
torch.save(model.state_dict(), "grubert.pth")    