In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

# Parameters
n_epochs = 100
batch_size = 128
hidden_size = 128
seq_length = 70
max_seq_length = 110 # I have too many data points and not enough RAM so filtering out sequences that are too long

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Parameters: max length
filename = "cs_titles.txt"
raw_text = open(filename, 'r', encoding='utf-8').read().lower()
raw_text = raw_text.splitlines()

titles = []

def special_characters(s):
  special_characters = ["\\", "^", "!", "*", "/", "-", "_", "~"]
  for c in special_characters:
    if c in s:
      return True
    return False

# Cleaning up data: Filtering out titles with special characters for easier learning and because I have too many datapoints
for line in raw_text:
  if len(line) > seq_length and len(line) < max_seq_length: # and not special_characters(line):
    titles.append(line)

# Data visualization: See the longest
print("We have " + str(len(titles)) + " in total")
lengths = [len(s) for s in titles]
max_index = np.argmax(lengths)
maxlen, minlen = max(lengths), min(lengths)
print(titles[max_index], max_index, len(titles[max_index]))

# Tokenize and convert titles to BERT embeddings
def titles_to_bert_embeddings(titles, tokenizer, bert_model, seq_length):
    inputs = tokenizer(titles, return_tensors='pt', padding='max_length', truncation=True, max_length=seq_length)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state
    return embeddings

# Prepare dataset
x = titles_to_bert_embeddings(titles, tokenizer, bert_model, seq_length)


x = x[:, :seq_length, :]  # Ensure the sequence length matches
y = np.zeros((len(titles), tokenizer.vocab_size), dtype=float)

# Create labels
for i, title in enumerate(titles):
    tokens = tokenizer.tokenize(title)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    for t in range(len(token_ids) - 1):
        y[i, t, token_ids[t + 1]] = 1.0

x = torch.tensor(x).to(torch.float32)
y = torch.tensor(y).to(torch.float32)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create DataLoaders
train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
test_loader = data.DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)

print(X_train.shape)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


We have 741 in total
event-based optical flow on neuromorphic processor: ann vs. snn comparison based on activation sparsification 110 109


IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

In [2]:
print(y_train.shape)


torch.Size([592, 70, 30522])


In [3]:
class GRUModel(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        out, _ = self.gru(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

# Initialize model, optimizer, and loss function
model = GRUModel(hidden_size, bert_model.config.hidden_size, tokenizer.vocab_size)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(reduction="sum")

# Training loop
for epoch in tqdm(range(n_epochs)):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        y_pred = model(X_batch) # forward pass
        print(y_pred.shape, y_batch.shape)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            test_loss += loss.item()

    test_loss /= len(test_loader.dataset)

    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

# Save the best model and char_to_int dictionary
torch.save(model.state_dict(), "gru_finetune.pth")

  0%|          | 0/100 [00:00<?, ?it/s]

torch.Size([128, 30522]) torch.Size([128, 70, 30522])


  0%|          | 0/100 [00:02<?, ?it/s]


RuntimeError: 0D or 1D target tensor expected, multi-target not supported