In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import torch
import math
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')

In [None]:
# Load the poetry dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/poetry/poetry.csv")

In [None]:
# The slice [0, len(df)) keeps every row, so this is currently a no-op.
# NOTE(review): presumably a hook for training on a subset by lowering the upper bound - confirm.
df = df.iloc[:len(df)]

In [None]:
pd.set_option('display.max_colwidth', None)
df.tail()

In [None]:
df.isna().sum()

In [None]:
# Drop the columns that the rest of the notebook never reads.
df = df.drop(columns=['Unnamed: 0', 'Tags', 'Poet', 'Title'])

In [None]:
# Remove every carriage return, then trim newlines from both ends of each poem.
df['Poem'] = df['Poem'].str.replace('\r', '').str.strip('\n')

In [None]:
df.tail()

In [None]:
# Normalise the poem text: lower-case it, then delete both the straight (')
# and the typographic (’) apostrophe so the two halves of a word are joined.
df['Poem'] = (
    df['Poem']
    .str.lower()
    .str.replace("'", "", regex=False)
    .str.replace("’", "", regex=False)
)

In [None]:
# Uncomment only once the encoder is added; until then leave it commented out.
# Keeps only the rows where both 'Title' and 'Poem' contain at least one alphabetic character.
# NOTE(review): 'Title' is dropped earlier in the notebook, so that drop has to change as well.

# df = df[df.apply(lambda x: any(c.isalpha() for c in x['Title']) and
#                             any(c.isalpha() for c in x['Poem']), axis=1)]

# Keep only the rows whose 'Poem' contains at least one alphabetic character
# (per the original note, empty poems are '' rather than NaN in this dataset).
df = df[df.apply(lambda x: any(c.isalpha() for c in x['Poem']), axis=1)]

In [None]:
# Wrap every poem in START / END markers and replace each newline with the
# standalone marker ' N ', so that line breaks become a token of their own.
df['Poem'] = (' START ' + df['Poem'] + ' END ').str.replace('\n', ' N ', regex=False)

In [None]:
def remove_numeric_tokens(text):
    """Return ``text`` rebuilt from its tokens, minus every token that contains a digit.

    The text is split with NLTK's ``word_tokenize``; tokens in which the digit
    pattern matches are discarded and the remaining tokens are joined with
    single spaces.
    """
    digit_free = []
    for token in word_tokenize(text):
        if re.search(r'\d', token) is None:
            digit_free.append(token)
    return ' '.join(digit_free)

df['Poem'] = df['Poem'].apply(remove_numeric_tokens)

In [None]:
df.tail()

In [None]:
df.isna().sum()

In [None]:
df.size

In [None]:
# The tokenizer trains from files, so dump the corpus to disk first. Poems are
# separated by the same ' N ' marker that stands in for a line break.
poem_file = "poems.txt"
with open(poem_file, "w", encoding='utf-8') as corpus_handle:
    corpus_handle.write(' N '.join(df['Poem']))

# Fit a byte-level BPE vocabulary on that file, registering the marker tokens
# as special tokens.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=[poem_file],
    vocab_size=32000,
    min_frequency=1,
    special_tokens=["START", "END", "N", "UNK", "PAD"],
)

In [None]:
# Model and training hyperparameters shared by the cells below.
vocab_size = tokenizer.get_vocab_size()  # Number of entries in the trained tokenizer's vocabulary
embedding_dim = 256  # Width of the token embeddings and of every decoder layer
batch_size = 32  # Increase if resources allow: larger batches stabilise training, a batch size of 1 is very noisy
learning_rate = 0.001  # Adam learning rate; changed because the earlier rate may have been too high for this embedding size, leaving the gradient bouncing around without learning much
heads = 4  # Number of attention heads
epochs = 400  # Number of passes over the training set
seq_len = 300  # Number of tokens per training sequence
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use the GPU when one is available

In [None]:
# Read the corpus back from disk and encode it into one long tensor of token ids.
with open(poem_file, "r", encoding='utf-8') as corpus_handle:
    corpus_text = corpus_handle.read()

token_ids = tokenizer.encode(corpus_text).ids
data = torch.tensor(token_ids)

In [None]:
def preprocessing(data, seq_len, pad_token):
    """Cut a token stream into aligned input/target rows for next-token training.

    The stream is split into consecutive, non-overlapping windows of
    ``seq_len`` tokens. For the window starting at ``i`` the input row is
    ``data[i:i + seq_len]`` and the target row is the same window shifted one
    token to the right, ``data[i + 1:i + seq_len + 1]``. Rows that run past the
    end of the stream are right-padded with ``pad_token``.

    Both outputs always share one shape. Previously each was padded on its own,
    so a stream no longer than ``seq_len`` produced a target one column
    narrower than its input.

    Args:
        data: 1-D sequence of token ids (a tensor, or anything
            ``torch.as_tensor`` accepts).
        seq_len: Window length; must be positive.
        pad_token: Id used to fill the unused tail of short rows.

    Returns:
        ``(input_tensor, target_tensor)``, each of shape
        ``(ceil(len(data) / seq_len), min(len(data), seq_len))``.

    Raises:
        ValueError: If ``seq_len`` is not positive or ``data`` is empty.
    """
    if seq_len <= 0:
        raise ValueError(f"seq_len must be positive, got {seq_len}")

    # as_tensor avoids the copy-construct warning of torch.tensor(tensor).
    tokens = torch.as_tensor(data)
    n_tokens = len(tokens)
    if n_tokens == 0:
        raise ValueError("data must contain at least one token")

    n_rows = (n_tokens + seq_len - 1) // seq_len
    width = min(n_tokens, seq_len)

    # Start from all-padding tensors and copy each window over the front of its row.
    input_tensor = tokens.new_full((n_rows, width), pad_token)
    target_tensor = tokens.new_full((n_rows, width), pad_token)
    for row, start in enumerate(range(0, n_tokens, seq_len)):
        source = tokens[start:start + seq_len]
        shifted = tokens[start + 1:start + seq_len + 1]
        input_tensor[row, :len(source)] = source
        target_tensor[row, :len(shifted)] = shifted

    return input_tensor, target_tensor

pad_token = tokenizer.token_to_id("PAD")         
input_tensor, target_tensor = preprocessing(data, seq_len, pad_token)

In [None]:
input_tensor.shape

In [None]:
target_tensor.shape

In [None]:
input_tensor[51]

In [None]:
target_tensor[51]

In [None]:
# Pair every input row with its shifted target row.
dataset = TensorDataset(input_tensor, target_tensor)

# 90% of the rows are used for training; the remainder is held out for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# NOTE(review): random_split is called without a generator, so the split follows
# the global RNG state - consider seeding it if runs must be reproducible.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)  # Don't shuffle validation data

In [None]:
def count_batches(loader, name):
    """Print the number of batches that ``loader`` yields.

    Uses ``len(loader)`` when the loader reports a length, which avoids
    building every batch just to count them. Loaders without a length (for
    example plain iterators) are counted by iterating over them once.

    Args:
        loader: Iterable of batches.
        name: Label printed alongside the count.
    """
    try:
        n_batches = len(loader)
    except TypeError:
        # No __len__ available: fall back to consuming the iterable.
        n_batches = sum(1 for _ in loader)

    print(f"Number of batches in {name}: ", n_batches)

count_batches(train_loader, "train_loader")
count_batches(val_loader, "val_loader")

In [None]:
pad_token

In [None]:
class WordEmbedder(nn.Module):
    """Learned lookup table that maps token ids to dense vectors."""

    def __init__(self, vocab_size, embedding_dim):
        super(WordEmbedder, self).__init__()
        # One trainable row of size ``embedding_dim`` per vocabulary entry.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        """Return the embedding rows selected by the integer ids in ``x``."""
        embedded = self.embeddings(x)
        return embedded

In [None]:
# Positional matrix added to the word embeddings, built with the sinusoidal
# formula from the Transformer paper.
def positional_encoding(seq_len, embedding_dim, device=None):
    """Build the sinusoidal positional-encoding matrix.

    Even columns hold ``sin(pos * w_k)`` and odd columns hold ``cos(pos * w_k)``
    with ``w_k = exp(-ln(10000) * 2k / embedding_dim)``.

    Args:
        seq_len: Number of positions (rows).
        embedding_dim: Number of features (columns). Odd values are supported;
            the last column then holds the sine term without a cosine partner.
        device: Device on which to create the matrix. ``None`` (the default)
            uses PyTorch's default device, matching the earlier behaviour in
            which this argument was accepted but ignored.

    Returns:
        Float tensor of shape ``(seq_len, embedding_dim)``.
    """
    encoding = torch.zeros(seq_len, embedding_dim, device=device)
    position = torch.arange(0, seq_len, dtype=torch.float32, device=device).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, embedding_dim, 2, device=device).float()
        * (-torch.log(torch.tensor(10000.0)) / embedding_dim)
    )
    angles = position * div_term

    encoding[:, 0::2] = torch.sin(angles)
    # With an odd embedding_dim there is one fewer odd column than even column,
    # so only the first embedding_dim // 2 frequencies get a cosine term.
    encoding[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
    return encoding

In [None]:
# Multi-head self-attention block (no causal mask)
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention without masking.

    Every position attends to every other position of the same sequence.

    Args:
        heads: Number of attention heads.
        embedding_dim: Feature size of the input; must be divisible by ``heads``.

    Raises:
        ValueError: If ``embedding_dim`` is not divisible by ``heads``.
    """

    def __init__(self, heads, embedding_dim):
        super(MultiHeadSelfAttention, self).__init__()
        if embedding_dim % heads != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by heads ({heads})"
            )
        self.heads = heads

        # 3 Linear layers for Q, K and V
        self.w_q = nn.Linear(embedding_dim, embedding_dim)
        self.w_k = nn.Linear(embedding_dim, embedding_dim)
        self.w_v = nn.Linear(embedding_dim, embedding_dim)

        # Attention weights are normalised over the key positions (last dimension)
        self.softmax = nn.Softmax(dim=-1)

        # Output projection applied to the concatenated heads
        self.w_a = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, embedding_vector):
        """Apply self-attention to a ``(batch, seq_len, embedding_dim)`` tensor.

        Returns a tensor of the same shape.
        """
        batch_size, seq_len, embedding_dim = embedding_vector.size()
        head_dim = embedding_dim // self.heads

        def split_heads(projected):
            # (B, S, E) -> (B, S, H, D) -> (B, H, S, D). The feature axis is split
            # first and the head axis moved afterwards; viewing directly as
            # (B, H, S, D) would mix different sequence positions into one head.
            return projected.view(batch_size, seq_len, self.heads, head_dim).transpose(1, 2)

        Q = split_heads(self.w_q(embedding_vector))
        K = split_heads(self.w_k(embedding_vector))
        V = split_heads(self.w_v(embedding_vector))

        # Scaled dot-product attention; the scale is the per-head key size.
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(head_dim)
        attention = torch.matmul(self.softmax(scores), V)

        # Concatenate the heads: (B, H, S, D) -> (B, S, H, D) -> (B, S, E)
        attention = attention.transpose(1, 2).reshape(batch_size, seq_len, embedding_dim)
        return self.w_a(attention)

In [None]:
# Masked (causal) multi-head self-attention block
class MaskedMultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a causal mask.

    Position ``t`` may attend only to positions ``<= t``, so the output at a
    position never depends on later tokens.

    Args:
        heads: Number of attention heads.
        embedding_dim: Feature size of the input; must be divisible by ``heads``.

    Raises:
        ValueError: If ``embedding_dim`` is not divisible by ``heads``.
    """

    def __init__(self, heads, embedding_dim):
        super(MaskedMultiHeadSelfAttention, self).__init__()
        if embedding_dim % heads != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by heads ({heads})"
            )
        self.heads = heads

        # 3 Linear layers for Q, K and V
        self.w_q = nn.Linear(embedding_dim, embedding_dim)
        self.w_k = nn.Linear(embedding_dim, embedding_dim)
        self.w_v = nn.Linear(embedding_dim, embedding_dim)

        # Attention weights are normalised over the key positions (last dimension)
        self.softmax = nn.Softmax(dim=-1)

        # Output projection applied to the concatenated heads
        self.w_a = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, embedding_vector):
        """Apply causal self-attention to a ``(batch, seq_len, embedding_dim)`` tensor.

        Returns a tensor of the same shape.
        """
        batch_size, seq_len, embedding_dim = embedding_vector.size()
        head_dim = embedding_dim // self.heads

        def split_heads(projected):
            # (B, S, E) -> (B, S, H, D) -> (B, H, S, D). Viewing directly as
            # (B, H, S, D) would mix sequence positions into one head, which also
            # makes the causal mask act on the wrong positions.
            return projected.view(batch_size, seq_len, self.heads, head_dim).transpose(1, 2)

        Q = split_heads(self.w_q(embedding_vector))
        K = split_heads(self.w_k(embedding_vector))
        V = split_heads(self.w_v(embedding_vector))

        # True strictly above the diagonal marks the future positions to hide.
        # Built on the input's own device; it broadcasts over batch and heads.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=embedding_vector.device),
            diagonal=1,
        )

        # Scaled dot-product attention; the scale is the per-head key size.
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(head_dim)
        scores = scores.masked_fill(mask, float('-inf'))
        attention = torch.matmul(self.softmax(scores), V)

        # Concatenate the heads: (B, H, S, D) -> (B, S, H, D) -> (B, S, E)
        attention = attention.transpose(1, 2).reshape(batch_size, seq_len, embedding_dim)
        return self.w_a(attention)

In [None]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalisation."""

    def __init__(self, n_features):
        super().__init__()
        # LayerNorm normalises over the trailing dimension of size ``n_features``.
        self.norm = nn.LayerNorm(n_features)

    def forward(self, original, modified):
        """Return ``LayerNorm(original + modified)``."""
        residual_sum = original + modified
        return self.norm(residual_sum)

In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: Linear -> ReLU -> Linear."""

    def __init__(self, embedding_dim):
        super().__init__()
        # The hidden layer is normally embedding_dim * 4 wide for expressiveness;
        # it is kept at embedding_dim here and can be widened if resources allow.
        self.lr1 = nn.Linear(embedding_dim, embedding_dim)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        """Return ``lr2(relu(lr1(x)))``."""
        return self.lr2(self.relu(self.lr1(x)))

In [None]:
# Decoder-only Transformer language model (a single decoder layer)
class Decoder(nn.Module):
    """Single-layer decoder that maps token ids to next-token logits.

    Pipeline: token embedding + sinusoidal positions -> causal self-attention
    -> Add&Norm -> second causal self-attention -> Add&Norm -> feed-forward
    -> Add&Norm -> linear projection onto the vocabulary.

    The second attention block is causal too. Without an encoder to attend to,
    an unmasked self-attention at that point lets every position see the tokens
    it is being trained to predict. Its parameter names are unchanged
    (``attention.w_q`` ...), so previously saved state dicts still load.

    Args:
        vocab_size: Number of entries in the vocabulary.
        embedding_dim: Width of the embeddings and of every layer.
        batch_size: Unused; kept so existing constructor calls keep working.
        max_length: Unused; kept so existing constructor calls keep working.
        heads: Number of attention heads.
    """

    def __init__(self, vocab_size, embedding_dim, batch_size, max_length, heads):
        super(Decoder, self).__init__()
        self.word_embedder = WordEmbedder(vocab_size, embedding_dim)
        self.masked_attention = MaskedMultiHeadSelfAttention(heads, embedding_dim)
        self.add_norm1 = AddNorm(embedding_dim)
        # Causal as well: see the class docstring.
        self.attention = MaskedMultiHeadSelfAttention(heads, embedding_dim)
        self.add_norm2 = AddNorm(embedding_dim)
        self.feed_forward = FeedForward(embedding_dim)
        self.add_norm3 = AddNorm(embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, seq_len, vocab_size)`` for ids ``x`` of shape ``(batch, seq_len)``."""
        x = self.word_embedder(x)

        # Size and device come from the activations themselves rather than from
        # notebook globals, so the model works wherever it has been moved.
        positions = positional_encoding(x.size(1), x.size(-1)).to(device=x.device, dtype=x.dtype)
        # Out-of-place add; the (seq_len, dim) matrix broadcasts over the batch.
        x = x + positions.unsqueeze(0)

        x = self.add_norm1(x, self.masked_attention(x))
        x = self.add_norm2(x, self.attention(x))
        x = self.add_norm3(x, self.feed_forward(x))

        logits = self.linear(x)
        return logits

In [None]:
model = Decoder(vocab_size, embedding_dim, batch_size, seq_len, heads).to(device)
model

In [None]:
# Initialize our optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_token)

In [None]:
def generate_poem(model, start_sequence, tokenizer=tokenizer, max_length=500):
    """Sample a poem from ``model``, one token at a time.

    The prompt is the START token followed by the encoded ``start_sequence``.
    At every step the next token is drawn from the softmax distribution over
    the logits of the last position and appended to the running sequence.
    Sampling stops after ``max_length`` new tokens or as soon as the END token
    is drawn.

    Args:
        model: Module mapping ``(1, seq_len)`` token ids to
            ``(1, seq_len, vocab_size)`` logits.
        start_sequence: Text used to seed the poem.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and ``token_to_id``.
        max_length: Maximum number of tokens to sample.

    Returns:
        The decoded text with special tokens skipped.

    Side effects:
        Puts ``model`` into eval mode and leaves it there.
    """
    model.eval()

    # Run on whichever device the model lives on instead of a notebook global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # The marker ids do not change during sampling, so look them up once.
    start_id = tokenizer.token_to_id("START")
    end_id = tokenizer.token_to_id("END")

    generated_ids = [start_id] + tokenizer.encode(start_sequence).ids
    input_seq = torch.tensor([generated_ids], dtype=torch.long, device=model_device)

    with torch.no_grad():
        for _ in range(max_length):
            last_logits = model(input_seq)[:, -1, :]
            probs = F.softmax(last_logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # shape (1, 1)

            # Append the newly sampled token to the running sequence.
            input_seq = torch.cat([input_seq, idx_next], dim=1)
            next_token_id = idx_next.item()
            generated_ids.append(next_token_id)

            if next_token_id == end_id:
                break

    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [None]:
def train_epoch(data_loader, model, optimizer, loss_fn, device, pad_token):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` pairs. This
            argument is the one iterated, not the notebook-level ``train_loader``.
        model: Module mapping ``(B, T)`` token ids to ``(B, T, C)`` logits.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable taking ``(B * T, C)`` logits and ``(B * T,)`` targets.
        device: Device the batches are moved to.
        pad_token: Unused here (padding is handled inside ``loss_fn``); kept so
            existing calls keep working.

    Returns:
        Mean of the per-batch losses, or ``nan`` if the loader yields no batch.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0

    # Wrap the loader that was passed in with tqdm for a progress bar.
    progress_bar = tqdm(data_loader, desc="Training", leave=True)

    for input_batch, target_batch in progress_bar:
        input_data = input_batch.to(device)
        target_data = target_batch.to(device)
        optimizer.zero_grad()

        # Forward pass, then flatten batch and time for the loss.
        logits = model(input_data)
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), target_data.reshape(-1))

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        n_batches += 1

        # Show the current batch loss on the progress bar.
        progress_bar.set_postfix(loss=batch_loss)

    # Average over the batches actually seen; guards against an empty loader.
    return total_loss / n_batches if n_batches else float("nan")

def validate_epoch(data_loader, model, loss_fn, device, pad_token):
    """Evaluate ``model`` on ``data_loader`` without updating it; return the mean batch loss.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` pairs. This
            argument is the one iterated, not the notebook-level ``val_loader``.
        model: Module mapping ``(B, T)`` token ids to ``(B, T, C)`` logits.
        loss_fn: Callable taking ``(B * T, C)`` logits and ``(B * T,)`` targets.
        device: Device the batches are moved to.
        pad_token: Unused here (padding is handled inside ``loss_fn``); kept so
            existing calls keep working.

    Returns:
        Mean of the per-batch losses, or ``nan`` if the loader yields no batch.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0

    with torch.no_grad():
        # Wrap the loader that was passed in with tqdm for a progress bar.
        progress_bar = tqdm(data_loader, desc="Validation", leave=True)

        for input_batch, target_batch in progress_bar:
            input_data = input_batch.to(device)
            target_data = target_batch.to(device)

            # Forward pass, then flatten batch and time for the loss.
            logits = model(input_data)
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), target_data.reshape(-1))

            total_loss += loss.item()
            n_batches += 1

    # Average over the batches actually seen; guards against an empty loader.
    return total_loss / n_batches if n_batches else float("nan")

# Per-epoch history of the mean training and validation loss.
epoch_losses, validation_losses = [], []

for epoch in range(epochs):
    print(f"Starting Epoch {epoch + 1}/{epochs}")

    # One pass over the training data, then one over the held-out data.
    train_loss = train_epoch(train_loader, model, optimizer, loss_fn, device, pad_token)
    val_loss = validate_epoch(val_loader, model, loss_fn, device, pad_token)

    epoch_losses.append(train_loss)
    validation_losses.append(val_loss)

    summary = f"Epoch {epoch + 1}, Train Loss: {train_loss:.15f}, Validation Loss: {val_loss:.15f}"
    print(summary)

    # Sample a poem after every epoch to eyeball progress.
    sample = generate_poem(model, "flower")
    print(sample)

# You can use validation_losses to track model performance and potentially implement early stopping


In [None]:
# Plot the training and validation loss of every completed epoch. Each curve
# gets its own 1-based epoch axis and a legend entry; sizing the axes from the
# recorded histories also works when training stopped early.
plt.plot(range(1, len(epoch_losses) + 1), epoch_losses, label="Training Loss")
plt.plot(range(1, len(validation_losses) + 1), validation_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Epochs")
plt.legend()
plt.show()

In [None]:
len(validation_losses)

In [None]:
# Same curves with the first 5 epochs left out. The x axis shows the real
# epoch numbers (starting at 6) and is sized from the recorded histories, so
# it also works when training stopped before `epochs` was reached.
skipped_epochs = 5
plt.plot(range(skipped_epochs + 1, len(epoch_losses) + 1), epoch_losses[skipped_epochs:], label='Training Loss')
plt.plot(range(skipped_epochs + 1, len(validation_losses) + 1), validation_losses[skipped_epochs:], label='Validation Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Epochs")
plt.legend()  # without this call the labels above are never drawn
plt.show()

In [None]:
model_path = "model_weights.pth"
torch.save(model.state_dict(), model_path)

In [None]:
# Restore the saved weights. map_location remaps tensors that were saved on a
# different device (e.g. a GPU) onto the device used in this session, so the
# checkpoint also loads on a CPU-only machine.
model.load_state_dict(torch.load("model_weights.pth", map_location=device))
model.eval()

In [None]:
start_sequence = "flower"
poem = generate_poem(model, start_sequence)
print(poem)