In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import torch
import math
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from tqdm import tqdm

In [2]:
df = pd.read_csv("../input/poetry-foundation-poems/PoetryFoundationData.csv")

In [3]:
pd.set_option('display.max_colwidth', None)
df.tail()

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
13849,13,\r\r\n 1-800-FEAR\r\r\n,\r\r\nWe'd like to talk with you about fear they said so\r\r\r\nmany people live in fear these days they drove up\r\r\r\nall four of them in a small car nice boy they said\r\r\r\nbeautiful dogs they said so friendly the man ahead\r\r\r\nof the woman the other two waiting in the drive I\r\r\r\nwas outside digging up the garden no one home I said\r\r\r\nwhat are you selling anyway I'm not interested I\r\r\r\nsaid well you have a nice day they said here's our\r\r\r\ncard there's a phone number you can call anytime\r\r\r\nany other houses down this road anyone else live\r\r\r\nhere we'd like to talk to them about living in fear\r\r\n,Jody Gladding,"Living,Social Commentaries,Popular Culture"
13850,14,\r\r\n The Death of Atahuallpa\r\r\n,\r\r\n\r\r\n,William Jay Smith,
13851,15,\r\r\n Poet's Wish\r\r\n,\r\r\n\r\r\n,William Jay Smith,
13852,0,\r\r\n 0\r\r\n,"\r\r\n Philosophic\r\r\nin its complex, ovoid emptiness,\r\r\na skillful pundit coined it as a sort\r\r\n of stopgap doorstop for those\r\r\n quaint equations Romans never\r\r\ndreamt of. In form completely clever\r\r\nand discrete—a mirror come unsilvered, loose watch face without the works, a hollowed globe from tip to toe\r\r\nunbroken, it evades the grappling\r\r\nhooks of mass, tilts the thin rim of no thing, remains embryonic sum, non-cogito.\r\r\n",Hailey Leithauser,"Arts & Sciences,Philosophy"
13853,1,\r\r\n !\r\r\n,"\r\r\nDear Writers, I’m compiling the first in what I hope is a series of publications I’m calling artists among artists. The theme for issue 1 is “Faggot Dinosaur.” I hope to hear from you! Thank you and best wishes.",Wendy Videlock,"Relationships,Gay, Lesbian, Queer,Arts & Sciences,Poetry & Poets,Social Commentaries,Gender & Sexuality"


In [4]:
df.isna().sum()

Unnamed: 0      0
Title           0
Poem            0
Poet            0
Tags          955
dtype: int64

In [5]:
df = df.drop(['Unnamed: 0', 'Tags', 'Poet', 'Title'], axis=1)

In [6]:
# df['Title'] = df['Title'].str.replace('\r', '')
df['Poem'] = df['Poem'].str.replace('\r', '')
# df['Title'] = df['Title'].str.replace('\n', '')
df['Poem'] = df['Poem'].str.strip('\n')

In [7]:
# Lower case everything
# df['Title'] = df['Title'].str.lower()
df['Poem'] = df['Poem'].str.lower()

# Remove apostrophes and join the parts
# df['Title'] = df['Title'].str.replace("'", "", regex=False)
df['Poem'] = df['Poem'].str.replace("'", "", regex=False)

In [8]:
# Remove tokens containing numbers from the Titles
def remove_numeric_tokens(text):
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.isalpha()]
    processed_text = ' '.join(filtered_tokens)
    return processed_text

# df['Title'] = df['Title'].apply(remove_numeric_tokens)
df['Poem'] = df['Poem'].apply(remove_numeric_tokens)

In [9]:
# Uncomment only when including encoder, until then let it be commented off
# Filter out rows where both 'Title' and 'Poem' have no alphabetic characters

# df = df[df.apply(lambda x: any(c.isalpha() for c in x['Title']) and
#                             any(c.isalpha() for c in x['Poem']), axis=1)]

# Filter out rows with no value (its not NaN its '' in this dataset)
df = df[df.apply(lambda x: any(c.isalpha() for c in x['Poem']), axis=1)]

In [10]:
# Make newline a special token
# df['Title'] = 'START ' + df['Title'] + ' END'
df['Poem'] = 'START ' + df['Poem'] + ' END'

df['Poem'] = df['Poem'].str.replace('\n', ' N ')

In [11]:
df.tail()

Unnamed: 0,Poem
13835,START dear writers i m compiling the first in what i hope is a series of publications i m calling artists among artists the theme for issue is faggot i hope to hear from you thank you and best wishes END
13848,START the wise men will unlearn your name above your head no star will flame one weary sound will be the the hoarse roar of the gale the shadows fall from your tired eyes as your lone bedside candle dies for here the calendar breeds nights till stores of candles fail what prompts this melancholy key a long familiar melody it sounds again so let it be let it sound from this night let it sound in my hour of as gratefulness of eyes and lips for that which sometimes makes us lift our gaze to the far sky you glare in silence at the wall your stocking gapes no gifts at all its clear that you are now too old to trust in good saint nick that its too late for miracles suddenly lifting your eyes to heavens light you realize your life is a sheer gift END
13849,START wed like to talk with you about fear they said so many people live in fear these days they drove up all four of them in a small car nice boy they said beautiful dogs they said so friendly the man ahead of the woman the other two waiting in the drive i was outside digging up the garden no one home i said what are you selling anyway im not interested i said well you have a nice day they said heres our card theres a phone number you can call anytime any other houses down this road anyone else live here wed like to talk to them about living in fear END
13852,START philosophic in its complex ovoid emptiness a skillful pundit coined it as a sort of stopgap doorstop for those quaint equations romans never dreamt of in form completely clever and mirror come unsilvered loose watch face without the works a hollowed globe from tip to toe unbroken it evades the grappling hooks of mass tilts the thin rim of no thing remains embryonic sum END
13853,START dear writers i m compiling the first in what i hope is a series of publications i m calling artists among artists the theme for issue is faggot i hope to hear from you thank you and best wishes END


In [12]:
df.isna().sum()

Poem    0
dtype: int64

In [13]:
df.size

13750

In [14]:
length_to_trim = 20
# Tokenize poems
tokenized_poems = df['Poem'].apply(word_tokenize)

# Get the lengths of all poems
poem_lengths = [len(seq) for seq in tokenized_poems]

# Sort the lengths in descending order and find the minimum length to keep
sorted_lengths = sorted(poem_lengths, reverse=True)
print("Length of the longest poem:", sorted_lengths[0])
print("Five longest poem sizes:", sorted_lengths[:5])

# Determine the maximum length of poem to keep
longest_length = sorted_lengths[length_to_trim]
print("Maximum length for trimming:", longest_length)

# Filter out poems that are longer than the threshold
tokenized_poems = [seq for seq, length in zip(tokenized_poems, poem_lengths) if length <= longest_length]

# Collect all tokens from the remaining poems
all_tokens = [token for poem in tokenized_poems for token in poem]

# Create the unique vocabulary, including special tokens
unique_tokens = list(set(all_tokens))
special_tokens = ["UNK", "PAD"]
vocabulary = special_tokens + unique_tokens

print("Total vocabulary size:", len(vocabulary))

Length of the longest poem: 15617
Five longest poem sizes: [15617, 9887, 9196, 9035, 8360]
Maximum length for trimming: 5034
Total vocabulary size: 104643


In [None]:
'.' in unique_tokens

In [None]:
vocab_size = len(vocabulary)  # Number of unique words in the vocabulary
embedding_dim = 15  # Number of dimensions for word embeddings
# context_window_size = 5  # Number of surrounding words considered for context (adjust)
batch_size = 1
learning_rate = 0.01
pad_index = 1
heads = 3
epochs = 2
device = "cuda"

In [None]:
class WordEmbedder(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, x):
        return self.embeddings(x)

In [None]:
# We map word to indexes instead of one-hot encoding as it takes up less space and is faster
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Add context_window_size as a factor to longest length
# longest_length_plus_context = longest_length + (2 * context_window_size)

# Replace all words in the titles and poems with their respective token indexes
# If a word is not in the word_to_index dictionary, replace it with 0 (index of UNK)
# indexed_titles = [[word_to_index.get(word, word_to_index["UNK"]) for word in title] for title in tokenized_titles]
indexed_poems = [[word_to_index.get(word, word_to_index["UNK"]) for word in poem] for poem in tokenized_poems]

# Apply padding
pad_token = word_to_index["PAD"]
# padded_titles = torch.tensor([title + [pad_token] * (longest_length - len(title)) for title in indexed_titles]).to(device)
padded_poems = torch.tensor([poem + [pad_token] * (longest_length - len(poem)) for poem in indexed_poems]).to(device)

# Uncomment when using encoder too
# Concatenate titles and poems one below the other
# padded_input = torch.cat((padded_titles, padded_poems), dim=0).to(device)

# Create PyTorch dataset and dataloader
dataset = TensorDataset(padded_poems)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Verify shapes (optional)
print(f"Shape of padded inputs tensor: {padded_poems.shape}")

In [None]:
# for i in data_loader:
#     print(i[0].shape)
#     break

In [None]:
word_to_index["UNK"]

In [None]:
poems_with_unk = [poem for poem in indexed_poems if 0 in poem]
poems_with_unk

In [None]:
# Now that we have successfully trained our word embedding layer, time to make our positional matrix, which we will make using the formula mentioned in the paper
def positional_encoding(seq_len, embedding_dim, device=device):
    pe = torch.zeros(seq_len, embedding_dim, device=device)
    # Compute the positional encoding values
    position = torch.arange(0, seq_len, dtype=torch.float, device=device).unsqueeze(1)
    # Adjust div_term calculation to handle odd embedding_dim
    div_term_exp = torch.arange(0, embedding_dim, 2).float() * -(np.log(10000.0) / embedding_dim)
    div_term = torch.exp(div_term_exp).to(device)
    pe[:, 0::2] = torch.sin(position * div_term)
    if embedding_dim % 2 == 0:  # Check if embedding_dim is even
        pe[:, 1::2] = torch.cos(position * div_term)  # For even embedding_dim
    else:
        # Adjust the last cosine computation for odd embedding_dim
        pe[:, 1::2] = torch.cos(position * div_term)[:,:-1]  # Exclude the last column for odd embedding_dim
    return pe

In [None]:
encoder_pos_encoding = positional_encoding(longest_length, embedding_dim)
encoder_pos_encoding.shape

In [None]:
# Time to make the Multi-head Self Attention block
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, heads, embedding_dim):
        super(MultiHeadSelfAttention, self).__init__()
        self.heads = heads

        # 3 Linear Layers for Q, K and V
        self.w_q = nn.Linear(embedding_dim, embedding_dim)
        self.w_k = nn.Linear(embedding_dim, embedding_dim)
        self.w_v = nn.Linear(embedding_dim, embedding_dim)

        # Since the feature or embedding dimension is typically the last dimension
        self.softmax = nn.Softmax(dim=-1)

        # Last Linear layer for the attention
        self.w_a = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, embedding_vector):
        batch_size, seq_len, embedding_dim = embedding_vector.size()
        # Compute Q, K, and V
        Q = self.w_q(embedding_vector)
        K = self.w_k(embedding_vector)
        V = self.w_v(embedding_vector)

        # Seperate into heads
        head_dim = embedding_dim // self.heads
        Q = Q.view(batch_size, self.heads, seq_len, head_dim)
        K = K.view(batch_size, self.heads, seq_len, head_dim)
        V = V.view(batch_size, self.heads, seq_len, head_dim)

        # Calculate attention
        attention = torch.matmul(self.softmax(torch.matmul(Q, K.transpose(-1, -2)) / torch.sqrt(torch.tensor(embedding_dim))), V)

        # Concatenating the attention heads (Transposing for correct concatenation)
        attention = attention.transpose(1, 2).reshape(batch_size, seq_len, embedding_dim)
        output = self.w_a(attention)
        return output

In [None]:
# Time to make the Masked Multi-head Self Attention block
class MaskedMultiHeadSelfAttention(nn.Module):
    def __init__(self, heads, embedding_dim):
        super(MaskedMultiHeadSelfAttention, self).__init__()
        self.heads = heads

        # 3 Linear Layers for Q, K and V
        self.w_q = nn.Linear(embedding_dim, embedding_dim)
        self.w_k = nn.Linear(embedding_dim, embedding_dim)
        self.w_v = nn.Linear(embedding_dim, embedding_dim)

        # Since the feature or embedding dimension is typically the last dimension
        self.softmax = nn.Softmax(dim=-1)

        # Last Linear layer for the attention
        self.w_a = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, embedding_vector):
        batch_size, seq_len, embedding_dim = embedding_vector.size()
        # Compute Q, K, and V
        Q = self.w_q(embedding_vector)
        K = self.w_k(embedding_vector)
        V = self.w_v(embedding_vector)

        # Seperate into heads
        head_dim = embedding_dim // self.heads
        Q = Q.view(batch_size, self.heads, seq_len, head_dim)
        K = K.view(batch_size, self.heads, seq_len, head_dim)
        V = V.view(batch_size, self.heads, seq_len, head_dim)

        # Create a mask for masking the attention score
        mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1).unsqueeze(0).unsqueeze(0).expand(batch_size, self.heads, -1, -1).to(device)
        value_to_fill = float('-inf')

        # Calculate attention (including mask)
        attention = torch.matmul(self.softmax(torch.matmul(Q, K.transpose(-1, -2)).masked_fill(mask, value_to_fill) / torch.sqrt(torch.tensor(embedding_dim))), V)

        # Concatenating the attention heads (Transposing for correct concatenation)
        attention = attention.transpose(1, 2).reshape(batch_size, seq_len, embedding_dim)
        output = self.w_a(attention)
        return output

In [None]:
class AddNorm(nn.Module):
    def __init__(self, n_features):
        super(AddNorm, self).__init__()
        # Layer Norm will normalize the last dimension of the matrix
        self.norm = nn.LayerNorm(n_features)

    def forward(self, original, modified):
        return self.norm(original + modified)

In [None]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim):
        super(FeedForward, self).__init__()
        # Normally nn.Linear(embedding_dim, embedding_dim * 4) for expressiveness, we will change it if we have the resources to do so
        self.lr1 = nn.Linear(embedding_dim, embedding_dim)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        x = self.lr1(x)
        x = self.relu(x)
        out = self.lr2(x)
        return out

In [None]:
# Time to build the Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, batch_size, heads):
        super(Decoder, self).__init__()
        self.word_embedder = WordEmbedder(vocab_size, embedding_dim)
        self.positional_encoding = positional_encoding(longest_length, embedding_dim)
        self.masked_attention = MaskedMultiHeadSelfAttention(heads, embedding_dim)
        self.add_norm1 = AddNorm(embedding_dim)
        self.attention = MultiHeadSelfAttention(heads, embedding_dim)
        self.add_norm2 = AddNorm(embedding_dim)
        self.feed_forward = FeedForward(embedding_dim)
        self.add_norm3 = AddNorm(embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        x = self.word_embedder(x)
        x = x + self.positional_encoding[:x.size(1), :].unsqueeze(0).expand(x.size(0), -1, -1)
        x = self.add_norm1(x, self.masked_attention(x))
        x = self.add_norm2(x, self.attention(x))
        x = self.add_norm3(x, self.feed_forward(x))
        logits = self.linear(x)
        return F.softmax(logits, dim=-1)

In [None]:
model = Decoder(vocab_size, embedding_dim, batch_size, heads).to(device)

In [None]:
# Initialize our optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [None]:
def train_epoch(data_loader, model, optimizer, device=device):
    model.train()
    total_loss = 0

    for batch in data_loader:
        torch.cuda.empty_cache()
        poems = batch[0].to(device)  # Assuming batch is correctly formatted
        batch_size, seq_len = poems.size()

        optimizer.zero_grad()

        # Prepare input and target sequences
        input_sequences = poems[:, :-1]  # Exclude the last token for input
        target_sequences = poems[:, 1:]  # Exclude the first token for targets
        # Forward pass using teacher forcing
        logits = model(input_sequences)
#         print(logits.transpose(1, 2).size())
#         print(target_sequences.size())
        loss = loss_fn(logits.transpose(1, 2), target_sequences)  # Adjust logits and targets format if necessary

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(data_loader)
    return average_loss

for epoch in range(epochs):
    epoch_loss = train_epoch(data_loader, model, optimizer)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

In [None]:
model_path = "/kaggle/working/model_weights.pth" 
torch.save(model.state_dict(), model_path)

In [None]:
model.load_state_dict(torch.load("path/to/save/model_weights.pth"))
model.eval()

In [None]:
def generate_poem(model, start_sequence, max_length=500):
    model.eval()  # Set the model to evaluation mode.
    tokens = start_sequence.split()  # Tokenize the start sequence.
    generated_sequence = [word_to_index.get(word, word_to_index["UNK"]) for word in tokens]  # Convert to indices.

    for _ in range(max_length):
        input_seq = torch.tensor([generated_sequence], dtype=torch.long).to(device)
        with torch.no_grad():
            output_logits = model(input_seq)

        # Predict the next word index.
        next_word_id = output_logits[:, -1, :].argmax(-1).item()
        generated_sequence.append(next_word_id)  # Append the index of the next word.

        # Stop if the end token is generated.
        if next_word_id == word_to_index["END"]:
            break

    # Convert indices back to words.
    generated_words = [index_to_word[idx] for idx in generated_sequence]

    return ' '.join(generated_words)

In [None]:
start_sequence = input("Please enter the starting words for your poem: ")
poem = generate_poem(model, start_sequence, max_length=50)
print(poem)
