In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

%precision %.4f

'%.4f'

In [2]:
# grader-required-cell

# Define path for file with sonnets
SONNETS_FILE = './data/sonnets.txt'

# Read the data
with open(SONNETS_FILE) as f:
    data = f.read()

# Convert to lower case and save as a list
#corpus = data.lower().split("\n")

In [3]:
data[:500]

"FROM fairest creatures we desire increase,\nThat thereby beauty's rose might never die,\nBut as the riper should by time decease,\nHis tender heir might bear his memory:\nBut thou, contracted to thine own bright eyes,\nFeed'st thy light'st flame with self-substantial fuel,\nMaking a famine where abundance lies,\nThyself thy foe, to thy sweet self too cruel.\nThou that art now the world's fresh ornament\nAnd only herald to the gaudy spring,\nWithin thine own bud buriest thy content\nAnd, tender churl, makes"

In [4]:
text = data.lower().replace("\n", " ")

In [5]:
text[:1000]

"from fairest creatures we desire increase, that thereby beauty's rose might never die, but as the riper should by time decease, his tender heir might bear his memory: but thou, contracted to thine own bright eyes, feed'st thy light'st flame with self-substantial fuel, making a famine where abundance lies, thyself thy foe, to thy sweet self too cruel. thou that art now the world's fresh ornament and only herald to the gaudy spring, within thine own bud buriest thy content and, tender churl, makest waste in niggarding. pity the world, or else this glutton be, to eat the world's due, by the grave and thee. when forty winters shall beseige thy brow, and dig deep trenches in thy beauty's field, thy youth's proud livery, so gazed on now, will be a tatter'd weed, of small worth held: then being ask'd where all thy beauty lies, where all the treasure of thy lusty days, to say, within thine own deep-sunken eyes, were an all-eating shame and thriftless praise. how much more praise deserved thy 

In [6]:
import re
# Function to clean text by removing special characters
def clean_text(text):
    # Remove special characters using regex
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Return the cleaned text
    return cleaned_text

In [7]:
text = clean_text(text)

In [8]:
text[:500]

'from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lightst flame with selfsubstantial fuel making a famine where abundance lies thyself thy foe to thy sweet self too cruel thou that art now the worlds fresh ornament and only herald to the gaudy spring within thine own bud buriest thy content and tender churl makest waste in niggard'

In [9]:
# Sample data: a paragraph split into words
text = text.split()

In [10]:
text[:10]

['from',
 'fairest',
 'creatures',
 'we',
 'desire',
 'increase',
 'that',
 'thereby',
 'beautys',
 'rose']

In [11]:
# Hyperparameters
CONTEXT_SIZE = 4  # Number of preceding words used to predict the next word
EMBEDDING_DIM = 100  # Dimensionality of word embeddings

In [12]:
# Function to create context-target pairs from the text
def create_context_target_pairs(text, context_size):
    context_target_pairs = []
    for i in range(context_size, len(text)):
        context = text[i - context_size:i]  # Get the preceding words
        target = text[i]  # The next word
        context_target_pairs.append((context, target))  # Append the pair
    return context_target_pairs

In [13]:
# Create context-target pairs from the sample text
context_target_pairs = create_context_target_pairs(text, CONTEXT_SIZE)

In [14]:
context_target_pairs[:5]

[(['from', 'fairest', 'creatures', 'we'], 'desire'),
 (['fairest', 'creatures', 'we', 'desire'], 'increase'),
 (['creatures', 'we', 'desire', 'increase'], 'that'),
 (['we', 'desire', 'increase', 'that'], 'thereby'),
 (['desire', 'increase', 'that', 'thereby'], 'beautys')]

In [15]:
# Create a vocabulary from the unique words in the text
vocab = set(text)
vocab_size = len(vocab)  # Number of unique words
word_to_idx = {word: i for i, word in enumerate(vocab)}  # Mapping from word to index
idx_to_word = {i: word for i, word in enumerate(vocab)}  # Mapping from index to word

In [16]:
list(vocab)[:5]

['thyself', 'grows', 'authority', 'waking', 'detain']

In [17]:
vocab_size

3187

In [41]:
#word_to_idx

In [40]:
#idx_to_word

In [19]:
#word_to_idx

In [20]:
# Define a dataset class for the context-target pairs
class NextWordDataset(torch.utils.data.Dataset):
    def __init__(self, context_target_pairs, word_to_idx, context_size):
        self.context_target_pairs = context_target_pairs  # Store the pairs
        self.word_to_idx = word_to_idx  # Store the word-to-index mapping
        self.context_size = context_size  # Store the context size

    def __len__(self):
        return len(self.context_target_pairs)  # Number of pairs

    def __getitem__(self, idx):
        context, target = self.context_target_pairs[idx]  # Get the context and target
        context_indices = [self.word_to_idx[word] for word in context]  # Convert context words to indices
        target_index = self.word_to_idx[target]  # Convert target word to index
        return torch.tensor(context_indices), torch.tensor(target_index)  # Return as tensors

In [21]:
# Create a DataLoader for the dataset
dataset = NextWordDataset(context_target_pairs, word_to_idx, CONTEXT_SIZE)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)  # Shuffle data and set batch size

In [22]:
for i, (context, target) in enumerate(dataloader):
    print(context.shape)
    print(target.shape)
    print(context, target)
    if i == 0:
        break

torch.Size([64, 4])
torch.Size([64])
tensor([[1624,  749,  146, 1982],
        [1624,  928, 1768, 2204],
        [2785, 1594, 2565,  138],
        [1691, 1637, 3089,  849],
        [1637,  376,  773, 1916],
        [1982,  983,  194,  706],
        [2302, 1905, 3080,  194],
        [ 824, 2026,  896,  194],
        [1768,  364,  254, 1935],
        [2561, 1638,  194, 1441],
        [2634, 2687, 1057, 2951],
        [2562,  194, 2614,   99],
        [1366, 2976, 1917, 1433],
        [2951, 1768, 1263,  390],
        [ 394,  194, 1066,  210],
        [2561, 2785, 1433,  343],
        [ 975, 3101,  824, 1637],
        [2462,  194, 1129, 1262],
        [2833, 1524,   59,  148],
        [1624, 2785,  931, 1363],
        [1391, 2802, 2146, 2843],
        [2561,   56, 1433,  295],
        [1445, 1074, 2501, 2943],
        [2302, 2438,  254,   81],
        [1192, 2718, 2799, 1840],
        [2745,  566, 1982,  755],
        [1624, 1614, 2293, 1248],
        [1740, 2951, 1775, 1810],
        [23

In [23]:
# Define the RNN model class
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Embedding layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)  # RNN layer
        self.fc = nn.Linear(hidden_dim, output_dim)  # Fully connected layer
        self.dropout = nn.Dropout(dropout)  # Dropout layer

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))  # Apply embedding and dropout
        output, hidden = self.rnn(embedded)  # Pass through RNN
        hidden = self.dropout(hidden[-1,:,:])  # Apply dropout to the last hidden state
        return self.fc(hidden)  # Pass through the fully connected layer

In [24]:
# Hyperparameters for the RNN model
HIDDEN_DIM = 128  # Number of hidden units in the RNN
OUTPUT_DIM = vocab_size  # Output dimension equals the vocabulary size
N_LAYERS = 2  # Number of RNN layers
DROPOUT = 0.3  # Dropout rate

In [25]:
# Instantiate the RNN model
model = RNN(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)

In [26]:
# Define the loss function (Cross Entropy Loss)
loss_function = nn.CrossEntropyLoss()
# Define the optimizer (Adam optimizer)
optimizer = optim.Adam(model.parameters())

In [27]:
# Training loop
EPOCHS = 200  # Number of epochs

for epoch in range(EPOCHS):
    total_loss = 0  # Initialize total loss for the epoch
    model.train()  # Set the model to training mode
    for context, target in dataloader:
        optimizer.zero_grad()  # Zero the gradients
        output = model(context)  # Forward pass
        loss = loss_function(output, target)  # Compute the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update the parameters
        total_loss += loss.item()  # Accumulate the loss
    if epoch % 10 == 0:
        print(f"Epoch {epoch + 1}, Loss: {total_loss/len(dataloader)}")  # Print the average loss for the epoch


Epoch 1, Loss: 6.850768024904014
Epoch 11, Loss: 5.0441063111715945
Epoch 21, Loss: 4.243407128501112
Epoch 31, Loss: 3.805284557551363
Epoch 41, Loss: 3.5108140968058232
Epoch 51, Loss: 3.3135825125840457
Epoch 61, Loss: 3.1719155572626714
Epoch 71, Loss: 3.055161089792739
Epoch 81, Loss: 2.944371388776459
Epoch 91, Loss: 2.8307629023155156
Epoch 101, Loss: 2.7603789689767098
Epoch 111, Loss: 2.71348135662775
Epoch 121, Loss: 2.627792612044481
Epoch 131, Loss: 2.5573419680560594
Epoch 141, Loss: 2.503313988229654
Epoch 151, Loss: 2.4488139709416967
Epoch 161, Loss: 2.4204945907975635
Epoch 171, Loss: 2.363468280238827
Epoch 181, Loss: 2.320353685069258
Epoch 191, Loss: 2.2821211210132515


In [28]:
# Function to predict the next word given a context
def predict(context, model):
    context_indices = [word_to_idx[word] for word in context]  # Convert context words to indices
    context_tensor = torch.tensor(context_indices).unsqueeze(0)  # Add batch dimension and convert to tensor
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        output = model(context_tensor)  # Forward pass
    predicted_idx = torch.argmax(output, dim=1).item()  # Get the index of the highest score
    return idx_to_word[predicted_idx]  # Convert index back to word

In [29]:
# Example test
test = "from fairest creatures we that".split() # Define a context
predicted_test = predict(test, model)  # Predict the next word
print(f"Context: {test}")  # Print the context
print(f"Predicted word: {predicted_test}")  # Print the predicted word


Context: ['from', 'fairest', 'creatures', 'we', 'that']
Predicted word: your


In [36]:
next_words = 10
#seed_text = "from fairest creatures we that".split()
seed_text = "from fairest creatures".split()
context_all = seed_text.copy()

for _ in range(next_words):
    print(f"Context: {seed_text}")  # Print the context
    predicted_word = predict(seed_text, model)
    print(f"Predicted word: {predicted_word}")  # Print the predicted word
    
    seed_text.append(predicted_word)
    context_all.append(predicted_word)
    seed_text = seed_text[1:]

Context: ['from', 'fairest', 'creatures']
Predicted word: the
Context: ['fairest', 'creatures', 'the']
Predicted word: sad
Context: ['creatures', 'the', 'sad']
Predicted word: skill
Context: ['the', 'sad', 'skill']
Predicted word: of
Context: ['sad', 'skill', 'of']
Predicted word: thy
Context: ['skill', 'of', 'thy']
Predicted word: truth
Context: ['of', 'thy', 'truth']
Predicted word: and
Context: ['thy', 'truth', 'and']
Predicted word: then
Context: ['truth', 'and', 'then']
Predicted word: of
Context: ['and', 'then', 'of']
Predicted word: welfare


In [37]:
print(" ".join(context_all))

from fairest creatures the sad skill of thy truth and then of welfare
