In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from collections import Counter
import warnings
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import itertools
from copy import deepcopy
import collections
from sklearn.model_selection import train_test_split

In [2]:
batch_size = 64
seq_size = 32
embedding_size = 64
lstm_size = 64
gradients_norm = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Let's download the dataset:

In [3]:
path = kagglehub.dataset_download("kingburrito666/shakespeare-plays")

Downloading from https://www.kaggle.com/api/v1/datasets/download/kingburrito666/shakespeare-plays?dataset_version_number=4...


100%|██████████| 4.55M/4.55M [00:00<00:00, 6.49MB/s]

Extracting files...





In [4]:
files = os.listdir(path)
file_path = os.path.join(path, "Shakespeare_data.csv")

Save the dataset inside doc:

In [5]:
with open (file_path, 'r') as f:
    doc = f.read()

# Data preparation

## Data split

In [6]:
# Load data from a CSV file (update the path as necessary)
data = pd.read_csv(file_path)

# Function to create unique keys for play and character
def play_and_character(play, character):
    return f"{play}_{character}"

# Function to split the dataset into train, validation, and test sets
def split_dataset_by_character(data, test_fraction=0.2, val_fraction=0.1):
    """
    Splits the dataset into train, validation, and test sets, keeping character data separate.

    data: DataFrame with columns [Play, Player, PlayerLine]
    test_fraction: Percentage of data for the test set
    val_fraction: Percentage of data for the validation set (relative to the total)

    Returns: tuple (train_data, val_data, test_data)
    """
    skipped_characters = 0
    all_train = []
    all_val = []
    all_test = []

    grouped = data.groupby(["Play", "Player"])

    for (play, character), group in grouped:
        if len(group) <= 2:
            skipped_characters += 1
            continue

        examples = group
        train, test = train_test_split(examples, test_size=test_fraction, random_state=42)
        if val_fraction > 0:
            train, val = train_test_split(train, test_size=val_fraction / (1 - test_fraction), random_state=42)
            all_val.append(val)
        all_train.append(train)
        all_test.append(test)

    train_data = pd.concat(all_train, ignore_index=True)
    val_data = pd.concat(all_val, ignore_index=True) if all_val else pd.DataFrame()
    test_data = pd.concat(all_test, ignore_index=True)

    print(f"Skipped characters: {skipped_characters}")
    return train_data, val_data, test_data


train_data, val_data, test_data = split_dataset_by_character(data, test_fraction=0.2, val_fraction=0.1)

# Save the resulting datasets
train_data.to_csv("train_data.csv", index=False)
val_data.to_csv("val_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)


Skipped characters: 161


In [7]:
files = os.listdir(path)

In [8]:
with open ("train_data.csv", 'r') as f:
    doc_train = f.read()
with open ("test_data.csv", 'r') as f:
    doc_test = f.read()
with open ("val_data.csv", 'r') as f:
    doc_val = f.read()

Function to convert the all dataset into a list of words. We remove undesired characters.

In [9]:
def extract_words(doc):
    lines = doc.split('\n')
    lines = [line.strip(r'\"') for line in lines]
    words = ' '.join(lines).split()
    return words

Function that is defined to remove all the punctuation characters.

In [10]:
def remove_punct(words):
    punct = set(string.punctuation)
    words = [''.join([char for char in list(word) if char not in punct]) for word in words]
    return words

Function to build a vocabulary of each word considering the number of occurrencies of that word.

In [11]:
# get vocab from word list
def getvocab(words):
    wordfreq = Counter(words)
    sorted_wordfreq = sorted(wordfreq, key=wordfreq.get)
    return sorted_wordfreq

To simplify the training we create 2 dictionaies that map each word to a distinct number and viceversa.

In [12]:
def vocab_map(vocab):
    int_to_vocab = {k:w for k,w in enumerate(vocab)}
    vocab_to_int = {w:k for k,w in int_to_vocab.items()}
    return int_to_vocab, vocab_to_int

Let's define a function that concatenates all these functions together:

In [13]:
def preprocess_text_custom(text):
    words = extract_words(text)
    words = remove_punct(words)
    return words

Let's use these functions for our dataset:

In [14]:
words_train = remove_punct(extract_words(doc_train)) # words without punctuation
words_test = remove_punct(extract_words(doc_test))
words_val = remove_punct(extract_words(doc_val))
vocab_train = getvocab(words_train)
vocab_test = getvocab(words_test)
vocab_val = getvocab(words_val)
int_to_vocab_train, vocab_to_int_train = vocab_map(vocab_train)
int_to_vocab_test, vocab_to_int_test = vocab_map(vocab_test)
int_to_vocab_val, vocab_to_int_val = vocab_map(vocab_val)

Function to create batches of sequences:

In [15]:
def get_batches(words, vocab_to_int, batch_size, seq_size):
    # generate a Xs and Ys of shape (batchsize * num_batches) * seq_size
    word_ints = [vocab_to_int[word] for word in words]
    num_batches = int(len(word_ints) / (batch_size * seq_size))
    Xs = word_ints[:num_batches*batch_size*seq_size]
    Ys = np.zeros_like(Xs)
    Ys[:-1] = Xs[1:]
    Ys[-1] = Xs[0]
    Xs = np.reshape(Xs, (num_batches*batch_size, seq_size))
    Ys= np.reshape(Ys, (num_batches*batch_size, seq_size))

    # iterate over rows of Xs and Ys to generate batches
    for i in range(0, num_batches*batch_size, batch_size):
        yield Xs[i:i+batch_size, :], Ys[i:i+batch_size, :]

# Model

In [16]:
class ShakespeareRNN(nn.Module):
    def __init__(self, n_vocab=90, seq_length=80, embedding_dim=8, lstm_hidden_size=256):
        super(ShakespeareRNN, self).__init__()

        self.seq_length = seq_length
        self.lstm_hidden_size = lstm_hidden_size

        # Embedding layer: maps each character to an 8-dimensional space
        self.embedding = nn.Embedding(n_vocab, embedding_dim)
        #the first parameter is the total number of words inside the vocabulary
        #the second is the length of the vector for each instance
        #in practice each batch get converted into a dense representation of an
        #embedding

        # Two LSTM layers, each with 256 nodes
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden_size,
            num_layers=2,
            batch_first=True
        )

        # Dense softmax output layer to predict next character probabilities
        self.dense = nn.Linear(lstm_hidden_size, n_vocab)

    def forward(self, x, prev_state):
        # Embed input characters into learned 8-dimensional space
        embedded = self.embedding(x)

        # Pass through LSTM layers
        lstm_out, state = self.lstm(embedded, prev_state)

        # Dense layer to produce logits for the output vocabulary
        logits = self.dense(lstm_out)

        return logits, state

    def init_state(self, batch_size):
        # Initialize LSTM hidden and cell states with zeros
        return (torch.zeros(2, batch_size, self.lstm_hidden_size),  # Hidden states for 2 layers
                torch.zeros(2, batch_size, self.lstm_hidden_size))  # Cell states for 2 layers


The last layer of the ShakespeareRNN is a fully connected layer (self.dense = nn.Linear(lstm_hidden_size, n_vocab)) that outputs the logits for each character in the vocabulary.

The softmax function is not explicitly applied in the model. Instead, the logits from the FC layer are directly passed to the loss function. This is because CrossEntropyLoss internally applies log_softmax during computation.

In [17]:
def get_loss_and_train_op(net, lr=0.001, weight_decay = 1e-05):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay = weight_decay)

    return criterion, optimizer

In [18]:
def generate_text(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])

    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words))

# Train

## Checkpoints utils functions
During training is important to save the progress in case google colab interrupts the execution. To do so we decided to define a function that save the checkpoint and, in particular, save the copy of the values of the current parameters inside the network, the loss value, the epoch reached and the parameters of the optimizer.

We mantain a default path to store these values:

In [19]:
checkpoint_path = "shakespeare_rnn_checkpoint.pth"

In [20]:
def save_checkpoint(model, optimizer, epoch, loss, path=checkpoint_path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)

In [21]:
def load_checkpoint(model, optimizer, path=checkpoint_path):
    if os.path.exists(path):
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        print(f"Checkpoint loaded: epoch {epoch}, loss {loss:.4f}.")
        return epoch, loss
    else:
        print("No checkpoint found.")
        return 0, None

## function for training

In [22]:
def train_rnn(words, vocab_to_int, int_to_vocab, n_vocab, n_epochs = 5, lr = 0.01, weight_decay = 1e-05, scheduler_type=None, scheduler_params=None):
    print("training started")
    # RNN instance
    net = ShakespeareRNN(n_vocab, seq_size, embedding_size, lstm_size)
    net = net.to(device)
    criterion, optimizer = get_loss_and_train_op(net, lr, weight_decay=weight_decay)

    scheduler = None
    if scheduler_type is not None:
        if scheduler_type == "StepLR":
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, **scheduler_params)
        elif scheduler_type == "ExponentialLR":
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, **scheduler_params)
        elif scheduler_type == "CosineAnnealingLR":
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, **scheduler_params)
        else:
            raise ValueError(f"Unsupported scheduler type: {scheduler_type}")

    iteration = 0

    for e in range(n_epochs):
        batches = get_batches(words, vocab_to_int, batch_size, seq_size)
        print("batches taken")
        state_h, state_c = net.init_state(batch_size)

        # Transfer data to GPU
        state_h = state_h.to(device)
        state_c = state_c.to(device)
        for x, y in batches:
            iteration += 1

            # Tell it we are in training mode
            net.train()

            # Reset all gradients
            optimizer.zero_grad()

            # Transfer data to GPU
            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            loss = criterion(logits.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss_value = loss.item()

            # Perform back-propagation
            loss.backward(retain_graph=True)

            _ = torch.nn.utils.clip_grad_norm_(net.parameters(), gradients_norm)

            # Update the network's parameters
            optimizer.step()

            #save the current values of training:

            #FOR EACH EPOCH
            #USE ACCURACY INSTEAD OF LOSS

            #validation: vedere la validation

            #iid non iid
            #uniform s-qued

            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e, n_epochs),'Iteration: {}'.format(iteration),'Loss: {}'.format(loss_value))

            # if iteration % 1000 == 0:
                # predict(device, net, flags.initial_words, n_vocab,vocab_to_int, int_to_vocab, top_k=5)
                # torch.save(net.state_dict(),'checkpoint_pt/model-{}.pth'.format(iteration))
        # Update the learning rate scheduler
        save_checkpoint(net, optimizer, e, loss)
        if scheduler is not None:
            scheduler.step()
    return net, loss_value

## Hyperparam tuning
Here's an implementation to perform hyperparameter tuning for learning rate, weight decay, and learning rate scheduler hyperparameters. It uses grid search to identify the best combination of parameters.

Define the parameters grid:

In [23]:
param_grid = {
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'weight_decay': [1e-5, 5e-5, 1e-4],
    'lr_scheduler': [
        ("StepLR", {'step_size': 5, 'gamma': 0.1}),
        ("CosineAnnealingLR", {'T_max': 5, 'eta_min': 1e-4}),
        ("ExponentialLR", {'gamma': 0.95}),
        ("ExponentialLR", {'gamma': 0.9})
    ]
}

In [24]:
def accuracy(y_pred, y_true):
    """Accuracy computed as the percentage of correct predictions."""
    _, predicted = torch.max(y_pred, 1)
    correct = (predicted == y_true).sum().item()
    accuracy = correct / y_true.size(0)
    return accuracy

In [25]:
def hyperparameter_search(param_grid, train_fn, n_vocab, seq_size, embedding_size, lstm_size, words, vocab_to_int, int_to_vocab, num_epochs):
    """Perform hyperparameter tuning using grid search."""
    best_params = None
    best_loss = float('inf')

    # Create all combinations of hyperparameters
    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    for params in param_combinations:
        # Initialize the model, optimizer, and scheduler with the current parameters
        net = ShakespeareRNN(n_vocab, seq_size, embedding_size, lstm_size).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(net.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])

        # Initialize the scheduler using the factory function
        scheduler_name, scheduler_factory = params['lr_scheduler']
        scheduler_type, scheduler_params = params['lr_scheduler']

        # Train the model and obtain the final loss
        print(f"Testing parameters: {params} with scheduler: {scheduler_name}")
        trained_model, final_loss = train_fn(
            words, vocab_to_int, int_to_vocab, n_vocab, num_epochs, params['learning_rate'], params['weight_decay'], scheduler_type, scheduler_params
        )

        # Update the best parameters if a better loss is found
        if final_loss < best_loss:
            best_loss = final_loss
            best_params = deepcopy(params)

    print(f"Best parameters: {best_params} with loss: {best_loss}")
    return best_params

In [26]:
best_params = hyperparameter_search(
    param_grid,
    train_rnn,  #pass the name of the function that compute the training
    len(vocab_val),  #vocabulary size
    seq_size=80,  #sequence size
    embedding_size=8,  #embedding size
    lstm_size=256,  #LSTM size
    words=words_val,  #training data
    vocab_to_int=vocab_to_int_val,  #vocabulary to int mapping
    int_to_vocab=int_to_vocab_val,  #int to vocabulary mapping
    num_epochs = 20
)

Testing parameters: {'learning_rate': 0.05, 'weight_decay': 1e-05, 'lr_scheduler': ('StepLR', {'step_size': 5, 'gamma': 0.1})} with scheduler: StepLR
training started
batches taken
batches taken
Epoch: 1/20 Iteration: 100 Loss: 8.584595680236816
batches taken
batches taken
Epoch: 3/20 Iteration: 200 Loss: 8.323540687561035
batches taken
batches taken
Epoch: 5/20 Iteration: 300 Loss: 8.160771369934082
batches taken
batches taken
Epoch: 7/20 Iteration: 400 Loss: 7.798923492431641
batches taken
batches taken
Epoch: 9/20 Iteration: 500 Loss: 7.685837268829346
batches taken
batches taken
Epoch: 11/20 Iteration: 600 Loss: 8.400830268859863
batches taken
batches taken
Epoch: 13/20 Iteration: 700 Loss: 8.303919792175293
batches taken
batches taken
Epoch: 15/20 Iteration: 800 Loss: 7.030515670776367
batches taken
batches taken
Epoch: 17/20 Iteration: 900 Loss: 8.32906723022461
batches taken
batches taken
Epoch: 19/20 Iteration: 1000 Loss: 7.871896266937256
Testing parameters: {'learning_rate': 

In [None]:
rnn_net = train_rnn(words_train, vocab_to_int_train, int_to_vocab_train, len(vocab_train))

training started
batches taken
Epoch: 0/5 Iteration: 100 Loss: 9.605334281921387
Epoch: 0/5 Iteration: 200 Loss: 8.48792839050293
Epoch: 0/5 Iteration: 300 Loss: 9.618524551391602
batches taken
Epoch: 1/5 Iteration: 400 Loss: 8.567215919494629


In [None]:
generate_text(device, rnn_net, ['hey', 'you'], len(vocab_test), vocab_to_int_test, int_to_vocab_test)