# Recurrent Neural Network Comparison for Poetry Generation

# 1. Import libraries

In [1]:
# General python libraries
import math, time, gzip, json, random, re
import requests
import shutil

# pytorch libraries
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader,TensorDataset

# 2. Download the dataset

This project uses a corpus of English-language poetry from Project Gutenberg, as curated here: https://github.com/aparrish/gutenberg-poetry-corpus

In [2]:
url = 'http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz'
data_file = 'data.ndjson'

# Stream the corpus to disk. r.raw is copied without decoding, so the saved file is
# still gzip-compressed (despite its name) and has to be read back with gzip.open.
with requests.get(url, stream=True, timeout=60) as r:
    # Fail before touching the output file, so an HTTP error page is never saved as the dataset.
    r.raise_for_status()
    with open(data_file, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

print(f"Poetry data downloaded to file '{data_file}'")

Poetry data downloaded to file 'data.ndjson'


# 3. Preprocessing functions

## Load & vectorize the data

In [3]:
def vectorize(data_file, char_limit, line_offset, use_words):
    """Load a slice of the poetry corpus and convert it to token indices.

    Parameters:
        data_file -- Path to the gzip-compressed ndjson corpus; each line is a JSON object whose 's' field holds one line of poetry.
        char_limit -- Reading stops once the loaded lines total more than this many characters.
        line_offset -- Number of lines to skip at the start of the file.
        use_words -- If true, tokens are space-separated words (with newlines as their own token); otherwise single characters.

    Returns:
        (vectorized_input, token_to_index, index_to_token), where vectorized_input is a numpy array
        of token indices, token_to_index is a dict and index_to_token is a numpy array of tokens.
    """
    print("\n -= Preprocessing Statistics =- ")

    # Load data from ndjson file.  The character limit prevents crashes from excessive memory usage.
    all_lines = []
    char_count = 0
    # The context manager closes the file even when the loop exits early through `break`.
    with gzip.open(data_file) as corpus:
        for line_count, line in enumerate(corpus):
            if line_count < line_offset:
                continue

            parsed_line = json.loads(line.strip())['s']
            all_lines.append(parsed_line)

            char_count += len(parsed_line)
            if char_count > char_limit:
                break

    print(f"Loaded {len(all_lines)} lines of poetry")

    # Join all lines to create the raw input string
    raw_input = "\n".join(all_lines)
    print(f"Raw input length: {len(raw_input)} characters")

    # Sanitize input data: drop every character outside the allowed set
    input_data = re.sub(r'[^a-zA-Z0-9_ \n\.\!\?\&\-\:\;\']', '', raw_input)
    print(f"Sanitized input length: {len(input_data)} total characters")

    # If using a word-based model, split words while preserving newlines
    if use_words:
        input_data = input_data.replace('\n', ' \n ').split(' ')
        print(f"Split input string into {len(input_data)} words")

    # Determine the input vocabulary
    vocab = sorted(set(input_data))
    print(f"Vocabulary length: {len(vocab)} distinct tokens")

    # Get token mappings. Used for vectorization, model input/output sizes, and text generation.
    token_to_index = {token: index for index, token in enumerate(vocab)}
    index_to_token = np.array(vocab)

    # Vectorize the input data
    vectorized_input = np.array([token_to_index[token] for token in input_data])

    return vectorized_input, token_to_index, index_to_token

## Create training sequences

In [4]:
def generate_sequences(tokens, sequence_length):
    """Build sliding-window training examples from a 1-D run of token indices.

    Row i of the returned sequences holds tokens[i : i + sequence_length] and
    targets[i] holds the token that follows that window.

    Parameters:
        tokens -- 1-D sequence (list or numpy array) of token indices.
        sequence_length -- Number of tokens in each training window.

    Returns:
        (sequences, targets) as float64 numpy arrays with shapes
        (len(tokens) - sequence_length, sequence_length) and (len(tokens) - sequence_length,).

    Raises:
        ValueError -- If tokens holds fewer than sequence_length entries.
    """
    tokens = np.asarray(tokens)
    sequence_count = len(tokens) - sequence_length
    if sequence_count < 0:
        raise ValueError(
            f"Need at least {sequence_length} tokens to build a sequence, got {len(tokens)}"
        )
    print(f"Total training sequences: {sequence_count}")

    # window_index[i, j] == i + j, so a single fancy-indexing gather builds every window at once.
    window_index = np.arange(sequence_count)[:, None] + np.arange(sequence_length)[None, :]
    sequences = tokens[window_index].astype(np.float64)
    # The target of window i is the token right after it, i.e. tokens[i + sequence_length].
    targets = tokens[sequence_length:].astype(np.float64)

    return sequences, targets

## Construct the data loader

In [5]:
def construct_data_loader(vectorized_input, sequence_length, batch_size):
    """Wrap the vectorized corpus in a shuffled, fixed-batch-size DataLoader.

    Parameters:
        vectorized_input -- Token indices for the whole corpus.
        sequence_length -- Number of tokens in each training window.
        batch_size -- Number of (window, next token) pairs per batch.

    Returns:
        A DataLoader over (input window, target token) pairs. Batches are shuffled and
        the final incomplete batch is dropped.
    """
    print("\n -= Data Loader Statistics =- ")
    windows, next_tokens = generate_sequences(vectorized_input, sequence_length)

    # Both tensors are stored as integer (long) token indices.
    inputs_tensor = torch.tensor(windows, dtype=torch.long)
    print(f"Created input tensor with shape: {inputs_tensor.shape}")
    targets_tensor = torch.tensor(next_tokens, dtype=torch.long)
    print(f"Created target tensor with shape: {targets_tensor.shape}")

    return DataLoader(
        TensorDataset(inputs_tensor, targets_tensor),
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )

# 4. Parameterized model definition

In [6]:
class ParameterizedModel(nn.Module):
    """Configurable embedding -> recurrence -> dropout -> fully connected network.

    Only the output of the last time step is passed to the fully connected layers,
    so the model predicts a single next token per input sequence.

    Parameters:
        vocab_size -- Size of the input vocabulary.
        embed_size -- Feature size of the embedding layer.
        recurrence_type -- pytorch class for the recurrence type (nn.RNN, nn.LSTM, nn.GRU, etc.)
        recurrence_size -- Feature size of the recurrence layers.
        num_recurrence -- Number of recurrence layers.
        bidirectional -- Boolean indicating if the recurrence layers are bidirectional.
        dropout -- Dropout value applied between stacked recurrence layers and after the final one.
        num_nonlinear -- Number of nonlinear (ReLU) layers. Valid range: 0-2
        linear_size -- Hidden feature size of linear layers. Only applicable when num_nonlinear == 2.
        output_size -- Final network output size.

    Raises:
        ValueError -- If num_nonlinear is not 0, 1 or 2.
    """
    def __init__(self,
                 vocab_size, embed_size,
                 recurrence_type, recurrence_size, num_recurrence, bidirectional,
                 dropout,
                 num_nonlinear, linear_size,
                 output_size):
        super().__init__()

        # Fail fast: an unsupported value would otherwise leave self.fc undefined until forward().
        if num_nonlinear not in (0, 1, 2):
            raise ValueError(f"num_nonlinear must be 0, 1 or 2, got {num_nonlinear!r}")

        # Create embedding layer
        self.embed = nn.Embedding(vocab_size, embed_size)

        # Create recurrence layers. PyTorch only applies this dropout between stacked layers and
        # warns when it is non-zero for a single layer, so it is only passed on when it has an effect.
        rnn_dropout = dropout if num_recurrence > 1 else 0
        self.rnn = recurrence_type(embed_size, recurrence_size, num_recurrence,
                                   batch_first=True, dropout=rnn_dropout, bidirectional=bidirectional)

        # Number of directions, used to size the linear layers and the initial state.
        self.num_directions = 2 if bidirectional else 1

        # Apply dropout after the final recurrence layer
        self.dropout = nn.Dropout(dropout)

        # Create fully connected layers
        recurrence_features = recurrence_size * self.num_directions
        if num_nonlinear == 0:
            self.fc = nn.Sequential(nn.Linear(recurrence_features, output_size))
        elif num_nonlinear == 1:
            self.fc = nn.Sequential(nn.ReLU(),
                                    nn.Linear(recurrence_features, output_size))
        else:
            self.fc = nn.Sequential(nn.ReLU(),
                                    nn.Linear(recurrence_features, linear_size),
                                    nn.ReLU(),
                                    nn.Linear(linear_size, output_size))

    def forward(self, x, state):
        """Return (output for the last time step, new recurrent state) for a batch of token indices."""
        embed = self.embed(x)
        out, state = self.rnn(embed, state)
        out = self.dropout(out)
        # batch_first=True, so dimension 1 is time: keep only the last step's output.
        out = self.fc(out[:, -1, :])
        return out, state

    def get_initial_state(self, batch_size, device):
        """Return an all-zero recurrent state on `device` for the given batch size."""
        shape = (self.num_directions * self.rnn.num_layers,
                 batch_size,
                 self.rnn.hidden_size)
        if isinstance(self.rnn, nn.LSTM):
            # LSTM takes a tuple of hidden states
            return (torch.zeros(shape, device=device),
                    torch.zeros(shape, device=device))
        # RNN & GRU take a tensor as hidden state
        return torch.zeros(shape, device=device)
        

# 5. Training function

In [7]:
def train_model(model, epochs, data_loader, device, optimizer, loss_function):
    """Train the model and collect per-epoch statistics.

    Parameters:
        model -- Network to train. Must provide get_initial_state(batch_size, device) and return (output, state) when called.
        epochs -- Number of passes over the data loader.
        data_loader -- Iterable of (inputs, targets) batches that supports len().
        device -- torch device the batches are moved to.
        optimizer -- Optimizer that updates the model parameters.
        loss_function -- Callable computing the loss from (output, targets).

    Returns:
        A list of CSV rows (strings). The first row is the header; each following row holds
        the epoch number, average loss, average perplexity and elapsed seconds.

    Raises:
        ValueError -- If the data loader yields no batches.
    """
    print("\n -= Training Statistics =- ")
    epoch_output = ["Epoch,AverageLoss,AveragePerplexity,ElapsedTime"]

    num_batches = len(data_loader)
    if num_batches == 0:
        # Without this the epoch averages below would fail with a ZeroDivisionError.
        raise ValueError("data_loader yields no batches; lower batch_size or load more data")

    # Make sure dropout is active even if the model was previously switched to eval mode.
    model.train()
    start_time = time.perf_counter()

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_perplexity = 0
        state = None

        for train_inputs, train_targets in data_loader:
            train_inputs, train_targets = train_inputs.to(device), train_targets.to(device)

            if state is None:
                state = model.get_initial_state(train_inputs.size(0), device)
            elif isinstance(state, tuple):
                # LSTM: detach every state so gradients do not flow into earlier batches
                state = tuple(s.detach() for s in state)
            else:
                # RNN & GRU: a single state tensor
                state = state.detach()

            optimizer.zero_grad()
            output, state = model(train_inputs, state)
            loss = loss_function(output, train_targets)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            # Accumulate a plain float: adding the live tensor would keep a reference to
            # autograd history for every batch of the epoch.
            epoch_perplexity += torch.exp(loss.detach()).item()

        average_loss = epoch_loss / num_batches
        average_perplexity = epoch_perplexity / num_batches
        # Sample the clock once so the CSV row and the console line report the same time.
        elapsed = time.perf_counter() - start_time

        epoch_output.append(f"{epoch + 1},{average_loss :.5f},{average_perplexity :.5f},{elapsed :.3f}")
        print(f'Epoch {epoch + 1} | Average loss: {average_loss :.3f} | Average perplexity: {average_perplexity :.3f} | Elapsed time: {elapsed :.3f} seconds')

    return epoch_output

# 6. Text Generation

In [8]:
def tokens_to_tensor(tokens, token_to_index):
    """Convert a list of tokens into a (1, len(tokens)) tensor of long token indices.

    Tokens missing from token_to_index are replaced by a randomly chosen known index,
    which is useful for word-based models whose seed text may contain unseen words.
    """
    known_indices = list(token_to_index.values())

    def lookup(token):
        if token in token_to_index:
            return token_to_index[token]
        # Unknown token: fall back to a random entry of the vocabulary.
        return random.choice(known_indices)

    # The extra list level adds the leading batch dimension of size 1.
    return torch.tensor([[lookup(token) for token in tokens]], dtype=torch.long)

def generate_text(model, seed, token_to_index, index_to_token, sequence_length, generate_length):
    """Greedily extend a seed by generate_length tokens.

    Each step feeds the last sequence_length tokens to the model and appends the
    highest-scoring token. The recurrent state is carried from one step to the next.

    Parameters:
        model -- Trained network providing get_initial_state(batch_size, device) and returning (output, state) when called.
        seed -- List of seed tokens. It is extended in place and also returned.
        token_to_index -- Mapping from token to vocabulary index.
        index_to_token -- Sequence mapping vocabulary index back to token.
        sequence_length -- Maximum number of trailing tokens fed to the model per step.
        generate_length -- Number of tokens to generate.

    Returns:
        The seed list with the generated tokens appended.
    """
    # Run on whatever device the model lives on; mixing CPU inputs with a CUDA model raises an error.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else torch.device("cpu")

    # Dropout must be disabled while generating; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        state = model.get_initial_state(batch_size=1, device=device)
        with torch.no_grad():
            for _ in range(generate_length):
                input_tensor = tokens_to_tensor(seed[-sequence_length:], token_to_index).to(device)
                output, state = model(input_tensor, state)
                predicted_index = torch.argmax(output, dim=1).item()
                seed.append(index_to_token[predicted_index])
    finally:
        model.train(was_training)

    return seed

# 7. Wrapper function to bring everything together

In [9]:
def train_and_generate(# Required parameters
                       model_type,             # pytorch recurrent class for the model. Supported values: nn.RNN, nn.LSTM, nn.GRU
                       use_words,              # If true, the model uses word tokens instead of character tokens
                       seeds,                  # Array of text generation seed strings
                       output_filename,         # Name for the output .csv, .txt, and .ckpt files
                       
                       # Data loading parameters
                       char_limit = 10000,                       # Number of characters to load from the dataset
                       # NOTE: this default is evaluated once, when the function is defined, so every call
                       # that omits line_offset reads the same randomly chosen slice of the corpus.
                       line_offset = random.randrange(2900000),  # Number of lines (of ~3 million) to skip before reading data
                       sequence_length = 30,                     # Length for training data token sequences
                       batch_size = 32,                          # Batch size for training data
                       
                       # Model definition parameters
                       embed_size = 256,       # Feature size of the embedding layer
                       num_recurrence = 1,     # Number of recurrence layers
                       recurrence_size = 512,  # Feature size of the recurrence layers
                       bidirectional = False,  # If true, sets the recurrence layers to be bidirectional
                       dropout = 0,            # Dropout after each recurrence layer
                       num_nonlinear = 0,      # Number of nonlinear ReLU layers. Supported values: 0-2
                       linear_size = 128,      # Feature size of linear layers. Only applies if num_nonlinear >= 2
                       
                       # Training parameters
                       epochs = 20,            # Number of training epochs
                       
                       # Generation parameters
                       generate_length = 500   # Number of tokens of output text to generate for each seed string
                      ):
    """Run the whole pipeline: preprocess, train, generate text, and save the results.

    Reads the corpus from the module-level `data_file`, trains a ParameterizedModel,
    generates a continuation for every seed, prints it, and writes
    `<output_filename>.csv` (training statistics), `<output_filename>.txt` (generated text)
    and `<output_filename>.ckpt` (the model). Returns nothing.
    """
    vectorized_input, token_to_index, index_to_token = vectorize(data_file, char_limit, line_offset, use_words)
    data_loader = construct_data_loader(vectorized_input, sequence_length, batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The model reads and predicts tokens from the same vocabulary.
    vocab_size = len(token_to_index)

    model = ParameterizedModel(vocab_size,
                               embed_size,
                               model_type,
                               recurrence_size,
                               num_recurrence,
                               bidirectional,
                               dropout,
                               num_nonlinear,
                               linear_size,
                               vocab_size).to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    epoch_output = train_model(model, epochs, data_loader, device, optimizer, loss_function)

    # Generation runs on the CPU: move the model there so it is on the same device as the
    # tensors used for generation, and switch to eval mode so dropout is disabled.
    model.to(torch.device("cpu"))
    model.eval()

    print("\n -= Generated Text =- ")

    # Word tokens are re-joined with spaces, character tokens without a separator.
    separator = " " if use_words else ""
    pieces = []
    for seed in seeds:
        seed_tokens = seed.split() if use_words else list(seed)
        tokens = generate_text(model,
                               seed_tokens,
                               token_to_index,
                               index_to_token,
                               sequence_length,
                               generate_length)
        pieces.append("\n\n" + separator.join(tokens))
    generated_text = "".join(pieces)

    print(generated_text)

    print("\nSaving outputs and model...")

    with open(f"{output_filename}.csv", "w") as csv_file:
        csv_file.write('\n'.join(epoch_output))

    with open(f"{output_filename}.txt", "w") as txt_file:
        txt_file.write(generated_text)

    torch.save(model, f"{output_filename}.ckpt")

# 8. Run the model for each model/token combination

Output results can be found in the report .pdf file.

In [None]:
# Smoke test: a single character-based RNN trained with all default settings
recurrence = nn.RNN
use_words = False
seeds = ["Two roads diverged in a yellow wood",
         "And on the pedestal these words appear",
         "Shall I compare thee to a summer's day?"]
output_filename = "RNN_Test"

train_and_generate(model_type=recurrence,
                   use_words=use_words,
                   seeds=seeds,
                   output_filename=output_filename)

In [None]:
# Character-based RNN models
recurrence = nn.RNN
use_words = False
seeds = ["Two roads diverged in a yellow wood",
         "And on the pedestal these words appear",
         "Shall I compare thee to a summer's day?"]
char_limit = 100000
root_filename = "RNN_char_"

# One entry per experiment: (output name suffix, hyperparameters that differ from the defaults)
variants = [
    ("base", {}),
    ("layers2", {"num_recurrence": 2}),
    ("layers3", {"num_recurrence": 3}),
    ("dropout20", {"dropout": 0.2}),
    ("dropout50", {"dropout": 0.5}),
    ("bidirectional", {"bidirectional": True}),
    ("nonlinear1", {"num_nonlinear": 1}),
    ("nonlinear2", {"num_nonlinear": 2}),
    ("all", {"num_recurrence": 2, "dropout": 0.2, "bidirectional": True, "num_nonlinear": 1}),
]

for suffix, overrides in variants:
    train_and_generate(recurrence, use_words, seeds, root_filename + suffix, char_limit=char_limit, **overrides)

In [None]:
# Word-based RNN models
recurrence = nn.RNN
use_words = True
seeds = ["Two roads diverged in a yellow wood",
         "And on the pedestal these words appear",
         "Shall I compare thee to a summer's day?"]
char_limit = 200000
root_filename = "RNN_word_"

# One entry per experiment: (output name suffix, hyperparameters that differ from the defaults)
variants = [
    ("base", {}),
    ("layers2", {"num_recurrence": 2}),
    ("layers3", {"num_recurrence": 3}),
    ("dropout20", {"dropout": 0.2}),
    ("dropout50", {"dropout": 0.5}),
    ("bidirectional", {"bidirectional": True}),
    ("nonlinear1", {"num_nonlinear": 1}),
    ("nonlinear2", {"num_nonlinear": 2}),
    ("all", {"num_recurrence": 2, "dropout": 0.2, "bidirectional": True, "num_nonlinear": 1}),
]

for suffix, overrides in variants:
    train_and_generate(recurrence, use_words, seeds, root_filename + suffix, char_limit=char_limit, **overrides)

In [None]:
# Character-based LSTM models
recurrence = nn.LSTM
use_words = False
seeds = ["Two roads diverged in a yellow wood",
         "And on the pedestal these words appear",
         "Shall I compare thee to a summer's day?"]
char_limit = 100000
root_filename = "LSTM_char_"

# One entry per experiment: (output name suffix, hyperparameters that differ from the defaults)
variants = [
    ("base", {}),
    ("layers2", {"num_recurrence": 2}),
    ("layers3", {"num_recurrence": 3}),
    ("dropout20", {"dropout": 0.2}),
    ("dropout50", {"dropout": 0.5}),
    ("bidirectional", {"bidirectional": True}),
    ("nonlinear1", {"num_nonlinear": 1}),
    ("nonlinear2", {"num_nonlinear": 2}),
    ("all", {"num_recurrence": 2, "dropout": 0.2, "bidirectional": True, "num_nonlinear": 1}),
]

for suffix, overrides in variants:
    train_and_generate(recurrence, use_words, seeds, root_filename + suffix, char_limit=char_limit, **overrides)

In [None]:
# Word-based LSTM models
recurrence = nn.LSTM
use_words = True
seeds = ["Two roads diverged in a yellow wood",
         "And on the pedestal these words appear",
         "Shall I compare thee to a summer's day?"]
char_limit = 200000
root_filename = "LSTM_word_"

# One entry per experiment: (output name suffix, hyperparameters that differ from the defaults)
variants = [
    ("base", {}),
    ("layers2", {"num_recurrence": 2}),
    ("layers3", {"num_recurrence": 3}),
    ("dropout20", {"dropout": 0.2}),
    ("dropout50", {"dropout": 0.5}),
    ("bidirectional", {"bidirectional": True}),
    ("nonlinear1", {"num_nonlinear": 1}),
    ("nonlinear2", {"num_nonlinear": 2}),
    ("all", {"num_recurrence": 2, "dropout": 0.2, "bidirectional": True, "num_nonlinear": 1}),
]

for suffix, overrides in variants:
    train_and_generate(recurrence, use_words, seeds, root_filename + suffix, char_limit=char_limit, **overrides)

In [None]:
# Character-based GRU models
recurrence = nn.GRU
use_words = False
seeds = ["Two roads diverged in a yellow wood",
         "And on the pedestal these words appear",
         "Shall I compare thee to a summer's day?"]
char_limit = 100000
root_filename = "GRU_char_"

# One entry per experiment: (output name suffix, hyperparameters that differ from the defaults)
variants = [
    ("base", {}),
    ("layers2", {"num_recurrence": 2}),
    ("layers3", {"num_recurrence": 3}),
    ("dropout20", {"dropout": 0.2}),
    ("dropout50", {"dropout": 0.5}),
    ("bidirectional", {"bidirectional": True}),
    ("nonlinear1", {"num_nonlinear": 1}),
    ("nonlinear2", {"num_nonlinear": 2}),
    ("all", {"num_recurrence": 2, "dropout": 0.2, "bidirectional": True, "num_nonlinear": 1}),
]

for suffix, overrides in variants:
    train_and_generate(recurrence, use_words, seeds, root_filename + suffix, char_limit=char_limit, **overrides)

In [None]:
# Word-based GRU models
recurrence = nn.GRU
use_words = True
seeds = ["Two roads diverged in a yellow wood",
         "And on the pedestal these words appear",
         "Shall I compare thee to a summer's day?"]
char_limit = 200000
root_filename = "GRU_word_"

# One entry per experiment: (output name suffix, hyperparameters that differ from the defaults)
variants = [
    ("base", {}),
    ("layers2", {"num_recurrence": 2}),
    ("layers3", {"num_recurrence": 3}),
    ("dropout20", {"dropout": 0.2}),
    ("dropout50", {"dropout": 0.5}),
    ("bidirectional", {"bidirectional": True}),
    ("nonlinear1", {"num_nonlinear": 1}),
    ("nonlinear2", {"num_nonlinear": 2}),
    ("all", {"num_recurrence": 2, "dropout": 0.2, "bidirectional": True, "num_nonlinear": 1}),
]

for suffix, overrides in variants:
    train_and_generate(recurrence, use_words, seeds, root_filename + suffix, char_limit=char_limit, **overrides)