# LSTM Language Model

## Imports

In [2]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_
from data_utils import Dictionary, Corpus

## Hyperparameters

In [3]:
# Fixed seed so that weight initialisation and dropout masks are repeatable
# from one "Restart & Run All" to the next.
seed = 1111
torch.manual_seed(seed)

# Train on a GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters

embed_size = 128       # width of the word embeddings fed to the LSTM
hidden_size = 1024     # hidden units per LSTM layer
num_layers = 2         # number of stacked LSTM layers
num_epochs = 5         # passes over the training data
batch_size = 20        # passed to corpus.get_data and used for the state shape
seq_length = 30        # tokens per truncated-backprop window
learning_rate = 0.002  # Adam step size

## Data Retrieval

Read the data from the text files: each word is assigned an integer ID and the resulting ID stream is arranged into batches.

The dataset is the Penn Treebank corpus, as distributed on Tomas Mikolov's webpage.

In [4]:
# Build the vocabulary and load the training word IDs.
# NOTE(review): word_ids is indexed as [:, start:stop] below, so it is at least
# 2-D; presumably (batch_size, tokens_per_row) - confirm against data_utils.
corpus = Corpus()
train_path = "data/train.txt"
word_ids = corpus.get_data(train_path, batch_size)

# The vocabulary size sets the embedding table and output layer dimensions.
vocab_size = len(corpus.dictionary)

# Number of seq_length-wide windows along dimension 1 (only used in log lines).
number_batches = word_ids.shape[1] // seq_length

## LSTM Model Definition

A simple multi-layer LSTM with dropout.

In [5]:
class RNN(nn.Module):
    """Word-level LSTM language model: embedding -> stacked LSTM -> linear decoder.

    Dropout is applied to the embedded inputs, between stacked LSTM layers
    (when there is more than one) and to the LSTM outputs before decoding.

    Args:
        vocab_size: number of distinct word IDs (embedding rows / output logits).
        embed_size: dimensionality of the word embeddings.
        hidden_size: number of hidden units in every LSTM layer.
        num_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability used everywhere dropout is applied.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout_prob = 0.5):
        super(RNN, self).__init__()
        self.drop = nn.Dropout(dropout_prob)  # shared dropout for inputs and outputs
        self.embeddings = nn.Embedding(vocab_size, embed_size)  # word ID -> vector
        # nn.LSTM only applies dropout *between* stacked layers and warns when a
        # non-zero value is requested for a single layer, so disable it there.
        lstm_dropout = dropout_prob if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers,
                            dropout = lstm_dropout, batch_first = True)
        self.linear = nn.Linear(hidden_size, vocab_size)  # hidden state -> logits

    def forward(self, x, h=None):
        """Run one batch of token IDs through the model.

        Args:
            x: integer tensor of word IDs, shape (batch, seq_len).
            h: optional ``(h_0, c_0)`` tuple, each of shape
                (num_layers, batch, hidden_size). ``None`` lets the LSTM start
                from all-zero states.

        Returns:
            ``(logits, (h_n, c_n))`` where ``logits`` has shape
            (batch * seq_len, vocab_size) and the states keep the shape of the
            initial states.
        """
        x = self.drop(self.embeddings(x))  # dropout on the embedded inputs
        out, (h, c) = self.lstm(x, h)
        out = self.drop(out)
        # Flatten (batch, seq_len, hidden) to (batch * seq_len, hidden) so every
        # time step is decoded as an independent row.
        out = out.reshape(out.size(0) * out.size(1), out.size(2))
        out = self.linear(out)
        return out, (h, c)

In [6]:
def repackage_hidden(h):
    """Return a copy of the hidden state that is cut off from the autograd graph.

    A tensor is detached directly; any other iterable is walked recursively and
    rebuilt as a tuple of detached members.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

## Pretraining code

In [7]:
# Build the model on the selected device, with its loss and optimiser.
model = RNN(vocab_size, embed_size, hidden_size, num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Resume from an earlier checkpoint when one exists. On a fresh run the file
# has not been written yet (it is only saved after training), so fall back to
# the randomly initialised weights instead of failing.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on another device still loads.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
try:
    model.load_state_dict(torch.load('model.ckpt', map_location=device))
except FileNotFoundError:
    print('No checkpoint found at model.ckpt; starting from freshly initialised weights')

# Re-compact the LSTM weights into one contiguous block so the faster cuDNN
# code path can be used; per the PyTorch docs this is a no-op off the GPU.
model.lstm.flatten_parameters()

## Main training code

In [9]:
try:
    # NOTE: `validate` is called at the end of every epoch, so it has to be
    # defined in the kernel before this cell is run.
    for epoch in range(num_epochs):
        model.train()  # training mode: dropout is active

        # Every epoch starts from all-zero hidden and cell states.
        state_shape = (num_layers, batch_size, hidden_size)
        states = (torch.zeros(*state_shape).to(device),
                  torch.zeros(*state_shape).to(device))

        last_start = word_ids.size(1) - seq_length
        for start in range(0, last_start, seq_length):
            stop = start + seq_length

            # Targets are the inputs shifted one token to the right.
            inputs = word_ids[:, start:stop].to(device)
            targets = word_ids[:, start + 1:stop + 1].to(device)

            # Truncated backprop through time: keep the state values but drop
            # their history so gradients stop at the window boundary.
            states = repackage_hidden(states)
            outputs, states = model(inputs, states)
            loss = criterion(outputs, targets.reshape(-1))

            model.zero_grad()
            loss.backward()

            # Clip the global gradient norm to guard against exploding gradients.
            clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            # Report the running loss every 100 steps.
            step = (start + 1) // seq_length
            if step % 100 != 0:
                continue
            batch_loss = loss.item()
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, number_batches, batch_loss, np.exp(batch_loss)))

        # Evaluate on the validation split once the epoch is finished.
        validate(model, "valid")

except KeyboardInterrupt:
    print('Exiting from training early')

Epoch [1/5], Step[0/1549], Loss: 9.1282, Perplexity: 9211.32
Exiting from training early


In [8]:
def validate(model, data):
    """Evaluate ``model`` on a held-out split and return its average loss.

    Progress is printed every 100 steps. The function relies on the notebook
    globals ``corpus``, ``criterion``, ``device``, ``num_layers``,
    ``hidden_size`` and ``seq_length``.

    Args:
        model: language model called as ``model(inputs, states)`` and returning
            ``(logits, states)``.
        data: ``"valid"`` evaluates on ``data/valid.txt``; any other value
            evaluates on ``data/test.txt``.

    Returns:
        The mean of the per-window cross-entropy losses as a float (perplexity
        is ``exp`` of this value), or NaN when the split is too short to form
        a single window.
    """
    model.eval()  # evaluation mode: dropout is disabled
    eval_batch_size = 1
    states = (torch.zeros(num_layers, eval_batch_size, hidden_size).to(device),
              torch.zeros(num_layers, eval_batch_size, hidden_size).to(device))
    total_loss = 0.0
    count = 0
    current_loss = float('nan')

    # Load the requested split as a single row of word IDs.
    if data == "valid":
        test_ids = corpus.get_data("data/valid.txt", eval_batch_size)
    else:
        test_ids = corpus.get_data("data/test.txt", eval_batch_size)

    num_batches = test_ids.size(1) // seq_length

    # No gradients are needed for evaluation; disabling autograd saves memory
    # and time and also keeps the carried states free of any graph history.
    with torch.no_grad():
        for i in range(0, test_ids.size(1) - seq_length, seq_length):
            inputs = test_ids[:, i: i + seq_length].to(device)
            targets = test_ids[:, (i + 1) : (i + 1) + seq_length].to(device)

            outputs, states = model(inputs, states)
            loss = criterion(outputs, targets.reshape(-1))

            # Keep a running average of the per-window losses; len(inputs) is
            # the size of dimension 0, i.e. eval_batch_size.
            total_loss += len(inputs) * loss.item()
            count += 1
            current_loss = total_loss / count

            step = (i + 1) // seq_length
            if step % 100 == 0:
                print ('Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                       .format(step, num_batches, current_loss, np.exp(current_loss)))

    return current_loss

In [None]:
# Persist the trained weights so that a later run can resume from them.
checkpoint_path = 'model.ckpt'
torch.save(model.state_dict(), checkpoint_path)

# Evaluate on the test split (any value other than "valid" selects data/test.txt).
validate(model, "test")

The model reaches a perplexity of around 175.6 on the test set. In hindsight, training a multi-layer LSTM with dropout on a CPU was a poor choice, as it takes a very long time.