In [3]:
import os
import time
import math 
import torch
import torch.nn as nn
from torch.autograd import Variable

In [4]:
# Hyper-parameters and run configuration read by the cells below.
batch_size = 20          # batch width handed to create_batch() and to init_hidden() in train()
embedding_dim = 200      # size of each word embedding (LSTM input size)
hidden_size = 200        # number of features in each LSTM hidden state
sequence_length = 35     # maximum number of time steps per slice in get_batch()
learning_rate = 20       # step size of the manual SGD update; divided by 4 when validation loss stalls
num_layers = 2           # number of stacked LSTM layers
dropout = 0.5            # dropout probability used on the embeddings and inside the LSTM
display_interval = 100   # print a training progress line every this many batches
epochs = 2               # number of passes over the training data
clip_gradient = 0.25     # max gradient norm passed to clip_grad_norm_
tie_weights = False      # share the embedding matrix with the output layer (see RNNModel)
eval_batch_size = 10     # evaluation batch width (presumably for valid/test data; verify where batches are built)
save = 'models/model.pt' # file the best model is written to and later reloaded from

In [20]:
class Dictionary(object):
    """Bidirectional word <-> integer-id mapping that grows on demand.

    Attributes are documented inline:
    ids are assigned in first-seen order, starting at 0.
    """

    def __init__(self):
        # word -> id lookup
        self.word2idx = {}
        # id -> word lookup (position in the list is the id)
        self.idx2word = []

    def add_word(self, word):
        """Register `word` if it is new and return its id.

        Args:
            word: the token to look up or insert.

        Returns:
            The integer id of `word`, whether it was just added or was
            already present.
        """
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        # Return outside the `if` so known words yield their id too,
        # instead of falling through and returning None.
        return self.word2idx[word]

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)

In [21]:
class Corpus(object):
    """Token-id tensors for the train / valid / test splits of a text corpus.

    All three splits share one `Dictionary`, which is filled in the order
    train -> valid -> test as each file is tokenized.

    Args:
        path: directory containing `train.txt`, `valid.txt` and `test.txt`.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file.

        Each line is split on whitespace and terminated with an `<eos>`
        token. Unknown words are added to `self.dictionary` as they are
        encountered, so ids are assigned in first-seen order.

        Args:
            path: path of a UTF-8 text file.

        Returns:
            A 1-D int64 tensor holding one id per token, in file order.

        Raises:
            AssertionError: if `path` does not exist.
        """
        assert os.path.exists(path), 'corpus file not found: {}'.format(path)

        # Single pass: register each word and record its id immediately.
        # Ids depend only on first-seen order, so this yields the same ids as
        # a separate "build vocabulary" pass followed by a "lookup" pass, but
        # reads the file once and avoids per-element tensor writes.
        ids = []
        with open(path, 'r', encoding="utf8") as file:
            for line in file:
                for word in line.split() + ['<eos>']:
                    self.dictionary.add_word(word)
                    # Look the id up explicitly rather than relying on
                    # add_word's return value.
                    ids.append(self.dictionary.word2idx[word])
        return torch.tensor(ids, dtype=torch.long)

In [22]:
# Tokenize train.txt / valid.txt / test.txt found under 'wikitext/' into id tensors.
corpus = Corpus('wikitext/')
# Vocabulary size: number of distinct words (including '<eos>') seen across all three files.
num_tokens = len(corpus.dictionary)

In [23]:
num_tokens

33278

In [9]:
# Stack data one after another 
def create_batch(data, batch_size):
    """Reshape a flat token tensor into `batch_size` side-by-side columns.

    The sequence is cut into `batch_size` equally long consecutive chunks;
    trailing elements that do not fill a whole row across all chunks are
    dropped. Chunk `j` becomes column `j` of the result.

    Args:
        data: tensor whose first dimension is the token sequence.
        batch_size: number of columns in the result.

    Returns:
        A contiguous tensor of shape (data.size(0) // batch_size, batch_size).
    """
    rows_per_column = data.size(0) // batch_size
    usable_length = rows_per_column * batch_size
    # Drop the remainder that would not divide evenly between the columns.
    trimmed = data.narrow(0, 0, usable_length)
    # One row per chunk first, then transpose so each chunk runs down a column.
    chunk_rows = trimmed.view(batch_size, -1)
    return chunk_rows.t().contiguous()

In [24]:
# Generate the batched train / validation / test tensors.
# Each result has one column per batch element (see create_batch).
# NOTE(review): valid/test are batched with `batch_size` here, not `eval_batch_size`;
# confirm this matches the batch width the evaluation code expects.
train_data = create_batch(corpus.train, batch_size)
valid_data = create_batch(corpus.valid, batch_size)
test_data =  create_batch(corpus.test, batch_size)

In [25]:
train_data

tensor([[    0,   284, 15178,  ...,  1352,  1335,    16],
        [    1,   357,    43,  ...,    46,    43,  2015],
        [    2,  1496,  7369,  ...,   380,    27, 33001],
        ...,
        [  357,   415,   173,  ...,   212,    78,  1575],
        [ 2520,     9,  3890,  ...,   208,    27,   808],
        [   33,    35,    19,  ...,  8832,  6091,   209]])

In [11]:
valid_data

tensor([[    0,   652,     9,  ...,    17,    15,   641],
        [    1,    17, 10058,  ...,  2058,     0,   127],
        [32966,  1874,    26,  ...,    27,     0,    15],
        ...,
        [ 1450,  5990,     8,  ...,  2228,    37,     1],
        [ 3158,     9,     0,  ...,  8587,     9,  8627],
        [   15,     9, 16497,  ...,  4382,   128,  4249]])

In [12]:
test_data.shape

torch.Size([12278, 20])

In [32]:
# Data loader
def get_batch(source, index, seq_len=None):
    """Slice one (input, target) pair out of a batched token tensor.

    The target is the input shifted forward by one row, flattened to 1-D so
    it can be compared against flattened model outputs.

    Args:
        source: batched tensor whose first dimension is time.
        index: first row of the input slice.
        seq_len: maximum number of rows to take. Defaults to the notebook-level
            `sequence_length` when omitted, which keeps existing two-argument
            calls unchanged.

    Returns:
        (data, target): `data` is `source[index : index + n]` and `target` is
        `source[index + 1 : index + 1 + n]` flattened, where `n` is `seq_len`
        capped so that the target never runs past the end of `source`.
    """
    if seq_len is None:
        seq_len = sequence_length
    # Leave one row of headroom for the shifted target.
    seq_len = min(seq_len, len(source) - 1 - index)
    data = source[index: index + seq_len]
    target = source[index + 1: index + 1 + seq_len].view(-1)
    return data, target

In [33]:
# Sanity check: pull one (input, target) slice starting at row 1 of the training data.
data, target = get_batch(train_data, 1)

In [34]:
data.shape

torch.Size([35, 20])

In [35]:
target.shape

torch.Size([700])

In [2]:
class RNNModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the vocabulary.

    Args:
        num_embeddings: vocabulary size (rows of the embedding table and
            width of the output layer).
        embedding_dim: size of each embedding vector, i.e. the LSTM input size.
        hidden_size: number of features in each LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and passed to
            the LSTM.
        tie_weights: if True, the output layer reuses the embedding matrix as
            its weight.

    Raises:
        ValueError: if `tie_weights` is set while `hidden_size` differs from
            `embedding_dim`.
    """

    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Creation order is kept stable so parameter ordering does not change.
        self.dropout = nn.Dropout(dropout)
        self.encoder = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, num_embeddings)

        # Sharing one matrix only works when both layers have the same width.
        if tie_weights and hidden_size != embedding_dim:
            raise ValueError('When using tie weights hidden_size == embedding_size')
        if tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()

    def init_weights(self):
        """Uniformly initialise encoder/decoder weights in [-0.1, 0.1] and zero the decoder bias."""
        bound = 0.1
        self.encoder.weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-bound, bound)

    def forward(self, input, hidden):
        """Run the network on a batch of token ids.

        Args:
            input: tensor of token ids fed to the embedding layer.
            hidden: LSTM state, as produced by `init_hidden` or a previous call.

        Returns:
            (logits, hidden): `logits` keeps the first two dimensions of the
            LSTM output and has the decoder's output width as its last
            dimension; `hidden` is the updated LSTM state.
        """
        embedded = self.dropout(self.encoder(input))
        output, hidden = self.lstm(embedded, hidden)
        steps = output.size(0)
        batch = output.size(1)
        features = output.size(2)
        # The linear layer is applied to a 2-D view, then the result is unflattened.
        logits = self.decoder(output.view(steps * batch, features))
        return logits.view(steps, batch, logits.size(1)), hidden

    def init_hidden(self, batch_size):
        """Return a zero (h, c) state pair shaped (num_layers, batch_size, hidden_size).

        The tensors are created from an existing parameter so they share its
        dtype and device.
        """
        reference = next(self.parameters())
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        return reference.new_zeros(state_shape), reference.new_zeros(state_shape)

In [None]:
# Build model
# RNNModel is defined in this notebook, so it is referenced directly; there is
# no `model` module to qualify it with. The weight-tying flag comes from the
# configuration cell instead of a hard-coded literal.
model = RNNModel(num_tokens, embedding_dim, hidden_size, num_layers, dropout, tie_weights=tie_weights)

In [None]:
# Cross-entropy between the flattened vocabulary logits and the flattened target ids;
# called by both train() and evaluate().
criterion = nn.CrossEntropyLoss()

In [36]:
def repackage_hidden(hidden):
    """Detach a hidden state from the graph that produced it.

    Args:
        hidden: a tensor, or an arbitrarily nested iterable of tensors.

    Returns:
        A detached tensor for tensor input; otherwise a tuple with the same
        nesting whose leaves are detached tensors.
    """
    if not isinstance(hidden, torch.Tensor):
        # Containers are rebuilt as tuples, recursing into every element.
        return tuple(map(repackage_hidden, hidden))
    return hidden.detach()

In [19]:
# model evaluation
def evaluate(data_source):
    """Compute the average per-time-step loss of `model` on `data_source`.

    The model is put in eval mode and run without gradient tracking over
    consecutive slices of at most `sequence_length` rows, carrying the hidden
    state from one slice to the next.

    Args:
        data_source: batched tensor as produced by `create_batch`
            (time along dim 0, batch along dim 1).

    Returns:
        The loss averaged over every target row, as a Python float.

    Raises:
        ValueError: if `data_source` has fewer than two rows, since no
            (input, target) pair can be formed.
    """
    # Every row except the first serves as a target exactly once.
    num_targets = data_source.size(0) - 1
    if num_targets < 1:
        raise ValueError('evaluate() needs at least two time steps in data_source')

    model.eval()
    total_loss = 0.
    # Size the hidden state from the data itself: the LSTM requires the state's
    # batch dimension to equal the input's, and data_source may have been
    # batched with a width other than `eval_batch_size`.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for index in range(0, data_source.size(0) - 1, sequence_length):
            data, targets = get_batch(data_source, index)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, num_tokens)
            # criterion returns a mean over the slice; weight it by slice length.
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    # The slice lengths add up to num_targets, so this is a true average.
    return total_loss / num_targets

In [37]:
# model training
def train(epoch=None):
    """Run one pass of manual SGD over `train_data`.

    The hidden state is carried across consecutive slices but detached before
    each one, so gradients never flow into earlier slices. Gradients are
    clipped to `clip_gradient` and applied with step size `learning_rate`.
    A progress line is printed every `display_interval` batches.

    Args:
        epoch: epoch number to show in the progress line. When omitted, the
            notebook-level `epoch` variable (set by the epoch loop) is used if
            it exists, otherwise 0. Existing `train()` calls keep working.
    """
    # The progress line must show the current epoch, not `epochs - 1`.
    if epoch is None:
        epoch = globals().get('epoch', 0)

    model.train()
    print('*'*20)
    total_loss = 0.
    start_time = time.time()

    hidden = model.init_hidden(batch_size)

    for batch, index in enumerate(range(0, train_data.size(0) - 1, sequence_length)):
        data, targets = get_batch(train_data, index)
        # Cut the graph so backward() stops at the start of this slice.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, num_tokens), targets)
        loss.backward()

        # gradient clipping to avoid exploding gradient
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_gradient)
        # Plain SGD step. Uses the keyword `alpha` form of add_ (the positional
        # scalar-first overload is deprecated) and skips parameters that
        # received no gradient.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-learning_rate)

        total_loss += loss.item()

        if batch % display_interval == 0 and batch > 0:
            cur_loss = total_loss / display_interval
            elapsed = time.time() - start_time
            print('| Epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // sequence_length, learning_rate,
                elapsed * 1000 / display_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [None]:
# Loop over epochs.
best_val_loss = None

# open(save, 'wb') does not create missing directories, so make sure the
# checkpoint's parent directory exists before the first save.
save_dir = os.path.dirname(save)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(valid_data)
    print('-' * 89)
    print('| End of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
    print('-' * 89)
    # Save the model if the validation loss is the best we've seen so far.
    # Compare against None explicitly: a best loss of 0.0 is falsy and would
    # otherwise be treated as "no best yet".
    if best_val_loss is None or val_loss < best_val_loss:
        with open(save, 'wb') as file:
            torch.save(model, file)
        best_val_loss = val_loss
    else:
        # Learning rate annealing:
        # cut the learning rate by a factor of 4 if the validation loss did not improve.
        learning_rate /= 4.0

In [None]:
# test model

In [None]:
# Load the best saved model.
# NOTE(review): torch.load unpickles a whole Python object here, so only load
# checkpoints written by this notebook. Newer PyTorch releases may require
# weights_only=False to load a full pickled module; confirm against the
# installed version.
with open(save, 'rb') as file:
    model = torch.load(file)
    # RNNModel stores its recurrent layer as `lstm`; it has no `rnn` attribute.
    model.lstm.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('| End of evaluation | test loss {:5.2f} | test ppl {:8.2f}'.format( test_loss, math.exp(test_loss)))

In [None]:
# generate text