In [1]:
import torch
import os
import torch.nn as nn
import numpy as np

In [2]:
# Create a Dictionary class to store tokens and their indices
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)

In [3]:
# TextProcess to consume the given text and convert it into a tensors
class TextProcess(object):
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        lines = []
        with open(path, 'r') as f:
            lines = f.readlines()

        tokens = 0  # The total no. of tokens in the file
        for line in lines:
            words = line.split() + ['<eos>']  # <eos> is a special token to signify the end-of-sentence
            tokens += len(words)
            for word in words:
                self.dictionary.add_word(word)

        
        # A 1-D Tensor containing the index of all the words in the file
        rep_tensor = torch.LongTensor(tokens)
        index = 0
        for line in lines:
            words = line.split() + ['<eos>']
            for word in words:
                rep_tensor[index] = self.dictionary.word2idx[word]
                index += 1

        # Batch this data and trim what doesn't fit in the batches
        num_batches = rep_tensor.shape[0] // batch_size
        rep_tensor = rep_tensor[:(num_batches*batch_size)]

        rep_tensor = rep_tensor.view(batch_size, -1)
        return rep_tensor

In [4]:
# Learning parameters
embed_size = 128
hidden_size = 1024
num_layers = 2
num_epochs = 20
batch_size = 20
timesteps = 30
learning_rate = 0.002

In [5]:
# Let's create the TextProcess object and load the data dictionary
corpus = TextProcess()
rep_tensor = corpus.get_data('alice.txt', batch_size)
rep_tensor.shape

torch.Size([20, 1484])

In [6]:
vocab_size = len(corpus.dictionary)
vocab_size

5290

In [7]:
num_batches = rep_tensor.shape[1] // timesteps
num_batches

49

In [8]:
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        # Embed the input
        x = self.embed(x)

        out, (h, c) = self.lstm(x, h)

        # Flatten the input of the linear layer, 
        # From (batch_size, timestep, hidden_size) into (batch_size*timestep, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))

        out = self.linear(out)
        return out, (h, c)

In [9]:
# Create the model
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [10]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
# Training the model
for epoch in range(num_epochs):
    # Initialize the hidden and cell states for the LSTM
    states = (torch.zeros(num_layers, batch_size, hidden_size),
             torch.zeros(num_layers, batch_size, hidden_size))


    for i in range(0, rep_tensor.size(1) - timesteps, timesteps):
        # Say rep_tensor is "I like pizza"
        # If timestep is 8, inputs are "I like p"
        # Then target embeddings are " like pi"
        inputs = rep_tensor[:, i: i+timesteps]
        targets = rep_tensor[:, (i+1): (i+1)+timesteps]
        
        outputs, _ = model(inputs, states)
        loss = loss_fn(outputs, targets.reshape(-1))

        # Backpropogate and update weights
        model.zero_grad()
        loss.backward()

        # Clip the gradient to [-0.5, 0.5] to prevent the exploding gradient problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step = (i+1) // timesteps
        if step % 100 == 0:
            print("Epoch [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, loss.item()))

Epoch [1/20], Loss: 8.5677
Epoch [2/20], Loss: 6.2088
Epoch [3/20], Loss: 5.8226
Epoch [4/20], Loss: 5.5457
Epoch [5/20], Loss: 5.3234
Epoch [6/20], Loss: 4.9615
Epoch [7/20], Loss: 4.6447
Epoch [8/20], Loss: 4.3099
Epoch [9/20], Loss: 4.0590
Epoch [10/20], Loss: 3.7702
Epoch [11/20], Loss: 3.6148
Epoch [12/20], Loss: 3.3218
Epoch [13/20], Loss: 2.9759
Epoch [14/20], Loss: 2.7244
Epoch [15/20], Loss: 2.3738
Epoch [16/20], Loss: 1.9343
Epoch [17/20], Loss: 1.6013
Epoch [18/20], Loss: 1.2326
Epoch [19/20], Loss: 0.9657
Epoch [20/20], Loss: 0.6403


In [12]:
# Testing the model
with torch.no_grad():
    with open('results.txt', 'w') as f:
        state = (torch.zeros(num_layers, 1, hidden_size),
                 torch.zeros(num_layers, 1, hidden_size))

        input = torch.randint(0, vocab_size, (1, )).long().unsqueeze(1)
        
        for i in range(500):
            output, _ = model(input, state)
            
            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()            
            input.fill_(word_id)

            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)

            if (i + 1) % 100 == 0:
                print("Sampled [{}/{}] words and save to {}".format(i+1, 500, 'results.txt'))

Sampled [100/500] words and save to results.txt
Sampled [200/500] words and save to results.txt
Sampled [300/500] words and save to results.txt
Sampled [400/500] words and save to results.txt
Sampled [500/500] words and save to results.txt
