In this tutorial, we will implement a recurrent neural network which is able to classify images. The dataset used here is called MNIST consisting of handwritten digits.

In [1]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F

In [2]:
use_cuda = True

In [3]:
# Build the data.
train_sentences = ["The man ate the apple.", "Two women running on the beach.", "Everybody read this book."]
val_sentences = ["A blue bird is singing on the tree.", "People were excited about the election."]

In [4]:
# Lower--case the training sentences, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in train_sentences]

# Create the vocabulary.
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(2000)]
vocabulary_size = len(vocabulary)

# Build one-hot embeddings.
one_hot_embeddings = np.eye(vocabulary_size)

# Build a word-to-index dictionary.
word2index = {word:index for index,word in enumerate(vocabulary)}

def preprocess_numberize(sentence):
    """
    Given a sentence in the form of a string, preprocess it into a list of numbers denoting the index 
    of each word in the vocabulary.
    """
    tokenized = word_tokenize(sentence.lower())
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word) for word in tokenized]
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence in the form of a string, preprocess it into a a matrix of one-hot vectors corresponding
    to each word in the vocabulary.
    """
    numberized = preprocess_numberize(sentence)
    one_hot_embedded = one_hot_embeddings[numberized]
    return one_hot_embedded

In [5]:
# Let's see what the vocabulary looks like:
print(vocabulary)

['<UNK>', '<SOS>', 'the', '.', '<EOS>', 'man', 'ate', 'apple', 'two', 'women', 'running', 'on', 'beach', 'everybody', 'read', 'this', 'book']


In [6]:
# Now, we can build the LSTM network:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size 
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.softmax = nn.Softmax()
        
    def forward(self, input, hidden):
        #output = F.relu(input)
        input = input.view(1,1,-1)
        output, hidden = self.lstm(input, hidden)
        output = self.out(output[0])
        return output, hidden
    
    def initHidden(self):
        h0 = torch.zeros(1, 1, self.hidden_size)
        c0 = torch.zeros(1, 1, self.hidden_size)
        result = (h0, c0)
        if use_cuda:
            result = (h0.cuda(), c0.cuda())
        return result

In [7]:
# Let's see what the model looks like:
model = EncoderLSTM(input_size=vocabulary_size, hidden_size=150, output_size=vocabulary_size)
if use_cuda:
    model = model.cuda()
model

EncoderLSTM(
  (lstm): LSTM(17, 150)
  (out): Linear(in_features=150, out_features=17, bias=True)
  (softmax): Softmax()
)

In [8]:
# We will write a function to train the network:
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train(target_tensor,
         model,
         optimizer,
         criterion,
         embeddings = one_hot_embeddings,
         teacher_forcing_ratio = 1.0):
    """
    Given a signle training sample, go through a single step of training.
    """
    loss = 0
    optimizer.zero_grad()
    
    input_tensor = torch.FloatTensor(embeddings[word2index['<SOS>']]).unsqueeze(0)
    input_tensor = input_tensor.cuda() if use_cuda else input_tensor
    target_tensor = target_tensor.view(-1,1)
    hidden = model.initHidden()
    
    use_teacher_forcing = True if np.random.random() < teacher_forcing_ratio else False
    
    predictions = []
    
    target_length = target_tensor.size(0)
    
    if use_teacher_forcing:
        # Teacher force:
        for i in range(target_length): # ignore first word
            output, hidden = model(input_tensor, hidden)
            loss += criterion(output, target_tensor[i])
            
            _, topi = model.softmax(output).data.topk(1) 
            predictions.append(vocabulary[topi])
            
            # Set model input to next ground-truth word:
            input_tensor = torch.FloatTensor(embeddings[target_tensor[i].data])
            input_tensor = input_tensor.cuda() if use_cuda else input_tensor
            
    else:
        # No teacher force:
        for i in range(target_length):
            output, hidden = model(input_tensor, hidden)
            loss += criterion(output, target_tensor[i])
            
            topv, topi = model.softmax(output).data.topk(1)
            predictions.append(vocabulary[topi[0][0]])
            
            # Set model input to its current output:
            input_tensor = torch.FloatTensor([embeddings[topi]])
            input_tensor = input_tensor.cuda() if use_cuda else input_tensor
            
    loss.backward()
    optimizer.step()
    
    return loss.item() / target_length

In [9]:
# Let's train the network:
model.train()
num_epochs = 100
teacher_forcing_ratio = 1.0
for epoch in range(num_epochs):
    total_loss = 0
    for i, sentence in enumerate(train_sentences):
        '''
        if epoch > 200:
            teacher_forcing_ratio = 0.5
        elif epoch > 800:
            teacher_forcing_ratio = 0.2
        elif epoch > 1500:
            teacher_forcing_ratio = 0.0
        '''
        word_indices = preprocess_numberize(sentence.lower())
        target_tensor = torch.LongTensor(word_indices[1:])
        target_tensor = target_tensor.cuda() if use_cuda else target_tensor # start from first word instead of <SOS>
        loss = train(target_tensor, model, optimizer, criterion, teacher_forcing_ratio=teacher_forcing_ratio)
        total_loss += loss
        if i % 10 == 0:
            print('epoch[%d], sentence[%d] loss: %.3f' % (epoch+1, i+1, total_loss/10))
            total_loss = 0



epoch[1], sentence[1] loss: 0.282
epoch[2], sentence[1] loss: 0.279
epoch[3], sentence[1] loss: 0.277
epoch[4], sentence[1] loss: 0.274
epoch[5], sentence[1] loss: 0.271
epoch[6], sentence[1] loss: 0.266
epoch[7], sentence[1] loss: 0.258
epoch[8], sentence[1] loss: 0.246
epoch[9], sentence[1] loss: 0.231
epoch[10], sentence[1] loss: 0.225
epoch[11], sentence[1] loss: 0.225
epoch[12], sentence[1] loss: 0.223
epoch[13], sentence[1] loss: 0.218
epoch[14], sentence[1] loss: 0.215
epoch[15], sentence[1] loss: 0.212
epoch[16], sentence[1] loss: 0.210
epoch[17], sentence[1] loss: 0.207
epoch[18], sentence[1] loss: 0.204
epoch[19], sentence[1] loss: 0.200
epoch[20], sentence[1] loss: 0.197
epoch[21], sentence[1] loss: 0.194
epoch[22], sentence[1] loss: 0.190
epoch[23], sentence[1] loss: 0.187
epoch[24], sentence[1] loss: 0.184
epoch[25], sentence[1] loss: 0.180
epoch[26], sentence[1] loss: 0.176
epoch[27], sentence[1] loss: 0.173
epoch[28], sentence[1] loss: 0.170
epoch[29], sentence[1] loss: 

In [10]:
# Let's test the training:
# to do so, we will input first half of words of each sentence in train_sentences and output the second half
model.eval()
for sentence in train_sentences:
    print(sentence)
    '''
    one_hot_embedded = preprocess_one_hot(sentence)
    input_tensor = input_tensor.cuda() if use_cuda else input_tensor
    input_tensor = torch.FloatTensor(one_hot_embedded)
    hidden = model.initHidden()
    input_length = len(input_tensor)
    '''
    numberized = preprocess_numberize(sentence)
    
    out_sentence = []
    in_sentence = []
    hidden = model.initHidden()
    for i in range(len(numberized)//2):
        in_sentence.append(vocabulary[numberized[i]])
        input_tensor = torch.FloatTensor([one_hot_embeddings[numberized[i]]])
        input_tensor = input_tensor.cuda() if use_cuda else input_tensor
        output, hidden = model(input_tensor, hidden)
        #topv, topi = model.softmax(output).data.topk(1)
        #out_sentence.append(vocabulary[topi[0][0]])
        
    while len(out_sentence) == 0 or out_sentence[-1] != '<EOS>':
        topv, topi = model.softmax(output).data.topk(1)
        out_sentence.append(vocabulary[topi[0][0]])

        input_tensor = torch.FloatTensor([one_hot_embeddings[topi]])
        input_tensor = input_tensor.cuda() if use_cuda else input_tensor
        output, hidden = model(input_tensor, hidden)
    print(in_sentence)
    print(out_sentence)

The man ate the apple.
['<SOS>', 'the', 'man', 'ate']
['the', 'apple', '.', '<EOS>']
Two women running on the beach.
['<SOS>', 'two', 'women', 'running']
['on', 'the', 'beach', '.', '<EOS>']
Everybody read this book.
['<SOS>', 'everybody', 'read']
['this', 'book', '.', '<EOS>']


