In [46]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F

In [47]:
use_cuda = 1

In [48]:
# Build the data.
train_sentences = ["The man ate the apple.", "Two women running on the beach.", "Everybody read this book."]
val_sentences = ["A blue bird is singing on the tree.", "People were excited about the election."]

print(len(train_sentences))
print(len(val_sentences))
print(train_sentences[0])

3
2
The man ate the apple.


In [49]:
# Lower--case the training sentences, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in train_sentences]

# Create the vocabulary.
vocabulary_size = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabulary_size-1)]

# Build one-hot embeddings.
one_hot_embeddings = np.eye(vocabulary_size)

# Build a word-to-index dictionary.
word2index = {word:index for index,word in enumerate(vocabulary)}

def preprocess_numberize(sentence):
    """
    Given a sentence in the form of a string, preprocess it into a list of numbers denoting the index 
    of each word in the vocabulary.
    """
    tokenized = word_tokenize(sentence.lower())
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word) for word in tokenized]
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence in the form of a string, preprocess it into a a matrix of one-hot vectors corresponding
    to each word in the vocabulary.
    """
    numberized = preprocess_numberize(sentence)
    one_hot_embedded = one_hot_embeddings[numberized]
    return one_hot_embedded

In [53]:
# Building the LSTM network:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size 
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.softmax = nn.Softmax()
        
    def forward(self, input, hidden):
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        return output, hidden
    
    def initHidden(self):
        h0 = torch.zeros(1, 1, self.hidden_size)
        c0 = torch.zeros(1, 1, self.hidden_size)
        result = (h0, c0)
        if use_cuda:
            result = (h0.cuda(), c0.cuda())
        return result

In [54]:
sentence = "the man ate the apple."
print(sentence)
word_indices = preprocess_numberize(sentence.lower())
print(word_indices)
target_tensor = torch.LongTensor(word_indices[1:]).cuda()
print(target_tensor.shape)
#decoder_input = Variable(torch.FloatTensor([[embeddings[target_variable[0].data[0]]]]))
print(target_tensor[0].data)

the man ate the apple.
[1, 2, 5, 6, 2, 7, 3, 4]
torch.Size([7])
tensor(2, device='cuda:0')


In [55]:
# Let's see what the model looks like:
model = EncoderLSTM(input_size=vocabulary_size, hidden_size=300, output_size=vocabulary_size)
if use_cuda:
    model = model.cuda()
model

EncoderLSTM(
  (lstm): LSTM(1000, 300)
  (out): Linear(in_features=300, out_features=1000, bias=True)
  (softmax): Softmax()
)

In [69]:
# Training the network:
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

teacher_forcing_ratio = 0.0

def train(target_tensor,
         model,
         optimizer,
         criterion,
         embeddings = one_hot_embeddings):
    """
    Given a signle training sample, go through a single step of training.
    """
    loss = 0
    optimizer.zero_grad()
    
    input_tensor = torch.FloatTensor(embeddings[target_tensor[0].data])
    input_tensor = input_tensor.cuda() if use_cuda else input_tensor
    
    hidden = model.initHidden()
    
    use_teacher_forcing = True if np.random.random() < teacher_forcing_ratio else False
    
    predictions = []
    
    target_length = target_tensor.size(0)
    
    if use_teacher_forcing:
        # Teacher force:
        for i in range(1, target_length): # ignore first word
            output, hidden = model(input_tensor, hidden)
            loss += criterion(output, target_tensor[i])
            
            _, topi = model.softmax(output).data.topk(1) 
            predictions.append(vocabulary[topi])
            
            # Set model input to next ground-truth word:
            input_tensor = torch.FloatTensor(embeddings[target_tensor[i].data])
            input_tensor = input_tensor.cuda() if use_cuda else input_tensor
            
    else:
        # No teacher force:
        for i in range(1, target_length): # ignore first word
            print("i:", i)
            print("input:", input_tensor.shape)
            #print("hidden:", hidden.type())
            output, hidden = model(input_tensor, hidden)
            loss += criterion(output, target_tensor[i])
            
            _, topi = model.softmax(output).data.topk(1)
            predictions.append(vocabulary[topi])
            
            # Set model input to its current output:
            input_tensor = torch.FloatTensor([embeddings[topi]])
            input_tensor = input_tensor.cuda() if use_cuda else input_tensor
            
    loss.backward()
    optimizer.step()
    
    return loss.data[0] / target_length

In [70]:
# Let's train the network:
num_epochs = 100

for epoch in range(num_epochs):
    total_loss = 0
    for i, sentence in enumerate(train_sentences):
        word_indices = preprocess_numberize(sentence.lower())
        if len(word_indices) < 3:
            continue
        target_tensor = torch.LongTensor(word_indices[1:])
        target_tensor = target_tensor.cuda() if use_cuda else target_tensor # start from first word instead of <SOS>
        loss = train(target_tensor, model, optimizer, criterion)
        total_loss += loss
        if i % 10 == 0:
            print('epoch[%d], loss: %.3f' % (epoch+1, totol_loss/10))
            total_loss = 0

i: 1
input: torch.Size([1000])


RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
# Let's test the training