In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so that the random values drawn
# afterwards (e.g. initial layer weights) are repeatable from run to run.
torch.manual_seed(1)

<torch._C.Generator at 0x104e41f10>

In [2]:
# Tiny two-word vocabulary, used only to demonstrate an embedding lookup.
word_to_ix = dict(hello=0, world=1)
# One embedding row per vocabulary word (2 rows), each 5-dimensional.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)
# Embedding lookups are driven by integer indices, hence the LongTensor.
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
lookup_var = autograd.Variable(lookup_tensor)
hello_embed = embeds(lookup_var)
print(hello_embed)

Variable containing:
 0.6614  0.2669  0.0617  0.6213 -0.4519
[torch.FloatTensor of size 1x5]



In [3]:
# Number of preceding words the model conditions on, and the length of each
# word's embedding vector.
CONTEXT_SIZE = 2
EMBEDDING_DIM = 100

from nltk.corpus import stopwords

# Read the whole corpus and split it on whitespace. The context manager
# closes the file handle as soon as the text has been read.
with open('input.txt') as corpus_file:
    test_sentence = corpus_file.read().split()
print(len(test_sentence))

# Drop English stop words. Membership is an exact string comparison, so a
# token is removed only when it matches a stop-list entry verbatim.
stops = set(stopwords.words('english'))
test_sentence = [word for word in test_sentence if word not in stops]
print('length reduced to {}'.format(len(test_sentence)))

202651
length reduced to 134575


In [4]:
# Build (context, target) training pairs: each context is the list of
# CONTEXT_SIZE consecutive words that precede the target word. Deriving the
# window from CONTEXT_SIZE keeps these pairs in step with the input width the
# model is constructed with (CONTEXT_SIZE * EMBEDDING_DIM).
trigrams = [(test_sentence[i:i + CONTEXT_SIZE], test_sentence[i + CONTEXT_SIZE])
            for i in range(len(test_sentence) - CONTEXT_SIZE)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

[(['First', 'Citizen:'], 'Before'), (['Citizen:', 'Before'], 'proceed'), (['Before', 'proceed'], 'further,')]


In [5]:
# Distinct tokens of the filtered corpus.
vocab = set(test_sentence)
# Assign indices in sorted order. Iterating a set of strings directly can
# yield a different order in every interpreter session (string hashing is
# randomised), which would make the word -> index mapping unrepeatable.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

In [6]:
class NGramModel(nn.Module):
    """Feed-forward n-gram language model.

    The context word indices are embedded, the embeddings are flattened into
    a single row vector, passed through a ReLU hidden layer and projected to
    one score per vocabulary word.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        """Initialize an NGram Model
        @param vocab_size: size of the vocabulary
        @param embedding_dim: size of word embeddings (i.e. length of row vector)
        @param context_size: number of context words to condition predicted word on
        @param hidden_dim: width of the hidden layer (defaults to 128)
        """
        super(NGramModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(p=0.5, inplace=False)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for one context.

        @param inputs: LongTensor holding the word indices of a single
                       context (context_size entries)
        @return: tensor of shape (1, vocab_size) of log-probabilities, which
                 is the input format nn.NLLLoss expects
        """
        # Flatten the per-word embeddings into a single row (a batch of one).
        embeds = self.embedding(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        # nn.NLLLoss expects log-probabilities rather than raw scores, so
        # normalise here. log_softmax is monotonic, so an argmax over the
        # result selects the same word as an argmax over the raw scores.
        return F.log_softmax(out, dim=1)
    
        

In [7]:
# Network sized to the corpus vocabulary.
model = NGramModel(vocab_size=len(vocab),
                   embedding_dim=EMBEDDING_DIM,
                   context_size=CONTEXT_SIZE)
# Negative log-likelihood criterion and plain SGD over every model parameter.
loss_function = nn.NLLLoss()
opt = optim.SGD(model.parameters(), lr=0.001)
# Loss history, filled in by the training loop.
losses = []
# Reverse lookup (index -> word) for decoding predicted indices.
ix_to_word = dict((ix, word) for word, ix in word_to_ix.items())

In [None]:
# numpy is also used by the text-generation cell further down.
import numpy as np

for epoch in range(10):
    # Fresh accumulator every epoch, so each recorded total is independent.
    total_loss = torch.Tensor([0])
    for context, target in trigrams:
        context_idxs = [word_to_ix[w] for w in context]
        context_var = torch.autograd.Variable(torch.LongTensor(context_idxs))
        target_var = torch.autograd.Variable(torch.LongTensor([word_to_ix[target]]))

        # Gradients accumulate across backward() calls; clear them per sample.
        model.zero_grad()
        # looks up the embeddings for context_var and then passes it through the network
        output = model(context_var)
        loss = loss_function(output, target_var)
        loss.backward()
        opt.step()
        total_loss += loss.data

    # Record one plain float per epoch. Appending the accumulator tensor
    # itself would store a reference that later in-place additions keep
    # changing, leaving every entry of `losses` showing the same value.
    epoch_loss = float(total_loss[0])
    losses.append(epoch_loss)
    print('EPOCH {}'.format(epoch))
    print(epoch_loss)

import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(losses)
plt.show()



In [None]:
# now, try to generate a text
def convert_to_idx(words):
    """Translate each word in `words` to its index in `word_to_ix`.

    Returns a list of indices in the same order as `words`; raises KeyError
    for a word that is not a key of `word_to_ix`.
    """
    return list(map(word_to_ix.__getitem__, words))

def convert_to_words(idxs):
    """Translate each index in `idxs` to its word in `ix_to_word`.

    Returns a list of words in the same order as `idxs`; raises KeyError for
    an index that is not a key of `ix_to_word`.
    """
    return list(map(ix_to_word.__getitem__, idxs))

# Seed the sequence with the first CONTEXT_SIZE corpus words. Slicing makes a
# new list, so extending it below never touches test_sentence itself.
generated_seq = test_sentence[:CONTEXT_SIZE]

for i in range(15):
    # The model is built for exactly CONTEXT_SIZE input words, so always feed
    # it the most recent CONTEXT_SIZE words of the sequence so far.
    context = generated_seq[-CONTEXT_SIZE:]
    probs = model(torch.autograd.Variable(torch.LongTensor(convert_to_idx(context))))
    # Greedy decoding: take the highest-scoring vocabulary index.
    word = convert_to_words([np.argmax(probs.data.numpy())])
    generated_seq = generated_seq + word
print(generated_seq)
