In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
training_data = [
    ("The cat ate the cheese".lower().split(), ["DET", "NN", "V", "DET", "NN"]),
    ("She read that book".lower().split(), ["NN", "V", "DET", "NN"]),
    ("The dog loves art".lower().split(), ["DET", "NN", "V", "NN"]),
    ("The elephant answers the phone".lower().split(), ["DET", "NN", "V", "DET", "NN"])
]
word2idx = {}
for sent,tags in training_data:
    for word in sent:
        if word not in word2idx:
            word2idx[word]=len(word2idx)
tag2idx = {"DET":0,"NN":1,"V":2}
print(word2idx)

{'the': 0, 'cat': 1, 'ate': 2, 'cheese': 3, 'she': 4, 'read': 5, 'that': 6, 'book': 7, 'dog': 8, 'loves': 9, 'art': 10, 'elephant': 11, 'answers': 12, 'phone': 13}


In [18]:
import numpy as np
def prepare_sequence(seq,to_idx):
    idxs = [to_idx[w] for w in seq]
    idxs = np.array(idxs)
    return torch.from_numpy(idxs)

In [19]:
example_input = prepare_sequence("The dog answers the phone".lower().split(),word2idx)
print(example_input)

tensor([ 0,  8, 12,  0, 13], dtype=torch.int32)


In [20]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        ''' Initialize the layers of this model.'''
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim

        # embedding layer that turns words into a vector of a specified size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # the LSTM takes embedded word vectors (of a specified size) as inputs 
        # and outputs hidden states of size hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # the linear layer that maps the hidden state output dimension 
        # to the number of tags we want as output, tagset_size (in this case this is 3 tags)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        # initialize the hidden state (see code below)
        self.hidden = self.init_hidden()

        
    def init_hidden(self):
        ''' At the start of training, we need to initialize a hidden state;
           there will be none because the hidden state is formed based on perviously seen data.
           So, this function defines a hidden state with all zeroes and of a specified size.'''
        # The axes dimensions are (n_layers, batch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        ''' Define the feedforward behavior of the model.'''
        # create embedded word vectors for each word in a sentence
        embeds = self.word_embeddings(sentence)
        
        # get the output and hidden state by passing the lstm over our word embeddings
        # the lstm takes in our embeddings and hiddent state
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        
        # get the scores for the most likely tag for a word
        tag_outputs = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_outputs, dim=1)
        
        return tag_scores


In [21]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
model = LSTMTagger(EMBEDDING_DIM,HIDDEN_DIM,len(word2idx),len(tag2idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(),lr = 0.1)

In [30]:
test_sentence = "The cheese loves the elephant".lower().split()
inputs = prepare_sequence(test_sentence,word2idx)
inputs = torch.tensor(inputs).to(torch.int64)
tag_scores = model(inputs)
print(tag_scores)
_,predicted_tags = torch.max(tag_scores,1)
print('\n')
print('Predicted tags: \n',predicted_tags)

tensor([[-1.4482, -1.3494, -0.6820],
        [-1.4141, -1.2905, -0.7304],
        [-1.4089, -1.3132, -0.7203],
        [-1.4135, -1.3223, -0.7130],
        [-1.4618, -1.3608, -0.6700]], grad_fn=<LogSoftmaxBackward>)


Predicted tags: 
 tensor([2, 2, 2, 2, 2])


  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
n_epochs = 300
for epoch in range(n_epochs):
    epoch_loss = 0.0
    for sentence,tags in training_data:
        model.zero_grad()
        model.hidden = model.init_hidden()
        sentence_in = prepare_sequence(sentence,word2idx)
        targets = prepare_sequence(tags,tag2idx)
        sentence_in = torch.tensor(sentence_in).to(torch.int64)
        targets = torch.tensor(targets).to(torch.int64)
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores,targets)
        epoch_loss+=loss.item()
        loss.backward()
        optimizer.step()
    if(epoch%20==19):
        print("Epoch: %d, loss: %1.5f" % (epoch+1,epoch_loss/len(training_data)))

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Epoch: 20, loss: 1.03321
Epoch: 40, loss: 0.97148
Epoch: 60, loss: 0.84358
Epoch: 80, loss: 0.60448
Epoch: 100, loss: 0.33087
Epoch: 120, loss: 0.16615
Epoch: 140, loss: 0.09066
Epoch: 160, loss: 0.05686
Epoch: 180, loss: 0.03977
Epoch: 200, loss: 0.02996
Epoch: 220, loss: 0.02376
Epoch: 240, loss: 0.01954
Epoch: 260, loss: 0.01652
Epoch: 280, loss: 0.01426
Epoch: 300, loss: 0.01251


In [34]:
test_sentence = "The cheese loves the elephant".lower().split()
inputs = prepare_sequence(test_sentence,word2idx)
inputs = torch.tensor(inputs).to(torch.int64)
tag_scores = model(inputs)
print(tag_scores)
_,predicted_tags = torch.max(tag_scores,1)
print('\n')
print('Predicted tags: \n',predicted_tags)

tensor([[-6.0019e-02, -3.3125e+00, -3.8245e+00],
        [-6.5547e+00, -3.2173e-03, -6.3263e+00],
        [-4.0466e+00, -5.2362e+00, -2.3065e-02],
        [-1.1967e-02, -6.0536e+00, -4.6516e+00],
        [-2.9635e+00, -5.4852e-02, -6.3560e+00]], grad_fn=<LogSoftmaxBackward>)


Predicted tags: 
 tensor([0, 1, 2, 0, 1])


  This is separate from the ipykernel package so we can avoid doing imports until
