In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f1118064588>

In [101]:
input_minibatch = Variable(torch.randn(9, 5, 4))

hidden = (Variable(torch.zeros(1, 5, 3)), Variable(torch.zeros((1, 5, 3))))

lstm = nn.LSTM(4, 3)

out, hidden = lstm(input_minibatch, hidden)

print(hidden[0].size())

print(hidden[0])

torch.Size([1, 5, 3])
Variable containing:
(0 ,.,.) = 
 -0.1969  0.1116 -0.2867
 -0.0516 -0.0308 -0.0018
 -0.5920  0.0826  0.0916
 -0.2875 -0.0891  0.0091
 -0.2871 -0.0840 -0.0164
[torch.FloatTensor of size 1x5x3]



In [104]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
print(embeds)
lookup_tensor = torch.LongTensor([[0, 1], [1, 1], [0, 0]])
print(lookup_tensor)
helloandworld_embed = embeds(Variable(lookup_tensor))
print(helloandworld_embed)
print(helloandworld_embed.view((2, 3, -1)))

Embedding(2, 5)

 0  1
 1  1
 0  0
[torch.LongTensor of size 3x2]

Variable containing:
(0 ,.,.) = 
 -0.0084  0.5745 -0.0230 -0.5933  0.7945
  2.2256 -0.8400 -0.4712  0.3147 -0.0600

(1 ,.,.) = 
  2.2256 -0.8400 -0.4712  0.3147 -0.0600
  2.2256 -0.8400 -0.4712  0.3147 -0.0600

(2 ,.,.) = 
 -0.0084  0.5745 -0.0230 -0.5933  0.7945
 -0.0084  0.5745 -0.0230 -0.5933  0.7945
[torch.FloatTensor of size 3x2x5]

Variable containing:
(0 ,.,.) = 
 -0.0084  0.5745 -0.0230 -0.5933  0.7945
  2.2256 -0.8400 -0.4712  0.3147 -0.0600
  2.2256 -0.8400 -0.4712  0.3147 -0.0600

(1 ,.,.) = 
  2.2256 -0.8400 -0.4712  0.3147 -0.0600
 -0.0084  0.5745 -0.0230 -0.5933  0.7945
 -0.0084  0.5745 -0.0230 -0.5933  0.7945
[torch.FloatTensor of size 2x3x5]



In [2]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])]

char_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        for char in word:
            if char not in char_to_ix:
                char_to_ix[char] = len(char_to_ix)
print(char_to_ix)

{'v': 11, 'd': 3, 'o': 4, 'y': 13, 't': 7, 'a': 6, 'r': 12, 'b': 14, 'e': 2, 'T': 0, 'E': 10, 'l': 9, 'g': 5, 'k': 15, 'p': 8, 'h': 1}


In [3]:
print(len(char_to_ix))

16


In [4]:
def prepare_words(sentence, char_to_ix):
    fillchar = 16
    length = 9
    lookupTensor = []
    for w in sentence:
        idxs = [char_to_ix[char] for char in w]
        while len(idxs) < length:
            idxs.append(fillchar)
        lookupTensor.append(idxs)
    return Variable(torch.LongTensor(lookupTensor))

In [5]:
print(prepare_words(training_data[0][0], char_to_ix))

Variable containing:
    0     1     2    16    16    16    16    16    16
    3     4     5    16    16    16    16    16    16
    6     7     2    16    16    16    16    16    16
    7     1     2    16    16    16    16    16    16
    6     8     8     9     2    16    16    16    16
[torch.LongTensor of size 5x9]



In [6]:
print(prepare_words(training_data[1][0], char_to_ix))

Variable containing:
   10    11     2    12    13    14     4     3    13
   12     2     6     3    16    16    16    16    16
    7     1     6     7    16    16    16    16    16
   14     4     4    15    16    16    16    16    16
[torch.LongTensor of size 4x9]



In [7]:
print(len(prepare_words(training_data[1][0], char_to_ix)))

4


In [8]:
def prepare_sequence(seq, word_to_ix):
    idxs = [word_to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)

In [9]:
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

{'Everybody': 5, 'read': 6, 'ate': 2, 'the': 3, 'The': 0, 'book': 8, 'apple': 4, 'dog': 1, 'that': 7}


In [10]:
class LSTMTagger(nn.Module):

    def __init__(self, char_embedding_dim, charLSTM_hidden_dim, char_vocab_size, 
                 word_embedding_dim, wordLSTM_hidden_dim, word_vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        
        self.charLSTM_hidden_dim = charLSTM_hidden_dim
        self.wordLSTM_hidden_dim = wordLSTM_hidden_dim

        self.char_embeddings = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.word_embeddings = nn.Embedding(word_vocab_size, word_embedding_dim)

        # The LSTM takes char embeddings as inputs, and outputs hidden states
        # with dimensionality char_hidden_dim.
        self.charLSTM = nn.LSTM(char_embedding_dim, charLSTM_hidden_dim)
        self.wordLSTM = nn.LSTM(word_embedding_dim + charLSTM_hidden_dim, wordLSTM_hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(wordLSTM_hidden_dim, tagset_size)
        self.charLSTM_hidden = None
        self.wordLSTM_hidden = self.init_wordLSTM_hidden()

    def init_charLSTM_hidden(self, batch_size):
        return (Variable(torch.zeros(1, batch_size, self.charLSTM_hidden_dim)),
                Variable(torch.zeros(1, batch_size, self.charLSTM_hidden_dim)))
                                              
    def init_wordLSTM_hidden(self):
    # Before we've done anything, we dont have any hidden state.
    # Refer to the Pytorch documentation to see exactly
    # why they have this dimensionality.
    # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, 1, wordLSTM_hidden_dim)), 
                Variable(torch.zeros(1, 1, wordLSTM_hidden_dim)))

    def forward(self, char_level_sentence, word_level_sentence):
        char_embeds = self.char_embeddings(char_level_sentence)
        char_lstm_out, self.charLSTM_hidden = self.charLSTM(char_embeds.view(9, len(char_level_sentence), -1), self.charLSTM_hidden)
        word_embeds = self.word_embeddings(word_level_sentence)
        char_level_rep = self.charLSTM_hidden[0].view(len(char_level_sentence), -1)
        wordLSTM_input = torch.cat([word_embeds, char_level_rep], 1) 
        word_lstm_out, self.wordLSTM_hidden = self.wordLSTM(wordLSTM_input.view(len(word_level_sentence), 1, -1), self.wordLSTM_hidden)
        tag_space = self.hidden2tag(word_lstm_out.view(len(word_level_sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

In [11]:
char_embedding_dim, charLSTM_hidden_dim, char_vocab_size = 4, 6, len(char_to_ix)+1      
word_embedding_dim, wordLSTM_hidden_dim, word_vocab_size = 8, 10, len(word_to_ix)
tagset_size = len(tag_to_ix)

model = LSTMTagger(char_embedding_dim, charLSTM_hidden_dim, char_vocab_size,
                   word_embedding_dim, wordLSTM_hidden_dim, word_vocab_size,
                   tagset_size)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [12]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i
char_level_sentence = prepare_words(training_data[1][0], char_to_ix)
word_level_sentence = prepare_sequence(training_data[1][0], word_to_ix)

model.wordLSTM_hidden = model.init_wordLSTM_hidden()
model.charLSTM_hidden = model.init_charLSTM_hidden(len(char_level_sentence))

tag_scores = model(char_level_sentence, word_level_sentence)
print(tag_scores)

Variable containing:
-1.1086 -0.9924 -1.2063
-1.0558 -1.0907 -1.1517
-0.9301 -1.2282 -1.1626
-0.9934 -1.2685 -1.0543
[torch.FloatTensor of size 4x3]



In [14]:
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.wordLSTM_hidden = model.init_wordLSTM_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        char_level_sentence = prepare_words(sentence, char_to_ix)
        word_level_sentence = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        model.charLSTM_hidden = model.init_charLSTM_hidden(len(char_level_sentence))

        # Step 3. Run our forward pass.
        tag_scores = model(char_level_sentence, word_level_sentence)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

In [19]:
# See what the scores are after training
char_level_sentence = prepare_words(training_data[0][0], char_to_ix)
word_level_sentence = prepare_sequence(training_data[0][0], word_to_ix)

model.wordLSTM_hidden = model.init_wordLSTM_hidden()
model.charLSTM_hidden = model.init_charLSTM_hidden(len(char_level_sentence))

tag_scores = model(char_level_sentence, word_level_sentence)
print(tag_scores)
_, preds = torch.max(tag_scores.data, 1)
print(preds)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!

Variable containing:
-0.0037 -5.6912 -7.9595
-6.6330 -0.0037 -6.0447
-6.6286 -5.8357 -0.0043
-0.0010 -7.6598 -7.5149
-7.0904 -0.0013 -7.6986
[torch.FloatTensor of size 5x3]


 0
 1
 2
 0
 1
[torch.LongTensor of size 5x1]



In [20]:
# See what the scores are after training
char_level_sentence = prepare_words(training_data[1][0], char_to_ix)
word_level_sentence = prepare_sequence(training_data[1][0], word_to_ix)

model.wordLSTM_hidden = model.init_wordLSTM_hidden()
model.charLSTM_hidden = model.init_charLSTM_hidden(len(char_level_sentence))

tag_scores = model(char_level_sentence, word_level_sentence)
print(tag_scores)
_, preds = torch.max(tag_scores.data, 1)
print(preds)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!

Variable containing:
-6.4791 -0.0026 -6.8495
-7.4138 -6.5673 -0.0020
-0.0007 -7.7771 -8.2438
-7.0292 -0.0020 -6.7628
[torch.FloatTensor of size 4x3]


 1
 2
 0
 1
[torch.LongTensor of size 4x1]

