In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import string
from collections import defaultdict
import numpy as np
from tqdm import tqdm_notebook as tqdm

torch.manual_seed(1)

<torch._C.Generator at 0x27c88d43c10>

In [2]:
# SENTENCE LSTM EXAMPLE


# We imagine we have a  word embedding space of space of dim 10 and the LSTM returns an embedding of dim 5
lstm = nn.LSTM(10, 5) 
inputs = [torch.randn(1, 10) for _ in range(5)]  # make a sentence of length 5
print(inputs)

# initialize the hidden state.
hidden = (torch.randn(1, 1, 5),
          torch.randn(1, 1, 5))

for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

## or all at once

inputs = torch.cat(inputs).view(len(inputs), 1, -1)
print(inputs)
# clean out hidden state like above
hidden = (torch.randn(1, 1, 5),
          torch.randn(1, 1, 5))  

out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

[tensor([[ 1.4697, -0.3951, -0.5101,  1.1163, -0.5926,  0.9089, -1.0792, -0.6751,
          1.4083, -1.8456]]), tensor([[-0.5655, -0.9094, -0.4891, -0.6175,  0.3836,  0.3852,  0.7890,  1.1363,
         -0.1147, -0.0180]]), tensor([[ 0.0491,  0.4272, -0.8996,  0.5313,  0.4034,  1.4521, -2.4182, -1.1906,
          0.6964,  1.1296]]), tensor([[ 0.2214, -0.0558,  1.2057,  1.9486, -0.0766, -0.8562, -0.7870, -0.8161,
          0.5470, -1.1707]]), tensor([[-0.4699, -1.6271, -0.1127,  1.5980, -0.8445, -1.0489,  0.9387,  0.5378,
          1.5372, -0.6943]])]
tensor([[[ 1.4697, -0.3951, -0.5101,  1.1163, -0.5926,  0.9089, -1.0792,
          -0.6751,  1.4083, -1.8456]],

        [[-0.5655, -0.9094, -0.4891, -0.6175,  0.3836,  0.3852,  0.7890,
           1.1363, -0.1147, -0.0180]],

        [[ 0.0491,  0.4272, -0.8996,  0.5313,  0.4034,  1.4521, -2.4182,
          -1.1906,  0.6964,  1.1296]],

        [[ 0.2214, -0.0558,  1.2057,  1.9486, -0.0766, -0.8562, -0.7870,
          -0.8161,  0.5470, -1.1

In [3]:
# LETTERS LSTM EXAMPLE


# letters have an embedding space of dim 26 and the LSTM returns an embedding of dim say 4
lstm = nn.LSTM(26, 4) 
inputs = [torch.randn(1, 26) for _ in range(5)]  # make a word of length 5 letters
print(inputs)

# initialize the hidden state.
hidden = (torch.randn(1, 1, 4),
          torch.randn(1, 1, 4))

for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

## or all at once

inputs = torch.cat(inputs).view(len(inputs), 1, -1)
print(inputs)
# clean out hidden state like above
hidden = (torch.randn(1, 1, 4),
          torch.randn(1, 1, 4))  

out, hidden = lstm(inputs, hidden)
print(out)

# the hidden layer will be our character level embedding
# it will be concatenated to our word embedding, before being fed into the sentence level LSTM
print(hidden)

[tensor([[-1.5116,  0.1907,  0.2044,  0.1639, -1.2945, -0.1286, -0.0571, -0.0711,
          1.1658,  0.1701, -1.0919,  0.0826,  0.0131, -1.1464, -0.2603, -0.3115,
          0.8936, -1.0561, -0.5676,  0.4355,  0.0008, -0.0789,  0.8751,  0.5548,
         -0.9748, -0.5950]]), tensor([[ 0.8703,  0.8653,  0.2820,  0.9745,  0.1880, -0.9599, -0.4766, -2.0444,
          1.3598, -0.3804,  1.5289, -1.6573,  0.0821,  1.1764,  0.0877, -0.9174,
          0.9862,  0.4376,  0.0470, -0.0343, -0.0269, -0.3772,  0.2464,  0.5727,
          0.5574,  0.3598]]), tensor([[ 0.9314, -2.4962,  0.7922,  1.1106,  0.1389,  1.1380,  0.5049, -1.1455,
         -0.1846, -1.4921,  0.4998, -0.8199,  0.4210, -1.7431, -1.6584,  1.7252,
          1.3536,  1.6513,  0.0011, -0.8320,  0.6357, -0.7775, -0.0557, -0.0443,
          2.0859,  0.1051]]), tensor([[ 0.1401, -1.0772,  1.1557, -1.0234,  0.9199,  1.3019,  0.9390,  0.8462,
          0.9443, -0.7599,  0.6024,  0.1352, -0.6365,  0.0678,  1.0910, -0.1847,
          0.0786, 

In [4]:
training_data = [
    ("The dog ate the apple".lower().split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".lower().split(), ["NN", "V", "DET", "NN"])
]

letter_id = dict(zip(string.ascii_letters[:26],range(26)))

word_id = defaultdict(lambda:len(word_id))
all_tags = set()

for sent, tags in training_data:
    all_tags |= set(tags)
    for word in sent:
        word_id[word]
tag_id = dict(zip(all_tags,range(len(all_tags))))
word_id = dict(word_id)

print(letter_id)
print(word_id)
print(tag_id)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25}
{'the': 0, 'dog': 1, 'ate': 2, 'apple': 3, 'everybody': 4, 'read': 5, 'that': 6, 'book': 7}
{'DET': 0, 'V': 1, 'NN': 2}


In [5]:


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def get_letter_vector(letter):
    v = np.zeros(26)
    v[letter_id[letter]] = 1
    return torch.tensor(v,dtype=torch.long)

def get_word_vector(word):
  return torch.cat([get_letter_vector(letter) for letter in word]).view(-1,26)


In [6]:
prepare_sequence('the ate'.split(),word_id)

tensor([0, 2])

In [7]:
get_letter_vector('b')

tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])

In [8]:
get_word_vector('ab')

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])

In [9]:
N_LETTERS = 26

WORD_EMBEDDING_SIZE = 6
HIDDEN_DIM_LETTER_LSTM = 5
TOTAL_EMBEDDING_SIZE = WORD_EMBEDDING_SIZE + HIDDEN_DIM_LETTER_LSTM

HIDDEN_DIM_FULL_LSTM = 6

VOCAB_SIZE = len(word_id)

TAGSET_SIZE = len(tag_id)


In [10]:
def init_hidden(dim):
        return (torch.zeros(1, 1, dim),
                torch.zeros(1, 1, dim))

In [11]:
## Try the simple LSTM by hand (no letter embedding)


nn_word_embeddings = nn.Embedding(VOCAB_SIZE, WORD_EMBEDDING_SIZE)
lstm_full = nn.LSTM(WORD_EMBEDDING_SIZE, HIDDEN_DIM_FULL_LSTM)  
hidden_full = init_hidden(HIDDEN_DIM_FULL_LSTM)

sentence = 'the dog ate the apple'.split()
n_words = len(sentence)
word_ids = prepare_sequence(sentence,word_id)
print(word_ids)
word_embeddings = nn_word_embeddings(word_ids)
print(word_embeddings)


for word_embedding in word_embeddings:
    print(word_embedding.dtype)
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden_full = lstm_full(word_embedding.view(1, 1, -1), hidden_full)

print(out)
# out is our final embedding
print(hidden_full)

tensor([0, 1, 2, 0, 3])
tensor([[-0.5517,  0.8091, -0.4110,  0.5568,  0.5924,  0.4504],
        [ 1.0375,  0.0895, -1.9281,  0.4494,  0.8918,  0.7742],
        [-0.9697, -0.6761,  0.7772,  0.6192,  1.6007, -0.9435],
        [-0.5517,  0.8091, -0.4110,  0.5568,  0.5924,  0.4504],
        [-1.1967, -0.4981, -0.4123,  0.6369,  0.9843, -1.1036]],
       grad_fn=<EmbeddingBackward>)
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
tensor([[[ 0.1472,  0.3769, -0.4417,  0.3877,  0.1941,  0.0209]]],
       grad_fn=<CatBackward>)
(tensor([[[ 0.1472,  0.3769, -0.4417,  0.3877,  0.1941,  0.0209]]],
       grad_fn=<ViewBackward>), tensor([[[ 0.2729,  0.6183, -0.6431,  0.7131,  0.2420,  0.0385]]],
       grad_fn=<ViewBackward>))


In [12]:
## Try the letter LSTM by hand



lstm_letters = nn.LSTM(N_LETTERS, HIDDEN_DIM_LETTER_LSTM)  
hidden_letters = init_hidden(HIDDEN_DIM_LETTER_LSTM)

word = 'orange'
n_letters = len(word)
letter_ids = get_word_vector(word).type(torch.FloatTensor)
print(letter_ids)


for letter in letter_ids:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    letter_out, hidden_letters = lstm_letters(letter.view(1, 1, -1), hidden_letters)

print(letter_out)
# letter_out is our letter level embedding for the word
print(hidden_letters)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[[-0.3223, -0.3033,  0.0765, -0.1239,  0.2630]]], grad_fn=<CatBackward>)
(tensor([[[-0.3223, -0.3033,  0.0765, -0.1239,  0.2630]]], grad_fn=<ViewBackward>), tensor([[[-0.6339, -0.5351,  0.1922, -0.2950,  0.4478]]], grad_fn=<ViewBackward>))


In [15]:
## Putting it all together



lstm_letters = nn.LSTM(N_LETTERS, HIDDEN_DIM_LETTER_LSTM)  
hidden_letters = init_hidden(HIDDEN_DIM_LETTER_LSTM)

nn_word_embeddings = nn.Embedding(VOCAB_SIZE, WORD_EMBEDDING_SIZE)
# the full LSTM now takes a larger input : concatenating the original word embedding + letter level embedding (output of first LSTM)
lstm_full = nn.LSTM(WORD_EMBEDDING_SIZE+HIDDEN_DIM_LETTER_LSTM, HIDDEN_DIM_FULL_LSTM)  
hidden_full = init_hidden(HIDDEN_DIM_FULL_LSTM)




sentence = 'the dog ate the apple'.split()
n_words = len(sentence)
word_ids = prepare_sequence(sentence,word_id)
print(word_ids)
word_embeddings = nn_word_embeddings(word_ids)
print(word_embeddings)
    
all_outs = []

for pos in range(n_words):
    word_embedding = word_embeddings[pos]
    print(word_embedding.dtype)
    
    word = sentence[pos]
    n_letters = len(word)
    letter_ids = get_word_vector(word).type(torch.FloatTensor)


    
    for letter in letter_ids:
        letter_out, hidden_letters = lstm_letters(letter.view(1, 1, -1), hidden_letters)

    # we prepare to concatenate the word embedding and letter level embedding
    letter_out = letter_out.view(-1,1)
    word_embedding = word_embedding.view(-1,1)
    full_lstm_input = torch.cat([word_embedding,letter_out])

    # and can then feed this into the full LSTM
    out, hidden_full = lstm_full(full_lstm_input.view(1, 1, -1), hidden_full)
    all_outs.append(out[-1])


# out is our final embedding
print(all_outs)



tensor([0, 1, 2, 0, 3])
tensor([[-0.1382,  0.6345, -0.2083, -0.0995,  1.3494, -0.5304],
        [-0.0582, -1.3537,  0.6772,  0.6895,  0.2098,  1.0609],
        [ 0.1944,  0.2364,  0.4106,  1.9754,  1.5703, -1.1593],
        [-0.1382,  0.6345, -0.2083, -0.0995,  1.3494, -0.5304],
        [-1.4360, -0.0371, -2.2927, -1.7063,  0.0197, -1.6556]],
       grad_fn=<EmbeddingBackward>)
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
[tensor([[-0.1716,  0.1784,  0.0308,  0.0682, -0.0352,  0.1832]],
       grad_fn=<SelectBackward>), tensor([[-0.0274,  0.2264,  0.1154,  0.0255, -0.0347, -0.0906]],
       grad_fn=<SelectBackward>), tensor([[-0.1173,  0.2796,  0.1380, -0.0939, -0.2960,  0.1391]],
       grad_fn=<SelectBackward>), tensor([[-0.2909,  0.2802,  0.1335, -0.0009, -0.2475,  0.2279]],
       grad_fn=<SelectBackward>), tensor([[-0.2872,  0.1694,  0.2392,  0.4215, -0.4810,  0.1172]],
       grad_fn=<SelectBackward>)]


Create the model:



In [23]:
class CharLevelLSTM(nn.Module):

    def __init__(self):
        super(CharLevelLSTM, self).__init__()

        self.lstm_letters = nn.LSTM(N_LETTERS, HIDDEN_DIM_LETTER_LSTM)  
        self.hidden_letters = init_hidden(HIDDEN_DIM_LETTER_LSTM)

        self.nn_word_embeddings = nn.Embedding(VOCAB_SIZE, WORD_EMBEDDING_SIZE)
        # the full LSTM now takes a larger input : concatenating the original word embedding + letter level embedding (output of first LSTM)
        self.lstm_full = nn.LSTM(WORD_EMBEDDING_SIZE+HIDDEN_DIM_LETTER_LSTM, HIDDEN_DIM_FULL_LSTM)  
        self.hidden_full = init_hidden(HIDDEN_DIM_FULL_LSTM)


        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(HIDDEN_DIM_FULL_LSTM, TAGSET_SIZE)
        

    

    def forward(self, sentence):
        # note the input is now actual words, not word_ids

        n_words = len(sentence)

        word_ids = prepare_sequence(sentence,word_id)
        # print(word_ids)
        word_embeddings = self.nn_word_embeddings(word_ids)
        # print(word_embeddings)

        all_outs = []
        
        for pos in range(n_words):
            word_embedding = word_embeddings[pos]
            # print(word_embedding.dtype)
            
            word = sentence[pos]
            n_letters = len(word)
            letter_ids = get_word_vector(word).type(torch.FloatTensor)
            
            # if we want each character level word embedding to be independent from the previous one
            self.hidden_letters = init_hidden(HIDDEN_DIM_LETTER_LSTM)
            
            for letter in letter_ids:
                letter_out, self.hidden_letters = self.lstm_letters(letter.view(1, 1, -1), self.hidden_letters)

            # we prepare to concatenate the word embedding and letter level embedding
            letter_out = letter_out.view(-1,1)
            word_embedding = word_embedding.view(-1,1)
            full_lstm_input = torch.cat([word_embedding,letter_out])

            # and can then feed this into the full LSTM
            out, self.hidden_full = self.lstm_full(full_lstm_input.view(1, 1, -1), self.hidden_full)
            
            all_outs.append(out[-1])

            
        all_outs = torch.cat(all_outs)
        # print(out)
        # out is our final embedding
        # print(hidden_full)

        tag_space = self.hidden2tag(all_outs.view(n_words, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [24]:
training_data[0][0]

['the', 'dog', 'ate', 'the', 'apple']

Train the model:



In [25]:
model = CharLevelLSTM()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    tag_scores = model(training_data[0][0])
    print(tag_scores)


tensor([[-0.8660, -1.3830, -1.1130],
        [-0.8799, -1.2933, -1.1686],
        [-0.8404, -1.3016, -1.2162],
        [-0.8964, -1.2955, -1.1450],
        [-0.9046, -1.3236, -1.1113]])


In [32]:
model = CharLevelLSTM()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


for epoch in tqdm(range(300)):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden_full = init_hidden(HIDDEN_DIM_FULL_LSTM)

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        targets = prepare_sequence(tags, tag_id)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()


HBox(children=(IntProgress(value=0, max=300), HTML(value='')))




In [33]:

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_id)
    tag_scores = model(training_data[0][0])

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-0.0266, -4.5385, -4.1618],
        [-4.5295, -5.0415, -0.0174],
        [-2.9534, -0.0713, -4.0976],
        [-0.0220, -4.3912, -4.6667],
        [-4.1935, -4.9556, -0.0224]])


In [47]:

inverse_tag_id = {v:k for k,v in tag_id.items()}


print(list(zip(training_data[0][0],(map(inverse_tag_id.get,tag_scores.numpy().argmax(axis=1))))))

[('the', 'DET'), ('dog', 'NN'), ('ate', 'V'), ('the', 'DET'), ('apple', 'NN')]


In [48]:
tags

['NN', 'V', 'DET', 'NN']