In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import string
from collections import defaultdict
import numpy as np

torch.manual_seed(1)

<torch._C.Generator at 0x7f0531ff3270>

In [0]:
# SENTENCE LSTM EXAMPLE


# We imagine we have a  word embedding space of space of dim 10 and the LSTM returns an embedding of dim 5
lstm = nn.LSTM(10, 5) 
inputs = [torch.randn(1, 10) for _ in range(5)]  # make a sentence of length 5
print(inputs)

# initialize the hidden state.
hidden = (torch.randn(1, 1, 5),
          torch.randn(1, 1, 5))

for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

## or all at once

inputs = torch.cat(inputs).view(len(inputs), 1, -1)
print(inputs)
# clean out hidden state like above
hidden = (torch.randn(1, 1, 5),
          torch.randn(1, 1, 5))  

out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

[tensor([[ 1.4697, -0.3951, -0.5101,  1.1163, -0.5926,  0.9089, -1.0792, -0.6751,
          1.4083, -1.8456]]), tensor([[-0.5655, -0.9094, -0.4891, -0.6175,  0.3836,  0.3852,  0.7890,  1.1363,
         -0.1147, -0.0180]]), tensor([[ 0.0491,  0.4272, -0.8996,  0.5313,  0.4034,  1.4521, -2.4182, -1.1906,
          0.6964,  1.1296]]), tensor([[ 0.2214, -0.0558,  1.2057,  1.9486, -0.0766, -0.8562, -0.7870, -0.8161,
          0.5470, -1.1707]]), tensor([[-0.4699, -1.6271, -0.1127,  1.5980, -0.8445, -1.0489,  0.9387,  0.5378,
          1.5372, -0.6943]])]
tensor([[[ 1.4697, -0.3951, -0.5101,  1.1163, -0.5926,  0.9089, -1.0792,
          -0.6751,  1.4083, -1.8456]],

        [[-0.5655, -0.9094, -0.4891, -0.6175,  0.3836,  0.3852,  0.7890,
           1.1363, -0.1147, -0.0180]],

        [[ 0.0491,  0.4272, -0.8996,  0.5313,  0.4034,  1.4521, -2.4182,
          -1.1906,  0.6964,  1.1296]],

        [[ 0.2214, -0.0558,  1.2057,  1.9486, -0.0766, -0.8562, -0.7870,
          -0.8161,  0.5470, -1.1

In [0]:
# LETTERS LSTM EXAMPLE


# letters have an embedding space of dim 26 and the LSTM returns an embedding of dim say 4
lstm = nn.LSTM(26, 4) 
inputs = [torch.randn(1, 26) for _ in range(5)]  # make a word of length 5 letters
print(inputs)

# initialize the hidden state.
hidden = (torch.randn(1, 1, 4),
          torch.randn(1, 1, 4))

for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

## or all at once

inputs = torch.cat(inputs).view(len(inputs), 1, -1)
print(inputs)
# clean out hidden state like above
hidden = (torch.randn(1, 1, 4),
          torch.randn(1, 1, 4))  

out, hidden = lstm(inputs, hidden)
print(out)

# the hidden layer will be our character level embedding
# it will be concatenated to our word embedding, before being fed into the sentence level LSTM
print(hidden)

[tensor([[-0.8574,  1.8176,  1.1826, -1.3298,  0.5813,  0.5272,  1.7288, -0.4040,
         -0.8616,  1.7713, -0.5801, -1.8529,  0.7450,  0.6500,  1.9265,  0.7164,
         -0.1715, -1.3550, -1.1653,  1.6195,  0.3746, -0.8004, -1.1944,  0.5411,
          1.7009, -0.1816]]), tensor([[-1.2272, -0.9012,  0.5480, -2.0712,  0.4554, -1.2638,  0.2165,  0.7955,
         -1.1174, -0.5562,  0.7357, -0.0418, -0.7948, -1.4730,  1.8940,  0.1557,
          0.6981,  0.7614,  2.2613, -0.4493, -0.1287, -0.4660,  1.7010, -1.4658,
         -0.1103,  1.0926]]), tensor([[-0.5007,  0.1164, -0.7695,  2.2969, -0.2332,  0.4124, -1.2673,  1.5329,
          1.4025,  0.8013, -1.3451, -0.9675, -1.8558,  0.3900,  0.8584,  0.6327,
         -0.0534,  0.1181, -0.8883, -0.3490, -0.4198,  1.0553, -0.6366,  1.6225,
          0.2911,  0.0630]]), tensor([[ 0.0054,  0.1899, -0.2596, -1.7663,  0.8808, -1.4067,  2.3293,  0.1724,
         -0.0635, -1.3799,  0.3122, -1.0149,  0.0527, -0.5512,  1.1984,  0.2158,
          0.3646, 

In [0]:
training_data = [
    ("The dog ate the apple".lower().split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".lower().split(), ["NN", "V", "DET", "NN"])
]

letter_id = dict(zip(string.ascii_letters[:26],range(26)))

word_id = defaultdict(lambda:len(word_id))
all_tags = set()

for sent, tags in training_data:
    all_tags |= set(tags)
    for word in sent:
        word_id[word]
tag_id = dict(zip(all_tags,range(len(all_tags))))
word_id = dict(word_id)

print(letter_id)
print(word_id)
print(tag_id)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25}
{'the': 0, 'dog': 1, 'ate': 2, 'apple': 3, 'everybody': 4, 'read': 5, 'that': 6, 'book': 7}
{'V': 0, 'NN': 1, 'DET': 2}


In [0]:


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def get_letter_vector(letter):
    v = np.zeros(26)
    v[letter_id[letter]] = 1
    return torch.tensor(v,dtype=torch.long)

def get_word_vector(word):
  return torch.cat([get_letter_vector(letter) for letter in word]).view(-1,26)


In [50]:
prepare_sequence('the ate'.split(),word_id)

tensor([0, 2])

In [0]:
get_letter_vector('b')

tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])

In [51]:
get_word_vector('ab')

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])

In [0]:
N_LETTERS = 26

WORD_EMBEDDING_SIZE = 6
HIDDEN_DIM_LETTER_LSTM = 5
TOTAL_EMBEDDING_SIZE = WORD_EMBEDDING_SIZE + HIDDEN_DIM_LETTER_LSTM

HIDDEN_DIM_FULL_LSTM = 6

VOCAB_SIZE = len(word_id)

TAGSET_SIZE = len(tag_id)


In [0]:
def init_hidden(dim):
        return (torch.zeros(1, 1, dim),
                torch.zeros(1, 1, dim))

In [61]:
## Try the simple LSTM by hand (no letter embedding)


nn_word_embeddings = nn.Embedding(VOCAB_SIZE, WORD_EMBEDDING_SIZE)
lstm_full = nn.LSTM(WORD_EMBEDDING_SIZE, HIDDEN_DIM_FULL_LSTM)  
hidden_full = init_hidden(HIDDEN_DIM_FULL_LSTM)

sentence = 'the dog ate the apple'.split()
n_words = len(sentence)
word_ids = prepare_sequence(sentence,word_id)
print(word_ids)
word_embeddings = nn_word_embeddings(word_ids)
print(word_embeddings)


for word_embedding in word_embeddings:
    print(word_embedding.dtype)
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden_full = lstm_full(word_embedding.view(1, 1, -1), hidden_full)

print(out)
# out is our final embedding
print(hidden_full)

tensor([0, 1, 2, 0, 3])
tensor([[ 1.1187, -1.0463, -0.2531,  0.1002, -0.1832,  2.3387],
        [-1.8146, -0.5631, -0.5191, -1.2617,  0.4364,  1.1755],
        [-1.6904, -0.4445, -0.9425, -0.9120, -0.0203, -0.0618],
        [ 1.1187, -1.0463, -0.2531,  0.1002, -0.1832,  2.3387],
        [-1.4467,  0.7909,  0.7926, -0.6782, -0.4936, -2.6210]],
       grad_fn=<EmbeddingBackward>)
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
tensor([[[ 0.1822,  0.0649, -0.0623, -0.0042,  0.0387, -0.0080]]],
       grad_fn=<StackBackward>)
(tensor([[[ 0.1822,  0.0649, -0.0623, -0.0042,  0.0387, -0.0080]]],
       grad_fn=<StackBackward>), tensor([[[ 0.2880,  0.3061, -0.0800, -0.0076,  0.1170, -0.0522]]],
       grad_fn=<StackBackward>))


In [57]:
## Try the letter LSTM by hand



lstm_letters = nn.LSTM(N_LETTERS, HIDDEN_DIM_LETTER_LSTM)  
hidden_letters = init_hidden(HIDDEN_DIM_LETTER_LSTM)

word = 'orange'
n_letters = len(word)
letter_ids = get_word_vector(word).type(torch.FloatTensor)
print(letter_ids)


for letter in letter_ids:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    letter_out, hidden_letters = lstm_letters(letter.view(1, 1, -1), hidden_letters)

print(letter_out)
# letter_out is our letter level embedding for the word
print(hidden_letters)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[[ 0.1705, -0.1399,  0.1995,  0.2432, -0.0396]]],
       grad_fn=<StackBackward>)
(tensor([[[ 0.1705, -0.1399,  0.1995,  0.2432, -0.0396]]],
       grad_fn=<StackBackward>), tensor([[[ 0.4784, -0.2192,  0.4624,  0.4582, -0.0817]]],
       grad_fn=<Stack

In [62]:
## Putting it all together



lstm_letters = nn.LSTM(N_LETTERS, HIDDEN_DIM_LETTER_LSTM)  
hidden_letters = init_hidden(HIDDEN_DIM_LETTER_LSTM)

nn_word_embeddings = nn.Embedding(VOCAB_SIZE, WORD_EMBEDDING_SIZE)
# the full LSTM now takes a larger input : concatenating the original word embedding + letter level embedding (output of first LSTM)
lstm_full = nn.LSTM(WORD_EMBEDDING_SIZE+HIDDEN_DIM_LETTER_LSTM, HIDDEN_DIM_FULL_LSTM)  
hidden_full = init_hidden(HIDDEN_DIM_FULL_LSTM)




sentence = 'the dog ate the apple'.split()
n_words = len(sentence)
word_ids = prepare_sequence(sentence,word_id)
print(word_ids)
word_embeddings = nn_word_embeddings(word_ids)
print(word_embeddings)


for pos in range(n_words):
    word_embedding = word_embeddings[pos]
    print(word_embedding.dtype)
    
    word = sentence[pos]
    n_letters = len(word)
    letter_ids = get_word_vector(word).type(torch.FloatTensor)

    for letter in letter_ids:
        letter_out, hidden_letters = lstm_letters(letter.view(1, 1, -1), hidden_letters)

    # we prepare to concatenate the word embedding and letter level embedding
    letter_out = letter_out.view(-1,1)
    word_embedding = word_embedding.view(-1,1)
    full_lstm_input = torch.cat([word_embedding,letter_out])

    # and can then feed this into the full LSTM
    out, hidden_full = lstm_full(full_lstm_input.view(1, 1, -1), hidden_full)


print(out)
# out is our final embedding
print(hidden_full)



tensor([0, 1, 2, 0, 3])
tensor([[ 1.3718,  0.1913,  0.6198, -0.1384,  1.3611,  1.0847],
        [ 1.4003,  0.5050,  1.2077, -2.8796,  0.4563,  0.6948],
        [ 0.6902,  0.2315, -0.3435, -0.8470, -0.0889,  2.3110],
        [ 1.3718,  0.1913,  0.6198, -0.1384,  1.3611,  1.0847],
        [ 0.2111,  0.0057, -1.0064, -0.3516, -0.6723,  0.8524]],
       grad_fn=<EmbeddingBackward>)
torch.float32
torch.float32
torch.float32
torch.float32
torch.float32
tensor([[[ 0.4261,  0.1717, -0.1574,  0.0562, -0.0177, -0.0064]]],
       grad_fn=<StackBackward>)
(tensor([[[ 0.4261,  0.1717, -0.1574,  0.0562, -0.0177, -0.0064]]],
       grad_fn=<StackBackward>), tensor([[[ 0.8067,  0.3278, -0.2929,  0.1296, -0.0433, -0.0210]]],
       grad_fn=<StackBackward>))


Create the model:



In [0]:
class CharLevelLSTM(nn.Module):

    def __init__(self):
        super(CharLevelLSTM, self).__init__()

        self.lstm_letters = nn.LSTM(N_LETTERS, HIDDEN_DIM_LETTER_LSTM)  
        self.hidden_letters = init_hidden(HIDDEN_DIM_LETTER_LSTM)

        self.nn_word_embeddings = nn.Embedding(VOCAB_SIZE, WORD_EMBEDDING_SIZE)
        # the full LSTM now takes a larger input : concatenating the original word embedding + letter level embedding (output of first LSTM)
        self.lstm_full = nn.LSTM(WORD_EMBEDDING_SIZE+HIDDEN_DIM_LETTER_LSTM, HIDDEN_DIM_FULL_LSTM)  
        self.hidden_full = init_hidden(HIDDEN_DIM_FULL_LSTM)


        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(HIDDEN_DIM_FULL_LSTM, TAGSET_SIZE)
        

    

    def forward(self, sentence):
        # note the input is now actual words, not word_ids

        n_words = len(sentence)

        word_ids = prepare_sequence(sentence,word_id)
        # print(word_ids)
        word_embeddings = self.nn_word_embeddings(word_ids)
        # print(word_embeddings)


        for pos in range(n_words):
            word_embedding = word_embeddings[pos]
            # print(word_embedding.dtype)
            
            word = sentence[pos]
            n_letters = len(word)
            letter_ids = get_word_vector(word).type(torch.FloatTensor)

            for letter in letter_ids:
                letter_out, self.hidden_letters = self.lstm_letters(letter.view(1, 1, -1), self.hidden_letters)

            # we prepare to concatenate the word embedding and letter level embedding
            letter_out = letter_out.view(-1,1)
            word_embedding = word_embedding.view(-1,1)
            full_lstm_input = torch.cat([word_embedding,letter_out])

            # and can then feed this into the full LSTM
            out, self.hidden_full = self.lstm_full(full_lstm_input.view(1, 1, -1), self.hidden_full)


        # print(out)
        # out is our final embedding
        # print(hidden_full)

        tag_space = self.hidden2tag(out.view(n_words, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [65]:
training_data[0][0]

['the', 'dog', 'ate', 'the', 'apple']

Train the model:



In [67]:
model = CharLevelLSTM()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    tag_scores = model(training_data[0][0])
    print(tag_scores)


RuntimeError: ignored

In [0]:

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)