Source:
http://seba1511.net/tutorials/beginner/nlp/sequence_models_tutorial.html#annotations:QNRYtvyoEemz3m-NBWCG8A

In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's default random number generator with a fixed value so the
# torch.randn draws in the cells below repeat on a fresh Restart & Run All.
torch.manual_seed(1)

<torch._C.Generator at 0x7f4093fa5f30>

In [24]:
# Small example of an LSTM

# input_size (feature dimension) = 3, hidden_size (here also the output size) = 3
lstm = nn.LSTM(3, 3)

# Create five 1 x 3 vectors to be the inputs, i.e. a sequence of length 5.
# Plain tensors are used directly: the autograd.Variable wrapper is deprecated
# and has simply returned the tensor itself since PyTorch 0.4.
inputs = [torch.randn(1, 3) for _ in range(5)]

# Initialize the LSTM state as a pair of tensors, each with axes
# (num_layers, batch, hidden_size) = (1, 1, 3).
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))

print(inputs)
print("\n")
print(hidden)

[tensor([[ 1.6002,  1.3146, -0.6118]]), tensor([[-0.9419, -0.1675, -1.6990]]), tensor([[-2.0724,  1.5600, -0.5075]]), tensor([[-1.6533, -0.0907, -1.0677]]), tensor([[-0.4728, -0.0388, -0.0063]])]


(tensor([[[-0.1100,  0.1423,  0.2453]]]), tensor([[[-0.6245, -0.7920,  1.2385]]]))


In [25]:
# Walk through the sequence one timestep at a time. Each call returns the
# output for that step together with the updated state, which is fed back
# in on the next iteration.
for stepInput in inputs:
    stepInput3d = stepInput.view(1, 1, -1)  # reshape to (seq_len=1, batch=1, features)
    out, hidden = lstm(stepInput3d, hidden)
    print("out = \n", out, "\nhidden = \n", hidden, "\n")


out = 
 tensor([[[-0.3927, -0.0463,  0.5614]]], grad_fn=<CatBackward>) 
hidden = 
 (tensor([[[-0.3927, -0.0463,  0.5614]]], grad_fn=<ViewBackward>), tensor([[[-0.6644, -0.3326,  0.9072]]], grad_fn=<ViewBackward>)) 

out = 
 tensor([[[-0.1232, -0.0005,  0.2808]]], grad_fn=<CatBackward>) 
hidden = 
 (tensor([[[-0.1232, -0.0005,  0.2808]]], grad_fn=<ViewBackward>), tensor([[[-0.2591, -0.0009,  0.5684]]], grad_fn=<ViewBackward>)) 

out = 
 tensor([[[0.0109, 0.0576, 0.1792]]], grad_fn=<CatBackward>) 
hidden = 
 (tensor([[[0.0109, 0.0576, 0.1792]]], grad_fn=<ViewBackward>), tensor([[[0.0528, 0.1260, 0.6644]]], grad_fn=<ViewBackward>)) 

out = 
 tensor([[[0.0646, 0.1672, 0.2290]]], grad_fn=<CatBackward>) 
hidden = 
 (tensor([[[0.0646, 0.1672, 0.2290]]], grad_fn=<ViewBackward>), tensor([[[0.1667, 0.2820, 0.6441]]], grad_fn=<ViewBackward>)) 

out = 
 tensor([[[0.1181, 0.0708, 0.3723]]], grad_fn=<CatBackward>) 
hidden = 
 (tensor([[[0.1181, 0.0708, 0.3723]]], grad_fn=<ViewBackward>), tensor([[[0

In [26]:
# Alternatively, we can process the entire sequence all at once.
# The first value returned by the LSTM holds the hidden states for every
# timestep of the sequence; the second is just the most recent state
# (compare the last slice of "out" with the first element of "hidden"
# below: they are the same). Both are returned because "out" gives you
# access to all hidden states in the sequence, while "hidden" lets you
# continue the sequence and backpropagate, by passing it as an argument
# to the LSTM at a later time.

# Concatenate the per-step tensors along the rows, then add the extra
# 2nd (batch) dimension so the result has shape (seq_len, 1, features).
# list(...) keeps this cell re-runnable: after a first run `inputs` is
# already a 3-D tensor, which torch.cat does not accept as its sequence
# argument, whereas list(tensor) yields the same per-step slices again.
inputs = torch.cat(list(inputs)).view(len(inputs), 1, -1)

# Clean out the hidden state (erase the previous state for the sake of this
# example). Plain tensors are used; autograd.Variable is a deprecated no-op.
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))

out, hidden = lstm(inputs, hidden)


print("inputs = ", inputs, "\n")
print("out = ", out, "\n")
print("hidden = ", hidden, "\n")

inputs =  tensor([[[ 1.6002,  1.3146, -0.6118]],

        [[-0.9419, -0.1675, -1.6990]],

        [[-2.0724,  1.5600, -0.5075]],

        [[-1.6533, -0.0907, -1.0677]],

        [[-0.4728, -0.0388, -0.0063]]]) 

out =  tensor([[[-0.4970,  0.0303,  0.1713]],

        [[-0.2354,  0.2378,  0.1109]],

        [[-0.0393,  0.1080,  0.1264]],

        [[ 0.0043,  0.1939,  0.2011]],

        [[ 0.0627,  0.0760,  0.3533]]], grad_fn=<CatBackward>) 

hidden =  (tensor([[[0.0627, 0.0760, 0.3533]]], grad_fn=<ViewBackward>), tensor([[[0.1189, 0.1811, 0.7124]]], grad_fn=<ViewBackward>)) 



### Example: An LSTM for Part-of-Speech Tagging

In this section, we will use an LSTM to get part of speech tags. 

The model is as follows: let our input sentence be $w_1, \dots, w_M$, where $w_i \in V$ and $V$ is the vocabulary.

Also let:

- $T$ = the tag set,
- $y_i$ = the tag of word $w_i$,
- $h_i$ = the hidden state at timestep $i$.

The output is a sequence $\hat{y}_1, \dots, \hat{y}_M$, where $\hat{y}_i \in T$.

In [29]:

# Integer class index for each part-of-speech tag, numbered in listing order.
tagToIndex = {tag: index for index, tag in enumerate(("DET", "NN", "V"))}

# Size of the LSTM hidden state and of each word-embedding vector.
HIDDEN_DIM = 6
EMBEDDING_DIM = 6




# Prepare data
def prepareSequence(seq, toIndex):
    """Convert a sequence of tokens into a 1-D tensor of integer indices.

    Args:
        seq: iterable of tokens (e.g. the words of a sentence, or its tags).
        toIndex: mapping from token to integer index.

    Returns:
        A torch.long tensor holding toIndex[w] for each w in seq, in order.

    Raises:
        KeyError: if a token in seq is not a key of toIndex.
    """
    indices = [toIndex[w] for w in seq]
    # torch.tensor with an explicit dtype replaces the legacy torch.LongTensor
    # constructor; the deprecated autograd.Variable wrapper is dropped because
    # since PyTorch 0.4 it just hands back the tensor it was given.
    return torch.tensor(indices, dtype=torch.long)


# Toy corpus: every entry pairs a tokenised sentence with its gold tags.
trainingData = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Build the vocabulary: each distinct word receives the next free index,
# in order of first appearance. setdefault leaves existing entries alone.
wordToIndex = {}
for sentence, _ in trainingData:
    for token in sentence:
        wordToIndex.setdefault(token, len(wordToIndex))

print(wordToIndex)


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [30]:
# Create the model

class LSTMTagger(nn.Module):
    """LSTM sequence tagger: embedding -> LSTM -> linear -> log-softmax.

    The recurrent state is stored on the instance as ``self.hiddenLayer`` and
    is carried over from one ``forward`` call to the next. Assign
    ``self.hiddenLayer = self.initHiddenLayer()`` to reset it (for example
    before feeding a new, unrelated sentence).
    """

    def __init__(self, embeddingDim, hiddenDim, vocabSize, tagsetSize):
        """Build the layers and create the initial (all-zero) LSTM state.

        Args:
            embeddingDim: size of each word-embedding vector.
            hiddenDim: size of the LSTM hidden state.
            vocabSize: number of distinct word indices to embed.
            tagsetSize: number of tags scored for every word.
        """
        super().__init__()
        self.hiddenDim = hiddenDim
        self.wordEmbeddings = nn.Embedding(vocabSize, embeddingDim)

        # The LSTM takes word embeddings as inputs and outputs hidden states
        # with dimensionality hiddenDim.
        self.lstm = nn.LSTM(embeddingDim, hiddenDim)

        # The linear layer that maps from hidden-state space to tag space.
        self.hiddenToTagLayer = nn.Linear(hiddenDim, tagsetSize)
        self.hiddenLayer = self.initHiddenLayer()

    def initHiddenLayer(self):
        """Return a fresh all-zero LSTM state.

        Returns:
            A pair of zero tensors to pass to the LSTM as its state, each
            with axes (numLayers, miniBatchSize, hiddenDim) = (1, 1, hiddenDim).
        """
        # Plain tensors are used; autograd.Variable is a deprecated no-op
        # wrapper since PyTorch 0.4.
        return (torch.zeros(1, 1, self.hiddenDim),
                torch.zeros(1, 1, self.hiddenDim))

    def forward(self, sentence):
        """Score every tag for every word of a single sentence.

        Args:
            sentence: 1-D integer tensor of word indices.

        Returns:
            Tensor of shape (len(sentence), tagsetSize) holding the
            log-softmax tag scores, one row per word.

        Side effects:
            Overwrites ``self.hiddenLayer`` with the state returned by the LSTM.
        """
        seqLen = len(sentence)
        embed = self.wordEmbeddings(sentence)

        # Reshape to (seqLen, batch=1, embeddingDim) before feeding the LSTM.
        lstmOut, self.hiddenLayer = self.lstm(
            embed.view(seqLen, 1, -1),
            self.hiddenLayer
        )

        tagSpace = self.hiddenToTagLayer(lstmOut.view(seqLen, -1))

        # Normalise over the tag axis. dim is passed explicitly because
        # relying on the implicit dimension is deprecated and emits a warning;
        # for this 2-D input the implicit choice was already dim=1.
        tagScores = F.log_softmax(tagSpace, dim=1)

        return tagScores

In [32]:
# Train the model: instantiate the tagger, its loss and its optimizer.
model = LSTMTagger(
    embeddingDim=EMBEDDING_DIM,
    hiddenDim=HIDDEN_DIM,
    vocabSize=len(wordToIndex),
    tagsetSize=len(tagToIndex),
)

# Negative log-likelihood loss, applied to the log-softmax scores the
# tagger returns.
lossFunction = nn.NLLLoss()

# Plain stochastic gradient descent over every model parameter.
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

print(model)

LSTMTagger(
  (wordEmbeddings): Embedding(9, 6)
  (lstm): LSTM(6, 6)
  (hiddenToTagLayer): Linear(in_features=6, out_features=3, bias=True)
)


In [None]:
# See the scores before training