In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x7f7850f06e28>

In [28]:
lstm = nn.LSTM(3, 3) # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn(1, 3))
          for _ in range(5)]
inputs

[Variable containing:
 -0.5060 -0.0125  0.8425
 [torch.FloatTensor of size 1x3], Variable containing:
  0.7640 -0.9862 -1.9526
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.3435  0.2060 -0.6058
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.3751 -0.1837 -0.0318
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.2993 -1.0755 -0.1291
 [torch.FloatTensor of size 1x3]]

In [22]:
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn(1, 1, 3)))

hidden

(Variable containing:
 (0 ,.,.) = 
  -0.4621 -0.5060  1.1233
 [torch.FloatTensor of size 1x1x3], Variable containing:
 (0 ,.,.) = 
   0.4800 -0.0344 -0.4928
 [torch.FloatTensor of size 1x1x3])

In [29]:
inputs[0].view(1, 1, -1)

Variable containing:
(0 ,.,.) = 
 -0.5060 -0.0125  0.8425
[torch.FloatTensor of size 1x1x3]

In [30]:
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
#     print(out)
#     print(hidden)

In [32]:
# alternatively, we can do the entire sequence all at once.

inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(
    torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.0463 -0.2616  0.0733

(1 ,.,.) = 
  0.0946 -0.1927 -0.0841

(2 ,.,.) = 
  0.1387  0.0398 -0.1610

(3 ,.,.) = 
  0.1861  0.1196 -0.0948

(4 ,.,.) = 
  0.2235  0.1454 -0.0428
[torch.FloatTensor of size 5x1x3]

(Variable containing:
(0 ,.,.) = 
  0.2235  0.1454 -0.0428
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
  0.4146  0.5658 -0.2482
[torch.FloatTensor of size 1x1x3]
)


In [37]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

In [33]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])]

word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'Everybody': 5, 'ate': 2, 'apple': 4, 'that': 7, 'read': 6, 'dog': 1, 'book': 8, 'the': 3, 'The': 0}


In [34]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

In [44]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [45]:
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)

Variable containing:
-1.2865 -1.0214 -1.0115
-1.2074 -1.0607 -1.0362
-1.2565 -1.0547 -1.0022
-1.2158 -1.0656 -1.0244
-1.2071 -1.0159 -1.0821
[torch.FloatTensor of size 5x3]



In [46]:
training_data[0][0]

['The', 'dog', 'ate', 'the', 'apple']

In [48]:
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward(retain_graph=True)
        optimizer.step()

In [49]:
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)

Variable containing:
-0.1502 -2.0514 -4.5158
-5.8275 -0.0116 -4.7582
-3.5487 -4.8382 -0.0374
-0.0372 -4.1244 -3.8944
-5.8471 -0.0086 -5.1687
[torch.FloatTensor of size 5x3]



In [50]:
class CharLvlRep(nn.Module):
    def __init__(self, embedding_dim, rep_dim, char_size):  
        super(CharLvlRep, self).__init__()
        self.char_embeddings = nn.Embedding(char_size, embedding_dim).cuda()
        self.lstm = nn.LSTM(embedding_dim, rep_dim).cuda()
        
    def forward(self, word, lstm_istate, is_train=False):
        embeds = self.char_embeddings(word)
        lstm_istate_var = (autograd.Variable(lstm_istate[0], requires_grad=False, volatile=not is_train),
                           autograd.Variable(lstm_istate[1], requires_grad=False, volatile=not is_train))
        char_reps, _ = self.lstm(embeds.view(len(word), 1, -1), lstm_istate_var)
        final_char_rep = (char_reps[char_reps.size()[0]-1, :, :])
        return final_char_rep

In [51]:
class LSTMTagger_(nn.Module):
    def __init__(self, char_embedding_dim, char_rep_dim, char_size, 
                 word_embedding_dim, vocab_size, hidden_dim, tagset_size):
        super(LSTMTagger, self).__init__()
        self.char_rep_dim = char_rep_dim
        self.model_char = CharLvlRep(char_embedding_dim, char_rep_dim, char_size)
        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim).cuda()
        self.lstm = nn.LSTM(word_embedding_dim + char_rep_dim, hidden_dim).cuda()
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size).cuda()

    def forward(self, sentence, words, word_lstm_istate, char_lstm_istate, is_train=False):
        word_embeds = self.word_embeddings(sentence)
        word_embeds = word_embeds.view(len(sentence), 1, -1)
      
        char_reps = autograd.Variable(torch.zeros(len(words),1,self.char_rep_dim).cuda(), volatile=not is_train)
        for idx, word in enumerate(words):
            char_reps[idx, :, :] = self.model_char(word, char_lstm_istate, is_train)
      
        embeds_cat = torch.cat((word_embeds, char_reps), dim=2)
      
        word_lstm_istate_var = (autograd.Variable(word_lstm_istate[0], requires_grad=False, volatile=not is_train),
                                autograd.Variable(word_lstm_istate[1], requires_grad=False, volatile=not is_train))
      
        lstm_out, _ = self.lstm(embeds_cat, word_lstm_istate_var)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

In [None]:
for epoch in range(N_EPOCH):
    is_train = True
    train_accuracy = 0.0
    for idx, (sentence, tags) in enumerate(train_data):
    # new training sequence -> zero the gradients of all models
    model.zero_grad()
    
    # one-hot encoding of chars in words
    words_in = []
    
    for word in sentence:
        words_in.append(prepare_sequence(word, char_to_ix, is_train))
    
    # one-hot encoding of words in sentence
    sentence_in = prepare_sequence(sentence, word_to_ix, is_train)
    
    # compute the scores (forward pass)
    tag_scores = model(sentence_in, words_in, WORD_ISTATE, CHAR_ISTATE, is_train)
    
    # one-hot encoding of the labels for each word
    targets = prepare_sequence(tags, tag_to_ix, is_train)
    
    # compute the accuracy
    train_accuracy += get_accuracy(tag_scores, targets)
    
    # compute the loss, gradients, and update the parameters by
    # calling optimizer.step()
    loss = loss_function(tag_scores, targets)
    loss.backward()
    optimizer.step()
    
    loss_hyst.append(loss.data[0])
    
    if idx % PRINT_EVERY == 0:
        print("It {}: loss = {}".format(idx,loss.data[0]))
  
    train_accuracy /= len(train_data)
    train_accuracy_hyst.append(train_accuracy.data[0])
    print("Epoch {}: train_accuracy = {}".format(epoch, train_accuracy.data[0]))
  
    # evaluate the validation accuracy after each epoch
    is_train = False
    valid_accuracy = 0.0
    for sentence, tags in valid_data:
        # one-hot encoding of chars in words
        words_in = []
    
    for word in sentence:
        words_in.append(prepare_sequence(word, char_to_ix, is_train))
    
    # one-hot encoding of words in sentence
    sentence_in = prepare_sequence(sentence, word_to_ix, is_train)
    
    # compute the scores (forward pass)
    tag_scores = model(sentence_in, words_in, WORD_ISTATE, CHAR_ISTATE, is_train)
    
    # one-hot encoding of the labels for each word
    targets = prepare_sequence(tags, tag_to_ix, is_train)
    
    # compute the accuracy
    valid_accuracy += get_accuracy(tag_scores, targets)
    
    valid_accuracy /= len(valid_data)
    valid_accuracy_hyst.append(valid_accuracy.data[0])
    print("Epoch {}: valid_accuracy = {}".format(epoch, valid_accuracy.data[0]))
  
    # save the best model so far
    if valid_accuracy.data[0] > best_valid_accuracy:
        torch.save(model.state_dict(), SAVE_PATH)
        best_valid_accuracy = valid_accuracy.data[0]