In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
torch.manual_seed(1)


<torch._C.Generator at 0x104e42ef0>

In [None]:
lstm = nn.LSTM(3, 3) # input dim = 3, output dim = 3
inputs = [autograd.Variable(torch.randn(1, 3)) for _ in range(5)] # sequence of length 5
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))

In [None]:
hidden[0]

In [None]:
hidden_states = []
for i in inputs:
    hidden_states.append(hidden)
    j = i.view(1, 1, -1) # the second dim is the batch size
    out, hidden = lstm(j, hidden)
print(hidden_states)

In [None]:
# or, pass through the entire sequence all at once
print(inputs)
inputs = torch.cat(inputs)
print(inputs)

In [None]:
hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(
    torch.randn((1, 1, 3))))  # clean out hidden state, shape is num_layers * minibatch_size * hidden_dim

out, hidden = lstm(inputs, hidden) # out is all of the hidden states, hidden is the last hidden state

In [None]:
print(out)


In [None]:
print(hidden)

In [2]:
def prepare_sequence(sequence, word_to_ix):
    """
    Returns an autograd.Variable that represents a sequence. 
    @param sequence: list of words 
    @param word_to_ix: dict mapping words to indices
    @return tensor - an autograd.Variable representing the sequence
    """
    tensor = torch.LongTensor([word_to_ix[word] for word in sequence])
    return autograd.Variable(tensor)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
    ("The teacher scolded the student".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("She told me that she liked me".split(), ["NN", "V", "NN", "DET", "NN", "V", "NN"])
]

test_data = [
    ("The teacher scolded the student".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("She told me that she liked me".split(), ["NN", "V", "NN", "DET", "NN", "V", "NN"])
]


n = training_data + test_data
vocab = [tup[0] for tup in n]
vocab_set = set()
for v in vocab:
    for word in v:
        vocab_set.add(word)
vocab = vocab_set
print(vocab)

word_to_ix = {word: i for word, i in zip(vocab, (i for i in range(len(vocab))))}
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

EMBEDDING_DIM = 100
HIDDEN_DIM = 20
HIDDEN_DIM_2 = 20


{'told', 'liked', 'The', 'read', 'that', 'me', 'Everybody', 'apple', 'She', 'the', 'teacher', 'she', 'ate', 'book', 'student', 'dog', 'scolded'}
{'told': 0, 'The': 2, 'that': 4, 'liked': 1, 'Everybody': 6, 'me': 5, 'She': 8, 'apple': 7, 'the': 9, 'teacher': 10, 'she': 11, 'ate': 12, 'book': 13, 'student': 14, 'dog': 15, 'scolded': 16, 'read': 3}


In [7]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, fixed_vector_size = 100, hidden_dim_2 = 20):
        """Initializes an LSTM for POS tagging.
        @param embedding_dim - dimensionality of each word vector
        @param hidden_dim - LSTM hidden unit size
        @param vocab_size - number of words in the vocabulary
        @param tagset_size - number of possible different tagsets
        @param verbose: whether to log stuff or not
        """
        super().__init__()
        
        # initialize dimensionality constants
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_dim_2 = hidden_dim
        
        # The first LSTM takes word embedding as inputs, and outputs hidden states with hidden_dim dimensionality
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        # A linear layer maps the final hidden state of the first LSTM to a fixed-length vector
        self.to_fixed_vector = nn.Linear(hidden_dim, fixed_vector_size)
        
        # another LSTM reads the fixed vector and outputs the sequence
        self.lstm2 = nn.LSTM(fixed_vector_size, self.hidden_dim_2) # this second hidden dim can be different
        
        # the linear layer maps the 2nd LSTM's hidden state to tag space
        self.linear_final = nn.Linear(self.hidden_dim_2, tagset_size)

        # hidden layer initialization
        self.hidden = self.init_hidden(self.hidden_dim)
        self.hidden2 = self.init_hidden(self.hidden_dim_2)

    def init_hidden(self, hidden_dim):
        """Initialize the hidden state in self.hidden
        Dimensions are num_layers * minibatch_size * hidden_dim
        """
        return (autograd.Variable(torch.zeros(1, 1, hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, hidden_dim)))

    def forward(self, sentence):
        # first, conver tthe sentence to a word embedding
        embeds = self.word_embeddings(sentence)
        embeds_view = embeds.view(len(sentence), 1, -1)
        # forward the embeddings through the first lstm
        lstm_out, self.hidden = self.lstm(
            embeds_view, self.hidden)
        # generate the fixed length vector
        fixed_length_vector = self.to_fixed_vector(lstm_out.view(len(sentence), -1))
        vector_view = fixed_length_vector.view(len(sentence), 1, -1)
        # forward the fixed length vector through the second LSTM
        lstm_out_2, self.hidden2 = self.lstm2(vector_view, self.hidden2)
        # compute scores for each word in the sentence.
        scores2 = F.softmax(self.linear_final(lstm_out_2.view(len(sentence), -1)))
        return scores2

def make_predictions(tag_scores, tag_to_ix):
    max_score_idxs = np.argmax(tag_scores.data.numpy(), axis = 1)
    ix_to_tag = {idx: tag for tag, idx in tag_to_ix.items()}
    predictions = [ix_to_tag[score] for score in list(max_score_idxs)]
    return predictions
    

In [None]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

from functools import reduce
size = sum([reduce(lambda x, y: x * y, param.shape) for param in model.parameters()])
print('number of parameters: {}'.format(size))
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print('initial predictions: {}'.format(make_predictions(tag_scores, tag_to_ix)))

def clear_grads_and_hidden_state(model):
    model.zero_grad()
    # note: assumes both of the LSTMs have same hidden dimensionality, TODO fix this
    model.hidden = model.init_hidden(HIDDEN_DIM)
    model.hidden2 = model.init_hidden(HIDDEN_DIM)
    
for epoch in range(500):
    if epoch % 50 == 0:
        print('training')
    for sentence, tags in training_data:
        
        # remember to clear out the grads and re-initialize the hidden state
        clear_grads_and_hidden_state(model)

        # prepare inputs and labels
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward(retain_graph=True)
        optimizer.step()

# See what the scores are after training

for data in training_data:
    sentence, labels = data
    print('sentence: {}'.format(sentence))
    inputs = prepare_sequence(sentence, word_to_ix)
    scores = model(inputs)
    print('predictions: {}'.format(make_predictions(scores, tag_to_ix)))
    print('actual: {}'.format(labels))

number of parameters: 23383
initial predictions: ['NN', 'NN', 'NN', 'NN', 'NN']
training




training
training
