# LSTM POS tagger: word level

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load data

In [2]:
from data import load_penn_treebank_data

In [3]:
# Load the train and test splits; the loader itself prints the split sizes.
train_data, test_data = load_penn_treebank_data()

train size: 2739
test size: 1175


In [4]:
# Peek at the first training sample: position 0 holds the tokens,
# position 1 holds the matching POS tags.
first_sample = train_data[0]
print('train_data[0][0]: {}'.format(first_sample[0]))
print('train_data[0][1]: {}'.format(first_sample[1]))

train_data[0][0]: ['Too' 'much' 'money' '*ICH*-1' 'is' 'at' 'stake' 'for' 'program' 'traders'
 'to' 'give' 'up' '.']
train_data[0][1]: ['RB' 'JJ' 'NN' '-NONE-' 'VBZ' 'IN' 'NN' 'IN' 'NN' 'NNS' 'TO' 'VB' 'IN' '.']


----

## Convert data to indices

In [5]:
from util import get_conversion_tables, prepare_sequence

In [6]:
# Build the word -> index and tag -> index lookup tables from the training data.
word_to_ix, tag_to_ix = get_conversion_tables(train_data, min_count=1)

# The table sizes are later passed to the model as its vocabulary size and
# its number of output classes.
vocab_size, output_size = len(word_to_ix), len(tag_to_ix)
print('vocab_size: {}'.format(vocab_size))
print('output_size: {}'.format(output_size))

vocab_size: 10260
output_size: 46


----

# LSTM

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):
    """Word-level tagger: embedding -> LSTM -> linear layer -> log-softmax.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the LSTM hidden state.
        output_size: number of output classes scored per token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super().__init__()

        # Keep the sizes on the instance for later inspection.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # Layers, in the order forward() applies them.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2out = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        """Score every token of one sentence.

        Args:
            sentence: tensor of word indices (assumed 1-D, one entry per token).

        Returns:
            Tensor with one row per token holding log-probabilities over the
            output classes.
        """
        n_tokens = len(sentence)

        # The LSTM is fed the sentence as a batch of size one:
        # (n_tokens, 1, embedding_dim).
        word_vectors = self.embeddings(sentence).view(n_tokens, 1, -1)
        hidden_states, _ = self.lstm(word_vectors)

        # Drop the batch axis again before projecting to class scores.
        scores = self.hidden2out(hidden_states.view(n_tokens, -1))
        return self.softmax(scores)

----

# Train

In [39]:
# Sizes of the word embeddings and of the LSTM hidden state.
EMBEDDING_DIM, HIDDEN_DIM = 6, 6

In [40]:
# Fix the RNG seed so the randomly initialised model parameters are the same
# on every run; without it the reported numbers change between runs.
torch.manual_seed(0)

model = LSTMTagger(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, output_size)
# NLLLoss takes log-probabilities, which LSTMTagger.forward produces through
# its LogSoftmax layer.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [41]:
# Training settings, named so they can be tuned in one place.
NUM_EPOCHS = 10
LOG_INTERVAL = 2000  # report the mean loss once per this many sentences

# Put the model in training mode explicitly before optimising.
model.train()

for epoch in range(NUM_EPOCHS):
    # The accumulator restarts every epoch, so sentences after the last full
    # LOG_INTERVAL window of an epoch are trained on but never reported.
    running_loss = 0.0
    for i, (sentence, tags) in enumerate(train_data):
        # Clear the gradients left over from the previous sentence.
        model.zero_grad()

        # Map words and tags to their indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # One optimisation step per sentence.
        outputs = model(sentence_in)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (i + 1) % LOG_INTERVAL == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / LOG_INTERVAL))
            running_loss = 0.0

print('finished')

[1,  2000] loss: 2.903
[2,  2000] loss: 2.269
[3,  2000] loss: 2.019
[4,  2000] loss: 1.883
[5,  2000] loss: 1.774
[6,  2000] loss: 1.681
[7,  2000] loss: 1.615
[8,  2000] loss: 1.560
[9,  2000] loss: 1.511
[10,  2000] loss: 1.468
finished


---

# Evaluate

In [42]:
# Token-level accuracy on the test set.
correct = 0
total = 0

# Switch the model to evaluation mode before measuring accuracy.
model.eval()

with torch.no_grad():
    for sentence, tags in test_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        outputs = model(sentence_in)
        # Highest-scoring tag per token. Autograd is already disabled by
        # no_grad(), so going through `.data` is unnecessary.
        predicted = outputs.argmax(dim=1)

        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print('Accuracy: {:.2f} %'.format(100 * correct / total))

Accuracy: 57.45 %
