# LSTM POS tagger

## Load CoNLL-2003 data

In [23]:
import os


def load_data(data_dir='../data/conll_2003/'):
    """Load the CoNLL-2003 English train and test splits.

    Args:
        data_dir: Directory that contains the ``eng.train`` and
            ``eng.testb`` files. Defaults to the location this notebook
            originally hard-coded, so existing ``load_data()`` calls are
            unaffected.

    Returns:
        A ``(train_data, test_data)`` tuple, where each element is the
        result of ``load_conll2003_data`` for the corresponding file.
    """
    train_file_path = os.path.join(data_dir, 'eng.train')
    test_file_path = os.path.join(data_dir, 'eng.testb')
    return (
        load_conll2003_data(train_file_path),
        load_conll2003_data(test_file_path),
    )


def load_conll2003_data(file_path):
    """Parse a CoNLL-2003 style file into per-sentence word and POS-tag lists.

    Every non-blank line must hold exactly four whitespace-separated
    columns; the first is used as the word and the second as its POS tag,
    the other two are ignored. A blank line ends the current sentence and
    lines starting with ``-DOCSTART-`` are skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one ``[words, tags]`` pair per sentence, where
        ``words`` and ``tags`` are equally long lists of strings.

    Raises:
        ValueError: If a non-blank line does not split into four columns.
    """
    data = []
    sentence, label = [], []

    with open(file_path) as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('-DOCSTART-'):
                continue

            if line:
                word, pos, _, _ = line.split()
                sentence.append(word)
                label.append(pos)
            elif sentence:
                # Blank line: close the sentence collected so far.
                data.append([sentence, label])
                sentence, label = [], []

    # A file that does not end with a blank line still has a pending
    # sentence here; without this flush it would be silently dropped.
    if sentence:
        data.append([sentence, label])
    return data

In [15]:
train_data, test_data = load_data()

In [36]:
len(train_data)

14041

In [22]:
train_data[0]

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['NNP', 'VBZ', 'JJ', 'NN', 'TO', 'VB', 'JJ', 'NN', '.']]

In [41]:
# Placeholder entry that prepare_sequence falls back to for tokens missing
# from a mapping; it owns index 0 of the word vocabulary.
UNK_TOKEN = '<UNK>'
UNK = 0

# Both mappings hand out consecutive ids in order of first appearance
# in train_data.
word_to_ix = {UNK_TOKEN: UNK}
tag_to_ix = {}

for sent, tags in train_data:
    for word, tag in zip(sent, tags):
        # setdefault only inserts when the key is new, so an existing
        # id is never overwritten.
        word_to_ix.setdefault(word, len(word_to_ix))
        tag_to_ix.setdefault(tag, len(tag_to_ix))

vocab_size, output_size = len(word_to_ix), len(tag_to_ix)
print('vocab_size: {}'.format(vocab_size))
print('output_size: {}'.format(output_size))

vocab_size: 23624
output_size: 45


In [42]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a tensor of their indices.

    Args:
        seq: Iterable of tokens (words or tags).
        to_ix: Mapping from token to integer index.

    Returns:
        A ``torch.long`` tensor with one index per token of ``seq``.
        Tokens missing from ``to_ix`` are mapped to ``to_ix[UNK_TOKEN]``.

    Raises:
        KeyError: If a token is missing and ``to_ix`` has no
            ``UNK_TOKEN`` entry either.
    """
    indices = []
    for token in seq:
        if token in to_ix:
            indices.append(to_ix[token])
        else:
            # Looked up lazily so mappings without an UNK entry still work
            # as long as every token is known.
            indices.append(to_ix[UNK_TOKEN])
    return torch.tensor(indices, dtype=torch.long)

# LSTM

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):
    """LSTM tagger that scores every token of a single sentence.

    The pipeline is embedding lookup -> LSTM -> linear projection ->
    log-softmax over the tag classes.

    Args:
        vocab_size: Number of entries in the word-embedding table.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_size: Number of tag classes scored per token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(LSTMTagger, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2out = nn.Linear(hidden_dim, output_size)
        # forward() feeds this a (len(sentence), output_size) matrix, so the
        # tag axis is dim 1. Naming it explicitly avoids the deprecated
        # implicit-dimension choice (and the warning it emits on every call).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        """Return per-token log-probabilities over the tag classes.

        Args:
            sentence: Tensor of word indices for one sentence; assumed to
                be 1-D, as produced by ``prepare_sequence``.

        Returns:
            Tensor of shape ``(len(sentence), output_size)`` holding
            log-probabilities.
        """
        embeds = self.embeddings(sentence)
        # Reshape to (len(sentence), 1, embedding): the sentence is fed to
        # the LSTM as a batch of one.
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2out(lstm_out.view(len(sentence), -1))
        outputs = self.softmax(tag_space)
        return outputs
        

# Train

In [35]:
# Width of the word embeddings and of the LSTM hidden state.
EMBEDDING_DIM, HIDDEN_DIM = 6, 6

model = LSTMTagger(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_size=output_size,
)
# NLLLoss consumes log-probabilities, which is what the model's final
# LogSoftmax layer produces.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [37]:
for epoch in range(3):
    # Loss accumulated since the last progress report (or epoch start).
    running_loss = 0
    for i, (sentence, tags) in enumerate(train_data):
        # Encode the words and their gold tags as index tensors.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Gradients accumulate across backward() calls, so clear them
        # before every update.
        model.zero_grad()
        outputs = model(sentence_in)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Every 2000 sentences, print the mean per-sentence loss and reset.
        if (i + 1) % 2000 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('finished')



[1,  2000] loss: 2.596
[1,  4000] loss: 1.873
[1,  6000] loss: 1.556
[1,  8000] loss: 1.719
[1, 10000] loss: 1.688
[1, 12000] loss: 1.607
[1, 14000] loss: 1.670
[2,  2000] loss: 1.554
[2,  4000] loss: 1.361
[2,  6000] loss: 1.213
[2,  8000] loss: 1.408
[2, 10000] loss: 1.446
[2, 12000] loss: 1.393
[2, 14000] loss: 1.459
[3,  2000] loss: 1.369
[3,  4000] loss: 1.205
[3,  6000] loss: 1.079
[3,  8000] loss: 1.271
[3, 10000] loss: 1.311
[3, 12000] loss: 1.271
[3, 14000] loss: 1.333
[4,  2000] loss: 1.253
[4,  4000] loss: 1.104
[4,  6000] loss: 0.980
[4,  8000] loss: 1.172
[4, 10000] loss: 1.213
[4, 12000] loss: 1.183
[4, 14000] loss: 1.240
[5,  2000] loss: 1.167
[5,  4000] loss: 1.029
[5,  6000] loss: 0.909
[5,  8000] loss: 1.100
[5, 10000] loss: 1.134
[5, 12000] loss: 1.114
[5, 14000] loss: 1.172
[6,  2000] loss: 1.100
[6,  4000] loss: 0.968
[6,  6000] loss: 0.854
[6,  8000] loss: 1.040
[6, 10000] loss: 1.073
[6, 12000] loss: 1.060
[6, 14000] loss: 1.116
[7,  2000] loss: 1.049
[7,  4000] 

# Evaluate

In [46]:
# Token-level tagging accuracy on the held-out test sentences.
correct = 0
total = 0

# Inference only: no_grad() avoids tracking operations for autograd.
with torch.no_grad():
    for sentence, tags in test_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        outputs = model(sentence_in)
        # Index of the highest-scoring tag for each token.
        predicted = outputs.argmax(dim=1)

        total += targets.size(0)
        correct += (predicted == targets).sum().item()

# Guard against an empty test set, which would otherwise divide by zero.
accuracy = 100 * correct / total if total else 0.0
print('Accuracy of the network on the %d test tokens: %d %%' % (total, accuracy))



['NN', ':', 'NNP', 'VB', 'NNP', 'NNP', ',', 'NNP', 'IN', 'DT', 'NN', '.']
tensor([[ -0.9962,  -4.0845,  -1.9076,  -1.5714, -10.8474,  -3.8860,
         -14.7771,  -3.7973,  -6.3166,  -4.0315,  -9.0635,  -7.7966,
          -2.2779,  -6.2082, -10.6199,  -3.7595,  -9.0452,  -5.6011,
         -11.9054,  -4.7722, -15.0529,  -5.4295, -10.9981,  -6.9006,
         -14.5471, -15.7445,  -9.0105,  -7.6663,  -6.9974,  -3.5209,
          -7.2178,  -7.5900,  -9.0289,  -7.2662,  -8.9846,  -4.6342,
          -6.4313,  -8.4732,  -7.2400,  -8.6017,  -9.2454,  -8.1779,
          -5.8387,  -9.3872, -10.0225],
        [ -5.3286,  -6.1853,  -6.6719,  -7.7831, -11.4943,  -4.8145,
         -10.7168,  -0.1404,  -7.8058,  -6.7955,  -4.4008,  -9.1378,
          -3.9012,  -5.3213,  -6.7828,  -6.2387, -14.0511,  -7.3988,
         -17.3935,  -2.8861, -13.1953,  -7.9786,  -9.7156,  -9.4470,
         -11.5997, -14.3455, -13.6774,  -6.2772,  -8.4087,  -5.7706,
          -7.9856,  -6.0756,  -8.0978,  -7.2157,  -5.3873,