In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import math
import sys
import datetime
import numpy as np
import pickle
import time

from torch.autograd import Variable

"""
Blog post:
Taming LSTMs: Variable-sized mini-batches and why PyTorch is good for your health:
https://medium.com/@_willfalcon/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e
"""


'\nBlog post:\nTaming LSTMs: Variable-sized mini-batches and why PyTorch is good for your health:\nhttps://medium.com/@_willfalcon/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e\n'

In [2]:
# change the data structure to be [sent, tag]
def load_one_line(line):
    line_split_by_space = line.split()
    sent = []
    tag = []
    for item in line_split_by_space:
        item_split = item.split("/")
        if len(item_split) > 2:
            item_combine = item_split[0]
            for i in range(1, len(item_split) - 1):
                item_combine = item_combine + '/' + item_split[i]
            sent.append(item_combine)
            tag.append(item_split[i + 1])
        else:
            sent.append(item_split[0])
            tag.append(item_split[1])
    return (sent, tag)

def load_train_file(train_file):
    train_sents = []
    train_tags = []
    with open(train_file) as infile:
        for line in infile:
            train_sent, train_tag = load_one_line(line)
            train_sents.append(train_sent)
            train_tags.append(train_tag)
    return (train_sents, train_tags)
train_sents, train_tags = load_train_file('sents.train')
print(train_sents[0])
print(train_tags[0])
print(len(train_sents), len(train_tags))

['In', 'an', 'Oct.', '19', 'review', 'of', '``', 'The', 'Misanthrope', "''", 'at', 'Chicago', "'s", 'Goodman', 'Theatre', '(', '``', 'Revitalized', 'Classics', 'Take', 'the', 'Stage', 'in', 'Windy', 'City', ',', "''", 'Leisure', '&', 'Arts', ')', ',', 'the', 'role', 'of', 'Celimene', ',', 'played', 'by', 'Kim', 'Cattrall', ',', 'was', 'mistakenly', 'attributed', 'to', 'Christina', 'Haag', '.']
['IN', 'DT', 'NNP', 'CD', 'NN', 'IN', '``', 'DT', 'NN', "''", 'IN', 'NNP', 'POS', 'NNP', 'NNP', '-LRB-', '``', 'VBN', 'NNS', 'VBP', 'DT', 'NN', 'IN', 'NNP', 'NNP', ',', "''", 'NN', 'CC', 'NNS', '-RRB-', ',', 'DT', 'NN', 'IN', 'NNP', ',', 'VBN', 'IN', 'NNP', 'NNP', ',', 'VBD', 'RB', 'VBN', 'TO', 'NNP', 'NNP', '.']
39832 39832


In [3]:
#build word to index, tag to index
# unknown threshold = 1
word_to_index_count = {}
tag_to_index = {'<PAD>': 0}
for sent, tags in zip(train_sents, train_tags):
    for word, tag in zip(sent, tags):
        if word not in word_to_index_count:
            word_to_index_count[word] = [len(word_to_index_count),1]
        else:
            word_to_index_count[word][1] += 1
        if tag not in tag_to_index:
            tag_to_index[tag] = len(tag_to_index)

print(word_to_index_count['nice'])
print(tag_to_index['IN'])

word_to_index = {'<PAD>': 0, '<UNK>': 1}
for word, index_count in word_to_index_count.items():
    if index_count[1] > 1:
        word_to_index[word] = len(word_to_index)
print(len(word_to_index))
# build reverse
index_to_tag = {v:k for k,v in tag_to_index.items()}
print(len(tag_to_index))
print(index_to_tag)

[12646, 18]
1
23769
46
{0: '<PAD>', 1: 'IN', 2: 'DT', 3: 'NNP', 4: 'CD', 5: 'NN', 6: '``', 7: "''", 8: 'POS', 9: '-LRB-', 10: 'VBN', 11: 'NNS', 12: 'VBP', 13: ',', 14: 'CC', 15: '-RRB-', 16: 'VBD', 17: 'RB', 18: 'TO', 19: '.', 20: 'VBZ', 21: 'NNPS', 22: 'PRP', 23: 'PRP$', 24: 'VB', 25: 'JJ', 26: 'MD', 27: 'VBG', 28: 'RBR', 29: ':', 30: 'WP', 31: 'WDT', 32: 'JJR', 33: 'PDT', 34: 'RBS', 35: 'WRB', 36: 'JJS', 37: '$', 38: 'RP', 39: 'FW', 40: 'EX', 41: 'SYM', 42: '#', 43: 'LS', 44: 'UH', 45: 'WP$'}


In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assume that we are on a CUDA machine, then this should print a CUDA device:

print(device)
max_length = 150
b = 32

cuda:0


In [5]:
class BieberLSTM(nn.Module):
    def __init__(self, nb_layers, nb_lstm_units=50, embedding_dim=32, batch_size=b):
        super(BieberLSTM, self).__init__()
        self.to(device)
        self.on_gpu = True
        self.vocab = word_to_index
        self.tags = tag_to_index

        self.nb_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        # don't count the padding tag for the classifier output
        self.nb_tags = len(self.tags)

        # when the model is bidirectional we double the output dimension
        #self.lstm

        # build actual NN
        self.__build_model()

    def __build_model(self):
        # build embedding layer first
        nb_vocab_words = len(self.vocab)

        # whenever the embedding sees the padding index it'll make the whole vector zeros
        padding_idx = self.vocab['<PAD>']
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim,
            padding_idx=padding_idx
        )

        # design LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_layers,
            batch_first=True,
            bidirectional = True, 
            dropout = 0.5
        )

        # output layer which projects back to tag space
        self.hidden_to_tag = nn.Linear(self.nb_lstm_units * 2, self.nb_tags)
    
    def change_batch_size(self, x):
        self.batch_size = x

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(self.nb_layers * 2, self.batch_size, self.nb_lstm_units)
        hidden_b = torch.randn(self.nb_layers * 2, self.batch_size, self.nb_lstm_units)

        if self.on_gpu:
            hidden_a = hidden_a.cuda()
            hidden_b = hidden_b.cuda()

        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)

    def forward(self, X, X_lengths):
        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        self.hidden = self.init_hidden()
        #print('x size', X.size())

        batch_size, seq_len = X.size()

        # ---------------------
        # 1. embed the input
        # Dim transformation: (batch_size, seq_len, 1) -> (batch_size, seq_len, embedding_dim)
        X = self.word_embedding(X)
        #print(X.shape, 'X')

        # ---------------------
        # 2. Run through RNN
        # TRICK 2 ********************************
        # Dim transformation: (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, nb_lstm_units)

        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        X_lengths = torch.tensor([X_lengths]* self.batch_size, dtype=torch.long).to(device)
        X = torch.nn.utils.rnn.pack_padded_sequence(X, X_lengths, batch_first=True)

        # now run through LSTM
        X, self.hidden = self.lstm(X, self.hidden)

        # undo the packing operation
        X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        # ---------------------
        # 3. Project to tag space
        # Dim transformation: (batch_sBieberLSTMize, seq_len, nb_lstm_units) -> (batch_size * seq_len, nb_lstm_units)

        # this one is a bit tricky as well. First we need to reshape the data so it goes into the linear layer
        X = X.contiguous()
        X = X.view(-1, X.shape[2])

        # run through actual linear layer
        X = self.hidden_to_tag(X)

        # ---------------------
        # 4. Create softmax activations bc we're doing classification
        # Dim transformation: (batch_size * seq_len, nb_lstm_units) -> (batch_size, seq_len, nb_tags)
        X = F.log_softmax(X, dim=1)

        # I like to reshape for mental sanity so we're back to (batch_size, seq_len, nb_tags)
        X = X.view(batch_size, seq_len, self.nb_tags)

        Y_hat = X
        return Y_hat

    def loss(self, Y_hat, Y, X_lengths):
        # TRICK 3 ********************************
        # before we calculate the negative log likelihood, we need to mask out the activations
        # this means we don't want to take into account padded items in the output vector
        # simplest way to think about this is to flatten ALL sequences into a REALLY long sequence
        # and calculate the loss on that.

        # flatten all the labels
        Y = Y.view(-1)

        # flatten all predictions
        Y_hat = Y_hat.view(-1, self.nb_tags)
        #print(Y_hat.shape[0],Y_hat.shape[1], 'Y_hat.shape')

        # create a mask by filtering out all tokens that ARE NOT the padding token
        tag_pad_token = self.tags['<PAD>']
        mask = (Y > tag_pad_token).float()
        #print('mask', mask, mask.shape)

        # count how many tokens we have
        nb_tokens = int(torch.sum(mask).data[0])
        #print('nb tokens', nb_tokens)

        # pick the values for the label and zero out the rest with the mask
        #print(Y_hat, Y, mask, '?')
        #print(Y_hat.size())
        Y_hat = Y_hat[range(Y_hat.shape[0]), Y] * mask
        #print(Y_hat, 'New Y_hat')
        # compute cross entropy loss which ignores all <PAD> tokens
        ce_loss = -torch.sum(Y_hat) / nb_tokens

        return ce_loss

In [11]:
def prepare_sequence(sents, d, is_target):
    sents_indexes = []
    for sent in sents:
        sent_indexes = []
        for w in sent:
            if w in d:
                sent_indexes.append(d[w])
            else:
                sent_indexes.append(d['<UNK>'])
        # print(len(sent_indexes), 'len sent indexes')
        if not is_target:
            sent_indexes += [d['<PAD>']] * (max_length - len(sent_indexes))
        sents_indexes.append(sent_indexes)
    return torch.tensor(sents_indexes, dtype=torch.long).to(device)

model = BieberLSTM(2) # 1 layer
model.to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay = 0.0005)
    
for epoch in range(10):  # again, normally you would NOT do 300 epochs, it is toy data
    print(epoch, ' epoch')
    #print('epoch', epoch)
    # train with batch size 32
    if epoch == 5:
        print('decrease lr')
        for g in optimizer.param_groups:
            g['lr'] = 0.01
    if epoch == 8:
        print('decrease lr further')
        for g in optimizer.param_groups:
            g['lr'] = 0.001
    for i in range(int(len(train_sents)/b)): # 39832/50 ~= 796.64 / 32 ~= 124x
        #print(i)
        model.zero_grad()
        
        start = i * b
        sents = train_sents[start:start + b]
        tags = train_tags[start:start + b]
        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sents, word_to_index, False)
        targets = prepare_sequence(tags, tag_to_index, False)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in, max_length)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = model.loss(tag_scores, targets, max_length)
        if i % 500 == 0:
            print(loss)
        loss.backward()
        optimizer.step()


0  epoch
tensor(3.8241, device='cuda:0', grad_fn=<DivBackward0>)




tensor(1.4120, device='cuda:0', grad_fn=<DivBackward0>)
tensor(1.0825, device='cuda:0', grad_fn=<DivBackward0>)
1  epoch
tensor(1.0571, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.8905, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.8001, device='cuda:0', grad_fn=<DivBackward0>)
2  epoch
tensor(0.8048, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.5935, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.5366, device='cuda:0', grad_fn=<DivBackward0>)
3  epoch
tensor(0.5052, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.3152, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.3171, device='cuda:0', grad_fn=<DivBackward0>)
4  epoch
tensor(0.3299, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1945, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2202, device='cuda:0', grad_fn=<DivBackward0>)
5  epoch
decrease lr
tensor(0.2251, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1495, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1575, device='cuda:0', grad_fn=<DivBac

In [12]:
def tag_sent(sent):
    sent = sent.split()
    with torch.no_grad():
        inputs = prepare_sequence([sent] * b, word_to_index, False)
        #print(inputs)
        #model.change_batch_size(1)
        tag_scores = model(inputs, max_length)
        #print(tag_scores)
        val, tag_indexes = torch.max(tag_scores[0], 1)
        tag_names = [index_to_tag[i.item()] for i in tag_indexes]
    output = ''
    for word, tag in zip(sent, tag_names):
        output += word + '/' + tag + ' '
    return output
tag_sent('I have an apple')

'I/PRP have/VBP an/DT apple/NN '

In [13]:
def tag_and_save(test_file, out):
    sents = []
    with open(test_file) as t:
        for line in t:
            sents.append(line)
    with open(out, 'w') as o:
        for sent in sents:
            o.write(tag_sent(sent) + '\n')
    print('Finished.')
tag_and_save('sents.test', 'sents.out')

Finished.


In [14]:
def eval():
    reader = open('sents.out')
    out_lines = reader.readlines()
    reader.close()

    reader = open('sents.answer')
    ref_lines = reader.readlines()
    reader.close()

    if len(out_lines) != len(ref_lines):
        print('Error: No. of lines in output file and reference file do not match.')
        exit(0)

    total_tags = 0
    matched_tags = 0
    for i in range(0, len(out_lines)):
        cur_out_line = out_lines[i].strip()
        cur_out_tags = cur_out_line.split(' ')
        cur_ref_line = ref_lines[i].strip()
        cur_ref_tags = cur_ref_line.split(' ')
        total_tags += len(cur_ref_tags)

        for j in range(0, len(cur_ref_tags)):
            if cur_out_tags[j] == cur_ref_tags[j]:
                matched_tags += 1

    print("Accuracy=", float(matched_tags) / total_tags)
eval()

Accuracy= 0.946423697856528
