In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import numpy as np
import random
from tqdm import tqdm_notebook

torch.manual_seed(1)

<torch._C.Generator at 0x7fe01413da50>

### 1. Read and process the data.

In [2]:
path = './data/'

In [3]:
def get_data(name):
    f = open(path + name + '.txt', 'r')
    lines = f.readlines()
    
    result = []
    
    sentence = []
    tag_sentence = []
    
    for line in lines[2:]:
        if line == '\n':           
            
            if len(sentence) > 0:
                result.append((sentence, tag_sentence))
            
                sentence = []
                tag_sentence = []
            
            continue
            
        words = line.strip().split()
        
        if words[0] == '-DOCSTART-':
            continue
        
        sentence.append(words[0])
        tag_sentence.append(words[3])
            
        
    return result

In [4]:
train = get_data('train')
dev = get_data('dev')
test = get_data('test')

dataset = train + dev + test

### 2. Implement 3 strategies for loading the embeddings

In [5]:
embeddings_name = 'glove.6B.100d.txt'

def loadGloveModel(file):
    print("Loading Glove Model")
    f = open(file,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [6]:
gmodel = loadGloveModel(path + embeddings_name)

Loading Glove Model
Done. 400000  words loaded!


In [7]:
EMBEDDING_DIM = 100


In [8]:
vocab = {}
tag_vocab = {}
for sent, tags in dataset:
    for word in sent:
        if word not in vocab:
            vocab[word] = len(vocab)
    for tag in tags:
        if tag not in tag_vocab:
            tag_vocab[tag] = len(tag_vocab)

vocab['pad'] = len(vocab)
            
vocab_size = len(vocab)

In [9]:
tag_vocab

{'B-LOC': 5,
 'B-MISC': 2,
 'B-ORG': 0,
 'B-PER': 3,
 'I-LOC': 8,
 'I-MISC': 7,
 'I-ORG': 6,
 'I-PER': 4,
 'O': 1}

**2.a.** Load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, associate it with UNKNOWN embedding

In [10]:
unk = gmodel['unk']

In [11]:
gmodel_strategy_1 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    if word in gmodel:
        gmodel_strategy_1[index,:] = gmodel[word]
    else:
        gmodel_strategy_1[index,:] = unk

**2.b.** load the embeddings for lowercased capitalization of words. If embedding for this lowercased word doesn’t exists, associate it with UNKNOWN embedding

In [12]:
gmodel_strategy_2 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word_lower in gmodel:
        gmodel_strategy_2[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_2[index,:] = unk

**2.c.** load the embeddings for original capitalization of words. If embedding for this word doesn't exists, try to find the embedding for lowercased version and associate it to the word with original capitalization. Otherwise, associate it with UNKNOWN embedding

In [13]:
gmodel_strategy_3 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word]
    elif word_lower in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_3[index,:] = unk

### 3. Implement training on batches

In [14]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


def prepare_batch(batch_sentences, vocab):
    batch_max_len = max([len(s[0]) for s in batch_sentences])

    # prepare a numpy array with the data, initializing the data with 'PAD' 
    # and all labels with -1; initializing labels to -1 differentiates tokens 
    # with tags from 'PAD' tokens
    batch_data = vocab['pad']*np.ones((len(batch_sentences), batch_max_len))
    batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

    # copy the data to the numpy array
    for j in range(len(batch_sentences)):
        cur_len = len(batch_sentences[j][0])
        batch_data[j][:cur_len] = prepare_sequence(batch_sentences[j][0], vocab)
        batch_labels[j][:cur_len] = prepare_sequence(batch_sentences[j][1], tag_vocab)

    # convert data them to torch LongTensors
    batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

    return batch_data, batch_labels

    
# get_batch(train[:10], vocab)

In [15]:
class LSTM_NER(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, embedding_matrix, tagset_size):
        super(LSTM_NER, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(embedding_matrix.shape[0], embedding_dim).\
            from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float), freeze=True)            
        self.word_embeddings.weight.requires_grad = False

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.fc = nn.Linear(2*hidden_dim, tagset_size)
        
        # field to count epochs
        self.epoch_counter = 0
        

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        
        lstm_out, _ = self.lstm(embeds)
        
        lstm_out = lstm_out.view(-1, lstm_out.shape[2])
        
        fc_out = self.fc(lstm_out)
        
        tag_scores = F.log_softmax(fc_out, dim=1)
        return tag_scores

In [16]:
# dimension of hidden layer
HIDDEN_DIM = 100

# batch size
BATCH_SIZE = 64


def loss_function(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels >= 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).data.item())

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens


# create model with corresponded embeddings strategy
def get_model(model_strategy):

    model = LSTM_NER(EMBEDDING_DIM, HIDDEN_DIM, model_strategy, len(tag_vocab))
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # check scores before training
    with torch.no_grad():
        inputs, labels = prepare_batch(train[:BATCH_SIZE], vocab)
        tag_scores = model(inputs)
        
    return model, optimizer

In [36]:
# I checked different methods how to take batches.
# This one take random dubset from dataset
def get_batch(b_size, data):
    batch_sentences = random.sample(data, b_size)
    
    return prepare_batch(batch_sentences, vocab)

### 4. Implement the calculation of token-level Precision / Recall / F1 / F0.5 scores for all classes in average.

I compared 2 approaches (micro- and macro- average) for model evaluation. As we have one big tag-class ('O') and model very often predicts tags as 'O', micro-average F-score always gives good enough result even if the model started training and always returns 'O'. BTW, it takes pretty much time to detect the issue and figure out why model shows good results at the beginning and almost does not learns.

After that I implemented macro-average approach to make each tag-class equivalent.

In [66]:
F_SCORE = 1

# return F score, loss and global precision
def validate_model(model, dataset):
    
    size = len(dataset)
    
    # init counter for TP, FP, FN and each tag-class
    c_tp = np.zeros(len(tag_vocab), np.int32)
    c_fp = np.zeros(len(tag_vocab), np.int32)
    c_fn = np.zeros(len(tag_vocab), np.int32)
    
    # total loss
    loss = 0
    
    # calculate global precision
    tp = 0
    total = 0
    
    # validate each sentence of the test set separately
    for i in range(size):    
        sentence_in, targets = prepare_batch(dataset[i:i+1], vocab)

        # model scores
        tag_scores = model(sentence_in)
        
        # take each word
        for k, tag_score in enumerate(tag_scores):
            
            # the index of the max value corresponds to the index of tag in global tag_vocab
            max_val, prediction = tag_score.max(0)
            prediction = prediction.item()
            
            # labeled tag value
            target = targets[0][k].item()
                
            # if tag is predicted correctly
            if prediction == target:
                # increase TP value for corresponded tag
                c_tp[target] += 1
                # increase global TP value
                tp += 1
            else:
                # increase FP for predicted tag
                c_fp[prediction] += 1
                # increase FN for true (target) tag
                c_fn[target] += 1
            
            total += 1
        
        # increase loss function
        loss += loss_function(tag_scores, targets).item()
        
    # to prevent warning of dividing by zero
    with np.errstate(all='ignore'):

        # take macro average for precision and recall
        pr = np.mean(np.nan_to_num(c_tp / (c_tp+c_fp) ))
        rc = np.mean(np.nan_to_num(c_tp / (c_tp+c_fn) ))
        
    # calculate F score
    f_score_macro = (1 + F_SCORE**2) * pr * rc / (F_SCORE**2 * pr + rc)
    f_score_micro = tp / total

    return f_score_macro, f_score_micro, loss / size

# validate_model(model3, test)

(0.44503688817093423, 0.9029180574997308, 0.4139048891897749)

In [26]:
train_sorted = sorted(train, key=lambda item: (len(item[0])))

def train_model(model, optimizer, epochs = 5):
    
    steps = len(train) // BATCH_SIZE + 1
    
    for epoch in range(epochs):
        loss_sum = 0
        print("Epoch: {}".format(model.epoch_counter))
        model.epoch_counter += 1
        for step in tqdm_notebook(range(steps)):
            # clear gradients out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.

            # sentence_in, targets = get_batch(BATCH_SIZE, train)        
            sentence_in, targets = prepare_batch(train_sorted[step * BATCH_SIZE: (step+1) * BATCH_SIZE], vocab)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

            loss_sum += loss

        avg_epoch_loss = np.round((loss_sum / steps).item(), 4)
        dev_epoch_f1_macro, dev_epoch_f1_micro, dev_epoch_loss = np.round(validate_model(model, dev), 4)

        print("""    Train loss     :{}
        Validation loss  :{}
        Validation F1 macro :{}
        Validation F1 micro :{}
    ____________________________________________________________
        """.\
        format(avg_epoch_loss, dev_epoch_loss, dev_epoch_f1_macro, dev_epoch_f1_micro))
        
#     return model

In [28]:
model1, optimizer1 = get_model(gmodel_strategy_1)
model2, optimizer2 = get_model(gmodel_strategy_2)
model3, optimizer3 = get_model(gmodel_strategy_3)

In [30]:
train_model(model1, optimizer1)

Epoch: 0


    Train loss:     0.8805
        Validate loss:  1.0424
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 1


    Train loss:     0.741
        Validate loss:  0.8954
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 2


    Train loss:     0.6961
        Validate loss:  0.7991
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 3


    Train loss:     0.6752
        Validate loss:  0.7601
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 4


    Train loss:     0.6668
        Validate loss:  0.7426
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        


In [31]:
train_model(model2, optimizer2)

Epoch: 0


    Train loss:     0.9203
        Validate loss:  1.1994
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 1


    Train loss:     0.737
        Validate loss:  1.0403
        Validate F1: 0.1731
        Validate Pr: 0.8392
    ____________________________________________________________
        
Epoch: 2


    Train loss:     0.6512
        Validate loss:  0.841
        Validate F1: 0.2242
        Validate Pr: 0.8464
    ____________________________________________________________
        
Epoch: 3


    Train loss:     0.5799
        Validate loss:  0.7114
        Validate F1: 0.2642
        Validate Pr: 0.8631
    ____________________________________________________________
        
Epoch: 4


    Train loss:     0.5225
        Validate loss:  0.6388
        Validate F1: 0.2841
        Validate Pr: 0.8736
    ____________________________________________________________
        


In [45]:
train_model(model3, optimizer3, epochs=15)

Epoch: 5


TP:  [  575 42391     0  1182     0  1013     0     0     0] 45161
FP:  [ 574 4307    0  907    0  413    0    0    0] 6201
FN:  [ 766  368  922  660 1307  824  751  346  257] 6201
Pr:  [0.5   0.908 0.    0.566 0.    0.71  0.    0.    0.   ]
    Train loss:     0.4925
        Validate loss:  0.5967
        Validate F1: 0.2943
        Validate Pr: 0.8793
    ____________________________________________________________
        
Epoch: 6


TP:  [  611 42360     0  1308     0  1089     0     0     0] 45368
FP:  [ 582 3837    0 1015    0  560    0    0    0] 5994
FN:  [ 730  399  922  534 1307  748  751  346  257] 5994
Pr:  [0.512 0.917 0.    0.563 0.    0.66  0.    0.    0.   ]
    Train loss:     0.4599
        Validate loss:  0.562
        Validate F1: 0.3
        Validate Pr: 0.8833
    ____________________________________________________________
        
Epoch: 7


TP:  [  644 42321     0  1352     8  1134     0     0     0] 45459
FP:  [ 593 3540    0 1062    5  703    0    0    0] 5903
FN:  [ 697  438  922  490 1299  703  751  346  257] 5903
Pr:  [0.521 0.923 0.    0.56  0.615 0.617 0.    0.    0.   ]
    Train loss:     0.4338
        Validate loss:  0.5345
        Validate F1: 0.3353
        Validate Pr: 0.8851
    ____________________________________________________________
        
Epoch: 8


TP:  [  688 42289     0  1353    68  1311     0     0     0] 45709
FP:  [ 610 3106    0 1030   49  858    0    0    0] 5653
FN:  [ 653  470  922  489 1239  526  751  346  257] 5653
Pr:  [0.53  0.932 0.    0.568 0.581 0.604 0.    0.    0.   ]
    Train loss:     0.4114
        Validate loss:  0.5119
        Validate F1: 0.345
        Validate Pr: 0.8899
    ____________________________________________________________
        
Epoch: 9


TP:  [  717 42258     0  1379   143  1407     0     0     0] 45904
FP:  [ 628 2834    0  986   91  919    0    0    0] 5458
FN:  [ 624  501  922  463 1164  430  751  346  257] 5458
Pr:  [0.533 0.937 0.    0.583 0.611 0.605 0.    0.    0.   ]
    Train loss:     0.3919
        Validate loss:  0.4932
        Validate F1: 0.3563
        Validate Pr: 0.8937
    ____________________________________________________________
        
Epoch: 10


TP:  [  726 42241     0  1390   193  1415     0     0     0] 45965
FP:  [ 617 2689    0  990  141  960    0    0    0] 5397
FN:  [ 615  518  922  452 1114  422  751  346  257] 5397
Pr:  [0.541 0.94  0.    0.584 0.578 0.596 0.    0.    0.   ]
    Train loss:     0.3747
        Validate loss:  0.4773
        Validate F1: 0.3578
        Validate Pr: 0.8949
    ____________________________________________________________
        
Epoch: 11


TP:  [  758 42173    24  1399   229  1486     0     0     0] 46069
FP:  [ 628 2440    1  970  164 1090    0    0    0] 5293
FN:  [ 583  586  898  443 1078  351  751  346  257] 5293
Pr:  [0.547 0.945 0.96  0.591 0.583 0.577 0.    0.    0.   ]
    Train loss:     0.3596
        Validate loss:  0.4634
        Validate F1: 0.4122
        Validate Pr: 0.8969
    ____________________________________________________________
        
Epoch: 12


TP:  [  765 42179   107  1397   258  1489     0     0     0] 46195
FP:  [ 607 2310    4  955  196 1095    0    0    0] 5167
FN:  [ 576  580  815  445 1049  348  751  346  257] 5167
Pr:  [0.558 0.948 0.964 0.594 0.568 0.576 0.    0.    0.   ]
    Train loss:     0.346
        Validate loss:  0.4512
        Validate F1: 0.4206
        Validate Pr: 0.8994
    ____________________________________________________________
        
Epoch: 13


TP:  [  762 42204   202  1408   277  1491     0     0     0] 46344
FP:  [ 619 2226   19  949  205 1000    0    0    0] 5018
FN:  [ 579  555  720  434 1030  346  751  346  257] 5018
Pr:  [0.552 0.95  0.914 0.597 0.575 0.599 0.    0.    0.   ]
    Train loss:     0.3337
        Validate loss:  0.4402
        Validate F1: 0.4277
        Validate Pr: 0.9023
    ____________________________________________________________
        
Epoch: 14


TP:  [  739 42221   299  1437   293  1515     0     0     0] 46504
FP:  [ 621 2137   26  940  211  923    0    0    0] 4858
FN:  [ 602  538  623  405 1014  322  751  346  257] 4858
Pr:  [0.543 0.952 0.92  0.605 0.581 0.621 0.    0.    0.   ]
    Train loss:     0.3224
        Validate loss:  0.4303
        Validate F1: 0.4377
        Validate Pr: 0.9054
    ____________________________________________________________
        
Epoch: 15


TP:  [  740 42222   347  1449   315  1523     0     0     0] 46596
FP:  [ 600 2072   45  948  203  898    0    0    0] 4766
FN:  [601 537 575 393 992 314 751 346 257] 4766
Pr:  [0.552 0.953 0.885 0.605 0.608 0.629 0.    0.    0.   ]
    Train loss:     0.3117
        Validate loss:  0.4213
        Validate F1: 0.4432
        Validate Pr: 0.9072
    ____________________________________________________________
        
Epoch: 16


TP:  [  749 42220   413  1470   328  1534     0     0     0] 46714
FP:  [ 595 1979   57  946  205  866    0    0    0] 4648
FN:  [592 539 509 372 979 303 751 346 257] 4648
Pr:  [0.557 0.955 0.879 0.608 0.615 0.639 0.    0.    0.   ]
    Train loss:     0.3015
        Validate loss:  0.4132
        Validate F1: 0.4508
        Validate Pr: 0.9095
    ____________________________________________________________
        
Epoch: 17


TP:  [  749 42243   454  1469   342  1538     0     0     0] 46795
FP:  [ 583 1899   71  921  223  870    0    0    0] 4567
FN:  [592 516 468 373 965 299 751 346 257] 4567
Pr:  [0.562 0.957 0.865 0.615 0.605 0.639 0.    0.    0.   ]
    Train loss:     0.2918
        Validate loss:  0.406
        Validate F1: 0.4537
        Validate Pr: 0.9111
    ____________________________________________________________
        
Epoch: 18


TP:  [  752 42226   466  1461   363  1539     0     0     0] 46807
FP:  [ 581 1871   78  907  244  874    0    0    0] 4555
FN:  [589 533 456 381 944 298 751 346 257] 4555
Pr:  [0.564 0.958 0.857 0.617 0.598 0.638 0.    0.    0.   ]
    Train loss:     0.2829
        Validate loss:  0.3996
        Validate F1: 0.4547
        Validate Pr: 0.9113
    ____________________________________________________________
        
Epoch: 19


TP:  [  772 42219   489  1468   361  1539     0     0     0] 46848
FP:  [ 586 1839   80  919  245  845    0    0    0] 4514
FN:  [569 540 433 374 946 298 751 346 257] 4514
Pr:  [0.568 0.958 0.859 0.615 0.596 0.646 0.    0.    0.   ]
    Train loss:     0.2748
        Validate loss:  0.3938
        Validate F1: 0.4578
        Validate Pr: 0.9121
    ____________________________________________________________
        


In [32]:
np.round(validate_model(model1, test), 4)

array([0.1005, 0.7863, 0.8253])

In [33]:
np.round(validate_model(model2, test), 4)

array([0.2805, 0.6471, 0.8679])

In [34]:
np.round(validate_model(model3, test), 4)

array([0.2816, 0.6512, 0.8693])