In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import numpy as np
import random
from tqdm import tqdm_notebook

torch.manual_seed(1)

<torch._C.Generator at 0x7fdfd817ca50>

### 1. Read and process the data.

In [2]:
path = './data/'

In [3]:
def get_data(name):
    f = open(path + name + '.txt', 'r')
    lines = f.readlines()
    
    result = []
    
    sentence = []
    tag_sentence = []
    
    for line in lines[2:]:
        if line == '\n':           
            
            if len(sentence) > 0:
                result.append((sentence, tag_sentence))
            
                sentence = []
                tag_sentence = []
            
            continue
            
        words = line.strip().split()
        
        if words[0] == '-DOCSTART-':
            continue
        
        sentence.append(words[0])
        tag_sentence.append(words[3])
            
        
    return result

In [4]:
train = get_data('train')
dev = get_data('dev')
test = get_data('test')

dataset = train + dev + test

### 2. Implement 3 strategies for loading the embeddings

In [5]:
embeddings_name = 'glove.6B.100d.txt'

def loadGloveModel(file):
    print("Loading Glove Model")
    f = open(file,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [6]:
gmodel = loadGloveModel(path + embeddings_name)

Loading Glove Model
Done. 400000  words loaded!


In [7]:
EMBEDDING_DIM = 100


In [8]:
vocab = {}
tag_vocab = {}
for sent, tags in dataset:
    for word in sent:
        if word not in vocab:
            vocab[word] = len(vocab)
    for tag in tags:
        if tag not in tag_vocab:
            tag_vocab[tag] = len(tag_vocab)

vocab['pad'] = len(vocab)
            
vocab_size = len(vocab)

In [9]:
tag_vocab

{'B-LOC': 5,
 'B-MISC': 2,
 'B-ORG': 0,
 'B-PER': 3,
 'I-LOC': 8,
 'I-MISC': 7,
 'I-ORG': 6,
 'I-PER': 4,
 'O': 1}

**2.a.** Load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, associate it with UNKNOWN embedding

In [10]:
unk = gmodel['unk']

In [11]:
gmodel_strategy_1 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    if word in gmodel:
        gmodel_strategy_1[index,:] = gmodel[word]
    else:
        gmodel_strategy_1[index,:] = unk

**2.b.** load the embeddings for lowercased capitalization of words. If embedding for this lowercased word doesn’t exists, associate it with UNKNOWN embedding

In [12]:
gmodel_strategy_2 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word_lower in gmodel:
        gmodel_strategy_2[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_2[index,:] = unk

**2.c.** load the embeddings for original capitalization of words. If embedding for this word doesn't exists, try to find the embedding for lowercased version and associate it to the word with original capitalization. Otherwise, associate it with UNKNOWN embedding

In [13]:
gmodel_strategy_3 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word]
    elif word_lower in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_3[index,:] = unk

### 3. Implement training on batches

In [14]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


def prepare_batch(batch_sentences, vocab):
    batch_max_len = max([len(s[0]) for s in batch_sentences])

    # prepare a numpy array with the data, initializing the data with 'PAD' 
    # and all labels with -1; initializing labels to -1 differentiates tokens 
    # with tags from 'PAD' tokens
    batch_data = vocab['pad']*np.ones((len(batch_sentences), batch_max_len))
    batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

    # copy the data to the numpy array
    for j in range(len(batch_sentences)):
        cur_len = len(batch_sentences[j][0])
        batch_data[j][:cur_len] = prepare_sequence(batch_sentences[j][0], vocab)
        batch_labels[j][:cur_len] = prepare_sequence(batch_sentences[j][1], tag_vocab)

    # convert data them to torch LongTensors
    batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

    return batch_data, batch_labels

    
# get_batch(train[:10], vocab)

In [15]:
# class witch creates NER model
class LSTM_NER(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, embedding_matrix, tagset_size):
        super(LSTM_NER, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(embedding_matrix.shape[0], embedding_dim).\
            from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float), freeze=True)            
        self.word_embeddings.weight.requires_grad = False

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.fc = nn.Linear(2*hidden_dim, tagset_size)
        
        # field to count epochs
        self.epoch_counter = 0
        

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        
        lstm_out, _ = self.lstm(embeds)
        
        lstm_out = lstm_out.view(-1, lstm_out.shape[2])
        
        fc_out = self.fc(lstm_out)
        
        tag_scores = F.log_softmax(fc_out, dim=1)
        return tag_scores

In [16]:
# dimension of hidden layer
HIDDEN_DIM = 100

# batch size
BATCH_SIZE = 64


def loss_function(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels >= 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).data.item())

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens


# create model with corresponded embeddings strategy
def get_model(model_strategy):

    model = LSTM_NER(EMBEDDING_DIM, HIDDEN_DIM, model_strategy, len(tag_vocab))
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # check scores before training
    with torch.no_grad():
        inputs, labels = prepare_batch(train[:BATCH_SIZE], vocab)
        tag_scores = model(inputs)
        
    return model, optimizer

In [17]:
# I checked different methods how to take batches.
# This one take random dubset from dataset
def get_batch(b_size, data):
    batch_sentences = random.sample(data, b_size)
    
    return prepare_batch(batch_sentences, vocab)

### 4. Implement the calculation of token-level Precision / Recall / F1 / F0.5 scores for all classes in average.

I compared 2 approaches (micro- and macro- average) for model evaluation. As we have one big tag-class ('O') and model very often predicts tags as 'O', micro-average F-score always gives good enough result even if the model started training and always returns 'O'.

After that I added a macro-average approach to take into account each tag-class equivalent.

The function below returns F1 macro and micro scores.

In [18]:
F_SCORE = 1

# return F score, loss and global precision
def validate_model(model, dataset):
    
    size = len(dataset)
    
    # init counter for TP, FP, FN and each tag-class
    c_tp = np.zeros(len(tag_vocab), np.int32)
    c_fp = np.zeros(len(tag_vocab), np.int32)
    c_fn = np.zeros(len(tag_vocab), np.int32)
    
    # total loss
    loss = 0
    
    # calculate global precision
    tp = 0
    total = 0
    
    # validate each sentence of the test set separately
    for i in range(size):    
        sentence_in, targets = prepare_batch(dataset[i:i+1], vocab)

        # model scores
        tag_scores = model(sentence_in)
        
        # take each word
        for k, tag_score in enumerate(tag_scores):
            
            # the index of the max value corresponds to the index of tag in global tag_vocab
            max_val, prediction = tag_score.max(0)
            prediction = prediction.item()
            
            # labeled tag value
            target = targets[0][k].item()
                
            # if tag is predicted correctly
            if prediction == target:
                # increase TP value for corresponded tag
                c_tp[target] += 1
                # increase global TP value
                tp += 1
            else:
                # increase FP for predicted tag
                c_fp[prediction] += 1
                # increase FN for true (target) tag
                c_fn[target] += 1
            
            total += 1
        
        # increase loss function
        loss += loss_function(tag_scores, targets).item()
        
    # to prevent warning of dividing by zero
    with np.errstate(all='ignore'):

        # take macro average for precision and recall
        pr = np.mean(np.nan_to_num(c_tp / (c_tp+c_fp) ))
        rc = np.mean(np.nan_to_num(c_tp / (c_tp+c_fn) ))
        
    # calculate F score
    f_score_macro = (1 + F_SCORE**2) * pr * rc / (F_SCORE**2 * pr + rc)
    f_score_micro = tp / total

    return f_score_macro, f_score_micro, loss / size

# validate_model(model3, test)

Here I decided to sort train set by sentence length to minimize number of paddings in the batch. It increased the result significantly.

In [25]:
train_sorted = sorted(train, key=lambda item: (len(item[0])))

def train_model(model, optimizer, epochs = 5):
    
    steps = len(train) // BATCH_SIZE + 1
    
    for epoch in range(epochs):
        loss_sum = 0
        print("Epoch: {}".format(model.epoch_counter))
        model.epoch_counter += 1
        # for step in tqdm_notebook(range(steps)):
        for step in range(steps):
            # clear gradients out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.

            # sentence_in, targets = get_batch(BATCH_SIZE, train)        
            sentence_in, targets = prepare_batch(train_sorted[step * BATCH_SIZE: (step+1) * BATCH_SIZE], vocab)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

            loss_sum += loss

        avg_epoch_loss = np.round((loss_sum / steps).item(), 4)
        dev_epoch_f1_macro, dev_epoch_f1_micro, dev_epoch_loss = np.round(validate_model(model, dev), 4)

        print("""    Train loss       :{}
    Validation loss  :{}
    Validation F1 macro :{}
    Validation F1 micro :{}
    ____________________________________________________________
        """.\
        format(avg_epoch_loss, dev_epoch_loss, dev_epoch_f1_macro, dev_epoch_f1_micro))
        
#     return model

In [26]:
model1, optimizer1 = get_model(gmodel_strategy_1)
model2, optimizer2 = get_model(gmodel_strategy_2)
model3, optimizer3 = get_model(gmodel_strategy_3)

In [27]:
train_model(model1, optimizer1)

Epoch: 0
    Train loss       :0.8805
    Validation loss  :1.0424
    Validation F1 macro :0.101
    Validation F1 micro :0.8325
    ____________________________________________________________
        
Epoch: 1
    Train loss       :0.741
    Validation loss  :0.8954
    Validation F1 macro :0.101
    Validation F1 micro :0.8325
    ____________________________________________________________
        
Epoch: 2
    Train loss       :0.6961
    Validation loss  :0.7991
    Validation F1 macro :0.101
    Validation F1 micro :0.8325
    ____________________________________________________________
        
Epoch: 3
    Train loss       :0.6752
    Validation loss  :0.7601
    Validation F1 macro :0.101
    Validation F1 micro :0.8325
    ____________________________________________________________
        
Epoch: 4
    Train loss       :0.6668
    Validation loss  :0.7426
    Validation F1 macro :0.101
    Validation F1 micro :0.8325
    ___________________________________________________

In [28]:
train_model(model2, optimizer2)

Epoch: 0
    Train loss       :0.9203
    Validation loss  :1.1994
    Validation F1 macro :0.101
    Validation F1 micro :0.8325
    ____________________________________________________________
        
Epoch: 1
    Train loss       :0.737
    Validation loss  :1.0403
    Validation F1 macro :0.1731
    Validation F1 micro :0.8392
    ____________________________________________________________
        
Epoch: 2
    Train loss       :0.6512
    Validation loss  :0.841
    Validation F1 macro :0.2242
    Validation F1 micro :0.8464
    ____________________________________________________________
        
Epoch: 3
    Train loss       :0.5799
    Validation loss  :0.7114
    Validation F1 macro :0.2642
    Validation F1 micro :0.8631
    ____________________________________________________________
        
Epoch: 4
    Train loss       :0.5225
    Validation loss  :0.6388
    Validation F1 macro :0.2841
    Validation F1 micro :0.8736
    ________________________________________________

In [29]:
train_model(model3, optimizer3)

Epoch: 0
    Train loss       :0.9237
    Validation loss  :1.1852
    Validation F1 macro :0.101
    Validation F1 micro :0.8325
    ____________________________________________________________
        
Epoch: 1
    Train loss       :0.7388
    Validation loss  :1.0374
    Validation F1 macro :0.1749
    Validation F1 micro :0.8409
    ____________________________________________________________
        
Epoch: 2
    Train loss       :0.6589
    Validation loss  :0.8439
    Validation F1 macro :0.1938
    Validation F1 micro :0.8478
    ____________________________________________________________
        
Epoch: 3
    Train loss       :0.5922
    Validation loss  :0.7165
    Validation F1 macro :0.2604
    Validation F1 micro :0.8642
    ____________________________________________________________
        
Epoch: 4
    Train loss       :0.5359
    Validation loss  :0.6436
    Validation F1 macro :0.2789
    Validation F1 micro :0.8722
    ______________________________________________

### 5. Provide the report the performances

As we can see from the training process above, **model1**, which corresponds to the embedding building strategy 1, does not increase F1 scores despite of the decreasing the loss function.

**model2** and **model3** show very similar training progress but the last one is slightly better.

Let's compare results on the **test** set.

In [30]:
np.round(validate_model(model1, test), 4)

array([0.1005, 0.8253, 0.7863])

In [31]:
np.round(validate_model(model2, test), 4)

array([0.2805, 0.8679, 0.6471])

In [32]:
np.round(validate_model(model3, test), 4)

array([0.2816, 0.8693, 0.6512])

The best result is shown by **model3**. F1 micro score is **86.9%** and F1 macro score is **28.2%**.

Let's continue **model3** training and see the best possible result.

In [None]:
train_model(model3, optimizer3, epochs=30)

Epoch: 5
