# Assignment 3
Training a neural named entity recognition (NER) tagger 

In [None]:
import torch
import torch.nn as nn

# additional packages
import os,io
import numpy as np
import pandas as pd
from tabulate import tabulate
from random import shuffle

# Device string used by the rest of the notebook: the GPU when PyTorch reports one, otherwise the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


# Disabling autoscrolling for long output
# %%javascript
# IPython.OutputArea.prototype._should_scroll = function(lines) {
#     return false;
# }


# Mount Google Drive inside the Colab VM (interactive: asks for an authorization code).
from google.colab import drive  
drive.mount(r'/content/drive/',force_remount=True) 

# Folder that holds the assignment files; built relative to the current working directory.
data_dir = os.getcwd() + '/' + 'drive/My Drive/Colab Notebooks/NLP/HW3'
# Bare expression so the notebook echoes the resolved path as the cell output.
data_dir

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


'/content/drive/My Drive/Colab Notebooks/NLP/HW3'

In this assignment you are required to build a full training and testing pipeline for a neural sequential tagger for named entities, using an LSTM.

The dataset that you will be working on is called ReCoNLL 2003, which is a corrected version of the CoNLL 2003 dataset: https://www.clips.uantwerpen.be/conll2003/ner/

[Train data](https://drive.google.com/file/d/1hG66e_OoezzeVKho1w7ysyAx4yp0ShDz/view?usp=sharing)

[Dev data](https://drive.google.com/file/d/1EAF-VygYowU1XknZhvzMi2CID65I127L/view?usp=sharing)

[Test data](https://drive.google.com/file/d/16gug5wWnf06JdcBXQbcICOZGZypgr4Iu/view?usp=sharing)

As you can see, the annotated texts are labeled according to the IOB annotation scheme, for 3 entity types: Person, Organization, Location.

**Task 1:** Write a function for reading the data from a single file (one of those provided above). The function receives a filepath and encodes every sentence individually as a pair of lists: one list contains the words and the other contains the tags. Each pair of lists is added to a general list (data), which is returned from the function.

In [None]:
def read_data(filepath, base_dir=None):
    """Read a ``word tag``-per-line file into a list of sentences.

    Each non-blank line must hold exactly two whitespace-separated fields:
    the word (case is kept as-is) and its tag. Blank or whitespace-only
    lines separate sentences; runs of separators and separators at the start
    of the file never produce empty sentences.

    Args:
        filepath: path of the file, relative to ``base_dir``.
        base_dir: directory the path is resolved against. Defaults to the
            notebook-level ``data_dir``.

    Returns:
        A list of ``(words, tags)`` tuples, one per sentence, where ``words``
        and ``tags`` are equally long lists of strings.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    if base_dir is None:
        base_dir = data_dir

    data = []
    words, tags = [], []

    with open(base_dir + '/' + filepath, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()

            # A blank line closes the current sentence, if there is one.
            if not line:
                if words:
                    data.append((words, tags))
                    words, tags = [], []
                continue

            fields = line.split()
            if len(fields) != 2:
                raise ValueError(
                    "{0}:{1}: expected 'word tag', got {2!r}".format(filepath, line_no, raw_line)
                )
            words.append(fields[0])
            tags.append(fields[1])

    # The file may end without a trailing blank line.
    if words:
        data.append((words, tags))

    return data

# # Google Drive
# train = read_data('https://drive.google.com/file/d/1hG66e_OoezzeVKho1w7ysyAx4yp0ShDz/view?usp=sharing')
# dev = read_data('https://drive.google.com/file/d/1EAF-VygYowU1XknZhvzMi2CID65I127L/view?usp=sharing')
# test = read_data('https://drive.google.com/file/d/16gug5wWnf06JdcBXQbcICOZGZypgr4Iu/view?usp=sharing')

# Load the three splits with read_data; each one is a list of (words, tags) sentence pairs.
train,dev,test = read_data('data/connl03_train.txt'), read_data('data/connl03_dev.txt'), read_data('data/connl03_test.txt')

The following Vocab class can serve as a dictionary that maps words and tags to IDs. The UNK_TOKEN should be used for words that are not part of the training data.

In [None]:
UNK_TOKEN = 0


class Vocab:
    """Two-way lookup tables between words/tags and integer ids.

    Word tables start with only the ``"__unk__"`` placeholder (id
    ``UNK_TOKEN``) and grow as words are indexed; the tag tables are fixed.
    """

    def __init__(self):
        # Word tables: the unknown-word placeholder owns id 0.
        self.word2id = {"__unk__": UNK_TOKEN}
        self.id2word = {UNK_TOKEN: "__unk__"}
        self.n_words = 1

        # Tag tables: a tag's id is its position in this list.
        tag_names = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
        self.tag2id = {name: idx for idx, name in enumerate(tag_names)}
        self.id2tag = dict(enumerate(tag_names))

    def index_word(self, w):
        """Return the id of ``w``, registering it under the next free id if new."""
        known_id = self.word2id.get(w)
        if known_id is not None:
            return known_id

        new_id = self.n_words
        self.word2id[w] = new_id
        self.id2word[new_id] = w
        self.n_words = new_id + 1
        return new_id

    def index_words(self, words):
        """Return the ids of ``words`` in order (unseen words are registered)."""
        return list(map(self.index_word, words))

    def index_tags(self, tags):
        """Return the ids of ``tags`` in order; an unknown tag raises KeyError."""
        return [self.tag2id[t] for t in tags]
            

**Task 2:** Write a function prepare_data that takes one of the [train, dev, test] and the Vocab instance, for converting each pair of (words,tags) to a pair of indexes. Each pair should be added to data_sequences, which will be returned back from the function.

In [None]:
# Single shared vocabulary; it is filled in by the prepare_data calls below.
vocab = Vocab()

def prepare_data(data, vocab, device=None):
    """Convert ``(words, tags)`` sentences into pairs of index tensors.

    Args:
        data: iterable of ``(words, tags)`` pairs, as returned by ``read_data``.
        vocab: object providing ``index_words(words)`` and ``index_tags(tags)``.
            With the notebook's ``Vocab``, ``index_words`` assigns a new id to
            every word it has not seen, so calling this function on any
            split also grows the vocabulary.
        device: device the tensors are created on. Defaults to the
            notebook-level ``DEVICE``.

    Returns:
        ``(data_sequences, vocab)`` where ``data_sequences`` is a list of
        ``(word_ids, tag_ids)`` tuples of 1-D ``torch.long`` tensors and
        ``vocab`` is the same object that was passed in.
    """
    if device is None:
        device = DEVICE

    data_sequences = []
    for words, tags in data:
        # Build the tensors directly on the target device instead of creating
        # them on the host and copying afterwards.
        word_ids = torch.tensor(vocab.index_words(words), dtype=torch.long, device=device)
        tag_ids = torch.tensor(vocab.index_tags(tags), dtype=torch.long, device=device)
        data_sequences.append((word_ids, tag_ids))

    return data_sequences, vocab

# Index the three splits in order, threading the same vocab object through every call.
train_sequences, vocab = prepare_data(train, vocab)
dev_sequences, vocab = prepare_data(dev, vocab)
test_sequences, vocab = prepare_data(test, vocab)

**Task 3:** Write NERNet, a PyTorch Module for labeling words with NER tags. 

*input_size:* the size of the vocabulary

*embedding_size:* the size of the embeddings

*hidden_size:* the LSTM hidden size

*output_size:* the number of tags we are predicting

*n_layers:* the number of layers we want to use in LSTM

*directions:* can be 1 or 2, indicating a unidirectional or bidirectional LSTM, respectively

The input for your forward function should be a single sentence tensor.

In [None]:
class NERNet(nn.Module):
    """Embedding -> LSTM -> Linear tagger that scores every token of one sentence.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        embedding_size: dimensionality of each word embedding.
        hidden_size: LSTM hidden size (per direction).
        output_size: number of tags to score.
        n_layers: number of stacked LSTM layers.
        directions: 1 for a unidirectional LSTM, 2 for a bidirectional one.

    Raises:
        ValueError: if ``directions`` is neither 1 nor 2.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions):
        super(NERNet, self).__init__()

        # Any other value would build a Linear layer whose input width does
        # not match what the LSTM emits.
        if directions not in (1, 2):
            raise ValueError('directions must be 1 or 2, got {!r}'.format(directions))

        # Keep the hyper-parameters around so callers can read them back.
        self.input_size = input_size
        self.output_size = output_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.directions = directions
        self.isTwoDirections = (directions == 2)

        # Embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)

        # LSTM layer(s)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=self.isTwoDirections)

        # Linear layer; a bidirectional LSTM concatenates both directions, so
        # its output is twice as wide.
        self.out = nn.Linear(hidden_size * directions, output_size)

    def forward(self, input_sentence):
        """Score every token of a single sentence.

        Args:
            input_sentence: 1-D tensor of word ids, one per token.

        Returns:
            Tensor of shape ``(len(input_sentence), output_size)`` holding
            unnormalised tag scores.
        """
        # Follow the module's own device rather than a notebook global; this
        # is a no-op when the input already lives there.
        sentence = input_sentence.to(self.embedding.weight.device)

        embeds = self.embedding(sentence)

        # nn.LSTM expects (seq_len, batch, features); insert a batch axis of 1.
        lstm_out, _ = self.lstm(embeds.unsqueeze(1))

        # Drop the batch axis again before projecting onto the tag space.
        return self.out(lstm_out.squeeze(1))


**Task 4:** write a training loop, which takes a model (instance of NERNet) and number of epochs to train on. The loss is always CrossEntropyLoss and the optimizer is always Adam.

In [None]:
def train_loop(model, n_epochs, lr=0.0001, sequences=None):
    """Train ``model`` one sentence at a time with Adam and cross-entropy loss.

    Args:
        model: module mapping a sentence tensor to per-token tag scores.
        n_epochs: number of full passes over the training sentences.
        lr: Adam learning rate.
        sequences: list of ``(sentence, tags)`` tensor pairs to train on.
            Defaults to the notebook-level ``train_sequences``.

    Returns:
        None. The model's parameters are updated in place.
    """
    if sequences is None:
        sequences = train_sequences

    # Shuffle a private copy so the caller's list keeps its order.
    order = list(sequences)

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Make sure the model is in training mode, e.g. after an evaluation pass.
    model.train()

    # Fixed number of epochs; there is no convergence-based early stop.
    for _ in range(n_epochs):
        # Re-shuffle every epoch so sentences are visited in a new order.
        shuffle(order)
        for sentence, tags in order:
            optimizer.zero_grad()
            loss = criterion(model(sentence), tags)
            loss.backward()
            optimizer.step()
     

**Task 5:** write an evaluation loop on a trained model, using the dev and test datasets. This function prints the true positive rate (TPR), also known as recall, and the opposite of the false positive rate (FPR), also known as precision, for each label separately (7 labels in total), and for all 6 labels (except O) together. The caption argument should be used for printing: include it as a prefix of whatever you print.

In [None]:
def _confusion_matrix(model, sequences, n_labels):
    """Count (gold tag, predicted tag) pairs; rows are gold tags, columns are predictions."""
    matrix = torch.zeros(n_labels, n_labels)
    with torch.no_grad():
        for inputs, labels in sequences:
            preds = model(inputs).max(1).indices
            # Convert to Python ints once per sentence instead of indexing
            # the matrix with one 0-d tensor per token.
            for label, pred in zip(labels.tolist(), preds.tolist()):
                matrix[label, pred] += 1
    return matrix


def _precision_recall(matrix):
    """Return per-label precision, per-label recall, and pooled non-O precision/recall."""
    correct = matrix.diag()
    predicted_totals = matrix.sum(0)  # column sums: how often each tag was predicted
    gold_totals = matrix.sum(1)       # row sums: how often each tag is the gold tag

    precision = correct / predicted_totals
    recall = correct / gold_totals

    # Pool the six entity tags (every id except 0, which is "O"). The
    # denominators come from the full matrix so that confusions with "O"
    # still count as errors.
    entity_correct = correct[1:].sum()
    entity_precision = entity_correct / predicted_totals[1:].sum()
    entity_recall = entity_correct / gold_totals[1:].sum()
    return precision, recall, entity_precision, entity_recall


def evaluate(model, caption=''):
    """Print per-tag precision and recall of ``model`` on the dev and test sets.

    Reads the notebook-level ``vocab``, ``dev_sequences`` and
    ``test_sequences``. The printed table has one column per tag plus an
    ``All-Exapct-O`` column pooling the six entity tags, and four rows (dev
    and test, precision and recall). Scores whose denominator is zero are
    shown as 0.

    Args:
        model: trained tagger mapping a sentence tensor to per-token scores.
        caption: text printed in bold above the table.

    Returns:
        None.
    """
    n_labels = len(vocab.tag2id)

    # Evaluate in inference mode, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        dev_matrix = _confusion_matrix(model, dev_sequences, n_labels)
        test_matrix = _confusion_matrix(model, test_sequences, n_labels)
    finally:
        model.train(was_training)

    results = [
        ('dev ', _precision_recall(dev_matrix)),
        ('test', _precision_recall(test_matrix)),
    ]

    # Construct a display table: first the per-tag rows ...
    df = pd.DataFrame(columns=vocab.tag2id.keys())
    for split, (precision, recall, _, _) in results:
        df.loc[split + ' Precision'] = precision.tolist()
        df.loc[split + ' Recall'] = recall.tolist()

    # ... then the pooled column. It is added afterwards because the per-tag
    # rows are assigned from lists with exactly one value per tag.
    for split, (_, _, entity_precision, entity_recall) in results:
        df.loc[split + ' Precision', 'All-Exapct-O'] = float(entity_precision)
        df.loc[split + ' Recall', 'All-Exapct-O'] = float(entity_recall)

    # in case of dividing by zero
    df = df.fillna(0)

    # BOLD without importing additional packages
    print('\x1b[1;11;11;11m'+ caption + '\x1b[0m')
    print(tabulate(df,headers='keys',tablefmt='psql'))
    print()

**Task 6:** Train and evaluate a few models, all with embedding_size=300, and with the following hyper parameters (you may use that as captions for the models as well):

Model 1: (hidden_size: 500, n_layers: 1, directions: 1)

Model 2: (hidden_size: 500, n_layers: 2, directions: 1)

Model 3: (hidden_size: 500, n_layers: 3, directions: 1)

Model 4: (hidden_size: 500, n_layers: 1, directions: 2)

Model 5: (hidden_size: 500, n_layers: 2, directions: 2)

Model 6: (hidden_size: 500, n_layers: 3, directions: 2)

Model 7: (hidden_size: 800, n_layers: 1, directions: 2)

Model 8: (hidden_size: 800, n_layers: 2, directions: 2)

Model 9: (hidden_size: 800, n_layers: 3, directions: 2)

**Get a dict with all the params instructed above**

In [None]:
def get_models_params():
    """Build the constructor arguments for the models listed in Task 6.

    The task lists nine models: every layer count in {1, 2, 3} for a
    500-unit unidirectional LSTM, a 500-unit bidirectional LSTM and an
    800-unit bidirectional LSTM, numbered in that order.

    Reads the notebook-level ``vocab`` for the vocabulary and tag-set sizes.

    Returns:
        Dict mapping the model number (1-based) to a dict with the keys
        ``input_size``, ``output_size``, ``embedding_size``, ``hidden_size``,
        ``n_layers`` and ``directions``.
    """
    input_size = len(vocab.word2id)
    output_size = len(vocab.tag2id)

    embedding_size = 300
    n_layers_options = (1, 2, 3)
    # (hidden_size, directions) pairs in the order the task enumerates them;
    # the 800-unit models are bidirectional only.
    configurations = ((500, 1), (500, 2), (800, 2))

    models = {}
    for hidden_size, directions in configurations:
        for n_layers in n_layers_options:
            models[len(models) + 1] = {
                'input_size': input_size,
                'output_size': output_size,
                'embedding_size': embedding_size,
                'hidden_size': hidden_size,
                'n_layers': n_layers,
                'directions': directions,
            }
    return models

**Train & Eval models**

In [None]:
EPOCHS = 10
MULTIPLE_CUDA = torch.cuda.device_count()>1 

models = get_models_params()

# Train each configuration from scratch, then print its dev/test scores.
for k, params in models.items():
    # Use the notebook-wide DEVICE so the cell also runs without a GPU.
    model = NERNet(**params).to(DEVICE)

    if MULTIPLE_CUDA:
        # NOTE(review): nn.DataParallel chunks inputs along dim 0, which is
        # the token axis for the 1-D sentence tensors used here — confirm
        # that multi-GPU behaviour is what is wanted.
        model = nn.DataParallel(model)

    train_loop(model,n_epochs=EPOCHS)

    # Read the hyper-parameters from the params dict: once the model is
    # wrapped in DataParallel they are no longer attributes of `model`.
    caption = 'Model {0}: (hidden_size: {1}, n_layers: {2}, directions: {3})'.format(k,params['hidden_size'],params['n_layers'],params['directions'])
    evaluate(model,caption)

[1;11;11;11mModel 1: (hidden_size: 500, n_layers: 1, directions: 1)[0m
+----------------+----------+----------+----------+----------+----------+----------+----------+----------------+
|                |        O |    B-PER |    I-PER |    B-LOC |    I-LOC |    B-ORG |    I-ORG |   All-Exapct-O |
|----------------+----------+----------+----------+----------+----------+----------+----------+----------------|
| dev  Precision | 0.977067 | 0.605    | 0.630573 | 0.666667 | 0.391304 | 0.565476 | 0.37069  |       0.862434 |
| dev  Recall    | 0.977067 | 0.605    | 0.630573 | 0.666667 | 0.391304 | 0.565476 | 0.37069  |       0.862434 |
| test Precision | 0.97457  | 0.573733 | 0.621622 | 0.670554 | 0.509434 | 0.534286 | 0.325    |       0.860274 |
| test Recall    | 0.916774 | 0.821782 | 0.867925 | 0.809859 | 1        | 0.58805  | 0.550847 |       0.860274 |
+----------------+----------+----------+----------+----------+----------+----------+----------+----------------+

[1;11;11;11mModel 2: 

**Task 7:** Download the GloVe embeddings from https://nlp.stanford.edu/projects/glove/ (use the 300-dim vectors from glove.6B.zip). Then initialize the nn.Embedding module in your NERNet with these embeddings, so that you can start your training with pre-trained vectors. Repeat Task 6 and print the results for each model.

Note: make sure that the vectors are aligned with the IDs in your Vocab; in other words, make sure that, for example, the word with ID 0 is the first vector in the GloVe matrix of vectors that you initialize nn.Embedding with. For a discussion on how to do that, check this link:
https://discuss.pytorch.org/t/can-we-use-pre-trained-word-embeddings-for-weight-initialization-in-nn-embedding/1222

**Get Glove Pretrained Embeddings Weights on google colab**


In [None]:
# Shell escape: download the GloVe 6B archive into the current working directory.
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-06-27 12:16:38--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-06-27 12:16:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-06-27 12:16:39--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [None]:
# Shell escape: list the working directory to check the archive is there.
!ls

drive  glove.6B.zip  sample_data


In [None]:
# Shell escape: extract the archive (50d, 100d, 200d and 300d vector files).
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# Shell escape: list the working directory to check the extracted files.
!ls

drive		   glove.6B.200d.txt  glove.6B.50d.txt	sample_data
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip


In [None]:
# Shell escapes: delete the vector files that are not used (only the 300d file is kept), then list what is left.
!rm glove.6B.200d.txt
!rm glove.6B.50d.txt
!rm glove.6B.100d.txt
!ls

drive  glove.6B.300d.txt  glove.6B.zip	sample_data


In [None]:
# Path of the 300-dimensional GloVe vectors, relative to the current working directory.
GLOVE_PATH = 'glove.6B.300d.txt'

**Helper Function**


In [None]:
def load_glove_embeddings(path, word2idx=None, embedding_dim=300, lowercase_fallback=True):
    """Build an embedding matrix aligned with ``word2idx`` from a GloVe text file.

    Each line of the file is expected to be a token followed by
    ``embedding_dim`` space-separated numbers. Row ``i`` of the result holds
    the vector of the word whose id is ``i``; words without a vector keep an
    all-zero row.

    Args:
        path: path of the GloVe text file (UTF-8).
        word2idx: mapping from word to row index. Defaults to the
            notebook-level ``vocab.word2id``, looked up at call time.
        embedding_dim: number of values per vector.
        lowercase_fallback: when True, a word with no exact match in the file
            receives the vector of its lowercase form if the file has one.
            The glove.6B vectors are uncased, so without this capitalised
            words would keep zero rows. An exact match always wins over the
            fallback.

    Returns:
        ``torch.float32`` tensor of shape ``(len(word2idx), embedding_dim)``.

    Raises:
        ValueError: if a line for a needed word does not carry exactly
            ``embedding_dim`` values.
    """
    if word2idx is None:
        word2idx = vocab.word2id

    # Allocate float32 up front instead of converting a float64 buffer later.
    embeddings = np.zeros((len(word2idx), embedding_dim), dtype='float32')

    # lowercase form -> ids of the words that differ from it only by case
    cased_variants = {}
    if lowercase_fallback:
        for vocab_word, vocab_index in word2idx.items():
            lowered = vocab_word.lower()
            if lowered != vocab_word:
                cased_variants.setdefault(lowered, []).append(vocab_index)

    # Rows filled from an exact match; the fallback must not overwrite them.
    exact_hits = set()

    with open(path, encoding='utf-8') as f:
        # Stream the file line by line; it is too large to hold in memory.
        for line in f:
            values = line.split()
            if not values:
                continue

            word = values[0]
            index = word2idx.get(word)
            variants = cased_variants.get(word, ())
            # Compare against None: id 0 is a valid row.
            if index is None and not variants:
                continue

            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    'expected {0} values for {1!r}, found {2}'.format(embedding_dim, word, len(values) - 1)
                )
            vector = np.array(values[1:], dtype='float32')

            if index is not None:
                embeddings[index] = vector
                exact_hits.add(index)
            for variant_index in variants:
                if variant_index not in exact_hits:
                    embeddings[variant_index] = vector

    return torch.from_numpy(embeddings)

**Train & Eval**

In [None]:
EPOCHS = 10
MULTIPLE_CUDA = torch.cuda.device_count()>1 

models = get_models_params()

# Parse the GloVe file once; the matrix is the same for every model and,
# being frozen, is never modified by training.
weights = load_glove_embeddings(GLOVE_PATH)

# Train each configuration on top of the pretrained embeddings, then print its dev/test scores.
for k, params in models.items():
    model = NERNet(**params)

    # load pretrained weights & freeze them (no gradient updates)
    model.embedding = nn.Embedding.from_pretrained(weights,freeze=True)

    # Use the notebook-wide DEVICE so the cell also runs without a GPU.
    model.to(DEVICE)

    if MULTIPLE_CUDA:
        # NOTE(review): nn.DataParallel chunks inputs along dim 0, which is
        # the token axis for the 1-D sentence tensors used here — confirm
        # that multi-GPU behaviour is what is wanted.
        model = nn.DataParallel(model)

    train_loop(model,n_epochs=EPOCHS)

    # Read the hyper-parameters from the params dict: once the model is
    # wrapped in DataParallel they are no longer attributes of `model`.
    caption = 'Model {0}: (hidden_size: {1}, n_layers: {2}, directions: {3})'.format(k,params['hidden_size'],params['n_layers'],params['directions'])
    evaluate(model,caption)

[1;11;11;11mModel 1: (hidden_size: 500, n_layers: 1, directions: 1)[0m
+----------------+----------+----------+----------+----------+----------+----------+----------+----------------+
|                |        O |    B-PER |    I-PER |    B-LOC |    I-LOC |    B-ORG |    I-ORG |   All-Exapct-O |
|----------------+----------+----------+----------+----------+----------+----------+----------+----------------|
| dev  Precision | 0.952196 | 0.545    | 0.821656 | 0.644809 | 0.434783 | 0.607143 | 0.37069  |       0.750367 |
| dev  Recall    | 0.952196 | 0.545    | 0.821656 | 0.644809 | 0.434783 | 0.607143 | 0.37069  |       0.750367 |
| test Precision | 0.95249  | 0.479263 | 0.810811 | 0.620991 | 0.471698 | 0.648571 | 0.375    |       0.721168 |
| test Recall    | 0.953361 | 0.809339 | 0.574163 | 0.67619  | 0.543478 | 0.426692 | 0.657895 |       0.721168 |
+----------------+----------+----------+----------+----------+----------+----------+----------+----------------+

[1;11;11;11mModel 2: 

**Good luck!**