In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import numpy as np
from tqdm import tqdm_notebook

torch.manual_seed(1)

<torch._C.Generator at 0x7f2d48088a50>

### 1. Read and process the data.

In [2]:
path = './data/'

In [3]:
def get_data(name):
    f = open(path + name + '.txt', 'r')
    lines = f.readlines()
    
    result = []
    
    sentence = []
    tag_sentence = []
    
    for line in lines[2:]:
        if line == '\n':           
            
            if len(sentence) > 0:
                result.append((sentence, tag_sentence))
            
                sentence = []
                tag_sentence = []
            
            continue
            
        words = line.strip().split()
        
        if words[0] == '-DOCSTART-':
            continue
        
        sentence.append(words[0])
        tag_sentence.append(words[3])
            
        
    return result

In [4]:
train = get_data('train')
dev = get_data('dev')
test = get_data('test')

dataset = train + dev + test

### 2. Implement 3 strategies for loading the embeddings

In [5]:
embeddings_name = 'glove.6B.100d.txt'

def loadGloveModel(file):
    print("Loading Glove Model")
    f = open(file,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [6]:
gmodel = loadGloveModel(path + embeddings_name)

Loading Glove Model
Done. 400000  words loaded!


In [7]:
EMBEDDING_DIM = 100


In [8]:
vocab = {}
tag_vocab = {}
for sent, tags in dataset:
    for word in sent:
        if word not in vocab:
            vocab[word] = len(vocab)
    for tag in tags:
        if tag not in tag_vocab:
            tag_vocab[tag] = len(tag_vocab)

vocab['pad'] = len(vocab)
            
vocab_size = len(vocab)

In [9]:
tag_vocab

{'B-LOC': 5,
 'B-MISC': 2,
 'B-ORG': 0,
 'B-PER': 3,
 'I-LOC': 8,
 'I-MISC': 7,
 'I-ORG': 6,
 'I-PER': 4,
 'O': 1}

**2.a.** Load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, associate it with UNKNOWN embedding

In [10]:
unk = gmodel['unk']

In [11]:
gmodel_strategy_1 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    if word in gmodel:
        gmodel_strategy_1[index,:] = gmodel[word]
    else:
        gmodel_strategy_1[index,:] = unk

**2.b.** load the embeddings for lowercased capitalization of words. If embedding for this lowercased word doesn’t exists, associate it with UNKNOWN embedding

In [12]:
gmodel_strategy_2 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word_lower in gmodel:
        gmodel_strategy_2[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_2[index,:] = unk

**2.c.** load the embeddings for original capitalization of words. If embedding for this word doesn't exists, try to find the embedding for lowercased version and associate it to the word with original capitalization. Otherwise, associate it with UNKNOWN embedding

In [13]:
gmodel_strategy_3 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word]
    elif word_lower in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_3[index,:] = unk

### 3. Implement training on batches

In [14]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


def prepare_batch(batch_sentences, vocab):
    batch_max_len = max([len(s[0]) for s in batch_sentences])

    #prepare a numpy array with the data, initializing the data with 'PAD' 
    #and all labels with -1; initializing labels to -1 differentiates tokens 
    #with tags from 'PAD' tokens
    batch_data = vocab['pad']*np.ones((len(batch_sentences), batch_max_len))
    batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

    #copy the data to the numpy array
    for j in range(len(batch_sentences)):
        cur_len = len(batch_sentences[j][0])
        batch_data[j][:cur_len] = prepare_sequence(batch_sentences[j][0], vocab)
        batch_labels[j][:cur_len] = prepare_sequence(batch_sentences[j][1], tag_vocab)

    #since all data are indices, we convert them to torch LongTensors
    batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

    return batch_data, batch_labels

    
# get_batch(train[:10], vocab)

In [28]:
class LSTM_NER(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, embedding_matrix, tagset_size):
        super(LSTM_NER, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(embedding_matrix.shape[0], embedding_dim).\
            from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float), freeze=True)            
        self.word_embeddings.weight.requires_grad = False

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.fc = nn.Linear(2*hidden_dim, tagset_size)
        
        # field to count epochs
        self.epoch_counter = 0
        

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        
        lstm_out, _ = self.lstm(embeds)
        
        lstm_out = lstm_out.view(-1, lstm_out.shape[2])
        
        fc_out = self.fc(lstm_out)
        
        tag_scores = F.log_softmax(fc_out, dim=1)
        return tag_scores

In [27]:
HIDDEN_DIM = 100

BATCH_SIZE = 64

def loss_function(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels >= 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).data.item())

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens


def get_model(model_strategy):

    model = LSTM_NER(EMBEDDING_DIM, HIDDEN_DIM, model_strategy, len(tag_vocab))
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # check scores before training
    with torch.no_grad():
        inputs, labels = prepare_batch(train[:BATCH_SIZE], vocab)
        tag_scores = model(inputs)
        
    return model, optimizer

In [17]:
F_SCORE = 1

def get_batch(b_size, data):
    batch_sentences = random.sample(data, b_size)
    
    return prepare_batch(batch_sentences, vocab)


In [18]:
def validate_model(model, dataset):
    
    size = len(dataset)
    
    c_tp = np.zeros(len(tag_vocab), np.int32)
    c_fp = np.zeros(len(tag_vocab), np.int32)
    c_fn = np.zeros(len(tag_vocab), np.int32)
    
    loss = 0
    
    tp = 0
    total = 0
    
    for i in range(size):
    
        sentence_in, targets = prepare_batch(dataset[i:i+1], vocab)

        tag_scores = model(sentence_in)
        
        for k, tag_score in enumerate(tag_scores):
            
            max_val, prediction = tag_score.max(0)
            prediction = prediction.item()
            target = targets[0][k].item()
                
            if prediction == target:
                c_tp[target] += 1
                tp += 1
            else:
                c_fp[prediction] += 1
                c_fn[target] += 1
                
                
            total += 1
        
        loss += loss_function(tag_scores, targets).item()
        
    with np.errstate(all='ignore'):

        pr = np.mean(np.nan_to_num(c_tp / (c_tp+c_fp) ))
        rc = np.mean(np.nan_to_num(c_tp / (c_tp+c_fn) ))
        
    f_score = (1 + F_SCORE**2) * pr * rc / (F_SCORE**2 * pr + rc)

    return f_score, loss / size, tp/total

In [29]:
train_sorted = sorted(train, key=lambda item: (len(item[0])))

def train_model(model, optimizer, epochs = 5):
    for epoch in range(epochs):
        loss_sum = 0
        print("Epoch: {}".format(model.epoch_counter))
        model.epoch_counter += 1
        for step in tqdm_notebook(range(steps)):
            # clear gradients out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.

            # sentence_in, targets = get_batch(BATCH_SIZE, train)        
            sentence_in, targets = prepare_batch(train_sorted[step * BATCH_SIZE: (step+1) * BATCH_SIZE], vocab)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

            loss_sum += loss

        avg_epoch_loss = np.round((loss_sum / steps).item(), 4)
        dev_epoch_f1, dev_epoch_loss, dev_epoch_pr = np.round(validate_model(model, dev), 4)

        print("""    Train loss:     {}
        Validate loss:  {}
        Validate F1: {}
        Validate Pr: {}
    ____________________________________________________________
        """.\
        format(avg_epoch_loss, dev_epoch_loss, dev_epoch_f1, dev_epoch_pr))
        
#     return model

In [30]:
model1, optimizer1 = get_model(gmodel_strategy_1)
model2, optimizer2 = get_model(gmodel_strategy_2)
model3, optimizer3 = get_model(gmodel_strategy_3)

In [31]:
train_model(model1, optimizer1)

Epoch: 0


    Train loss:     0.8734
        Validate loss:  1.0532
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 1


    Train loss:     0.7403
        Validate loss:  0.8992
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 2


    Train loss:     0.6967
        Validate loss:  0.8019
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 3


    Train loss:     0.6765
        Validate loss:  0.7632
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        
Epoch: 4


    Train loss:     0.6685
        Validate loss:  0.7455
        Validate F1: 0.101
        Validate Pr: 0.8325
    ____________________________________________________________
        


LSTM_NER(
  (word_embeddings): Embedding(30290, 100)
  (lstm): LSTM(100, 100, num_layers=2, bidirectional=True)
  (fc): Linear(in_features=200, out_features=9, bias=True)
)

In [None]:
# np.round(validate_model(model1, test), 4)