In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import numpy as np
import random

torch.manual_seed(1)

<torch._C.Generator at 0x7efeba752a50>

### 1. Read and process the data.

In [2]:
path = './data/'

In [3]:
def get_data(name):
    f = open(path + name + '.txt', 'r')
    lines = f.readlines()
    
    result = []
    
    sentence = []
    tag_sentence = []
    
    for line in lines[2:]:
        if line == '\n':
            
            result.append((sentence, tag_sentence))
            
            sentence = []
            tag_sentence = []
            
            continue
            
        words = line.strip().split()
        
        sentence.append(words[0])
        tag_sentence.append(words[3])
            
        
    return result

In [4]:
train = get_data('train')
dev = get_data('dev')
test = get_data('test')

dataset = train + dev + test

### 2. Implement 3 strategies for loading the embeddings

In [5]:
embeddings_name = 'glove.6B.100d.txt'

def loadGloveModel(file):
    print("Loading Glove Model")
    f = open(file,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [6]:
gmodel = loadGloveModel(path + embeddings_name)

Loading Glove Model
Done. 400000  words loaded!


In [7]:
EMBEDDING_DIM = 100


In [8]:
vocab = {}
tag_vocab = {}
for sent, tags in dataset:
    for word in sent:
        if word not in vocab:
            vocab[word] = len(vocab)
    for tag in tags:
        if tag not in tag_vocab:
            tag_vocab[tag] = len(tag_vocab)

vocab['pad'] = len(vocab)
            
vocab_size = len(vocab)

In [9]:
tag_vocab

{'B-LOC': 5,
 'B-MISC': 2,
 'B-ORG': 0,
 'B-PER': 3,
 'I-LOC': 8,
 'I-MISC': 7,
 'I-ORG': 6,
 'I-PER': 4,
 'O': 1}

**2.a.** Load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, associate it with UNKNOWN embedding

In [10]:
unk = gmodel['unk']

In [11]:
gmodel_strategy_1 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    if word in gmodel:
        gmodel_strategy_1[index,:] = gmodel[word]
    else:
        gmodel_strategy_1[index,:] = unk

**2.b.** load the embeddings for lowercased capitalization of words. If embedding for this lowercased word doesn’t exists, associate it with UNKNOWN embedding

In [12]:
gmodel_strategy_2 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word_lower in gmodel:
        gmodel_strategy_2[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_2[index,:] = unk

**2.c.** load the embeddings for original capitalization of words. If embedding for this word doesn't exists, try to find the embedding for lowercased version and associate it to the word with original capitalization. Otherwise, associate it with UNKNOWN embedding

In [13]:
gmodel_strategy_3 = np.zeros((vocab_size, EMBEDDING_DIM))

for word, index in vocab.items():
    word_lower = word.lower()
    if word in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word]
    elif word_lower in gmodel:
        gmodel_strategy_3[index,:] = gmodel[word_lower]
    else:
        gmodel_strategy_3[index,:] = unk

### 3. Implement training on batches

In [14]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


def prepare_batch(batch_sentences, vocab):
    batch_max_len = max([len(s[0]) for s in batch_sentences])

    #prepare a numpy array with the data, initializing the data with 'PAD' 
    #and all labels with -1; initializing labels to -1 differentiates tokens 
    #with tags from 'PAD' tokens
    batch_data = vocab['pad']*np.ones((len(batch_sentences), batch_max_len))
    batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

    #copy the data to the numpy array
    for j in range(len(batch_sentences)):
        cur_len = len(batch_sentences[j][0])
        batch_data[j][:cur_len] = prepare_sequence(batch_sentences[j][0], vocab)
        batch_labels[j][:cur_len] = prepare_sequence(batch_sentences[j][1], tag_vocab)

    #since all data are indices, we convert them to torch LongTensors
    batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

    #convert Tensors to Variables
#     batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
    return batch_data, batch_labels

    
# get_batch(train[:10], vocab)

In [15]:
class LSTM_NER(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, embedding_matrix, tagset_size):
        super(LSTM_NER, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(embedding_matrix.shape[0], embedding_dim).\
            from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))            
        self.word_embeddings.weight.requires_grad = False

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        # self.word_embeddings = nn.Embedding(embedding_matrix.shape[0], embedding_dim)
        
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        # self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        self.fc = nn.Linear(hidden_dim, tagset_size)
        

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        
        # lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        lstm_out, _ = self.lstm(embeds)
        
        lstm_out = lstm_out.view(-1, lstm_out.shape[2])
        
        fc_out = self.fc(lstm_out)
        
        tag_scores = F.log_softmax(fc_out, dim=1)
        return tag_scores

In [16]:
HIDDEN_DIM = 100

BATCH_SIZE = 64

model = LSTM_NER(EMBEDDING_DIM, HIDDEN_DIM, gmodel_strategy_3, len(tag_vocab))
# loss_function = nn.NLLLoss()

def loss_function(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels >= 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).data.item())

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens


optimizer = optim.SGD(model.parameters(), lr=0.01)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
#     inputs = prepare_sequence(train[0][0], vocab)
    inputs, labels = prepare_batch(train[:BATCH_SIZE], vocab)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-2.1248, -2.1080, -2.2440,  ..., -2.1052, -2.2949, -2.2587],
        [-2.1333, -2.1313, -2.2185,  ..., -2.1396, -2.2498, -2.2420],
        [-2.1532, -2.0811, -2.2415,  ..., -2.1071, -2.2235, -2.2708],
        ...,
        [-2.2264, -2.1293, -2.2512,  ..., -2.1383, -2.1746, -2.1226],
        [-2.2264, -2.1293, -2.2512,  ..., -2.1383, -2.1746, -2.1226],
        [-2.2264, -2.1293, -2.2512,  ..., -2.1383, -2.1746, -2.1226]])


In [17]:
def get_batch(b_size, data):
    batch_sentences = random.sample(data, b_size)
    
    return prepare_batch(batch_sentences, vocab)

# get_batch(BATCH_SIZE, train)

In [19]:

steps = len(train) // BATCH_SIZE + 1

random.seed(42)

for epoch in range(15):  # again, normally you would NOT do 300 epochs, it is toy data
    for step in range(steps):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        
        # sentence_in = prepare_sequence(sentence, vocab)
        # targets = prepare_sequence(tags, tag_vocab)
        
        sentence_in, targets = get_batch(BATCH_SIZE, train)        

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
    print(epoch, loss)

# See what the scores are after training
with torch.no_grad():
#     inputs = prepare_sequence(train[0][0], vocab)
#     tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    
#     print(tag_scores)
    pass

0 tensor(0.2835, grad_fn=<DivBackward0>)
1 tensor(0.3101, grad_fn=<DivBackward0>)
2 tensor(0.2777, grad_fn=<DivBackward0>)
3 tensor(0.2138, grad_fn=<DivBackward0>)
4 tensor(0.2865, grad_fn=<DivBackward0>)
5 tensor(0.2663, grad_fn=<DivBackward0>)
6 tensor(0.2909, grad_fn=<DivBackward0>)
7 tensor(0.3198, grad_fn=<DivBackward0>)
8 tensor(0.2459, grad_fn=<DivBackward0>)
9 tensor(0.2133, grad_fn=<DivBackward0>)
10 tensor(0.2445, grad_fn=<DivBackward0>)
11 tensor(0.2432, grad_fn=<DivBackward0>)
12 tensor(0.2574, grad_fn=<DivBackward0>)
13 tensor(0.2742, grad_fn=<DivBackward0>)
14 tensor(0.2537, grad_fn=<DivBackward0>)


In [20]:
def validate_model(model, dev, vocab, tag_vocab):
    
    size = len(dev)
    
    sentence_in, targets = get_batch(size, dev)
    
    prediction = model(sentence_in)
    
    return prediction, targets

prediction, target = validate_model(model, test, vocab, tag_vocab)

In [21]:
index_global = 0
counter = 0
for row in target:
    for col in row:        
        if col < 0:
            continue
            
        result = col
        
        max_val, pred = prediction[ index_global ].max(0)
        
        if pred == result:
            counter += 1
        
        index_global += 1

    
print(counter / index_global )

0.8143147969570341
