In [1]:
######################################################################################################
#################Functions to load sentences and corresponding tags from text files###################
#####################################################################################################
import torch
from torch import nn
import numpy as np
import itertools
from collections import Counter
import string
import random
import os
import re
import codecs
import pickle
from linear_crf_util import *

## Load the dataset

In [2]:
# Locations of the pickled CoNLL-2003 splits.
train_path = "data/CoNLL2003/train_iobes.pkl"
dev_path = "data/CoNLL2003/dev_iobes.pkl"
test_path = "data/CoNLL2003/test_iobes.pkl"

# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point these paths at trusted data.
with open(train_path, 'rb') as train_file:
    train_instances = pickle.load(train_file)

with open(dev_path, 'rb') as dev_file:
    dev_instances = pickle.load(dev_file)

with open(test_path, 'rb') as test_file:
    test_instances = pickle.load(test_file)

# Reached only when all three splits were read without raising.
print('data loaded successfully')

data loaded successfully


In [3]:
# Peek at the first training instance to see the (words, tags) pair layout.
print(train_instances[0])

(('-docstart-',), ['o'])


In [4]:
###########################Load the dictionary and word indexes##################################
import pickle
import numpy as np

# Pickled vocabulary triple and the matching embedding matrix.
# NOTE(review): these paths start at '../../data' while the dataset cell reads
# from 'data/' -- confirm which working directory the notebook is meant to run in.
path_dict = '../../data/CoNLL2003/vocab/local_dict_lower.pkl'
path_emb = '../../data/CoNLL2003/vocab/local_emb_lower.pkl'

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(path_dict, 'rb') as dict_file:
    vocab, word2id, id2word = pickle.load(dict_file)

with open(path_emb, 'rb') as emb_file:
    local_glove_emb = pickle.load(emb_file)

# Quick sanity check: report the embedding shape next to the vocabulary size.
print('Local embeddings size:', local_glove_emb.shape)
print('Vocabulary size', len(vocab))

Local embeddings size: (26873, 300)
Vocabulary size 26873


In [5]:
# Tag inventory used by the notebook, followed by the three special symbols
# '<pad>', '<end>' and '<start>'. The position of a tag in this list is its
# integer id, so the order must stay stable for ids to keep their meaning.
# (An earlier list that was assigned and immediately overwritten by this one
# has been removed; it never had any effect.)
tags = ['e-org', 'i-misc', 'e-misc', 'i-per', 'i-loc', 'o', 's-org', 'b-org',
        'e-loc', 's-loc', 'i-org', 'e-per', 's-misc', 'b-per', 's-per',
        'b-misc', 'b-loc', '<pad>', '<end>', '<start>']

# Forward (tag -> id) and reverse (id -> tag) lookup tables.
tag2id = {tag: i for i, tag in enumerate(tags)}
id2tag = dict(enumerate(tags))

In [6]:
from torch.nn.utils.rnn import pad_sequence
def sent2ids(text):
    """Translate a sequence of words into their vocabulary indexes.

    Each word is looked up in the notebook-level ``word2id`` mapping; the
    result preserves the order of ``text``.
    """
    ids = []
    for word in text:
        ids.append(word2id[word])
    return ids

def labels2ids(labels, tag_map=None):
    """Map a sequence of tag strings to their integer ids.

    Args:
        labels: iterable of tag strings.
        tag_map: optional ``tag -> id`` mapping. When omitted, the
            notebook-level ``tag2id`` table is used.

    Returns:
        A list with one id per label, in the same order as ``labels``.

    Raises:
        KeyError: if a label is not in the mapping and the mapping has no
            ``'<unk>'`` entry to fall back on. The message names the
            offending label.
    """
    mapping = tag2id if tag_map is None else tag_map
    ids = []
    for label in labels:
        if label in mapping:
            ids.append(mapping[label])
        elif '<unk>' in mapping:
            # Fallback id for unseen labels, available only when the tag
            # inventory actually defines '<unk>'.
            ids.append(mapping['<unk>'])
        else:
            # Previously a bare `except:` hid the real problem and surfaced
            # as an opaque KeyError('<unk>'); report the actual label instead.
            raise KeyError(
                f"unknown label {label!r} and no '<unk>' tag to fall back on"
            )
    return ids


def data_generator(sents, labels, batch_size=32, is_training=True, index=0):
    """Build one padded mini-batch of word ids and tag ids.

    Args:
        sents: indexable collection of tokenised sentences.
        labels: tag sequences, aligned position-by-position with ``sents``.
        batch_size: maximum number of sentences in the batch.
        is_training: when True, sentences are sampled at random without
            replacement; when False, the contiguous slice starting at
            ``index`` is taken.
        index: start offset of the slice (only used when ``is_training`` is
            False).

    Returns:
        ``(batch_sents, batch_labels, seq_lens)`` when ``is_training`` is True,
        and ``(batch_sents, batch_labels, seq_lens, end)`` otherwise, where
        ``end`` is the offset at which the next sequential batch starts.
        ``batch_sents`` / ``batch_labels`` are LongTensors padded (batch first)
        with ``word2id['<pad>']`` / ``tag2id['<pad>']``; ``seq_lens`` is a
        LongTensor holding the unpadded length of every sentence.
    """
    if is_training:
        # Clamp the sample size: choice(..., replace=False) raises when asked
        # for more items than the data set contains.
        sample_size = min(batch_size, len(sents))
        select_indices = np.random.choice(len(sents), sample_size, replace=False)
    else:
        start = index
        end = min(start + batch_size, len(sents))
        select_indices = range(start, end)

    # Index the sequences directly. Wrapping the whole data set in np.array()
    # on every call is O(len(sents)) per batch, and for sentences of different
    # lengths it raises on NumPy >= 1.24 (ragged arrays need dtype=object).
    batch_sents = [sent2ids(sents[i]) for i in select_indices]
    batch_labels = [labels2ids(labels[i]) for i in select_indices]

    # Unpadded lengths, needed downstream to ignore padding positions.
    seq_lens = torch.LongTensor([len(s) for s in batch_sents])

    batch_sents = pad_sequence(
        [torch.LongTensor(s) for s in batch_sents],
        batch_first=True,
        padding_value=word2id['<pad>'],
    )
    batch_labels = pad_sequence(
        [torch.LongTensor(s) for s in batch_labels],
        batch_first=True,
        padding_value=tag2id['<pad>'],
    )

    if not is_training:
        return batch_sents, batch_labels, seq_lens, end

    return batch_sents, batch_labels, seq_lens

In [7]:
# Turn each list of (words, tags) instances into two parallel tuples:
# one of word sequences and one of tag sequences.
train_sent_words, train_sent_tags = zip(*train_instances)
dev_sent_words, dev_sent_tags = zip(*dev_instances)
test_sent_words, test_sent_tags = zip(*test_instances)

In [8]:
##############################Example#####################################
# Draw one random training batch to check that the generator runs end to end.
batch_sents, batch_label, seq_lens = data_generator(
    train_sent_words,
    train_sent_tags,
    batch_size=32,
)



## Linear CRF

In [9]:
###################################Token-level predictions#############
from time import time
import copy
from sklearn.metrics import f1_score
def evaluate(model, dev_sent_words, dev_sent_tags):
    """Compute the token-level macro F1 of ``model`` on a labelled data set.

    The data is decoded in sequential batches (size taken from the
    notebook-level ``batch_size``); padding positions are dropped before
    scoring.

    Args:
        model: network exposing ``decode(batch_sents, seq_lens)`` whose second
            return value holds one predicted id sequence per sentence.
        dev_sent_words: sequence of tokenised sentences.
        dev_sent_tags: gold tag sequences aligned with ``dev_sent_words``.

    Returns:
        Macro-averaged F1 over all non-padding tokens.
    """
    # Score in inference mode (no dropout, no autograd bookkeeping) and put the
    # model back into whatever mode the caller had it in afterwards.
    was_training = model.training
    # Send inputs to wherever the model lives so this also works on GPU.
    model_device = next(model.parameters()).device

    total_preds = []
    total_tags = []
    index = 0
    model.eval()
    try:
        with torch.no_grad():
            while index < len(dev_sent_words):
                batch_sents, batch_tags, seq_lens, index = data_generator(
                    dev_sent_words, dev_sent_tags, batch_size=batch_size,
                    is_training=False, index=index)
                _, pred_labels = model.decode(batch_sents.to(model_device),
                                              seq_lens.to(model_device))

                # Ignore the padding tokens.
                # NOTE(review): predict_insts reverses each decoded sequence
                # before using it; here predictions are compared in the order
                # decode returns them -- confirm which order decode produces.
                for i, seq_len in enumerate(seq_lens.tolist()):
                    # as_tensor accepts both lists and (possibly CUDA) tensors.
                    preds = torch.as_tensor(pred_labels[i][:seq_len], dtype=torch.long).cpu()
                    total_preds.append(preds)
                    total_tags.append(batch_tags[i, :seq_len])
    finally:
        model.train(was_training)

    total_preds = torch.cat(total_preds)
    total_tags = torch.cat(total_tags)

    f1 = f1_score(total_tags.numpy(), total_preds.numpy(), average='macro')
    return f1

############################entity-level prediction#########################
def evaluate_entity_pred(model, dev_sent_words, dev_sent_tags):
    """Compute and print entity-level precision, recall and micro F1.

    The data is decoded in sequential batches (size taken from the
    notebook-level ``batch_size``, tensors moved to the notebook-level
    ``device``). Per-batch counts come from ``evaluate_batch_insts`` and are
    turned into the final scores by ``get_metric``.

    Args:
        model: network exposing ``decode(batch_sents, seq_lens)``.
        dev_sent_words: sequence of tokenised sentences.
        dev_sent_tags: gold tag sequences aligned with ``dev_sent_words``.

    Returns:
        ``(precision, recall, fscore)`` as returned by ``get_metric``.
    """
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()

    # The training loop calls this right after model.train(); switch to
    # inference mode while scoring and restore the previous mode afterwards.
    was_training = model.training
    index = 0
    model.eval()
    try:
        with torch.no_grad():
            while index < len(dev_sent_words):
                batch_sents, batch_tags, seq_lens, index = data_generator(
                    dev_sent_words, dev_sent_tags, batch_size=batch_size,
                    is_training=False, index=index)
                _, batch_max_ids = model.decode(batch_sents.to(device), seq_lens.to(device))
                batch_p, batch_predict, batch_total = evaluate_batch_insts(
                    batch_sents, batch_max_ids, batch_tags, seq_lens, id2tag)
                # Accumulate the per-batch counters over the whole data set.
                p_dict += batch_p
                total_predict_dict += batch_predict
                total_entity_dict += batch_total
    finally:
        model.train(was_training)

    total_p = sum(p_dict.values())
    total_predict = sum(total_predict_dict.values())
    total_entity = sum(total_entity_dict.values())
    precision, recall, fscore = get_metric(total_p, total_entity, total_predict)
    print(f"[set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, Micro F1: {fscore:.2f}")
    return precision, recall, fscore

## Debug

In [10]:
# index = 0
# batch_size = 32
# batch_sents, batch_tags, seq_lens, index = data_generator(train_sent_words, 
#                                                                   train_sent_tags, batch_size=batch_size, 
#                                                                   is_training=False, index=index)
# model = NNCRF(label_size, len(vocab), 300, 200, 'va_w')
# word_rep = model.embedder(batch_sents)
# lstm_out, _ = model.encoder(word_rep, seq_lens)
# lstm_out
# #lstm_scores = model.hidden2tag(lstm_out)
# #lstm_scores

## Training CRF Model

In [14]:
#Models defined in another file
from linear_crf import NNCRF
from torch import optim
from time import time

# ---- Run configuration ----
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
label_size = len(tag2id)   # number of tags, special symbols included
batch_size = 32            # also read as a global by the evaluation helpers
epoch = 50                 # passes over the training set per trial
trial_num = 1              # independent training runs

f1s = []  # one (best dev F1, test F1) pair per trial
for _ in range(trial_num):
    model = NNCRF(label_size, len(vocab), 300, 200, 'gru')
    #model = NNCRF(label_size, len(vocab), 32*32, 32, 'matmul')
    model = model.to(device)
    #model.load_pretrained_emb(path_emb, True)
    # optimizer = optim.Adam(model.parameters(), lr=0.000001, weight_decay=1e-4)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0)

    best_f1 = -1
    for i in range(epoch):
        model.train()
        index = 0
        #model.embedder.weight.retain_grad()
        # is_training=False makes data_generator walk the data sequentially and
        # hand back the next offset, so one pass of this loop is one epoch and
        # batches are visited in the same order every epoch.
        while index < len(train_sent_words):
            batch_sents, batch_tags, seq_lens, index = data_generator(
                train_sent_words, train_sent_tags, batch_size=batch_size,
                is_training=False, index=index)
            optimizer.zero_grad()
            loss = model(batch_sents.to(device), seq_lens.to(device), batch_tags.to(device))
            #loss = model(batch_sents.to(device), seq_lens, batch_tags.to(device))
            loss.backward()
            ######################For debugging####################
            #print('loss', loss.item())
            #temp = model.embedder.weight.grad.mean().item()
            #print('grad', temp)
            ##########################################
            # Clip the total gradient norm at 5 before the parameter update.
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            #print(index)
            # index advances by batch_size, so this fires only when a batch
            # ends exactly on a multiple of 8000.
            if index % 8000 == 0:
                print('Loss:', loss.item())

        #print('###################Debug training f1 scores:')
        #f1 = evaluate(model, train_sent_words,train_sent_tags)
        #print('Training f1 score', f1)
        # Model selection: keep a snapshot of the model with the best dev F1.
        _, _, f1 = evaluate_entity_pred(model, dev_sent_words, dev_sent_tags)
        print('dev f1 score, ', f1)
        if best_f1 < f1:
            best_f1 = f1
            best_model = copy.deepcopy(model)

    # Report the test score of the snapshot that did best on dev.
    _, _, f1 = evaluate_entity_pred(best_model, test_sent_words, test_sent_tags)
    print('#'*30)
    print('best val f1, test f1', round(best_f1, 4), round(f1, 4))
    # Store with the same precision that is printed above.
    f1s.append((round(best_f1, 4), round(f1, 4)))

print(f1s)
# model.eval()
# batch_sents, batch_tags, seq_lens = data_generator(train_sent_words,train_sent_tags, batch_size=batch_size)
# loss = model(batch_sents, seq_lens, batch_tags)
# print(loss.item())
#model2 = SequenceLabeling(len(vocab), 300, 300, label_size, path=None)

biGRU
MVMA-G
MVMA-G




Loss: 313.955810546875
[set Total] Prec.: 92.99, Rec.: 3.35, Micro F1: 6.47
dev f1 score,  6.465237166991554
Loss: 264.080810546875
[set Total] Prec.: 71.26, Rec.: 10.01, Micro F1: 17.56
dev f1 score,  17.55939206138409
Loss: 231.61962890625
[set Total] Prec.: 71.15, Rec.: 20.63, Micro F1: 31.99
dev f1 score,  31.989562948467057
Loss: 207.62451171875
[set Total] Prec.: 71.48, Rec.: 27.97, Micro F1: 40.21
dev f1 score,  40.20805612676908
Loss: 199.576171875
[set Total] Prec.: 71.54, Rec.: 33.79, Micro F1: 45.90
dev f1 score,  45.902388844439365
Loss: 200.5908203125
[set Total] Prec.: 71.73, Rec.: 37.53, Micro F1: 49.28
dev f1 score,  49.27632305822561
Loss: 179.494384765625
[set Total] Prec.: 71.20, Rec.: 40.36, Micro F1: 51.51
dev f1 score,  51.514500537056925
Loss: 169.135498046875
[set Total] Prec.: 71.31, Rec.: 42.91, Micro F1: 53.58
dev f1 score,  53.582685438117245
Loss: 160.637939453125
[set Total] Prec.: 71.84, Rec.: 44.92, Micro F1: 55.28
dev f1 score,  55.27596562079321
Loss: 

### Prediction

In [22]:
# Scratch calculation: ratio of two hand-entered numbers (their origin is not recorded in this notebook).
286.53/61.07

4.691829048632716

In [13]:
def sents_to_insts(sentences):
    """Convert raw sentence strings into lists of vocabulary ids.

    Every sentence is split on whitespace and each resulting token is looked
    up in the notebook-level ``word2id`` mapping.

    Returns:
        One list of ids per input sentence, in the same order.
    """
    return [
        [word2id[token] for token in sentence.split()]
        for sentence in sentences
    ]

def predict_insts(model, dev_sent_words, dev_sent_tags):
    """Decode every sentence and return the predicted tag strings.

    The data is decoded in sequential batches (size taken from the
    notebook-level ``batch_size``).

    Args:
        model: network exposing ``decode(batch_sents, seq_lens)`` whose second
            return value holds one id sequence per sentence.
        dev_sent_words: sequence of tokenised sentences.
        dev_sent_tags: tag sequences aligned with ``dev_sent_words``; they are
            only passed through to ``data_generator`` and never compared
            against the predictions here.

    Returns:
        A list with one list of tag strings per input sentence.
    """
    # Predict in inference mode and restore the caller's mode afterwards.
    was_training = model.training
    # Send inputs to wherever the model lives so this also works on GPU.
    model_device = next(model.parameters()).device

    predictions = []
    index = 0
    model.eval()
    try:
        with torch.no_grad():
            while index < len(dev_sent_words):
                batch_sents, batch_tags, seq_lens, index = data_generator(
                    dev_sent_words, dev_sent_tags, batch_size=batch_size,
                    is_training=False, index=index)
                _, batch_max_ids = model.decode(batch_sents.to(model_device),
                                                seq_lens.to(model_device))

                for idx in range(len(batch_max_ids)):
                    length = seq_lens[idx]
                    # Drop padding positions, then reverse the sequence.
                    # NOTE(review): presumably decode returns ids last-token
                    # first -- confirm against the model implementation.
                    prediction = batch_max_ids[idx][:length].tolist()
                    prediction = prediction[::-1]
                    predictions.append([id2tag[l] for l in prediction])
    finally:
        model.train(was_training)
    return predictions

# def predict(model, sentences):
#     sents = [sentences] if isinstance(sentences, str) else sentences
#     insts = sents_to_insts(sents)

#     test_batches = self.create_batch_data(insts)
#     predictions = self.predict_insts(test_batches)
#     if len(predictions) == 1:
#         return predictions[0]
#     else:
#         return predictions

In [11]:
# Entity-level (precision, recall, F1) of the current `model` on the dev split; the tuple is shown as the cell output.
evaluate_entity_pred(model, dev_sent_words, dev_sent_tags)



[set Total] Prec.: 72.43, Rec.: 7.96, Micro F1: 14.34


(72.43491577335375, 7.960282733086503, 14.344200151630021)

In [11]:
from self_defined_lstm import LSTM

In [15]:
# Instantiate the custom LSTM from self_defined_lstm.
# NOTE(review): (6, 3) are presumably (input_size, hidden_size) -- TODO confirm against the class.
lstm = LSTM(6, 3, bidirectional=True)

tarlor series lstm linear simplified
tarlor series lstm linear simplified


In [16]:
# Smoke test: run the custom LSTM once on random input.
x = torch.randn(3, 4, 6)  # presumably (batch=3, seq_len=4, features=6) -- TODO confirm
lengths = torch.LongTensor([2,3,4])  # three lengths, presumably one per sequence in x
out, _ = lstm(x, lengths)