In [148]:
import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
import tqdm
from seqeval.metrics import f1_score

torch.manual_seed(42)

<torch._C.Generator at 0x122c38e2530>

In [106]:
train = pd.read_csv('data/train', header = None, names = ['idx','word','tag'], sep ='\s',na_values=['<NAN>'], keep_default_na=False)
train.head(20)

  train = pd.read_csv('data/train', header = None, names = ['idx','word','tag'], sep ='\s',na_values=['<NAN>'], keep_default_na=False)


Unnamed: 0,idx,word,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
5,6,boycott,O
6,7,British,B-MISC
7,8,lamb,O
8,9,.,O
9,1,Peter,B-PER


In [107]:
dev = pd.read_csv('data/dev', header = None, names = ['idx','word','tag'], sep = '\s',na_values=['<NAN>'], keep_default_na=False)
dev.head(20)

  dev = pd.read_csv('data/dev', header = None, names = ['idx','word','tag'], sep = '\s',na_values=['<NAN>'], keep_default_na=False)


Unnamed: 0,idx,word,tag
0,1,CRICKET,O
1,2,-,O
2,3,LEICESTERSHIRE,B-ORG
3,4,TAKE,O
4,5,OVER,O
5,6,AT,O
6,7,TOP,O
7,8,AFTER,O
8,9,INNINGS,O
9,10,VICTORY,O


In [108]:
test = pd.read_csv('data/test', header = None, names = ['idx','word'], sep = '\s',na_values=['<NAN>'], keep_default_na=False)
test.head(20)

  test = pd.read_csv('data/test', header = None, names = ['idx','word'], sep = '\s',na_values=['<NAN>'], keep_default_na=False)


Unnamed: 0,idx,word
0,1,SOCCER
1,2,-
2,3,JAPAN
3,4,GET
4,5,LUCKY
5,6,WIN
6,7,","
7,8,CHINA
8,9,IN
9,10,SURPRISE


In [167]:
vocab_dict = {}
for word in train['word'].values.tolist():
    vocab_dict[word] = vocab_dict.get(word,0) + 1

keys = list(vocab_dict.keys())
for word in keys:
    if vocab_dict[word] < 2:
        vocab_dict['<unk>'] = vocab_dict.get('<unk>',0) + vocab_dict[word]
        del vocab_dict[word]
        
word_to_idx = {}
for i,word in enumerate(sorted(list(vocab_dict.keys()))):
    word_to_idx[word] = i
unique_tags = set(train['tag'].values.tolist())
tag_to_idx = {}
idx_to_tag = {}
for i,tag in enumerate(sorted(unique_tags)):
    tag_to_idx[tag] = i
    idx_to_tag[i] = tag
unique_tags

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [168]:
tag_to_idx

{'B-LOC': 0,
 'B-MISC': 1,
 'B-ORG': 2,
 'B-PER': 3,
 'I-LOC': 4,
 'I-MISC': 5,
 'I-ORG': 6,
 'I-PER': 7,
 'O': 8}

In [169]:
idx_to_tag

{0: 'B-LOC',
 1: 'B-MISC',
 2: 'B-ORG',
 3: 'B-PER',
 4: 'I-LOC',
 5: 'I-MISC',
 6: 'I-ORG',
 7: 'I-PER',
 8: 'O'}

In [135]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [137]:
def make_data(df,vocab_dict):
    sentences = []
    sentence_tags = []
    tags = []
    sentence = None
    for row in df.values.tolist():
        if row[0] == 1:
            if sentence:
                sentence_tags.append(tags)
                sentences.append(sentence)
            sentence = []
            tags = []
        if row[1] not in vocab_dict:
            sentence.append('<unk>')
        else:
            sentence.append(row[1])
        tags.append(row[2])
    sentence_tags.append(tags)
    sentences.append(sentence)
    return list(zip(sentences,sentence_tags))

In [138]:
training_data = make_data(train,vocab_dict)
validation_data = make_data(dev,vocab_dict)

In [139]:
training_data

[(['EU', '<unk>', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']),
 (['Peter', 'Blackburn'], ['B-PER', 'I-PER']),
 (['BRUSSELS', '1996-08-22'], ['B-LOC', 'O']),
 (['The',
   'European',
   'Commission',
   'said',
   'on',
   'Thursday',
   'it',
   'disagreed',
   'with',
   'German',
   'advice',
   'to',
   'consumers',
   'to',
   '<unk>',
   'British',
   'lamb',
   'until',
   'scientists',
   'determine',
   'whether',
   'mad',
   'cow',
   'disease',
   'can',
   'be',
   'transmitted',
   'to',
   'sheep',
   '.'],
  ['O',
   'B-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']),
 (['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
  

In [194]:
class BLSTM1(nn.Module):
    def __init__(self):
        super(BLSTM1, self).__init__()
        self.embedding = nn.Embedding(len(vocab), 100)
        self.blstm = nn.LSTM(input_size=100,hidden_size=256,bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.hidden = nn.Linear(512,128)
        self.ELU = nn.ELU()
        self.output = nn.Linear(128,len(unique_tags))
    
    def forward(self,sentence):
        # pad sentence to equal length
        word_embedding = self.embedding(sentence)
        lstm_out, _ = self.blstm(word_embedding.view(len(sentence), 1, -1)) # input reshaped to sequence length, batch, input dim
        lstm_out = self.dropout(lstm_out)
        x = self.hidden(lstm_out.view(len(sentence), -1))
        x = self.ELU(x)
        x = self.output(x)
        return x

In [195]:
def blstm_train(model,loss_function,optimizer,scheduler,training_data,validation_data,num_epochs=20):
    for epoch in range(num_epochs):
        train_loss = 0
        for sentence,tags in tqdm.notebook.tqdm(training_data,total=len(training_data)):
            model.zero_grad()
            
            sentence_in = prepare_sequence(sentence,word_to_idx) #X
            targets = prepare_sequence(tags,tag_to_idx) #Y
            
            tag_scores = model(sentence_in) #Yhat
            
            loss = loss_function(tag_scores, targets)
            train_loss += loss
            loss.backward()
            optimizer.step()
            scheduler.step()
        
        train_loss /= len(training_data)
        val_loss = 0
        gold_tags = []
        pred_tags = []
        with torch.no_grad():
            for sentence,tags in validation_data:
                sentence_in = prepare_sequence(sentence,word_to_idx) #X
                targets = prepare_sequence(tags,tag_to_idx) #Y
                tag_scores = model(sentence_in) #Yhat
                gold_tags.append(tags)
                pred_tags.append([idx_to_tag[x] for x in torch.argmax(tag_scores,dim=1).tolist()])
                loss = loss_function(tag_scores, targets)
                
                val_loss += loss
        val_loss /= len(validation_data)
        val_f1 = f1_score(gold_tags,pred_tags)
        print('Epoch {}/{}:\ttrain_loss = {}\tval_loss = {}\tval_f1 = {}'.format(epoch+1,
                                                                    num_epochs,
                                                                    round(train_loss.item(),3),
                                                                    round(val_loss.item(),3),
                                                                    round(val_f1,3)))
        

In [196]:
model = BLSTM1()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

blstm_train(model,loss_function,optimizer,scheduler,training_data,validation_data)

  0%|          | 0/14987 [00:00<?, ?it/s]

Epoch 1/20:	train_loss = 2.137	val_loss = 2.137	val_f1 = 0.019


  0%|          | 0/14987 [00:00<?, ?it/s]

Epoch 2/20:	train_loss = 2.137	val_loss = 2.137	val_f1 = 0.019


  0%|          | 0/14987 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [197]:
gold_tags = []
pred_tags = []
with torch.no_grad():
    for sentence,tags in validation_data:
        sentence_in = prepare_sequence(sentence,word_to_idx) #X
        targets = prepare_sequence(tags,tag_to_idx) #Y
        tag_scores = model(sentence_in) #Yhat
        gold_tags.append(tags)
        pred_tags.append([idx_to_tag[x] for x in torch.argmax(tag_scores,dim=1).tolist()])
val_f1 = f1_score(gold_tags,pred_tags)

In [199]:
pred_tags

[['O', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'I-LOC', 'O', 'O', 'O', 'I-ORG'],
 ['O', 'O'],
 ['O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-LOC',
  'O',
  'O',
  'O',
  'I-LOC'],
 ['I-PER',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'I-LOC',
  'I-PER',
  'O',
  'O',
  'I-LOC',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-LOC',
  'O',
  'I-LOC'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-LOC'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-LOC',

In [200]:
gold_tags

[['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-LOC', 'O'],
 ['B-MISC',
  'I-MISC',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'B-ORG',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O'],
 ['O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
