## Current setup using this notebook
<p>label - POS</p>
<p>Fine tune on: Raw-train</p>
<p>Evaluate on: Yap-dev (and Yap-test, but not reported)</p>
<p>Classification by: whole word (as opposed to prefix/host)</p>
<p>Morphologically informed labels? Yes </p>
<p>Shuffle/Sort? shuffle </p>

In [1]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import bclm

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


### Manually setting seeds

In [2]:
# One seed for every RNG used in the notebook, so a run can be reproduced
# (or deliberately varied) by editing a single value.
SEED = 3

torch.manual_seed(SEED)
np.random.seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which
# may otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Data
`bclm.read_dataframe('spmrl', subset='train')` - gives the gold-segmented tokens <br>
`bclm.get_token_df(train, ['upostag'])` - gives the raw tokens<br>
`bclm.read_dataframe('yap_dev')` - gives the YAP tokenization (only available for `yap_dev` and `yap_test`; there is no `yap_train`).

In [3]:
# SPMRL train frame, plus its raw-token view that carries the POS label.
train = bclm.read_dataframe('spmrl', subset='train')
train_df = bclm.get_token_df(train, ['upostag'])

# YAP-analysed dev / test frames (the commented lines are the Raw-dev /
# Raw-test alternative).
dev_df = bclm.read_dataframe('yap_dev')
# dev_df = bclm.get_token_df(dev, ['upostag'])
test_df = bclm.read_dataframe('yap_test')
# test_df = bclm.get_token_df(test, ['upostag'])

# Replace the right double quotation mark with a plain ASCII quote in the
# surface-form column of every frame.
for frame, surface_col in ((train_df, 'token_str'), (dev_df, 'form'), (test_df, 'form')):
    frame[surface_col] = frame[surface_col].str.replace('”','"')

In [None]:
# Quick look at the first rows of the YAP dev frame.
dev_df.head()

### Uniform column names
Note that the column names in yap dfs can be slightly different from spmrl dfs

In [4]:
# For Evaluating on Raw-dev/Raw-test
# The raw-token frame calls its surface form "token_str"; expose it as "form"
# so that sentenceGetter can read every frame the same way.
train_df = train_df.rename(columns={"token_str": "form"})
# dev_df = dev_df.rename(columns={"token_str": "form"})
# test_df = test_df.rename(columns={"token_str": "form"})

In [5]:
## For Evaluating on Yap-dev/Yap-test
# YAP frames store the token index as "misc_token_id"; expose it under the
# "token_id" name that sentenceGetter reads.
yap_column_names = {"misc_token_id": "token_id"}
dev_df = dev_df.rename(columns=yap_column_names)
test_df = test_df.rename(columns=yap_column_names)

In [None]:
# Quick look at the first rows of the SPMRL train frame.
train.head()

In [6]:
def renumber_tokenid(df):
    """Subtract 1 from every value of the ``token_id`` column of ``df``, in place.

    The whole column is shifted in one vectorised operation, so each row is
    decremented exactly once even when the frame's index holds duplicate
    labels (a label-based row loop would revisit such rows).

    Note: the function is not idempotent; every call shifts the ids again.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``token_id`` column. Modified in place.

    Returns
    -------
    None
    """
    df['token_id'] = df['token_id'] - 1
        
# Shift the token ids of the SPMRL train frame and of both YAP frames down by one
# (the displayed output shows them starting at 0 afterwards).
# NOTE(review): renumber_tokenid subtracts 1 on every call, so re-running this
# cell without reloading the frames shifts the ids again.
renumber_tokenid(train)
renumber_tokenid(dev_df)
renumber_tokenid(test_df)
train.head(30)

Unnamed: 0,id,form,lemma,upostag,xpostag,feats,token_id,sent_id,token_str,global_sent_id,...,deps,misc,ner_escaped,set,duplicate_sent_id,very_similar_sent_id,biose_layer0,biose_layer1,biose_layer2,biose_layer3
11301,1,"""",_,yyQUOT,yyQUOT,_,0,501,"""",501,...,_,_,_,train,,,O,O,O,O
11302,2,תהיה,היה,COP,COP,gen=F|num=S|per=3,1,501,תהיה,501,...,_,_,_,train,,,O,O,O,O
11303,3,נקמה,נקמה,NN,NN,gen=F|num=S,2,501,נקמה,501,...,_,_,_,train,,,O,O,O,O
11304,4,ו,ו,CONJ,CONJ,_,3,501,ובגדול,501,...,_,_,_,train,,,O,O,O,O
11305,5,בגדול,בגדול,RB,RB,_,3,501,ובגדול,501,...,_,_,_,train,,,O,O,O,O
11306,6,.,_,yyDOT,yyDOT,_,4,501,.,501,...,_,_,_,train,,,O,O,O,O
11307,1,גם,גם,RB,RB,_,0,502,גם,502,...,_,_,_,train,,,O,O,O,O
11308,2,יהודים,יהודי,NN,NN,gen=M|num=P,1,502,יהודים,502,...,_,_,_,train,,,O,O,O,O
11309,3,נמצאים,נמצא,BN,BN,gen=M|num=P|per=A|HebBinyan=NIFAL,2,502,נמצאים,502,...,_,_,_,train,,,O,O,O,O
11310,4,עתה,עתה,RB,RB,_,3,502,עתה,502,...,_,_,_,train,,,O,O,O,O


In [7]:
# Inspect the YAP dev frame after the column rename and token-id shift.
dev_df.head()

Unnamed: 0,id,form,lemma,upostag,xpostag,head,deprel,deps,misc,sent,token_id,misc_token_str,feats_gen,feats_num,feats_per,feats_tense,feats_suf_gen,feats_suf_num,feats_suf_per,sent_id
0,1,עשרות,עשר,CDT,CDT,2,num,_,_,1,0,עשרות,F,P,,,,,,1
1,2,אנשים,איש,NN,NN,3,subj,_,_,1,1,אנשים,M,P,,,,,,1
2,3,מגיעים,הגיע,BN,BN,14,conj,_,_,1,2,מגיעים,M,P,A,,,,,1
3,4,מ,מ,PREPOSITION,PREPOSITION,3,comp,_,_,1,3,מתאילנד,,,,,,,,1
4,5,תאילנד,תאילנד,NNP,NNP,4,pobj,_,_,1,3,מתאילנד,F,S,,,,,,1


### Add sorting on the dataframes

In [None]:
# Sentence length = number of rows that share a sent_id.
train_df['sent_len'] = train_df.groupby('sent_id')['id'].transform('size')
# Row labels ordered by sentence length, then by sentence and by position in it.
s = train_df.sort_values(by=['sent_len', 'sent_id', 'id']).index
train_df_sorted = train_df.reindex(index=s)
# Per-sentence row counts, shown in the new (length-sorted) order.
train_df_sorted.groupby('sent_id', sort=False).size()

In [8]:
class sentenceGetter(object):
    """Group the rows of a token table into sentences.

    ``data`` is anything indexable by column name (a DataFrame, or a dict of
    equally long sequences) that provides the columns ``form``, ``upostag``,
    ``token_id`` and ``sent_id``.

    A sentence ends at every row whose stripped ``form`` equals ``"."``; the
    ``sent_id`` column is carried along but is not used for splitting.
    """

    def __init__(self, data, max_sent=None):
        """Store the column views; ``max_sent`` optionally caps the number of
        period-terminated sentences that ``sentences()`` yields."""
        self.index = 0
        self.max_sent = max_sent
        self.tokens = data['form']
        self.labels = data['upostag']
        #for evaluating by word-accuracy
        self.correspondingToken = data['token_id']
        self.orig_sent_id = data['sent_id']
    
    def sentences(self):
        """Yield sentences as lists of ``(token, label, token_id, sent_id)`` tuples.

        Rows that follow the last ``"."`` are yielded as a final sentence
        instead of being silently dropped. When ``max_sent`` is set, iteration
        stops as soon as that many period-terminated sentences were produced.
        """
        sent = []
        counter = 0
        
        for token,label, corres_tok, sent_id in zip(self.tokens, self.labels, self.correspondingToken, 
                                                    self.orig_sent_id):
            sent.append((token, label, corres_tok, sent_id))
            if token.strip() == ".":
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

        # Data that does not end with a period still holds a pending sentence.
        if sent:
            yield sent

# One getter per split: raw train tokens and the YAP dev / test analyses.
train_getter = sentenceGetter(train_df)
dev_getter = sentenceGetter(dev_df)
test_getter = sentenceGetter(test_df)

# Walk every split once; each sentence is a list of
# (token, label, token_id, sent_id) tuples.
train_rows = list(train_getter.sentences())
dev_rows = list(dev_getter.sentences())
test_rows = list(test_getter.sentences())

# Project the tuples onto one parallel list per field.
train_sentences = [[row[0] for row in sent] for sent in train_rows]
train_labels = [[row[1] for row in sent] for sent in train_rows]

dev_sentences = [[row[0] for row in sent] for sent in dev_rows]
dev_labels = [[row[1] for row in sent] for sent in dev_rows]
dev_corresTokens = [[row[2] for row in sent] for sent in dev_rows]
dev_sent_ids = [[row[3] for row in sent] for sent in dev_rows]

test_sentences = [[row[0] for row in sent] for sent in test_rows]
test_labels = [[row[1] for row in sent] for sent in test_rows]
test_corresTokens = [[row[2] for row in sent] for sent in test_rows]
test_sent_ids = [[row[3] for row in sent] for sent in test_rows]

print(train_sentences[10])
print(train_labels[10])

print(len(dev_sentences))
print(len(test_sentences))

['הם', 'התבקשו', 'לדווח', 'למשטרה', 'על', 'תנועותיהם', '.']
['PRP', 'VB', 'VB', 'PREPOSITION^DEF^NN', 'IN', 'NN', 'yyDOT']
490
702


In [None]:
## Get the longest sentences in the dev and test sets
def report_longest_sentences(sentences):
    """Print every running-maximum sentence length together with its position.

    Walking the list front to back, each sentence that is at least as long as
    the longest one seen so far is reported (its length, then its index).
    The index comes from ``enumerate`` rather than ``list.index``, so repeated
    identical sentences are reported at their own position and the scan stays
    linear.
    """
    longest_sent_len = 0
    for position, sent in enumerate(sentences):
        if len(sent) >= longest_sent_len:
            print(len(sent))
            longest_sent_len = len(sent)
            print("index of longest sentence:{} ".format(position))


report_longest_sentences(dev_sentences)
report_longest_sentences(test_sentences)

In [9]:
## Remove too long sentences
# Every index list is in descending order, so each deletion leaves the
# positions of the indices that are still to be removed untouched.
# NOTE: this cell is destructive; run it exactly once after the sentence
# lists were built.
#
# Raw-dev / Raw-test variant (disabled in this setup):
#   dev:  296, 226, 57, 49
#   test: 396, 164, 157, 151

## YAP deletions
dev_too_long = (296, 226, 57, 49, 24, 22, 12, 9, 5)
test_too_long = (386, 384, 377, 213, 141, 124, 45, 35, 23)

for drop_ix in dev_too_long:
    for per_sentence_list in (dev_sentences, dev_labels, dev_corresTokens, dev_sent_ids):
        del per_sentence_list[drop_ix]

for drop_ix in test_too_long:
    for per_sentence_list in (test_sentences, test_labels, test_corresTokens, test_sent_ids):
        del per_sentence_list[drop_ix]


In [None]:
# Morpheme-level view of the training data, read from the SPMRL frame.
# Walk the generator once and project the tuples onto parallel lists.
train_getter = sentenceGetter(train)
gold_train_rows = list(train_getter.sentences())

gold_train_sentences = [[token for token, label, corres_tok, sent_id in sent] for sent in gold_train_rows]
gold_train_labels = [[label for token, label, corres_tok, sent_id in sent] for sent in gold_train_rows]
gold_train_corresTokens = [[corres_tok for token, label, corres_tok, sent_id in sent] for sent in gold_train_rows]

# The frames `dev` and `test` are never created in this (YAP) setup, so
# building getters from them raised NameError. The YAP dev / test frames are
# already morpheme-level, so the lists built from them serve as the reference
# here. Copying the lists *after* the too-long sentences were removed also
# keeps the reference aligned, position by position, with what is evaluated.
# For a Raw-dev / Raw-test run, load the SPMRL dev / test frames and build
# these lists from them instead (applying the same deletions).
gold_dev_sentences = [list(sent) for sent in dev_sentences]
gold_dev_labels = [list(sent) for sent in dev_labels]
gold_dev_corresTokens = [list(sent) for sent in dev_corresTokens]
gold_dev_sent_ids = [list(sent) for sent in dev_sent_ids]

gold_test_sentences = [list(sent) for sent in test_sentences]
gold_test_labels = [list(sent) for sent in test_labels]
gold_test_corresTokens = [list(sent) for sent in test_corresTokens]
gold_test_sent_ids = [list(sent) for sent in test_sent_ids]

print(gold_train_sentences[10])
print(gold_train_labels[10])
print(gold_train_corresTokens[10])

print(len(gold_dev_sentences))
print(len(gold_test_sentences))

In [None]:
## Delete the same gold sentences as before

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
if n_gpu > 0:
    # Keep using the second GPU on multi-GPU machines (as before), but do not
    # crash when only one GPU is present; with no GPU there is nothing to select.
    gpu_index = 1 if n_gpu > 1 else 0
    torch.cuda.set_device(gpu_index)
    # Report the device that was actually selected.
    print("Name of gpu: " + torch.cuda.get_device_name(gpu_index))

In [None]:
# Length (in word pieces) that pad_sentences_and_labels pads / truncates every sequence to.
MAX_LEN = 150
# Batch size passed to the training and evaluation DataLoaders.
bs = 32

In [None]:
# Tokenizer of the multilingual cased BERT checkpoint; read as a global by
# tokenize() and pad_sentences_and_labels().
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
def tokenize(sentences, orig_labels, gold_sentences, gold_labels, gold_ranges):
    """Split every word into BERT word pieces and give each piece a label.

    The five arguments are parallel, sentence-aligned lists: the words and
    their labels, plus the gold morphemes, gold labels and, per gold morpheme,
    the index of the word it belongs to.

    A word whose label is composite (contains ``^``) and that is split into
    more than one piece gets its piece labels from ``tokenize_by_gold``;
    every other word simply repeats its own label on each of its pieces.

    Returns ``(tokenized_texts, labels)``: per sentence, the flat list of
    word pieces and the equally long list of piece labels.
    """
    tokenized_texts, labels = [], []
    aligned = zip(sentences, orig_labels, gold_sentences, gold_labels, gold_ranges)

    for sent, sent_labels, gold_sent, gold_sent_labels, gold_range in aligned:
        sent_pieces = []
        sent_piece_labels = []

        for position, (word, word_label) in enumerate(zip(sent, sent_labels)):
            pieces = tokenizer.tokenize(word)
            sent_pieces.extend(pieces)
            has_composite_label = len(word_label.split('^')) > 1

            if has_composite_label and len(pieces) > 1:
                piece_labels = tokenize_by_gold(pieces, word, gold_sent, gold_sent_labels,
                                                gold_range, position)
                assert len(piece_labels) == len(pieces)
                sent_piece_labels.extend(piece_labels)
            else:
                # Single-tag word or single piece: every piece inherits the word label.
                sent_piece_labels.extend([word_label] * len(pieces))

        tokenized_texts.append(sent_pieces)
        labels.append(sent_piece_labels)
        assert len(sent_pieces) == len(sent_piece_labels)

    return tokenized_texts, labels


def tokenize_by_gold(bert_tokenized_wordps, orig_token, gold_sent, gold_sent_label, gold_range, ix):
    """Assign one label to every word piece of a single word, guided by the gold morphemes.

    Parameters
    ----------
    bert_tokenized_wordps : list of str
        Word pieces of the word (continuation pieces carry a ``##`` prefix).
    orig_token : str
        The word itself. Not read by the current implementation.
    gold_sent, gold_sent_label, gold_range : parallel sequences
        Gold morphemes of the sentence, their labels, and for each morpheme
        the index of the word it belongs to.
    ix : int
        Index of the word inside its sentence; morphemes whose ``gold_range``
        entry equals ``ix`` are the ones considered.

    Returns
    -------
    list of str
        Exactly one label per word piece (every loop iteration adds one
        entry). A label is a gold label, two gold labels joined by ``^``, or
        all remaining gold labels joined by ``^``.
    """
    gold_tokenized_labels = []
    # Unused below; kept from an earlier "last word piece" rule.
    len_bert_tokenized_wordps = len(bert_tokenized_wordps)
    
    relevant_gold_tokens = []
    relevant_gold_labels = []
    gold_token_ix = 0
    
    # Keep only the gold morphemes (and labels) that belong to word number `ix`.
    for token, label, corres_token in zip(gold_sent, gold_sent_label, gold_range):
        if ix == corres_token:
            relevant_gold_tokens.append(token)
            relevant_gold_labels.append(label)
            
    for wordp_ix, wordp in enumerate(bert_tokenized_wordps):

        # Compare on the bare text of continuation pieces.
        if wordp.startswith("##"):
            wordp = wordp[2:]

            
        # Only one gold label left: every further piece receives it.
        if len(relevant_gold_labels) == 1:
            gold_tokenized_labels.extend(relevant_gold_labels)
            continue
    
        # Rule 1: a PREPOSITION directly followed by DEF is emitted as one joint
        # label, and both morphemes are consumed.
        # NOTE(review): gold_token_ix is advanced after the two entries were
        # popped, so the morpheme that moved into this slot is skipped - confirm
        # this is intended.
        if gold_token_ix +1 < len(relevant_gold_tokens):
            if (relevant_gold_labels[gold_token_ix] == 'PREPOSITION' and 
                relevant_gold_labels[gold_token_ix+1] == 'DEF'):
                joint_label = relevant_gold_labels[gold_token_ix]
                joint_label += '^'
                joint_label += relevant_gold_labels[gold_token_ix+1]                
                gold_tokenized_labels.append(joint_label)

                relevant_gold_tokens.pop(gold_token_ix+1)
                relevant_gold_labels.pop(gold_token_ix+1)
                relevant_gold_tokens.pop(gold_token_ix)
                relevant_gold_labels.pop(gold_token_ix)

                gold_token_ix += 1
                continue
        
        ## word piece is identical and aligned to the gold morpheme
        # NOTE(review): same pop-then-increment pattern as in rule 1.
        if gold_token_ix +1 < len(relevant_gold_tokens):
            if wordp == relevant_gold_tokens[gold_token_ix]:

                gold_tokenized_labels.append(relevant_gold_labels[gold_token_ix])

                relevant_gold_tokens.pop(gold_token_ix)
                relevant_gold_labels.pop(gold_token_ix)

                gold_token_ix += 1
                continue


        ## word piece is contained in one morpheme
        # The morpheme is not consumed here, so later pieces can match it again.
        if gold_token_ix +1 < len(relevant_gold_tokens):
            if wordp in relevant_gold_tokens[gold_token_ix]:
                gold_tokenized_labels.append(relevant_gold_labels[gold_token_ix])
                continue


        if gold_token_ix +1 < len(relevant_gold_tokens):
            this_gold_and_next_gold = relevant_gold_tokens[gold_token_ix] + relevant_gold_tokens[gold_token_ix+1]
            
            ## word piece is the start of two consecutive morphemes - only the first morpheme is removed
            ## (the second one is removed as well when its label is PREPOSITION)
            if this_gold_and_next_gold.startswith(wordp):
                joint_label = relevant_gold_labels[gold_token_ix]
                joint_label += '^'
                joint_label += relevant_gold_labels[gold_token_ix+1]
                gold_tokenized_labels.append(joint_label)
                if relevant_gold_labels[gold_token_ix+1] == 'PREPOSITION':
                    relevant_gold_tokens.pop(gold_token_ix+1)
                    relevant_gold_labels.pop(gold_token_ix+1)
                relevant_gold_tokens.pop(gold_token_ix)
                relevant_gold_labels.pop(gold_token_ix)

                # gold_token_ix is deliberately left unchanged in this branch.
                continue

            ## word piece is contained in two consecutive morphemes - only the first morpheme is removed
            # NOTE(review): when the next label is PREPOSITION only the *token*
            # list is popped at gold_token_ix+1, not the label list, which leaves
            # the two lists with different lengths - looks unintended (the
            # branch above pops both); confirm.
            if wordp in this_gold_and_next_gold:
                joint_label = relevant_gold_labels[gold_token_ix]
                joint_label += '^'
                joint_label += relevant_gold_labels[gold_token_ix+1]
                gold_tokenized_labels.append(joint_label)
                if relevant_gold_labels[gold_token_ix+1] == 'PREPOSITION':
                    relevant_gold_tokens.pop(gold_token_ix+1)
                relevant_gold_tokens.pop(gold_token_ix)
                relevant_gold_labels.pop(gold_token_ix)
                gold_token_ix += 1
                continue

    
    
        # Fallback: no rule matched, so the piece gets every remaining gold label
        # joined by '^' (an empty string when no labels remain).
        last_tag = '^'.join(relevant_gold_labels)
        gold_tokenized_labels.append(last_tag)
        
        
    return gold_tokenized_labels

# Word-piece tokenise the training sentences; composite labels are spread
# over the pieces with the help of the gold morphemes.
train_tokenized_texts, train_tokenized_labels = tokenize(
    train_sentences,
    train_labels,
    gold_train_sentences,
    gold_train_labels,
    gold_train_corresTokens,
)

# Spot-check one sentence: its pieces, then the labels given to them.
print(train_tokenized_texts[10])
print(train_tokenized_labels[10])

In [None]:
# Spot-check another training sentence: its word pieces, then their labels.
print(train_tokenized_texts[55])
print(train_tokenized_labels[55])

In [None]:
data = train_df
# Collect every label the model can be trained on: the word-level labels of
# the training frame plus the per-word-piece labels produced by tokenize(),
# which may contain label combinations that never occur as a whole-word label.
observed_tags = set(data["upostag"].values)
observed_tags.update(label for sent_labels in train_tokenized_labels for label in sent_labels)
# Sort so the tag <-> index mapping is the same in every kernel session
# (plain set iteration order of strings changes between interpreter runs).
tag_vals = sorted(observed_tags, key=str)
# 'PAD' gets index 0, the value used to pad the label sequences.
tags = ['PAD'] + tag_vals
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

print(tag2idx)
# print(idx2tag)
print(len(tags))

In [None]:
def pad_sentences_and_labels(tokenized_texts, labels):
    """Turn word pieces and labels into fixed-length id arrays plus attention masks.

    Parameters
    ----------
    tokenized_texts : list of list of str
        Word pieces per sentence.
    labels : list of list of str
        Label per word piece; every label must be a key of ``tag2idx``.

    Returns
    -------
    (input_ids, tags, attention_masks)
        ``input_ids`` and ``tags`` are float32 arrays of shape
        ``(n_sentences, MAX_LEN)``, padded / truncated at the end;
        ``attention_masks`` is a list of lists holding 1.0 for positions with
        a non-zero input id and 0.0 otherwise.

    Raises
    ------
    KeyError
        If a label is missing from ``tag2idx``. Previously such labels became
        ``None`` (NaN in the float array) and failed much later, far from
        the cause.
    """
    unknown_labels = {l for lab in labels for l in lab if l not in tag2idx}
    if unknown_labels:
        raise KeyError("labels missing from tag2idx: {}".format(sorted(unknown_labels, key=str)))

    # Input ids are padded with 0; the mask below treats id 0 as padding.
    # (This used to be tag2idx['PAD'], a tag index that merely happens to be 0.)
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen = MAX_LEN, dtype = "float32", truncating = "post", padding = "post", value = 0)
    tags = pad_sequences([[tag2idx[l] for l in lab] for lab in labels], 
                         maxlen = MAX_LEN, value = tag2idx['PAD'], padding = "post",
                        dtype = "float32", truncating = "post")
    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    return input_ids, tags, attention_masks

# NOTE: `tags` is rebound here from the tag-name list to the padded label array.
input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_tokenized_labels)

In [None]:
# Convert ids, labels and masks to int64 tensors in one pass.
tr_inputs, tr_tags, tr_masks = (
    torch.tensor(array, dtype=torch.long)
    for array in (input_ids, tags, attention_masks)
)

# Dataset order is (ids, mask, labels); the training loop unpacks batches in that order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# Batches are drawn in a fresh random order every epoch.
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=bs)

In [None]:
# Token-classification head on top of multilingual BERT, one output per tag.
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(tag2idx))
# Follow the device chosen earlier instead of assuming CUDA is present.
model.to(device)
FULL_FINETUNING = True
if FULL_FINETUNING:
    # Fine-tune every parameter; biases and layer-norm parameters are exempt
    # from weight decay ('gamma' / 'beta' are the older layer-norm names).
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # torch.optim.Adam reads the per-group option `weight_decay`. The former
    # key `weight_decay_rate` is not an Adam option, so it was stored in the
    # group and ignored, i.e. no decay was applied at all.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Train the classification head only.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    """Fraction of positions whose arg-max prediction equals the label.

    ``preds`` holds scores with the classes on axis 2; ``labels`` is an array
    of label ids with one entry per scored position. Every position counts,
    padding included.
    """
    predicted_ids = np.argmax(preds, axis=2).reshape(-1)
    gold_ids = labels.reshape(-1)
    hits = np.sum(predicted_ids == gold_ids)
    return hits / len(gold_ids)

epochs = 15
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # ---- one pass over the training data ----
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the device; tensors arrive as (ids, mask, labels).
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        # With labels supplied the model call returns the loss.
        loss = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     labels=b_labels)
        loss.backward()
        # Running statistics for the per-epoch report.
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # Clip the gradient norm, apply the update, then clear the gradients.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        model.zero_grad()
    # Mean training loss over the batches of this epoch.
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

In [None]:
# Function receives a sentence with its labels, and the tokenized sentence and labels
def aggr_toks_labels_tags(orig_words, orig_labels, tok_wordps, tok_labels, predicted_tags):
    """Re-assemble word pieces into the original words and group their tags.

    For every word in ``orig_words`` consecutive word pieces (``##`` prefix
    stripped) are concatenated until they spell the word. The labels and the
    predicted tags of those pieces are joined into one string per word, each
    tag preceded by ``^`` (e.g. ``"^NN^NN"``).

    The input lists are read through a cursor and are left unmodified
    (they used to be consumed with ``pop(0)``).

    Parameters
    ----------
    orig_words : list of str
        The words to rebuild.
    orig_labels : list
        Not read; kept for interface compatibility.
    tok_wordps, tok_labels, predicted_tags : list of str
        Word pieces, their labels and their predicted tags, position-aligned.
        ``predicted_tags`` may be longer (e.g. predictions on padding); the
        surplus is ignored.

    Returns
    -------
    (joint_tokens, joint_labels, joint_predicted) : three lists, one entry per word.

    Raises
    ------
    IndexError
        If the pieces run out before a word is rebuilt, e.g. when a piece
        does not spell part of the word or the inputs were truncated.
    """
    joint_tokens = []
    joint_labels = []
    joint_predicted = []

    available = min(len(tok_wordps), len(tok_labels), len(predicted_tags))
    cursor = 0

    for word in orig_words:
        aggregated_tokenized = ""
        aggregated_label = ""
        aggregated_predicted = ""

        while aggregated_tokenized != word:
            if cursor >= available:
                raise IndexError(
                    "ran out of word pieces while rebuilding {!r} (built {!r} so far)".format(
                        word, aggregated_tokenized))

            piece = tok_wordps[cursor]
            if piece.startswith("##"):
                piece = piece[2:]

            aggregated_label += '^' + tok_labels[cursor]
            aggregated_predicted += '^' + predicted_tags[cursor]
            aggregated_tokenized += piece
            cursor += 1

        joint_tokens.append(aggregated_tokenized)
        joint_labels.append(aggregated_label)
        joint_predicted.append(aggregated_predicted)

    assert len(joint_tokens) == len(orig_words)
    assert len(joint_tokens) == len(joint_predicted)
    return joint_tokens, joint_labels, joint_predicted

In [None]:
def flat_accuracy(preds, labels):
    """Share of positions where the highest-scoring class matches the label id.

    ``preds`` carries the class scores on axis 2 and ``labels`` the label ids;
    all positions are counted, padded ones too.
    NOTE: the training cell defines an identical function with the same name;
    this later definition is the one in effect afterwards.
    """
    best_class = np.argmax(preds, axis=2).reshape(-1)
    expected = labels.reshape(-1)
    n_correct = np.sum(best_class == expected)
    return n_correct / len(expected)

def delete_pads_from_preds(predicted_tags, test_tags):
    """Drop every position whose reference tag is ``'PAD'``.

    Returns ``(clean_predicted, clean_test)``: the predicted and the reference
    tags at the positions that survive, in their original order.
    """
    kept_positions = [pos for pos, tag in enumerate(test_tags) if tag != 'PAD']
    clean_predicted = [predicted_tags[pos] for pos in kept_positions]
    clean_test = [test_tags[pos] for pos in kept_positions]
    return clean_predicted, clean_test

def delete_pads(predicted_tags, test_tags=None):
    """Return the predicted tags at positions that are not padding.

    Parameters
    ----------
    predicted_tags : list of str
        One predicted tag per position.
    test_tags : list of str, optional
        Reference tags marking the padded positions with ``'PAD'``.
        The function used to read an undefined global of this name, which
        raised NameError; it is now an explicit argument. When omitted, the
        predictions themselves are used as the reference, i.e. positions
        predicted as ``'PAD'`` are dropped.

    Returns
    -------
    list of str
    """
    if test_tags is None:
        test_tags = predicted_tags
    return [pred for pred, ref in zip(predicted_tags, test_tags) if ref != 'PAD']

    
def calculate_accuracy(df):
    """Fraction of rows whose ``predicted_tag`` equals their ``test_tag``."""
    matching_rows = sum(
        1
        for expected, predicted in zip(df['test_tag'], df['predicted_tag'])
        if expected == predicted
    )
    return matching_rows/len(df)
                
def test_model(sentence, labels, tok_sent, tok_labels, corres_tokens, sent_id):
    """Tag one sentence with the fine-tuned model and return a per-word frame.

    Parameters
    ----------
    sentence : list of str
        The words of the sentence.
    labels : list of str
        Their reference labels (copied into the result, not used for prediction).
    tok_sent, tok_labels : list of str
        Word pieces of the sentence and their labels. Neither list is modified.
    corres_tokens, sent_id : list
        Per-word token index and sentence id, copied into the result.

    Returns
    -------
    pandas.DataFrame
        One row per word with the columns ``word``, ``orig_label``,
        ``predicted_tag`` (the ``^``-joined predictions of the word's pieces),
        ``corresToken`` and ``sent_id``.

    Raises
    ------
    ValueError
        If the sentence has more than ``MAX_LEN`` word pieces; the truncated
        input would not yield a prediction for every piece.
    """
    if len(tok_sent) > MAX_LEN:
        raise ValueError("sentence has {} word pieces, more than MAX_LEN={}".format(len(tok_sent), MAX_LEN))

    # Prediction does not need gold labels, so an empty label list is padded;
    # this keeps evaluation independent of labels unseen during training.
    input_ids, _, attention_masks = pad_sentences_and_labels([tok_sent], [[]])
    val_inputs = torch.tensor(input_ids, dtype=torch.long).to(device)
    val_masks = torch.tensor(attention_masks, dtype=torch.long).to(device)

    # Single forward pass (the former extra pass only produced an unused loss).
    model.eval()
    with torch.no_grad():
        logits = model(val_inputs, token_type_ids=None, attention_mask=val_masks)

    predicted_ids = np.argmax(logits.detach().cpu().numpy(), axis=2)[0]
    # Only the first len(tok_sent) positions are real word pieces; the rest is padding.
    pred_tags = [idx2tag[tag_id] for tag_id in predicted_ids[:len(tok_sent)]]

    # Pass copies so the caller's tokenized lists stay intact.
    joint_tokenized, joint_labels, preds = aggr_toks_labels_tags(sentence, labels, list(tok_sent),
                                                                 list(tok_labels), pred_tags)

    tmp = {'word': sentence, 'orig_label': labels, 'predicted_tag': preds, 
           'corresToken': corres_tokens, 'sent_id': sent_id}
    tmp_df = pd.DataFrame(data=tmp)
    return tmp_df



# full_df
# f1_accuracy = calculate_accuracy(full_df)
# print("Accuracy (F1): = {}".format(f1_accuracy))

In [None]:
dev_tokenized_texts, dev_tokenized_labels = tokenize(dev_sentences, dev_labels, 
                                                     gold_dev_sentences, gold_dev_labels, gold_dev_corresTokens)

# print(len(dev_tokenized_texts), len(dev_tokenized_labels), len(dev_sentences))
# Tag every dev sentence, collect the per-sentence frames and concatenate them
# once. Appending to a growing frame inside the loop copies the whole frame on
# every step, and DataFrame.append no longer exists in pandas 2.
dev_frames = [
    test_model(sent, label, tok_sent, tok_label, corresTokens, sent_id)
    for sent, label, tok_sent, tok_label, corresTokens, sent_id in zip(dev_sentences,
                                                                       dev_labels,
                                                                       dev_tokenized_texts,
                                                                       dev_tokenized_labels,
                                                                       dev_corresTokens,
                                                                       dev_sent_ids)
]
# pd.concat rejects an empty list, so keep the empty-frame result for that case.
full_dev_df = pd.concat(dev_frames, ignore_index=True, sort=False) if dev_frames else pd.DataFrame()

In [None]:
test_tokenized_texts, test_tokenized_labels = tokenize(test_sentences, test_labels, 
                                                     gold_test_sentences, gold_test_labels, gold_test_corresTokens)

# Tag every test sentence, collect the per-sentence frames and concatenate them
# once. Appending to a growing frame inside the loop copies the whole frame on
# every step, and DataFrame.append no longer exists in pandas 2.
test_frames = [
    test_model(sent, label, tok_sent, tok_label, corresTokens, sent_id)
    for sent, label, tok_sent, tok_label, corresTokens, sent_id in zip(test_sentences,
                                                                       test_labels,
                                                                       test_tokenized_texts,
                                                                       test_tokenized_labels,
                                                                       test_corresTokens,
                                                                       test_sent_ids)
]
# pd.concat rejects an empty list, so keep the empty-frame result for that case.
full_test_df = pd.concat(test_frames, ignore_index=True, sort=False) if test_frames else pd.DataFrame()

In [None]:
# Show the last 50 rows of the dev predictions.
full_dev_df.iloc[-50:]

In [None]:
from more_itertools import unique_everseen

def unique_vals_to_list(df):
    """Replace the ``^``-joined tag strings of ``df`` by lists of distinct tags.

    The columns ``predicted_tag`` and ``orig_label`` are rewritten in place:
    each string is split on ``^``, empty parts are dropped, and repeated tags
    are removed while the order of first appearance is kept
    (``"^NN^NN^DEF"`` becomes ``["NN", "DEF"]``).

    Values that are already lists are only de-duplicated, so running the
    function again on the same frame is harmless (it used to fail with
    AttributeError on the second run).
    """
    def _distinct_tags(joined):
        # Accept both the raw "^A^B" string and an already split list.
        parts = joined.split('^') if isinstance(joined, str) else joined
        # dict.fromkeys keeps the first occurrence of every tag, in order.
        return list(dict.fromkeys(tag for tag in parts if tag))

    for column in ('predicted_tag', 'orig_label'):
        df[column] = df[column].map(_distinct_tags)
        
# Convert the joined tag strings of both result frames into lists of distinct tags (in place).
unique_vals_to_list(full_dev_df)
unique_vals_to_list(full_test_df)

In [None]:
# Show the last 30 rows of the test predictions.
full_test_df.tail(30)

In [None]:
def exact_match_accuracy(df):
    """Count the rows whose ``predicted_tag`` equals their ``orig_label``.

    Despite the name this returns the number of matching rows, not a ratio;
    the caller divides by the number of rows.
    """
    return sum(
        1
        for expected, predicted in zip(df['orig_label'], df['predicted_tag'])
        if expected == predicted
    )

# Exact-match accuracy = matching rows / all rows, as a percentage.
# NOTE: dividing by len(...) raises ZeroDivisionError when a result frame is empty.
print("DEV - Exact Match Accuracy = {0:.2f}%".format(exact_match_accuracy(full_dev_df)/len(full_dev_df) * 100))
print("TEST - Exact Match Accuracy = {0:.2f}%".format(exact_match_accuracy(full_test_df)/len(full_test_df) * 100))

In [None]:
def existence_accuracy(df):
    """Print tag-level precision, recall and F1 over all rows of ``df``.

    Every row holds a list of reference tags (``orig_label``) and a list of
    predicted tags (``predicted_tag``). A tag counts as correct when it occurs
    in both lists of a row, regardless of its position.

    * precision = correct tags / predicted tags
    * recall    = correct tags / reference tags
    * F1        = harmonic mean of the two

    A ratio whose denominator is zero is reported as 0.00% instead of raising
    ZeroDivisionError (e.g. an empty frame, or no overlap at all).
    """
    # correct tag = appeared in predicted and in gold
    total_orig_num_of_labels = 0
    total_predicted_num_of_labels = 0
    total_num_of_correct_tags = 0

    for orig_list, predicted_list in zip(df['orig_label'], df['predicted_tag']):
        total_orig_num_of_labels += len(orig_list)
        total_predicted_num_of_labels += len(predicted_list)
        total_num_of_correct_tags += len(set(orig_list).intersection(set(predicted_list)))

    if total_predicted_num_of_labels:
        precision = total_num_of_correct_tags / total_predicted_num_of_labels * 100
    else:
        precision = 0.0
    if total_orig_num_of_labels:
        recall = total_num_of_correct_tags / total_orig_num_of_labels * 100
    else:
        recall = 0.0
    if precision + recall:
        f1 = 2*precision*recall/(precision+recall)
    else:
        f1 = 0.0

    print("Precision: {0:.2f}%".format(precision))
    print("Recall: {0:.2f}%".format(recall))
    print("F1: {0:.2f}%".format(f1))
    
# Tag-level precision / recall / F1 for the dev and the test predictions.
print("DEV:")
existence_accuracy(full_dev_df)
print("TEST:")
existence_accuracy(full_test_df)