In [1]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import bclm

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


In [2]:
torch.manual_seed(3)
np.random.seed(3)
torch.cuda.manual_seed_all(3)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
train = bclm.read_dataframe('spmrl', subset='train')
train_df = bclm.get_token_df(train, ['upostag'])
train_df['token_str'] = train_df['token_str'].str.replace('”','"')

dev = bclm.read_dataframe('spmrl', subset='dev')
dev_df = bclm.get_token_df(dev, ['upostag'])
dev_df['token_str'] = dev_df['token_str'].str.replace('”','"')

test = bclm.read_dataframe('spmrl', subset='test')
test_df = bclm.get_token_df(test, ['upostag'])
test_df['token_str'] = test_df['token_str'].str.replace('”','"')

In [4]:
train_df['prefix'] = ''
train_df['host'] = ''

dev_df['prefix'] = ''
dev_df['host'] = ''

test_df['prefix'] = ''
test_df['host'] = ''

In [5]:
dev_df.head(30)

Unnamed: 0,sent_id,token_id,token_str,upostag,set,prefix,host
0,1,1,עשרות,CDT,dev,,
1,1,2,אנשים,NN,dev,,
2,1,3,מגיעים,BN,dev,,
3,1,4,מתאילנד,PREPOSITION^NNP,dev,,
4,1,5,לישראל,PREPOSITION^NNP,dev,,
5,1,6,כשהם,TEMP^PRP,dev,,
6,1,7,נרשמים,BN,dev,,
7,1,8,כמתנדבים,PREPOSITION^NN,dev,,
8,1,9,",",yyCM,dev,,
9,1,10,אך,CC,dev,,


In [6]:
def prefix_corretion(prefix):
    not_prefixes = ['VB^POS','VB^DUMMY_AT','VB^AT', 'IN^DUMMY_AT', 'NN', 'IN', 'P',
                   'POS', 'RB', 'AT']
    first_prefix = ['DEF^BN^AT', 'CONJ^VB^AT','CONJ^BN^AT', 'CONJ^BN^AT', 'REL^VB^AT',
                   'PREPOSITION^RB', 'DEF^yyQUOT', 'IN^yyQUOT', 'CONJ^yyQUOT^DEF',
                   'PREPOSITION^yyQUOT^PREPOSITION^DEF', 'REL^yyQUOT', 'REL^IN', 'CONJ^IN',
                   'PREPOSITION^yyQUOT', 'PREPOSITION^POS', 'PREPOSITION^IN', 'PREPOSITION^yyQUOT^DEF',
                   'REL^yyQUOT^PREPOSITION', 'CONJ^yyQUOT', 'REL^AT']
    
    two_first_prefixes = ['CONJ^PREPOSITION^yyQUOT', 'PREPOSITION^DEF^yyQUOT', 'CONJ^DEF^yyQUOT']
    
    if prefix in not_prefixes:
        host = prefix
        prefix = '-'
    
    elif prefix in first_prefix:
        tag_list = prefix.split('^')
        prefix = tag_list[0]
        host = '^'.join(tag_list[1:])
        
    elif prefix in two_first_prefixes:
        tag_list = prefix.split('^')
        prefix = '^'.join(tag_list[0:2])
        host = '^'.join(tag_list[2:])
        
    else:
        host = ""
        prefix = prefix
        
    return prefix, host
        

def full_tag_to_prefix_host(df):
    for index in df.index:
        full_tag = df.at[index, 'upostag']
        full_tag_list = full_tag.split('^')
        if len(full_tag_list) > 1:
            prefix = '^'.join(full_tag_list[:-1])
            prefix, host = prefix_corretion(prefix)
            df.at[index, 'prefix'] = prefix
            if len(host) > 0:
                host += '^'
            host += full_tag_list[-1]
            df.at[index, 'host'] = host
        else:
            df.at[index, 'prefix'] = '-'
            df.at[index, 'host'] = full_tag_list[-1]

In [7]:
full_tag_to_prefix_host(train_df)
full_tag_to_prefix_host(dev_df)
full_tag_to_prefix_host(test_df)

train_df.tail(30)

Unnamed: 0,sent_id,token_id,token_str,upostag,set,prefix,host
93474,5436,15,נעליים,NN,train,-,NN
93475,5436,16,",",yyCM,train,-,yyCM
93476,5436,17,איך,QW,train,-,QW
93477,5436,18,מנהלים,BN,train,-,BN
93478,5436,19,חשבון,NN,train,-,NN
93479,5436,20,בבנק,PREPOSITION^DEF^NN,train,PREPOSITION^DEF,NN
93480,5436,21,.,yyDOT,train,-,yyDOT
93481,5437,1,אילנה,NNP,train,-,NNP
93482,5437,2,נחום,NNP,train,-,NNP
93483,5437,3,",",yyCM,train,-,yyCM


In [8]:
data = pd.concat([train_df, dev_df, test_df])
data.head()
tag_vals = list(set(data["prefix"].values))
tags = ['PAD'] + tag_vals
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

print(tag2idx)
# print(idx2tag)
print(len(tags))

{'PAD': 0, 'CONJ^PREPOSITION': 1, 'TEMP': 2, 'ZVL^PREPOSITION': 3, 'IN': 4, 'IN^RB': 5, 'PREPOSITION^REL': 6, 'TEMP^PREPOSITION': 7, 'CONJ^REL': 8, 'TEMP^PREPOSITION^DEF': 9, 'ZVL^DEF': 10, 'DEF^DEF': 11, 'ZVL': 12, 'DEF': 13, 'CONJ^IN^DEF': 14, 'REL^DEF': 15, 'PREPOSITIONIN': 16, 'REL^PREPOSITION^yyQUOT': 17, 'PREPOSITION^PREPOSITION': 18, 'ZVL^PREPOSITION^DEF': 19, 'REL^yyQUOT^DEF': 20, 'PREPOSITION^IN^DEF': 21, 'ADVERB': 22, 'IN^DEF': 23, 'REL': 24, 'CC^ZVL^DEF': 25, 'REL^ADVERB': 26, 'IN^IN': 27, 'PREPOSITION^DEF': 28, 'CONJ': 29, 'PREPOSITION^ADVERB': 30, 'CONJ^REL^PREPOSITION': 31, '-': 32, 'PREPOSITION^ PREPOSITION^DEF': 33, 'REL^PREPOSITION': 34, 'IN^REL': 35, 'CONJ^TEMP': 36, 'CONJ^REL^DEF': 37, 'TEMP^DEF': 38, 'DEF^PREPOSITION': 39, 'PREPOSITION': 40, 'CONJ^PREPOSITION^DEF': 41, 'IN^IN^DEF': 42, 'PREPOSITION^yyQUOT^PREPOSITION': 43, 'PREPOSITIONIN^PREPOSITION': 44, 'PREPOSITION^PREPOSITION^DEF': 45, 'CONJ^DEF': 46, 'REL^PREPOSITION^DEF': 47}
48


In [None]:
# for label in tag_vals:
#     print(data[data['prefix'] == 'PREPOSITIONIN'])
    
data[data['prefix'] == 'PREPOSITIONIN']

In [9]:
class sentenceGetter(object):
    def __init__(self, data, max_sent=None):
        self.index = 0
        self.max_sent = max_sent
        self.tokens = data['token_str']
        self.labels = data['prefix']
        #for evaluating by word-accuracy
        self.correspondingToken = data['token_id']
        self.orig_sent_id = data['sent_id']
    
    def sentences(self):
        sent = []
        counter = 0
        
        for token,label, corres_tok, sent_id in zip(self.tokens, self.labels, self.correspondingToken, self.orig_sent_id):
            sent.append((token, label, corres_tok, sent_id))
            if token.strip() == ".":
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

train_getter = sentenceGetter(train_df)
dev_getter = sentenceGetter(dev_df)
test_getter = sentenceGetter(test_df)

train_sentences = [[token for token, label, corres_tok, sent_id in sent] for sent in train_getter.sentences()]
train_labels = [[label for token, label, corres_tok, sent_id in sent] for sent in train_getter.sentences()]

dev_sentences = [[token for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]
dev_labels = [[label for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]
dev_corresTokens = [[corres_tok for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]
dev_sent_ids = [[sent_id for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]

test_sentences = [[token for token, label, corres_tok, sent_id in sent] for sent in test_getter.sentences()]
test_labels = [[label for token, label, corres_tok, sent_id in sent] for sent in test_getter.sentences()]

print(train_sentences[10])
print(train_labels[10])

print(len(dev_sentences))

del dev_sentences[296]
del dev_labels[296]
del dev_corresTokens[296]
del dev_sent_ids[296]


del dev_sentences[226]
del dev_labels[226]
del dev_corresTokens[226]
del dev_sent_ids[226]


del dev_sentences[57]
del dev_labels[57]
del dev_corresTokens[57]
del dev_sent_ids[57]


del dev_sentences[49]
del dev_labels[49]
del dev_corresTokens[49]
del dev_sent_ids[49]

['הם', 'התבקשו', 'לדווח', 'למשטרה', 'על', 'תנועותיהם', '.']
['-', '-', '-', 'PREPOSITION^DEF', '-', '-', '-']
490


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.set_device(1)

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
print("Name of gpu: " + torch.cuda.get_device_name(0))

Device: cuda
Number of gpus: 4
Name of gpu: GeForce GTX 1080 Ti


In [11]:
MAX_LEN = 150
bs = 32

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
def tokenize(sentences, orig_labels):
    tokenized_texts = []
    labels = []
    for sent, sent_labels in zip(sentences, orig_labels):
        bert_tokens = []
        bert_labels = []
        for orig_token, orig_label in zip(sent, sent_labels):
            b_tokens = tokenizer.tokenize(orig_token)
            bert_tokens.extend(b_tokens)
            for b_token in b_tokens:
                bert_labels.append(orig_label)
        tokenized_texts.append(bert_tokens)
        labels.append(bert_labels)
        assert len(bert_tokens) == len(bert_labels)
    return tokenized_texts, labels

train_tokenized_texts, train_tokenized_labels = tokenize(train_sentences, train_labels)
print(train_tokenized_texts[10])
print(train_tokenized_labels[10])

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


['הם', 'ה', '##ת', '##בק', '##שו', 'ל', '##דו', '##וח', 'ל', '##משטרה', 'על', 'ת', '##נוע', '##ות', '##יהם', '.']
['-', '-', '-', '-', '-', '-', '-', '-', 'PREPOSITION^DEF', 'PREPOSITION^DEF', '-', '-', '-', '-', '-', '-']


In [13]:
def pad_sentences_and_labels(tokenized_texts, labels):
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen = MAX_LEN, dtype = "float32", truncating = "post", padding = "post", value = tag2idx['PAD'])
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], 
                         maxlen = MAX_LEN, value = tag2idx['PAD'], padding = "post",
                        dtype = "float32", truncating = "post")
    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    return input_ids, tags, attention_masks

input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_tokenized_labels)

In [14]:
tr_inputs = torch.tensor(input_ids, dtype=torch.long)
tr_tags = torch.tensor(tags, dtype=torch.long)
tr_masks = torch.tensor(attention_masks, dtype=torch.long)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=bs, shuffle=True)

In [15]:
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(tag2idx))
model.cuda()
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
#     print (pred_flat, labels_flat)
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

epochs = 15
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

Epoch:   7%|▋         | 1/15 [01:24<19:48, 84.88s/it]

Train loss: 0.6085739363180963


Epoch:  13%|█▎        | 2/15 [02:54<18:42, 86.32s/it]

Train loss: 0.1686880534122649


Epoch:  20%|██        | 3/15 [04:24<17:27, 87.29s/it]

Train loss: 0.10560035813403756


Epoch:  27%|██▋       | 4/15 [05:52<16:04, 87.68s/it]

Train loss: 0.07537848825909589


Epoch:  33%|███▎      | 5/15 [07:20<14:37, 87.76s/it]

Train loss: 0.054457569531606215


Epoch:  40%|████      | 6/15 [08:48<13:09, 87.70s/it]

Train loss: 0.04267892657509564


Epoch:  47%|████▋     | 7/15 [10:14<11:39, 87.42s/it]

Train loss: 0.03579133368207534


Epoch:  53%|█████▎    | 8/15 [11:42<10:11, 87.33s/it]

Train loss: 0.029969054543854373


Epoch:  60%|██████    | 9/15 [13:09<08:44, 87.39s/it]

Train loss: 0.023491860215711455


Epoch:  67%|██████▋   | 10/15 [14:36<07:16, 87.37s/it]

Train loss: 0.020394237085527397


Epoch:  73%|███████▎  | 11/15 [16:04<05:49, 87.37s/it]

Train loss: 0.019622690552894614


Epoch:  80%|████████  | 12/15 [17:31<04:22, 87.35s/it]

Train loss: 0.015630869830554155


Epoch:  87%|████████▋ | 13/15 [18:59<02:54, 87.37s/it]

Train loss: 0.015729057546674374


Epoch:  93%|█████████▎| 14/15 [20:26<01:27, 87.35s/it]

Train loss: 0.01400521895036371


Epoch: 100%|██████████| 15/15 [21:53<00:00, 87.58s/it]

Train loss: 0.012883164258949508





In [20]:
torch.save(model.state_dict(), './prefix-finetuning1.pt')

In [None]:
# Function receives a sentence with its labels, and the tokenized sentence and labels
def aggr_toks_labels_tags(orig_words, orig_labels, tok_wordps, tok_labels, predicted_tags):
    
    joint_tokens = []
    joint_labels = []
    joint_predicted = []
#     joint_test = []
    
    for word in orig_words:
        aggregated_tokenized = ""
        aggregated_label = ""
        aggregated_predicted = ""
        aggregated_test = ""
        
        while aggregated_tokenized != word:
#             print(len(tok_sent))
            tmpTok = tok_wordps.pop(0)
#             print(tmpTok)
#             print(joint_tokens)
            if tmpTok.startswith("##"):
                tmpTok = tmpTok[2:]
                
            tmpLab = tok_labels.pop(0)
#             if aggregated_label == "":
            aggregated_label += '^'
            aggregated_label += tmpLab

                
            tmpPred = predicted_tags.pop(0)
#             print(tmpPred)

            aggregated_predicted += '^'
            aggregated_predicted += tmpPred
#             if aggregated_predicted == "":
#                 aggregated_predicted = tmpPred
                
#             tmpTest = test_tags.pop(0)
#             if aggregated_test == "":
#                 aggregated_test = tmpTest
                
            aggregated_tokenized += tmpTok
#             print(aggregated_tokenized)
            
        joint_tokens.append(aggregated_tokenized)
        joint_labels.append(aggregated_label)
        joint_predicted.append(aggregated_predicted)
#         joint_test.append(aggregated_test)
        
    assert len(joint_tokens) == len(orig_words)
    assert len(joint_tokens) == len(joint_predicted)
    return joint_tokens, joint_labels, joint_predicted

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def delete_pads_from_preds(predicted_tags, test_tags):
    clean_predicted = []
    clean_test = []
    
    for ix in range(0, len(test_tags)):
        if test_tags[ix] != 'PAD':
            clean_predicted.append(predicted_tags[ix])
            clean_test.append(test_tags[ix])
            
    return clean_predicted, clean_test
    
def calculate_accuracy(df):
    numOfCorrectPredictions = 0
    for index in df.index:
        orig_pos = df.at[index, 'test_tag']
        pred_pos = df.at[index, 'predicted_tag']
        if orig_pos == pred_pos:
            numOfCorrectPredictions += 1
    return numOfCorrectPredictions/len(df)
                
def test_model(sentence, labels, tok_sent, tok_labels, corres_tokens, sent_id):
    input_ids, tags, attention_masks = pad_sentences_and_labels([tok_sent], [tok_labels])

    val_inputs = torch.tensor(input_ids, dtype=torch.long)
    val_tags = torch.tensor(tags, dtype=torch.long)
    val_masks = torch.tensor(attention_masks, dtype=torch.long)

    test_data = TensorDataset(val_inputs, val_masks, val_tags)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    counter = 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.append([list(p) for p in np.argmax(logits, axis=2)])
        
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    
    pred_tags = [idx2tag[p_ii] for p in predictions for p_i in p for p_ii in p_i]
    test_tags = [idx2tag[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
#     print(list(zip(pred_tags, test_tags)))
    # -----------------------------------------------------------------------
    clean_predicted, clean_test = delete_pads_from_preds(pred_tags, test_tags)
    joint_tokenized, joint_labels, preds, tests = aggr_toks_labels_tags(sentence, labels, tok_sent, tok_labels, 
                                                                        clean_predicted, clean_test)
    
    tmp = {'word': sentence, 'orig_label': labels, 'predicted_tag': preds, 'test_tag': tests, 
           'corresToken': corres_tokens, 'sent_id': sent_id}
    tmp_df = pd.DataFrame(data=tmp)
    # -----------------------------------------------------------------------
    
#     y_true = pd.Series(test_tags)
#     y_pred = pd.Series(pred_tags)
#     cross_tab = pd.crosstab(y_true, y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
#     report = classification_report(y_true, y_pred)
#     print(report)
#     print(tmp_df)
    return tmp_df

full_df = pd.DataFrame()
dev_tokenized_texts, dev_tokenized_labels = tokenize(dev_sentences, dev_labels)
for sent, label, tok_sent, tok_label, corresTokens, sent_id in zip(dev_sentences, dev_labels, dev_tokenized_texts, 
                                                                   dev_tokenized_labels, dev_corresTokens, 
                                                                   dev_sent_ids):
    test_df = test_model(sent, label, tok_sent, tok_label, corresTokens, sent_id)
    full_df = full_df.append(test_df, ignore_index=True, sort=False)

# full_df
f1_accuracy = calculate_accuracy(full_df)
print("Accuracy (F1): = {}".format(f1_accuracy))

In [None]:
full_df.iloc[0:30]

In [None]:
def rename_prefix_output(prefix_df):
    prefix_df = prefix_df.rename(columns={"orig_label": "orig_prefix", "predicted_tag": "predicted_prefix"})
    
def rename_host_output(host_df):
    host_df = host_df.rename(columns={"orig_label": "orig_host", "predicted_tag": "predicted_host"})

In [None]:
rename_prefix_output(full_df)
full_df.to_csv('prefix-5-setting1.csv')

In [None]:
full_df.head()

In [None]:
host_df = pd.read_csv()

In [None]:

def join_prefix_host(prefix_df, host_df)

In [None]:
from more_itertools import unique_everseen

def unique_vals_to_list(df):
    for index in df.index:
        joint_pred = df.at[index, 'predicted_tag']
        joint_orig = df.at[index, 'orig_label']
        
        predicted_tag_list = joint_pred.split('^')
        predicted_tag_list_no_empty = list(filter(None, predicted_tag_list))
        original_tag_list = joint_orig.split('^')
        original_tag_list_no_empty = list(filter(None, original_tag_list))

        
        df.at[index, 'predicted_tag'] = list(unique_everseen(predicted_tag_list_no_empty))
        df.at[index, 'orig_label'] = list(unique_everseen(original_tag_list_no_empty))
        
        
unique_vals_to_list(full_df)

In [None]:
full_df.head(30)

In [None]:
def exact_match_accuracy(df):
    exact_matches = 0
    for index in df.index:
        if df.at[index, 'orig_label'] == df.at[index, 'predicted_tag']:
            exact_matches += 1
            
    return exact_matches

print("Exact Match Accuracy = {0:.2f}%".format(exact_match_accuracy(full_df)/len(full_df) * 100))

In [None]:
def existence_accuracy(df):
    # correct tag = appeared in predicted and in gold
    total_orig_num_of_labels = 0
    total_predicted_num_of_labels = 0
    total_num_of_correct_tags = 0
    
    for index in df.index:
        orig_list = df.at[index, 'orig_label']
        predicted_list = df.at[index, 'predicted_tag']
        total_orig_num_of_labels += len(orig_list)
        total_predicted_num_of_labels += len(predicted_list)
        total_num_of_correct_tags += len(set(orig_list).intersection(set(predicted_list)))
        
    precision = total_num_of_correct_tags / total_predicted_num_of_labels * 100
    recall = total_num_of_correct_tags / total_orig_num_of_labels * 100
    f1 = 2*precision*recall/(precision+recall)
    
    print("Precision: {0:.2f}%".format(precision))
    print("Recall: {0:.2f}%".format(recall))
    print("F1: {0:.2f}%".format(f1))
    
existence_accuracy(full_df)

Creating standard df for multi-label pos

In [None]:
class StdDf(object):
    def __init__(self):
        self.data = pd.concat([train, dev, test])
        self.tag_vals = list(set(data['upostag'].values))
        self.df = pd.DataFrame(columns = tag_vals)
        self.create_multilabel_df()
        
    def create_multilabel_df(self):        
        self.df['sent_id'] = '0'
        self.df['token_id'] = '0'
        self.df['token'] = ''
        cols = self.df.columns.tolist()
        cols = cols[-3:] + cols[:-3]
        self.df = self.df[cols]
        self.df[self.df.columns[3:]] = 0


std_df = StdDf()
std_df.df.head()

In [None]:
def raw_to_multilabel_df(raw_df):
    multi_df = StdDf()
    for index in raw_df.index:
        multi_df.df.at[index, 'sent_id'] = raw_df.at[index, 'sent_id']
        multi_df.df.at[index, 'token_id'] = raw_df.at[index, 'token_id']
        multi_df.df.at[index, 'token'] = raw_df.at[index, 'token_str']
        
        l_pos_tags = raw_df.at[index, 'upostag']
        multi_df.df.at[index, l_pos_tags] = 1
        
    return multi_df.df
        
    
multi_dev_df = raw_to_multilabel_df(dev_df)
multi_dev_df.head()

In [None]:
dev_df.head()

In [None]:
multi_dev_df[multi_dev_df.columns[3:]] = 0

for index in dev_df.index:
    l_pos_tags = dev_df.at[index, 'upostag']
    multi_dev_df.at[index, l_pos_tags] = 1
    
multi_dev_df.head()