## Current setup using this notebook
<p>label - POS</p>
<p>Fine tune on: raw-train</p>
<p>Evaluate on: Raw-dev (and Raw-test, but not reported)</p>
<p>Classification by: whole word (as opposed to prefix/host)</p>
<p>Morphologically informed labels? None </p>
<p>Shuffle/Sort? shuffle </p>

In [None]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import bclm

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

### Manually setting seeds

In [None]:
# Seed NumPy and PyTorch (CPU and all CUDA devices) with the same value, and
# switch cuDNN to its deterministic, non-autotuned mode.
SEED = 3
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

### Data
`bclm.read_dataframe('spmrl', subset='train')` - gives the gold-segmented tokens <br>
`bclm.get_token_df(train, ['upostag'])` - gives the raw tokens<br>
`bclm.read_dataframe('yap_dev')` - gives the YAP tokenization (only available for `yap_dev` and `yap_test`; there is no `yap_train`).

In [None]:
def _load_split(subset):
    """Read one SPMRL split and build its token-level dataframe.

    Returns the dataframe produced by ``bclm.read_dataframe`` together with
    the dataframe produced by ``bclm.get_token_df`` (requesting the
    'upostag' and 'biose' fields), after replacing the right curly quote
    with a plain double quote in its 'token_str' column.
    """
    segmented = bclm.read_dataframe('spmrl', subset=subset)
    token_level = bclm.get_token_df(segmented, ['upostag', 'biose'])
    token_level['token_str'] = token_level['token_str'].str.replace('”','"')
    return segmented, token_level


train, train_df = _load_split('train')

dev, dev_df = _load_split('dev')

test, test_df = _load_split('test')

In [None]:
# Preview the first 20 rows of the dev token dataframe.
dev_df.head(20)

### Uniform column names
Note that the column names in yap dfs can be slightly different from spmrl dfs

In [None]:
# For Evaluating on Raw-dev/Raw-test
# Rename 'token_str' to 'form' in place on every split, so that later code
# can read the surface form from the same column name everywhere.
for _split_df in (train_df, dev_df, test_df):
    _split_df.rename(columns={"token_str": "form"}, inplace=True)

In [None]:
## For Evaluating on Yap-dev/Yap-test
# dev_df.rename(columns = {"misc_token_id": "token_id"}, inplace = True)
# test_df.rename(columns = {"misc_token_id": "token_id"}, inplace = True)

In [None]:
# Preview the first 10 rows after the column rename.
dev_df.head(10)

### Add sorting on the dataframes

In [None]:
# train_df['sent_len'] = train_df.groupby('sent_id').id.transform('size')
# s = train_df.sort_values(by=['sent_len', 'sent_id', 'id']).index
# train_df_sorted = train_df.reindex(s)
# train_df_sorted.groupby('sent_id', sort=False).size()

In [None]:
# train_df_sorted.iloc[150:200]

### Get lists of sentences
Note that the longest sentences from the dev and test splits need to be removed, and which sentences those are depends on whether or not sorting was applied.

In [None]:
class sentenceGetter(object):
    """Group the rows of a token table into sentences.

    ``data`` must support ``data[column]`` for the columns 'form',
    'upostag', 'token_id' and 'sent_id'; the four columns are iterated in
    lockstep.
    """

    def __init__(self, data, max_sent=None):
        self.index = 0
        self.max_sent = max_sent
        self.tokens = data['form']
        self.labels = data['upostag']
        # kept so that predictions can later be evaluated per word
        self.correspondingToken = data['token_id']
        self.orig_sent_id = data['sent_id']

    def sentences(self):
        """Yield sentences as lists of (token, label, token_id, sent_id) tuples.

        A sentence is closed whenever a token equals "." after stripping
        whitespace; the 'sent_id' column is not used for splitting.  Tokens
        that follow the last "." are never yielded.  When ``max_sent`` is
        set, iteration stops once that many sentences have been produced.
        """
        emitted = 0
        current = []
        rows = zip(self.tokens, self.labels, self.correspondingToken, self.orig_sent_id)

        for row in rows:
            current.append(row)
            if row[0].strip() == ".":
                yield current
                current = []
                emitted += 1
            # the limit is checked after every token, not only after a yield
            if self.max_sent is not None and emitted >= self.max_sent:
                return

train_getter = sentenceGetter(train_df)
dev_getter = sentenceGetter(dev_df)
test_getter = sentenceGetter(test_df)


def _field(grouped, position):
    """Pick one field (by tuple position) out of every row of every sentence."""
    return [[row[position] for row in sent] for sent in grouped]


# Each row is (token, label, corres_tok, sent_id); split every sentence list
# into parallel per-field lists.
_train_rows = list(train_getter.sentences())
train_sentences = _field(_train_rows, 0)
train_labels = _field(_train_rows, 1)

_dev_rows = list(dev_getter.sentences())
dev_sentences = _field(_dev_rows, 0)
dev_labels = _field(_dev_rows, 1)
dev_corresTokens = _field(_dev_rows, 2)
dev_sent_ids = _field(_dev_rows, 3)

_test_rows = list(test_getter.sentences())
test_sentences = _field(_test_rows, 0)
test_labels = _field(_test_rows, 1)
test_corresTokens = _field(_test_rows, 2)
test_sent_ids = _field(_test_rows, 3)

# Spot-check one training sentence and report the dev/test sentence counts.
print(train_sentences[10])
print(train_labels[10])

print(len(dev_sentences))
print(len(test_sentences))

In [None]:
## Get the longest sentences in the dev and test sets
def _report_longest_sentences(sentences):
    """Print every running-maximum sentence length together with its position.

    A sentence is reported when it is at least as long as every sentence
    seen before it.  The position comes from ``enumerate`` rather than
    ``list.index``, so duplicated sentences are reported at their own index
    and the scan is linear.

    Returns the length of the longest sentence (0 for an empty list).
    """
    longest = 0
    for position, sent in enumerate(sentences):
        if len(sent) >= longest:
            longest = len(sent)
            print(longest)
            print("index of longest sentence:{} ".format(position))
    return longest


longest_sent_len = _report_longest_sentences(dev_sentences)

longest_sent_len = _report_longest_sentences(test_sentences)

In [None]:
## Remove too long sentences

# The indices are hard-coded and listed in descending order, so removing one
# entry never shifts the position of an entry that is still to be removed.
# Every index is dropped from all four parallel lists to keep them aligned.
for _drop_ix in (296, 226, 57, 49):
    for _parallel in (dev_sentences, dev_labels, dev_corresTokens, dev_sent_ids):
        del _parallel[_drop_ix]


for _drop_ix in (396, 164, 157, 151):
    for _parallel in (test_sentences, test_labels, test_corresTokens, test_sent_ids):
        del _parallel[_drop_ix]

# ## YAP deletions
# del dev_sentences[296]
# del dev_labels[296]
# del dev_corresTokens[296]
# del dev_sent_ids[296]

# del dev_sentences[226]
# del dev_labels[226]
# del dev_corresTokens[226]
# del dev_sent_ids[226]

# del dev_sentences[57]
# del dev_labels[57]
# del dev_corresTokens[57]
# del dev_sent_ids[57]

# del dev_sentences[49]
# del dev_labels[49]
# del dev_corresTokens[49]
# del dev_sent_ids[49]

# del dev_sentences[24]
# del dev_labels[24]
# del dev_corresTokens[24]
# del dev_sent_ids[24]

# del dev_sentences[22]
# del dev_labels[22]
# del dev_corresTokens[22]
# del dev_sent_ids[22]

# del dev_sentences[12]
# del dev_labels[12]
# del dev_corresTokens[12]
# del dev_sent_ids[12]

# del dev_sentences[9]
# del dev_labels[9]
# del dev_corresTokens[9]
# del dev_sent_ids[9]

# del dev_sentences[5]
# del dev_labels[5]
# del dev_corresTokens[5]
# del dev_sent_ids[5]

# del test_sentences[386]
# del test_labels[386]
# del test_corresTokens[386]
# del test_sent_ids[386]

# del test_sentences[384]
# del test_labels[384]
# del test_corresTokens[384]
# del test_sent_ids[384]

# del test_sentences[377]
# del test_labels[377]
# del test_corresTokens[377]
# del test_sent_ids[377]

# del test_sentences[213]
# del test_labels[213]
# del test_corresTokens[213]
# del test_sent_ids[213]

# del test_sentences[141]
# del test_labels[141]
# del test_corresTokens[141]
# del test_sent_ids[141]

# del test_sentences[124]
# del test_labels[124]
# del test_corresTokens[124]
# del test_sent_ids[124]

# del test_sentences[45]
# del test_labels[45]
# del test_corresTokens[45]
# del test_sent_ids[45]

# del test_sentences[35]
# del test_labels[35]
# del test_corresTokens[35]
# del test_sent_ids[35]

# del test_sentences[23]
# del test_labels[23]
# del test_corresTokens[23]
# del test_sent_ids[23]


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only touch CUDA-specific APIs when CUDA is present; calling them on a
# CPU-only host raises and would defeat the CPU fallback above.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
if torch.cuda.is_available():
    print("Name of gpu: " + torch.cuda.get_device_name(0))

In [None]:
# Length every sequence is padded/truncated to (passed as `maxlen` to pad_sequences).
MAX_LEN = 150
# Batch size handed to the DataLoaders built in this notebook.
bs = 32

In [None]:
# Load the BertTokenizer from a local AlephBert checkpoint directory.
# NOTE: this is a machine-specific absolute path; the same path is used again when the model is loaded.
tokenizer = BertTokenizer.from_pretrained('/home/nlp/aharonr6/git/AlephBert/models/alephbert-256-new-wiki-16k/checkpoint-205000', do_lower_case=False)
def tokenize(sentences, orig_labels):
    """Split every word into tokenizer pieces and repeat its label per piece.

    ``sentences`` and ``orig_labels`` are parallel lists of sentences; each
    sentence is a list of words / a list of labels.  Uses the module-level
    ``tokenizer``.

    Returns ``(tokenized_texts, labels)``: for every sentence, the flat list
    of pieces and an equally long list in which each piece carries the label
    of the word it came from.
    """
    tokenized_texts, labels = [], []
    for words, word_labels in zip(sentences, orig_labels):
        pieces, piece_labels = [], []
        for word, word_label in zip(words, word_labels):
            word_pieces = tokenizer.tokenize(word)
            pieces.extend(word_pieces)
            # one copy of the word's label for each piece it was split into
            piece_labels.extend([word_label] * len(word_pieces))
        tokenized_texts.append(pieces)
        labels.append(piece_labels)
        assert len(pieces) == len(piece_labels)
    return tokenized_texts, labels

# Tokenize the training sentences and print one example (pieces and their per-piece labels).
train_tokenized_texts, train_tokenized_labels = tokenize(train_sentences, train_labels)
print(train_tokenized_texts[10])
print(train_tokenized_labels[10])

In [None]:
# Build the label vocabulary from the training split only.
data = train_df
# Sort the distinct tags: iterating a set of strings has no stable order
# between interpreter runs, so without sorting the tag -> id mapping (and
# therefore the classifier's output layout) would change from run to run
# even though the random seeds are fixed.
tag_vals = sorted(set(data["upostag"].values), key=str)
# 'PAD' comes first so that it always receives id 0.
tags = ['PAD'] + tag_vals
tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for idx, tag in enumerate(tags)}

print(tag2idx)
# print(idx2tag)
print(len(tags))

In [None]:
def pad_sentences_and_labels(tokenized_texts, labels):
    """Convert pieces and labels to ids and pad/truncate both to MAX_LEN.

    Uses the module-level ``tokenizer``, ``tag2idx`` and ``MAX_LEN``.
    Padding and truncation are both applied at the end of the sequence.

    Returns ``(input_ids, tags, attention_masks)``; the mask is 1.0 wherever
    the input id is greater than 0 and 0.0 elsewhere.
    """
    # NOTE: the tag id of 'PAD' is also used as the padding value for the
    # input ids, and the mask below treats every id <= 0 as padding.
    pad_value = tag2idx['PAD']

    piece_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
    input_ids = pad_sequences(piece_ids, maxlen=MAX_LEN, dtype="float32",
                              truncating="post", padding="post", value=pad_value)

    # tag2idx.get yields None for a label that is missing from the mapping
    label_ids = [[tag2idx.get(l) for l in lab] for lab in labels]
    tags = pad_sequences(label_ids, maxlen=MAX_LEN, value=pad_value,
                         padding="post", dtype="float32", truncating="post")

    attention_masks = [[float(piece_id > 0) for piece_id in row] for row in input_ids]
    return input_ids, tags, attention_masks

# Pad the training data.
# NOTE: this rebinds the module-level name `tags`, which previously held the list of tag names.
input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_tokenized_labels)

In [None]:
# Turn the padded ids, masks and label ids into int64 tensors.
tr_inputs, tr_masks, tr_tags = (
    torch.tensor(values, dtype=torch.long)
    for values in (input_ids, attention_masks, tags)
)

# Each dataset item is (input ids, attention mask, label ids); batches of
# size `bs` are drawn with a RandomSampler.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=bs, sampler=train_sampler)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Load the token-classification model from the same local checkpoint as the
# tokenizer, with one output class per entry of tag2idx (including 'PAD').
model = BertForTokenClassification.from_pretrained('/home/nlp/aharonr6/git/AlephBert/models/alephbert-256-new-wiki-16k/checkpoint-205000',
                                                   num_labels=len(tag2idx),
                                                   output_attentions = False,
                                                   output_hidden_states = False)
# Move the model to the device chosen earlier instead of unconditionally
# calling .cuda(), which fails on a CPU-only host.
model.to(device)
FULL_FINETUNING = True
if FULL_FINETUNING:
    # Fine-tune every parameter; biases and LayerNorm weights are exempt
    # from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The per-group option the optimizer reads is 'weight_decay'.  The
    # previous key 'weight_decay_rate' was not read, so every group fell
    # back to the default and no decay was applied.
    # NOTE: this turns weight decay on, so scores will differ from earlier runs.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Train only the classification head.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    """Return the fraction of positions whose arg-max class equals the label.

    ``preds`` is reduced with arg-max over axis 2; the result and ``labels``
    are flattened and compared element-wise.  Every position counts, so any
    padding present in ``labels`` is included in the denominator.
    """
    predicted_ids = np.argmax(preds, axis=2).flatten()
    gold_ids = labels.flatten()
    hits = np.sum(predicted_ids == gold_ids)
    return hits / len(gold_ids)

epochs = 15
max_grad_norm = 1.0

# The scheduler advances once per batch, so it needs the total batch count
# over all epochs.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## Per-epoch average losses, kept for plotting.
loss_values, validation_loss_values = [], []
for _ in trange(epochs, desc="Epoch"):
    # ---- one pass over the training data ----
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # move the three tensors of the batch to the target device
        batch = tuple(tensor.to(device) for tensor in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # clear gradients left over from the previous step
        model.zero_grad()

        # passing labels makes the model return the loss as its first output
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        loss.backward()
        total_loss += loss.item()

        # clip the gradient norm before the optimizer consumes the gradients
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

        optimizer.step()
        scheduler.step()

    # mean loss per batch for this epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    loss_values.append(avg_train_loss)

In [None]:
# Function receives a sentence with its labels, and the tokenized sentence and labels
def aggr_toks_labels_tags(orig_words, orig_labels, tok_wordps, tok_labels, predicted_tags):
    """Re-assemble wordpieces into the original words and group their tags.

    Pieces are consumed from ``tok_wordps`` in order (a leading "##" is
    stripped from each) until their concatenation equals the current word.
    For every piece consumed, the entries at the same position in
    ``tok_labels`` and ``predicted_tags`` are collected.

    Parameters:
        orig_words: the words of the sentence, in order.
        orig_labels: unused; kept so existing callers keep working.
        tok_wordps: wordpieces of the whole sentence, in order.
        tok_labels: one gold label per wordpiece.
        predicted_tags: one predicted tag per position; must be at least as
            long as the number of wordpieces consumed.

    Returns:
        ``(joint_tokens, joint_labels, joint_predicted)`` with one entry per
        word.  Each label/prediction entry is the concatenation of
        ``'^' + tag`` over the word's pieces, e.g. ``'^NN^NN'``.

    Raises:
        IndexError: if the pieces run out before a word could be rebuilt
            (for example when a piece does not spell out its word).

    Unlike the previous ``pop(0)`` implementation, the input lists are only
    read, never emptied, and the scan is linear in the number of pieces.
    """
    joint_tokens = []
    joint_labels = []
    joint_predicted = []

    cursor = 0  # position of the next unconsumed wordpiece
    for word in orig_words:
        rebuilt = ""
        gold_parts = []
        predicted_parts = []

        while rebuilt != word:
            if cursor >= len(tok_wordps):
                raise IndexError(
                    "ran out of wordpieces while rebuilding {!r} (got {!r})".format(word, rebuilt)
                )
            piece = tok_wordps[cursor]
            if piece.startswith("##"):
                piece = piece[2:]

            # every piece contributes its own '^'-prefixed tag
            gold_parts.append('^' + tok_labels[cursor])
            predicted_parts.append('^' + predicted_tags[cursor])

            rebuilt += piece
            cursor += 1

        joint_tokens.append(rebuilt)
        joint_labels.append("".join(gold_parts))
        joint_predicted.append("".join(predicted_parts))

    assert len(joint_tokens) == len(orig_words)
    assert len(joint_tokens) == len(joint_predicted)
    return joint_tokens, joint_labels, joint_predicted

In [None]:
def flat_accuracy(preds, labels):
    """Share of flattened positions where arg-max(preds, axis=2) equals labels."""
    gold = labels.flatten()
    guessed = np.argmax(preds, axis=2).flatten()
    return np.sum(guessed == gold) / len(gold)

def delete_pads_from_preds(predicted_tags, test_tags):
    """Drop every position whose gold tag is 'PAD' from both sequences.

    Positions are taken from ``test_tags``; ``predicted_tags`` is indexed at
    the same positions.  Returns ``(clean_predicted, clean_test)``.
    """
    kept = [
        (predicted_tags[ix], gold)
        for ix, gold in enumerate(test_tags)
        if gold != 'PAD'
    ]
    clean_predicted = [pred for pred, _ in kept]
    clean_test = [gold for _, gold in kept]
    return clean_predicted, clean_test
    
def calculate_accuracy(df):
    """Return the fraction of rows whose 'orig_label' equals 'predicted_tag'.

    Raises ZeroDivisionError for an empty dataframe.
    """
    hits = sum(
        1
        for row in df.index
        if df.at[row, 'orig_label'] == df.at[row, 'predicted_tag']
    )
    return hits / len(df)
                
def test_model(sentence, labels, tok_sent, tok_labels, corres_tokens, sent_id):
    """Run the model on one sentence and return a per-word prediction table.

    Uses the module-level ``model``, ``device``, ``bs`` and ``idx2tag``, plus
    ``pad_sentences_and_labels`` and ``aggr_toks_labels_tags``.

    Parameters:
        sentence: the words of the sentence.
        labels: one gold label per word.
        tok_sent: the wordpieces of the sentence.
        tok_labels: one gold label per wordpiece.
        corres_tokens: one token id per word, copied into the result.
        sent_id: one sentence id per word, copied into the result.

    Returns:
        A DataFrame with one row per word and the columns 'word',
        'orig_label', 'predicted_tag', 'corresToken' and 'sent_id'.

    NOTE(review): 'predicted_tag' is the '^'-prefixed per-wordpiece string
    built by aggr_toks_labels_tags (e.g. '^NN^NN'), while 'orig_label' is the
    unmodified input label, so comparing the two columns as raw strings will
    not match unless the labels themselves start with '^' - confirm whether
    an accuracy computed directly on this frame is intended.

    The loss/accuracy bookkeeping that used to be computed here was never
    returned or printed and has been removed.
    """
    input_ids, tags, attention_masks = pad_sentences_and_labels([tok_sent], [tok_labels])

    val_inputs = torch.tensor(input_ids, dtype=torch.long)
    val_tags = torch.tensor(tags, dtype=torch.long)
    val_masks = torch.tensor(attention_masks, dtype=torch.long)

    test_data = TensorDataset(val_inputs, val_masks, val_tags)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

    model.eval()
    predictions = []
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # with labels supplied, the logits are the second output
        logits = outputs[1].detach().cpu().numpy()
        predictions.append([list(p) for p in np.argmax(logits, axis=2)])

    # flatten batch -> sequence -> position and map class ids back to tag names
    pred_tags = [idx2tag[p_ii] for p in predictions for p_i in p for p_ii in p_i]
    _, _, preds = aggr_toks_labels_tags(sentence, labels, tok_sent, tok_labels,
                                        pred_tags)

    tmp = {'word': sentence, 'orig_label': labels, 'predicted_tag': preds,
           'corresToken': corres_tokens, 'sent_id': sent_id}
    return pd.DataFrame(data=tmp)

In [None]:
# Evaluate sentence by sentence and collect the per-sentence frames in a
# list; a single pd.concat at the end replaces the repeated
# DataFrame.append, which copied the whole frame on every iteration and no
# longer exists in pandas 2.x.
dev_tokenized_texts, dev_tokenized_labels = tokenize(dev_sentences, dev_labels)
dev_eval_frames = []
for sent, label, tok_sent, tok_label, corresTokens, sent_id in zip(dev_sentences, dev_labels, dev_tokenized_texts,
                                                                   dev_tokenized_labels, dev_corresTokens,
                                                                   dev_sent_ids):
    eval_df = test_model(sent, label, tok_sent, tok_label, corresTokens, sent_id)
    dev_eval_frames.append(eval_df)

# pd.concat rejects an empty list, so fall back to an empty frame.
if dev_eval_frames:
    full_dev_df = pd.concat(dev_eval_frames, ignore_index=True, sort=False)
else:
    full_dev_df = pd.DataFrame()

# full_df
f1_accuracy = calculate_accuracy(full_dev_df)
print("Accuracy (F1): = {}".format(f1_accuracy))

In [None]:
# Preview the first rows of the dev prediction table.
full_dev_df.head()

In [None]:
# Same procedure as for dev: evaluate sentence by sentence, collect the
# frames in a list and concatenate once.  DataFrame.append copied the whole
# frame on every iteration and no longer exists in pandas 2.x.
test_tokenized_texts, test_tokenized_labels = tokenize(test_sentences, test_labels)
test_eval_frames = []
for sent, label, tok_sent, tok_label, corresTokens, sent_id in zip(test_sentences, test_labels, test_tokenized_texts,
                                                                   test_tokenized_labels, test_corresTokens,
                                                                   test_sent_ids):
    eval_df = test_model(sent, label, tok_sent, tok_label, corresTokens, sent_id)
    test_eval_frames.append(eval_df)

# pd.concat rejects an empty list, so fall back to an empty frame.
if test_eval_frames:
    full_test_df = pd.concat(test_eval_frames, ignore_index=True, sort=False)
else:
    full_test_df = pd.DataFrame()

# full_df
f1_accuracy = calculate_accuracy(full_test_df)
print("Accuracy (F1): = {}".format(f1_accuracy))

In [None]:
# Inspect the last 30 rows of the dev prediction table.
full_dev_df.tail(30)

In [None]:
# # For evaluating on gold-dev/gold-test only - regrouping the tokens to words

# dev_predicted = full_dev_df.groupby(['sent_id', 'corresToken']).apply(lambda x: '^'.join(x.predicted_tag)).reset_index()
# dev_original = full_dev_df.groupby(['sent_id', 'corresToken']).apply(lambda x: '^'.join(x.orig_label)).reset_index()
# dev_combined = pd.merge(dev_original, dev_predicted, on=['sent_id', 'corresToken'])
# dev_combined.rename(columns = {"0_x": "orig_label", "0_y":"predicted_tag"}, inplace = True)

# test_predicted = full_test_df.groupby(['sent_id', 'corresToken']).apply(lambda x: '^'.join(x.predicted_tag)).reset_index()
# test_original = full_test_df.groupby(['sent_id', 'corresToken']).apply(lambda x: '^'.join(x.orig_label)).reset_index()
# test_combined = pd.merge(test_original, test_predicted, on=['sent_id', 'corresToken'])
# test_combined.rename(columns = {"0_x": "orig_label", "0_y":"predicted_tag"}, inplace = True)
# # word_acc_test = full_test_df.groupby(['sent_id', 'corresToken']).apply(lambda x: '^'.join(x.predicted_tag)).reset_index()

In [None]:
# dev_combined.head()

In [None]:
# dev_combined.to_csv('ftRaw_evalGoldDev_whole_pos_shuffle.csv')
# test_combined.to_csv('ftRaw_evalGoldTest_whole_pos_shuffle.csv')

In [None]:
from more_itertools import unique_everseen

def unique_vals_to_list(df):
    """Replace the '^'-joined tag strings of ``df`` by lists of distinct tags.

    For every row, the 'predicted_tag' and 'orig_label' cells are split on
    '^', empty fragments are dropped, and duplicates are removed while the
    order of first appearance is kept.  The dataframe is modified in place;
    nothing is returned.

    Cells that already hold a list are only de-duplicated, so running this
    more than once on the same dataframe (e.g. re-executing the notebook
    cell) no longer fails on ``list.split``.
    """
    def _distinct_tags(cell):
        # strings come straight from evaluation; lists come from an earlier call
        parts = cell.split('^') if isinstance(cell, str) else cell
        # dict keys keep insertion order, which gives an order-preserving dedupe
        return list(dict.fromkeys(part for part in parts if part))

    for index in df.index:
        joint_pred = df.at[index, 'predicted_tag']
        joint_orig = df.at[index, 'orig_label']

        df.at[index, 'predicted_tag'] = _distinct_tags(joint_pred)
        df.at[index, 'orig_label'] = _distinct_tags(joint_orig)
        
        
# Convert the tag columns of both prediction tables in place.
unique_vals_to_list(full_dev_df)
unique_vals_to_list(full_test_df)

In [None]:
# `dev_combined` is only created by the commented-out gold-regrouping cell,
# so referencing it here raises NameError; inspect the live dev table instead.
full_dev_df.tail(30)

In [None]:
def exact_match_accuracy(df):
    """Count the rows whose 'orig_label' equals 'predicted_tag'.

    Despite the name, the return value is the number of matching rows, not
    a ratio; callers divide by the number of rows themselves.
    """
    return sum(
        1
        for row in df.index
        if df.at[row, 'orig_label'] == df.at[row, 'predicted_tag']
    )

# exact_match_accuracy returns a count, so divide by the number of rows to get a percentage.
print("DEV - Exact Match Accuracy = {0:.2f}%".format(exact_match_accuracy(full_dev_df)/len(full_dev_df) * 100))
print("TEST - Exact Match Accuracy = {0:.2f}%".format(exact_match_accuracy(full_test_df)/len(full_test_df) * 100))

In [None]:
def existence_accuracy(df):
    """Print precision, recall and F1 over the tags of every row.

    Each row's 'orig_label' and 'predicted_tag' cells are expected to be
    sequences of tags.  A tag counts as correct when it appears in both
    cells of the same row.  Precision divides the correct tags by the total
    number of predicted tags, recall by the total number of gold tags; all
    three figures are printed as percentages.  Nothing is returned.

    A score whose denominator is zero (no predicted tags, no gold tags, or
    no correct tag at all) is reported as 0 instead of raising
    ZeroDivisionError.
    """
    # correct tag = appeared in predicted and in gold
    total_orig_num_of_labels = 0
    total_predicted_num_of_labels = 0
    total_num_of_correct_tags = 0

    for index in df.index:
        orig_list = df.at[index, 'orig_label']
        predicted_list = df.at[index, 'predicted_tag']
        total_orig_num_of_labels += len(orig_list)
        total_predicted_num_of_labels += len(predicted_list)
        total_num_of_correct_tags += len(set(orig_list).intersection(set(predicted_list)))

    if total_predicted_num_of_labels:
        precision = total_num_of_correct_tags / total_predicted_num_of_labels * 100
    else:
        precision = 0.0
    if total_orig_num_of_labels:
        recall = total_num_of_correct_tags / total_orig_num_of_labels * 100
    else:
        recall = 0.0
    if precision + recall:
        f1 = 2*precision*recall/(precision+recall)
    else:
        f1 = 0.0

    print("Precision: {0:.2f}%".format(precision))
    print("Recall: {0:.2f}%".format(recall))
    print("F1: {0:.2f}%".format(f1))
    
# Report precision/recall/F1 for both splits (existence_accuracy prints its results).
print("DEV:")
existence_accuracy(full_dev_df)
print("TEST:")
existence_accuracy(full_test_df)

In [None]:
# Inspect rows 30-54 of the dev prediction table.
full_dev_df.iloc[30:55]

In [None]:
# Fraction of dev rows whose 'upostag' is exactly 'DEF^NN'.
len(dev_df[dev_df['upostag'] == 'DEF^NN'])/len(dev_df)