# **NER-based QA on MultiRC - Evaluation Notebook**

Steps:

Optional:

Data transformation script - converts the MultiRC dataset into an NER dataset (input: the MultiRC JSON file; output: a CSV file written to the output path for NER-based QA)

1. Load DEV set (TEST set not publicly available for MultiRC)

2. Pre-processing

3. Load trained model

4. Prediction

5. Analysis

# Mount GDrive & Prepare Paths

In [1]:
# Mount Google Drive inside the Colab runtime; every data, model and output
# path used by this notebook lives under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Project locations on the mounted drive. Everything is derived from
# PARENT_DIR so the notebook can be re-pointed by editing a single line.
PARENT_DIR = "/content/gdrive/My Drive/MultiRC_NER"
# Directory holding the trained token-classification model
TRAINED_MODEL_DIR = "{}/output/trained_v4".format(PARENT_DIR)
# Token-level CSV to evaluate on
DATA_PATH = "{}/data/dev_v4.csv".format(PARENT_DIR)
# Text file that receives the evaluation report
EVAL_PATH = "{}/evaluations/results.txt".format(PARENT_DIR)
# Directory that receives the pickled predictions
PRED_PATH = "{}/evaluations".format(PARENT_DIR)

In [3]:
# Sanity check: list the data directory on the mounted drive.
!ls '/content/gdrive/My Drive/MultiRC_NER/data'

backup	dev_sample_v4.csv  dev_v4.csv  qa  train_v4.csv  vocab.txt


## OPTIONAL - Preprocess the MultiRC QA data and convert into NER format

In [4]:
import json
import re
import os

# Source MultiRC file and the NER-style CSV generated from it
INPUT_FILE = "/content/gdrive/My Drive/MultiRC_NER/data/qa/dev.json"
OUTPUT_FILE = "/content/gdrive/My Drive/MultiRC_NER/data/eval.csv"
# NOTE: running this optional cell re-points DATA_PATH, so the evaluation
# below reads the freshly generated CSV instead of the configured one.
DATA_PATH = OUTPUT_FILE

# CSV column separator
DELIMITER = ','
# Running example ID written in the first CSV column; starts at 1 and is
# incremented once per question by the conversion loop.
IDX = 1
# Tag given to every token after the first one of a sentence (and to
# split-off punctuation).
INTERNAL_TAG = 'I'

# Load the dataset; the context manager guarantees the handle is closed.
with open(INPUT_FILE, encoding='utf-8') as file:
    data = json.load(file)

paragraphList = data['data']

# Start the output from scratch: 'w' truncates any previous file, so no
# separate exists()/remove() step is needed before writing the header row.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as file:
    file.write("ID"+DELIMITER+"TOKEN"+DELIMITER+"TAG\n")

# Convert one sentence into CSV rows of the form "<id>,<token>,<tag>"
def generateCSVCell(idx, sentence, ch, delimiter=None, internal_tag=None):
    """Render a sentence as newline-terminated ``id,token,tag`` CSV rows.

    The sentence is split on single spaces. The first non-blank word is
    tagged with ``ch``; every later word gets the internal tag. A word that
    contains "." or "?" is cut at the first "." (or, when there is no ".",
    at the first "?"): the text before the cut keeps the word's tag and the
    remainder, punctuation included, is emitted as an extra internal-tag row.

    Args:
        idx: example identifier written in the first column (stringified).
        sentence: already-cleaned sentence text.
        ch: tag for the first token of the sentence (e.g. 'P', 'Q', 'C', 'W').
        delimiter: column separator; defaults to the module-level DELIMITER.
        internal_tag: tag for non-initial tokens; defaults to the
            module-level INTERNAL_TAG.

    Returns:
        The concatenated rows as a single string. This is an empty string
        when the sentence contains a space but no non-blank word.
    """
    if delimiter is None:
        delimiter = DELIMITER
    if internal_tag is None:
        internal_tag = INTERNAL_TAG

    def make_row(token, tag):
        # One CSV line for a single token.
        return str(idx) + delimiter + token + delimiter + tag + '\n'

    def split_at_punctuation(text, first_tag):
        # Cut at the first ".", falling back to the first "?".
        # NOTE: when the punctuation is the first character, the leading row
        # carries an empty token (kept as-is for output compatibility).
        pos = text.find(".")
        if pos == -1:
            pos = text.find("?")
        return make_row(text[0:pos], first_tag) + make_row(text[pos:], internal_tag)

    if ' ' in sentence:
        rows = []
        is_first_word = True
        for word in sentence.split(' '):
            # Skip the empty/blank pieces produced by repeated spaces.
            # (Was `len(...) is not 0`: an identity test on an int literal.)
            if not word.strip():
                continue
            tag = ch if is_first_word else internal_tag
            is_first_word = False
            if "." in word or "?" in word:
                rows.append(split_at_punctuation(word, tag))
            else:
                rows.append(make_row(word, tag))
        return ''.join(rows)

    # Single-word sentence.
    # NOTE(review): only "." is split off here, while the multi-word branch
    # also splits at "?" - confirm whether that asymmetry is intended.
    if "." in sentence:
        return split_at_punctuation(sentence, ch)

    return make_row(sentence, ch)
    
# Strip markup and CSV-hostile characters from a MultiRC text fragment
def sentenceFormat(sentence):
    """Return ``sentence`` cleaned for use as CSV tokens.

    Removes the ``<b>``, ``</b>`` and ``<br>`` markers, commas, backslashes
    and double quotes, drops every ``Sent <number>:`` prefix, and trims
    leading/trailing whitespace.
    """
    for unwanted in ('</b>', '<b>', '<br>', ',', '\\', '"'):
        sentence = sentence.replace(unwanted, '')
    # Raw string: the previous non-raw pattern contained the invalid escape "\s".
    sentence = re.sub(r'Sent\s[0-9]*:', '', sentence)
    return sentence.strip()



# Write one CSV "document" per question: the question tokens, then every
# candidate answer, then the whole paragraph - all rows share the running IDX.
# The output file is opened once in append mode (the header row is already
# there) instead of being reopened for every question.
with open(OUTPUT_FILE, 'a+', encoding='utf-8') as file:
    for paragraphs in paragraphList:

        paraInfo = paragraphs['paragraph']
        sentences = paraInfo['text'].split('<br><b>')
        questionList = paraInfo['questions']

        # Cleaning does not depend on the question, so do it once per
        # paragraph; only the row rendering (which embeds IDX) is repeated.
        cleanSentences = [sentenceFormat(sentence) for sentence in sentences]

        for questions in questionList:
            questionText = questions['question']

            # Make sure the question ends up with a "?" token.
            if "?" not in questionText:
                questionText = questionText + "?"

            print(IDX, " --- ", questionText)

            quesData = generateCSVCell(IDX, sentenceFormat(questionText), 'Q')

            # A trailing dot is appended to every answer; correct answers
            # are tagged 'C', wrong ones 'W'.
            ansData = ''.join(
                generateCSVCell(IDX,
                                sentenceFormat(answers['text'] + "."),
                                'C' if answers['isAnswer'] else 'W')
                for answers in questions['answers']
            )

            paraData = ''.join(
                generateCSVCell(IDX, cleaned, 'P') for cleaned in cleanSentences
            )

            file.write(quesData+ansData+paraData)
            IDX += 1


1  ---  Who were the people that opposed Air New Zealand's decisions?
2  ---  What company did Air New Zealand collaborate with the make the new video?
3  ---  How many named models to appear in the video?
4  ---  How might Air New Zealand's video partner benefited from helping to make this video?
5  ---  How was the "Safety in Paradise" video received?
6  ---  Where was the "Safety in Paradise" video shot?
7  ---  Which airline teamed up with Sports Illustrated magazine in order to produce a safety video?
8  ---  What is the "paradise" they are talking about?
9  ---  Which company created "The world's most beautiful safety video"?
10  ---  Which women appeared in the "Safety in Paradise" safety video?
11  ---  Deborah Russel critiqued a video produced by what company?
12  ---  What kind of career does Christie Brinkley have?
13  ---  How many male models were featured in the current video?
14  ---  Where did Chuck find weapons?
15  ---  How many times does Chuck come across the cave w

# Pre-processing NER data

**Requirements**

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

!pip install seqeval
!pip install transformers

import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F

import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW



Using TensorFlow backend.


**Preparing data**

In [0]:
# Load the token-level CSV (ID, TOKEN, TAG columns are used below). Missing
# cells are forward-filled from the row above; `.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling with identical results.
df_data = pd.read_csv(DATA_PATH, sep=",", encoding="utf-8").ffill()

class SentenceGetter(object):
    """Group a token-level DataFrame into per-example (token, tag) lists.

    The frame must provide "ID", "TOKEN" and "TAG" columns; rows sharing the
    same "ID" form one example.

    Attributes:
        n_sent: 1-based number of the next example returned by get_next().
        data: the DataFrame passed to the constructor.
        empty: always False; never read by this class.
        grouped: Series indexed by "ID" whose values are lists of
            (token, tag) tuples.
        sentences: the values of ``grouped`` as a plain list, in index order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["TOKEN"].values.tolist(),
                                                           s["TAG"].values.tolist())]
        # Only TOKEN/TAG are needed inside each group; selecting them
        # explicitly keeps the grouping column out of the applied frame.
        self.grouped = self.data.groupby("ID")[["TOKEN", "TAG"]].apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next example's (token, tag) list, or None when exhausted.

        Examples are looked up by the counter itself (numeric IDs) and, for
        backward compatibility, by the label "Sentence: <n>". The previous
        version tried only the string label and hid the resulting lookup
        failure behind a bare ``except``, so it always returned None for
        numeric IDs.
        """
        for key in (self.n_sent, "Sentence: {}".format(self.n_sent)):
            if key in self.grouped.index:
                self.n_sent += 1
                return self.grouped.loc[key]
        return None



# Get full document data structure
getter = SentenceGetter(df_data)
# Get sentence data
sentences = [[s[0] for s in sent] for sent in getter.sentences]
# Get TAG labels data
labels = [[s[1] for s in sent] for sent in getter.sentences]
# Convert tag name into index
tags_vals = list(set(df_data["TAG"].values))
# Add X  label for word piece support
# Add [CLS] and [SEP] as BERT need
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')
tags_vals = set(tags_vals)
# Manual definition
tag2idx={'C': 2,
 'I': 3,
 'P': 0,
 'Q': 1,
 'W': 4,
 'X':5,
 '[CLS]':6,
 '[SEP]':7}
# Mapping index to name (reverse)
tag2name={tag2idx[key] : key for key in tag2idx.keys()}
# Setup GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Vocab
vocabulary = PARENT_DIR + "/data/vocab.txt"
# TODO: Try with increase value
# CAUTION - Should be less than 512
max_len  = 384
# Load tokenizer
# load tokenizer, with manual file address or pretrained address
tokenizer=BertTokenizer(vocab_file=vocabulary,do_lower_case=False)
# Word-piece tokenisation: every example becomes
# [CLS] + pieces + [SEP], with a label per piece.
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list, label in zip(sentences, labels):
    # Both sequences start with the [CLS] marker
    temp_token = ['[CLS]']
    temp_lable = ['[CLS]']

    for word, lab in zip(word_list, label):
        pieces = tokenizer.tokenize(word)
        temp_token.extend(pieces)
        if pieces:
            # The first piece keeps the word's tag, continuations get 'X'
            temp_lable.append(lab)
            temp_lable.extend(['X'] * (len(pieces) - 1))

    # ... and end with the [SEP] marker
    temp_token.append('[SEP]')
    temp_lable.append('[SEP]')

    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_lable)

    # Show the first five examples as a sanity check
    if i_inc < 5:
        print("No.%d,len:%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d,len:%d"%(i_inc,len(temp_lable)))
        print("lables:%s"%(" ".join(temp_lable)))
    i_inc += 1

# Token ids, cut/padded at the end to max_len
token_id_seqs = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
input_ids = pad_sequences(token_id_seqs, maxlen=max_len, dtype="long",
                          truncating="post", padding="post")

# Label ids, cut/padded the same way; padding uses the "W" (wrong/other)
# index in place of the usual "O"
label_id_seqs = [[tag2idx.get(l) for l in lab] for lab in word_piece_labels]
tags = pad_sequences(label_id_seqs, maxlen=max_len, value=tag2idx["W"],
                     padding="post", dtype="long", truncating="post")

# Attention mask: 1 where the token id is positive, 0 elsewhere
attention_masks = [[int(token_id > 0) for token_id in row] for row in input_ids]

# Single-segment input, so every segment id is 0
segment_ids = [[0] * len(row) for row in input_ids]

# Validation tensors
val_inputs = torch.tensor(input_ids)
val_tags = torch.tensor(tags)
val_masks = torch.tensor(attention_masks)
val_segs = torch.tensor(segment_ids)

# Batch size
batch_num = 16

# Batches are served in the original example order
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, batch_size=batch_num, sampler=valid_sampler)

No.0,len:471
texts:[CLS] ( CNN ) - - Air New Zealand ' s latest in - flight safety video released Tuesday is already another viral hit but is encounter ##ing some t ##ur ##bul ##ence over its use of several bi ##kini - clad Sports Illustrated models . View the video here Previous versions of the video - - starring anything from Ho ##bb ##its to Bear G ##ryl ##ls to New Zealand ' s all conquer ##ing All Blacks rugby team - - have revolution ##ized the on - board safety message airlines deliver to passengers . The most recent effort though is being criticized by some as neither ground - breaking nor as creative after the airline teamed up with Sports Illustrated magazine to produce what it ' s calling The world ' s most beautiful safety video . The Safety in Paradise video which rolls out on Air New Zealand flights at the end of February is beautifully shot and certainly cheerful and fun . It was filmed in the Cook Islands - - home to several stunning beaches - - and coincide ##s with th

# Load pretrained model

In [0]:
# Load the token-classification model saved in TRAINED_MODEL_DIR, with one
# output label per entry of tag2idx.
model = BertForTokenClassification.from_pretrained(TRAINED_MODEL_DIR,num_labels=len(tag2idx))

In [0]:
# Move the model to the device chosen during preprocessing (GPU when
# available, CPU otherwise) - the same device the evaluation batches are
# sent to. A hard-coded .cuda() fails on CPU-only runtimes.
model.to(device);

# Evaluate on data

In [0]:
# Switch the model to evaluation mode before running predictions.
model.eval();

In [0]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Per-example sequences of gold tags, predicted tags and the probability of
# each predicted tag (bookkeeping positions excluded)
y_true = []
y_pred = []
probs = []
# Gold labels that only exist for BERT bookkeeping and are never scored
skip_labels = ("X", "[CLS]", "[SEP]")

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # NOTE: `input_ids` rebinds the padded id array built during
    # preprocessing; re-run that cell before reusing the array.
    input_ids, input_mask, label_ids = batch

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
                        attention_mask=input_mask)
        # No labels are passed, so the first output is taken as the logits
        logits = outputs[0]

    # Probability of the most likely tag at every position
    logits_prob, _ = torch.max(F.softmax(logits, dim=2), dim=2)
    logits_prob = logits_prob.detach().cpu().numpy()
    # Index of the most likely tag at every position
    logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
    logits = logits.detach().cpu().numpy()

    # Gold tag ids and attention mask as numpy arrays
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()

    for i, mask in enumerate(input_mask):
        temp_1 = []     # gold tags
        temp_2 = []     # predicted tags
        prob_temp = []  # probability of each predicted tag

        for j, m in enumerate(mask):
            # Padding is appended at the end, so the first masked-out
            # position ends the example
            if not m:
                break
            true_tag = tag2name[label_ids[i][j]]
            if true_tag in skip_labels:
                continue
            temp_1.append(true_tag)
            temp_2.append(tag2name[logits[i][j]])
            prob_temp.append(logits_prob[i][j])

        y_true.append(temp_1)
        y_pred.append(temp_2)
        probs.append(prob_temp)

# Compute each metric once and reuse it for the console and the results file
# (each was previously recomputed three times over the full prediction set).
# NOTE(review): seqeval's f1/report are chunk-based while accuracy_score is
# per token - confirm that is the intended reading for these un-prefixed tags.
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

print("f1 socre: %f"%(f1))
print("Accuracy score: %f"%(accuracy))

# Per-tag precision / recall / F1 report
report = classification_report(y_true, y_pred,digits=4)

# Save the report into file
output_eval_file = EVAL_PATH
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("f1 socre: %f"%(f1))
    print("Accuracy score: %f"%(accuracy))

    writer.write("f1 socre:\n")
    writer.write(str(f1))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy))
    writer.write("\n\n")
    writer.write(report)

***** Running evaluation *****
  Num examples =953
  Batch size = 16
f1 socre: 0.895044
Accuracy score: 0.988966
***** Eval results *****

           precision    recall  f1-score   support

        I     0.8749    0.9472    0.9096     16004
        C     0.6181    0.6443    0.6310      1417
        P     0.9113    0.9765    0.9428     12060
        W     0.7062    0.7125    0.7093      1805
        Q     0.9920    0.9828    0.9874       756

micro avg     0.8605    0.9324    0.8950     32042
macro avg     0.8705    0.9324    0.9003     32042

f1 socre: 0.895044
Accuracy score: 0.988966


# Write Predictions

In [0]:
# Persist the gold tags, predicted tags and prediction probabilities.
# TODO: save as CSV instead - the nested (per-example) lists cannot be
# written directly with np.savetxt, so they are pickled for now.
import pickle

for suffix, payload in (("/ytrue_v2", y_true),
                        ("/ypred_v2", y_pred),
                        ("/probs_v2", probs)):
    with open(PRED_PATH + suffix, "wb") as f:
        pickle.dump(payload, f)


In [0]:
# Confirm the report and the pickled predictions were written.
!ls "/content/gdrive/My Drive/MultiRC_NER/evaluations"

preds.txt  probs_v2  results.txt  sample_results.txt  ypred_v2	ytrue_v2


# Analysis