# Intermediate files

We will load some output/intermediate files from a recent experiment

In [1]:
import os

# Root directory of the experiment run whose outputs are inspected in this notebook.
# The original cluster location remains the default; set the BOOLQA_EXPT_DIR
# environment variable to point the notebook at a copy of the run stored elsewhere
# without having to edit this cell.
base=os.environ.get('BOOLQA_EXPT_DIR', '/dccstor/jsmc-nmt-01/bool/expts/toolkit/b/b19')
# Prediction files found in the mrc/, qtc/ and evc/ sub-directories of the run.
mrc_file=f'{base}/mrc/eval_predictions.json'
qtc_file=f'{base}/qtc/eval_predictions.json'
evc_file=f'{base}/evc/eval_predictions.json'
# Merged predictions file located directly under the run directory.
out_file=f'{base}/eval_predictions_merge.json'

# Display helper

These files have many fields; to display them more readably we use a helper routine that converts them to dataframes.

In [2]:
from examples.boolqa.mrc2dataset  import create_dataset_from_run_mrc_output

from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

# Based on https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb
def show_elements(dataset, cols):
    """Render the selected columns of ``dataset`` as an HTML table.

    The dataset is converted to a pandas DataFrame.  Any column whose feature
    type is a ``ClassLabel`` (or a ``Sequence`` of ``ClassLabel``) has its
    integer class ids replaced by the corresponding label names before the
    columns listed in ``cols`` are displayed.

    Args:
        dataset: object accepted by ``pd.DataFrame`` that also exposes a
            ``features`` mapping of column name -> feature type.
        cols: list of column names to include in the rendered table.
    """
    frame = pd.DataFrame(dataset)
    for name, feature in dataset.features.items():
        if isinstance(feature, ClassLabel):
            # Single class id per row -> label name.
            label_names = feature.names
            frame[name] = frame[name].transform(lambda idx: label_names[idx])
        elif isinstance(feature, Sequence) and isinstance(feature.feature, ClassLabel):
            # List of class ids per row -> list of label names.
            label_names = feature.feature.names
            frame[name] = frame[name].transform(lambda ids: [label_names[i] for i in ids])
    selected = frame[cols]
    display(HTML(selected.to_html()))
    
import random

Select some random examples

In [3]:
# Load the predictions stored in the MRC output file as a dataset.
eval_examples=create_dataset_from_run_mrc_output(mrc_file, unpack=False)
# Draw the sample from a dedicated, seeded generator so that a fresh
# "Restart & Run All" shows the same examples every time, and so that the
# global `random` state is left untouched.
sample_rng = random.Random(0)
# Never request more rows than exist: random.sample raises ValueError when the
# sample size exceeds the population size.
random_idxs = sample_rng.sample(range(len(eval_examples)), min(20, len(eval_examples)))
random_eval_examples = eval_examples.select(random_idxs)
# Bare expression: displays the dataset summary as the cell output.
eval_examples



Dataset({
    features: ['example_id', 'cls_score', 'start_logit', 'end_logit', 'span_answer', 'span_answer_score', 'start_index', 'end_index', 'passage_index', 'target_type_logits', 'span_answer_text', 'yes_no_answer', 'start_stdev', 'end_stdev', 'query_passage_similarity', 'normalized_span_answer_score', 'confidence_score', 'question', 'language', 'order', 'rank'],
    num_rows: 18670
})

# Samples of MRC output

Here we show the `question` and the predicted answer `span_answer_text` for each of the random examples.  This is the initial stage of question answering - a purely extractive system.  The confidence in the span answer is given by `span_answer_score`, which is a function of `start_logit`, `end_logit`, and `target_type_logits`.

In [4]:
# Columns of the MRC output to display: identifiers, the extracted answer and
# the scores that accompany it.
cols = [
    'example_id',
    'question',
    'span_answer_text',
    'language',
    'span_answer_score',
    'start_logit',
    'end_logit',
    'target_type_logits',
]
# Render the randomly selected examples, restricted to the columns above.
show_elements(random_eval_examples, cols)

Unnamed: 0,example_id,question,span_answer_text,language,span_answer_score,start_logit,end_logit,target_type_logits
0,430e8175-a99f-4706-a973-6acefbacc05d,2011 నాటికి రంగరాయపురం గ్రామంలో ఎన్ని ఎకరాల బీడు భూమి ఉంది?,555,telugu,-4.04541,-2.015625,-1.291992,"[6.64453125, 2.25390625, -1.4765625, -6.13671875, -6.70703125]"
1,911d16d4-00d3-4a84-b9f0-9963f494035c,"Je,mke wa rais wa kwanza wa Togo alifahamika kama nani?",Remmy Ongala,swahili,-5.534668,-2.144531,-1.897461,"[7.1953125, 2.19921875, -1.626953125, -6.15625, -6.7890625]"
2,37e1b0c3-36ee-4f78-9524-4bcdbfb3db2b,슈트라우스는 몇 곡을 작곡했는가?,11,korean,-0.070801,1.358398,1.640625,"[5.94140625, 4.16796875, -0.90283203125, -6.63671875, -7.33203125]"
3,1b32e9f1-0a68-46af-a662-764bf90da040,가장 많은 원주민을 학살한 나라는 어디인가?,미국의,korean,-1.623291,-0.58252,-0.191406,"[5.19140625, 3.4453125, -0.51806640625, -6.03125, -6.78515625]"
4,078ea738-bf8e-45ae-a51f-f1289cec1978,"Je,ni kilomita ngapi kutoka Dar hadi Moro?",km² 72 939,swahili,-3.681152,-2.082031,-1.047852,"[5.6171875, 2.1328125, -0.6787109375, -6.0, -6.56640625]"
5,f3f23174-cd43-45e6-8c00-94bc20341026,미국 드라마 House의 제작자는 누구인가요?,쇼어와 브라이언 싱어,korean,9.704102,3.810547,4.660156,"[0.6640625, 7.1015625, -0.0531005859375, -3.666015625, -4.33984375]"
6,225e4d88-1c1f-47b0-85a7-4459b9707295,Kuka Suomessa aloitti papyrologian ensimmäisenä?,Karl Weber,finnish,-1.331421,0.390381,0.770996,"[5.34375, 3.091796875, -0.5947265625, -6.09375, -6.75390625]"
7,1bd32839-9243-49c9-b3eb-ab7b036ea9b2,デビアスグループの創始者は誰？,セシル・ローズとチャールズ・ラッド,japanese,8.98999,4.53125,4.859375,"[1.6728515625, 7.05078125, 0.1519775390625, -4.69140625, -5.51171875]"
8,98ef8010-8e7a-45c8-a628-f06df52c0c23,Kapan Bob Dylan lahir?,24 Mei 1941,indonesian,8.55188,4.175781,4.496094,"[2.03515625, 7.12890625, -0.226318359375, -4.8671875, -5.69921875]"
9,8948b2be-d94f-4a0e-8178-7644e953b1f0,Kalenda ya Gregori kiligunduliwa nchini gani?,Italia,swahili,0.461914,1.299805,1.680664,"[5.33203125, 4.19140625, -0.9169921875, -6.2734375, -6.90625]"


# Samples of QTC output

Three fields have been added. The first, `question_type_pred`, is `boolean` if the question is predicted to be a boolean (yes/no) question and `short_answer` if it is not - typically a factoid question in this dataset.
The second field, `question_type_scores`, contains the classifier scores (logits) for each class.  The final field, `question_type_conf`, contains the score of the selected class.

In [5]:
# Load the predictions stored in the QTC output file; this rebinds eval_examples.
eval_examples=create_dataset_from_run_mrc_output(qtc_file, unpack=False)
# random_idxs is not defined in this cell: it must already exist in the kernel.
# NOTE(review): presumably these are the indices drawn for the MRC sample and the
# rows of both files are in the same order — confirm before comparing tables.
random_eval_examples = eval_examples.select(random_idxs)
# Bare expression: displays the dataset summary as the cell output.
eval_examples

Dataset({
    features: ['example_id', 'cls_score', 'start_logit', 'end_logit', 'span_answer', 'span_answer_score', 'start_index', 'end_index', 'passage_index', 'target_type_logits', 'span_answer_text', 'yes_no_answer', 'start_stdev', 'end_stdev', 'query_passage_similarity', 'normalized_span_answer_score', 'confidence_score', 'question', 'language', 'order', 'rank', 'question_type_pred', 'question_type_scores', 'question_type_conf'],
    num_rows: 18670
})

In [18]:
# Columns to display for the question-type classifier: the question, the
# extracted span, and the predicted question type with its per-class scores.
cols = [
    'example_id',
    'question',
    'span_answer_text',
    'question_type_pred',
    'question_type_scores',
]
# Render the randomly selected examples, restricted to the columns above.
show_elements(random_eval_examples, cols)

Unnamed: 0,example_id,question,span_answer_text,question_type_pred,question_type_scores
0,c091b558-c7fe-4d07-9bf6-eef8feaac8d8,Who led American forces in the Battle of Huế?,Colonel Stanley S. Hughes,short_answer,"{'boolean': -2.9970808029174805, 'short_answer': 3.781102180480957}"
1,233a6264-edd4-4e76-a497-1be970da22bb,Is Pippin the Hunchback dead?,"Sometime around 811, Pepin died while at Prüm, likely from plague.[2]",boolean,"{'boolean': 3.4151370525360107, 'short_answer': -4.333227634429932}"
2,5889d06d-f101-4434-99d5-50226e5165f8,What does heritage mean?,"a landmark or area which is selected by the United Nations Educational, Scientific and Cultural Organization",short_answer,"{'boolean': -2.9972896575927734, 'short_answer': 3.780919075012207}"
3,b5656b60-1f47-4057-87f8-048b0183bc66,Who was the first DC Comic character to have a film made about them?,Superman,short_answer,"{'boolean': -2.996394395828247, 'short_answer': 3.7805237770080566}"
4,ba805f2a-4e17-4560-923d-1679ef218021,Who was the first ruler in the Palaiologos dynasty of the Byzantine Empire?,Michael VIII Palaiologos,short_answer,"{'boolean': -2.9974093437194824, 'short_answer': 3.781903028488159}"
5,06397d0d-f5e8-4be4-9c5e-ee93a494515d,How many ships are currently in Britain's Naval forces?,75,short_answer,"{'boolean': -2.995952606201172, 'short_answer': 3.7787346839904785}"
6,18a8b35c-8eb6-4282-b127-d4ff43c0726b,"Who designed the US Capitol buildings in Washington, D.C.?",William Thornton,short_answer,"{'boolean': -2.996086835861206, 'short_answer': 3.7806718349456787}"
7,4e13fea4-0501-4c31-bff1-3dfee9793732,Does Bob Jones University do any research?,"The Department of Biology hosts two research programs on campus, one in cancer research, the other in animal behavior.[50]",boolean,"{'boolean': 3.4162871837615967, 'short_answer': -4.333107948303223}"
8,a41f80d5-d372-4c4b-a8d5-12b7cbc124a5,What is the poverty rate in Oaxaca?,2012.,short_answer,"{'boolean': -2.997530937194824, 'short_answer': 3.7812016010284424}"
9,115c806a-2d51-4241-84c0-db3dec2758a8,"Who wrote the song ""Let there be Peace on Earth""?",Jill Jackson-Miller and Sy Miller,short_answer,"{'boolean': -2.9963998794555664, 'short_answer': 3.781179904937744}"


# Samples of EVC output 
As above, this classifier adds three new fields.  `boolean_answer_pred` is `yes` if the predicted answer to a boolean question is positive, `no` if the answer is negative, and `no_answer` if there is no support for either answer in the context (these are the label names that appear in the output below).  `boolean_answer_scores` provides the scores (logits) of each class, and `boolean_answer_conf` is the score of the selected class.
We select the English questions from the dev set (they are not scored by tydi_eval.py), because they contain a higher fraction of boolean questions.  The boolean questions in the TyDi dataset are overwhelmingly biased towards having `yes` rather than `no` as the answer.  We suspect that the question writers were attempting to confirm existing knowledge.
Note that the answer classifier runs even on the short answer questions in order to simplify merging - a real deployed system would run the answer classifier only on questions that are predicted to be boolean.

In [14]:
# Load the predictions stored in the EVC output file; this rebinds eval_examples.
eval_examples=create_dataset_from_run_mrc_output(evc_file, unpack=False)
# Keep only the rows whose 'language' field is exactly 'english'.
eval_examples=eval_examples.filter(lambda x:x['language']=='english')
# Draw the sample from a dedicated, seeded generator so that a fresh
# "Restart & Run All" shows the same examples every time, and so that the
# global `random` state is left untouched.
sample_rng = random.Random(0)
# Never request more rows than exist: random.sample raises ValueError when the
# sample size exceeds the population size (possible after filtering).
random_idxs = sample_rng.sample(range(len(eval_examples)), min(20, len(eval_examples)))

random_eval_examples = eval_examples.select(random_idxs)
# Bare expression: displays the dataset summary as the cell output.
eval_examples

  0%|          | 0/19 [00:00<?, ?ba/s]

Dataset({
    features: ['example_id', 'cls_score', 'start_logit', 'end_logit', 'span_answer', 'span_answer_score', 'start_index', 'end_index', 'passage_index', 'target_type_logits', 'span_answer_text', 'yes_no_answer', 'start_stdev', 'end_stdev', 'query_passage_similarity', 'normalized_span_answer_score', 'confidence_score', 'question', 'language', 'order', 'rank', 'question_type_pred', 'question_type_scores', 'question_type_conf', 'boolean_answer_pred', 'boolean_answer_scores', 'boolean_answer_conf'],
    num_rows: 1031
})

In [17]:
# Columns to display for the answer classifier: the question, the extracted
# span, the predicted question type and the boolean-answer prediction with
# its per-class scores.
cols = [
    'example_id',
    'question',
    'span_answer_text',
    'question_type_pred',
    'boolean_answer_pred',
    'boolean_answer_scores',
]
# Render the randomly selected examples, restricted to the columns above.
show_elements(random_eval_examples, cols)

Unnamed: 0,example_id,question,span_answer_text,question_type_pred,boolean_answer_pred,boolean_answer_scores
0,c091b558-c7fe-4d07-9bf6-eef8feaac8d8,Who led American forces in the Battle of Huế?,Colonel Stanley S. Hughes,short_answer,yes,"{'no': -5.311473369598389, 'no_answer': 6.015173435211182, 'yes': 0.008014324121177197}"
1,233a6264-edd4-4e76-a497-1be970da22bb,Is Pippin the Hunchback dead?,"Sometime around 811, Pepin died while at Prüm, likely from plague.[2]",boolean,yes,"{'no': -5.676512718200684, 'no_answer': 3.449946403503418, 'yes': 2.6473116874694824}"
2,5889d06d-f101-4434-99d5-50226e5165f8,What does heritage mean?,"a landmark or area which is selected by the United Nations Educational, Scientific and Cultural Organization",short_answer,yes,"{'no': -5.071897506713867, 'no_answer': 5.384366035461426, 'yes': 0.37900829315185547}"
3,b5656b60-1f47-4057-87f8-048b0183bc66,Who was the first DC Comic character to have a film made about them?,Superman,short_answer,yes,"{'no': -4.825044631958008, 'no_answer': 6.125892639160156, 'yes': -0.7141074538230896}"
4,ba805f2a-4e17-4560-923d-1679ef218021,Who was the first ruler in the Palaiologos dynasty of the Byzantine Empire?,Michael VIII Palaiologos,short_answer,yes,"{'no': -4.749837398529053, 'no_answer': 5.676550388336182, 'yes': -0.37514519691467285}"
5,06397d0d-f5e8-4be4-9c5e-ee93a494515d,How many ships are currently in Britain's Naval forces?,75,short_answer,yes,"{'no': -4.612551689147949, 'no_answer': 6.040470600128174, 'yes': -0.5953368544578552}"
6,18a8b35c-8eb6-4282-b127-d4ff43c0726b,"Who designed the US Capitol buildings in Washington, D.C.?",William Thornton,short_answer,yes,"{'no': -4.905400276184082, 'no_answer': 6.422454833984375, 'yes': -0.6582269668579102}"
7,4e13fea4-0501-4c31-bff1-3dfee9793732,Does Bob Jones University do any research?,"The Department of Biology hosts two research programs on campus, one in cancer research, the other in animal behavior.[50]",boolean,yes,"{'no': -6.795248985290527, 'no_answer': 3.5200817584991455, 'yes': 3.3028197288513184}"
8,a41f80d5-d372-4c4b-a8d5-12b7cbc124a5,What is the poverty rate in Oaxaca?,2012.,short_answer,yes,"{'no': -4.843001365661621, 'no_answer': 6.419010639190674, 'yes': -0.7518181800842285}"
9,115c806a-2d51-4241-84c0-db3dec2758a8,"Who wrote the song ""Let there be Peace on Earth""?",Jill Jackson-Miller and Sy Miller,short_answer,yes,"{'no': -4.852911949157715, 'no_answer': 6.349478244781494, 'yes': -0.652286171913147}"


In [10]:
# Materialise the current dataset as a DataFrame for ad-hoc counting.
df = pd.DataFrame(eval_examples)
# Rows whose predicted question type is "boolean".
is_boolean = df['question_type_pred'] == "boolean"
# Distribution of the predicted boolean answers over those rows.
df.loc[is_boolean, 'boolean_answer_pred'].value_counts()

yes    1825
no      153
Name: boolean_answer_pred, dtype: int64

In [13]:
# Distribution of predicted question types among the rows whose language is "english".
df.query('language == "english"')['question_type_pred'].value_counts()

boolean    112
Name: question_type_pred, dtype: int64

# Final output

The final output file is in a format suitable for the TyDi evaluation script and contains no textual information.  The `confidence_score` is normalized to `[0,1]` by the score normalizer, based on the confidence score of the original MRC output and the prediction of the question type classifier.

In [20]:
# Read the merged predictions file (path held in out_file) into a DataFrame.
df_out=pd.read_json(out_file)
# Bare expression: renders the frame as the cell output.
df_out

Unnamed: 0,example_id,start_position,end_position,passage_index,yes_no_answer,confidence_score
0,b9eba742-f264-4fec-92d0-9b8de5ad0bd7,986,1020,2,0,0.653616
1,a71fe9c2-e518-4923-bced-4fe99cc7ff44,371,388,1,0,0.020314
2,2dcec21a-60bb-4ee9-85bc-6066f1ac6de3,14805,14807,27,0,0.038392
3,84603ec2-1637-4b13-96f5-5ef701c74a12,5993,6010,12,0,0.308387
4,a32fd17a-736a-4299-a543-bc26d68fa8dc,1,22,0,0,0.197467
...,...,...,...,...,...,...
18665,2634d4e7-99a7-40ad-aafa-829dc1270e3d,2563,2586,5,0,0.659538
18666,36227a7f-f387-4d98-99b4-0ba522db3746,183,192,0,0,0.001075
18667,17c7494a-c30e-4c0b-a0ba-b4c0487edf05,1483,1585,4,0,0.193370
18668,1e37c005-6d6b-436f-a97c-8dabc2c67704,723,731,3,0,0.004098
