In [1]:
import gzip
import shutil

Uncompress the training data in simplified format 

In [2]:
# Stream-decompress the gzip archive into a plain .jsonl file with the same
# base name; copyfileobj copies in chunks, so the data is never held in memory
# all at once.
with gzip.open('v1.0-simplified_simplified-nq-train.jsonl.gz', 'rb') as gz_file, \
        open('v1.0-simplified_simplified-nq-train.jsonl', 'wb') as jsonl_file:
    shutil.copyfileobj(gz_file, jsonl_file)

Read from a `.jsonl` (JSON Lines) file: one valid `json` value per line, with lines separated by `\n`

In [3]:
import json
from pandas.io.json import json_normalize
import pandas as pd

def read_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON Lines file.

    Each non-blank line is parsed as one JSON value. Blank or whitespace-only
    lines (for example a trailing empty line at the end of the file) are
    skipped instead of raising a decode error.

    Args:
        input_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # str.strip() removes only surrounding whitespace (including the
            # line terminator). A character set such as '\n|\r' would also
            # strip literal '|' characters, because rstrip takes a set of
            # characters rather than a pattern.
            record = line.strip()
            if not record:
                continue
            data.append(json.loads(record))
    print(f'Loaded {len(data)} records from {input_path}')
    return data

Load **all** simplified format training data (this can take a while ~17.5GB) 

In [4]:
# Parse the whole uncompressed training file into a list, one record per line.
nq_sample_list = read_jsonl(input_path='v1.0-simplified_simplified-nq-train.jsonl')

Loaded 307373 records from v1.0-simplified_simplified-nq-train.jsonl


Explore the structure of the `annotations` list (it contains all the information about the correct answer(s) to the question, if any)

In [40]:
# Build a DataFrame that keeps only the record fields explored in this notebook.
# Other fields considered but not selected:
# 'question_tokens', 'document_url', 'document_html', 'document_tokens'.
df = pd.DataFrame(
    data=nq_sample_list,
    columns=[
        'question_text',
        'document_text',
        'annotations',
        'long_answer_candidates',
        'example_id',
    ],
)

In [42]:
# Widen the per-cell text limit so more of each nested annotation dict is shown.
pd.options.display.max_colwidth = 200
df.loc[:, ['annotations']]

Unnamed: 0,annotations
0,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1952, 'candidate_index': 54, 'end_token': 2019}, 'short_answers': [{'start_token': 1960, 'end_token': 1969}], 'annotation_id': 59316545022..."
1,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 212, 'candidate_index': 15, 'end_token': 310}, 'short_answers': [{'start_token': 213, 'end_token': 215}], 'annotation_id': 120348741537837..."
2,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 319, 'candidate_index': 24, 'end_token': 438}, 'short_answers': [], 'annotation_id': 10527123009892725162}]"
3,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 509, 'candidate_index': 59, 'end_token': 576}, 'short_answers': [{'start_token': 512, 'end_token': 514}], 'annotation_id': 146347963651525..."
4,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': -1, 'candidate_index': -1, 'end_token': -1}, 'short_answers': [], 'annotation_id': 11038549994888625916}]"
...,...
307368,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 218, 'candidate_index': 6, 'end_token': 321}, 'short_answers': [{'start_token': 293, 'end_token': 295}, {'start_token': 307, 'end_token': ..."
307369,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': -1, 'candidate_index': -1, 'end_token': -1}, 'short_answers': [], 'annotation_id': 4082814573060225469}]"
307370,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 173, 'candidate_index': 13, 'end_token': 265}, 'short_answers': [{'start_token': 174, 'end_token': 177}], 'annotation_id': 425351403563793..."
307371,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': -1, 'candidate_index': -1, 'end_token': -1}, 'short_answers': [], 'annotation_id': 1828800823834337985}]"


In [22]:
# Sanity check: confirm the container type of the loaded records.
type(nq_sample_list)

list

Count the number of unanswered questions (in `[2]` it is recommended to aggressively reduce the number of unanswered questions during training in order to improve learning performance)

In [36]:
# Count the samples whose first annotation has no long answer; such
# annotations carry start_token == -1 in their 'long_answer' entry.
no_long_ans_count = sum(
    1
    for sample in nq_sample_list
    if sample['annotations'][0]['long_answer']['start_token'] == -1
)
print(no_long_ans_count)
    

155225


In [37]:
# Count the samples that carry exactly one annotation; comparing this with the
# total number of loaded records shows whether looking only at
# annotations[0] is sufficient.
ann_count = sum(
    1
    for sample in nq_sample_list
    if len(sample['annotations']) == 1
)
print(ann_count)

307373


More than half are unanswered questions (this can also make the size of the data during development more manageable)

In [39]:
# Percentage of samples without a long answer. ann_count (samples with exactly
# one annotation) is used as the denominator.
100 * no_long_ans_count / ann_count

50.50053192700725