In [2]:
import gzip
import shutil
from pathlib import Path

Uncompress the training data in simplified format 

In [2]:
# Uncompress the training archive only when the extracted file is not already
# present, so re-running the notebook does not repeat the slow extraction.
nq_train_gz = Path('v1.0-simplified_simplified-nq-train.jsonl.gz')
nq_train_jsonl = Path('v1.0-simplified_simplified-nq-train.jsonl')

if not nq_train_jsonl.exists():
    # Extract under a temporary name and rename at the end: an interrupted run
    # then never leaves a truncated file that would be mistaken for a complete one.
    nq_train_partial = nq_train_jsonl.with_name(nq_train_jsonl.name + '.part')
    with gzip.open(nq_train_gz, 'rb') as f_in, open(nq_train_partial, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    nq_train_partial.replace(nq_train_jsonl)

Read from a `.jsonl` (JSON Lines) file: one valid `json` entry per line, with lines separated by `\n`.

In [3]:
import json
from pandas.io.json import json_normalize
import pandas as pd

def read_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON Lines file.

    Every non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising a decode error.

    Args:
        input_path: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        list: The parsed objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes the line terminator (JSON ignores surrounding
            # whitespace anyway). An rstrip('\n|\r') call would also treat the
            # literal '|' as a character to strip, which is not intended.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print(f'Loaded {len(data)} records from {input_path}')
    return data

Load **all** of the simplified-format training data (this can take a while: the file is ~17.5GB).

In [4]:
# Parse the whole extracted training file into an in-memory list of records.
nq_train_path = 'v1.0-simplified_simplified-nq-train.jsonl'
nq_data = read_jsonl(nq_train_path)

Loaded 307373 records from v1.0-simplified_simplified-nq-train.jsonl


Explore the structure of the `annotations` list (it contains all the information about the correct answer(s) to the question, if any).

In [5]:
# Columns kept for exploration. The `question_tokens`, `document_url`,
# `document_html` and `document_tokens` fields are intentionally not selected.
explored_columns = [
    'question_text',
    'document_text',
    'annotations',
    'long_answer_candidates',
    'example_id',
]

# Build the frame directly from the list of loaded records.
df = pd.DataFrame(nq_data, columns=explored_columns)

In [6]:
# Allow up to 200 characters per cell so most of each annotation is readable.
pd.options.display.max_colwidth = 200

# Show only the `annotations` column, kept as a one-column DataFrame.
df.loc[:, ['annotations']]

Unnamed: 0,annotations
0,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 1952, 'candidate_index': 54, 'end_token': 2019}, 'short_answers': [{'start_token': 1960, 'end_token': 1969}], 'annotation_id': 59316545022..."
1,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 212, 'candidate_index': 15, 'end_token': 310}, 'short_answers': [{'start_token': 213, 'end_token': 215}], 'annotation_id': 120348741537837..."
2,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 319, 'candidate_index': 24, 'end_token': 438}, 'short_answers': [], 'annotation_id': 10527123009892725162}]"
3,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 509, 'candidate_index': 59, 'end_token': 576}, 'short_answers': [{'start_token': 512, 'end_token': 514}], 'annotation_id': 146347963651525..."
4,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': -1, 'candidate_index': -1, 'end_token': -1}, 'short_answers': [], 'annotation_id': 11038549994888625916}]"
...,...
307368,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 218, 'candidate_index': 6, 'end_token': 321}, 'short_answers': [{'start_token': 293, 'end_token': 295}, {'start_token': 307, 'end_token': ..."
307369,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': -1, 'candidate_index': -1, 'end_token': -1}, 'short_answers': [], 'annotation_id': 4082814573060225469}]"
307370,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': 173, 'candidate_index': 13, 'end_token': 265}, 'short_answers': [{'start_token': 174, 'end_token': 177}], 'annotation_id': 425351403563793..."
307371,"[{'yes_no_answer': 'NONE', 'long_answer': {'start_token': -1, 'candidate_index': -1, 'end_token': -1}, 'short_answers': [], 'annotation_id': 1828800823834337985}]"


In [7]:
# Inspect the container type returned by the loader (the raw records, not `df`).
type(nq_data)

list

Setup some constants for ranging over the data.

In [9]:
# Index range over every loaded record; reused by the counting and filtering
# loops in the following cells.
data_range = range(len(nq_data))
# Total number of loaded records.
data_size = len(data_range)

Count the number of questions with a singleton annotations list.

In [10]:
# Number of records whose `annotations` list holds exactly one entry.
# Summing booleans counts the records for which the condition is true.
ann_count = sum(
    len(nq_data[idx]['annotations']) == 1
    for idx in data_range
)

print(ann_count)

307373


Count the number of unanswered questions (in `[2]` it is recommended to aggressively reduce the number of unanswered questions during training in order to improve learning performance).

In [11]:
# Number of records whose first annotation has no long answer; a
# `start_token` of -1 is the marker tested here.
no_long_ans_count = sum(
    nq_data[idx]['annotations'][0]['long_answer']['start_token'] == -1
    for idx in data_range
)

print(no_long_ans_count)

155225


More than half of the questions are unanswered (down-sampling them also makes the data size more manageable during development).

In [39]:
# Percentage of unanswered questions among ALL loaded records. `ann_count` only
# counts records with exactly one annotation, so `data_size` (the total number
# of records) is the correct denominator.
no_long_ans_count * 100 / data_size

50.50053192700725

Split the data in two using the annotations. Extract queries with a non-empty long answer annotation.

In [14]:
# Keep every record whose first annotation carries a long answer
# (its `start_token` is not the -1 "no answer" marker).
nq_data_answered = [
    nq_data[idx]
    for idx in data_range
    if nq_data[idx]['annotations'][0]['long_answer']['start_token'] != -1
]

print(len(nq_data_answered))

152148


In [18]:
# Answered and unanswered records must together account for every loaded
# record, so the check uses the total record count (`data_size`) rather than
# the count of singleton-annotation records.
assert len(nq_data_answered) == data_size - no_long_ans_count, (
    f'expected {data_size - no_long_ans_count} answered records, '
    f'got {len(nq_data_answered)}'
)

Extract queries for which the annotators could not find an answer in the article (in the allocated time).

In [21]:
# Keep every record whose first annotation has no long answer.
# NOTE(review): this cell tests `end_token`, while the answered split and the
# unanswered count test `start_token` - confirm both markers always agree.
nq_data_no_l_ans = [
    nq_data[idx]
    for idx in data_range
    if nq_data[idx]['annotations'][0]['long_answer']['end_token'] == -1
]

print(len(nq_data_no_l_ans))

155225


In [22]:
# The extracted unanswered subset must match the count computed earlier.
assert no_long_ans_count == len(nq_data_no_l_ans)

Extract a sample of only `10%` of the unannotated (i.e. unanswered) data.

In [24]:
import random

# Seed the global generator so the 10% sample is identical on every
# Restart & Run All; an unseeded sample changes the exported dataset each run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Size of the sample: one tenth of the unanswered records (rounded down).
ten_p_no_ans = len(nq_data_no_l_ans) // 10
# Draw that many distinct unanswered records without replacement.
nq_data_no_l_ans_10 = random.sample(nq_data_no_l_ans, ten_p_no_ans)

In [25]:
# The sample must contain exactly the requested number of records.
assert ten_p_no_ans == len(nq_data_no_l_ans_10)

Merge the 10% sample of no-answer data back into the answered data subset.

In [31]:
# New list: all answered records followed by the sampled unanswered ones.
nq_sampled_data = [*nq_data_answered, *nq_data_no_l_ans_10]
print(len(nq_sampled_data))

167670


Ensure the data is not partitioned by annotations.

In [27]:
# Shuffle in place with a dedicated, seeded generator so the record order (and
# therefore the exported file) is reproducible across kernel restarts.
random.Random(42).shuffle(nq_sampled_data)

In [32]:
# Report the number of records in the merged sample.
print(len(nq_sampled_data))

167670


Write the newly sampled data to a `.jsonl` file.

In [29]:
def write_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON Lines file.

    Each object is serialized with `json.dumps(..., ensure_ascii=False)` and
    written on its own line, UTF-8 encoded.

    Args:
        data: Iterable of JSON-serializable objects. Any iterable works
            (list, generator, ...); it does not need to support `len()`.
        output_path: Path of the file to write.
        append: When True, add the records to the end of an existing file
            instead of overwriting it.

    Raises:
        TypeError: If an object is not JSON-serializable.
    """
    # Plain append mode is enough: the file is never read back here.
    mode = 'a' if append else 'w'
    # Count records while writing so one-shot iterables can be reported too.
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for written, record in enumerate(data, start=1):
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f'Wrote {written} records to {output_path}')

In [30]:
# Persist the merged, shuffled sample as a new JSON Lines file.
write_jsonl(nq_sampled_data, output_path='nq_sampled_10p_no_ans.jsonl')

Wrote 167670 records to nq_sampled_10p_no_ans.jsonl


In [35]:
from pathlib import Path

# Size of the exported file on disk, converted from bytes to GiB (1024**3 bytes).
sampled_file_bytes = Path('nq_sampled_10p_no_ans.jsonl').stat().st_size
sampled_file_bytes / 1024**3

8.920995890162885

The wrangled, uncompressed `.jsonl` file is ~8.9 GiB on disk (≈9.6 GB).