Import utilities. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/stmnk/qa/blob/master/data/dev/NQ-dataset-sample-local.ipynb)

In [1]:
import gzip
import shutil

Extract a `.gz` file.

In [2]:
# Decompress the gzip archive into a plain `.jsonl` file next to it.
# Both handles are opened in binary mode; `copyfileobj` copies the
# decompressed bytes from the archive to the output file.
with gzip.open('nq-dev-00.jsonl.gz', 'rb') as f_in, open('nq-dev-00.jsonl', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

Read data from a `.jsonl` file.

In [5]:
import json
from pandas.io.json import json_normalize
import pandas as pd

def read_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.

    Each non-blank line of the file is parsed as one JSON document.
    Blank (whitespace-only) lines are skipped. A summary line with the
    number of loaded records is printed.

    Args:
        input_path: Path of the UTF-8 encoded `.jsonl` file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes only surrounding whitespace (including the
            # line terminator). Stripping an explicit character set such as
            # '\n|\r' would also eat trailing '|' characters and hide
            # malformed lines.
            record = line.strip()
            if not record:
                # Tolerate empty lines, e.g. a trailing newline at EOF.
                continue
            data.append(json.loads(record))
    print(f'Loaded {len(data)} records from {input_path}')
    return data

Load the records into a data frame and inspect it.

In [27]:
# Parse every record of the extracted sample into a list of dicts.
nq_sample_list = read_jsonl('nq-dev-00.jsonl')

# Columns to keep in the data frame ('document_tokens' is commented out).
nq_columns = [
    'example_id',
    'question_text',
    'question_tokens',
    'document_url',
    'document_html',  # 'document_tokens',
    'long_answer_candidates',
    'annotations',
]

df = pd.DataFrame(nq_sample_list, columns=nq_columns)
df

Loaded 1600 records from nq-dev-00.jsonl


Unnamed: 0,example_id,question_text,question_tokens,document_url,document_html,long_answer_candidates,annotations
0,5225754983651766092,what purpose did seasonal monsoon winds have o...,"[what, purpose, did, seasonal, monsoon, winds,...",https://en.wikipedia.org//w/index.php?title=Tr...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 44666, 'end_token': 161, 'start_...","[{'annotation_id': 4323936797498927989, 'long_..."
1,6986236841860957647,where did they film high school musical two,"[where, did, they, film, high, school, musical...",https://en.wikipedia.org//w/index.php?title=Hi...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 58952, 'end_token': 260, 'start_...","[{'annotation_id': 4831085488325731996, 'long_..."
2,-3290814144789249484,who got the first nobel prize in physics,"[who, got, the, first, nobel, prize, in, physics]",https://en.wikipedia.org//w/index.php?title=Li...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 36110, 'end_token': 179, 'start_...","[{'annotation_id': 10138100176517733689, 'long..."
3,5745452844331879752,who has the rights to alice in wonderland,"[who, has, the, rights, to, alice, in, wonderl...",https://en.wikipedia.org//w/index.php?title=Al...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 46964, 'end_token': 178, 'start_...","[{'annotation_id': 2559678672569860137, 'long_..."
4,8851020722386421469,when is the next deadpool movie being released,"[when, is, the, next, deadpool, movie, being, ...",https://en.wikipedia.org//w/index.php?title=De...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 60014, 'end_token': 272, 'start_...","[{'annotation_id': 927142593907478770, 'long_a..."
...,...,...,...,...,...,...,...
1595,-4616596799374362422,who was the pinkerton detective agency's first...,"[who, was, the, pinkerton, detective, agency, ...",https://en.wikipedia.org//w/index.php?title=Ka...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 56155, 'end_token': 65, 'start_b...","[{'annotation_id': 13939247266108964303, 'long..."
1596,-3650291155113659146,how many episodes are there in modern family,"[how, many, episodes, are, there, in, modern, ...",https://en.wikipedia.org//w/index.php?title=Li...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 44549, 'end_token': 227, 'start_...","[{'annotation_id': 17314900466083915677, 'long..."
1597,3381924381590631417,who built the first temple for god in jerusalem,"[who, built, the, first, temple, for, god, in,...",https://en.wikipedia.org//w/index.php?title=So...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 57724, 'end_token': 144, 'start_...","[{'annotation_id': 2468504027145325568, 'long_..."
1598,-1370702280698958195,when did the simpsons first air in uk,"[when, did, the, simpsons, first, air, in, uk]",https://en.wikipedia.org//w/index.php?title=Th...,"<!DOCTYPE html>\n<HTML class=""client-js ve-not...","[{'end_byte': 157703, 'end_token': 703, 'start...","[{'annotation_id': 17148029089918722229, 'long..."


Write data to a `.jsonl` file.

In [None]:

def write_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.

    Each object is serialized with `json.dumps(..., ensure_ascii=False)`
    and written on its own line. A summary line with the number of written
    records is printed.

    Args:
        data: Iterable of JSON-serializable objects. Any iterable works,
            including generators; it is consumed exactly once.
        output_path: Path of the UTF-8 encoded `.jsonl` file to write.
        append: If True, add the records to the end of an existing file
            (creating it if missing); otherwise overwrite the file.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If an object is not JSON serializable.
    """
    # Plain append mode is enough: the file is never read back here.
    mode = 'a' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            json_record = json.dumps(record, ensure_ascii=False)
            f.write(json_record + '\n')
            # Count while writing so unsized iterables (generators) work;
            # len(data) would raise TypeError for them.
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))