Mount the drive

In [None]:
from google.colab import drive
# Directory where Google Drive is mounted; later cells build their file paths from it.
DRIVE_ROOT = '/content/drive'
# Attach the drive at DRIVE_ROOT (presumably triggers Colab's authorization prompt — confirm).
drive.mount(DRIVE_ROOT)


Extract a `.gz` file

In [None]:
import gzip
import shutil
# Folder (relative to the Drive mount point) that holds the training shards.
DATA_PATH = 'My Drive/data/train'

# Source archive and the path its decompressed copy is written to.
archive_path = f'{DRIVE_ROOT}/{DATA_PATH}/nq-train-00.jsonl.gz'
target_path = f'{DRIVE_ROOT}/{DATA_PATH}/nq-train-00.jsonl'

# Stream the decompressed bytes straight to disk so the whole file is never
# held in memory; the archive is opened first, then the output file.
with gzip.open(archive_path, 'rb') as compressed, open(target_path, 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

Read data from a `.jsonl` file

In [None]:
import json
import json_lines
import pandas as pd

def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.

    Args:
        input_path: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        The decoded objects in file order. Whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing empty line) is not a record;
            # passing it to json.loads would raise, so skip it.
            if not line.strip():
                continue
            # json.loads ignores leading/trailing whitespace, so the line
            # terminator does not need to be stripped by hand.
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

Inspect the data frame

In [4]:
# Folder (relative to the Drive mount point) that holds the dev sample.
DEV_PATH = 'My Drive/data'
# DRIVE_ROOT is the mount point set in the mount cell; it is the only root
# path this notebook defines, so the sample path is built from it.
nq_sample_list = load_jsonl(f'{DRIVE_ROOT}/{DEV_PATH}/nq-dev-sample.jsonl')
# Keep only the lightweight columns; 'document_tokens' and 'document_html'
# are deliberately left out (presumably too large to display — confirm).
df = pd.DataFrame(nq_sample_list, columns=[
    'example_id', 'question_text', 'question_tokens', 'document_url'
])
df

Unnamed: 0,example_id,question_text,question_tokens,document_url
0,5225754983651766092,what purpose did seasonal monsoon winds have o...,"[what, purpose, did, seasonal, monsoon, winds,...",https://en.wikipedia.org//w/index.php?title=Tr...
1,6986236841860957647,where did they film high school musical two,"[where, did, they, film, high, school, musical...",https://en.wikipedia.org//w/index.php?title=Hi...
2,-3290814144789249484,who got the first nobel prize in physics,"[who, got, the, first, nobel, prize, in, physics]",https://en.wikipedia.org//w/index.php?title=Li...
3,5745452844331879752,who has the rights to alice in wonderland,"[who, has, the, rights, to, alice, in, wonderl...",https://en.wikipedia.org//w/index.php?title=Al...
4,8851020722386421469,when is the next deadpool movie being released,"[when, is, the, next, deadpool, movie, being, ...",https://en.wikipedia.org//w/index.php?title=De...
...,...,...,...,...
195,-2446865335924530326,what is windows defender and what does it do,"[what, is, windows, defender, and, what, does,...",https://en.wikipedia.org//w/index.php?title=Wi...
196,-4047248994442147533,when is the next time easter falls on april 11th,"[when, is, the, next, time, easter, falls, on,...",https://en.wikipedia.org//w/index.php?title=Li...
197,-7428135141357814239,how far from the heavy rain of a thunder storm...,"[how, far, from, the, heavy, rain, of, a, thun...",https://en.wikipedia.org//w/index.php?title=Th...
198,-510179348025098787,what does the board of directors consist of,"[what, does, the, board, of, directors, consis...",https://en.wikipedia.org//w/index.php?title=Bo...


Write data to a `.jsonl` file

In [None]:

def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.

    Args:
        data: Iterable of JSON-serializable objects. Lists, tuples and
            generators are all accepted; records are counted while writing,
            so the iterable does not need to support len().
        output_path: Destination file, written as UTF-8.
        append: If True, add the records to the end of an existing file
            instead of overwriting it.

    Raises:
        TypeError: If a record is not JSON serializable.
    """
    # Plain append mode is enough: the file is only ever written, never read.
    mode = 'a' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))