In [51]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

In [52]:
def _answer_rows(base, context, answers):
    """Yield one flat record per entry of ``answers``.

    ``base`` holds the per-question fields shared by every record. Each
    answer's ``text`` is checked against the slice of ``context`` that its
    ``answer_start`` offset points at.
    """
    for answer in answers:
        text = answer["text"]
        start = answer["answer_start"]
        end = start + len(text)  # exclusive end offset into ``context``
        assert text == context[start:end], (
            "answer text {!r} does not match context[{}:{}] for question {}".format(
                text, start, end, base["id"]
            )
        )
        row = dict(base)
        row["answer_start"] = start
        row["answer_end"] = end
        row["answer_text"] = text
        row["context"] = context
        yield row


def _read_squad(path):
    """Flatten a SQuAD-style JSON file into one DataFrame row per answer span.

    Every entry of a question's ``answers`` list and, when that key is
    present, of its ``plausible_answers`` list becomes one row. Questions
    with neither contribute no rows.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the JSON file; its top-level ``data`` list is walked.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``id``, ``is_impossible`` (int8), ``question``,
        ``answer_start``, ``answer_end`` (exclusive), ``answer_text`` and
        ``context``. The two offset columns are int16, widened to int32 only
        if some offset would not fit in int16. A file that yields no rows
        gives an empty frame with the same columns.

    Raises
    ------
    AssertionError
        If an answer's text differs from the context slice at its offset.
    KeyError
        If a required key (e.g. ``data``, ``paragraphs``, ``answers``) is
        missing from the file.
    """
    columns = [
        "id",
        "is_impossible",
        "question",
        "answer_start",
        "answer_end",
        "answer_text",
        "context",
    ]

    # Decode explicitly as UTF-8: the offsets are character offsets, so a
    # platform-default codec could shift them or fail on non-ASCII contexts.
    with open(Path(path), encoding="utf-8") as f:
        squad_dict = json.load(f)

    rows = []
    for group in tqdm(squad_dict["data"]):
        for passage in group["paragraphs"]:
            context = passage["context"]
            for qa in passage["qas"]:
                base = {
                    "id": qa["id"],
                    # Treat a question as answerable when the flag is absent
                    # (v1.1-style files without ``is_impossible``).
                    "is_impossible": qa.get("is_impossible", False),
                    "question": qa["question"],
                }
                rows.extend(_answer_rows(base, context, qa["answers"]))
                rows.extend(
                    _answer_rows(base, context, qa.get("plausible_answers", ()))
                )

    # Explicit columns keep the schema stable even when ``rows`` is empty.
    df = pd.DataFrame(rows, columns=columns)

    # int16 is the compact default; a plain astype would silently wrap
    # offsets beyond its range, so widen instead of corrupting them.
    offset_dtype = np.int16
    if not df.empty:
        limits = np.iinfo(np.int16)
        offsets = df[["answer_start", "answer_end"]]
        if offsets.min().min() < limits.min or offsets.max().max() > limits.max:
            offset_dtype = np.int32

    return df.astype(
        {
            "answer_start": offset_dtype,
            "answer_end": offset_dtype,
            "is_impossible": np.int8,
        }
    )

In [53]:
# Parse the SQuAD v2.0 training file into a flat frame: one row per answer
# or plausible answer.
df = _read_squad("input/squad/train-v2.0.json")

100%|██████████| 442/442 [00:00<00:00, 3563.47it/s]


In [54]:
# Sanity-check the training frame: row count, non-null counts, dtypes and memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   is_impossible  130319 non-null  int8  
 2   question       130319 non-null  object
 3   answer_start   130319 non-null  int16 
 4   answer_end     130319 non-null  int16 
 5   answer_text    130319 non-null  object
 6   context        130319 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 4.6+ MB


In [55]:
# Preview the first training rows.
df.head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
0,56be85543aeaaa14008c9063,0,When did Beyonce start becoming popular?,269,286,in the late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,56be85543aeaaa14008c9065,0,What areas did Beyonce compete in when she was...,207,226,singing and dancing,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2,56be85543aeaaa14008c9066,0,When did Beyonce leave Destiny's Child and bec...,526,530,2003,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3,56bf6b0f3aeaaa14008c9601,0,In what city and state did Beyonce grow up?,166,180,"Houston, Texas",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4,56bf6b0f3aeaaa14008c9602,0,In which decade did Beyonce become famous?,276,286,late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...


In [56]:
# Class balance of the training split. Counts are per answer row, not per
# question, because _read_squad emits one row per (plausible) answer.
df["is_impossible"].value_counts()

0    86821
1    43498
Name: is_impossible, dtype: int64

In [57]:
%%time
# Persist the flattened training split. Create the output directory first so
# the write does not fail on a fresh checkout where it does not exist yet.
train_path = Path("output/squad/train.parquet")
train_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(train_path, index=False)

CPU times: user 250 ms, sys: 234 ms, total: 484 ms
Wall time: 498 ms


In [58]:
# Parse the SQuAD v2.0 dev file the same way.
# NOTE: this re-binds `df`; from here on it holds the dev split, not the
# training split loaded earlier.
df = _read_squad("input/squad/dev-v2.0.json")

100%|██████████| 35/35 [00:00<00:00, 1917.78it/s]


In [59]:
# Sanity-check the dev frame: row count, non-null counts, dtypes and memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26232 entries, 0 to 26231
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             26232 non-null  object
 1   is_impossible  26232 non-null  int8  
 2   question       26232 non-null  object
 3   answer_start   26232 non-null  int16 
 4   answer_end     26232 non-null  int16 
 5   answer_text    26232 non-null  object
 6   context        26232 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 948.0+ KB


In [60]:
# Preview the first dev rows. A question with several entries in its answers
# list yields one row per entry, so consecutive rows can repeat the same
# id/question (and even the same answer text).
df.head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
0,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,The Normans (Norman: Nourmands; French: Norman...
1,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,The Normans (Norman: Nourmands; French: Norman...
2,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,The Normans (Norman: Nourmands; French: Norman...
3,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,The Normans (Norman: Nourmands; French: Norman...
4,56ddde6b9a695914005b9629,0,When were the Normans in Normandy?,94,117,10th and 11th centuries,The Normans (Norman: Nourmands; French: Norman...


In [61]:
# Class balance of the dev split. Counts are per answer row, not per
# question, so questions with several answers are counted several times.
df["is_impossible"].value_counts()

0    20302
1     5930
Name: is_impossible, dtype: int64

In [62]:
%%time
# Persist the flattened dev split as the validation set. Create the output
# directory first so the write does not fail when it does not exist yet.
val_path = Path("output/squad/val.parquet")
val_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(val_path, index=False)

CPU times: user 62.5 ms, sys: 31.2 ms, total: 93.8 ms
Wall time: 74.8 ms
