In [16]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

In [17]:
# Notebook-wide pandas configuration.
# Options are addressed by their fully qualified names: abbreviated names are
# resolved by pattern matching and stop working as soon as a second option
# happens to match the same pattern.

# Treat +/-inf as missing values.
# NOTE(review): this option is deprecated in newer pandas releases (2.1+);
# when upgrading, replace infinities with NaN explicitly instead.
pd.set_option("mode.use_inf_as_na", True)

# Effectively lift the truncation limits so `df.info()` and rich DataFrame
# displays show every column, every row and full cell contents.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Enable tqdm's pandas integration (progress_* methods on pandas objects).
tqdm.pandas()

In [18]:
def _read_squad(path):
    """Flatten a SQuAD-format JSON file into one DataFrame row per answer span.

    For every question, the entries of ``answers`` are emitted first, followed
    by the entries of ``plausible_answers`` when that key is present. A
    question with several spans therefore yields several rows sharing the same
    ``id``; a question with no spans at all yields no row.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the JSON file. It is always decoded as UTF-8, independent
        of the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: ``id``, ``is_impossible`` (int8), ``question``,
        ``answer_start`` (int16), ``answer_end`` (int16, exclusive: the first
        position AFTER the span), ``answer_text`` and ``context``. A file
        without any answer span gives an empty frame with the same columns.

    Raises
    ------
    AssertionError
        If an answer text differs from ``context[answer_start:answer_end]``.
    ValueError
        If a span end offset does not fit into int16.
    KeyError
        If a required key (``data``, ``paragraphs``, ``context``, ``qas``,
        ``id``, ``question``, ``answers``, ``answer_start``, ``text``) is
        missing.
    """
    columns = [
        "id",
        "is_impossible",
        "question",
        "answer_start",
        "answer_end",
        "answer_text",
        "context",
    ]
    int16_max = np.iinfo(np.int16).max

    # Explicit encoding: relying on the locale default mis-decodes (or fails
    # on) non-ASCII contexts on platforms whose default is not UTF-8.
    with open(Path(path), encoding="utf-8") as f:
        squad_dict = json.load(f)

    rows = []
    for group in tqdm(squad_dict['data']):
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                _id = qa["id"]
                # The flag is optional so that files without it (SQuAD 1.1
                # layout) load too; such questions count as answerable.
                is_impossible = qa.get("is_impossible", False)
                question = qa['question']
                # Real answers first, then plausible ones: same row order as
                # two separate loops, without duplicating the row-building code.
                spans = qa["answers"] + qa.get("plausible_answers", [])
                for a in spans:
                    start = a["answer_start"]
                    # first position which follows AFTER the answer span
                    end = start + len(a["text"])
                    assert a["text"] == context[start:end], (
                        "answer span mismatch for question id %r" % (_id,)
                    )
                    # start <= end, so checking the end offset covers both.
                    # Without this guard the int16 cast below would silently
                    # wrap large offsets into wrong (possibly negative) values.
                    if end > int16_max:
                        raise ValueError(
                            "answer span end %d of question id %r does not fit into int16"
                            % (end, _id)
                        )
                    rows.append({
                        "id": _id,
                        "is_impossible": is_impossible,
                        "question": question,
                        "answer_start": start,
                        "answer_end": end,
                        "answer_text": a["text"],
                        "context": context,
                    })

    # Passing the column list keeps the schema stable even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    return df.astype({
        "answer_start": np.int16,
        "answer_end": np.int16,
        "is_impossible": np.int8,
    })

In [19]:
# Parse the SQuAD 2.0 training file into a flat frame (one row per answer span).
df = _read_squad("input/squad/train-v2.0.json")

100%|██████████| 442/442 [00:00<00:00, 2820.52it/s]


In [20]:
# Sanity check: row count, dtypes and non-null counts of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   is_impossible  130319 non-null  int8  
 2   question       130319 non-null  object
 3   answer_start   130319 non-null  int16 
 4   answer_end     130319 non-null  int16 
 5   answer_text    130319 non-null  object
 6   context        130319 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 4.6+ MB


In [21]:
# Eyeball the first rows: answer_text should be the context slice [answer_start:answer_end].
df.head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
0,56be85543aeaaa14008c9063,0,When did Beyonce start becoming popular?,269,286,in the late 1990s,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
1,56be85543aeaaa14008c9065,0,What areas did Beyonce compete in when she was growing up?,207,226,singing and dancing,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
2,56be85543aeaaa14008c9066,0,When did Beyonce leave Destiny's Child and become a solo singer?,526,530,2003,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
3,56bf6b0f3aeaaa14008c9601,0,In what city and state did Beyonce grow up?,166,180,"Houston, Texas","Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
4,56bf6b0f3aeaaa14008c9602,0,In which decade did Beyonce become famous?,276,286,late 1990s,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."


In [22]:
# Row counts per is_impossible value (the loader stores the flag as int8: 0 or 1).
df["is_impossible"].value_counts()

0    86821
1    43498
Name: is_impossible, dtype: int64

In [23]:
# Inspect rows flagged is_impossible; presumably their span columns come from
# the question's "plausible_answers", which the loader stores in the same columns.
df[df["is_impossible"] == 1].head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
2075,5a8d7bf7df8bba001a0f9ab1,1,What category of game is Legend of Zelda: Australia Twilight?,128,144,action-adventure,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2076,5a8d7bf7df8bba001a0f9ab2,1,What consoles can be used to play Australia Twilight?,194,210,GameCube and Wii,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2077,5a8d7bf7df8bba001a0f9ab3,1,When was Australia Twilight launched in North America?,569,582,November 2006,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2078,5a8d7bf7df8bba001a0f9ab4,1,When could GameCube owners purchase Australian Princess?,569,582,November 2006,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2079,5a8d7bf7df8bba001a0f9ab5,1,What year was the Legend of Zelda: Australian Princess originally planned for release?,364,368,2005,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"


In [24]:
%%time
df.to_parquet("output/squad/train.parquet", index=False)

CPU times: user 234 ms, sys: 266 ms, total: 500 ms
Wall time: 795 ms


In [25]:
# Parse the SQuAD 2.0 dev file. NOTE: this rebinds `df`, so from here on `df`
# is the dev frame and the training frame is no longer reachable by this name.
df = _read_squad("input/squad/dev-v2.0.json")

100%|██████████| 35/35 [00:00<00:00, 1697.96it/s]


In [26]:
# Sanity check: row count, dtypes and non-null counts of the dev frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26232 entries, 0 to 26231
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             26232 non-null  object
 1   is_impossible  26232 non-null  int8  
 2   question       26232 non-null  object
 3   answer_start   26232 non-null  int16 
 4   answer_end     26232 non-null  int16 
 5   answer_text    26232 non-null  object
 6   context        26232 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 948.0+ KB


In [27]:
# Peek at the dev frame. Repeated ids are expected: the loader emits one row per
# entry in a question's "answers" list, and it does not deduplicate identical spans.
df.head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
0,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
1,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
2,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
3,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
4,56ddde6b9a695914005b9629,0,When were the Normans in Normandy?,94,117,10th and 11th centuries,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."


In [28]:
# Row counts per is_impossible value in the dev frame (rows, not unique questions).
df["is_impossible"].value_counts()

0    20302
1     5930
Name: is_impossible, dtype: int64

In [29]:
%%time
df.to_parquet("output/squad/val.parquet", index=False)

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 39.5 ms
