In [27]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm

In [28]:
# Percentile grid handed to every `describe()` call in this notebook so the
# tails (1 % / 99 %) are reported alongside the deciles.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Option keys are written out in full. pandas resolves abbreviated keys by
# pattern matching, which can become ambiguous when new options are added.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases
# (2.1+); drop this line when upgrading -- confirm against the installed version.
pd.set_option("mode.use_inf_as_na", True)
# Effectively remove the display limits so wide frames and long text columns
# are shown untruncated.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Enable tqdm's pandas integration.
tqdm.pandas()

In [29]:
def _read_squad(path):
    """Flatten a SQuAD-style JSON file into a DataFrame, one row per answer span.

    Every entry of a question's ``answers`` list, followed by every entry of
    its ``plausible_answers`` list (when that key is present), becomes one
    row.  A question for which both lists are empty contributes no rows; a
    question with several answers contributes several rows, even when the
    spans are identical.

    Args:
        path: Location of the JSON file (``str`` or ``Path``).

    Returns:
        pandas.DataFrame with the columns ``id``, ``is_impossible`` (int8),
        ``question``, ``answer_start`` (int16), ``answer_end`` (int16,
        exclusive end offset), ``answer_text`` and ``context``.  The frame is
        empty, but still has these columns, when the file yields no spans.

    Raises:
        AssertionError: An answer's text differs from
            ``context[answer_start:answer_end]``.
        ValueError: A span offset is too large to be stored as int16.
    """
    path = Path(path)
    # The offsets are character positions, so the file must be decoded as
    # UTF-8 regardless of the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        squad_dict = json.load(f)

    columns = [
        "id", "is_impossible", "question",
        "answer_start", "answer_end", "answer_text", "context",
    ]
    rows = []
    for group in tqdm(squad_dict['data']):
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                _id = qa["id"]
                # Files without the flag (SQuAD v1.1 layout) only contain
                # answerable questions.
                is_impossible = qa.get("is_impossible", False)
                question = qa['question']
                # Real answers first, then plausible ones -- both lists share
                # the same {"text", "answer_start"} layout.
                spans = list(qa["answers"]) + list(qa.get("plausible_answers", []))
                for a in spans:
                    start = a["answer_start"]
                    # first position which follows AFTER the answer span
                    end = start + len(a["text"])
                    assert a["text"] == context[start:end], (
                        f"answer text does not match context span for question {_id}"
                    )
                    rows.append({
                        "id": _id,
                        "is_impossible": is_impossible,
                        "question": question,
                        "answer_start": start,
                        "answer_end": end,
                        "answer_text": a["text"],
                        "context": context,
                    })

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)

    # astype(int16) would silently wrap around on larger offsets.
    if len(df) and df["answer_end"].max() > np.iinfo(np.int16).max:
        raise ValueError("answer offsets exceed the int16 range")

    df["answer_start"] = df["answer_start"].astype(np.int16)
    df["answer_end"] = df["answer_end"].astype(np.int16)
    df["is_impossible"] = df["is_impossible"].astype(np.int8)
    return df

In [30]:
# Flatten the SQuAD v2.0 training split into one row per answer span.
df = _read_squad("input/squad/train-v2.0.json")

100%|██████████| 442/442 [00:00<00:00, 2404.31it/s]


In [31]:
# Check column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   is_impossible  130319 non-null  int8  
 2   question       130319 non-null  object
 3   answer_start   130319 non-null  int16 
 4   answer_end     130319 non-null  int16 
 5   answer_text    130319 non-null  object
 6   context        130319 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 4.6+ MB


In [32]:
# Preview the first rows; `context` is repeated in full on every row.
df.head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
0,56be85543aeaaa14008c9063,0,When did Beyonce start becoming popular?,269,286,in the late 1990s,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
1,56be85543aeaaa14008c9065,0,What areas did Beyonce compete in when she was growing up?,207,226,singing and dancing,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
2,56be85543aeaaa14008c9066,0,When did Beyonce leave Destiny's Child and become a solo singer?,526,530,2003,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
3,56bf6b0f3aeaaa14008c9601,0,In what city and state did Beyonce grow up?,166,180,"Houston, Texas","Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
4,56bf6b0f3aeaaa14008c9602,0,In which decade did Beyonce become famous?,276,286,late 1990s,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."


In [33]:
# Row counts per flag value (rows are answer spans, not distinct questions).
df["is_impossible"].value_counts()

0    86821
1    43498
Name: is_impossible, dtype: int64

In [34]:
# Peek at the first rows whose question is flagged as impossible.
df.loc[df["is_impossible"].eq(1)].head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
2075,5a8d7bf7df8bba001a0f9ab1,1,What category of game is Legend of Zelda: Australia Twilight?,128,144,action-adventure,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2076,5a8d7bf7df8bba001a0f9ab2,1,What consoles can be used to play Australia Twilight?,194,210,GameCube and Wii,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2077,5a8d7bf7df8bba001a0f9ab3,1,When was Australia Twilight launched in North America?,569,582,November 2006,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2078,5a8d7bf7df8bba001a0f9ab4,1,When could GameCube owners purchase Australian Princess?,569,582,November 2006,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"
2079,5a8d7bf7df8bba001a0f9ab5,1,What year was the Legend of Zelda: Australian Princess originally planned for release?,364,368,2005,"The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]"


In [35]:
%%time
# Combined character count of context and question, stored compactly as int16.
df["cq_length"] = (
    df["context"].str.len() + df["question"].str.len()
).astype(np.int16)

CPU times: user 93.8 ms, sys: 0 ns, total: 93.8 ms
Wall time: 84.2 ms


In [36]:
# Local checkpoint directory and the maximum sequence length given to the tokenizer.
pretrained_dir = "pretrained/google/electra-small-discriminator"
model_max_length = 512
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_dir,
    model_max_length=model_max_length,
)
# Show the tokenizer configuration followed by the input names it produces.
print(repr(tokenizer), tokenizer.model_input_names, sep="\n")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [37]:
%%time
# Encode every (context, question) pair; no truncation is requested, so
# sequences may come out longer than the tokenizer's configured maximum.
enc = tokenizer(df["context"].tolist(), df["question"].tolist())
print(repr(enc.keys()), f"len={len(enc['input_ids'])}", sep="\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=130319
CPU times: user 55.2 s, sys: 5.23 s, total: 1min
Wall time: 16.9 s


In [38]:
%%time
# Number of token ids produced for each encoded pair, stored as int16.
df["cq_token_length"] = np.array(
    [len(ids) for ids in enc["input_ids"]]
).astype(np.int16)

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 34.3 ms


In [39]:
# Character-length distribution of the pairs whose token count lies in (499, 512].
index = df.loc[df["cq_token_length"].gt(499) & df["cq_token_length"].le(512)].index
df["cq_length"].loc[index].describe(percentiles=percentiles)

count      29.000000
mean     2327.482759
std       174.631167
min      2174.000000
1%       2174.000000
5%       2174.400000
10%      2175.000000
20%      2187.600000
30%      2229.800000
40%      2238.000000
50%      2272.000000
60%      2287.600000
70%      2295.000000
80%      2496.000000
90%      2571.200000
95%      2715.200000
99%      2741.600000
max      2750.000000
Name: cq_length, dtype: float64

In [40]:
# Character-length distribution of the pairs whose token count lies in (246, 256].
index = df.loc[df["cq_token_length"].gt(246) & df["cq_token_length"].le(256)].index
df["cq_length"].loc[index].describe(percentiles=percentiles)

count    2394.000000
mean     1185.904344
std       108.709116
min       678.000000
1%        854.930000
5%       1000.000000
10%      1052.300000
20%      1107.600000
30%      1144.000000
40%      1169.000000
50%      1191.500000
60%      1216.800000
70%      1243.000000
80%      1278.000000
90%      1314.700000
95%      1346.350000
99%      1397.140000
max      1465.000000
Name: cq_length, dtype: float64

In [41]:
# Character-length distribution of the pairs whose token count lies in (118, 128].
index = df.loc[df["cq_token_length"].gt(118) & df["cq_token_length"].le(128)].index
df["cq_length"].loc[index].describe(percentiles=percentiles)

count    10876.000000
mean       607.685546
std         46.221126
min        332.000000
1%         485.000000
5%         543.000000
10%        556.000000
20%        572.000000
30%        584.000000
40%        595.000000
50%        606.000000
60%        617.000000
70%        629.000000
80%        644.000000
90%        666.000000
95%        685.000000
99%        723.000000
max        796.000000
Name: cq_length, dtype: float64

In [42]:
%%time
# Create the target directory first so the write also works on a fresh checkout.
Path("output/squad").mkdir(parents=True, exist_ok=True)
# Persist the flattened training frame (including the length columns) without the index.
df.to_parquet("output/squad/train.parquet", index=False)

CPU times: user 266 ms, sys: 266 ms, total: 531 ms
Wall time: 790 ms


In [43]:
# Flatten the SQuAD v2.0 dev split.
# NOTE: this rebinds `df`; the training frame is no longer reachable under
# this name from here on.
df = _read_squad("input/squad/dev-v2.0.json")

100%|██████████| 35/35 [00:00<00:00, 56.97it/s]


In [44]:
# Check column dtypes, non-null counts and memory footprint of the dev frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26232 entries, 0 to 26231
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             26232 non-null  object
 1   is_impossible  26232 non-null  int8  
 2   question       26232 non-null  object
 3   answer_start   26232 non-null  int16 
 4   answer_end     26232 non-null  int16 
 5   answer_text    26232 non-null  object
 6   context        26232 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 948.0+ KB


In [45]:
# Preview the first rows. The loader emits one row per listed answer, so the
# same question id can appear on several rows (even with identical spans).
df.head()

Unnamed: 0,id,is_impossible,question,answer_start,answer_end,answer_text,context
0,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
1,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
2,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
3,56ddde6b9a695914005b9628,0,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
4,56ddde6b9a695914005b9629,0,When were the Normans in Normandy?,94,117,10th and 11th centuries,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."


In [46]:
# Row counts per flag value (rows are answer spans, not distinct questions).
df["is_impossible"].value_counts()

0    20302
1     5930
Name: is_impossible, dtype: int64

In [47]:
%%time
# Combined character count of context and question, stored compactly as int16.
df["cq_length"] = (
    df["context"].str.len() + df["question"].str.len()
).astype(np.int16)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 15.2 ms


In [48]:
%%time
# Encode every (context, question) pair of the current frame; no truncation is requested.
enc = tokenizer(df["context"].tolist(), df["question"].tolist())
print(repr(enc.keys()), f"len={len(enc['input_ids'])}", sep="\n")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=26232
CPU times: user 13.2 s, sys: 46.9 ms, total: 13.2 s
Wall time: 4.24 s


In [49]:
%%time
# Number of token ids produced for each encoded pair, stored as int16.
df["cq_token_length"] = np.array(
    [len(ids) for ids in enc["input_ids"]]
).astype(np.int16)

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 9.61 ms


In [50]:
# Character-length distribution of the pairs whose token count lies in (499, 512].
index = df.loc[df["cq_token_length"].gt(499) & df["cq_token_length"].le(512)].index
df["cq_length"].loc[index].describe(percentiles=percentiles)

count      11.000000
mean     2491.909091
std        50.020905
min      2421.000000
1%       2421.000000
5%       2421.000000
10%      2421.000000
20%      2421.000000
30%      2495.000000
40%      2500.000000
50%      2500.000000
60%      2500.000000
70%      2509.000000
80%      2548.000000
90%      2548.000000
95%      2548.000000
99%      2548.000000
max      2548.000000
Name: cq_length, dtype: float64

In [51]:
# Character-length distribution of the pairs whose token count lies in (246, 256].
index = df.loc[df["cq_token_length"].gt(246) & df["cq_token_length"].le(256)].index
df["cq_length"].loc[index].describe(percentiles=percentiles)

count     559.000000
mean     1177.529517
std       102.572968
min       917.000000
1%        930.000000
5%        967.000000
10%      1042.000000
20%      1121.000000
30%      1134.000000
40%      1154.000000
50%      1176.000000
60%      1224.000000
70%      1244.000000
80%      1269.000000
90%      1304.000000
95%      1334.000000
99%      1361.000000
max      1368.000000
Name: cq_length, dtype: float64

In [52]:
# Character-length distribution of the pairs whose token count lies in (118, 128].
index = df.loc[df["cq_token_length"].gt(118) & df["cq_token_length"].le(128)].index
df["cq_length"].loc[index].describe(percentiles=percentiles)

count    2043.000000
mean      616.722467
std        42.649057
min       502.000000
1%        540.000000
5%        555.000000
10%       564.200000
20%       579.000000
30%       589.000000
40%       601.000000
50%       612.000000
60%       625.000000
70%       639.000000
80%       656.000000
90%       674.000000
95%       691.000000
99%       714.000000
max       769.000000
Name: cq_length, dtype: float64

In [53]:
%%time
# Create the target directory first so the write also works on a fresh checkout.
Path("output/squad").mkdir(parents=True, exist_ok=True)
# Persist the flattened dev frame (including the length columns) without the index.
df.to_parquet("output/squad/val.parquet", index=False)

CPU times: user 31.2 ms, sys: 31.2 ms, total: 62.5 ms
Wall time: 87.8 ms
