In [1]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm

In [2]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
def _read_squad(path):
    path = Path(path)
    squad_dict = None
    with open(path) as f:
        squad_dict = json.load(f)
    rows = []
    for group in tqdm(squad_dict['data']):
        title = group["title"]
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                _id = qa["id"]
                is_impossible = qa["is_impossible"]
                question = qa['question']
                for a in qa["answers"]:
                    row = {}
                    row["id"] = _id 
                    row["is_impossible"] = is_impossible
                    row["title"] = title
                    row["question"] = question
                    row["answer_start"] = a["answer_start"]
                    i = row["answer_start"]
                    j = row["answer_start"] + len(a["text"])
                    assert a["text"] == context[i:j]
                    # first position which follows AFTER the answer span
                    row["answer_end"] = j
                    row["answer_text"] = a["text"]
                    row["context"] = context
                    rows.append(row)
                if is_impossible:
                    if "plausible_answers" in qa and len(qa["plausible_answers"]) != 0:
                        for a in qa["plausible_answers"]:
                            row = {}
                            row["id"] = _id 
                            row["is_impossible"] = is_impossible
                            row["title"] = title
                            row["question"] = question
                            row["answer_start"] = a["answer_start"]
                            i = row["answer_start"]
                            j = row["answer_start"] + len(a["text"])
                            assert a["text"] == context[i:j]
                            row["answer_end"] = j
                            row["answer_text"] = a["text"]
                            row["context"] = context
                            rows.append(row)
                    else:
                        # at least one entry for "impossible" pair
                        row = {}
                        row["id"] = _id 
                        row["is_impossible"] = is_impossible
                        row["title"] = title
                        row["question"] = question
                        row["answer_start"] = -1
                        row["answer_end"] = -1
                        row["answer_text"] = ""
                        row["context"] = context
                        rows.append(row)
    df = pd.DataFrame.from_records(rows)
    df["answer_start"] = df["answer_start"].astype(np.int16)
    df["answer_end"] = df["answer_end"].astype(np.int16)
    df["is_impossible"] = df["is_impossible"].astype(np.int8)
    return df

In [4]:
df = _read_squad("input/train-v2.0.json")
df.info()

100%|███████████████████████████████| 442/442 [00:00<00:00, 3507.84it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   is_impossible  130319 non-null  int8  
 2   title          130319 non-null  object
 3   question       130319 non-null  object
 4   answer_start   130319 non-null  int16 
 5   answer_end     130319 non-null  int16 
 6   answer_text    130319 non-null  object
 7   context        130319 non-null  object
dtypes: int16(2), int8(1), object(5)
memory usage: 5.6+ MB


In [5]:
assert len(df) == len(set(df["id"]))

In [6]:
df.head()

Unnamed: 0,id,is_impossible,title,question,answer_start,answer_end,answer_text,context
0,56be85543aeaaa14008c9063,0,Beyoncé,When did Beyonce start becoming popular?,269,286,in the late 1990s,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
1,56be85543aeaaa14008c9065,0,Beyoncé,What areas did Beyonce compete in when she was growing up?,207,226,singing and dancing,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
2,56be85543aeaaa14008c9066,0,Beyoncé,When did Beyonce leave Destiny's Child and become a solo singer?,526,530,2003,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
3,56bf6b0f3aeaaa14008c9601,0,Beyoncé,In what city and state did Beyonce grow up?,166,180,"Houston, Texas","Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."
4,56bf6b0f3aeaaa14008c9602,0,Beyoncé,In which decade did Beyonce become famous?,276,286,late 1990s,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy""."


In [7]:
df["is_impossible"].value_counts(normalize=True)

0    0.666219
1    0.333781
Name: is_impossible, dtype: float64

In [8]:
df[df["is_impossible"] == 1].sample(5).head(5)

Unnamed: 0,id,is_impossible,title,question,answer_start,answer_end,answer_text,context
115175,5a65bfbec2b11c001a425d2f,1,Antibiotics,What is one common result of using genes from a young age?,57,76,increased body mass,"Exposure to antibiotics early in life is associated with increased body mass in humans and mouse models. Early life is a critical period for the establishment of the intestinal microbiota and for metabolic development. Mice exposed to subtherapeutic antibiotic treatment (STAT)– with either penicillin, vancomycin, penicillin and vancomycin, or chlortetracycline had altered composition of the gut microbiota as well as its metabolic capabilities. Moreover, research have shown that mice given low-dose penicillin (1 μg/g body weight) around birth and throughout the weaning process had an increased body mass and fat mass, accelerated growth, and increased hepatic expression of genes involved in adipogenesis, compared to controlled mice. In addition, penicillin in combination with a high-fat diet increased fasting insulin levels in mice. However, it is unclear whether or not antibiotics cause obesity in humans. Studies have found a correlation between early exposure of antibiotics (<6 months) and increased body mass (at 10 and 20 months). Another study found that the type of antibiotic exposure was also significant with the highest risk of being overweight in those given macrolides compared to penicillin and cephalosporin. Therefore, there is correlation between antibiotic exposure in early life and obesity in humans, but whether or not there is a causal relationship remains unclear. Although there is a correlation between antibiotic use in early life and obesity, the effect of antibiotics on obesity in humans needs to be weighed against the beneficial effects of clinically indicated treatment with antibiotics in infancy."
73077,5ad417a0604f3c001a4003aa,1,Affirmative_action_in_the_United_States,What is the failure rate for black law school graduates compared to asians for the bar exam?,358,368,four times,"Richard Sander claims that by artificially elevating minority students into schools they otherwise would not be capable of attending, this discourages them and tends to engender failure and high dropout rates for these students. For example, about half of black college students rank in the bottom 20 percent of their classes, black law school graduates are four times as likely to fail bar exams as are whites, and interracial friendships are more likely to form among students with relatively similar levels of academic preparation; thus, blacks and Hispanics are more socially integrated on campuses where they are less academically mismatched. He claims that the supposed ""beneficiaries"" of affirmative action – minorities – do not actually benefit and rather are harmed by the policy. Sander's claims have been disputed, and his empirical analyses have been subject to substantial criticism. A group including some of the country's lead statistical methodologists told the Supreme Court that Sander's analyses were sufficiently flawed that the Court would be wise to ignore them entirely. At the same time many scholars have found that minorities gain substantially from affirmative action."
32136,5a361229788daf001a5f86f9,1,Himachal_Pradesh,How does the climate vary in the trans-Himalayan region?,122,270,"from hot and subhumid tropical in the southern tracts to, with more elevation, cold, alpine, and glacial in the northern and eastern mountain ranges","Due to extreme variation in elevation, great variation occurs in the climatic conditions of Himachal . The climate varies from hot and subhumid tropical in the southern tracts to, with more elevation, cold, alpine, and glacial in the northern and eastern mountain ranges. The state has areas like Dharamsala that receive very heavy rainfall, as well as those like Lahaul and Spiti that are cold and almost rainless. Broadly, Himachal experiences three seasons: summer, winter, and rainy season. Summer lasts from mid-April till the end of June and most parts become very hot (except in the alpine zone which experiences a mild summer) with the average temperature ranging from 28 to 32 °C (82 to 90 °F). Winter lasts from late November till mid March. Snowfall is common in alpine tracts (generally above 2,200 metres (7,218 ft) i.e. in the higher and trans-Himalayan region)."
63845,5ad3eb6c604f3c001a3ff728,1,Yale_University,What did President Salovey believe wouldn't happen if Calhoun's name was removed from the college?,1793,1862,"it would ""obscure"" his ""legacy of slavery rather than addressing it.""","In the wake of the racially-motivated"" church shooting in Charleston, South Carolina, Yale was under criticism again in the summer of 2015 for Calhoun College, one of 12 residential colleges, which was named after John C. Calhoun, a slave-owner and strong slavery supporter in the nineteenth century. In July 2015 students signed a petition calling for the name change. They argued in the petition that—while Calhoun was respected in the 19th century as an ""extraordinary American statesman""—he was ""one of the most prolific defenders of slavery and white supremacy"" in the history of the United States. In August 2015 Yale President Peter Salovey addressed the Freshman Class of 2019 in which he responded to the racial tensions but explained why the college would not be renamed. He described Calhoun as a ""a notable political theorist, a vice president to two different U.S. presidents, a secretary of war and of state, and a congressman and senator representing South Carolina."" He acknowledged that Calhoun also ""believed that the highest forms of civilization depend on involuntary servitude. Not only that, but he also believed that the races he thought to be inferior, black people in particular, ought to be subjected to it for the sake of their own best interests."" Racial tensions increased in the fall of 2015 centering on comments by Nicholas A. Christakis and his wife Erika regarding freedom of speech. In April 2016 Salovey announced that ""despite decades of vigorous alumni and student protests,"" Calhoun's name will remain on the Yale residential college explaining that it is preferable for Yale students to live in Calhoun's ""shadow"" so they will be ""better prepared to rise to the challenges of the present and the future."" He claimed that if they removed Calhoun's name, it would ""obscure"" his ""legacy of slavery rather than addressing it."" ""Yale is part of that history"" and ""We cannot erase American history, but we can confront it, teach it and learn from it."" One change that will be issued is the title of “master” for faculty members who serve as residential college leaders will be renamed to “head of college” due to its connotation of slavery."
7992,5ad0104777cf76001a686878,1,Republic_of_the_Congo,What did Sassou's regime not attempt to censor?,49,71,corruption revelations,"Internationally, Sassou's regime has been hit by corruption revelations despite attempts to censor them. One French investigation found over 110 bank accounts and dozens of lavish properties in France; Sassou denounced embezzlement investigations as ""racist"" and ""colonial""."


In [9]:
%%time
df["cq_length"] = df["context"].str.len() + df["question"].str.len() 
df["cq_length"] = df["cq_length"].astype(np.int16)

Wall time: 88 ms


In [10]:
pretrained_dir = "../pretrained/google/electra-small-discriminator"
model_max_length = 512
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=model_max_length)
print(f"{repr(tokenizer)}\n{tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='../pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [11]:
pretrained_dir = "../pretrained/albert-base-v2"
model_max_length = 512
sp_tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=model_max_length)
print(f"{repr(sp_tokenizer)}\n{sp_tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='../pretrained/albert-base-v2', vocab_size=30000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=False)})
['input_ids', 'token_type_ids', 'attention_mask']


In [12]:
pretrained_dir = "../pretrained/distilroberta-base"
model_max_length = 512
bpe_tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=model_max_length)
print(f"{repr(bpe_tokenizer)}\n{bpe_tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='../pretrained/distilroberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})
['input_ids', 'attention_mask']


In [13]:
%%time
x = tokenizer(list(df["context"]), list(df["question"]))
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
col = "cq_wp_length"
df[col] = [len(v) for v in x["input_ids"]]
df[col] = df[col].astype(np.int16)

Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=130319
Wall time: 56.4 s


In [14]:
%%time
x = sp_tokenizer(list(df["context"]), list(df["question"]))
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
col = "cq_sp_length"
df[col] = [len(v) for v in x["input_ids"]]
df[col] = df[col].astype(np.int16)

Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=130319
Wall time: 1min 35s


In [15]:
%%time
x = bpe_tokenizer(list(df["context"]), list(df["question"]))
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
col = "cq_bpe_length"
df[col] = [len(v) for v in x["input_ids"]]
df[col] = df[col].astype(np.int16)

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


dict_keys(['input_ids', 'attention_mask'])
len=130319
Wall time: 1min 7s


In [16]:
cols = ["cq_wp_length", "cq_sp_length", "cq_bpe_length"]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,cq_wp_length,cq_sp_length,cq_bpe_length
count,130319.0,130319.0,130319.0
mean,170.726632,174.188384,173.00574
std,65.407215,67.240044,96.779923
min,35.0,35.0,37.0
1%,55.0,56.0,57.0
5%,84.0,86.0,86.0
10%,108.0,109.0,109.0
20%,124.0,126.0,125.0
30%,135.0,137.0,136.0
40%,146.0,149.0,148.0


In [17]:
index = df[(df["cq_wp_length"] > 499) & (df["cq_wp_length"] <= 512)].index
df.loc[index, "cq_length"].describe(percentiles=percentiles)

count      29.000000
mean     2327.482759
std       174.631167
min      2174.000000
1%       2174.000000
5%       2174.400000
10%      2175.000000
20%      2187.600000
30%      2229.800000
40%      2238.000000
50%      2272.000000
60%      2287.600000
70%      2295.000000
80%      2496.000000
90%      2571.200000
95%      2715.200000
99%      2741.600000
max      2750.000000
Name: cq_length, dtype: float64

In [18]:
index = df[(df["cq_wp_length"] > 246) & (df["cq_wp_length"] <= 256)].index
df.loc[index, "cq_length"].describe(percentiles=percentiles)

count    2394.000000
mean     1185.904344
std       108.709116
min       678.000000
1%        854.930000
5%       1000.000000
10%      1052.300000
20%      1107.600000
30%      1144.000000
40%      1169.000000
50%      1191.500000
60%      1216.800000
70%      1243.000000
80%      1278.000000
90%      1314.700000
95%      1346.350000
99%      1397.140000
max      1465.000000
Name: cq_length, dtype: float64

In [19]:
index = df[(df["cq_wp_length"] > 118) & (df["cq_wp_length"] <= 128)].index
df.loc[index, "cq_length"].describe(percentiles=percentiles)

count    10876.000000
mean       607.685546
std         46.221126
min        332.000000
1%         485.000000
5%         543.000000
10%        556.000000
20%        572.000000
30%        584.000000
40%        595.000000
50%        606.000000
60%        617.000000
70%        629.000000
80%        644.000000
90%        666.000000
95%        685.000000
99%        723.000000
max        796.000000
Name: cq_length, dtype: float64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   is_impossible  130319 non-null  int8  
 2   title          130319 non-null  object
 3   question       130319 non-null  object
 4   answer_start   130319 non-null  int16 
 5   answer_end     130319 non-null  int16 
 6   answer_text    130319 non-null  object
 7   context        130319 non-null  object
 8   cq_length      130319 non-null  int16 
 9   cq_wp_length   130319 non-null  int16 
 10  cq_sp_length   130319 non-null  int16 
 11  cq_bpe_length  130319 non-null  int16 
dtypes: int16(6), int8(1), object(5)
memory usage: 6.6+ MB


In [21]:
%%time
df.to_parquet("output/train.parquet", index=False)

Wall time: 450 ms


# Dev set

In [22]:
df = _read_squad("input/dev-v2.0.json")
df.info()

100%|█████████████████████████████████| 35/35 [00:00<00:00, 1521.39it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26247 entries, 0 to 26246
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             26247 non-null  object
 1   is_impossible  26247 non-null  int8  
 2   title          26247 non-null  object
 3   question       26247 non-null  object
 4   answer_start   26247 non-null  int16 
 5   answer_end     26247 non-null  int16 
 6   answer_text    26247 non-null  object
 7   context        26247 non-null  object
dtypes: int16(2), int8(1), object(5)
memory usage: 1.1+ MB





In [23]:
df.head()

Unnamed: 0,id,is_impossible,title,question,answer_start,answer_end,answer_text,context
0,56ddde6b9a695914005b9628,0,Normans,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
1,56ddde6b9a695914005b9628,0,Normans,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
2,56ddde6b9a695914005b9628,0,Normans,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
3,56ddde6b9a695914005b9628,0,Normans,In what country is Normandy located?,159,165,France,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
4,56ddde6b9a695914005b9629,0,Normans,When were the Normans in Normandy?,94,117,10th and 11th centuries,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."


In [24]:
df["is_impossible"].value_counts(normalize=True)

0    0.773498
1    0.226502
Name: is_impossible, dtype: float64

In [25]:
%%time
df["cq_length"] = df["context"].str.len() + df["question"].str.len() 
df["cq_length"] = df["cq_length"].astype(np.int16)

Wall time: 21 ms


In [26]:
%%time
x = tokenizer(list(df["context"]), list(df["question"]))
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
col = "cq_wp_length"
df[col] = [len(v) for v in x["input_ids"]]
df[col] = df[col].astype(np.int16)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=26247
Wall time: 14.3 s


In [27]:
%%time
x = sp_tokenizer(list(df["context"]), list(df["question"]))
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
col = "cq_sp_length"
df[col] = [len(v) for v in x["input_ids"]]
df[col] = df[col].astype(np.int16)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=26247
Wall time: 22.1 s


In [28]:
%%time
x = bpe_tokenizer(list(df["context"]), list(df["question"]))
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
col = "cq_bpe_length"
df[col] = [len(v) for v in x["input_ids"]]
df[col] = df[col].astype(np.int16)

dict_keys(['input_ids', 'attention_mask'])
len=26247
Wall time: 14.6 s


In [29]:
cols = ["cq_wp_length", "cq_sp_length", "cq_bpe_length"]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,cq_wp_length,cq_sp_length,cq_bpe_length
count,26247.0,26247.0,26247.0
mean,181.900865,185.490456,182.890807
std,74.241469,76.358966,73.820075
min,41.0,41.0,42.0
1%,74.0,75.0,76.0
5%,110.0,111.0,111.0
10%,119.0,122.0,120.0
20%,131.0,133.0,132.0
30%,140.0,143.0,141.0
40%,151.0,153.0,152.0


In [30]:
index = df[(df["cq_wp_length"] > 499) & (df["cq_wp_length"] <= 512)].index
df.loc[index, "cq_length"].describe(percentiles=percentiles)

count      11.000000
mean     2491.909091
std        50.020905
min      2421.000000
1%       2421.000000
5%       2421.000000
10%      2421.000000
20%      2421.000000
30%      2495.000000
40%      2500.000000
50%      2500.000000
60%      2500.000000
70%      2509.000000
80%      2548.000000
90%      2548.000000
95%      2548.000000
99%      2548.000000
max      2548.000000
Name: cq_length, dtype: float64

In [31]:
index = df[(df["cq_wp_length"] > 246) & (df["cq_wp_length"] <= 256)].index
df.loc[index, "cq_length"].describe(percentiles=percentiles)

count     559.000000
mean     1177.529517
std       102.572968
min       917.000000
1%        930.000000
5%        967.000000
10%      1042.000000
20%      1121.000000
30%      1134.000000
40%      1154.000000
50%      1176.000000
60%      1224.000000
70%      1244.000000
80%      1269.000000
90%      1304.000000
95%      1334.000000
99%      1361.000000
max      1368.000000
Name: cq_length, dtype: float64

In [32]:
index = df[(df["cq_wp_length"] > 118) & (df["cq_wp_length"] <= 128)].index
df.loc[index, "cq_length"].describe(percentiles=percentiles)

count    2044.000000
mean      616.679061
std        42.683754
min       502.000000
1%        537.720000
5%        555.000000
10%       564.000000
20%       579.000000
30%       589.000000
40%       601.000000
50%       612.000000
60%       625.000000
70%       639.000000
80%       656.000000
90%       674.000000
95%       691.000000
99%       714.000000
max       769.000000
Name: cq_length, dtype: float64

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26247 entries, 0 to 26246
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             26247 non-null  object
 1   is_impossible  26247 non-null  int8  
 2   title          26247 non-null  object
 3   question       26247 non-null  object
 4   answer_start   26247 non-null  int16 
 5   answer_end     26247 non-null  int16 
 6   answer_text    26247 non-null  object
 7   context        26247 non-null  object
 8   cq_length      26247 non-null  int16 
 9   cq_wp_length   26247 non-null  int16 
 10  cq_sp_length   26247 non-null  int16 
 11  cq_bpe_length  26247 non-null  int16 
dtypes: int16(6), int8(1), object(5)
memory usage: 1.3+ MB


In [34]:
%%time
df.to_parquet("output/dev.parquet", index=False)

Wall time: 56.2 ms
