In [1]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm
from typing import Iterable, Callable
from questionanswering import squad

In [2]:
# Percentile cut points (1% .. 99%) intended for DataFrame.describe(percentiles=...).
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Display/behaviour options, applied in one call as (option, value) pairs.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases —
# confirm it is still accepted by the pandas version this notebook is pinned to.
pd.set_option(
    "use_inf_as_na", True,
    "max_info_columns", 9999,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    'max_colwidth', 9999,
)

# Registers DataFrame/Series.progress_apply, used by the word-count cells.
tqdm.pandas()

# Environment flag for the tokenizers library; set to "false" here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Parse the dev JSON into a DataFrame via the project-local `squad` helper,
# then print the column/dtype summary.
# NOTE(review): the parsed frame contains repeated ids (they are de-duplicated
# later) — presumably one row per reference answer; confirm in
# squad.parse_json_file.
df = squad.parse_json_file("input/dev-v2.0.json")
df.info()

100%|███████████████████████████████████| 35/35 [00:01<00:00, 18.18it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26247 entries, 0 to 26246
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            26247 non-null  object
 1   title         26247 non-null  object
 2   question      26247 non-null  object
 3   answer_text   26247 non-null  object
 4   answer_start  26247 non-null  int16 
 5   context       26247 non-null  object
dtypes: int16(1), object(5)
memory usage: 1.1+ MB





In [4]:
# The dev set repeats ids; keep only the first row seen for each id and
# renumber the rows.
df = df.drop_duplicates(subset=["id"], ignore_index=True)
assert len(df) == 11873

# Index by id while still keeping "id" available as a regular column.
df = df.set_index("id", drop=False)

# Plain Python lists of the text columns, in row order.
questions = df["question"].tolist()
contexts = df["context"].tolist()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11873 entries, 56ddde6b9a695914005b9628 to 5ad28ad0d7d075001a4299cf
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            11873 non-null  object
 1   title         11873 non-null  object
 2   question      11873 non-null  object
 3   answer_text   11873 non-null  object
 4   answer_start  11873 non-null  int16 
 5   context       11873 non-null  object
dtypes: int16(1), object(5)
memory usage: 579.7+ KB


In [5]:
# Preview the first rows to eyeball the parsed columns and the text
# normalisation applied to questions, answers and contexts.
df.head()

Unnamed: 0_level_0,id,title,question,answer_text,answer_start,context
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
56ddde6b9a695914005b9628,56ddde6b9a695914005b9628,Normans,in what country is normandy located?,france,163,"the normans norman : nourmands ; french : normands ; latin : normanni were the people who in the 10th and 11th centuries gave their name to normandy , a region in france . they were descended from norse norman comes from norseman raiders and pirates from denmark , iceland and norway who , under their leader rollo , agreed to swear fealty to king charles iii of west francia . through generations of assimilation and mixing with the native frankish and roman - gaulish populations , their descendants would gradually merge with the carolingian - based cultures of west francia . the distinct cultural and ethnic identity of the normans emerged initially in the first half of the 10th century , and it continued to evolve over the succeeding centuries ."
56ddde6b9a695914005b9629,56ddde6b9a695914005b9629,Normans,when were the normans in normandy?,10th and 11th centuries,97,"the normans norman : nourmands ; french : normands ; latin : normanni were the people who in the 10th and 11th centuries gave their name to normandy , a region in france . they were descended from norse norman comes from norseman raiders and pirates from denmark , iceland and norway who , under their leader rollo , agreed to swear fealty to king charles iii of west francia . through generations of assimilation and mixing with the native frankish and roman - gaulish populations , their descendants would gradually merge with the carolingian - based cultures of west francia . the distinct cultural and ethnic identity of the normans emerged initially in the first half of the 10th century , and it continued to evolve over the succeeding centuries ."
56ddde6b9a695914005b962a,56ddde6b9a695914005b962a,Normans,from which countries did the norse originate?,"denmark , iceland and norway",255,"the normans norman : nourmands ; french : normands ; latin : normanni were the people who in the 10th and 11th centuries gave their name to normandy , a region in france . they were descended from norse norman comes from norseman raiders and pirates from denmark , iceland and norway who , under their leader rollo , agreed to swear fealty to king charles iii of west francia . through generations of assimilation and mixing with the native frankish and roman - gaulish populations , their descendants would gradually merge with the carolingian - based cultures of west francia . the distinct cultural and ethnic identity of the normans emerged initially in the first half of the 10th century , and it continued to evolve over the succeeding centuries ."
56ddde6b9a695914005b962b,56ddde6b9a695914005b962b,Normans,who was the norse leader?,rollo,309,"the normans norman : nourmands ; french : normands ; latin : normanni were the people who in the 10th and 11th centuries gave their name to normandy , a region in france . they were descended from norse norman comes from norseman raiders and pirates from denmark , iceland and norway who , under their leader rollo , agreed to swear fealty to king charles iii of west francia . through generations of assimilation and mixing with the native frankish and roman - gaulish populations , their descendants would gradually merge with the carolingian - based cultures of west francia . the distinct cultural and ethnic identity of the normans emerged initially in the first half of the 10th century , and it continued to evolve over the succeeding centuries ."
56ddde6b9a695914005b962c,56ddde6b9a695914005b962c,Normans,what century did the normans first gain their separate identity?,10th century,680,"the normans norman : nourmands ; french : normands ; latin : normanni were the people who in the 10th and 11th centuries gave their name to normandy , a region in france . they were descended from norse norman comes from norseman raiders and pirates from denmark , iceland and norway who , under their leader rollo , agreed to swear fealty to king charles iii of west francia . through generations of assimilation and mixing with the native frankish and roman - gaulish populations , their descendants would gradually merge with the carolingian - based cultures of west francia . the distinct cultural and ethnic identity of the normans emerged initially in the first half of the 10th century , and it continued to evolve over the succeeding centuries ."


In [6]:
# Boolean flag per row for answer_start == -1, rebuilt on a positional index.
# NOTE(review): -1 presumably marks questions without an answer — confirm
# against the squad parser.
s = pd.Series(df["answer_start"].eq(-1).to_numpy())
assert s.value_counts()[True] == 5945
# Share of flagged vs. unflagged rows.
s.value_counts(normalize=True)

True     0.500716
False    0.499284
dtype: float64

# Character, word, and token lengths

In [7]:
%%time
col = "qc_length"
# Combined character count of question + context, stored as 32-bit integers.
df[col] = (df["question"].str.len() + df["context"].str.len()).astype(np.int32)

Wall time: 8 ms


In [8]:
%%time
col = "a_length"
# Character count of the answer text, stored as 32-bit integers.
df[col] = df["answer_text"].str.len().astype(np.int32)

Wall time: 5.97 ms


In [9]:
def word_length(cols: Iterable[str]) -> Callable[..., int]:
    """Build a row function that counts whitespace-separated words in ``cols``.

    Args:
        cols: Names of the row fields whose text should be counted. The
            iterable is copied once up front, so a one-shot iterable such as
            a generator applies to every row instead of only the first one.

    Returns:
        A function taking a row (anything indexable by the given names, e.g.
        a mapping or the row passed by ``DataFrame.apply(axis=1)``) and
        returning the total number of ``str.split()`` tokens across those
        fields. With no names the result is 0.
    """
    # Snapshot the names: the closure below runs once per row and must be
    # able to walk the same names every time.
    names = tuple(cols)

    def f(row) -> int:
        return sum(len(row[name].split()) for name in names)

    return f

In [10]:
%%time
col = "qc_word_length"
# Row-wise word count over question + context (with a tqdm progress bar),
# stored as 32-bit integers.
df[col] = df.progress_apply(
    word_length(["question", "context"]), axis=1
).astype(np.int32)

100%|██████████████████████████| 11873/11873 [00:00<00:00, 48978.58it/s]

Wall time: 246 ms





In [11]:
%%time
col = "a_word_length"
# Row-wise word count of the answer text (with a tqdm progress bar),
# stored as 32-bit integers.
df[col] = df.progress_apply(
    word_length(["answer_text"]), axis=1
).astype(np.int32)

100%|█████████████████████████| 11873/11873 [00:00<00:00, 104115.30it/s]

Wall time: 117 ms





In [12]:
#pretrained_dir = "../pretrained/google/electra-small-discriminator"
#tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=512)
#print(f"{repr(tokenizer)}\n{tokenizer.model_input_names}")

In [13]:
#pretrained_dir = "../pretrained/albert-base-v2"
#sp_tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=512)
#print(f"{repr(sp_tokenizer)}\n{sp_tokenizer.model_input_names}")

In [14]:
#pretrained_dir = "../pretrained/distilroberta-base"
#bpe_tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=512)
#print(f"{repr(bpe_tokenizer)}\n{bpe_tokenizer.model_input_names}")

In [15]:
#%%time
#x = tokenizer(questions, contexts)
#print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
#col = "qc_wp_length"
#df[col] = [len(v) for v in x["input_ids"]]
#df[col] = df[col].astype(np.int16)

In [16]:
#%%time
#x = sp_tokenizer(questions, contexts)
#print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
#col = "qc_sp_length"
#df[col] = [len(v) for v in x["input_ids"]]
#df[col] = df[col].astype(np.int16)

In [17]:
#%%time
#x = bpe_tokenizer(questions, contexts)
#print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")
#col = "qc_bpe_length"
#df[col] = [len(v) for v in x["input_ids"]]
#df[col] = df[col].astype(np.int16)

In [18]:
#cols = ["qc_length", "a_length", "qc_word_length", "a_word_length", 
#        "qc_wp_length", "qc_sp_length", "qc_bpe_length"]
#df[cols].describe(percentiles=percentiles)

In [19]:
# Column/dtype summary after adding the length features, as a check before export.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11873 entries, 56ddde6b9a695914005b9628 to 5ad28ad0d7d075001a4299cf
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              11873 non-null  object
 1   title           11873 non-null  object
 2   question        11873 non-null  object
 3   answer_text     11873 non-null  object
 4   answer_start    11873 non-null  int16 
 5   context         11873 non-null  object
 6   qc_length       11873 non-null  int32 
 7   a_length        11873 non-null  int32 
 8   qc_word_length  11873 non-null  int32 
 9   a_word_length   11873 non-null  int32 
dtypes: int16(1), int32(4), object(5)
memory usage: 765.3+ KB


In [20]:
%%time
# Create the target directory if needed so the export also works on a fresh
# checkout; the parquet writer does not create missing parent directories.
os.makedirs("output", exist_ok=True)
# index=False leaves the id index out of the file; the ids are still written
# through the regular "id" column.
df.to_parquet("output/dev.parquet", index=False)

Wall time: 49 ms
