In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Provides an example of using an AssessmentDataDownloader

In [8]:
from deh.dl.squad import SquadAssessmentDataDownloader

# Downloader instance shared by the cells below. cache_dir is handed straight
# to the constructor (presumably the directory where downloaded files are
# kept -- TODO confirm against SquadAssessmentDataDownloader).
dl = SquadAssessmentDataDownloader(
    cache_dir="../../data/qa_dl_cache",
)

In [7]:
# Export targets, relative to the notebook's working directory.
contexts_dir = "../../data/contexts/"
qas_path = "../../data/qas/qas.tsv"

# Ask the downloader to save the contexts and the question/answer pairs
# (presumably written to the given locations -- TODO confirm in deh.dl.squad).
dl.save_contexts(contexts_dir)
dl.save_question_answers(qas_path)

#### JSON file experimentation

In [2]:
import json

cache_file_path = "../../data/qa_dl_cache/dev-v2.0.json"

# Decode explicitly as UTF-8 (the standard encoding for JSON text). Without
# the argument, open() falls back to the platform locale encoding, which can
# fail on or garble the non-ASCII characters in the dataset on some systems.
with open(cache_file_path, encoding="utf-8") as json_file:
    json_data = json.load(json_file)

In [10]:
# Flatten the loaded JSON into a table with one row per answer record.

import pandas as pd

# Maps the flattened column names to the short names used from here on; the
# order of the dict's values is also the column order of the final frame.
selector = {
    "paragraphs.qas.question": "question",
    "text": "answer",
    "paragraphs.context": "context",
    "paragraphs.qas.is_impossible": "is_impossible",
}

df_table = (
    pd.json_normalize(
        json_data["data"],
        # Descend to the individual answer records ...
        record_path=["paragraphs", "qas", "answers"],
        # ... and carry these enclosing fields along on every row.
        meta=[
            "title",
            ["paragraphs", "context"],
            ["paragraphs", "qas", "question"],
            ["paragraphs", "qas", "is_impossible"],
        ],
    )
    .rename(columns=selector)
    # Keep only the renamed columns (this drops e.g. "title").
    .loc[:, list(selector.values())]
)

df_table.head(5)

Unnamed: 0,question,answer,context,is_impossible
0,In what country is Normandy located?,France,The Normans (Norman: Nourmands; French: Norman...,False
1,In what country is Normandy located?,France,The Normans (Norman: Nourmands; French: Norman...,False
2,In what country is Normandy located?,France,The Normans (Norman: Nourmands; French: Norman...,False
3,In what country is Normandy located?,France,The Normans (Norman: Nourmands; French: Norman...,False
4,When were the Normans in Normandy?,10th and 11th centuries,The Normans (Norman: Nourmands; French: Norman...,False


In [12]:
import pandas as pd

# Column rename applied after flattening.
rename = {"paragraphs.context": "context"}

# Flatten to one row per question record (record_path stops at "qas"),
# carrying the article title and the paragraph context on every row.
impossible_df = pd.json_normalize(
    json_data["data"],
    record_path=["paragraphs", "qas"],
    meta=["title", ["paragraphs", "context"]],
)

# Every row gets an empty-string answer so the column layout matches the
# question/answer/context/is_impossible layout used for the answer table.
impossible_df["answer"] = ""

# Keep only the rows flagged is_impossible, restricted to the four columns.
impossible_df = impossible_df.rename(columns=rename).loc[
    lambda frame: frame["is_impossible"] == True,
    ["question", "answer", "context", "is_impossible"],
]

Unnamed: 0,question,answer,context,is_impossible
5,Who gave their name to Normandy in the 1000's ...,,The Normans (Norman: Nourmands; French: Norman...,True
6,What is France a region of?,,The Normans (Norman: Nourmands; French: Norman...,True
7,Who did King Charles III swear fealty to?,,The Normans (Norman: Nourmands; French: Norman...,True
8,When did the Frankish identity emerge?,,The Normans (Norman: Nourmands; French: Norman...,True
12,What type of major impact did the Norman dynas...,,"The Norman dynasty had a major political, cult...",True
...,...,...,...,...
11863,What does not change macroscopic closed systems?,,The connection between macroscopic nonconserva...,True
11869,What does not have a metric counterpart?,,"The pound-force has a metric counterpart, less...",True
11870,What is the force exerted by standard gravity ...,,"The pound-force has a metric counterpart, less...",True
11871,What force leads to a commonly used unit of mass?,,"The pound-force has a metric counterpart, less...",True


In [17]:
# Append the impossible questions to the answer table.
#
# The cell rebinds df_table from its own previous value, so a plain concat
# would append impossible_df again on every re-run. Rows already flagged
# is_impossible are therefore dropped first, which makes the cell idempotent.
# NOTE(review): assumes the answer-level table holds no is_impossible rows of
# its own before the first run -- confirm against the dataset.
answerable_rows = df_table[df_table["is_impossible"] != True]
df_table = pd.concat([answerable_rows, impossible_df], ignore_index=True)

In [20]:
# Create context id: rows with the same context text fall into the same group,
# and ngroup() gives each group one integer, so equal contexts share an id.
context_groups = df_table.groupby(["context"])
df_table["context_id"] = context_groups.ngroup()

# Preview the first 100 rows ordered by the new id.
df_table.sort_values("context_id").head(100)

Unnamed: 0,question,answer,context,is_impossible,context_id
32878,"The term ""southern"" California usually refers ...",,"""Southern California"" is not a formal geograph...",False,0
32876,"Geographically speaking, where is California's...",,"""Southern California"" is not a formal geograph...",False,0
32877,How many miles south of San Jose is the north ...,,"""Southern California"" is not a formal geograph...",False,0
32879,"Other than Point Conception, what landmark is ...",,"""Southern California"" is not a formal geograph...",False,0
32880,Point Conception is an example of a landmark a...,,"""Southern California"" is not a formal geograph...",False,0
...,...,...,...,...,...
16996,Western Imperialism divided the globe accordin...,world systems theory,"""The word ‘empire’ comes from the Latin word i...",False,2
16990,The amount of land a country controls is its g...,distinction,"""The word ‘empire’ comes from the Latin word i...",False,2
30578,Most imperialism was carried out using which m...,,"""The word ‘empire’ comes from the Latin word i...",False,2
42450,One country's authority over a number of other...,,"""The word ‘empire’ comes from the Latin word i...",False,2


In [27]:
# Drop rows that are exact duplicates across all columns, then show the
# fraction of rows that remain.
de_dupe_df = df_table.drop_duplicates()
rows_kept, rows_total = len(de_dupe_df), len(df_table)
rows_kept / rows_total

0.5115752142646045

In [29]:
# Preview the first ten de-duplicated rows ordered by context id.
de_dupe_df.sort_values("context_id").head(10)

Unnamed: 0,text,answer_start,title,paragraphs.context,paragraphs.qas.question,paragraphs.qas.is_impossible,context_id
1007,"37° 9' 58.23""",194,Southern_California,"""Southern California"" is not a formal geograph...","Geographically speaking, where is California's...",False,0
1010,11,225,Southern_California,"""Southern California"" is not a formal geograph...",How many miles south of San Jose is the north ...,False,0
1013,ten,453,Southern_California,"""Southern California"" is not a formal geograph...","The term ""southern"" California usually refers ...",False,0
1016,Tehachapi Mountains,740,Southern_California,"""Southern California"" is not a formal geograph...","Other than Point Conception, what landmark is ...",False,0
1018,northern,767,Southern_California,"""Southern California"" is not a formal geograph...",Point Conception is an example of a landmark a...,False,0
1019,the Tehachapi Mountains,736,Southern_California,"""Southern California"" is not a formal geograph...",Point Conception is an example of a landmark a...,False,0
16834,Wahhabi/Salafi jihadist extremist militant,190,Islamism,"""The Islamic State"", formerly known as the ""Is...",What type of group is The Islamic State?,False,1
16847,international recognition,660,Islamism,"""The Islamic State"", formerly known as the ""Is...",What does the Islamic State lack from the inte...,False,1
16846,recognition,674,Islamism,"""The Islamic State"", formerly known as the ""Is...",What does the Islamic State lack from the inte...,False,1
16843,ten million,506,Islamism,"""The Islamic State"", formerly known as the ""Is...",How many people did the Islamic State control ...,False,1


In [36]:
# Collapse to one row per context_id, keeping the first context text seen for
# each id. The source column is "context": the normalisation cell renames
# "paragraphs.context" to "context", so aggregating the old name raises a
# KeyError when the notebook is run top to bottom on a fresh kernel.
de_dupe_df.groupby("context_id", as_index=False).agg(
    context=("context", "first"),
)

Unnamed: 0,context_id,context
0,0,"""Southern California"" is not a formal geograph..."
1,1,"""The Islamic State"", formerly known as the ""Is..."
2,2,"""The word ‘empire’ comes from the Latin word i..."
3,3,2013 Economics Nobel prize winner Robert J. Sh...
4,4,A B cell identifies pathogens when antibodies ...
...,...,...
1199,1199,are prime for any natural number n. Here repr...
1200,1200,are prime. Prime numbers of this form are know...
1201,1201,can have infinitely many primes only when a an...
1202,1202,"where is the mass of the object, is the velo..."
