# Set up data for NLP processing

In [4]:
# The sample files read by this notebook were presumably created with these
# shell commands (kept here for reference; they are not executed):
# head -10000 ../data/simplified/final_data.jsonl > final_data_10K.jsonl
# head -5000 ../data/simplified/final_data.jsonl > final_data_5K.jsonl

import json
import pandas as pd

# Let pandas show up to 200 characters per cell before truncating with "...".
pd.set_option('display.max_colwidth', 200)

## Import data as a list

In [5]:
def read_jsonl_list(input_path) -> list:
    """
    Read a list of objects from a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON document.
    Blank or whitespace-only lines (for example a trailing newline at the
    end of the file) are skipped instead of raising a decode error.

    Args:
        input_path: Path of a UTF-8 encoded JSON Lines file.

    Returns:
        The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # str.strip() removes surrounding whitespace only. The previous
            # rstrip('\n|\r') treated its argument as a character set and so
            # also removed trailing '|' characters by accident.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print(f'Loaded {len(data)} records from {input_path}')
    return data

## Import data as a pandas DataFrame

In [6]:
def read_jsonl_df(input_path) -> pd.DataFrame:
    """
    Read a JSON Lines file into a flattened pandas DataFrame.

    Each non-blank line is parsed as one JSON object and normalized twice:
    once as a whole record, and once over its ``annotations`` list, whose
    flattened fields get the ``Ann.`` column prefix. The two results are
    placed side by side and the per-line frames are stacked vertically.

    The ``long_answer_candidates`` list is not expanded; it stays as a single
    column holding the raw list.

    Args:
        input_path: Path of a UTF-8 encoded JSON Lines file. Every record
            must contain an ``annotations`` key.

    Returns:
        A DataFrame with the rows of all lines. The index labels of the
        per-line frames are kept as they are (they are not renumbered), so
        labels repeat across lines. An empty file yields an empty DataFrame.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``annotations`` key.
    """
    frames = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of failing.
            line = line.strip()
            if not line:
                continue
            line_data = json.loads(line)
            line_data_norm = pd.json_normalize(line_data)
            line_data_norm_annotations = pd.json_normalize(
                line_data, record_path='annotations', record_prefix='Ann.',
            )
            # Record-level columns next to the flattened annotation columns.
            frames.append(
                pd.concat([line_data_norm, line_data_norm_annotations], axis=1)
            )

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all accumulated rows on every iteration.
    data_norm = pd.concat(frames) if frames else pd.DataFrame()
    print(f'Normalized into {len(data_norm)} records from {input_path}.')
    return data_norm

In [7]:
# Load the 10K-line sample as a flattened DataFrame.
# For the raw objects as a plain list, use read_jsonl_list() on the same file.
nq_df = read_jsonl_df('final_data_10K.jsonl')

Normalized into 10000 records from final_data_10K.jsonl.


In [12]:
# Show only the question and document text columns of the loaded frame.
preview_columns = ['question_text', 'document_text']
nq_df[preview_columns]

Unnamed: 0,question_text,document_text
0,london bridges falling down falling down falling down london bridge is falling down,"London Bridge is Falling down - wikipedia <H1> London Bridge is Falling down </H1> <P> </P> <Table> <Tr> <Th_colspan=""2""> `` London Bridge Is Falling Down '' </Th> </Tr> <Tr> <Td_colspan=""2""> Illu..."
0,how high were the winds from hurricane matthew,"Hurricane Matthew - wikipedia <H1> Hurricane Matthew </H1> This is the latest accepted revision , reviewed on 11 December 2017 . Jump to : navigation , search For other storms of the same name , s..."
0,diana in her own words channel 4 narrator,"Diana : in her Own Words - wikipedia <H1> Diana : in her Own Words </H1> Jump to : navigation , search <P> Diana : In Her Own Words is a television documentary that was broadcast on Channel 4 in t..."
0,where did dierks bentley go to high school,"Dierks Bentley - wikipedia <H1> Dierks Bentley </H1> Jump to : navigation , search <Table> <Tr> <Th_colspan=""2""> Dierks Bentley </Th> </Tr> <Tr> <Td_colspan=""2""> Bentley in April 2010 </Td> </Tr> ..."
0,all india rank of ajay kumar garg engineering college,"Ajay Kumar Garg Engineering college - Wikipedia <H1> Ajay Kumar Garg Engineering college </H1> Jump to : navigation , search <Table> <Tr> <Td> </Td> <Td> This article relies too much on references..."
...,...,...
0,where to find the story of the prodigal son in the bible,"Parable of the Prodigal Son - wikipedia <H1> Parable of the Prodigal Son </H1> Jump to : navigation , search `` Prodigal son '' redirects here . For other uses , see The Prodigal Son ( disambiguat..."
0,who wrote kings 1 and 2 in the bible,"Books of Kings - wikipedia <H1> Books of Kings </H1> Jump to : navigation , search For other uses , see Book of Kings . `` 4 Kings '' redirects here . For the esports team , see 4Kings . <Table> <..."
0,when does new mama mia film come out,"Mamma Mia ! ( Film ) - wikipedia <H1> Mamma Mia ! ( Film ) </H1> <Table> <Tr> <Th_colspan=""2""> Mamma Mia ! </Th> </Tr> <Tr> <Td_colspan=""2""> Theatrical release poster </Td> </Tr> <Tr> <Th> Directe..."
0,who developed procedure used to measure size of an earthquake,Seismic magnitude scales - Wikipedia <H1> Seismic magnitude scales </H1> For use of earthquake magnitude scales in Wikipedia see Template : M . <Table> <Tr> <Td> Part of a series on </Td> </Tr> <T...


In [13]:
import re 

In [21]:
# Build the working frame `df` without these columns. drop() returns a new
# frame, so `nq_df` itself is left unchanged.
columns_to_drop = [
    'document_url',
    'long_answer_candidates',
    'annotations',
    'Ann.yes_no_answer',
    'Ann.short_answers',
    'Ann.long_answer.candidate_index',
]
df = nq_df.drop(columns=columns_to_drop)

In [23]:
# Keep the document text as its own Series for the text-processing steps.
dc = df['document_text']

# Display the remaining columns. The result of drop() is not assigned, so
# `df` still contains `document_text` after this cell.
df.drop(columns=['document_text'])

Unnamed: 0,question_text,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token
0,london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1
0,how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136
0,diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1
0,where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822
0,all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1
...,...,...,...,...,...
0,where to find the story of the prodigal son in the bible,1085679002286572362,4.797133e+18,97,153
0,who wrote kings 1 and 2 in the bible,2606362058919009898,1.737980e+19,-1,-1
0,when does new mama mia film come out,-2943731698500887380,1.750850e+19,474,547
0,who developed procedure used to measure size of an earthquake,7968296286655798045,1.484800e+19,-1,-1


In [25]:
# Split every document on the opening paragraph tag; each row of the new
# `doc` column holds the resulting list of text pieces.
split_pattern = r'<P>'
df['doc'] = dc.str.split(pat=split_pattern)

In [26]:
# Display the working frame, including the `doc` column of split text pieces.
df

Unnamed: 0,document_text,question_text,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token,doc
0,"London Bridge is Falling down - wikipedia <H1> London Bridge is Falling down </H1> <P> </P> <Table> <Tr> <Th_colspan=""2""> `` London Bridge Is Falling Down '' </Th> </Tr> <Tr> <Td_colspan=""2""> Illu...",london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1,"[London Bridge is Falling down - wikipedia <H1> London Bridge is Falling down </H1> , </P> <Table> <Tr> <Th_colspan=""2""> `` London Bridge Is Falling Down '' </Th> </Tr> <Tr> <Td_colspan=""2""> Illu..."
0,"Hurricane Matthew - wikipedia <H1> Hurricane Matthew </H1> This is the latest accepted revision , reviewed on 11 December 2017 . Jump to : navigation , search For other storms of the same name , s...",how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136,"[Hurricane Matthew - wikipedia <H1> Hurricane Matthew </H1> This is the latest accepted revision , reviewed on 11 December 2017 . Jump to : navigation , search For other storms of the same name , ..."
0,"Diana : in her Own Words - wikipedia <H1> Diana : in her Own Words </H1> Jump to : navigation , search <P> Diana : In Her Own Words is a television documentary that was broadcast on Channel 4 in t...",diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1,"[Diana : in her Own Words - wikipedia <H1> Diana : in her Own Words </H1> Jump to : navigation , search , Diana : In Her Own Words is a television documentary that was broadcast on Channel 4 in t..."
0,"Dierks Bentley - wikipedia <H1> Dierks Bentley </H1> Jump to : navigation , search <Table> <Tr> <Th_colspan=""2""> Dierks Bentley </Th> </Tr> <Tr> <Td_colspan=""2""> Bentley in April 2010 </Td> </Tr> ...",where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822,"[Dierks Bentley - wikipedia <H1> Dierks Bentley </H1> Jump to : navigation , search <Table> <Tr> <Th_colspan=""2""> Dierks Bentley </Th> </Tr> <Tr> <Td_colspan=""2""> Bentley in April 2010 </Td> </Tr>..."
0,"Ajay Kumar Garg Engineering college - Wikipedia <H1> Ajay Kumar Garg Engineering college </H1> Jump to : navigation , search <Table> <Tr> <Td> </Td> <Td> This article relies too much on references...",all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1,"[Ajay Kumar Garg Engineering college - Wikipedia <H1> Ajay Kumar Garg Engineering college </H1> Jump to : navigation , search <Table> <Tr> <Td> </Td> <Td> This article relies too much on reference..."
...,...,...,...,...,...,...,...
0,"Parable of the Prodigal Son - wikipedia <H1> Parable of the Prodigal Son </H1> Jump to : navigation , search `` Prodigal son '' redirects here . For other uses , see The Prodigal Son ( disambiguat...",where to find the story of the prodigal son in the bible,1085679002286572362,4.797133e+18,97,153,"[Parable of the Prodigal Son - wikipedia <H1> Parable of the Prodigal Son </H1> Jump to : navigation , search `` Prodigal son '' redirects here . For other uses , see The Prodigal Son ( disambigua..."
0,"Books of Kings - wikipedia <H1> Books of Kings </H1> Jump to : navigation , search For other uses , see Book of Kings . `` 4 Kings '' redirects here . For the esports team , see 4Kings . <Table> <...",who wrote kings 1 and 2 in the bible,2606362058919009898,1.737980e+19,-1,-1,"[Books of Kings - wikipedia <H1> Books of Kings </H1> Jump to : navigation , search For other uses , see Book of Kings . `` 4 Kings '' redirects here . For the esports team , see 4Kings . <Table> ..."
0,"Mamma Mia ! ( Film ) - wikipedia <H1> Mamma Mia ! ( Film ) </H1> <Table> <Tr> <Th_colspan=""2""> Mamma Mia ! </Th> </Tr> <Tr> <Td_colspan=""2""> Theatrical release poster </Td> </Tr> <Tr> <Th> Directe...",when does new mama mia film come out,-2943731698500887380,1.750850e+19,474,547,"[Mamma Mia ! ( Film ) - wikipedia <H1> Mamma Mia ! ( Film ) </H1> <Table> <Tr> <Th_colspan=""2""> Mamma Mia ! </Th> </Tr> <Tr> <Td_colspan=""2""> Theatrical release poster </Td> </Tr> <Tr> <Th> Direct..."
0,Seismic magnitude scales - Wikipedia <H1> Seismic magnitude scales </H1> For use of earthquake magnitude scales in Wikipedia see Template : M . <Table> <Tr> <Td> Part of a series on </Td> </Tr> <T...,who developed procedure used to measure size of an earthquake,7968296286655798045,1.484800e+19,-1,-1,[Seismic magnitude scales - Wikipedia <H1> Seismic magnitude scales </H1> For use of earthquake magnitude scales in Wikipedia see Template : M . <Table> <Tr> <Td> Part of a series on </Td> </Tr> <...


In [2]:
# NOTE: this cell needs `dc` from the earlier cells. Running it on a fresh
# kernel before those cells raises NameError, so run the notebook top to bottom.


def lower_case(x):
    """Return the lower-cased version of the string `x`."""
    return x.lower()


# Lower-case every document with the vectorised string accessor.
# The previous `dc[1].apply(lower_case)` looked up the index *label* 1 (the
# rows shown above all carry label 0) and then called `.apply` on the result
# of that lookup, which only works if the lookup returns a Series.
dc.str.lower()

NameError: name 'dc' is not defined