# Setup data for NLP processing

In [128]:
# head -10000 ../data/simplified/final_data.jsonl > final_data_10K.jsonl
# head -5000 ../data/simplified/final_data.jsonl > final_data_5K.jsonl

import json
import pandas as pd
from pandas.io.json import json_normalize

pd.set_option('display.max_colwidth', 200)

## Import data as a list

In [129]:
def read_jsonl_list(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line_data = json.loads(line.rstrip('\n|\r'))
            data.append(line_data)
    print(f'Loaded {len(data)} records from {input_path}')
    return data

## Import data as a pandas data frame

In [130]:
def read_jsonl_df(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data_norm = pd.DataFrame()
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line_data = json.loads(line.rstrip('\n|\r'))
            line_data_norm = json_normalize(line_data)
            line_data_norm_annotations = json_normalize(
              line_data, record_path='annotations', record_prefix='Ann.',
            )
            # line_data_norm_answ_candidates = pd.io.json.json_normalize(
            #   line_data, record_path='long_answer_candidates', record_prefix='AnsCand.',
            # )
            line_data_norm = pd.concat(
                [
                  line_data_norm,
                  line_data_norm_annotations
                  # , line_data_norm_answ_candidates
                ]
                , axis=1
            )
            
            data_norm = pd.concat([data_norm, line_data_norm])
    print(f'Normalized into {len(data_norm)} records from {input_path}.')
    return data_norm

In [131]:
# nq_data = read_jsonl(f'final_data_10K.jsonl')
nq_df = read_jsonl_df(f'final_data_5K.jsonl')

Normalized into 5000 records from final_data_5K.jsonl.


In [132]:
df = nq_df.drop(
    ['document_url', 'long_answer_candidates', 'annotations', 'Ann.yes_no_answer', 'Ann.short_answers',
        'Ann.long_answer.candidate_index'
    ]
    , axis=1
)

In [133]:
lower_case = lambda x: x.lower()
df['doc'] = df['document_text'].apply(lower_case)
df['query'] = df['question_text'].apply(lower_case)
df = df.drop(['document_text', 'question_text'], axis=1)

In [134]:
cols = list(df.columns)
cols = cols[-2:] + cols[:-2]
cols

['doc',
 'query',
 'example_id',
 'Ann.annotation_id',
 'Ann.long_answer.start_token',
 'Ann.long_answer.end_token']

In [135]:
df = df[cols]
df

Unnamed: 0,doc,query,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token
0,"london bridge is falling down - wikipedia <h1> london bridge is falling down </h1> <p> </p> <table> <tr> <th_colspan=""2""> `` london bridge is falling down '' </th> </tr> <tr> <td_colspan=""2""> illu...",london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1
0,"hurricane matthew - wikipedia <h1> hurricane matthew </h1> this is the latest accepted revision , reviewed on 11 december 2017 . jump to : navigation , search for other storms of the same name , s...",how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136
0,"diana : in her own words - wikipedia <h1> diana : in her own words </h1> jump to : navigation , search <p> diana : in her own words is a television documentary that was broadcast on channel 4 in t...",diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1
0,"dierks bentley - wikipedia <h1> dierks bentley </h1> jump to : navigation , search <table> <tr> <th_colspan=""2""> dierks bentley </th> </tr> <tr> <td_colspan=""2""> bentley in april 2010 </td> </tr> ...",where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822
0,"ajay kumar garg engineering college - wikipedia <h1> ajay kumar garg engineering college </h1> jump to : navigation , search <table> <tr> <td> </td> <td> this article relies too much on references...",all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1
...,...,...,...,...,...,...
0,"kensington roof gardens - wikipedia <h1> kensington roof gardens </h1> jump to : navigation , search <table> <tr> <td> </td> <td> this article needs additional citations for verification . please ...",when did the roof gardens in kensington open,2751457415860083587,1.843238e+19,-1,-1
0,internet in north korea - wikipedia <h1> internet in north korea </h1> <p> </p> <table> <tr> <th> internet </th> </tr> <tr> <td> an opte project visualization of routing paths through a portion of...,who can use the internet in north korea,5958138391726624145,3.037154e+18,-1,-1
0,"caesar salad - wikipedia <h1> caesar salad </h1> this is the latest accepted revision , reviewed on 30 august 2017 . jump to : navigation , search <table> caesar salad <tr> <td_colspan=""2""> a caes...",the ceasar salad was first served in which country,-1673009993299625057,2.244182e+18,27,143
0,"poverty in the united states - wikipedia <h1> poverty in the united states </h1> jump to : navigation , search number in poverty and poverty rate : 1959 to 2015 . united states . <p> poverty is a ...",what is poverty level income in the united states,7112317244048327025,1.725171e+19,1416,1582


In [136]:
import re 

split_pattern = r'<p>' # TODO: (<p>|<table>|<li>|<ol>) or beautifull soup?

In [137]:
df['doc'] = df['doc'].str.split(split_pattern)

In [138]:
df

Unnamed: 0,doc,query,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token
0,"[london bridge is falling down - wikipedia <h1> london bridge is falling down </h1> , </p> <table> <tr> <th_colspan=""2""> `` london bridge is falling down '' </th> </tr> <tr> <td_colspan=""2""> illu...",london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1
0,"[hurricane matthew - wikipedia <h1> hurricane matthew </h1> this is the latest accepted revision , reviewed on 11 december 2017 . jump to : navigation , search for other storms of the same name , ...",how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136
0,"[diana : in her own words - wikipedia <h1> diana : in her own words </h1> jump to : navigation , search , diana : in her own words is a television documentary that was broadcast on channel 4 in t...",diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1
0,"[dierks bentley - wikipedia <h1> dierks bentley </h1> jump to : navigation , search <table> <tr> <th_colspan=""2""> dierks bentley </th> </tr> <tr> <td_colspan=""2""> bentley in april 2010 </td> </tr>...",where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822
0,"[ajay kumar garg engineering college - wikipedia <h1> ajay kumar garg engineering college </h1> jump to : navigation , search <table> <tr> <td> </td> <td> this article relies too much on reference...",all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1
...,...,...,...,...,...,...
0,"[kensington roof gardens - wikipedia <h1> kensington roof gardens </h1> jump to : navigation , search <table> <tr> <td> </td> <td> this article needs additional citations for verification . please...",when did the roof gardens in kensington open,2751457415860083587,1.843238e+19,-1,-1
0,"[internet in north korea - wikipedia <h1> internet in north korea </h1> , </p> <table> <tr> <th> internet </th> </tr> <tr> <td> an opte project visualization of routing paths through a portion of...",who can use the internet in north korea,5958138391726624145,3.037154e+18,-1,-1
0,"[caesar salad - wikipedia <h1> caesar salad </h1> this is the latest accepted revision , reviewed on 30 august 2017 . jump to : navigation , search <table> caesar salad <tr> <td_colspan=""2""> a cae...",the ceasar salad was first served in which country,-1673009993299625057,2.244182e+18,27,143
0,"[poverty in the united states - wikipedia <h1> poverty in the united states </h1> jump to : navigation , search number in poverty and poverty rate : 1959 to 2015 . united states . , poverty is a ...",what is poverty level income in the united states,7112317244048327025,1.725171e+19,1416,1582


In [139]:
from bs4 import BeautifulSoup

In [140]:
def drop_tags(html):
    soup = BeautifulSoup(html)
    text = soup.get_text()
    return text

In [141]:
df['doc'] = df['doc'].apply(lambda x: list(map(lambda y: drop_tags(y), x)))

In [142]:
df

Unnamed: 0,doc,query,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token
0,"[london bridge is falling down - wikipedia london bridge is falling down , , `` london bridge is falling down '' ( also known as `` my fair lady '' or `` london bridge '' ) is a traditional eng...",london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1
0,"[hurricane matthew - wikipedia hurricane matthew this is the latest accepted revision , reviewed on 11 december 2017 . jump to : navigation , search for other storms of the same name , see tropi...",how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136
0,"[diana : in her own words - wikipedia diana : in her own words jump to : navigation , search , diana : in her own words is a television documentary that was broadcast on channel 4 in the united ...",diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1
0,"[dierks bentley - wikipedia dierks bentley jump to : navigation , search dierks bentley bentley in april 2010 background information birth name frederick dierks bentley ( ...",where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822
0,"[ajay kumar garg engineering college - wikipedia ajay kumar garg engineering college jump to : navigation , search this article relies too much on references to primary sources . please imp...",all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1
...,...,...,...,...,...,...
0,"[kensington roof gardens - wikipedia kensington roof gardens jump to : navigation , search this article needs additional citations for verification . please help improve this article by add...",when did the roof gardens in kensington open,2751457415860083587,1.843238e+19,-1,-1
0,"[internet in north korea - wikipedia internet in north korea , , internet access is available but strictly limited in north korea ; it is only permitted with special authorization and primarily...",who can use the internet in north korea,5958138391726624145,3.037154e+18,-1,-1
0,"[caesar salad - wikipedia caesar salad this is the latest accepted revision , reviewed on 30 august 2017 . jump to : navigation , search caesar salad a caesar salad course hors d'œuvre ...",the ceasar salad was first served in which country,-1673009993299625057,2.244182e+18,27,143
0,"[poverty in the united states - wikipedia poverty in the united states jump to : navigation , search number in poverty and poverty rate : 1959 to 2015 . united states . , poverty is a state of d...",what is poverty level income in the united states,7112317244048327025,1.725171e+19,1416,1582


In [143]:
df.to_pickle('data_split_p_drop_html_tags.csv')

In [144]:
df = pd.read_pickle('data_split_p_drop_html_tags.csv')

In [147]:
df.shape

(5000, 6)

In [151]:
def drop_empty(arr, size):
    non_empty = list(filter(lambda x: len(x) > size, arr))
    return non_empty

In [154]:
df['doc'] = df['doc'].apply(lambda x: drop_empty(x, 2))

In [155]:
df

Unnamed: 0,doc,query,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token
0,"[london bridge is falling down - wikipedia london bridge is falling down , `` london bridge is falling down '' ( also known as `` my fair lady '' or `` london bridge '' ) is a traditional englis...",london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1
0,"[hurricane matthew - wikipedia hurricane matthew this is the latest accepted revision , reviewed on 11 december 2017 . jump to : navigation , search for other storms of the same name , see tropi...",how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136
0,"[diana : in her own words - wikipedia diana : in her own words jump to : navigation , search , diana : in her own words is a television documentary that was broadcast on channel 4 in the united ...",diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1
0,"[dierks bentley - wikipedia dierks bentley jump to : navigation , search dierks bentley bentley in april 2010 background information birth name frederick dierks bentley ( ...",where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822
0,"[ajay kumar garg engineering college - wikipedia ajay kumar garg engineering college jump to : navigation , search this article relies too much on references to primary sources . please imp...",all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1
...,...,...,...,...,...,...
0,"[kensington roof gardens - wikipedia kensington roof gardens jump to : navigation , search this article needs additional citations for verification . please help improve this article by add...",when did the roof gardens in kensington open,2751457415860083587,1.843238e+19,-1,-1
0,"[internet in north korea - wikipedia internet in north korea , internet access is available but strictly limited in north korea ; it is only permitted with special authorization and primarily us...",who can use the internet in north korea,5958138391726624145,3.037154e+18,-1,-1
0,"[caesar salad - wikipedia caesar salad this is the latest accepted revision , reviewed on 30 august 2017 . jump to : navigation , search caesar salad a caesar salad course hors d'œuvre ...",the ceasar salad was first served in which country,-1673009993299625057,2.244182e+18,27,143
0,"[poverty in the united states - wikipedia poverty in the united states jump to : navigation , search number in poverty and poverty rate : 1959 to 2015 . united states . , poverty is a state of d...",what is poverty level income in the united states,7112317244048327025,1.725171e+19,1416,1582


In [156]:
import nltk
# nltk.download()
import string

In [157]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [158]:
def clean_doc(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [159]:
df['doc'] = df['doc'].apply(lambda x: list(map(lambda y: clean_doc(y),x)))

In [160]:
df.to_pickle('data_nltk_clean.csv')

In [197]:
df = pd.read_pickle('data_nltk_clean.csv')

In [198]:
df

Unnamed: 0,doc,query,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token
0,"[[london, bridg, fall, wikipedia, london, bridg, fall, ], [, london, bridg, fall, also, known, fair, ladi, london, bridg, tradit, english, nurseri, rhyme, sing, game, found, differ, version, world...",london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1
0,"[[hurrican, matthew, wikipedia, hurrican, matthew, latest, accept, revis, review, 11, decemb, 2017, jump, navig, search, storm, name, see, tropic, storm, matthew, hurrican, matthew, categori, 5, m...",how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136
0,"[[diana, word, wikipedia, diana, word, jump, navig, search, ], [diana, word, televis, documentari, broadcast, channel, 4, unit, kingdom, 6, august, 2017, film, includ, footag, late, diana, princes...",diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1
0,"[[dierk, bentley, wikipedia, dierk, bentley, jump, navig, search, dierk, bentley, bentley, april, 2010, background, inform, birth, name, frederick, dierk, bentley, 1975, 11, 20, novemb, 20, 1975, ...",where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822
0,"[[ajay, kumar, garg, engin, colleg, wikipedia, ajay, kumar, garg, engin, colleg, jump, navig, search, articl, reli, much, refer, primari, sourc, pleas, improv, ad, secondari, tertiari, sourc, marc...",all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1
...,...,...,...,...,...,...
0,"[[kensington, roof, garden, wikipedia, kensington, roof, garden, jump, navig, search, articl, need, addit, citat, verif, pleas, help, improv, articl, ad, citat, reliabl, sourc, unsourc, materi, ma...",when did the roof gardens in kensington open,2751457415860083587,1.843238e+19,-1,-1
0,"[[internet, north, korea, wikipedia, internet, north, korea, ], [internet, access, avail, strictli, limit, north, korea, permit, special, author, primarili, use, govern, purpos, foreign, countri, ...",who can use the internet in north korea,5958138391726624145,3.037154e+18,-1,-1
0,"[[caesar, salad, wikipedia, caesar, salad, latest, accept, revis, review, 30, august, 2017, jump, navig, search, caesar, salad, caesar, salad, cours, hor, dœuvr, salad, place, origin, mexico, regi...",the ceasar salad was first served in which country,-1673009993299625057,2.244182e+18,27,143
0,"[[poverti, unit, state, wikipedia, poverti, unit, state, jump, navig, search, number, poverti, poverti, rate, 1959, 2015, unit, state, ], [poverti, state, depriv, lack, usual, social, accept, amou...",what is poverty level income in the united states,7112317244048327025,1.725171e+19,1416,1582


In [199]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()

In [200]:
def par_2_vec(par):
    par_tfidf = tfidf_vect.fit_transform(par)
    return par_tfidf

In [201]:
# df['doc'] = df['doc'].apply(lambda x: drop_empty(x, 9)) 
def replace_empty(arr, size):
    rep_empty = list(map(lambda x: ['SEP'] if len(x) <= size else x, arr)) # placeholder
    return rep_empty

In [210]:
df['doc'] = df['doc'].apply(lambda x: replace_empty(x, 6)) # avoid empty vocab error

In [211]:
df['doc'] = df['doc'].apply(lambda x: list(map(lambda y: par_2_vec(y), x)))

In [212]:
df.to_pickle('data_vect_doc.csv')

In [213]:
df = pd.read_pickle('data_vect_doc.csv')
df

Unnamed: 0,doc,query,example_id,Ann.annotation_id,Ann.long_answer.start_token,Ann.long_answer.end_token
0,"[ (0, 2)\t1.0\n (1, 0)\t1.0\n (2, 1)\t1.0\n (3, 3)\t1.0\n (4, 2)\t1.0\n (5, 0)\t1.0\n (6, 1)\t1.0, (1, 26)\t1.0\n (2, 6)\t1.0\n (3, 17)\t1.0\n (4, 2)\t1.0\n (5, 23)\t1.0\n (6, 16)\t1...",london bridges falling down falling down falling down london bridge is falling down,-1484752057412977131,1.714421e+19,-1,-1
0,"[ (0, 34)\t1.0\n (1, 47)\t1.0\n (2, 77)\t1.0\n (3, 34)\t1.0\n (4, 47)\t1.0\n (5, 42)\t1.0\n (6, 12)\t1.0\n (7, 61)\t1.0\n (8, 60)\t1.0\n (9, 1)\t1.0\n (10, 25)\t1.0\n (11, 5)\t1.0\n (...",how high were the winds from hurricane matthew,-6558346961355674827,1.043755e+19,115,136
0,"[ (0, 0)\t1.0\n (1, 5)\t1.0\n (2, 4)\t1.0\n (3, 0)\t1.0\n (4, 5)\t1.0\n (5, 1)\t1.0\n (6, 2)\t1.0\n (7, 3)\t1.0, (0, 10)\t1.0\n (1, 28)\t1.0\n (2, 24)\t1.0\n (3, 12)\t1.0\n (4, 5)\t1...",diana in her own words channel 4 narrator,4494234814999936938,1.478877e+19,-1,-1
0,"[ (0, 24)\t1.0\n (1, 15)\t1.0\n (2, 54)\t1.0\n (3, 24)\t1.0\n (4, 15)\t1.0\n (5, 32)\t1.0\n (6, 38)\t1.0\n (7, 46)\t1.0\n (8, 24)\t1.0\n (9, 15)\t1.0\n (10, 15)\t1.0\n (11, 10)\t1.0\n ...",where did dierks bentley go to high school,-7522925448152928839,1.408139e+19,683,822
0,"[ (0, 12)\t1.0\n (1, 25)\t1.0\n (2, 20)\t1.0\n (3, 18)\t1.0\n (4, 15)\t1.0\n (5, 55)\t1.0\n (6, 12)\t1.0\n (7, 25)\t1.0\n (8, 20)\t1.0\n (9, 18)\t1.0\n (10, 15)\t1.0\n (11, 24)\t1.0\n ...",all india rank of ajay kumar garg engineering college,8852753410566798291,4.288499e+18,-1,-1
...,...,...,...,...,...,...
0,"[ (0, 31)\t1.0\n (1, 47)\t1.0\n (2, 23)\t1.0\n (3, 60)\t1.0\n (4, 31)\t1.0\n (5, 47)\t1.0\n (6, 23)\t1.0\n (7, 30)\t1.0\n (8, 40)\t1.0\n (9, 49)\t1.0\n (10, 14)\t1.0\n (11, 41)\t1.0\n ...",when did the roof gardens in kensington open,2751457415860083587,1.843238e+19,-1,-1
0,"[ (0, 0)\t1.0\n (1, 2)\t1.0\n (2, 1)\t1.0\n (3, 3)\t1.0\n (4, 0)\t1.0\n (5, 2)\t1.0\n (6, 1)\t1.0, (0, 25)\t1.0\n (1, 2)\t1.0\n (2, 6)\t1.0\n (3, 52)\t1.0\n (4, 31)\t1.0\n (5, 36)\t1...",who can use the internet in north korea,5958138391726624145,3.037154e+18,-1,-1
0,"[ (0, 6)\t1.0\n (1, 40)\t1.0\n (2, 49)\t1.0\n (3, 6)\t1.0\n (4, 40)\t1.0\n (5, 21)\t1.0\n (6, 2)\t1.0\n (7, 37)\t1.0\n (8, 36)\t1.0\n (9, 1)\t1.0\n (10, 3)\t1.0\n (11, 0)\t1.0\n (12, ...",the ceasar salad was first served in which country,-1673009993299625057,2.244182e+18,27,143
0,"[ (0, 5)\t1.0\n (1, 9)\t1.0\n (2, 8)\t1.0\n (3, 10)\t1.0\n (4, 5)\t1.0\n (5, 9)\t1.0\n (6, 8)\t1.0\n (7, 2)\t1.0\n (8, 3)\t1.0\n (9, 7)\t1.0\n (10, 4)\t1.0\n (11, 5)\t1.0\n (12, 5)\t1...",what is poverty level income in the united states,7112317244048327025,1.725171e+19,1416,1582


In [214]:
type(df['doc'].iloc[1][0])

scipy.sparse.csr.csr_matrix