In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected = True)
import plotly.graph_objs as go
import plotly.express as px
# Allow up to 1000 columns when displaying wide frames. The fully-qualified key is
# used because the bare 'max_columns' pattern matches more than one option in
# recent pandas releases and is rejected as ambiguous.
pd.set_option('display.max_columns', 1000)
from bokeh.models import Panel, Tabs
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
import lightgbm as lgb
import plotly.figure_factory as ff
import gc
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import json
from keras.preprocessing import text, sequence
from sklearn.feature_extraction.text import CountVectorizer


# File Description

* simplified-nq-train.jsonl - the training data, in newline-delimited JSON format.
* simplified-nq-kaggle-test.jsonl - the test data, in newline-delimited JSON format.
* sample_submission.csv - a sample submission file in the correct format

# Data fields
* document_text - the text of the article in question (with some HTML tags to provide document structure). The text can be tokenized by splitting on whitespace.
* question_text - the question to be answered
* long_answer_candidates - a JSON array containing all of the plausible long answers.
* annotations - a JSON array containing all of the correct long + short answers. Only provided for train.
* document_url - the URL for the full article. Provided for informational purposes only. This is NOT the simplified version of the article so indices from this cannot be used directly. The content may also no longer match the html used to generate document_text. Only provided for train.
* example_id - unique ID for the sample.

Let's check the submission file to understand better what we need to predict

# Submission File
For each ID in the test set, you must predict a) a set of start:end token indices, b) a YES/NO answer if applicable (short answers ONLY), or c) a BLANK answer if no prediction can be made. The file should contain a header and have the following format:

* -7853356005143141653_long,6:18
* -7853356005143141653_short,YES
* -545833482873225036_long,105:200
* -545833482873225036_short,
* -6998273848279890840_long,
* -6998273848279890840_short,NO

Interesting :).

# Evaluation
Submissions are evaluated using micro F1 between the predicted and expected answers. Predicted long and short answers must match exactly the token indices of one of the ground truth labels (or match YES/NO if the question has a yes/no short answer). There may be up to five labels for long answers, and more for short. If no answer applies, leave the prediction blank/null.

The metric in this competition diverges from the original metric in two key respects: 1) short and long answer formats do not receive separate scores, but are instead combined into a micro F1 score across both formats, and 2) this competition's metric does not use confidence scores to find an optimal threshold for predictions.



# Load Data

The dataset is huge, so for exploration purposes we are going to perform the exploratory analysis over a sample of the dataset. Let's read the training data and extract a sample (hopefully the dataset is shuffled so that the first records are random).

In [None]:
# Directory that holds the competition files, followed by the individual file
# names; the full locations are built later by concatenating `path` with a name.
path = '/kaggle/input/tensorflow2-question-answering/'
train_path = 'simplified-nq-train.jsonl'
test_path = 'simplified-nq-test.jsonl'
sample_submission_path = 'sample_submission.csv'

def read_data(path, sample = True, chunksize = 30000):
    """Load a newline-delimited JSON file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the ``.jsonl`` file.
    sample : bool, default True
        When truthy, only the first ``chunksize`` records are parsed, reading the
        file line by line so the whole file never has to fit in memory.
        Otherwise the complete file is parsed with ``pd.read_json``.
    chunksize : int, default 30000
        Maximum number of records kept when ``sample`` is truthy. Files holding
        fewer records are returned in full instead of raising.

    Returns
    -------
    pandas.DataFrame
        One row per JSON record.
    """
    if sample:
        records = []
        with open(path, 'rt', encoding = 'utf-8') as reader:
            for line in reader:
                if len(records) >= chunksize:
                    break
                line = line.strip()
                # Blank lines (e.g. a trailing newline) carry no record.
                if line:
                    records.append(json.loads(line))
        df = pd.DataFrame(records)
        print('Our sampled dataset have {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    else:
        df = pd.read_json(path, orient = 'records', lines = True)
        print('Our dataset have {} rows and {} columns'.format(df.shape[0], df.shape[1]))
        gc.collect()
    return df

# Train is sampled (read_data's default chunksize of 30000 records); the test
# file is loaded in full.
train = read_data(path+train_path, sample = True)
test = read_data(path+test_path, sample = False)
train.head()

In [None]:
# Read the sample submission, report its row count and preview it.
sample_submission = pd.read_csv(path + sample_submission_path)
print('Our sample submission have {} rows'.format(sample_submission.shape[0]))
sample_submission.head()

First let's explore if we have missing values

# Missing Values

In [None]:
def missing_values(df):
    """Count the null entries of every column.

    Returns a two-column frame: ``features`` holds the column names of ``df``
    and ``n_missing_values`` the number of nulls found in each of them.
    """
    null_counts = df.isnull().sum()
    return pd.DataFrame({
        'features': null_counts.index,
        'n_missing_values': null_counts.values,
    })
missing_values(train)

In [None]:
missing_values(test)

Great we don't have missing values.

# Logic

This structure is not easy to understand, so let's explore the first line of our train set to understand the logic of this dataset.

In [None]:
# Question of the first training example.
question_text_0 = train.loc[0, 'question_text']
question_text_0

I believe this is the main question you need to respond.

In [None]:
# Split the first document on whitespace into a token list, then preview the
# first 800 tokens joined back into a single string.
document_text_0 = train.loc[0, 'document_text'].split()
" ".join(document_text_0[:800])

> So in the first column we have a huge wikipedia text. This is where we need to find the answer for the previous question

In [None]:
# Long-answer candidates of the first training example; show the first ten.
long_answer_candidates_0 = train.loc[0, 'long_answer_candidates']
long_answer_candidates_0[0:10]

These are all the possible long-answer ranges. In other words, they give you the start and end indices of every candidate long answer in the document text column that could answer the question.

In [None]:
# First annotation of the first training example.
annotations_0 = train['annotations'][0][0]
annotations_0

* This is our target variable. In this case this is telling us that our long answer starts in indices 1952 and end at indices 2019.
* Also, we have a short answer that starts at indices 1960 and end at indices 1969.
* In this example we dont have a yes or no answer
* If you check the submission file we have 692 rows, this means that for each row in the test set we have to predict the short and long answer
* Sometime long and short answer are not available, in this case it's possible that we have a Yes or No answer for the short answer.

Lets check the entire logic of the first line of our train set

In [None]:
# Recover the answer texts by slicing the whitespace-tokenised document with the
# start/end token indices stored in the annotation (first short answer only).
print('Our question is : ', question_text_0)
print('Our short answer is : ', " ".join(document_text_0[annotations_0['short_answers'][0]['start_token']:annotations_0['short_answers'][0]['end_token']]))
print('Our long answer is : ', " ".join(document_text_0[annotations_0['long_answer']['start_token']:annotations_0['long_answer']['end_token']]))

* Now that we understand the main logic, let's check the distributions of our target variable. Remember that our test set is going to expand to 692 rows because for each question we need to predict both the long and the short answer.

# Target Variable Exploration

In [None]:
# Gather the yes/no label stored in the first annotation of every training row
# into a one-column frame.
yes_no_answer = pd.DataFrame({
    'yes_no_answer': [
        train['annotations'][row][0]['yes_no_answer']
        for row in range(len(train))
    ]
})
    
def bar_plot(df, column, title, width, height, n, get_count = True):
    """Draw a single-series plotly bar chart inside the notebook.

    With ``get_count`` equal to True the bars are the relative frequencies of
    ``df[column]`` (first ``n`` entries of ``value_counts(normalize=True)``).
    Otherwise ``df`` itself is plotted: its index gives the x values and its
    values the bar heights, and ``column`` / ``n`` are not used.

    ``title`` is centred above the chart; ``width`` and ``height`` size the figure.
    """
    series_to_plot = df[column].value_counts(normalize = True)[:n] if get_count == True else df

    bars = go.Bar(
        x = series_to_plot.index,
        y = series_to_plot.values,
        marker = dict(color = '#1E90FF'),
    )
    figure_layout = go.Layout(
        title = go.layout.Title(text = title, x = 0.5),
        font = dict(size = 14),
        width = width,
        height = height,
    )

    py.iplot(go.Figure(data = [bars], layout = figure_layout), filename = 'bar_plot')
bar_plot(yes_no_answer, 'yes_no_answer', 'Yes No Answer Distribution', 800, 500, 3)

* 98.7% is None
* The amount of observations that are YES and NO only sum 1.3%!

In [None]:
def _format_token_span(span):
    """Render a dict holding 'start_token' / 'end_token' as the string 'start:end'."""
    return f"{span['start_token']}:{span['end_token']}"


def extract_target_variable(df, short = True):
    """Build a one-column frame of target labels from the first annotation of each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must hold an ``annotations`` column whose cells are lists of annotation
        dicts; only the first annotation of every row is used. Rows are read in
        positional order, so ``df`` may carry any index (slice, sample, filter).
    short : bool, default True
        Truthy -> short-answer labels, otherwise long-answer labels.

    Returns
    -------
    pandas.DataFrame
        Indexed 0..n-1 with a single column:

        * ``short_answer``: ``'start:end'`` of the first short answer; when there
          is no short answer, the ``'YES'`` / ``'NO'`` yes-no answer, else ``'EMPTY'``.
        * ``long_answer``: ``'start:end'`` of the long answer, or ``'EMPTY'`` when
          its ``start_token`` is -1.
    """
    # Iterating the column values (instead of df['annotations'][i]) keeps the
    # lookup positional and therefore independent of the frame's index labels.
    first_annotations = [annotations[0] for annotations in df['annotations']]

    if short:
        labels = []
        for annotation in first_annotations:
            short_spans = annotation['short_answers']
            yes_no = annotation['yes_no_answer']
            if short_spans:
                labels.append(_format_token_span(short_spans[0]))
            elif yes_no == 'NO' or yes_no == 'YES':
                labels.append(yes_no)
            else:
                labels.append('EMPTY')
        return pd.DataFrame({'short_answer': labels})

    labels = [
        'EMPTY' if annotation['long_answer']['start_token'] == -1
        else _format_token_span(annotation['long_answer'])
        for annotation in first_annotations
    ]
    return pd.DataFrame({'long_answer': labels})
        
short_answer = extract_target_variable(train)
short_answer.head()

In [None]:
# Bucket the labels: 'EMPTY', 'YES' and 'NO' are kept as they are, every
# 'start:end' span becomes 'TEXT'; then plot the share of each bucket.
short_answer['type'] = short_answer['short_answer'].where(
    short_answer['short_answer'].isin(['EMPTY', 'YES', 'NO']),
    'TEXT',
)
bar_plot(short_answer, 'type', 'Short Answer Distribution', 800, 500, 10)

# Short Answer Results

* We have 63.47% of the observations with a empty text
* We have 35.23% of the observations with a start and end token result
* We have the same distribution for YES and NO from the previous plot

In [None]:
long_answer = extract_target_variable(train, False)
long_answer.head()

In [None]:
# Bucket the labels: 'EMPTY' is kept, every 'start:end' span becomes 'TEXT';
# then plot the share of each bucket.
long_answer['type'] = long_answer['long_answer'].where(
    long_answer['long_answer'] == 'EMPTY',
    'TEXT',
)
bar_plot(long_answer, 'type', 'Long Answer Distribution', 800, 500, 10)

# Long Answer Results
* We have 50.16% of the observations empty
* We have 49.84% of the observations with a start and end token result

# Question Explorations

Let's explore our question_text column which tell us the question that we want to answer with a segment of the document text

* Count the number of words and check distribution
* Most common words

In [None]:
def count_word_frequency(series, top = 0, bot = 20, title = 'Question Text Word Frequency Distribution'):
    """Plot and return the share of total word occurrences for a slice of the vocabulary.

    Parameters
    ----------
    series : iterable of str
        Texts to tokenise with a default ``CountVectorizer``.
    top, bot : int
        Positional slice ``[top:bot]`` taken after sorting words by descending
        frequency (defaults give the 20 most frequent words).
    title : str
        Title of the bar chart. Defaults to the original question-text title so
        existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Word`` with a single ``Percentage`` column: the word's count
        divided by the total count of all words, rounded to 3 decimals.
    """
    cv = CountVectorizer()
    cv_fit = cv.fit_transform(series)
    # get_feature_names() was removed from recent scikit-learn releases; fall
    # back to it only when the replacement is not available.
    get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
    word_list = get_names()
    # Sum the sparse matrix directly: toarray() would materialise a dense
    # documents x vocabulary array, which is prohibitive for long documents.
    count_list = np.asarray(cv_fit.sum(axis=0)).ravel()

    frequency = pd.DataFrame({'Word': word_list, 'Frequency': count_list})
    frequency = frequency.sort_values(['Frequency'], ascending = False)
    frequency['Percentage'] = (frequency['Frequency'] / frequency['Frequency'].sum()).round(3)
    frequency = frequency.drop('Frequency', axis = 1).iloc[top:bot].set_index('Word')

    bar_plot(frequency['Percentage'], 'Percentage', title, 800, 500, 20, False)
    return frequency
    
frequency = count_word_frequency(train['question_text'])

* "the" word corresponds to 9.5% of the words in question_text column

> Let´s check the next 20 words to check if we have some common topic.

In [None]:
frequency = count_word_frequency(train['question_text'], 20, 40)

So we have some interesting words like many, world, played, name, song, movie and a lot more.

Let´s check the test set and see if we have the same behaviour.

In [None]:
frequency = count_word_frequency(test['question_text'])

* Top words are repeated

In [None]:
frequency = count_word_frequency(test['question_text'], 20, 40)

* We can see some differences between train and test.

# Document Text Exploration

We are only going to analyze the test set because these documents are very big.

In [None]:
frequency = count_word_frequency(test['document_text'])

* We need to clean this column to have a better idea.
* Leaving this part for the future notebooks because i believe it´s not easy to clean it.

# Document URL

This is the URL of the document text. Maybe we can use this for something, but I will not analyze this variable because I believe we will not get any insight from it.

# Preprocess and Model

* In this section we are going to build a baseline model
* First, we need to create a preprocess function to pass the data and get a training and testing set with the correct format.
* The main idea of my preprocessing is to build a training set where each long answer candidate text is a row; in other words, we are going to use the start and end indices to extract the corresponding text from the document text. Next we can use the extracted segment and the question as features and label a ground truth variable y with 1 and 0.
* Short answers have 4 possible outcomes; we are going to transform this into a binary classification problem where YES and NO are treated as empty answers.
* We have a nice sample dataset to try this preprocessing function, let's start coding.

In [None]:
def _candidate_frame(df):
    """Expand every long-answer candidate of every row of ``df`` into one record.

    Returned columns: ``row_pos`` (position of the source row inside ``df``),
    ``question``, ``respond`` (candidate text obtained by slicing the
    whitespace-tokenised document), ``start_token``, ``end_token`` and
    ``start_end_token`` ('start:end'). The index is the candidate's position
    within its own question, so it restarts at 0 for each source row.
    """
    records = []
    candidate_positions = []
    columns = zip(df['question_text'], df['document_text'], df['long_answer_candidates'])
    for row_pos, (question, document, candidates) in enumerate(columns):
        # Tokenise once per document rather than once per candidate.
        tokens = document.split()
        for cand_pos, candidate in enumerate(candidates):
            start = int(candidate['start_token'])
            end = int(candidate['end_token'])
            records.append((row_pos, question, " ".join(tokens[start:end]), start, end, f'{start}:{end}'))
            candidate_positions.append(cand_pos)
    # A single DataFrame construction replaces the per-row pd.concat, which
    # re-copied the accumulated frame on every iteration.
    return pd.DataFrame(
        records,
        columns = ['row_pos', 'question', 'respond', 'start_token', 'end_token', 'start_end_token'],
        index = candidate_positions,
    )


def build_train_test_long(df, train = True):
    """Create one (question, candidate text) row per long-answer candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``question_text``, ``document_text`` and ``long_answer_candidates``
        columns, plus ``annotations`` when ``train`` is truthy.
    train : bool, default True
        When truthy a ``target`` column (int16) is added: 1 when the candidate's
        'start:end' equals the long-answer label returned by
        ``extract_target_variable(df, False)`` for that row, else 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``question``, ``respond``, ``start_end_token`` (and ``target``
        for train). The index restarts at 0 for every source row.
    """
    frame = _candidate_frame(df)
    output_columns = ['question', 'respond', 'start_end_token']
    if train:
        # The index is reset so labels line up with row positions whatever
        # index df carries.
        answers = extract_target_variable(df.reset_index(drop = True), False).iloc[:, 0].tolist()
        frame['target'] = [
            int(span == answers[row_pos])
            for span, row_pos in zip(frame['start_end_token'], frame['row_pos'])
        ]
        frame['target'] = frame['target'].astype('int16')
        output_columns = output_columns + ['target']
    return frame[output_columns]


def build_train_test_short(df, train = True):
    """Create one (question, candidate text) row per long-answer candidate for the short-answer task.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``question_text``, ``document_text`` and ``long_answer_candidates``
        columns, plus ``annotations`` when ``train`` is truthy.
    train : bool, default True
        When truthy a ``target`` column (int64) is added: 1 when the row's short
        answer span lies inside the candidate span
        (answer start >= candidate start and answer end <= candidate end).
        Rows whose short-answer label is not a 'start:end' span
        ('EMPTY', 'YES', 'NO') get 0 for every candidate.

    Returns
    -------
    pandas.DataFrame
        Columns ``question``, ``respond``, ``start_end_token`` (and ``target``
        for train). The index restarts at 0 for every source row.
    """
    frame = _candidate_frame(df)
    output_columns = ['question', 'respond', 'start_end_token']
    if train:
        answers = extract_target_variable(df.reset_index(drop = True), True).iloc[:, 0].tolist()
        # None marks "no span". Encoding it as 0:0 would wrongly flag every
        # candidate that starts at token 0 as containing the answer.
        answer_spans = [
            tuple(int(token) for token in answer.split(':')) if ':' in answer else None
            for answer in answers
        ]
        targets = []
        for row_pos, cand_start, cand_end in zip(frame['row_pos'], frame['start_token'], frame['end_token']):
            span = answer_spans[row_pos]
            if span is None:
                targets.append(0)
            else:
                targets.append(int(span[0] >= cand_start and span[1] <= cand_end))
        frame['target'] = targets
        frame['target'] = frame['target'].astype('int64')
        output_columns = output_columns + ['target']
    return frame[output_columns]

In [None]:
sh = build_train_test_long(train.head())
sh.head()

In [None]:
sh[sh['target']==1]

In [None]:
long_answer.head()

* This is a sample of our training set for long answers.
* We have 4 text answer for the first 5 question
* For long answers we will get a probability for each question-response combination. We will filter this result by question to check whether one of the answers has a high probability.
* For short answer we are going to do the same, but is harder because we only will know the long tokens. We need to figure how we can extract the short token indices.
* The dataframe is very large, so we may need to rework these functions so that they process the data in batches.
* We can make 2 models to resolve this problem (long answer and short answer)

# Building the model in a different script to handle the memory better