# TL;DR

* Most examples with a very long answer come from list articles such as `List_of_...`
    * `long_answer` is the span of a long list in the document
    * `short_answer` is one of the cells in that list
* Some documents (`document_text`) are linked to multiple questions (`question_text`)
    * linked questions are unique
    * but some questions have the same `long_answer` span

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import json
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

# Check data keys

In [None]:
from itertools import islice

nq_train_jsonl = "/kaggle/input/tensorflow2-question-answering/simplified-nq-train.jsonl"

# Parse only the first line of the JSON Lines file so its keys can be inspected.
# The encoding is passed explicitly (JSON text is expected to be UTF-8) so that
# decoding does not depend on the platform's default locale.
with open(nq_train_jsonl, "r", encoding="utf-8") as f:
    for line in islice(f, 1):
        train = json.loads(line)

In [None]:
train.keys()

In [None]:
train['annotations']

In [None]:
train['annotations'][0]['long_answer']

# Retrieve very long answer data
Retrieve the train examples whose `long_answer` spans a very large number of tokens (>10000)

In [None]:
# Collect every example whose first annotation has a long answer spanning
# more than 10000 tokens (end_token - start_token).
data_list = []
# Explicit encoding: JSON text is expected to be UTF-8, and the default used
# by open() otherwise depends on the platform's locale.
with open(nq_train_jsonl, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        data = json.loads(line)
        long_ans = data['annotations'][0]['long_answer']
        if long_ans['end_token'] - long_ans['start_token'] > 10000:
            data_list.append(data)

In [None]:
len(data_list)

## Save results

In [None]:
# save
# Write the filtered examples back out as JSON Lines: one JSON object per line.
with open("./very-long-answer-nq-train.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in data_list)

# Convert to DataFrame

In [None]:
# Build the frame straight from the list of example dicts (one row per example).
df = pd.DataFrame(data_list)
df.head()

In [None]:
# annotations data to column
# Only the first annotation of each example is read.
df["yes_no_answer"] = df["annotations"].apply(lambda q: q[0]["yes_no_answer"])

df["long_answer_end"] = df["annotations"].apply(lambda q: q[0]["long_answer"]["end_token"])
df["long_answer_start"] = df["annotations"].apply(lambda q: q[0]["long_answer"]["start_token"])
# Subtract the two columns by name. The previous label-slice + diff(axis=1)
# only worked while these columns were adjacent and in exactly this order.
df["long_answer_length"] = (df["long_answer_end"] - df["long_answer_start"]).abs()

In [None]:
def apply_ans_end(entry):
    """Return the ``end_token`` of the first span in *entry*, or ``None`` when *entry* is empty."""
    return entry[0]["end_token"] if len(entry) else None

def apply_ans_start(entry):
    """Return the ``start_token`` of the first span in *entry*, or ``None`` when *entry* is empty."""
    return entry[0]["start_token"] if len(entry) else None

# Short answers of the first annotation; only the first short-answer span is
# turned into start/end columns (missing when the list is empty).
df["short_answers"] = df["annotations"].apply(lambda q: q[0]["short_answers"])
df["short_answer_end"] = df["short_answers"].apply(apply_ans_end)
df["short_answer_start"] = df["short_answers"].apply(apply_ans_start)
# Subtract the two columns by name. The previous label-slice + diff(axis=1)
# only worked while these columns were adjacent and in exactly this order.
df["short_answer_length"] = (df["short_answer_end"] - df["short_answer_start"]).abs()

In [None]:
# question_text
# First whitespace-separated token of each question. The .str accessor yields
# NaN for an empty or blank question, where q.split()[0] raised IndexError.
df["head_word"] = df["question_text"].str.split().str[0]

In [None]:
df.head()

## View very long answer data

In [None]:
from IPython.core.display import HTML

In [None]:
df['document_url'][0]

This `document_text` is [List of the highest major summits of North America](https://en.wikipedia.org//w/index.php?title=List_of_the_highest_major_summits_of_North_America&amp;oldid=835916791) on Wikipedia


In [None]:
def get_answer_text(s, is_short=False):
    """Return the answer text of one example as a space-joined token span.

    Args:
        s: Row-like mapping (e.g. a DataFrame row) providing ``document_text``
            and the ``<kind>_answer_start`` / ``<kind>_answer_end`` entries.
        is_short: Use the short-answer span when true, else the long-answer span.

    Returns:
        The tokens ``[start:end]`` of ``document_text`` (split on single
        spaces) joined by spaces, or ``None`` when either bound is missing
        (``None`` or NaN).
    """
    prefix = "short_answer" if is_short else "long_answer"
    beg = s[prefix + "_start"]
    end = s[prefix + "_end"]

    # Check for missing bounds *before* converting: int(None) and int(NaN)
    # both raise, which made the original None branch unreachable.
    if pd.isna(beg) or pd.isna(end):
        return None

    return " ".join(s['document_text'].split(" ")[int(beg):int(end)])

### long answer

In [None]:
# long answer
HTML(get_answer_text(df.iloc[0]))

### short answer

In [None]:
# short answer
HTML(get_answer_text(df.iloc[0], True))

# EDA
## short answer

In [None]:
df["short_answer_length"].describe()

In [None]:
df["short_answer_length"].plot.hist()

In [None]:
df["short_answer_length"].value_counts()

## yes_no_answer

In [None]:
df["yes_no_answer"].value_counts()

## max short_answer_length

In [None]:
max_idx = df["short_answer_length"].idxmax()

In [None]:
# idxmax() returns an index label, so select the row by label with .loc.
print(df.loc[max_idx])

In [None]:
# max_idx is an index label from idxmax(), hence .loc rather than .iloc.
print(df.loc[max_idx]['document_url'])

Electron configurations of the elements (data page) : https://en.wikipedia.org//w/index.php?title=Electron_configurations_of_the_elements_(data_page)

In [None]:
# max_idx is an index label from idxmax(), hence .loc rather than .iloc.
df.loc[max_idx]['question_text']

### short answer

In [None]:
# short ans
# max_idx is an index label from idxmax(), hence .loc rather than .iloc.
HTML(get_answer_text(df.loc[max_idx], True))

In [None]:
# Raw (unrendered) short-answer text; the row is selected by label because
# max_idx comes from idxmax().
get_answer_text(df.loc[max_idx], True)

### long answer

In [None]:
# long ans
# max_idx is an index label from idxmax(), hence .loc rather than .iloc.
HTML(get_answer_text(df.loc[max_idx], False))

## max long_answer_length

In [None]:
max_long = df["long_answer_length"].idxmax()

In [None]:
# idxmax() returns an index label, so select the row by label with .loc.
df.loc[max_long]

In [None]:
# max_long is an index label from idxmax(), hence .loc rather than .iloc.
print(df.loc[max_long]['document_url'])
print(df.loc[max_long]['question_text'])

List of Xbox 360 games https://en.wikipedia.org//w/index.php?title=List_of_Xbox_360_games

This article has since been split into two pages:
* https://en.wikipedia.org/wiki/List_of_Xbox_360_games_(A–L)
* https://en.wikipedia.org/wiki/List_of_Xbox_360_games_(M–Z)

### long answer

In [None]:
# long ans
# max_long is an index label from idxmax(), hence .loc rather than .iloc.
HTML(get_answer_text(df.loc[max_long]))

# count question_text
count word

In [None]:
df["question_text"].str.split()

In [None]:
from collections import Counter
# Count every whitespace-separated token across all questions.
cnt = Counter(
    token
    for tokens in df["question_text"].str.split()
    for token in tokens
)
cnt.most_common(50)

# count head word of question_text 

In [None]:
# count head word
# Open a 10x8 inch figure at 100 dpi for the bar chart below.
plt.figure(figsize=(10,8),dpi=100)
# NOTE: rcParams is global matplotlib state, so this font size also applies to
# any figure drawn after this cell.
plt.rcParams["font.size"] = 6

# Bar chart of how often each first word of a question occurs.
df.head_word.value_counts().plot(kind='bar')

In [None]:
df.head_word.value_counts().head(20)

`list`, `xbox`, and `cities` are not *interrogative words* (what, when, who, how, which).

## head word: list

In [None]:
pd.set_option("display.max_colwidth", 100)

In [None]:
# Questions starting with "list", ordered by document and answer position.
(
    df.loc[
        df["head_word"] == "list",
        ["question_text", "document_url", "long_answer_start", "long_answer_end"],
    ]
    .sort_values(by=["document_url", "long_answer_start"])
)

## head word: xbox

In [None]:
# Questions starting with "xbox", ordered by document and answer position.
(
    df.loc[
        df["head_word"] == "xbox",
        ["question_text", "document_url", "long_answer_start", "long_answer_end"],
    ]
    .sort_values(by=["document_url", "long_answer_start"])
)

## head word: cities

In [None]:
# Questions starting with "cities", ordered by document and answer position.
(
    df.loc[
        df["head_word"] == "cities",
        ["question_text", "document_url", "long_answer_start", "long_answer_end"],
    ]
    .sort_values(by=["document_url", "long_answer_start"])
)

# count document_url like "List_of_xx"

In [None]:
# Summarise how many document URLs contain "=List_of_" (regex matching is the default).
(
    df['document_url']
    .str.contains('=List_of_')
    .describe()
)

## nunique

In [None]:
df['document_url'].nunique()

In [None]:
df['question_text'].nunique()

In [None]:
# Ten documents with the most examples. The two length columns are selected
# before counting so the aggregation does not also count every other (large,
# nested) column only to discard the result.
df.groupby('document_url')[['long_answer_length', 'short_answer_length']] \
.count() \
.sort_values(by='long_answer_length', ascending=False) \
.head(10)

In [None]:
# Histogram of examples per document. The column is selected before counting
# so only that one column is aggregated.
df.groupby('document_url')['long_answer_length'].count().hist(bins=19, grid=False)

In [None]:
# How many documents have 1, 2, 3, ... examples. The column is selected before
# counting so only that one column is aggregated.
df.groupby('document_url')['long_answer_length'].count().value_counts()

 * one document, one question : 254/340 = 75% 
 * one document, multiple questions : 25% 
 
 Some documents are linked to multiple questions.
 
 ***

 
 Note:
 
 * If you take the Wikipedia revision history (e.g. `&amp;oldid=XXX`) into account, even more documents are linked to multiple questions.
     * For example, [List_of_Xbox_360_games_compatible_with_Xbox_One](https://en.wikipedia.org//w/index.php?title=List_of_Xbox_360_games_compatible_with_Xbox_One) is an old version of [List_of_backward_compatible_games_for_Xbox_One](https://en.wikipedia.org//w/index.php?title=List_of_backward_compatible_games_for_Xbox_One)
     * https://en.wikipedia.org/w/index.php?title=List_of_backward_compatible_games_for_Xbox_One&action=history