In [53]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from extract.extract import norm_text

def nd(arr):
    """Return `arr` as a flat, one-dimensional NumPy array.

    Anything `np.asarray` accepts (scalars, nested lists, arrays) is
    converted first and then flattened.
    """
    as_array = np.asarray(arr)
    return as_array.ravel()

def yex(ax):
    """Add a y = x reference line to `ax` and square up the axes.

    The smallest and largest value across the current x- and y-limits are
    used as the common range: a diagonal line is drawn between them, both
    axes are set to that range, and the aspect ratio is set to "equal".

    Returns the same `ax` so the call can be chained.
    """
    current_limits = np.array([ax.get_xlim(), ax.get_ylim()])
    lims = [current_limits.min(), current_limits.max()]

    # Plot the shared limits against each other: this is the diagonal.
    ax.plot(lims, lims, c="k", alpha=0.75, zorder=0)
    ax.set(aspect="equal", xlim=lims, ylim=lims)
    return ax
fsize = 15
plt.rcParams.update({"font.size": fsize})
%config InlineBackend.figure_format = 'retina'

In [54]:
from extract.extract import tokenize

In [55]:
# NOTE(review): `df` is first assigned further down (the `pd.read_json` cell), so
# on a fresh kernel this cell raises NameError. The output shown below already
# lists the token-count columns that are only added later, i.e. it comes from a
# previous session's state — consider moving this cell after the frame is built.
df.columns

Index(['source', 'target', 'n_aln', 'start', 'old_start', 'nutks_src',
       'nutks_tgt', 'ntks_src', 'ntks_tgt'],
      dtype='object')

In [56]:
# Load the raw training records. The cells below read the `context`,
# `answer_text`, `answer_start` and `question` columns from this frame.
df = pd.read_json("data/unique_trn.json")

# this should be done before grouping
def fix_answer_for_tokens(x):
    """Pull a directly preceding space into the answer span.

    If the character immediately before ``answer_start`` in ``context`` is a
    space, the space is prepended to ``answer_text`` and ``answer_start`` is
    moved back by one. The untouched start offset is always recorded under
    ``old_answer_start``.
    (Presumably this makes the answer line up with tokens that carry their
    leading space — confirm against the tokenizer.)

    Parameters
    ----------
    x : mapping-like with ``.copy()`` (e.g. a ``pd.Series`` row or a ``dict``)
        Must provide ``context`` (str), ``answer_text`` (str) and
        ``answer_start`` (int).

    Returns
    -------
    A modified *copy* of ``x``. The input itself is left untouched, because
    ``DataFrame.apply`` does not support functions that mutate the object
    they are handed.
    """
    row = x.copy()
    idx = row["answer_start"]
    row["old_answer_start"] = idx

    # idx == 0 has no preceding character; the guard also keeps `idx - 1`
    # from wrapping around to the last character of the context.
    if idx > 0 and row["context"][idx - 1] == " ":
        row["answer_text"] = " " + row["answer_text"]
        row["answer_start"] = idx - 1
    return row

# Row-wise leading-space fix; it has to run before the grouping below so the
# adjusted answer text and offsets are the values being grouped.
df = df.apply(fix_answer_for_tokens, axis=1)

# Collapse the frame to one row per distinct (context, answer_text) pair:
# count the rows merged into each pair, collect every start offset seen for it
# into a set, order by that count, and switch to the source/target naming.
df = (
    df.groupby(["context", "answer_text"])[
        ["answer_text", "answer_start", "old_answer_start", "question"]
    ]
    .agg(
        **{
            "answer_len": ("answer_text", "count"),
            "answer_start_idx": ("answer_start", set),
            "old_answer_start_idx": ("old_answer_start", set),
            "question": ("question", "first"),
        }
    )
    .reset_index()
    .sort_values(by="answer_len")
    .reset_index(drop=True)
    .drop(columns="question")
    .rename(
        columns=dict(
            context="source",
            answer_text="target",
            answer_len="n_aln",
            answer_start_idx="start",
            old_answer_start_idx="old_start",
        )
    )
)

In [57]:
# Preview the grouped and renamed frame.
df.head()

Unnamed: 0,source,target,n_aln,start,old_start
0,"\n Australia: The event was held in Canberra, ...",16,1,{105},{106}
1,Southeast Raleigh is bounded by downtown on th...,"Rock Quarry Road, Poole Road, and New Bern Av...",1,{148},{149}
2,Southeast Raleigh is bounded by downtown on th...,"Chastain, Chavis Heights, Raleigh Country Club,",1,{229},{230}
3,"Southeast Asia has an area of approximately 4,...",Java,1,{212},{213}
4,"Southeast Asia has an area of approximately 4,...",Indonesia,1,{272},{273}


In [58]:
# Fix dataset
# A row is kept only when all three checks pass:
#   1. `norm_text` leaves the source unchanged,
#   2. `norm_text` leaves the target unchanged,
#   3. the first literal occurrence of the target inside the source starts at
#      one of the recorded `start` offsets.
# The checks are held as boolean masks rather than scratch columns, so `df`
# never gains helper columns that have to be deleted again, and the equality
# tests are vectorised instead of going through row-wise `apply(axis=1)`.
same_source = df["source"].apply(norm_text) == df["source"]
same_target = df["target"].apply(norm_text) == df["target"]
naive_same = pd.Series(
    [
        # `str.index` is only evaluated when the target is present, so it
        # cannot raise ValueError here.
        tgt in src and src.index(tgt) in starts
        for src, tgt, starts in zip(df["source"], df["target"], df["start"])
    ],
    index=df.index,
    dtype=bool,  # explicit dtype keeps the mask boolean even for an empty frame
)

df = df[same_source & same_target & naive_same].reset_index(drop=True)


In [59]:
# (rows, columns) remaining after the filter above.
df.shape

(56027, 5)

In [46]:
# Quick look at the "whitespace" tokenizer: element [0] of the return value is
# a list of token dicts whose start_idx/end_idx are character offsets into the
# input (the leading space is not part of 'hello', which therefore starts at 1).
tokenize(" hello world", "whitespace")[0]

[{'token': 'hello', 'enc_token': 'hello', 'start_idx': 1, 'end_idx': 6},
 {'token': 'world', 'enc_token': 'world', 'start_idx': 7, 'end_idx': 12}]

In [61]:
def _token_counts(texts, *tokenizer_args):
    """Tokenize every string in `texts` once and count its tokens.

    Parameters
    ----------
    texts : pd.Series
        Strings to tokenize.
    *tokenizer_args
        Extra positional arguments forwarded to `tokenize` (nothing for its
        default mode, ``"whitespace"`` for the whitespace mode).

    Returns
    -------
    (pd.Series, pd.Series)
        Per string: the number of distinct ``enc_token`` values and the total
        number of tokens, both aligned with `texts`.
    """
    # Only element [0] of the `tokenize` result (the list of token dicts) is used.
    tokens = texts.apply(lambda text: tokenize(text, *tokenizer_args)[0])
    n_unique = tokens.apply(lambda tks: len({tk["enc_token"] for tk in tks}))
    n_total = tokens.apply(len)
    return n_unique, n_total


# Each text column is tokenized once per tokenizer mode and both counts are
# derived from that single pass.
# NOTE(review): assumes `tokenize` is deterministic, so one pass gives the same
# counts as the separate passes it replaces — confirm.
# Column naming: nutks* = unique tokens, ntks* = all tokens, "_ws" = whitespace
# tokenizer, _src/_tgt = source/target. Columns are added in the same order as
# before: default mode first, then whitespace.
for suffix, tokenizer_args in (("", ()), ("_ws", ("whitespace",))):
    uniq_src, total_src = _token_counts(df["source"], *tokenizer_args)
    uniq_tgt, total_tgt = _token_counts(df["target"], *tokenizer_args)

    df[f"nutks{suffix}_src"] = uniq_src
    df[f"nutks{suffix}_tgt"] = uniq_tgt
    df[f"ntks{suffix}_src"] = total_src
    df[f"ntks{suffix}_tgt"] = total_tgt

In [62]:
# Shape of the subset whose target has more than two whitespace-tokenizer tokens.
df.query("ntks_ws_tgt > 2").shape

(23843, 13)

In [63]:
# Persist the filtered frame, including the token-count columns, as a JSON
# array of records.
# NOTE(review): `start` / `old_start` hold Python sets; the reload cell rebuilds
# them with `.apply(set)`, so presumably they are written as JSON arrays — confirm.
df.to_json("data/train.json", orient="records")

In [50]:
# Read the exported records back and turn the two offset columns into sets
# again so they can be compared with the in-memory frame.
tmp = pd.read_json("data/train.json", orient="records").assign(
    start=lambda frame: frame["start"].apply(set),
    old_start=lambda frame: frame["old_start"].apply(set),
)

In [51]:
# Element-wise comparison of the reloaded frame with the in-memory one; the
# per-column sums count mismatching cells, so all zeros means the JSON
# round-trip preserved every value.
# NOTE(review): the execution counts of this and the neighbouring reload cells
# are lower than the export cell's, and the output lists only 9 columns — it
# appears to predate the `_ws` token-count columns. Re-run after a fresh export.
(tmp != df).sum()

source       0
target       0
n_aln        0
start        0
old_start    0
nutks_src    0
nutks_tgt    0
ntks_src     0
ntks_tgt     0
dtype: int64

In [52]:
# Preview the reloaded frame.
tmp.head()

Unnamed: 0,source,target,n_aln,start,old_start,nutks_src,nutks_tgt,ntks_src,ntks_tgt
0,Southeast Raleigh is bounded by downtown on th...,"Rock Quarry Road, Poole Road, and New Bern Av...",1,{148},{149},85,10,121,12
1,Southeast Raleigh is bounded by downtown on th...,"Chastain, Chavis Heights, Raleigh Country Club,",1,{229},{230},85,9,121,12
2,"Southeast Asia has an area of approximately 4,...",Java,1,{212},{213},81,1,155,1
3,"Southeast Asia has an area of approximately 4,...","4,000,000 km2",1,{43},{44},81,6,155,8
4,Southeast Raleigh is bounded by downtown on th...,Shaw University,1,{538},{539},85,2,121,2
