# Dataset insights

In [None]:
import json
import numpy as np

## Getting dataset statistics to compare with SQuAD v2.0

In [None]:
import pandas as pd

# Suffix appended to NAME when building the dataset directory and file names below.
ADDITION = "-na"
# Base dataset name; NAME + ADDITION forms the prefix of the train/val/test JSONL paths.
NAME = "crawl"

In [None]:
%%time
train_path = f"{NAME}{ADDITION}/{NAME}{ADDITION}-train.jsonl"
val_path = f"{NAME}{ADDITION}/{NAME}{ADDITION}-val.jsonl"
test_path = f"{NAME}{ADDITION}/{NAME}{ADDITION}-test.jsonl"

# The files are read as JSON Lines (one record per line), hence lines=True.
train = pd.read_json(path_or_buf=train_path, lines=True)
val = pd.read_json(path_or_buf=val_path, lines=True)
test = pd.read_json(path_or_buf=test_path, lines=True)

# Label -> frame. These are the same objects as train/val/test, so a column
# added through this mapping is visible on those variables as well.
splits = {"Train": train, "Val": val, "Test": test}

print("Total instances")
print("---------------")
for label, frame in splits.items():
    print(f"{label}:", frame.shape)
print()

# answers["answer_start"][0] is the first answer offset of an instance; rows
# where it equals -1 are reported below as the instances without an answer.
# Mapping over the single "answers" column avoids building a Series per row,
# which is what a row-wise DataFrame.apply(axis=1) would do.
negatives = {}
for label, frame in splits.items():
    frame["answer_start"] = frame["answers"].map(lambda answers: answers["answer_start"][0])
    negatives[label] = frame[frame["answer_start"] == -1]

train_na = negatives["Train"]
val_na = negatives["Val"]
test_na = negatives["Test"]

print("Negative Instances (= Instance with no answers)")
print("-------------------------")
for label, frame in negatives.items():
    print(f"{label}_na:", frame.shape)
print()

# orig_id identifies the address an instance belongs to, so the number of
# distinct orig_id values is the number of addresses in a split.
print("Total addresses")
print("---------------")
for label, frame in splits.items():
    print(f"{label}:", len(frame["orig_id"].unique()))
print()

print("Negative Addresses (= addresses with no answers)")
print("-------------------------")
for label, frame in negatives.items():
    print(f"{label}:", len(frame["orig_id"].unique()))
print()

## Search for duplicates

In [None]:
import pandas as pd

# Suffix appended to NAME when building the dataset directory and file names below.
ADDITION = "-na"
# Base dataset name; NAME + ADDITION forms the prefix of the train/val/test JSONL paths.
NAME = "crawl"

In [None]:
%%time
# Each split is stored as JSON Lines under "<NAME><ADDITION>/"; resolve the
# path and load the frame one split at a time.
train_path = f"{NAME}{ADDITION}/{NAME}{ADDITION}-train.jsonl"
train = pd.read_json(train_path, lines=True)

val_path = f"{NAME}{ADDITION}/{NAME}{ADDITION}-val.jsonl"
val = pd.read_json(val_path, lines=True)

test_path = f"{NAME}{ADDITION}/{NAME}{ADDITION}-test.jsonl"
test = pd.read_json(test_path, lines=True)

In [None]:
# Stack the three splits. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent and keeps the original row labels.
alldft = pd.concat([train, val])
alldf = pd.concat([alldft, test])

# Unique contexts across ALL splits. The .copy() makes the result an
# independent frame, so adding "length" does not write into a slice of alldf.
alltext = alldf[["context"]]
alltext_unique = alltext.drop_duplicates().copy()
alltext_unique["length"] = alltext_unique["context"].map(len)

# Unique contexts WITHIN each split.
traintext = train[["context"]]
traintext_unique = traintext.drop_duplicates()

valtext = val[["context"]]
valtext_unique = valtext.drop_duplicates()

testtext = test[["context"]]
testtext_unique = testtext.drop_duplicates()

# If this sum is larger than the number of globally unique contexts, some
# context occurs in more than one split.
print(traintext_unique.shape[0] + valtext_unique.shape[0] + testtext_unique.shape[0])

tmp = pd.concat([traintext_unique, valtext_unique])
combined_text = pd.concat([tmp, testtext_unique])

# combined_text has no "length" column yet, so recompute it for every row
# after stacking it under the globally unique contexts.
new = pd.concat([alltext_unique, combined_text])
new["length"] = new["context"].map(len)
new.drop_duplicates()

In [None]:
# Show the de-duplicated contexts ordered from shortest to longest.
deduplicated = new.drop_duplicates()
deduplicated.sort_values(by="length")

In [None]:
# For every per-split unique context, check whether it also appears among the
# globally unique contexts, then tally the True/False outcomes.
is_known_context = combined_text["context"].isin(alltext_unique["context"])
is_known_context.value_counts()

## Language distribution

In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Dataset name and variant suffix; together they form the directory and file prefix.
NAME = "crawl"
ADDITION = "-na"

# This section loads only the training split (JSON Lines, one record per line).
train_path = f"{NAME}{ADDITION}/{NAME}{ADDITION}-train.jsonl"
train = pd.read_json(train_path, lines=True)
print(train.shape)

In [None]:
%%time
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open("testcases.json", "r", encoding="utf-8") as f:
    testcases = json.load(f)

# The file is fully parsed at this point, so the filtering happens after the
# handle has been closed. Only test cases whose "source" belongs to the
# selected dataset are kept.
if NAME.lower() == "crawl":
    wanted_sources = ("CRAWL",)
else:
    wanted_sources = ("CRAWL", "EMAIL", "GRAB", "SCAN")

testcases = [tc for tc in testcases if tc["source"] in wanted_sources]
print(len(testcases))

In [None]:
%%time

# Register tqdm's progress_* variants (e.g. progress_apply) on pandas objects.
tqdm.pandas()

# Map each test case id to its country code. Despite its name, `langs` holds
# country codes; it is looked up by orig_id further down.
langs = {tc["id"]: tc["country_code"] for tc in testcases}

def match_country(orig_id, langs):
    """Look up the country code recorded for ``orig_id``.

    ``langs`` is indexed with ``orig_id``; with a plain ``dict`` an unknown id
    raises ``KeyError``.
    """
    country_code = langs[orig_id]
    return country_code

# Only orig_id is needed for the lookup, so map over that single column
# (with a progress bar) instead of materialising every row via apply(axis=1).
train["country"] = train["orig_id"].progress_map(lambda orig_id: match_country(orig_id, langs))

In [None]:
# Count the instances per country once and reuse the result for every figure.
country_counts = train["country"].value_counts()

print("Number of different countries:", len(country_counts))
print("Sum of the instances:", country_counts.sum())
print("------")
# value_counts sorts by frequency, so these are the ten most common countries.
country_counts.head(10)

In [None]:
# Drop the instances whose country code is the empty string, then plot the
# ten most frequent remaining countries as a bar chart.
has_country = train["country"] != ""
train = train[has_country]
train["country"].value_counts().head(10).plot.bar()

## Position of address blocks within imprints of the crawl-na train split

In [None]:
import pandas as pd
import numpy as np

In [None]:
%%time
# The path has no placeholders, so it is a plain string literal rather than an f-string.
train = pd.read_json(path_or_buf="../data/crawl-na/crawl-na-train.jsonl", lines=True)

In [None]:
def get_relative_position(row):
    """Return where ``row.fixed`` starts inside ``row.context``, relative to the context length.

    The result lies in ``[0, 1)`` when ``row.fixed`` occurs in ``row.context``.
    ``NaN`` is returned when it does not occur or when the context is empty,
    so that such rows do not show up as small negative positions. pandas
    reductions such as ``median`` and ``describe`` skip ``NaN`` by default;
    NumPy's non-nan-aware reductions propagate it.
    """
    context = row.context
    fixed = row.fixed

    # str.find signals "not found" with -1, which would otherwise turn into a
    # negative fraction; an empty context would divide by zero.
    start = context.find(fixed) if context else -1
    if start < 0:
        return float("nan")

    return start / len(context)

# Row-wise pass: get_relative_position reads both the context and fixed fields of a row.
train["relative_position"] = train.apply(get_relative_position, axis=1)

In [None]:
# Median relative start position of `fixed` within its context.
train["relative_position"].median()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the relative positions.
train["relative_position"].describe()

In [None]:
# 66th percentile of the relative positions. Series.quantile uses linear
# interpolation by default (like np.percentile), skips NaN values, and avoids
# the Series -> dict -> list round trip, which would collapse duplicate index labels.
train["relative_position"].quantile(0.66)