# Create crawl variants

## 1. Create crawl-thin (= crawl without HTML noise)

In [None]:
import json
import numpy as np
import pandas as pd

from tqdm import tqdm

In [None]:
%%time

# Dataset naming scheme: "<NAME><ADDITION>/<NAME><ADDITION>-<split>.jsonl".
NAME = "crawl"
ADDITION = "-na"

# One path per split, built from the shared naming scheme.
train_path, val_path, test_path = (
    f"{NAME}{ADDITION}/{NAME}{ADDITION}-{split}.jsonl"
    for split in ("train", "val", "test")
)

# Every split is a JSON Lines file; read each into its own DataFrame.
train, val, test = (
    pd.read_json(path_or_buf=split_path, lines=True)
    for split_path in (train_path, val_path, test_path)
)

### Fixed text is the new context

In [None]:
# Discard the original "context" column of every split ...
train, val, test = (
    frame.drop(columns="context") for frame in (train, val, test)
)

# ... and let the "fixed" column take over the "context" name.
new_train, new_val, new_test = (
    frame.rename(columns={'fixed': 'context'}) for frame in (train, val, test)
)

In [None]:
# Preview the first rows of the training split after the column swap.
new_train.head()

In [None]:
def get_df_list(df):
    """Turn a QA DataFrame into a list of row dicts with recomputed answer offsets.

    For every row the first entry of ``answers["text"]`` is inspected:

    * If it equals the sentinel ``"EMPTY"``, the row is kept unchanged.
    * If it occurs in the row's ``context``, the row is kept and its
      ``answers`` value is replaced by a dict holding the original ``text``
      list and an ``answer_start`` list containing the index of the first
      occurrence of that answer in the context.
    * Otherwise a message containing the row's ``id`` is printed and the row
      is dropped.

    The input DataFrame is never modified: the new ``answers`` value is
    stored on a per-row dict rather than on the Series yielded by
    ``iterrows()``, which may be a view onto the frame's data.

    Args:
        df: DataFrame with ``context`` and ``answers`` columns (and an ``id``
            column, used only for the diagnostic message). Each ``answers``
            value must be convertible with ``dict()`` and contain a
            non-empty ``"text"`` sequence.

    Returns:
        A list with one ``{column: value}`` dict per kept row, in row order.
    """
    df_list = []

    for _, row in df.iterrows():
        answers = dict(row["answers"])
        # NOTE(review): only the first answer text is checked and re-indexed;
        # rows with several answer texts still get a single answer_start.
        answer = answers["text"][0]
        context = row["context"]
        has_answer = answer != "EMPTY"

        if has_answer and answer not in context:
            print(f"{row['id']} answer is not in context.")
            continue

        # Copy the row into a plain dict before touching "answers" so the
        # write can never propagate back into ``df``.
        record = dict(row)
        if has_answer:
            record["answers"] = {
                "text": answers["text"],
                "answer_start": [context.find(answer)],
            }

        df_list.append(record)

    return df_list

# Convert each split (train, val, test - in that order) into a list of row dicts.
new_train_list, new_val_list, new_test_list = (
    get_df_list(split_frame) for split_frame in (new_train, new_val, new_test)
)

#### Save JSONL files

In [None]:
# Common prefix of the output files: "<output_dir>-<split>.jsonl".
output_dir = "crawl-thin-na/crawl-thin-na"

# Write every split as JSON Lines (one object per line). The encoding is
# pinned to UTF-8 because ensure_ascii=False emits non-ASCII characters
# verbatim, and open() would otherwise use the platform's locale encoding.
for split_name, records in (
    ("train", new_train_list),
    ("val", new_val_list),
    ("test", new_test_list),
):
    with open(f"{output_dir}-{split_name}.jsonl", "w", encoding="utf-8") as f:
        for element in records:
            f.write(json.dumps(element, ensure_ascii=False))
            f.write("\n")

## 2. Create Crawl-Synth

In [None]:
import json
import pandas as pd
import random

from tqdm import tqdm

from dataset_utils import combine_splits

# Name parts used to build the "<NAME><ADDITION>/<NAME><ADDITION>-..." paths below.
ADDITION = "-na"
NAME = "crawl"

In [None]:
# Training split of the base dataset, stored as a JSON Lines file.
old_train_path = "/".join((NAME + ADDITION, NAME + ADDITION + "-train.jsonl"))
old_train = pd.read_json(old_train_path, lines=True)

In [None]:
# combine_splits is a project helper imported from dataset_utils.
# NOTE(review): its result is presumably keyed by original document id, since
# its keys are compared against "orig_id" further down - confirm.
combined_scans = combine_splits("synthetic/scan/scan")

In [None]:
# Inspect the keys of the combined scan data.
combined_scans.keys()

In [None]:
# Keep only those synthetic training instances whose "orig_id" is one of the
# keys of combined_scans.
only_synth_train = []

# Look the keys up once instead of on every line of the input file.
scan_ids = combined_scans.keys()

# Read as UTF-8 explicitly: open() would otherwise decode with the platform's
# locale encoding, which can corrupt or reject non-ASCII JSON text.
with open(
    f"{NAME}{ADDITION}/{NAME}{ADDITION}-synth-train-whole.jsonl",
    "r",
    encoding="utf-8",
) as f:
    for line in tqdm(f, desc="Parse big jsonl"):
        instance = json.loads(line)
        if instance["orig_id"] in scan_ids:
            only_synth_train.append(instance)

In [None]:
# Number of synthetic instances that survived the filter.
len(only_synth_train)

In [None]:
# Write the filtered instances as JSON Lines. UTF-8 is set explicitly because
# ensure_ascii=False writes non-ASCII characters verbatim; plain "w" is enough
# since the file is never read back through this handle.
with open(
    f"{NAME}{ADDITION}/{NAME}{ADDITION}-synth-train-only.jsonl",
    "w",
    encoding="utf-8",
) as f:
    for element in only_synth_train:
        f.write(json.dumps(element, ensure_ascii=False))
        f.write("\n")

## 3. Extract crawl ids

In [None]:
import json
import pandas as pd

In [None]:
# Load the three splits of the crawl-na dataset (JSON Lines files).
path = "crawl-na/crawl-na"
train = pd.read_json(path_or_buf=f"{path}-train.jsonl", lines=True)
val = pd.read_json(path_or_buf=f"{path}-val.jsonl", lines=True)
test = pd.read_json(path_or_buf=f"{path}-test.jsonl", lines=True)

# Stack all splits into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
all_df = pd.concat([train, val, test], ignore_index=True)

In [None]:
# Leakage check between splits: print "ja" once for every test row whose
# orig_id also occurs in the validation split.
# The validation ids are collected into a set once, rather than rebuilding
# and scanning a list on every iteration.
val_orig_ids = set(val.orig_id.tolist())
for i in test.orig_id.tolist():
    if i in val_orig_ids:
        print("ja")

In [None]:
# (rows, columns) of the combined frame.
all_df.shape

In [None]:
# Distinct orig_id values across all splits, in ascending order.
unique_ids = sorted({*all_df["orig_id"].tolist()})

In [None]:
# Dump the ids to a text file, one id per line.
with open("knowledge/type-data/dataset-ids.txt", 'w') as output:
    output.writelines(str(row) + '\n' for row in unique_ids)