In [20]:
import pandas as pd
import os
from tqdm import tqdm
from utils import avg
from utils import evidence_to_mask

def to_data_df(df, data_dir, random_state=None):
    """Build a shuffled frame of ``query [SEP] document`` texts with rationale masks.

    For every row of ``df`` the documents listed in ``docids`` are read from
    ``{data_dir}/docs/``, flattened into one space-separated string and
    prefixed with the row's query and a ``[SEP]`` marker.

    Args:
        df: Frame providing the ``docids``, ``query``, ``evidences`` and
            ``classification`` columns. Rows are visited by position, so the
            frame does not need a default integer index.
        data_dir: Dataset directory that contains the ``docs`` sub-folder.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible shuffle. The default ``None`` keeps the shuffle
            unseeded.

    Returns:
        A DataFrame with the columns ``text``, ``classification`` and
        ``rationale``, with rows shuffled and the index reset to 0..n-1.

    Raises:
        FileNotFoundError: If a document referenced by ``docids`` does not
            exist under ``{data_dir}/docs/``.
    """
    columns = ['text', 'classification', 'rationale']
    rows = []
    for i in tqdm(range(len(df))):
        # Positional access: works whatever index the frame carries.
        df_row = df.iloc[i]
        doc_ids = df_row['docids']
        query = df_row['query']
        evidence_list = df_row['evidences']
        if evidence_list:
            # Flatten the nested evidence groups into a single flat list.
            evidence_list = [x for xx in evidence_list for x in xx]
        classification = df_row['classification']

        text = ''
        for doc in doc_ids:
            file = f'{data_dir}/docs/{doc}'
            if not os.path.isfile(file):
                # Raise instead of calling quit(): the error names the missing
                # file and the interpreter/kernel session stays alive.
                raise FileNotFoundError(f"Document file not found: {file}")
            # The context manager closes the handle even if reading fails.
            with open(file, 'r', encoding="utf-8") as f:
                text += ''.join(line.rstrip() + ' ' for line in f)

        tokens = text.split()
        # evidence_to_mask comes from utils; presumably it returns one mask
        # entry per whitespace token -- TODO confirm against utils.
        rationale_mask = evidence_to_mask(tokens, evidence_list)

        # joining text and query with [SEP]
        QA = f"{query} [SEP] {text}"
        # Prepend a 1 for every query token plus one for the [SEP] marker so
        # the mask lines up with the whitespace tokens of QA.
        rationale_mask = [1] * (len(query.split()) + 1) + rationale_mask

        rows.append([QA, classification, rationale_mask])

    data_df = pd.DataFrame(rows, columns=columns)
    return data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [21]:
# Name of the dataset to convert; it is interpolated into the input and output paths.
dataset = "boolq"

In [22]:
# Root folder of the selected dataset.
data_dir = f'../data/{dataset}'

# Read the three JSON-lines splits in train / test / val order.
train, test, val = (
    pd.read_json(f'{data_dir}/{split}.jsonl', lines=True)
    for split in ('train', 'test', 'val')
)

In [23]:
# Print the column names, dtypes and non-null counts of the training split.
train.info()
# Uncomment to inspect the other splits the same way:
# test.info()
# val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6363 entries, 0 to 6362
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   annotation_id   6363 non-null   int64  
 1   classification  6363 non-null   object 
 2   docids          6363 non-null   object 
 3   evidences       6363 non-null   object 
 4   query           6363 non-null   object 
 5   query_type      0 non-null      float64
dtypes: float64(1), int64(1), object(4)
memory usage: 298.4+ KB


In [None]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before the first split is written.
output_dir = f"others/{dataset}"
os.makedirs(output_dir, exist_ok=True)

# Convert each split and write it out with the row index stored as "id".
train_data_df = to_data_df(train, data_dir)
train_data_df.to_csv(f"{output_dir}/train.csv", index_label="id")
test_data_df = to_data_df(test, data_dir)
test_data_df.to_csv(f"{output_dir}/test.csv", index_label="id")
val_data_df = to_data_df(val, data_dir)
val_data_df.to_csv(f"{output_dir}/val.csv", index_label="id")

100%|█████████████████████████████████████████████████████████████████████████████| 6363/6363 [00:19<00:00, 330.91it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2807/2807 [00:07<00:00, 351.25it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1491/1491 [00:04<00:00, 363.10it/s]


# SANITY CHECK

In [10]:
# Run the sanity check on the converted test split.
data_df = test_data_df
import json

def reduce_by_alpha(text, rationale, fidelity_type="sufficiency"):
    """Filter the whitespace tokens of ``text`` by their rationale score.

    Args:
        text: String that is split on whitespace into tokens.
        rationale: Indexable of per-token scores, looked up by token position.
        fidelity_type: ``"sufficiency"`` keeps tokens whose score is >= 0.5;
            ``"comprehensiveness"`` keeps tokens whose score is < 0.5. Any
            other value keeps nothing.

    Returns:
        The kept tokens joined by single spaces ("" when nothing is kept).

    Note:
        Any error raised while looking up or comparing a score (for example a
        rationale shorter than the token list) is swallowed: the token is then
        kept for ``"comprehensiveness"`` and dropped otherwise.
    """
    keep_rationale = fidelity_type == "sufficiency"
    keep_complement = fidelity_type == "comprehensiveness"

    # whitespace tokenization
    tokens = text.split()
    if not (keep_rationale or keep_complement):
        # Unknown fidelity type: no token can be selected.
        return ""

    kept = []
    for position, token in enumerate(tokens):
        try:
            score = rationale[position]
            selected = bool(score >= 0.5) if keep_rationale else bool(score < 0.5)
        except Exception:
            # Best-effort fallback: tokens without a usable score count as
            # non-rationale tokens.
            selected = keep_complement
        if selected:
            kept.append(token)

    # Joining avoids the trailing space the tokens would otherwise carry.
    return " ".join(kept)

# Keep only rows that carry a rationale. Chaining a non-inplace reset_index
# yields a fresh frame, so the column assignments below do not write into a
# filtered slice (which can trigger pandas' SettingWithCopyWarning).
data_df = data_df[data_df['rationale'].notna()].reset_index(drop=True)

# Text reduced to the tokens whose rationale score is >= 0.5.
data_df["sufficiency_text"] = [
    reduce_by_alpha(text, rationale, fidelity_type="sufficiency")
    for text, rationale in zip(data_df["text"], data_df["rationale"])
]
# Text reduced to the tokens whose rationale score is < 0.5.
data_df["comprehensiveness_text"] = [
    reduce_by_alpha(text, rationale, fidelity_type="comprehensiveness")
    for text, rationale in zip(data_df["text"], data_df["rationale"])
]

In [15]:
# Display the label column of the sanity-check frame.
data_df['classification']

Unnamed: 0,text,classification,rationale,sufficiency_text,comprehensiveness_text
0,does ethanol take more energy make that produc...,False,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",does ethanol take more energy make that produc...,"ETHANOL FUEL Ethanol fuel is ethyl alcohol , t..."
1,is house tax and property tax are same [SEP] P...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",is house tax and property tax are same [SEP] S...,PROPERTY TAX A house tax or millage rate is an...
2,is pain experienced in a missing body part or ...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",is pain experienced in a missing body part or ...,. Sensations are recorded most frequently foll...
3,is harry potter and the escape from gringotts ...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",is harry potter and the escape from gringotts ...,HARRY POTTER AND THE ESCAPE . HISTORY Section:...
4,is there a difference between hydroxyzine hcl ...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",is there a difference between hydroxyzine hcl ...,"HYDROXYZINE Hydroxyzine , sold under the brand..."
...,...,...,...,...,...
2802,do exoplanetary systems follow the titus bode ...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",do exoplanetary systems follow the titus bode ...,TITIUS – BODE LAW The Titius – Bode law ( some...
2803,is manic depression the same as bi polar [SEP]...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",is manic depression the same as bi polar [SEP]...,associated with bipolar disorder . The causes ...
2804,was whiskey galore based on a true story [SEP]...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, ...",was whiskey galore based on a true story [SEP]...,SS POLITICIAN + Galore ! . ORIGIN Section::::O...
2805,are there plants on the international space st...,True,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",are there plants on the international space st...,PLANTS IN SPACE Plants in space is about plant...


In [19]:
# Number of training rows whose label is the string "False".
train_data_df['classification'].tolist().count("False")

2409