In [1]:
import pandas as pd
import os
from tqdm import tqdm
from utils import avg, evidence_to_mask

def to_data_df(df, docs_df):
    """Convert annotation rows into a flat DataFrame of text/label/rationale/query.

    For every row of ``df`` the document whose ``docid`` equals the row's
    ``annotation_id`` is fetched from ``docs_df``, split on whitespace, and
    handed together with the row's first evidence group to
    ``evidence_to_mask`` (imported from ``utils``; presumably returns one mask
    value per token -- confirm against that helper).

    Args:
        df: DataFrame with ``annotation_id``, ``query``, ``evidences`` and
            ``classification`` columns. Rows are visited by position, so the
            frame may carry any index (e.g. after filtering).
        docs_df: DataFrame with ``docid`` and ``document`` columns. When a
            docid occurs more than once, its first occurrence is used.

    Returns:
        DataFrame with columns ``text``, ``classification``, ``rationale`` and
        ``query``, one row per input row in input order. ``text`` is the
        document followed by a single space; ``query`` has every ``"[sep]"``
        replaced by ``"[SEP]"``.

    Raises:
        KeyError: if a row's ``annotation_id`` has no matching ``docid``.
    """
    columns = ['text', 'classification', 'rationale', 'query']

    # Build the docid -> document lookup once instead of boolean-scanning
    # docs_df for every annotation row. setdefault keeps the first occurrence
    # of a duplicated docid, matching a "first match wins" lookup.
    documents = {}
    for docid, document in zip(docs_df['docid'], docs_df['document']):
        documents.setdefault(docid, document)

    rows = []
    for i in tqdm(range(len(df))):
        # Positional access: label-based .loc[i] fails on non-default indexes.
        df_row = df.iloc[i]
        doc_id = df_row['annotation_id']
        if doc_id not in documents:
            raise KeyError(
                f"no document with docid {doc_id!r} for annotation row {i}"
            )
        text = documents[doc_id]

        # Only the first evidence group of the row is used for the mask.
        evidence_list = df_row['evidences'][0]
        rationale_mask = evidence_to_mask(text.split(), evidence_list)

        query = df_row['query'].replace("[sep]", "[SEP]")

        # The stored text keeps a trailing space after the document.
        rows.append([text + " ", df_row['classification'], rationale_mask, query])

    return pd.DataFrame(rows, columns=columns)

In [2]:
# Name of the dataset to convert. It selects the input directory
# (../data/<dataset>) and the directory the CSV files are written to (<dataset>/).
dataset = "cose"

In [3]:
# Load the three annotation splits and the document collection; every file is
# in JSON Lines format (one JSON object per line).
data_dir = f'../data/{dataset}'
train, test, val, docs = map(
    lambda jsonl_path: pd.read_json(jsonl_path, lines=True),
    (
        f'{data_dir}/train.jsonl',
        f'{data_dir}/test.jsonl',
        f'{data_dir}/val.jsonl',
        f'{data_dir}/docs.jsonl',
    ),
)

In [4]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# per-dataset output directory exists before writing into it.
os.makedirs(dataset, exist_ok=True)

# Convert each split and write it out; the row index is stored in an "id" column.
train_data_df = to_data_df(train, docs)
train_data_df.to_csv(f"{dataset}/train.csv",index_label="id")
test_data_df = to_data_df(test, docs)
test_data_df.to_csv(f"{dataset}/test.csv",index_label="id")
val_data_df = to_data_df(val, docs)
val_data_df.to_csv(f"{dataset}/val.csv",index_label="id")

100%|█████████████████████████████████████████████████████████████████████████████| 8752/8752 [00:52<00:00, 167.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1079/1079 [00:03<00:00, 293.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1086/1086 [00:03<00:00, 301.15it/s]


In [12]:
# Peek at the text of the converted training row labelled 0.
train_data_df['text'][0]

'What group of old testimant believers gather in a place called a temple to worship ?'

# SANITY CHECK

In [38]:
# Run the sanity check on the converted test split. This binds a second name to
# the same DataFrame object; it is not a copy.
data_df = test_data_df
# NOTE(review): json is not referenced in any cell visible here -- confirm
# whether this import is still needed.
import json

def reduce_by_alpha(text, rationale, fidelity_type="sufficiency"):
    """Keep only the rationale tokens of ``text``, or only the non-rationale ones.

    ``text`` is split on whitespace and token ``i`` is treated as part of the
    rationale when ``rationale[i] >= 0.5``.

    Args:
        text: Input string; tokenized with ``str.split()``.
        rationale: Sequence of per-token scores, indexable and with a length.
            It may be shorter than the token list: tokens without a score are
            treated as non-rationale tokens.
        fidelity_type: ``"sufficiency"`` keeps tokens whose score is
            ``>= 0.5``; ``"comprehensiveness"`` keeps tokens whose score is
            ``< 0.5`` plus any tokens that have no score.

    Returns:
        The kept tokens joined by single spaces (``""`` if none are kept).

    Raises:
        ValueError: if ``fidelity_type`` is not one of the two supported modes
            (previously this silently produced an empty string).
        TypeError: if a score cannot be compared with a number, e.g. when
            ``rationale`` is still the string form read back from a CSV file
            (previously swallowed, giving silently wrong output).
    """
    if fidelity_type not in ("sufficiency", "comprehensiveness"):
        raise ValueError(
            f"unknown fidelity_type {fidelity_type!r}; "
            "expected 'sufficiency' or 'comprehensiveness'"
        )
    keep_rationale = fidelity_type == "sufficiency"

    # whitespace tokenization
    tokens = text.split()
    n_scored = len(rationale)

    kept = []
    for idx, token in enumerate(tokens):
        if idx >= n_scored:
            # No score for this token: it counts as non-rationale.
            keep = not keep_rationale
        elif keep_rationale:
            keep = rationale[idx] >= 0.5
        else:
            keep = rationale[idx] < 0.5
        if keep:
            kept.append(token)

    return " ".join(kept)

# Keep only rows that have a rationale. Chaining reset_index (instead of calling
# it with inplace=True on the filtered slice) yields an independent frame, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
data_df = data_df[data_df['rationale'].notna()].reset_index(drop=True)

# Add one reduced-text column per fidelity type:
# "sufficiency_text" and "comprehensiveness_text".
for fidelity_type in ("sufficiency", "comprehensiveness"):
    data_df[f"{fidelity_type}_text"] = [
        reduce_by_alpha(text, rationale, fidelity_type=fidelity_type)
        for text, rationale in zip(data_df["text"], data_df["rationale"])
    ]

In [39]:
# Show the sufficiency text computed for the row labelled 0.
data_df['sufficiency_text'][0]

"are black widow and red back spiders the same [SEP] Section::::Placement . A member of the genus Latrodectus in the family Theridiidae , the redback belongs in a clade with the black widow spider , with the katipo as its closest relative . A 2004 molecular study supports the redback 's status as a distinct species , as does the unique abdomen - presenting behaviour of the male during mating . The close relationship between the two species is shown when mating : the male redback is able to successfully mate with a female katipo producing hybrid offspring . However , the male katipo is too heavy to mate with the female redback , as it triggers a predatory response in the female when it approaches the web , causing the female to eat it . There is evidence of interbreeding between female katipo and male redbacks in the wild"

In [36]:
# Show the raw evidence annotations of the test row labelled 0.
test["evidences"][0]

[[{'docid': 'BH_wiki_60_0',
   'end_sentence': 189,
   'end_token': 4179,
   'start_sentence': 181,
   'start_token': 3930,
   'text': 'Section::::Energy balance . All biomass goes through at least some of these steps : it needs to be grown , collected , dried , fermented , distilled , and burned . All of these steps require resources and an infrastructure . The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance ( or " energy returned on energy invested " ) . Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US : one unit of fossil - fuel energy is required to create 1.3 energy units from the resulting ethanol . The energy balance for sugarcane ethanol produced in Brazil is more favorable , with one unit of fossil - fuel energy required to create 8 from the ethanol . Energy balance estimates are not easily produced , th

In [28]:
# Show the text of the first converted test row.
test_data_df.iloc[0]['text']

"[CLS] Eating is part of living , but your body does n't use it all and the next day you will be doing what ? [SEP] (A) reduced [SEP] (B) getting full [SEP] (C) becoming full [SEP] (D) chewing [SEP] (E) defecating [SEP]"

# DATA STATS

In [1]:
def generate_class_stats(train_df, test_df, val_df, negative_label="REFUTES"):
    """Collect per-class length statistics over the three splits.

    Rows are pooled across ``train_df``, ``test_df`` and ``val_df`` (in that
    order) and split into two buckets: class 0 for rows whose
    ``classification`` equals ``negative_label`` and class 1 for all others.

    Args:
        train_df, test_df, val_df: DataFrames with ``classification``,
            ``text`` and ``rationale`` columns. ``rationale`` values must
            provide ``.count(1)`` (e.g. a list of 0/1 values). Rows are read
            by position, so any index is accepted.
        negative_label: Classification value assigned to bucket 0.

    Returns:
        Dict with, for each of ``text_lens``, ``rationale_lens`` and
        ``rationale_percent``, the keys ``<name>_0``, ``<name>_1`` and
        ``<name>_all`` (class-0 values followed by class-1 values), plus
        ``class_distr``: the fraction of rows in each bucket.
        ``rationale_percent`` is the number of 1s in the rationale divided by
        the whitespace token count of the text; it is 0.0 for empty text, and
        ``class_distr`` is ``[0.0, 0.0]`` when there are no rows at all.
    """
    # Index 0 holds the negative_label bucket, index 1 everything else.
    text_lens = [[], []]
    rationale_lens = [[], []]
    rationale_percents = [[], []]

    for df in (train_df, test_df, val_df):
        # Iterate columns positionally rather than via df.loc[i], which only
        # works when the frame has a default 0..n-1 index.
        for clas, text, rationale in zip(df['classification'], df['text'], df['rationale']):
            bucket = 0 if clas == negative_label else 1

            text_len = len(text.split())
            rationale_len = rationale.count(1)
            # Guard against empty text instead of raising ZeroDivisionError.
            rationale_percent = rationale_len / text_len if text_len else 0.0

            text_lens[bucket].append(text_len)
            rationale_lens[bucket].append(rationale_len)
            rationale_percents[bucket].append(rationale_percent)

    class_counts = [len(text_lens[0]), len(text_lens[1])]
    total = sum(class_counts)
    class_distr = [count / total if total else 0.0 for count in class_counts]

    all_stats = {"text_lens_0": text_lens[0],
                 "text_lens_1": text_lens[1],
                 "text_lens_all": text_lens[0] + text_lens[1],
                 "rationale_lens_0": rationale_lens[0],
                 "rationale_lens_1": rationale_lens[1],
                 "rationale_lens_all": rationale_lens[0] + rationale_lens[1],
                 "rationale_percent_0": rationale_percents[0],
                 "rationale_percent_1": rationale_percents[1],
                 "rationale_percent_all": rationale_percents[0] + rationale_percents[1],
                 "class_distr": class_distr
                }
    return all_stats

In [None]:
all_stats = generate_class_stats(train_data_df,test_data_df,val_data_df)
# The loop variables are deliberately not named `val`: that name holds the
# validation DataFrame loaded earlier, and rebinding it here would break any
# re-run of the cell that converts the validation split.
for stat_name, stat_values in all_stats.items():
    print(f"{stat_name}: {avg(stat_values)}")
print(all_stats["class_distr"])