In [120]:
import pandas as pd
import os
from tqdm import tqdm
from utils import avg, evidence_to_mask

def to_data_df(df, data_dir, random_state=None):
    """Build a shuffled table of (text, classification, rationale) examples.

    For every row of ``df`` the documents listed in ``docids`` are read from
    ``{data_dir}/docs/``; each line is right-stripped and followed by a single
    space.  The resulting document text is appended to the row's query as
    ``"{query} [SEP] {text}"``.  The mask returned by ``evidence_to_mask`` for
    the document tokens is prefixed with ones covering the query tokens and
    the ``[SEP]`` token.

    Args:
        df: DataFrame with ``docids``, ``query``, ``evidences`` and
            ``classification`` columns.
        data_dir: Dataset directory that contains a ``docs`` sub-directory.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced.  ``None`` (the default) keeps the
            original unseeded shuffle.

    Returns:
        DataFrame with columns ``text``, ``classification`` and ``rationale``,
        rows shuffled and the index reset to 0..n-1.

    Raises:
        FileNotFoundError: If a document referenced by ``docids`` is missing.
    """
    columns = ['text', 'classification', 'rationale']
    rows = []
    # Positional access, so the loop does not depend on df having a 0..n-1 index.
    for i in tqdm(range(len(df))):
        df_row = df.iloc[i]
        query = df_row['query']
        classification = df_row['classification']
        evidence_list = df_row['evidences']
        if evidence_list:
            # Flatten the list of evidence groups into a single list of spans.
            evidence_list = [x for xx in evidence_list for x in xx]

        pieces = []
        for doc in df_row['docids']:
            file = f'{data_dir}/docs/{doc}'
            if not os.path.isfile(file):
                # Fail loudly instead of calling quit(), which would take the
                # whole notebook kernel down without a traceback.
                raise FileNotFoundError(f"Document file not found: {file}")
            # The context manager closes the handle even if reading fails.
            with open(file, 'r', encoding="utf-8") as f:
                for line in f:
                    pieces.append(line.rstrip() + ' ')
        text = ''.join(pieces)

        tokens = text.split()
        rationale_mask = evidence_to_mask(tokens, evidence_list)

        # Join query and text with [SEP]; the query tokens and the [SEP]
        # token (hence the +1) are always marked as rationale.
        QA = f"{query} [SEP] {text}"
        rationale_mask = [1] * (len(query.split()) + 1) + rationale_mask

        rows.append([QA, classification, rationale_mask])

    data_df = pd.DataFrame(rows, columns=columns)
    # Shuffle all rows and renumber them from zero.
    return data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [117]:
# Name of the dataset to process; the cells below use it to build the input and output paths.
dataset = "scifact"

In [118]:
# Locate the dataset directory and load its three JSON-lines splits
# (read in the order train, test, val).
data_dir = f'../data/{dataset}'
train, test, val = (
    pd.read_json(f'{data_dir}/{split}.jsonl', lines=True)
    for split in ('train', 'test', 'val')
)

In [121]:
# Create the per-dataset output folder first: to_csv does not create missing
# directories, so a fresh checkout would otherwise fail on the first write.
os.makedirs(dataset, exist_ok=True)

# Convert each raw split and store it as CSV, using the row index as "id".
train_data_df = to_data_df(train, data_dir)
train_data_df.to_csv(f"{dataset}/train.csv", index_label="id")
test_data_df = to_data_df(test, data_dir)
test_data_df.to_csv(f"{dataset}/test.csv", index_label="id")
val_data_df = to_data_df(val, data_dir)
val_data_df.to_csv(f"{dataset}/val.csv", index_label="id")

100%|██████████████████████████████████████████████████████████████████████████████| 405/405 [00:00<00:00, 1081.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 1092.22it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 917.95it/s]


# SANITY CHECK

In [21]:
# Run the sanity check on the processed test split.
data_df = test_data_df
# NOTE(review): json does not appear to be used in the visible cells — confirm before removing.
import json

def reduce_by_alpha(text, rationale, fidelity_type="sufficiency"):
    """Filter the whitespace tokens of ``text`` by their rationale score.

    With ``fidelity_type="sufficiency"`` the tokens whose score is at least
    0.5 are kept; with ``"comprehensiveness"`` the tokens whose score is
    below 0.5 are kept.  When looking up or comparing a token's score fails
    (for example because ``rationale`` is shorter than the token list), the
    token is kept for comprehensiveness and dropped otherwise.  Any other
    ``fidelity_type`` keeps nothing.

    Returns:
        The kept tokens joined by single spaces ("" when none are kept).
    """
    kept = []
    # whitespace tokenization
    for position, token in enumerate(text.split()):
        try:
            if fidelity_type == "sufficiency":
                keep = bool(rationale[position] >= 0.5)
            elif fidelity_type == "comprehensiveness":
                keep = bool(rationale[position] < 0.5)
            else:
                keep = False
        except Exception:
            # No usable score for this token: keep it only for comprehensiveness.
            keep = fidelity_type == "comprehensiveness"
        if keep:
            kept.append(token)

    # Joining avoids the trailing space a manual concatenation would leave.
    return " ".join(kept)

# Keep only rows that have a rationale and renumber them from zero.  Building
# the frame with a non-inplace chain plus an explicit copy makes data_df an
# independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
data_df = data_df[data_df['rationale'].notna()].reset_index(drop=True).copy()

# Sufficiency text keeps tokens whose rationale score is >= 0.5;
# comprehensiveness text keeps the tokens whose score is below 0.5.
data_df["sufficiency_text"] = data_df[["text", "rationale"]].apply(
    lambda row: reduce_by_alpha(row["text"], row["rationale"], fidelity_type="sufficiency"),
    axis=1,
)
data_df["comprehensiveness_text"] = data_df[["text", "rationale"]].apply(
    lambda row: reduce_by_alpha(row["text"], row["rationale"], fidelity_type="comprehensiveness"),
    axis=1,
)

In [38]:
# Sufficiency text of the row labelled 0.
data_df.loc[0, 'sufficiency_text']

'1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants. [SEP] We propose as an alternative explanation that variants much less common than the associated one may create " synthetic associations " by occurring , stochastically , more often in association with one of the alleles at the common site versus the other allele . We show that they are not only possible , but inevitable , and that under simple but reasonable genetic models , they are likely to account for or contribute to many of the recently identified signals reported in genome-wide association studies . In conclusion , uncommon or rare genetic variants can easily create synthetic associations that are credited to common variants , and this possibility requires careful consideration in the interpretation and follow up of GWAS signals .'

In [41]:
# First record of the raw (unprocessed) test split.
# NOTE(review): to_data_df shuffles its rows, so this is not necessarily the
# same example as row 0 of data_df — confirm before comparing the two.
test.iloc[0]

annotation_id                                                     3
query             1,000 genomes project enables mapping of genet...
evidences         [[{'text': 'We propose as an alternative expla...
classification                                             SUPPORTS
query_type                                                      NaN
docids                                                   [14717500]
Name: 0, dtype: object

In [86]:
# Evidence annotations of the raw test record labelled 0.
test.loc[0, 'evidences']

[[{'text': 'We propose as an alternative explanation that variants much less common than the associated one may create " synthetic associations " by occurring , stochastically , more often in association with one of the alleles at the common site versus the other allele .',
   'docid': '14717500',
   'start_token': 77,
   'end_token': 121,
   'start_sentence': 2,
   'end_sentence': 3},
  {'text': 'We show that they are not only possible , but inevitable , and that under simple but reasonable genetic models , they are likely to account for or contribute to many of the recently identified signals reported in genome-wide association studies .',
   'docid': '14717500',
   'start_token': 169,
   'end_token': 211,
   'start_sentence': 5,
   'end_sentence': 6}],
 [{'text': 'In conclusion , uncommon or rare genetic variants can easily create synthetic associations that are credited to common variants , and this possibility requires careful consideration in the interpretation and follow up of G

# DATA STATS

In [79]:
def count_rationale_len(df):
    """Sum the annotated rationale length, in tokens, per document id.

    Args:
        df: Table with an ``evidences`` column.  Each entry is a list of
            evidence groups, and each group is a list of dicts carrying
            ``text``, ``docid``, ``start_token`` and ``end_token``.  Empty or
            ``None`` entries are skipped.

    Returns:
        dict mapping each ``docid`` to the total ``end_token - start_token``
        over all of its evidence spans.

    Raises:
        TypeError: If an evidence entry is not a dict.
        AssertionError: If a span's token range disagrees with the number of
            whitespace-separated tokens in its ``text``.
    """
    rationale_dict = {}
    for evidence_groups in df['evidences']:
        if not evidence_groups:
            # No annotated evidence for this example: nothing to count.
            continue

        for group in evidence_groups:
            for evidence in group:
                if not isinstance(evidence, dict):
                    raise TypeError(
                        f"expected evidence to be a dict, got {type(evidence).__name__}"
                    )

                rationale_len = evidence['end_token'] - evidence['start_token']
                # Sanity check: the token span must match the evidence text.
                assert (rationale_len == len(evidence['text'].split()))

                doc_id = evidence['docid']
                rationale_dict[doc_id] = rationale_dict.get(doc_id, 0) + rationale_len

    return rationale_dict

def count_text_len(data_dir):
    """Count whitespace-separated tokens in every file under ``{data_dir}/docs``.

    Args:
        data_dir: Dataset directory that contains a ``docs`` sub-directory.

    Returns:
        dict mapping each file name in ``docs`` to its token count.  Entries
        that are not regular files (e.g. sub-directories) are skipped.
    """
    def file_len(file):
        # Files are read as bytes and split on whitespace, line by line.
        # The context manager makes sure the handle is closed.
        with open(file, 'rb') as f:
            return sum(len(line.rstrip().split()) for line in f)

    docs_dir = f"{data_dir}/docs"
    text_dict = {}
    for filename in os.listdir(docs_dir):
        path = f"{docs_dir}/{filename}"
        if os.path.isfile(path):
            text_dict[filename] = file_len(path)

    return text_dict

def class_distr(train, test, val):
    """Count classification labels across the three splits.

    Args:
        train, test, val: Tables with a ``classification`` column.

    Returns:
        A ``(class_dict, distribution)`` pair: ``class_dict`` maps each label
        to its number of occurrences over all three splits (in order of first
        appearance), and ``distribution`` lists the matching fractions of the
        total in the same order.
    """
    label_counts = {}
    for split in (train, test, val):
        for label in list(split['classification']):
            label_counts[label] = label_counts.get(label, 0) + 1

    per_label = list(label_counts.values())
    grand_total = sum(per_label)
    shares = [count / grand_total for count in per_label]

    return label_counts, shares

In [138]:
def generate_class_stats(train_df, test_df, val_df):
    """Collect per-class text and rationale statistics over the three splits.

    Each row contributes its number of whitespace tokens in ``text``, the
    number of entries equal to 1 in ``rationale``, and the ratio of the two.
    Rows labelled ``"REFUTES"`` go to group 0; every other label goes to
    group 1.

    Args:
        train_df, test_df, val_df: Tables with ``classification``, ``text``
            and ``rationale`` columns (``rationale`` must support
            ``.count(1)``).

    Returns:
        Tuple ``(text_lens_0, text_lens_1, rationale_lens_0, rationale_lens_1,
        rationale_percent_0, rationale_percent_1, class_distribution)`` where
        the first six items are lists and ``class_distribution`` is the
        two-element list of row counts for group 0 and group 1.
    """
    text_lens = [[], []]
    rationale_lens = [[], []]
    rationale_percents = [[], []]
    class_distribution = [0, 0]

    for df in (train_df, test_df, val_df):
        # Iterate the columns positionally so the result does not depend on
        # the frame having a default 0..n-1 index.
        for clas, text, rationale in zip(df['classification'], df['text'], df['rationale']):
            text_len = len(text.split())
            rationale_len = rationale.count(1)
            rationale_percent = rationale_len / text_len

            group = 0 if clas == "REFUTES" else 1
            text_lens[group].append(text_len)
            rationale_lens[group].append(rationale_len)
            rationale_percents[group].append(rationale_percent)
            class_distribution[group] += 1

    return (
        text_lens[0],
        text_lens[1],
        rationale_lens[0],
        rationale_lens[1],
        rationale_percents[0],
        rationale_percents[1],
        class_distribution,
    )

In [139]:
# Per-class statistics over the three processed splits.
(
    text_lens_0,
    text_lens_1,
    rationale_lens_0,
    rationale_lens_1,
    rationale_percent_0,
    rationale_percent_1,
    class_distribution,
) = generate_class_stats(train_data_df, test_data_df, val_data_df)

# Pooled statistics: group 0 values followed by group 1 values.
text_lens_all = text_lens_0 + text_lens_1
rationale_lens_all = rationale_lens_0 + rationale_lens_1
rationale_percent_all = rationale_percent_0 + rationale_percent_1

# Fraction of examples in each of the two groups.
# NOTE(review): this rebinds `class_distr`, which is also the name of the
# function defined earlier in the notebook; the function is no longer
# callable after this cell runs.
class_distr = [count / sum(class_distribution) for count in class_distribution]