In [46]:
import pandas as pd
import os
from tqdm import tqdm
from utils import avg, evidence_to_mask

def to_data_df(df, data_dir):
    """Build a flat DataFrame of (text, classification, rationale, query) rows.

    For every row of ``df`` the referenced document is read from
    ``{data_dir}/docs/<text_id>``, whitespace-tokenised and passed to
    ``evidence_to_mask`` together with the flattened evidence list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``annotation_id``, ``query``, ``evidences``
        and ``classification``. ``query`` is split on ``"||"`` and must
        contain at least one such separator.
    data_dir : str
        Directory that contains the ``docs`` sub-directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (document text followed by the part of the query
        before the first ``"||"``), ``classification``, ``rationale`` (the
        value returned by ``evidence_to_mask`` for the document tokens only;
        the appended query part is not covered) and ``query`` (the part of the
        query after the first ``"||"``).

    Notes
    -----
    Rows whose document file does not exist are reported and skipped, so the
    result can be shorter than ``df``.
    """
    columns = ['text', 'classification', 'rationale', 'query']
    rows = []
    # Positional access: the loop counts positions, so it must not depend on
    # the frame carrying a default 0..n-1 index.
    for i in tqdm(range(len(df))):
        df_row = df.iloc[i]

        # The document name is the annotation id cut right after the first
        # "txt". If "txt" is absent the id is used unchanged (find() returns
        # -1, which would otherwise truncate the id to two characters).
        text_id = df_row['annotation_id']
        idx = text_id.find('txt')
        if idx != -1:
            text_id = text_id[:idx+3]

        query = df_row['query']
        evidence_list = df_row['evidences']
        if evidence_list:
            # Flatten one nesting level: list of lists -> flat list.
            evidence_list = [x for xx in evidence_list for x in xx]
        classification = df_row['classification']

        file = f'{data_dir}/docs/{text_id}'
        if not os.path.isfile(file):
            # Without the document there is no text to pair with this row;
            # continuing would silently reuse the previous row's text.
            print(f"??? missing document, row skipped: {file}")
            continue

        # Each line is right-stripped and followed by a single space; the
        # context manager guarantees the handle is closed.
        with open(file, 'r', encoding="utf-8") as f:
            text = ''.join(line.rstrip() + ' ' for line in f)

        tokens = text.split()
        rationale_mask = evidence_to_mask(tokens, evidence_list)

        # Only the part before "||" is appended to the text; the part after it
        # is kept in its own column. (An earlier variant appended the whole
        # query joined by "[SEP]" and extended the mask with ones for it.)
        query = query.split("||")
        QA = f"{text} {query[0]}"

        rows.append([QA, classification, rationale_mask, query[1]])
    return pd.DataFrame(rows, columns=columns)

In [47]:
# Dataset name; it is interpolated into the input directory path and into the
# output CSV paths used by later cells.
dataset = "multirc"

In [48]:
# Input directory of the selected dataset, followed by one raw frame per split.
# Every split file is read as JSON Lines (one record per line).
data_dir = f'../data/{dataset}'
train, test, val = (
    pd.read_json(f'{data_dir}/{split}.jsonl', lines=True)
    for split in ('train', 'test', 'val')
)

In [49]:
# Convert the raw test split into the flat text/classification/rationale/query frame.
test_data_df = to_data_df(test, data_dir)

100%|█████████████████████████████████████████████████████████████████████████████| 4848/4848 [00:06<00:00, 786.70it/s]


In [53]:
# Show the 'query' value of the first converted test row.
test_data_df['query'].iloc[0]

' Cannot be swayed by wealth or political or social influences'

In [36]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the per-dataset output folder exists before anything is written.
os.makedirs(dataset, exist_ok=True)

# Convert every raw split; the three frames are reused by later cells.
train_data_df = to_data_df(train, data_dir)
test_data_df = to_data_df(test, data_dir)
val_data_df = to_data_df(val, data_dir)

# One CSV per split, with the row index written under the column name "id".
for split_name, split_df in (
    ("train", train_data_df),
    ("test", test_data_df),
    ("val", val_data_df),
):
    split_df.to_csv(f"{dataset}/{split_name}.csv", index_label="id")

100%|██████████████████████████████████████████████████████████████████████████| 24029/24029 [00:19<00:00, 1256.58it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4848/4848 [00:03<00:00, 1313.82it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3214/3214 [00:02<00:00, 1126.13it/s]


# SANITY CHECK

In [60]:
def rationale_check(text,rationale):
    """Return the tokens of ``text`` selected by ``rationale``.

    ``text`` is split on whitespace; every position whose ``rationale`` entry
    is truthy contributes its token followed by one space, so a non-empty
    result ends with a trailing space. A truthy entry at a position beyond the
    last token raises ``IndexError``.
    """
    words = text.split()
    return "".join(
        words[pos] + " "
        for pos, flag in enumerate(rationale)
        if flag
    )

In [38]:
# Plain assignment: data_df refers to the same DataFrame object as
# test_data_df until it is rebound further down in this cell.
data_df = test_data_df
# NOTE(review): json is not referenced in any of the visible cells — confirm
# it is needed elsewhere before removing it.
import json

def reduce_by_alpha(text, rationale, fidelity_type="sufficiency"):
    """Filter the whitespace tokens of ``text`` by their ``rationale`` score.

    * ``"sufficiency"``: keep tokens whose score is ``>= 0.5``.
    * ``"comprehensiveness"``: keep tokens whose score is ``< 0.5``.
    * any other ``fidelity_type``: keep nothing.

    If looking up or comparing a token's score raises (for example because
    ``rationale`` is shorter than the token list), the token is kept for
    ``"comprehensiveness"`` and dropped otherwise.

    Returns the kept tokens joined by single spaces ("" when none are kept).
    """
    keep_high = fidelity_type == "sufficiency"
    keep_low = fidelity_type == "comprehensiveness"

    kept = []
    # whitespace tokenization
    for pos, token in enumerate(text.split()):
        try:
            if keep_high:
                selected = bool(rationale[pos] >= 0.5)
            elif keep_low:
                selected = bool(rationale[pos] < 0.5)
            else:
                selected = False
        except Exception:
            # Best effort: a token without a usable score only survives in
            # the comprehensiveness view.
            selected = keep_low
        if selected:
            kept.append(token)

    # Joining with single spaces equals appending "<token> " each time and
    # trimming the final space.
    return " ".join(kept)

# Keep only rows that carry a rationale. Chaining into reset_index (instead of
# inplace=True) produces a fresh frame, so the column assignments below do not
# write into a filtered slice of the source frame — the pattern behind pandas'
# SettingWithCopyWarning.
data_df = data_df[data_df['rationale'].notna()].reset_index(drop=True)

# Reduced texts for both fidelity views. Plain list comprehensions are used
# because a row-wise apply does not return a column-shaped result when the
# frame is empty.
data_df["sufficiency_text"] = [
    reduce_by_alpha(text, rationale, fidelity_type="sufficiency")
    for text, rationale in zip(data_df["text"], data_df["rationale"])
]
data_df["comprehensiveness_text"] = [
    reduce_by_alpha(text, rationale, fidelity_type="comprehensiveness")
    for text, rationale in zip(data_df["text"], data_df["rationale"])
]

In [61]:
# Spot-check: rebuild the rationale tokens of the third training row.
rationale_check(*train_data_df.iloc[2][['text', 'rationale']])

'is windows movie maker part of windows essentials [SEP] WINDOWS MOVIE MAKER Windows Movie Maker ( formerly known as Windows Live Movie Maker in Windows 7 ) was a video editing software by Microsoft . It was a part of Windows Essentials software suite and offered the ability to create and edit videos as well as to publish them on OneDrive , Facebook , Vimeo , YouTube , and Flickr '

In [32]:
# Show the 'text' value of the training row labelled 0.
train_data_df.loc[0, 'text']

'As his car slid downtown on Tuesday morning the mind of Arnold Thorndike was occupied with such details of daily routine as the purchase of a railroad , the Japanese loan , the new wing to his art gallery , and an attack that morning , in his own newspaper , upon his pet trust . But his busy mind was not too occupied to return the salutes of the traffic policemen who cleared the way for him . Or , by some genius of memory , to recall the fact that it was on this morning young Spear was to be sentenced for theft . It was a charming morning . The spring was at full tide , and the air was sweet and clean . Mr. Thorndike considered whimsically that to send a man to jail with the memory of such a morning clinging to him was adding a year to his sentence . He regretted he had not given the probation officer a stronger letter . He remembered the young man now , and favorably . A shy , silent youth , deft in work , and at other times conscious and embarrassed . But that , on the part of a ste

# DATA STATS

In [16]:
def generate_class_stats(train_df, test_df, val_df):
    """Collect per-class length statistics over the three splits.

    Every frame must provide the columns ``classification``, ``text`` and
    ``rationale``. Rows whose ``classification`` equals the string ``"False"``
    go into class 0; every other value goes into class 1.

    For each row the function records the number of whitespace tokens in
    ``text``, ``rationale.count(1)``, and their ratio (0.0 when the text has
    no tokens).

    Returns
    -------
    dict
        ``text_lens_{0,1,all}``, ``rationale_lens_{0,1,all}`` and
        ``rationale_percent_{0,1,all}`` as lists (the ``_all`` lists hold the
        class-0 values followed by the class-1 values), plus ``class_distr``,
        the two class shares (``[0.0, 0.0]`` when there are no rows at all).
    """
    # Index 0 collects the "False" class, index 1 everything else.
    text_lens = ([], [])
    rationale_lens = ([], [])
    rationale_percents = ([], [])
    class_counts = [0, 0]

    for df in (train_df, test_df, val_df):
        # Iterate over the column values directly so the result does not
        # depend on the frame's index labels.
        rows = zip(df['classification'], df['text'], df['rationale'])
        for clas, text, rationale in rows:
            text_len = len(text.split())
            rationale_len = rationale.count(1)
            # Guard against texts without any token.
            rationale_percent = rationale_len / text_len if text_len else 0.0

            bucket = 0 if clas == "False" else 1
            text_lens[bucket].append(text_len)
            rationale_lens[bucket].append(rationale_len)
            rationale_percents[bucket].append(rationale_percent)
            class_counts[bucket] += 1

    total = sum(class_counts)
    class_distr = [count / total for count in class_counts] if total else [0.0, 0.0]

    all_stats = {"text_lens_0": text_lens[0],
                 "text_lens_1": text_lens[1],
                 "text_lens_all": text_lens[0] + text_lens[1],
                 "rationale_lens_0": rationale_lens[0],
                 "rationale_lens_1": rationale_lens[1],
                 "rationale_lens_all": rationale_lens[0] + rationale_lens[1],
                 "rationale_percent_0": rationale_percents[0],
                 "rationale_percent_1": rationale_percents[1],
                 "rationale_percent_all": rationale_percents[0] + rationale_percents[1],
                 "class_distr": class_distr
                }
    return all_stats

In [17]:
all_stats = generate_class_stats(train_data_df,test_data_df,val_data_df)
# The loop variable is deliberately not called `val`: that name holds the raw
# validation DataFrame loaded earlier and must survive this cell so the
# conversion cell can be re-run.
for key, values in all_stats.items():
    print(f"{key}: {avg(values)}")
print(all_stats["class_distr"])

text_lens_0: 323.8500361291757
text_lens_1: 319.06843971631207
text_lens_all: 321.74911969087907
rationale_lens_0: 68.61352898671558
rationale_lens_1: 71.32503546099291
rationale_lens_all: 69.80489856969244
rationale_percent_0: 0.2240102525983751
rationale_percent_1: 0.23606268594042473
rationale_percent_all: 0.22930579683578883
class_distr: 0.5
[0.5606244741516313, 0.4393755258483687]


In [77]:
# Number of whitespace tokens in the first training row's text.
len(train_data_df['text'].iloc[0].split())

5914

In [12]:
# Classification labels of the raw training split.
c = train['classification']

In [13]:
# Number of entries in c.
len(c)

24029

In [15]:
# How many entries of c equal the string "False".
sum(1 for label in c if label == "False")

13456