In [29]:
import pandas as pd
import os
from tqdm import tqdm
from utils import avg
from utils import evidence_to_mask

def to_data_df(df, data_dir, random_state=None):
    """Build a shuffled text / label / rationale frame from annotation rows.

    For every row of ``df`` the document ``{data_dir}/docs/{annotation_id}``
    is read as UTF-8. Each line is right-stripped and followed by a single
    space, so the resulting text keeps one trailing space. The whitespace
    tokens of that text, together with the row's first evidence group, are
    passed to ``evidence_to_mask``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'annotation_id', 'evidences' and
        'classification'. Rows are visited by position, so any index works.
    data_dir : str
        Dataset root; documents are looked up under ``{data_dir}/docs``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for the final shuffle. The default
        ``None`` keeps the original unseeded behaviour; pass an int for a
        reproducible row order.

    Returns
    -------
    pandas.DataFrame
        Columns 'text', 'classification', 'rationale', rows shuffled and
        re-indexed from 0. 'rationale' holds whatever ``evidence_to_mask``
        returns (presumably one 0/1 entry per token -- confirm in utils).

    Raises
    ------
    FileNotFoundError
        If the document for a row does not exist.
    """
    columns = ['text', 'classification', 'rationale']
    records = []
    for i in tqdm(range(len(df))):
        # Positional access: the counter is a position, not an index label.
        df_row = df.iloc[i]

        doc_id = df_row['annotation_id']
        classification = df_row['classification']
        evidence_list = df_row['evidences']
        if evidence_list:
            # Only the first evidence group of a row is used.
            evidence_list = evidence_list[0]

        file = f'{data_dir}/docs/{doc_id}'
        if not os.path.isfile(file):
            # Fail loudly and name the path so the conversion cannot carry on
            # with an empty document.
            raise FileNotFoundError(f"document not found: {file}")

        # The context manager closes the handle even if reading fails.
        with open(file, 'r', encoding="utf-8") as f:
            text = ''.join(line.rstrip() + ' ' for line in f)

        tokens = text.split()
        rationale_mask = evidence_to_mask(tokens, evidence_list)

        records.append([text, classification, rationale_mask])

    data_df = pd.DataFrame(records, columns=columns)

    # Shuffle all rows and renumber them from 0.
    return data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [30]:
# Dataset name: selects the input folder under ../data and the folder the CSVs are written to.
dataset = "movies"

In [31]:
# Root folder of the selected dataset.
data_dir = f'../data/{dataset}'

# Each split is stored as a JSON-lines file; read them in train / test / val order.
split_paths = (
    f'{data_dir}/train.jsonl',
    f'{data_dir}/test.jsonl',
    f'{data_dir}/val.jsonl',
)
train, test, val = (pd.read_json(path, lines=True) for path in split_paths)

In [34]:
def class_distr(df):
    """Count the labels in ``df['classification']`` and their relative shares.

    Returns a tuple ``(counts, shares)``: ``counts`` maps each label to its
    number of occurrences, in order of first appearance, and ``shares`` lists
    the matching fractions of the total in the same order.
    """
    tally = {}
    for label in df['classification']:
        tally[label] = tally.get(label, 0) + 1

    total = sum(tally.values())
    shares = [n / total for n in tally.values()]

    return tally, shares
# Label counts and their proportions for the training split (shown as the cell output).
class_distr(train)
# test.info()
# val.info()

({'NEG': 800, 'POS': 800}, [0.5, 0.5])

In [33]:
# to_csv does not create missing parent folders, so make sure the output
# folder exists before the first write.
os.makedirs(dataset, exist_ok=True)

# Convert each split and write it out; the frame's row index is saved as the "id" column.
# to_data_df shuffles its rows, so the row order in the CSVs is not stable between runs.
train_data_df = to_data_df(train, data_dir)
train_data_df.to_csv(f"{dataset}/train.csv",index_label="id")
test_data_df = to_data_df(test, data_dir)
test_data_df.to_csv(f"{dataset}/test.csv",index_label="id")
val_data_df = to_data_df(val, data_dir)
val_data_df.to_csv(f"{dataset}/val.csv",index_label="id")

100%|████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:01<00:00, 1113.52it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 199/199 [00:00<00:00, 1165.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 920.96it/s]


In [26]:
# Rebuild the test frame in memory. to_data_df shuffles its rows, so the row
# order is not stable between calls.
test_data_df = to_data_df(test, data_dir)

100%|███████████████████████████████████████████████████████████████████████████████| 199/199 [00:00<00:00, 854.09it/s]


In [27]:
# Preview the converted test frame.
# NOTE(review): the saved output below shows a "[CLS]" prefix in the text column,
# which the current to_data_df does not add -- presumably left over from an
# earlier run; re-run the notebook to refresh it.
test_data_df

Unnamed: 0,text,classification,rationale
0,"[CLS] you know something , christmas is not ab...",NEG,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ..."
1,[CLS] wow ! what a movie . it 's everything a ...,POS,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[CLS] post - chasing amy , a slew of love - tr...",NEG,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,[CLS] this has been an extraordinary year for ...,POS,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ..."
4,[CLS] anna and the king is at least the fourth...,NEG,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
194,[CLS] director dominic sena ( who made the hig...,POS,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
195,"[CLS] "" when you get out of jail , you can kil...",POS,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
196,"[CLS] as the twin surfer dudes , stew and phil...",NEG,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
197,[CLS] there are those of us who think of lesli...,NEG,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
