In [None]:
%load_ext autoreload

import json
import os
import pandas as pd

from src.common import read_data, save_data, DATA_PATH, QT_VERACITY_LABELS
from src.evidence_processor import EvidenceProcessor
from collections import Counter

#### Finding top evidences for decomposed questions / claim

In [None]:
%autoreload

# Run EvidenceProcessor.transform over every split of the selected
# decomposition and store the result beside the decomposed input file.
processor = EvidenceProcessor(decomposed=False)

DECOMPOSITION = 'flant5'

for split in ('train', 'val', 'test'):
    source_file = f'{DECOMPOSITION}/{split}_decomposed_{DECOMPOSITION}.json'
    target_file = f'{DECOMPOSITION}/{split}_evidences_decomposed_{DECOMPOSITION}.json'

    claims = processor.transform(read_data(source_file))
    save_data(target_file, claims)

#### Assigning top100 evidences to claims

In [None]:
%autoreload

def process(claims_file, evidences_file):
    """Copy the ``top100evidences`` field from evidence records onto claims.

    Records of both files are matched on their ``claim`` text. The enriched
    claims are written back to ``claims_file``, overwriting it.

    Args:
        claims_file: File (as accepted by ``read_data``/``save_data``) with the
            claims to enrich.
        evidences_file: File whose records carry ``claim`` and
            ``top100evidences``.

    Raises:
        KeyError: If a claim has no evidence record with the same ``claim`` text.
    """
    claims = read_data(claims_file)

    # Index the evidence records by claim text; on duplicate texts the last
    # record wins.
    evidence_by_claim = {}
    for record in read_data(evidences_file):
        evidence_by_claim[record['claim']] = record

    for entry in claims:
        matched = evidence_by_claim[entry['claim']]
        entry['top100evidences'] = matched['top100evidences']

    print(len(claims), len(evidence_by_claim))
    save_data(claims_file, claims)


# for split in ['train', 'val', 'test']:
#     process(f'flant5/{split}_decomposed_flant5.json', f'raw_data/{split}_claims.json')

# process('train_claims_quantemp.json', 'train_evidences.json')
# process('val_claims_quantemp.json', 'val_evidences.json')
# process('test_claims_quantemp.json', 'test_evidences.json')

In [None]:
# Sanity check: how many records the raw training claims file holds.
train_claims = read_data('raw_data/train_claims.json')
print(len(train_claims))


#### Fixing format for gpt3.5-turbo decomposition

In [None]:
%autoreload

# Turn the raw gpt3.5-turbo decomposition output ('@'-separated CSV in which
# all questions of a claim are packed into one string) into a `questions`
# list on every test claim, then dump the claims as indented JSON.
claims = read_data('raw_data/test_claims_evidences.json')
decomposed_questions = pd.read_csv(f'{DATA_PATH}/test_claimdecomp.csv', sep="@")

# Marker that separates consecutive questions inside one CSV cell.
QUESTION_SEPARATOR = "Next Question: "

for claim in claims:
    matches = decomposed_questions.loc[decomposed_questions['claims'] == claim['claim'], 'questions']

    if len(matches) > 1:
        # Several CSV rows share this claim text. Say which claim is affected
        # and still split every row, so that no unsplit raw string ends up in
        # the output.
        print(f"ERROR: {len(matches)} rows found for claim {claim['claim']!r}; merging their questions")

    # A claim without any matching row keeps an empty question list.
    questions = []
    for raw_questions in matches:
        questions.extend(raw_questions.split(QUESTION_SEPARATOR))

    claim['questions'] = [q.strip() for q in questions]


with open(f'{DATA_PATH}/test_decomposed_gpt3.5-turbo.json', "w") as file:
    json.dump(claims, file, indent=4)

#### Fixing label names in gpt3.5-turbo decomposition

In [None]:
%autoreload

def fix_labels(filename):
    """Rename the "Half True/False" label to "Conflicting" and validate labels.

    The corrected claims are saved as ``fixed_{filename}``.

    Args:
        filename: File (as accepted by ``read_data``) holding claims with a
            ``label`` field.

    Raises:
        AssertionError: If a label, after renaming, is not in
            ``QT_VERACITY_LABELS``.
    """
    claims = read_data(filename)

    for entry in claims:
        label = "Conflicting" if entry['label'] == "Half True/False" else entry['label']
        entry['label'] = label

        assert label in QT_VERACITY_LABELS

    print(len(claims))
    save_data(f'fixed_{filename}', claims)


# Apply the label fix to the gpt3.5-turbo evidence file of every split.
for split in ('train', 'val', 'test'):
    fix_labels(f'{split}_evidences_decomposed_gpt3.5-turbo.json')


#### Extract nested evidences to questions and evidences lists

In [None]:
%autoreload

def extract(filename):
    """Replace each claim's nested ``evidences`` with a flat ``questions`` list.

    For every claim, the ``questions`` value of each entry in
    ``claim['evidences']`` is collected in order into ``claim['questions']``
    and the ``evidences`` field is removed. The result is saved as
    ``{DATA_PATH}/fixed_{filename}``.

    Args:
        filename: File (as accepted by ``read_data``) holding claims with an
            ``evidences`` field.
    """
    claims = read_data(filename)

    for entry in claims:
        collected = []
        for item in entry['evidences']:
            collected.append(item['questions'])

        entry.pop('evidences')
        entry['questions'] = collected

    print(len(claims))
    save_data(f'{DATA_PATH}/fixed_{filename}', claims)

# Flatten the nested evidences of the gpt3.5-turbo file of every split.
for split in ('train', 'val', 'test'):
    extract(f'{split}_evidences_decomposed_gpt3.5-turbo.json')

#### Assigning gpt3.5-turbo decomposition questions to raw claims

In [None]:
%autoreload

def process(claims_filename, questions_filename):
    """Attach gpt3.5-turbo decomposition questions to the raw claims.

    Claims are read from ``raw_data/{claims_filename}`` and questions from
    ``gpt3.5-turbo/{questions_filename}``; both are matched on their ``claim``
    text. The result is saved as ``gpt3.5-turbo/processed_{claims_filename}``.

    NOTE(review): this redefines the ``process`` function declared in an
    earlier cell of this notebook; whichever cell ran last wins.

    Args:
        claims_filename: File name of the raw claims inside ``raw_data``.
        questions_filename: File name of the decomposed questions inside
            ``gpt3.5-turbo``.

    Raises:
        KeyError: If a claim has no question record with the same ``claim`` text.
    """
    raw_claims = read_data(f'raw_data/{claims_filename}')
    decomposed = read_data(f'gpt3.5-turbo/{questions_filename}')
    print(len(raw_claims), len(decomposed))

    # Index the question lists by claim text; on duplicate texts the last
    # record wins.
    questions_by_claim = {}
    for record in decomposed:
        questions_by_claim[record['claim']] = record['questions']

    for entry in raw_claims:
        entry['questions'] = questions_by_claim[entry['claim']]

    save_data(f'gpt3.5-turbo/processed_{claims_filename}', raw_claims)


# Attach the decomposition questions to the raw claims of every split.
for split in ('train', 'val', 'test'):
    process(f'{split}_claims.json', f'{split}_decomposed_gpt3.5-turbo.json')

#### Rename field key

In [None]:
def rename_key(path, filename, old_key, new_key):
    """Rename one field on every record of a file.

    The records are read from ``path/filename`` and saved to
    ``path/renamed_{filename}``; the source file is left untouched.

    Args:
        path: Directory part of the file location.
        filename: Name of the file inside ``path``.
        old_key: Field name to remove.
        new_key: Field name that receives the old field's value.

    Raises:
        KeyError: If a record does not contain ``old_key``.
    """
    source = os.path.join(path, filename)
    target = os.path.join(path, f'renamed_{filename}')

    records = read_data(source)

    for record in records:
        value = record.pop(old_key)
        record[new_key] = value

    print(len(records))
    save_data(target, records)

# Rename `questions` to `subquestions` in the gpt3.5-turbo file of every split.
path = 'gpt3.5-turbo'
for split in ('train', 'val', 'test'):
    rename_key(path, f'{split}_decomposed_gpt3.5-turbo.json', 'questions', 'subquestions')

#### Remove field

In [None]:
def remove_field(filename, key):
    """Drop one field from every record of a file.

    The result is saved under the input name with ``.JSON`` appended
    (``{filename}.JSON``), so the source file is not overwritten.

    Args:
        filename: File (as accepted by ``read_data``) to process.
        key: Field name to remove from each record.

    Raises:
        KeyError: If a record does not contain ``key``.
    """
    records = read_data(filename)

    for record in records:
        # pop() without a default raises KeyError for a missing key, like `del`.
        record.pop(key)

    print(len(records))
    save_data(f'{filename}.JSON', records)

# Strip the `top100evidences` field from the predicted-type file of every split.
for split in ('train', 'val', 'test'):
    predicted_type_file = f'custom_decomposition/{split}_decomposed_flant5_predicted_type.json'
    remove_field(predicted_type_file, 'top100evidences')

#### Fix missing top100evidences and doc fields

In [None]:
# Copy the `top100evidences` and `doc` fields from the raw training claims
# onto the gpt3.5-turbo decomposed claims. Records are paired by position.
data = read_data('gpt3.5-turbo/train_decomposed_gpt3.5-turbo.json')
raw = read_data('raw_data/train_claims.json')

# zip() stops at the shorter input without complaint, which would leave the
# trailing records unfixed; refuse to continue on a length mismatch.
if len(data) != len(raw):
    raise ValueError(
        f'Cannot pair records by position: {len(data)} decomposed claims vs {len(raw)} raw claims'
    )

for d, r in zip(data, raw):
    d['top100evidences'] = r['top100evidences']
    d['doc'] = r['doc']

save_data('gpt3.5-turbo/fixed_train_decomposed_gpt3.5-turbo.json', data)

#### Add indentation to json files

In [None]:
# Read one file and save it back unchanged under the same name.
# NOTE(review): per this section's heading the goal is to add indentation, so
# save_data presumably writes indented JSON — confirm in src.common.
# File names listed by the author (presumably the candidates for `name`):
# test_flant5_claimdecomp_subquestions_evidences_processed.json
# train_flant5_claimdecomp_subquestions_evidences_processed.json
# val_flant5_claimdecomp_subquestions_evidences_processed.json

name = 'flant5/test_flant5_claimdecomp_subquestions_evidences_processed.json'
claims = read_data(name)
save_data(name, claims)


#### Add taxonomy labels

In [None]:
def attach_taxonomy_labels(filename, raw_filename):
    """Copy ``taxonomy_label`` from the raw claims onto the claims of a file.

    Records are matched on their ``claim`` text. The result is saved under the
    input name with ``.json`` appended (``{filename}.json``).

    Args:
        filename: File (as accepted by ``read_data``) with the claims to label.
        raw_filename: File whose records carry ``claim`` and
            ``taxonomy_label``.

    Raises:
        KeyError: If a claim has no raw record with the same ``claim`` text.
    """
    claims = read_data(filename)

    # Index the labels by claim text; on duplicate texts the last record wins.
    label_by_claim = {}
    for raw_claim in read_data(raw_filename):
        label_by_claim[raw_claim['claim']] = raw_claim['taxonomy_label']

    for entry in claims:
        entry['taxonomy_label'] = label_by_claim[entry['claim']]

    print(len(claims))
    save_data(f'{filename}.json', claims)

# Label the flant5 decomposed claims of every split from the raw claims.
for split in ('train', 'val', 'test'):
    attach_taxonomy_labels(f'flant5/{split}_decomposed_flant5.json', f'raw_data/{split}_claims.json')

#### Get examples for each category

In [None]:
# Count how many raw training claims carry each (whitespace-stripped)
# taxonomy label.
data = read_data("raw_data/train_claims.json")

types = []
for entry in data:
    types.append(entry['taxonomy_label'].strip())

counter = Counter(types)

def save_examples(data, category, max_examples=20):
    """Write the first claims of one taxonomy category to a text file.

    The claims are written one per line to
    ``{DATA_PATH}/custom_decomposition/{category}.txt``.

    Args:
        data: Iterable of records with ``claim`` and ``taxonomy_label`` fields.
        category: Taxonomy label to select; compared against the
            whitespace-stripped ``taxonomy_label`` of each record.
        max_examples: Maximum number of claims to write (default 20).
    """
    examples = [p['claim'] for p in data if p['taxonomy_label'].strip() == category][:max_examples]
    target = os.path.join(DATA_PATH, "custom_decomposition", f'{category}.txt')

    # Explicit UTF-8: without it the platform default encoding is used, which
    # can fail on or garble non-ASCII claim text.
    with open(target, "w", encoding="utf-8") as file:
        file.write("\n".join(examples))

# Report the size of every category and dump its example claims to disk.
for category, count in counter.items():
    print(f'{category}: {count}')
    save_examples(data, category)

#### Propagate taxonomy_label field from decomposed to evidences_decomposed

In [None]:
def propagate_taxonomy_labels(filename, target_filename):
    """Copy ``taxonomy_label`` from one file's claims onto another's, by position.

    The i-th record of ``target_filename`` receives the ``taxonomy_label`` of
    the i-th record of ``filename``. The result is saved under the target name
    with ``.json`` appended (``{target_filename}.json``).

    Args:
        filename: File (as accepted by ``read_data``) whose records carry
            ``taxonomy_label``.
        target_filename: File whose records receive the label.

    Raises:
        ValueError: If the two files hold a different number of records.
            Positional pairing would otherwise silently leave the surplus
            target records unlabeled.
    """
    claims = read_data(filename)
    target_claims = read_data(target_filename)

    print(len(claims), len(target_claims))

    # zip() truncates to the shorter input without complaint, so check first.
    if len(claims) != len(target_claims):
        raise ValueError(
            f'Cannot pair records by position: {len(claims)} in {filename} '
            f'vs {len(target_claims)} in {target_filename}'
        )

    for claim, target_claim in zip(claims, target_claims):
        target_claim['taxonomy_label'] = claim['taxonomy_label']

    save_data(f'{target_filename}.json', target_claims)

# Carry the taxonomy labels over to the evidence file of every split.
for split in ('train', 'val', 'test'):
    propagate_taxonomy_labels(
        f'flant5/{split}_decomposed_flant5.json',
        f'flant5/{split}_evidences_decomposed_flant5.json',
    )


#### Assign evidences field


In [None]:
def propagate(filename, target_filename):
    """Copy the ``evidences`` field from one file's claims onto another's, by position.

    The i-th record of ``target_filename`` receives the ``evidences`` of the
    i-th record of ``filename``. The result is saved under the target name
    with ``.json`` appended (``{target_filename}.json``).

    Args:
        filename: File (as accepted by ``read_data``) whose records carry
            ``evidences``.
        target_filename: File whose records receive the field.

    Raises:
        ValueError: If the two files hold a different number of records.
            Positional pairing would otherwise silently leave the surplus
            target records without evidences.
    """
    claims = read_data(filename)
    target_claims = read_data(target_filename)

    print(len(claims), len(target_claims))

    # zip() truncates to the shorter input without complaint, so check first.
    if len(claims) != len(target_claims):
        raise ValueError(
            f'Cannot pair records by position: {len(claims)} in {filename} '
            f'vs {len(target_claims)} in {target_filename}'
        )

    for claim, target_claim in zip(claims, target_claims):
        target_claim['evidences'] = claim['evidences']

    save_data(f'{target_filename}.json', target_claims)

# Copy the retrieved evidences onto the predicted-type file of every split.
for split in ('train', 'val', 'test'):
    evidences_file = f'flant5/{split}_evidences_decomposed_flant5.json'
    predicted_type_file = f'custom_decomposition/{split}_decomposed_flant5_predicted_type.json'
    propagate(evidences_file, predicted_type_file)
