In [2]:
%load_ext autoreload

import json
import os
import pandas as pd

from src.common import read_data, save_data, DATA_PATH, QT_VERACITY_LABELS
from src.evidence_processor import EvidenceProcessor

#### Finding top evidences for decomposed questions / claim

In [24]:
%autoreload

# Run the evidence processor over every split of the chosen decomposition and
# store the result next to the input file.
# NOTE(review): the processor is created with decomposed=False although the
# inputs are the decomposed files — confirm this is intended.
DECOMPOSITION = 'flant5'

processor = EvidenceProcessor(decomposed=False)

for split in ['train', 'val', 'test']:
    source_file = f'{DECOMPOSITION}/{split}_decomposed_{DECOMPOSITION}.json'
    target_file = f'{DECOMPOSITION}/{split}_evidences_decomposed_{DECOMPOSITION}.json'

    claims = processor.transform(read_data(source_file))
    save_data(target_file, claims)

  0%|          | 0/9935 [00:00<?, ?it/s]

  0%|          | 0/3084 [00:00<?, ?it/s]

  0%|          | 0/2495 [00:00<?, ?it/s]

#### Assigning top100 evidences to claims

In [17]:
%autoreload

def process(claims_file, evidences_file):
    """Copy each claim's `top100evidences` from the evidences file into the claims file.

    Records are matched on the exact value of their `claim` field. The updated
    claims are written back to `claims_file`, overwriting it, and the number of
    claims and of distinct evidence keys is printed.

    Args:
        claims_file: path handed to `read_data`/`save_data` for the claims.
        evidences_file: path handed to `read_data` for the evidence records.

    Raises:
        KeyError: if a claim's text has no matching evidence record.
    """
    claims = read_data(claims_file)

    # Index the evidence records by claim text; when two records share the
    # same text, the later one replaces the earlier one.
    evidences_by_claim = {}
    for record in read_data(evidences_file):
        evidences_by_claim[record['claim']] = record

    for claim in claims:
        matched = evidences_by_claim[claim['claim']]
        claim['top100evidences'] = matched['top100evidences']

    print(len(claims), len(evidences_by_claim))
    save_data(claims_file, claims)


# for split in ['train', 'val', 'test']:
#     process(f'flant5/{split}_decomposed_flant5.json', f'raw_data/{split}_claims.json')

# process('train_claims_quantemp.json', 'train_evidences.json')
# process('val_claims_quantemp.json', 'val_evidences.json')
# process('test_claims_quantemp.json', 'test_evidences.json')

9935 9929
3084 3073
2495 2495


In [21]:
# Sanity check: how many records the raw training claims file holds.
train_claims_raw = read_data('raw_data/train_claims.json')
print(len(train_claims_raw))


9935


#### Fixing format for gpt3.5-turbo decomposition

In [9]:
%autoreload

claims = read_data('raw_data/test_claims_evidences.json')
decomposed_questions = pd.read_csv(f'{DATA_PATH}/test_claimdecomp.csv', sep="@")

for claim in claims:
    # Rows of the CSV whose `claims` column equals this claim's text.
    is_match = decomposed_questions['claims'] == claim['claim']
    matched = decomposed_questions[is_match]['questions']

    if len(matched) == 1:
        # One cell holds all sub-questions, separated by this marker.
        parts = matched.iloc[0].split("Next Question: ")
    elif len(matched) == 0:
        parts = []
    else:
        # Several rows match: report it and carry on with the raw cells.
        print("ERROR")
        parts = matched

    questions = [part.strip() for part in parts]
    claim['questions'] = questions


with open(f'{DATA_PATH}/test_decomposed_gpt3.5-turbo.json', "w") as out_file:
    json.dump(claims, out_file, indent=4)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Fixing label names in gpt3.5-turbo decomposition

In [18]:
%autoreload

def fix_labels(filename):
    """Rename the "Half True/False" label to "Conflicting" and save a fixed copy.

    Reads the claims from `filename`, rewrites the label on the loaded records,
    checks every label against `QT_VERACITY_LABELS`, prints the number of
    claims and saves the result as `fixed_<filename>`.

    Args:
        filename: path handed to `read_data`; also used to build the output name.

    Raises:
        AssertionError: if a claim's label is not in `QT_VERACITY_LABELS`.
    """
    legacy_label, new_label = "Half True/False", "Conflicting"
    records = read_data(filename)

    for record in records:
        label = record['label']
        if label == legacy_label:
            label = new_label
            record['label'] = label

        assert label in QT_VERACITY_LABELS

    print(len(records))
    save_data(f'fixed_{filename}', records)


# Apply the label fix to every split, in train/val/test order.
for filename in (
    'train_evidences_decomposed_gpt3.5-turbo.json',
    'val_evidences_decomposed_gpt3.5-turbo.json',
    'test_evidences_decomposed_gpt3.5-turbo.json',
):
    fix_labels(filename)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
9935
3084
2495


#### Extract nested evidences to questions and evidences lists

In [19]:
%autoreload

def extract(filename):
    """Replace each claim's nested `evidences` with a flat `questions` list.

    For every claim, the `questions` value of each entry in `evidences` is
    collected, the `evidences` field is removed and the collected list is
    stored under `questions`. Prints the number of claims and saves the result.

    Args:
        filename: path handed to `read_data`; also used to build the output name.
    """
    records = read_data(filename)

    for record in records:
        collected = []
        for item in record['evidences']:
            collected.append(item['questions'])

        del record['evidences']
        record['questions'] = collected

    print(len(records))
    # NOTE(review): this is the only save_data call in the notebook that
    # prefixes DATA_PATH — confirm save_data does not add it again.
    save_data(f'{DATA_PATH}/fixed_{filename}', records)

# Flatten the nested evidences of every split, in train/val/test order.
for filename in (
    'train_evidences_decomposed_gpt3.5-turbo.json',
    'val_evidences_decomposed_gpt3.5-turbo.json',
    'test_evidences_decomposed_gpt3.5-turbo.json',
):
    extract(filename)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
9935
3084
2495


#### Assigning gpt3.5-turbo decomposition questions to raw claims

In [23]:
%autoreload

def process(claims_filename, questions_filename):
    """Attach the decomposed `questions` to the raw claims and save the result.

    Claims are read from `raw_data/<claims_filename>` and question records from
    `gpt3.5-turbo/<questions_filename>`; both are matched on the exact value of
    their `claim` field. The result is saved as
    `gpt3.5-turbo/processed_<claims_filename>`.

    Raises:
        KeyError: if a claim's text has no matching question record.
    """
    raw_claims = read_data(f'raw_data/{claims_filename}')
    question_records = read_data(f'gpt3.5-turbo/{questions_filename}')
    print(len(raw_claims), len(question_records))

    # Index the questions by claim text; when two records share the same
    # text, the later one replaces the earlier one.
    questions_by_claim = {}
    for record in question_records:
        questions_by_claim[record['claim']] = record['questions']

    for claim in raw_claims:
        text = claim['claim']
        claim['questions'] = questions_by_claim[text]

    save_data(f'gpt3.5-turbo/processed_{claims_filename}', raw_claims)


# Attach the decomposed questions to every split, in train/val/test order.
for claims_name, questions_name in (
    ('train_claims.json', 'train_decomposed_gpt3.5-turbo.json'),
    ('val_claims.json', 'val_decomposed_gpt3.5-turbo.json'),
    ('test_claims.json', 'test_decomposed_gpt3.5-turbo.json'),
):
    process(claims_name, questions_name)

9935 9935
3084 3084
2495 2495


#### Rename field key

In [29]:
def rename_key(path, filename, old_key, new_key):
    """Rename a field on every claim and save the result under a new name.

    Reads `<path>/<filename>`, moves the value stored under `old_key` to
    `new_key` on each record, prints the number of records and saves them as
    `<path>/renamed_<filename>`.

    Raises:
        KeyError: if a record has no `old_key` field.
    """
    source = os.path.join(path, filename)
    records = read_data(source)

    for record in records:
        # Remove the old entry first, then store its value under the new key.
        value = record.pop(old_key)
        record[new_key] = value

    print(len(records))
    target = os.path.join(path, f'renamed_{filename}')
    save_data(target, records)

# Rename `questions` to `subquestions` in every split, in train/val/test order.
path = 'gpt3.5-turbo'
for filename in (
    'train_decomposed_gpt3.5-turbo.json',
    'val_decomposed_gpt3.5-turbo.json',
    'test_decomposed_gpt3.5-turbo.json',
):
    rename_key(path, filename, 'questions', 'subquestions')

9935
3084
2495


#### Remove field

In [None]:
def remove_field(path, filename, key):
    """Delete one field from every claim and save the result under a new name.

    Reads `<path>/<filename>`, removes `key` from each record, prints the
    number of records and saves them as `<path>/processed_<filename>`.

    Raises:
        KeyError: if a record has no `key` field.
    """
    source = os.path.join(path, filename)
    records = read_data(source)

    for record in records:
        # pop() without a default raises KeyError when the field is missing.
        record.pop(key)

    print(len(records))
    target = os.path.join(path, f'processed_{filename}')
    save_data(target, records)

# Drop the `evidences` field from the flant5 train split.
# The filename carries the `.json` extension, like every other data file
# referenced in this notebook.
path = 'flant5'
remove_field(path, 'train_decomposed_flant5.json', 'evidences')

#### Fix missing top100evidences and doc fields

In [12]:
data = read_data('gpt3.5-turbo/train_decomposed_gpt3.5-turbo.json')
raw = read_data('raw_data/train_claims.json')

# Records are paired by position, so both files must have the same length and
# the same order. zip() would otherwise stop at the shorter list without any
# warning and leave the remaining records unfixed.
assert len(data) == len(raw), f'record count mismatch: {len(data)} != {len(raw)}'

for d, r in zip(data, raw):
    # Guard against copying fields from a different claim if the order differs.
    assert d['claim'] == r['claim'], 'records are not aligned by claim text'
    d['top100evidences'] = r['top100evidences']
    d['doc'] = r['doc']

save_data('gpt3.5-turbo/fixed_train_decomposed_gpt3.5-turbo.json', data)

#### Add indentation to json files

In [14]:
# test_flant5_claimdecomp_subquestions_evidences_processed.json
# train_flant5_claimdecomp_subquestions_evidences_processed.json
# val_flant5_claimdecomp_subquestions_evidences_processed.json

# Round-trip one file through read_data/save_data, overwriting it in place.
# NOTE(review): presumably save_data writes indented JSON, which is what
# re-formats the file — confirm against src.common.
name = 'flant5/test_flant5_claimdecomp_subquestions_evidences_processed.json'
claims = read_data(name)
save_data(name, claims)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/szymong/tud/nlp/fact-checking/data/flan-t5/test_flant5_claimdecomp_subquestions_evidences_processed.json'