In [36]:
%load_ext autoreload

import json
import os
import pandas as pd

from src.common import read_data, save_data, DATA_PATH, QT_VERACITY_LABELS
from src.evidence_processor import EvidenceProcessor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Finding top evidences for decomposed questions / claim

In [51]:
%autoreload

# Run the evidence processor over every split of the decomposed dataset and
# store the enriched claims next to the inputs.
# NOTE(review): the recorded run of this cell stopped with
# KeyError: 'top100evidences' — presumably the input files do not carry that
# field yet; confirm the inputs before re-running.
DECOMPOSITION = 'gpt3.5-turbo'
processor = EvidenceProcessor(decomposed=True, top_k=1)

for split in ('train', 'val', 'test'):
    input_path = f'{DECOMPOSITION}/{split}_decomposed_{DECOMPOSITION}.json'
    output_path = f'{DECOMPOSITION}/{split}_evidences_decomposed_{DECOMPOSITION}.json'

    claims = read_data(input_path)
    claims = processor.transform(claims)
    save_data(output_path, claims)

  0%|          | 0/9935 [00:00<?, ?it/s]

KeyError: 'top100evidences'

In [49]:
# Keep only the validation claim at index 3 as a one-element sample list and
# show its text.
DECOMPOSITION = 'gpt3.5-turbo'
val_path = f'{DECOMPOSITION}/val_decomposed_{DECOMPOSITION}.json'
claims = [read_data(val_path)[3]]
print(claims[0]['claim'])

Durch einen Vergleich mit den Symptomen einer Covid-19-Erkrankung entsteht der Eindruck, die Impfung von Biontech/Pfizer verursache ähnliche oder gar schlimmere Symptome als Covid-19. Der Impfstoff sei zudem laut der U.S. Food and Drug Administration (FDA) nicht zugelassen.


In [50]:
%autoreload
# Run the evidence processor on the one-claim sample and display the result.
# Relies on `processor` and `claims` still being defined in the kernel by the
# cells above.
processor.transform(claims)

  0%|          | 0/1 [00:00<?, ?it/s]

[[ 0.1202354   0.10268626  0.1030638 ]
 [ 0.22245136  0.16404705  0.25227696]
 [ 0.23014194  0.12849608  0.23000824]
 [ 0.17599234  0.1808079   0.19720061]
 [ 0.06709042  0.10257602  0.09295434]
 [ 0.26672608  0.17740151  0.27775317]
 [ 0.12978634  0.12140166  0.14179914]
 [ 0.26021358  0.18170056  0.2623487 ]
 [ 0.18649955  0.10943734  0.21212196]
 [ 0.1339188   0.07766123  0.15638766]
 [ 0.20529182  0.19065559  0.22627622]
 [ 0.14290802  0.12982573  0.14962734]
 [ 0.16022953  0.14611997  0.15857342]
 [ 0.05993903  0.08239873  0.09091972]
 [-0.03265695 -0.00210017 -0.02638025]
 [ 0.08587708  0.06623653  0.07442784]
 [ 0.10025234  0.03869817  0.09716454]
 [ 0.01872294  0.08825772  0.04968441]
 [ 0.08710689 -0.00170853  0.1072647 ]
 [ 0.07076254  0.08525328  0.10544405]
 [ 0.24723083  0.14921212  0.2307147 ]
 [ 0.03701835  0.00903926  0.05036606]
 [ 0.12939395  0.13022119  0.15396728]
 [ 0.17360248  0.15353468  0.20886613]
 [ 0.1604015   0.14081664  0.15633066]
 [ 0.13173252  0.12260018

[{'crawled_date': '2023-02-13T18:00:05',
  'country_of_origin': 'germany',
  'label': 'False',
  'url': 'https://correctiv.org/faktencheck/2021/02/03/vergleich-der-nebenwirkungen-der-impfung-und-von-symptomen-bei-covid-19-fuehrt-in-die-irre/?lang=de',
  'lang': 'en',
  'claim': 'Durch einen Vergleich mit den Symptomen einer Covid-19-Erkrankung entsteht der Eindruck, die Impfung von Biontech/Pfizer verursache ähnliche oder gar schlimmere Symptome als Covid-19. Der Impfstoff sei zudem laut der U.S. Food and Drug Administration (FDA) nicht zugelassen.',
  'taxonomy_label': 'interval',
  'label_original': 'Fehlender Kontext',
  'subquestions': ['does the biontech/pfizer vaccine cause similar or worse symptoms than covid-19?\n',
   ' is the biontech/pfizer vaccine not approved by the u.s. food and drug administration (fda)?\n',
   ' is there evidence to support the claim that the biontech/pfizer vaccine causes similar or worse symptoms than covid-19?'],
  'evidences': ['im video "impact of 

#### Assigning top100 evidences to claims

In [None]:
%autoreload

def process(claims_file, evidences_file):
    """Copy each claim's ``top100evidences`` from the evidence file and save the claims.

    Evidence records are indexed by their ``claim`` text; when several records
    share a text, the last one wins. The enriched claims are written through
    ``save_data`` to ``processed_<claims_file>``.

    Args:
        claims_file: Path handed to ``read_data`` for the claim records.
        evidences_file: Path handed to ``read_data`` for the evidence records.

    Raises:
        KeyError: If a claim's text has no matching evidence record, or a
            record lacks the ``claim`` / ``top100evidences`` keys.
    """
    claims = read_data(claims_file)

    # Index the evidence records by claim text for direct lookup.
    evidences = {}
    for record in read_data(evidences_file):
        evidences[record['claim']] = record

    for entry in claims:
        matching = evidences[entry['claim']]
        entry['top100evidences'] = matching['top100evidences']

    # Number of claims vs. number of distinct claim texts in the evidence file.
    print(len(claims), len(evidences))
    save_data(f'processed_{claims_file}', claims)


# Attach the top-100 evidences for every split.
for claims_file, evidences_file in (
    ('train_claims_quantemp.json', 'train_evidences.json'),
    ('val_claims_quantemp.json', 'val_evidences.json'),
    ('test_claims_quantemp.json', 'test_evidences.json'),
):
    process(claims_file, evidences_file)

#### Fixing format for gpt3.5-turbo decomposition

In [9]:
%autoreload

# Attach the gpt3.5-turbo decomposition questions (read from an "@"-separated
# CSV) to the raw test claims and write the combined records as JSON.
QUESTION_SEPARATOR = "Next Question: "

claims = read_data('raw_data/test_claims_evidences.json')
decomposed_questions = pd.read_csv(f'{DATA_PATH}/test_claimdecomp.csv', sep="@")

for claim in claims:
    # Rows of the CSV whose claim text is exactly this claim.
    is_match = decomposed_questions['claims'] == claim['claim']
    matches = decomposed_questions.loc[is_match, 'questions']

    if len(matches) > 1:
        # This case used to print "ERROR" and then store the matching rows
        # unsplit. Use the first row instead and report which claim is affected.
        # NOTE(review): assumes duplicate rows carry the same questions — confirm.
        print(f"WARNING: {len(matches)} decomposition rows match claim {claim['claim']!r}; using the first one")

    raw_questions = matches.iloc[0] if len(matches) > 0 else None

    # An empty CSV cell is read as NaN (a float), so only real strings are split.
    parts = raw_questions.split(QUESTION_SEPARATOR) if isinstance(raw_questions, str) else []

    # Strip whitespace and drop empty fragments (e.g. left by a trailing separator).
    claim['questions'] = [part.strip() for part in parts if part.strip()]


with open(f'{DATA_PATH}/test_decomposed_gpt3.5-turbo.json', "w") as file:
    json.dump(claims, file, indent=4)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Fixing label names in gpt3.5-turbo decomposition

In [18]:
%autoreload

def fix_labels(filename):
    """Rename the "Half True/False" label to "Conflicting" and save the claims.

    Every label is checked against ``QT_VERACITY_LABELS`` after the rename.
    The number of claims is printed and the result is written through
    ``save_data`` to ``fixed_<filename>``.

    Args:
        filename: Path handed to ``read_data`` for the claim records.

    Raises:
        AssertionError: If a claim's label is not in ``QT_VERACITY_LABELS``.
    """
    claims = read_data(filename)

    for record in claims:
        current = record['label']
        if current == "Half True/False":
            current = "Conflicting"
            record['label'] = current

        # Validate right after the rename so nothing is saved on a bad label.
        assert current in QT_VERACITY_LABELS

    print(len(claims))
    save_data(f'fixed_{filename}', claims)


# Apply the label fix to every split.
for decomposed_file in (
    'train_evidences_decomposed_gpt3.5-turbo.json',
    'val_evidences_decomposed_gpt3.5-turbo.json',
    'test_evidences_decomposed_gpt3.5-turbo.json',
):
    fix_labels(decomposed_file)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
9935
3084
2495


#### Extract nested evidences to questions and evidences lists

In [19]:
%autoreload

def extract(filename):
    """Replace each claim's nested ``evidences`` list with a flat ``questions`` list.

    For every claim, the ``questions`` value of each entry in ``evidences`` is
    collected in order, the ``evidences`` key is removed and the collected
    values are stored under ``questions``. The number of claims is printed and
    the result is passed to ``save_data``.

    Args:
        filename: Path handed to ``read_data`` for the claim records.

    Raises:
        KeyError: If a claim has no ``evidences`` key or an evidence entry has
            no ``questions`` key.
    """
    claims = read_data(filename)

    for entry in claims:
        collected = []
        for evidence in entry['evidences']:
            collected.append(evidence['questions'])

        del entry['evidences']
        entry['questions'] = collected

    print(len(claims))
    # NOTE(review): unlike the other save_data calls in this notebook, this
    # path is prefixed with DATA_PATH — confirm that save_data does not add
    # the prefix itself.
    save_data(f'{DATA_PATH}/fixed_{filename}', claims)

# Flatten the nested evidences for every split.
for decomposed_file in (
    'train_evidences_decomposed_gpt3.5-turbo.json',
    'val_evidences_decomposed_gpt3.5-turbo.json',
    'test_evidences_decomposed_gpt3.5-turbo.json',
):
    extract(decomposed_file)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
9935
3084
2495


#### Assigning gpt3.5-turbo decomposition questions to claims

In [23]:
%autoreload

# NOTE: this redefines the `process` function declared in an earlier cell.
def process(claims_filename, questions_filename):
    """Copy the decomposition ``questions`` onto the raw claims and save them.

    Claims are read from ``raw_data/<claims_filename>`` and the decomposition
    from ``gpt3.5-turbo/<questions_filename>``; both sizes are printed.
    Questions are looked up by exact claim text (the last record wins when a
    text repeats) and the result is written through ``save_data`` to
    ``gpt3.5-turbo/processed_<claims_filename>``.

    Args:
        claims_filename: File name of the raw claims.
        questions_filename: File name of the decomposed questions.

    Raises:
        KeyError: If a claim's text has no entry in the decomposition file.
    """
    claims = read_data(f'raw_data/{claims_filename}')
    decomposed = read_data(f'gpt3.5-turbo/{questions_filename}')
    print(len(claims), len(decomposed))

    # Index the decomposition by claim text for direct lookup.
    questions_by_claim = {}
    for record in decomposed:
        claim_text = record['claim']
        questions_by_claim[claim_text] = record['questions']

    for entry in claims:
        entry['questions'] = questions_by_claim[entry['claim']]

    save_data(f'gpt3.5-turbo/processed_{claims_filename}', claims)


# Attach the decomposition questions for every split.
for claims_filename, questions_filename in (
    ('train_claims.json', 'train_decomposed_gpt3.5-turbo.json'),
    ('val_claims.json', 'val_decomposed_gpt3.5-turbo.json'),
    ('test_claims.json', 'test_decomposed_gpt3.5-turbo.json'),
):
    process(claims_filename, questions_filename)

9935 9935
3084 3084
2495 2495


#### Rename field key

In [29]:
def rename_key(path, filename, old_key, new_key):
    """Move the value stored under ``old_key`` to ``new_key`` in every claim.

    The claims are read from ``path/filename``; their count is printed and the
    result is written through ``save_data`` to ``path/renamed_<filename>``.

    Args:
        path: Directory part of the input and output locations.
        filename: Name of the input file inside ``path``.
        old_key: Key to remove from each claim.
        new_key: Key that receives the removed value.

    Raises:
        KeyError: If a claim does not contain ``old_key``.
    """
    source = os.path.join(path, filename)
    target = os.path.join(path, f'renamed_{filename}')

    claims = read_data(source)

    for entry in claims:
        value = entry.pop(old_key)
        entry[new_key] = value

    print(len(claims))
    save_data(target, claims)

# Rename `questions` to `subquestions` in every split of the decomposition.
path = 'gpt3.5-turbo'
for decomposed_file in (
    'train_decomposed_gpt3.5-turbo.json',
    'val_decomposed_gpt3.5-turbo.json',
    'test_decomposed_gpt3.5-turbo.json',
):
    rename_key(path, decomposed_file, 'questions', 'subquestions')

9935
3084
2495


#### Remove field

In [None]:
def remove_field(path, filename, key):
    """Drop ``key`` from every claim and save the result.

    The claims are read from ``path/filename``; their count is printed and the
    result is written through ``save_data`` to ``path/processed_<filename>``.

    Args:
        path: Directory part of the input and output locations.
        filename: Name of the input file inside ``path``.
        key: Key to remove from each claim.

    Raises:
        KeyError: If a claim does not contain ``key``.
    """
    source = os.path.join(path, filename)
    target = os.path.join(path, f'processed_{filename}')

    claims = read_data(source)

    for entry in claims:
        # pop without a default raises KeyError on a missing key, like `del`.
        entry.pop(key)

    print(len(claims))
    save_data(target, claims)

# Remove the `doc` field from the train split of the decomposition.
# NOTE(review): only the train file is processed here — confirm whether the
# val and test files need the same treatment.
path = 'gpt3.5-turbo'
remove_field(path, 'train_decomposed_gpt3.5-turbo.json', 'doc')