# AVERITEC QA Generation

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import copy

os.chdir('/homes/bussotti/XFC2/code')
print(f'Current working directory: {os.getcwd()}')
from explainable_fact_checking.result_presentation.notebook_utility import *
import json
from tqdm import tqdm

from explainable_fact_checking.dataset_preprocessing.pre import preprocess_AVERITEC
import itertools

Current working directory: /homes/bussotti/XFC2/code
/home/bussotti/.conda/envs/feverous2/bin/python


In [2]:
# Locate the AVERITEC dataset folder and the directory holding its preprocessed splits.
ds_path = os.path.join(xfc.experiment_definitions.C.DATASET_DIR_V2, 'AVERITEC')
preprocessed_path = os.path.join(ds_path, 'preprocessed')
os.makedirs(preprocessed_path, exist_ok=True)

# Read every preprocessed split into a dict keyed by split name.
files = ['train', 'dev', 'test']
datasets = {}
for split_name in files:
    split_file = os.path.join(preprocessed_path, split_name + '.json')
    with open(split_file, 'r') as f_in:
        datasets[split_name] = json.load(f_in)

In [37]:
# Load the previously saved augmented dev records; fall back to an empty list if none exist yet.
genqa_file = os.path.join(preprocessed_path, 'dev_with_genqa.json')
try:
    with open(genqa_file, 'r') as f_in:
        dataset_with_genqa = json.load(f_in)
except FileNotFoundError:
    dataset_with_genqa = []

# Print each claim followed by its evidence items, tagging every item as gold or generated.
for rec in dataset_with_genqa:
    print(f'ID: {rec["id"]} | LABEL: >{rec["label"]}< \nCLAIM: {rec["claim"]}')
    for idx, ev in enumerate(rec[xfc.C.EV_KEY]):
        if rec['gold_evidence'][idx] == 1:
            origin = '***Gold***'
        else:
            origin = 'Generated'
        print(f'>>> {ev} [{origin}]')
    print()

ID: 0 | LABEL: >Refuted< 
CLAIM: In a letter to Steve Jobs, Sean Connery refused to appear in an apple commercial.
>>> Where was the claim first published? It was first published on Sccopertino [***Gold***]
>>> What did Sean Connery say in his letter to Steve Jobs? In the letter, Connery stated that he was not interested in participating in the commercial. [Generated]
>>> Was Sean Connery known for participating in commercials before this incident? No. Sean Connery had a reputation for avoiding commercial endorsements throughout his career. [Generated]
>>> What did Steve Jobs say about celebrity endorsements in general? Steve Jobs was known to appreciate celebrity endorsements but also valued authenticity in marketing. [Generated]
>>> What kind of website is Scoopertino? Scoopertino is an imaginary news organization devoted to ferreting out the most relevant stories in the world of Apple, whether or not they actually occurred - says their about page [***Gold***]
>>> Did Sean Connery pu

In [3]:
# One entry per train/dev record: that record's list of question dicts.
question_answer_list = [
    record['questions']
    for split in ('train', 'dev')
    for record in datasets[split]
]

# Union of the field names found in any answer of any question.
answer_distinct_fields = {
    field
    for record_questions in question_answer_list
    for qa_item in record_questions
    for ans in qa_item['answers']
    for field in ans.keys()
}


In [4]:
# Flatten the per-record question lists into a single list of QA dicts.
qa_list = [qa_item for record_questions in question_answer_list for qa_item in record_questions]

In [5]:
# Among the first 200 QA pairs, keep those whose first answer is a Boolean "Yes"; show at most ten.
boolean_yes_examples = [
    qa_item
    for qa_item in qa_list[:200]
    if qa_item['answers'][0]['answer_type'] == 'Boolean'
    and qa_item['answers'][0]['answer'] == 'Yes'
]
boolean_yes_examples[:10]

[{'question': 'Did Joe Biden support vote for the war in Syria',
  'answers': [{'answer': 'Yes',
    'answer_type': 'Boolean',
    'source_url': 'https://web.archive.org/web/20201210005946/https://www.govtrack.us/congress/votes/106-1999/s57',
    'source_medium': 'Web text',
    'boolean_explanation': 'He voted in support of the Serbian war with the Kosovo Resolution',
    'cached_source_url': 'https://web.archive.org/web/20201210005946/https://www.govtrack.us/congress/votes/106-1999/s57'}]},
 {'question': 'Did Joe Biden vote for the Iraq War?',
  'answers': [{'answer': 'Yes',
    'answer_type': 'Boolean',
    'source_url': 'https://www.senate.gov/legislative/LIS/roll_call_votes/vote1072/vote_107_2_00237.htm',
    'source_medium': 'Web text',
    'boolean_explanation': 'When Joe Biden was a U.S. Senator, he voted in favor of a resolution on Iraq in October 2022.',
    'cached_source_url': 'https://web.archive.org/web/20230409144327/https://www.senate.gov/legislative/LIS/roll_call_votes

In [6]:
# answers_df = pd.DataFrame.from_dict([x['answers'] for x in question_answer_list])
# answers_df

In [24]:
# Use the first dev record as a working example and display it.
example_record = datasets['dev'][0]
example_record

{'claim': 'In a letter to Steve Jobs, Sean Connery refused to appear in an apple commercial.',
 'required_reannotation': False,
 'label': 'Refuted',
 'justification': 'The answer and sources show that the claim was published in a fake news site so the claim is refuted.',
 'claim_date': '31-10-2020',
 'speaker': None,
 'original_claim_url': None,
 'fact_checking_article': 'https://web.archive.org/web/20201130144023/https://checkyourfact.com/2020/11/03/fact-check-sean-connery-letter-steve-jobs-apple-1998/',
 'reporting_source': 'Facebook',
 'location_ISO_code': None,
 'claim_types': ['Event/Property Claim'],
 'fact_checking_strategies': ['Written Evidence'],
 'questions': [{'question': 'Where was the claim first published',
   'answers': [{'answer': 'It was first published on Sccopertino',
     'answer_type': 'Abstractive',
     'source_url': 'https://web.archive.org/web/20201129141238/https://scoopertino.com/exposed-the-imac-disaster-that-almost-was/',
     'source_medium': 'Web text',


# GPT-4o mini generation

In [25]:
# JSON string with five example question-answer pairs; it is interpolated into the
# generation prompt as the example output format.
# Only the Boolean examples carry a "boolean_explanation"; the Abstractive and
# Extractive examples omit that field.
EXAMPLE_QA = json.dumps({
    "questions": [
        {
            "question": "Did the person claim to have experience in the field?",
            "answers": [
                {
                    "answer": "No",
                    "answer_type": "Boolean",
                    "boolean_explanation": "The person's documented career history does not include any work in the field."
                }
            ]
        },
        {
            "question": "How did the source describe the nature of the organization?",
            "answers": [
                {
                    "answer": "The organization is described as a satirical news site.",
                    "answer_type": "Abstractive",
                }
            ]
        },
        {
            "question": "What did the person say about their involvement?",
            "answers": [
                {
                    "answer": "According to the interview, the person stated that they were never involved in the project.",
                    "answer_type": "Extractive",
                }
            ]
        },
        {
            "question": "Did any politicians condemn the rioting after the death of George Floyd in 2020?",
            "answers": [
                {
                    "answer": "Yes",
                    "answer_type": "Boolean",
                    "boolean_explanation": "Joe Biden condemned the rioting as needless destruction.",
                }
            ]
        },
        {
            "question": "What are the sources saying about the claim?",
            "answers": [
                {
                    "answer": "Sources suggest that the claim originated from a questionable website.",
                    "answer_type": "Abstractive",
                }
            ]
        }
    ]
})

In [26]:
import openai
from openai import OpenAI
import json
from pydantic import BaseModel

# Module-level OpenAI client used by generate_noisy_evidence; the API key is read
# from the OPENAI_API_KEY environment variable rather than being hard-coded.
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)


class Answer(BaseModel):
    # One answer to a generated question.
    # All three fields are required strings; the generation prompt asks the model to
    # leave boolean_explanation empty for non-Boolean answer types.
    answer: str
    answer_type: str
    boolean_explanation: str


class QA(BaseModel):
    # A single question together with its list of answers.
    question: str
    answers: list[Answer]


class QAList(BaseModel):
    # Top-level container; passed as response_format to the chat completion parse call.
    questions: list[QA]


def generate_noisy_evidence(claim, model="gpt-4o-mini", max_tokens=600):
    """Ask the chat model for noisy question-answer pairs about ``claim``.

    The prompt requests five question-answer pairs that each touch a sub-part of the
    claim without confirming or refuting it as a whole, formatted like ``EXAMPLE_QA``.
    The request is made with ``temperature=0`` and ``QAList`` as the response format.

    Args:
        claim: Text of the claim to generate question-answer pairs for.
        model: Name of the chat model to query (defaults to the previously
            hard-coded "gpt-4o-mini").
        max_tokens: Completion token limit (defaults to the previously hard-coded 600).

    Returns:
        - the parsed ``QAList`` object when the response could be parsed;
        - the refusal text (a ``str``) when the model refused;
        - ``None`` when the call raised any exception (the exception is printed) or
          when the response carries neither a parsed object nor a refusal.
        Callers must check for the last two cases before using the result.
    """
    # Define the prompt for noisy QA generation
    prompt = f"""    
    Given the following claim, generate five question-answer pairs related to specific subparts or subjects/objects within the claim.
    Each question-answer pair must not individually nor collectively provide enough [ANY] information to confirm or refute the entire claim. 
    Each question-answer pair should focus on a different aspect of the claim, such as specific people, events, actions, or entities mentioned.    
    The evidence (each question-answer pair) should be relevant and plausible to the context of the claim, but not definitive or conclusive. The generated questions and answers should introduce noise and uncertainty into the claim verification process, making it difficult to verify the overall truthfulness of the claim.
    An evidence could support or refute only part of the claim, but not all the claim.
    Or it could be a relevant fact about part of the claim, but not the whole claim. 
    
    
    Ensure that:
    1. The questions and answers are tied to different subjects or objects in the claim, covering multiple elements within the claim.
    2. At least one question-answer pair is generated for each answer type: Boolean (Yes/No), Abstractive (paraphrased or summarized content), and Extractive (directly quoted text from a source).
    3. Format the output as a JSON object with a list of questions and answers.


    Claim: "{claim}"

    Example output format:
    
    {EXAMPLE_QA}    

    Use the boolean_explanation field to provide explanation only for Boolean answer types, leave it empty for other answer types.
    The answer field for boolean answer types should be "Yes" or "No".
    Now, generate the question-answer pairs in the format described above for the provided claim.
    """

    try:
        # Call the OpenAI API
        completion = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {"role": "system",
                 "content": "You are a helpful assistant generating noisy question-answer pairs for fact-checking."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            n=1,  # completion choices
            stop=None,  # Up to 4 sequences where the API will stop generating further tokens.
            temperature=0,
            response_format=QAList,
        )

        # Extract the content from the API response
        response = completion.choices[0].message
        if response.parsed:
            return response.parsed
        if response.refusal:
            # The model declined to answer: hand the refusal text back to the caller.
            return response.refusal
    except Exception as e:
        # Best-effort generation: report the failure and fall through to None so a
        # caller can decide whether to retry (e.g. with a larger max_tokens).
        print(e)

    return None



## Example usage

In [27]:
# Example usage
claim = "In a letter to Steve Jobs, Sean Connery refused to appear in an Apple commercial."
# qa_pairs = generate_noisy_evidence(claim)
# qa_pairs
# claim
# json.loads(qa_pairs.json())

# qa_gen_list = json.loads(qa_pairs.json())['questions']
# example_record = copy.deepcopy(datasets['dev'][0])
# questions_list = example_record['questions']
# # create a list gold_evidence where 1 means the evidence is gold and 0 means it is generated.
# gold_evidence = [1] * len(questions_list)
# questions_list.extend(qa_gen_list)
# gold_evidence.extend([0] * len(qa_gen_list))
# # shuffle the questions

# rnd = random.Random(42)
# c = list(zip(gold_evidence, questions_list))
# rnd.shuffle(c)
# gold_evidence, questions_list = zip(*c)
# 
# 
# # update the record with the new questions list
# example_record['questions'] = questions_list
# example_record['gold_evidence'] = gold_evidence

# Adding generated QA pairs to the dataset

In [28]:

# adding the generated QA pairs to the dataset


In [29]:
# Display the example record again for reference.
example_record

{'claim': 'In a letter to Steve Jobs, Sean Connery refused to appear in an apple commercial.',
 'required_reannotation': False,
 'label': 'Refuted',
 'justification': 'The answer and sources show that the claim was published in a fake news site so the claim is refuted.',
 'claim_date': '31-10-2020',
 'speaker': None,
 'original_claim_url': None,
 'fact_checking_article': 'https://web.archive.org/web/20201130144023/https://checkyourfact.com/2020/11/03/fact-check-sean-connery-letter-steve-jobs-apple-1998/',
 'reporting_source': 'Facebook',
 'location_ISO_code': None,
 'claim_types': ['Event/Property Claim'],
 'fact_checking_strategies': ['Written Evidence'],
 'questions': [{'question': 'Where was the claim first published',
   'answers': [{'answer': 'It was first published on Sccopertino',
     'answer_type': 'Abstractive',
     'source_url': 'https://web.archive.org/web/20201129141238/https://scoopertino.com/exposed-the-imac-disaster-that-almost-was/',
     'source_medium': 'Web text',


In [30]:
from explainable_fact_checking import C


def add_generated_evidence(record, qa_pairs, random_seed=41):
    """Mix generated question-answer pairs into ``record`` and flag which are gold.

    The record's existing questions are treated as gold evidence (flag 1) and the
    generated ones as noise (flag 0). Both are shuffled together with a seeded RNG so
    the order is reproducible, and the flags stay aligned with the questions.

    ``record`` is modified in place and also returned; pass a copy to keep the
    original intact. The ``C.EV_KEY`` and ``'id'`` entries are removed when present
    (presumably so they are rebuilt by a later preprocessing step — TODO confirm).

    Args:
        record: Dict with a ``'questions'`` list.
        qa_pairs: Object exposing ``.json()`` that returns a JSON document with a
            ``'questions'`` list (e.g. the parsed model output).
        random_seed: Seed for the shuffle.

    Returns:
        The same ``record`` with ``'questions'`` and ``'gold_evidence'`` set to
        equally long lists.
    """
    # Local import: `random` is not imported explicitly in the visible notebook cells.
    import random

    qa_gen_list = json.loads(qa_pairs.json())['questions']
    questions_list = list(record['questions'])
    # 1 marks gold evidence, 0 marks generated evidence.
    gold_evidence = [1] * len(questions_list) + [0] * len(qa_gen_list)
    questions_list.extend(qa_gen_list)

    # Shuffle flags and questions together so they stay aligned.
    rnd = random.Random(random_seed)
    paired = list(zip(gold_evidence, questions_list))
    rnd.shuffle(paired)

    # Rebuild as lists (not tuples, and safe when there is no evidence at all).
    record['questions'] = [question for _, question in paired]
    record['gold_evidence'] = [flag for flag, _ in paired]
    # Tolerate records that do not carry these keys.
    record.pop(C.EV_KEY, None)
    record.pop('id', None)
    return record

def generate_add_qa(record):
    """Return a deep copy of ``record`` augmented with generated noisy evidence.

    Generates question-answer pairs for ``record['claim']`` and merges them into the
    copy via ``add_generated_evidence``; the input record is left untouched.

    Raises:
        RuntimeError: if generation did not produce a parsed result, i.e.
            ``generate_noisy_evidence`` returned ``None`` (API error) or a refusal
            string. Previously this surfaced as an opaque ``AttributeError``.
    """
    record = copy.deepcopy(record)
    qa_gen = generate_noisy_evidence(record['claim'])
    if qa_gen is None or isinstance(qa_gen, str):
        raise RuntimeError(
            f'QA generation failed for claim {record["claim"]!r}: {qa_gen!r}'
        )
    return add_generated_evidence(record, qa_gen)

In [31]:


# dataset_with_genqa = []
# for r in tqdm(datasets['dev'][:3]):
#     dataset_with_genqa.append(generate_add_qa(r))
# dataset_with_genqa = preprocess_AVERITEC(dataset_with_genqa)

100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:08<00:00,  2.96s/it]


# Generate QA pairs for each record in the dataset (five per claim, as requested in the prompt)

In [32]:
# Generate noisy QA evidence for a slice of the dev set, resuming from previously saved output.
genqa_path = os.path.join(preprocessed_path, 'dev_with_genqa.json')
try:
    with open(genqa_path, 'r') as f_in:
        dataset_with_genqa = json.load(f_in)
except FileNotFoundError:
    print('File not found')
    dataset_with_genqa = []

# A set keeps the "already generated" check constant-time per record.
done_ids = {x['id'] for x in dataset_with_genqa}
for r in tqdm(datasets['dev'][10:50]):
    if r['id'] in done_ids:
        continue
    dataset_with_genqa.append(generate_add_qa(r))
# NOTE(review): records reloaded from disk were already preprocessed in an earlier run
# and go through preprocess_AVERITEC again here; new records arrive without 'id'.
# Confirm that the ids written to the file match the dev ids used in the skip check above.
dataset_with_genqa = preprocess_AVERITEC(dataset_with_genqa)
# save
with open(genqa_path, 'w') as f_out:
    json.dump(dataset_with_genqa, f_out)

100%|███████████████████████████████████████████████████████████████████████████████████| 40/40 [03:19<00:00,  4.98s/it]


In [4]:
# Display the augmented dataset (prints every record).
dataset_with_genqa

[{'claim': 'In a letter to Steve Jobs, Sean Connery refused to appear in an apple commercial.',
  'required_reannotation': False,
  'label': 'Refuted',
  'justification': 'The answer and sources show that the claim was published in a fake news site so the claim is refuted.',
  'claim_date': '31-10-2020',
  'speaker': None,
  'original_claim_url': None,
  'fact_checking_article': 'https://web.archive.org/web/20201130144023/https://checkyourfact.com/2020/11/03/fact-check-sean-connery-letter-steve-jobs-apple-1998/',
  'reporting_source': 'Facebook',
  'location_ISO_code': None,
  'claim_types': ['Event/Property Claim'],
  'fact_checking_strategies': ['Written Evidence'],
  'questions': [{'question': 'Where was the claim first published',
    'answers': [{'answer': 'It was first published on Sccopertino',
      'answer_type': 'Abstractive',
      'source_url': 'https://web.archive.org/web/20201129141238/https://scoopertino.com/exposed-the-imac-disaster-that-almost-was/',
      'source_medi