In [8]:
import pandas as pd
import json
from collections import defaultdict
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial


# Load the dataset

In [4]:
# Read the question/answer CSV into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer='dataset/simpleQA.csv')

In [5]:
# Peek at the first five rows to confirm the expected columns loaded.
df.head(n=5)

Unnamed: 0,metadata,problem,answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.


In [6]:
# Inspect the last ten rows as well, to see how far the row index runs.
df.tail(n=10)

Unnamed: 0,metadata,problem,answer
4316,"{'topic': 'Sports', 'answer_type': 'Number', '...",How many gold medals did Kristin Otto win at t...,5.
4317,"{'topic': 'Video games', 'answer_type': 'Date'...","What day, month, and year did the Terraria ver...","June 2nd, 2011"
4318,"{'topic': 'Science and technology', 'answer_ty...",What year was John Monteath Robertson awarded ...,1983
4319,"{'topic': 'Geography', 'answer_type': 'Person'...",What is the name of the settler who selected t...,James B. Patterson
4320,"{'topic': 'Science and technology', 'answer_ty...",In which year was it reported that some lichen...,2012
4321,"{'topic': 'Art', 'answer_type': 'Date', 'urls'...","The book ""Rhine"" by Anselm Kiefer is from what...",1981.
4322,"{'topic': 'Video games', 'answer_type': 'Perso...",What was the first and last name of the voice ...,Jodelle Ferland
4323,"{'topic': 'Music', 'answer_type': 'Date', 'url...",What month and year was Miranda Lambert's albu...,October 2010
4324,"{'topic': 'Sports', 'answer_type': 'Date', 'ur...","Provide the day, month, and year Gazprom becam...",17th July 2012
4325,"{'topic': 'Other', 'answer_type': 'Other', 'ur...",What instrument did Kunihiko Kodaira's father ...,A piano


# Prompt for generating distractors

In [7]:
# Few-shot prompt template for distractor generation. It is filled in with
# str.format(question=..., answer=...), so {question} and {answer} are the only
# placeholders and the literal braces of the JSON example are doubled
# ({{ and }}) to survive formatting.
prompt = """You are an expert synthetic data generator. Your task is to generate three plausible but incorrect answers to a given question.

Guidelines for generating wrong answers:
1. Each answer should be factually incorrect but plausible within the context
2. Match the answer type (e.g. if asking for a date, provide wrong dates)
3. The wrong answers should be clearly distinct from the correct answer and from each other
4. Maintain a similar level of specificity as the original answer
5. The answers should be realistic and not obviously wrong

Example 1:
Question: What is the capital of France?
Answer: Paris
Wrong Answers: 
- Lyon
- Marseille 
- Bordeaux
Reason: All are major French cities, but incorrect as capital

Example 2:
Question: Who was the first president of the United States?
Answer: George Washington
Wrong Answers:
- John Adams
- Thomas Jefferson
- Benjamin Franklin
Reason: All are founding fathers but not the first president

Example 3:
Question: In what year did World War II end?
Answer: 1945
Wrong Answers:
- 1943
- 1944
- 1946
Reason: All are plausible years during or near WWII but not when it ended

Example 4:
Question: Who wrote Romeo and Juliet?
Answer: William Shakespeare
Wrong Answers:
- Christopher Marlowe
- Ben Jonson
- John Webster
Reason: All are prominent Elizabethan playwrights

Example 5:
Question: What is the largest planet in our solar system?
Answer: Jupiter
Wrong Answers:
- Saturn
- Neptune
- Uranus
Reason: All are gas giant planets, but smaller than Jupiter

Please generate three wrong answers that follow these guidelines for the given question.
The answers should be:
- Factually incorrect but plausible
- Match the same answer type (e.g. date, person, number)
- Clearly distinct from the correct answer and each other
- Similar in specificity/detail level
- Realistic and not obviously wrong

Return only three wrong answers as a list in JSON format with the following requirements:
- Each wrong answer should be a string
- The output should be a single JSON object with key "wrong_answers" 
- The value should be an array of exactly 3 wrong answers
- No explanations or additional text should be included
- The answers should maintain consistent formatting with the correct answer

Example format:
{{
    "wrong_answers": ["Wrong Answer 1", "Wrong Answer 2", "Wrong Answer 3"]
}}

Question: {question}
Correct Answer: {answer}
Generate three wrong answers:
"""

# LLM call to generate distractors

In [None]:
from openai import OpenAI

# Do not hard-code the API key in the notebook source (it ends up in version
# control and shared copies). With no api_key argument the client reads the key
# from the OPENAI_API_KEY environment variable and fails fast if it is unset.
client = OpenAI()

In [None]:
def _parse_wrong_answers(content):
    """Extract the ``wrong_answers`` list from the model's JSON reply and check its length."""
    wrong_answers = json.loads(content)['wrong_answers']
    # JSON mode only guarantees syntactically valid JSON, not the requested
    # shape; later cells index positions 0, 1 and 2 of this list.
    if not isinstance(wrong_answers, list) or len(wrong_answers) != 3:
        raise ValueError(f"Expected a list of exactly 3 wrong answers, got: {wrong_answers!r}")
    return wrong_answers


def generate_wrong_answer(question, answer, max_retries=3, model="gpt-4o-mini"):
    """
    Generate 3 plausible but incorrect answers for a question via the chat completions API.

    Uses the module-level ``prompt`` template and ``client``.

    Args:
        question (str): The question to generate wrong answers for
        answer (str): The correct answer to the question
        max_retries (int): Total number of attempts before giving up (default 3)
        model (str): Chat model to query (default "gpt-4o-mini")

    Returns:
        list: The 3 wrong answers returned by the model

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: The error from the final attempt is re-raised once all
            attempts have failed (API error, invalid JSON, missing
            "wrong_answers" key, or a list that does not hold exactly 3 items).

    Every failed attempt is printed before the next one is tried.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # Formatting is deterministic, so build the prompt once instead of per attempt.
    updated_prompt = prompt.format(question=question, answer=answer)

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": updated_prompt}],
                temperature=1,
                response_format={"type": "json_object"}
            )
            return _parse_wrong_answers(response.choices[0].message.content)
        except Exception as e:
            print("Error: ", e)
            if attempt == max_retries - 1:
                # Out of attempts: propagate the last error with its original traceback.
                raise

# Smoke test: one live request to check the prompt/client wiring before the full run.
print(generate_wrong_answer(question="What is the capital of India?", answer="New Delhi"))

In [None]:
# Row index -> list of generated wrong answers for that row.
index_incorrect_answers = defaultdict(list)


def process_row(index, df):
    """Generate wrong answers for the row at ``index`` and return ``(index, wrong_answers)``."""
    question = df['problem'][index]
    correct_answer = df['answer'][index]
    return index, generate_wrong_answer(question, correct_answer)


# Fan the API calls out over 8 worker threads. executor.map yields results in
# input order, so the progress bar advances row by row.
with ThreadPoolExecutor(max_workers=8) as executor:
    for index, wrong_answer in tqdm(
        executor.map(lambda row_index: process_row(row_index, df=df), range(len(df))),
        total=len(df),
    ):
        index_incorrect_answers[index] = wrong_answer



In [None]:
# Show the entry count and a five-item preview instead of dumping every row's
# results into the cell output.
len(index_incorrect_answers), dict(list(index_incorrect_answers.items())[:5])

In [None]:
# Cached generation results: JSON object mapping the row index (as a string)
# to that row's list of wrong answers.
RESULTS_PATH = 'index_incorrect_answers_final.json'

try:
    with open(RESULTS_PATH, 'r', encoding='utf-8') as f:
        index_incorrect_answers_final = json.load(f)
except FileNotFoundError:
    # No earlier cell writes this file, so on a first run the results exist
    # only in kernel memory. Persist them here so the (paid) generation step
    # does not have to be repeated after a kernel restart.
    # NOTE(review): presumably the "final" file is a plain dump of
    # index_incorrect_answers - confirm no manual post-processing is expected.
    if len(index_incorrect_answers) != len(df):
        raise RuntimeError(
            f"Refusing to cache partial results: {len(index_incorrect_answers)} "
            f"of {len(df)} rows have wrong answers"
        ) from None
    serialized = json.dumps(index_incorrect_answers)
    with open(RESULTS_PATH, 'w', encoding='utf-8') as f:
        f.write(serialized)
    # Round-trip through JSON so the keys are strings, exactly as on cached runs.
    index_incorrect_answers_final = json.loads(serialized)

len(index_incorrect_answers_final)

# Create a new dataframe with the wrong answers

In [None]:
# Copy the source frame and append one column per distractor slot
# (wrong_answer_1 .. wrong_answer_3). The loaded JSON is keyed by the row
# index as a string, hence str(label).
df_wrong = df.assign(**{
    f'wrong_answer_{slot + 1}': df.index.map(
        lambda label, slot=slot: index_incorrect_answers_final[str(label)][slot]
    )
    for slot in range(3)
})

df_wrong.head(n=20)

In [None]:
# Write the augmented dataset to disk; index=False leaves the row index out of the file.
df_wrong.to_csv(path_or_buf='synthetic_dataset_with_wrong_answers.csv', index=False)