In [1]:
# Check all the generated questions and disambiguate the titles whose questions don't pertain to the topic.

In [2]:
from datasets import Dataset, load_dataset
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
from unidecode import unidecode
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
# Load the generated questions; the cells below read its "Title" and "Questions" columns.
df = pd.read_csv('list_of_questions.csv')

In [4]:
# Number of rows loaded from the CSV.
len(df)

100

In [5]:
# Dump every title next to its question list so the pairs can be checked by eye.
for title, questions in zip(df['Title'], df['Questions']):
    print(title, "\n", questions, "\n\n")

Employee accommodation 
 [ """how can a company provide reasonable accommodation for an employee with a disability?""", """how should an employer arrange reasonable accommodation for a disabled worker?""", """how does a business make reasonable accommodation for an employee who has a disability?""" ] 


George II of Great Britain 
 [ """how long did george ii of great britain rule as king?""", """for how many years was george ii the king of great britain?""", """what was the length of george ii of great britain's reign?""" ] 


Elephant Island 
 [ """where is elephant island located in relation to antarctica?""", """in which part of the southern ocean is elephant island situated?""", """geographically, where can elephant island be found near the antarctic region?""" ] 


Regent's Canal 
 [ """how long is regent's canal in london?""", """what is the total length of regent's canal?""", """how many miles long is regent's canal?""" ] 


Frankie Muse Freeman 
 [ """why is frankie muse freem

We reviewed all the questions and found that some do not relate to their title. We manually disambiguate those titles and regenerate their questions, as shown below.

In [6]:
# Each ambiguous title mapped to its disambiguated replacement. The two parallel
# lists used by the following cells are derived from this single mapping.
title_disambiguation = {"Calling Time": "Calling Time (album)"}
ambiguous_titles = list(title_disambiguation.keys())
disambiguated_titles = list(title_disambiguation.values())

In [7]:
# Keep only the rows whose title is not in the ambiguous list, then show the new row count.
is_ambiguous = df["Title"].isin(ambiguous_titles)
df = df[~is_ambiguous]
len(df)

99

In [8]:
# The key is looked up in the process environment; a missing variable yields None.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [9]:
new_values_dict = {}  # not read by any visible cell; kept in case a later cell expects the name

# Ask the model for three paraphrased questions per disambiguated title.
# The rows are collected in a list and appended with one concat after the loop,
# rather than copying the whole frame once per topic.
regenerated_rows = []
for topic in disambiguated_titles:
    response = client.responses.create(
        model="gpt-5.1",
        instructions = """
            You generate paraphrased questions in English by STRICTLY following the instructions below.

            For each topic:
            - All questions MUST ask for the same underlying information.
            - ONLY wording should change for ALL the questions (same question type, same answer).
            - Each question should be something you'd expect an average person to ask.
            - Each question must be a simple sentence, no connectors like "and", "but", "so".
            - Output ONLY a valid Python list of 3 strings, nothing else.
            """,
        input=f"Generate 3 similar one-line questions on the topic {topic} that you'd expect an average person to ask.",
    )

    raw_output = response.output_text
    # Normalise the reply: lower-case, newlines turned into spaces, every double
    # quote widened to a triple quote, surrounding whitespace stripped, then
    # passed through unidecode.
    output = unidecode(raw_output.lower().replace("\n", " ").replace('\"', '\"""').strip())

    regenerated_rows.append({"Title": topic, "Questions": output})

# With no topics there is nothing to append, so df (and its index) is left as it was.
if regenerated_rows:
    df = pd.concat([df, pd.DataFrame(regenerated_rows)], ignore_index=True)


In [10]:
# Inspect the frame before it is written back to disk.
df

Unnamed: 0,Title,Questions
0,Employee accommodation,"[ """"""how can a company provide reasonable acco..."
1,George II of Great Britain,"[ """"""how long did george ii of great britain r..."
2,Elephant Island,"[ """"""where is elephant island located in relat..."
3,Regent's Canal,"[ """"""how long is regent's canal in london?"""""",..."
4,Frankie Muse Freeman,"[ """"""why is frankie muse freeman an important ..."
...,...,...
95,Jerome Cooper,"[ """"""how old was jerome cooper when he died?""""..."
96,Alex Rodriguez,"[ """"""how many home runs did alex rodriguez hit..."
97,Carlo Fonseka,"[ """"""which political ideology did carlo fonsek..."
98,Belinda (book),"[ """"""who wrote the novel belinda?"""""", """"""who i..."


In [11]:
# index=False: this notebook reads the file back with a plain pd.read_csv (no
# index_col), so a written index would return as an extra unnamed column on
# every read/write cycle.
df.to_csv('list_of_questions.csv', index=False)