In [1]:
# Review all the generated questions and disambiguate the titles whose questions don't pertain to the intended topic.

In [2]:
from datasets import Dataset, load_dataset
from config import Config
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
# Populate the process environment from a .env file (if one is found) so that
# os.getenv("OPENAI_API_KEY") can pick up the API key.
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
# Load the table of generated questions; the cells below use its
# "Title" and "Questions" columns.
df = pd.read_csv(filepath_or_buffer='list_of_questions.csv')

In [4]:
# Number of rows (one per title) in the loaded table.
df.shape[0]

100

In [5]:
# Print every title followed by its raw question list for a manual review.
for title, questions in zip(df['Title'], df['Questions']):
    print(title, "\n", questions, "\n\n")

Employee accommodation 
 ["What kind of accommodation do employees receive at this company?", "What employee accommodation options are available here?", "What types of accommodation are provided for employees?"] 


George II of Great Britain 
 ["Who was George II of Great Britain?", "Can you tell me who George II of Great Britain was?", "What should I know about George II of Great Britain?"] 


Elephant Island 
 ["Where is Elephant Island located?", "What country is Elephant Island part of?", "In which region of the world can I find Elephant Island?"] 


Regent's Canal 
 ["Where is Regent's Canal in London?", "Can you tell me where Regent's Canal is located in London?", "What part of London is Regent's Canal in?"] 


Frankie Muse Freeman 
 ["Who was Frankie Muse Freeman?", "Can you tell me who Frankie Muse Freeman was?", "What can you tell me about Frankie Muse Freeman?"] 


2023 Nottingham attacks 
 ["What happened in the 2023 Nottingham attacks?", "Can you explain what occurred durin

We reviewed all the questions and noticed that some of them do not relate to their title, because the title on its own is ambiguous. We manually disambiguate those titles and replace their questions with more relevant ones, as shown below.

In [6]:
# Each ambiguous title paired with its manually disambiguated form.
# Dict order is preserved, so both derived lists stay aligned index by index.
title_disambiguation = {
    "Dilemma of Two Angels": "Dilemma of Two Angels (movie)",
    "Calling Time": "Calling Time (album)",
    "You Can't Do That": "You Can't Do That (song)",
    "Linton": "Linton (village)",
}
ambiguous_titles = list(title_disambiguation.keys())
disambiguated_titles = list(title_disambiguation.values())

In [7]:
# Drop the rows whose title is one of the ambiguous ones, then report how many remain.
is_ambiguous = df["Title"].isin(ambiguous_titles)
df = df.loc[~is_ambiguous]
df.shape[0]

96

In [8]:
# Build the OpenAI client with the key taken from the environment
# (os.getenv returns None when the variable is not set).
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

In [9]:
# Generate three paraphrased questions for every disambiguated title and append
# them to `df`.
#
# Titles that are already present in `df` are skipped: this notebook writes its
# result back to the CSV it loads, so without the guard a re-run of this cell (or
# of the whole notebook) would repeat the API calls and append duplicate rows.

# System instructions sent with every request (identical for all topics).
PARAPHRASE_INSTRUCTIONS = """
            You generate paraphrased questions in English by STRICTLY following the instructions below.

            For each topic:
            - All questions MUST ask for the same underlying information.
            - ONLY wording should change for ALL the questions (same question type, same answer).
            - Each question should be something you'd expect an average person to ask.
            - Each question must be a simple sentence, no connectors like "and", "but", "so".
            - Output ONLY a valid Python list of 3 strings, nothing else.
            """

existing_titles = set(df["Title"])

# Maps each newly processed topic to the raw text returned by the model.
new_values_dict = {}
for topic in disambiguated_titles:
    if topic in existing_titles:
        continue

    response = client.responses.create(
        model="gpt-5.1",
        instructions=PARAPHRASE_INSTRUCTIONS,
        input=f"Generate 3 similar one-line questions on the topic {topic} that you'd expect an average person to ask.",
    )
    # NOTE(review): the text is stored verbatim. The prompt asks for a Python list
    # of 3 strings, but nothing here verifies that the model complied.
    new_values_dict[topic] = response.output_text

# Build all new rows at once instead of growing `df` with one concat per topic.
if new_values_dict:
    new_rows = pd.DataFrame({
        "Title": list(new_values_dict.keys()),
        "Questions": list(new_values_dict.values()),
    })
    df = pd.concat([df, new_rows], ignore_index=True)


In [10]:
# Display the table to check the rows appended for the disambiguated titles.
df

Unnamed: 0,Title,Questions
0,Employee accommodation,"[""What kind of accommodation do employees rece..."
1,George II of Great Britain,"[""Who was George II of Great Britain?"", ""Can y..."
2,Elephant Island,"[""Where is Elephant Island located?"", ""What co..."
3,Regent's Canal,"[""Where is Regent's Canal in London?"", ""Can yo..."
4,Frankie Muse Freeman,"[""Who was Frankie Muse Freeman?"", ""Can you tel..."
...,...,...
95,Belinda (book),"[""What is the novel Belinda about?"", ""Can you ..."
96,Dilemma of Two Angels (movie),"[""What is the movie Dilemma of Two Angels abou..."
97,Calling Time (album),"[""What is the album Calling Time about?"", ""Can..."
98,You Can't Do That (song),"[""What is the song 'You Can't Do That' about?""..."


In [11]:
# Persist the updated table to the same file it was loaded from.
# index=False keeps the positional index out of the CSV; with the default
# (index=True) the next pd.read_csv of this file would pick the index up as a
# spurious "Unnamed: 0" column.
df.to_csv('list_of_questions.csv', index=False)