In [1]:
# Review all the generated questions and, where they don't pertain to the intended topic, disambiguate the title and regenerate them.

In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
# Populate environment variables via python-dotenv (presumably from a local
# .env file); OPENAI_API_KEY is read with os.getenv when the client is built.
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Load the generated questions; each row carries a 'Title' and its 'Questions'.
df = pd.read_csv(filepath_or_buffer="list_of_questions.csv")

In [3]:
# Number of rows loaded.
df.shape[0]

100

In [4]:
# Print every title followed by its questions so they can be checked by eye.
for title, questions in zip(df['Title'], df['Questions']):
    print(title, "\n", questions, "\n\n")

Employee accommodation 
 ["What does employee accommodation mean in the workplace?", "What is the meaning of employee accommodation at work?", "How is employee accommodation defined in a workplace setting?"] 


George II of Great Britain 
 ["When did George II of Great Britain become king?", "What year did George II of Great Britain start his reign?", "In which year did George II of Great Britain take the throne?"] 


Elephant Island 
 ["Where is Elephant Island located?", "In which part of the world can Elephant Island be found?", "Elephant Island lies off the coast of which region?"] 


Regent's Canal 
 ["What is the main purpose of Regent's Canal in London?", "What was Regent's Canal originally built to be used for?", "What is Regent's Canal mainly used for in London?"] 


Frankie Muse Freeman 
 ["What is Frankie Muse Freeman best known for?", "What is the main reason Frankie Muse Freeman is remembered?", "What is the primary achievement that made Frankie Muse Freeman famous?"] 


2

We reviewed all the questions and noticed that some of them do not relate to their title. We will manually replace these with more relevant questions by disambiguating the titles, as shown below.

In [5]:
# Titles whose generated questions were off-topic, and the disambiguated form
# of each one. The two lists are parallel (same order, same length).
ambiguous_titles = [
    "Jerome Cooper",
    "Calling Time",
    "Linton",
]
disambiguated_titles = [
    "Jerome Cooper (musician)",
    "Calling Time (album)",
    "Linton (village)",
]

In [6]:
# Keep only the rows whose title is not one of the ambiguous ones, then show
# how many rows remain.
df = df.loc[~df["Title"].isin(ambiguous_titles)]
len(df)

97

In [7]:
# OpenAI client; the API key comes from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [8]:
# Ask the model for three paraphrased questions per disambiguated title and
# collect the replies; `df` is extended once, after all requests succeeded.
generated_rows = []
for topic in disambiguated_titles:
    response = client.responses.create(
        model="gpt-5.1",
        instructions = """
            You generate paraphrased questions in English by STRICTLY following the instructions below.

            For each topic:
            - All questions MUST ask for the same underlying information.
            - ONLY wording should change for ALL the questions (same question type, same answer).
            - Each question should be something you'd expect an average person to ask.
            - Each question must be a simple sentence, no connectors like "and", "but", "so".
            - Output ONLY a valid Python list of 3 strings, nothing else.
            """,
        input=f"Generate 3 similar one-line questions on the topic {topic} that you'd expect an average person to ask.",
    )

    # The reply text is stored as-is (it is not parsed into a list here).
    generated_rows.append({"Title": topic, "Questions": response.output_text})

# Append all new rows with a single concat: concatenating inside the loop
# re-copies the whole frame for every topic, and a failed request midway would
# leave `df` partially extended.
if generated_rows:
    df = pd.concat([df, pd.DataFrame(generated_rows)], ignore_index=True)


In [9]:
# Display the frame to confirm the regenerated rows were appended at the end.
df

Unnamed: 0,Title,Questions
0,Employee accommodation,"[""What does employee accommodation mean in the..."
1,George II of Great Britain,"[""When did George II of Great Britain become k..."
2,Elephant Island,"[""Where is Elephant Island located?"", ""In whic..."
3,Regent's Canal,"[""What is the main purpose of Regent's Canal i..."
4,Frankie Muse Freeman,"[""What is Frankie Muse Freeman best known for?..."
...,...,...
95,Carlo Fonseka,"[""Who was Carlo Fonseka in Sri Lankan public l..."
96,Belinda (book),"[""What is the plot of the book Belinda?"", ""Wha..."
97,Jerome Cooper (musician),"[""Who is Jerome Cooper the musician?"", ""Can yo..."
98,Calling Time (album),"[""What is the album Calling Time about?"", ""Can..."


In [10]:
# Write the updated questions back to the CSV that was loaded at the top.
# index=False keeps the row index out of the file; with the default it is saved
# as an extra unnamed column, which read_csv then surfaces as "Unnamed: 0" the
# next time the file is loaded.
df.to_csv('list_of_questions.csv', index=False)