In [3]:
import pandas as pd
import os

In [4]:
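# The API key is expected in the OPENAI_API_KEY environment variable.
# Prefer exporting it before starting Jupyter; never commit a real key here.
# os.environ['OPENAI_API_KEY'] = 'YOUR OPENAI KEY'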

In [5]:
from openai import OpenAI

# Shared client used by every chat-completion call in this notebook.
# No key is passed explicitly; presumably it is picked up from the
# OPENAI_API_KEY environment variable (see the commented-out cell above) — confirm.
client = OpenAI()

In [6]:
# Load the source records from CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../data/data.csv')
# One dict per row, keyed by column name; these dicts are what gets
# unpacked into the prompt template further down.
documents = df.to_dict(orient='records')

In [7]:
# Prompt used to generate ground-truth questions for one record.
# Single-brace fields ({name}, {cuisine}, ...) are filled via str.format;
# the doubled braces around the JSON example are escapes, so they come out
# as literal "{" and "}" in the rendered prompt.
prompt_template = """
You emulate a user of our food order assistant application.
Formulate 5 questions this user might ask based on a provided dish record.
Make the questions specific to this dish.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:
name: {name}
cuisine: {cuisine}
type: {type}
ingredients: {ingredients}
serving: {serving}
price: {price}
calories: {calories}


Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [8]:
# Render the template for the first record as a sample prompt
# (keys of the record that the template does not name are ignored by str.format).
prompt = prompt_template.format(**documents[0])

In [9]:
def llm(prompt, model='gpt-4o-mini'):
    """Send one user message to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of the single "user" message.
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the value this helper previously had hard-coded.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Only the first choice is read; no other choices are inspected.
    return response.choices[0].message.content

In [10]:
# Raw reply for the sample prompt; still a string at this point (it is parsed in a later cell).
questions = llm(prompt)

In [11]:
import json

In [12]:
# Decode the raw reply; the cell output below is the resulting Python object.
json.loads(questions)

{'questions': ['What ingredients are in the Margherita Pizza?',
  'How many calories does one slice of Margherita Pizza have?',
  'What is the price for a slice of Margherita Pizza?',
  'What type of dish is Margherita Pizza classified as?',
  'Is Margherita Pizza considered Italian cuisine?']}

In [13]:
def generate_questions(doc):
    """Ask the model for questions about a single record.

    Fills ``prompt_template`` with the fields of ``doc`` and sends the result
    through ``llm`` so the API call lives in one place instead of being
    duplicated here.

    Args:
        doc: Mapping whose keys supply the placeholders of ``prompt_template``;
            keys the template does not name are ignored by ``str.format``.

    Returns:
        The raw reply string from the model. It is not parsed here; the
        caller is responsible for decoding it.
    """
    prompt = prompt_template.format(**doc)
    return llm(prompt)

In [14]:
from tqdm.auto import tqdm

In [15]:
# Maps a record's id to the list of questions generated for it.
# Initialised in its own cell, presumably so the generation loop below can be
# re-run to resume without wiping what was already collected — confirm.
results = {}

In [16]:
# Generate questions for every record that has no entry in `results` yet.
for doc in tqdm(documents):
    doc_id = doc['id']
    # Ids already present are left alone, so re-running this cell only
    # calls the model for the records that are still missing.
    if doc_id not in results:
        questions_raw = generate_questions(doc)
        # The reply is expected to be a JSON object with a "questions" list.
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']

  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
# Flatten {id: [question, ...]} into a list of (id, question) pairs,
# keeping the insertion order of `results` and of each question list.
final_results = []

for doc_id, questions in results.items():
    final_results.extend((doc_id, q) for q in questions)

In [18]:
# Peek at the first (id, question) pair.
final_results[0]

(0, 'What type of cuisine is Margherita Pizza classified under?')

In [19]:
# Tabulate the (id, question) pairs under the column names 'id' and 'question'.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [20]:
# Write the pairs to CSV; index=False leaves the DataFrame's row index out of the file.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [21]:
# Shell escape: show the first lines of the file just written as a quick check.
!head ../data/ground-truth-retrieval.csv

id,question
0,What type of cuisine is Margherita Pizza classified under?
0,How many calories does one slice of Margherita Pizza contain?
0,What are the main ingredients found in Margherita Pizza?
0,How much does one slice of Margherita Pizza cost?
0,What type of dish is Margherita Pizza considered?
1,What is the name of the salad that contains grilled chicken?
1,Can you tell me the price of the Chicken Caesar Salad?
1,How many calories are in a bowl of Chicken Caesar Salad?
1,What type of cuisine does the Chicken Caesar Salad belong to?
