In [1]:
# Import required modules
import os
import json
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Extract the key from the process environment (None if the variable is unset).
# NOTE(review): no other visible cell references OPENAI_API_KEY, and the client
# below is constructed without arguments — presumably it reads the same
# environment variable on its own; confirm before removing this cell.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:
# Create the client used for the chat-completion requests below.
# No credentials are passed explicitly here.
client = OpenAI()

In [4]:
# Import the data for the project.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv('../data/data.csv')

In [5]:
# Show the head of df to check which columns were loaded.
df.head()

Unnamed: 0,id,answer,source,focus_area
0,0,There are many different types and designs of ...,NIHSeniorHealth,Knee Replacement
1,1,"- a need to urinate frequently, especially at ...",NIHSeniorHealth,Prostate Cancer
2,2,Who Should Be Tested? The United States Preven...,NIHSeniorHealth,Osteoporosis
3,3,Risk Factors Diabetes and high blood pressure ...,NIHSeniorHealth,Kidney Disease
4,4,Kidney Disease Kidney disease is often called ...,NIHSeniorHealth,Kidney Disease


In [6]:
# Create the documents for analysis: one dict per row of df, keyed by column
# name, so that a record can be unpacked into prompt_template.format(**record).
documents = df.to_dict(orient='records')

In [7]:
# Create the prompt_template.
# {answer}, {source} and {focus_area} are placeholders filled in by str.format.
# The doubled braces around the JSON example are escapes, so str.format emits
# literal { and } there instead of treating them as a placeholder.
prompt_template = """
You emulate a user of the medical assistant.
Formulate 5 questions this user might ask based on a provided disease.
Make the questions specific to this disease.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record. 

The record:

answer: {answer}
source: {source}
focus_area: {focus_area}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [8]:
# Create a sample prompt from the first record, to try the template on a
# single document before running it over all of them.
prompt = prompt_template.format(**documents[0])

In [9]:
# Define the llm function.
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the value that was previously hard-coded, so existing calls
            behave exactly as before.

    Returns:
        The content of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [10]:
# Get the questions for the sample prompt (the raw text returned by llm).
questions = llm(prompt)

In [11]:
# Load the questions into json to check that the raw reply is parsable.
json.loads(questions)

{'questions': ['What are the different components of an artificial knee for knee replacement?',
  'What is the difference between total knee replacement and partial knee replacement?',
  'How are joint components attached to the bone during knee replacement surgery?',
  'What is minimally invasive surgery, and how does it differ from traditional knee replacement surgery?',
  'What are the advantages of using a hybrid implant in knee replacement?']}

In [12]:
# Define the generate_questions function.
def generate_questions(doc):
    """Ask the model for questions about one record and return its raw reply.

    Args:
        doc: Mapping that provides the fields referenced by prompt_template
            (answer, source, focus_area). Extra keys are ignored by
            str.format.

    Returns:
        The model's reply as an unparsed string; the generation loop decodes
        it with json.loads.
    """
    prompt = prompt_template.format(**doc)
    # Delegate to llm() instead of repeating the same chat-completions call,
    # so the model name and message layout are defined in one place.
    return llm(prompt)

In [13]:
# Define empty dictionary for results (record id -> list of questions).
# It lives in its own cell; the generation loop skips ids that are already
# present, so re-running only that loop resumes instead of starting over.
results = {}

In [14]:
# Generate questions for every record, filling `results` as we go.
for doc in tqdm(documents):
    doc_id = doc['id']
    # Only query the model for ids that have no stored questions yet.
    if doc_id not in results:
        questions_raw = generate_questions(doc)
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']

  0%|          | 0/198 [00:00<?, ?it/s]

In [15]:
# Get the final results.
# Flatten `results` into (id, question) pairs, one per generated question, in
# the same order as the dict and its lists. The comprehension keeps its
# iteration variables local, so it does not rebind the notebook-level
# `doc_id` and `questions` names that earlier cells assigned.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]

In [16]:
# Show first item in final_results as a quick check of the (id, question) pairs.
final_results[0]

(0, 'What are the main components of an artificial knee?')

In [17]:
# Create df_results dataframe with one row per (id, question) pair.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [18]:
# Save df_results as ground-truth-retrieval.csv.
# index=False leaves the DataFrame's row index out of the file.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [19]:
# Show the head of ground-truth-retrieval.csv.
# This is an IPython shell escape, so it needs a `head` command on the host.
!head ../data/ground-truth-retrieval.csv

id,question
0,What are the main components of an artificial knee?
0,What is the difference between total knee replacement and partial knee replacement?
0,How are the components of a knee joint attached to the bone?
0,What are the advantages of minimally invasive knee surgery?
0,What should I consider if I am interested in having a partial knee replacement?
1,What are the common symptoms associated with prostate cancer that I should be aware of?
1,How does prostate cancer affect urination patterns in men?
1,What are the potential sexual side effects of untreated prostate cancer?
1,Can prostate cancer lead to any other physical discomfort besides urination issues?
