## Ingestion

In [20]:
import pandas as pd

In [21]:
# Read the exercise catalogue, then expose it as a list of plain dicts
# (one dict per row, keyed by column name).
df = pd.read_csv(
    '../data/fitness_exercises_500.csv'
)
documents = df.to_dict('records')

In [22]:
# Preview the first rows; the cell output lists the columns available in each record.
df.head()

Unnamed: 0,id,exercise_name,type_of_activity,type_of_equipment,body_part,type,muscle_groups_activated,instruction
0,0,Push-Up Hold,mobility,dip belt,lower,stretch,"glutes, quads, hamstrings",Setup: Prepare equipment: dip belt. Ensure sta...
1,1,Explosive Lateral Raise Pulse,cardio,barbells,full body,hold,"back, chest, legs",Setup: Set barbell with collars secured; grip ...
2,2,Rotational Jumping Jack Iso,warm-up,barbells,upper,push,"biceps, forearms, chest",Setup: Set barbell with collars secured; grip ...
3,3,Wide-Grip Running,strength,kettlebells,upper,pull,"deltoids, triceps, forearms",Setup: Place kettlebell close to midline; hing...
4,4,Decline Dips,cardio,barbells,core,stretch,"abs, lower back, obliques",Setup: Set barbell with collars secured; grip ...


In [43]:
from openai import OpenAI

# One client instance, constructed with no explicit arguments and reused by
# llm() and generate_questions() further down.
client = OpenAI()

In [26]:
# Prompt used to generate evaluation questions from a single exercise record.
# Single-brace fields are filled in by str.format; the doubled braces around
# the JSON example are escapes that render as literal "{" and "}".
prompt_template = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instruction: {instruction}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [28]:
# Fill the template with the first record to get one sample prompt.
first_record = documents[0]
prompt = prompt_template.format_map(first_record)

In [44]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn prompt to the chat completions API.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to call. Defaults to 'gpt-4o-mini',
            the model that was previously hard-coded here.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [45]:
# Send the sample prompt and keep the reply text returned by llm().
questions = llm(prompt)

In [46]:
import json

In [47]:
# Sanity check: parse the reply as JSON; the parsed value is shown as the cell output.
json.loads(questions)

{'questions': ['What preparation is needed before starting the Push-Up Hold exercise?',
  'How should I breathe during the Push-Up Hold to ensure effectiveness and safety?',
  'What muscles are primarily activated during the Push-Up Hold?',
  'How long should I hold the stretch for each side during the Push-Up Hold?',
  'What common mistakes should I avoid while performing the Push-Up Hold?']}

In [48]:
def generate_questions(doc):
    """Ask the model for questions about one exercise record.

    Args:
        doc: Mapping that provides every field referenced by
            ``prompt_template``; extra keys are ignored by ``str.format``.

    Returns:
        The message content of the first choice, as a string. The prompt
        asks for a JSON object of the form ``{"questions": [...]}``.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: ask the API for a syntactically valid JSON object so the
        # caller's json.loads() does not fail on code fences or stray prose.
        # The API requires the word "JSON" in the messages; the prompt
        # template already contains it.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [49]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
# Maps a document id to the list of questions generated for it.
# NOTE(review): presumably kept in its own cell so that re-running the
# generation loop does not discard results already collected — confirm.
results = {}

In [51]:
# Ids already present in `results` are skipped, so running this cell again
# only requests questions for documents that are still missing.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions_raw = generate_questions(doc)
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [19:55<00:00,  2.39s/it]


In [54]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) pair per question.
final_results = []

for doc_id, questions in results.items():
    final_results.extend((doc_id, q) for q in questions)

In [56]:
# One row per (document id, question) pair.
df_results = pd.DataFrame(
    data=final_results,
    columns=['id', 'question'],
)

In [57]:
# Write the pairs to disk without the DataFrame index column.
df_results.to_csv(
    path_or_buf='../data/ground-truth-retrieval.csv',
    index=False,
)