Purpose: Imports and runs the data_generator script to create your synthetic dataset.

Output: Saves the raw CSV files to the data/raw/ directory.

In [29]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os

Configuration: the user ID, the number of days to simulate, the start date, and the daily probability of logging symptoms.

In [30]:
# Identifier of the single synthetic user this notebook generates data for.
USER_ID = "user_123"
# Number of consecutive days to simulate (90 days is roughly 3 months).
NUM_DAYS = 90
# First simulated day; every later day is an offset from this date.
START_DATE = datetime(2025, 1, 1)
# Probability that a given day gets any symptom entries at all.
SYMPTOM_PROBABILITY_PER_DAY = 0.3

# Seed the standard-library RNG so that "Restart Kernel -> Run All" reproduces
# the same dataset. Change RANDOM_SEED to get a different (but still
# repeatable) dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

Define the universe of foods (each with a list of descriptive tags) and the universe of symptoms that the generator samples from.

In [31]:
# (food name, tags) pairs. The order here is the order of the keys in
# FOOD_UNIVERSE, so keep it stable if reproducible sampling matters.
_FOOD_TAG_PAIRS = (
    ("Pizza Slice", ["gluten", "dairy", "processed", "high-fat"]),
    ("Mac & Cheese", ["gluten", "dairy", "processed"]),
    ("Cheeseburger", ["gluten", "dairy", "processed", "red-meat"]),
    ("Cereal with Milk", ["gluten", "dairy", "processed", "sugary"]),
    ("Spicy Curry", ["spicy", "high-fat", "dairy-free"]),
    ("Buffalo Wings", ["spicy", "processed", "high-fat"]),
    ("Large Apple", ["high-fiber", "fruit", "raw"]),
    ("Bean Burrito", ["high-fiber", "gluten"]),
    ("Lentil Soup", ["high-fiber", "legume"]),
    ("Grilled Chicken Breast", ["low-fodmap", "protein"]),
    ("Steamed Rice", ["low-fodmap", "carb"]),
    ("Cooked Carrots", ["low-fodmap", "vegetable"]),
    ("Scrambled Eggs", ["low-fodmap", "protein"]),
    ("Gluten-Free Toast", ["gluten-free"]),
)

# Maps each food name to a record of the form {"tags": [...]}.
FOOD_UNIVERSE = {
    food_name: {"tags": food_tags} for food_name, food_tags in _FOOD_TAG_PAIRS
}

# Symptom labels that a symptom entry can carry.
SYMPTOM_UNIVERSE = [
    "Abdominal Pain",
    "Bloating",
    "Fatigue",
    "Nausea",
    "Urgency",
]

Create the single synthetic user profile, with a randomly chosen age, gender, and diagnosis date.

In [32]:
# Draw the random profile attributes one at a time, in a fixed order:
# age, then gender, then how many days before START_DATE the diagnosis was.
profile_age = random.randint(25, 45)
profile_gender = random.choice(["Male", "Female", "Non-binary"])
days_since_diagnosis = random.randint(180, 2000)
diagnosed_on = START_DATE - timedelta(days=days_since_diagnosis)

# A one-element list of records describing the user.
user_data = [
    dict(
        user_id=USER_ID,
        age=profile_age,
        gender=profile_gender,
        diagnosis_date=diagnosed_on.strftime('%Y-%m-%d'),
    )
]

In [33]:
# Accumulators for the generated records: one list for meal entries and a
# separate list for symptom entries.
all_meals_logs, all_symptoms_logs = [], []

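Generate the synthetic logs: three meals for every day, plus one or two symptom entries on a random subset of days. Symptoms are drawn independently of the meals.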

In [34]:
# Start from empty logs every time this cell runs. Without this, re-executing
# the cell would append a second copy of every record (with continuing IDs)
# to whatever the lists already held.
all_meals_logs = []
all_symptoms_logs = []

# Hour-of-day window (earliest, latest) in which each meal is logged.
MEAL_WINDOWS = {
    "Breakfast": (7, 9),
    "Lunch": (12, 14),
    "Dinner": (18, 20),
}

# Build the list of food names once instead of once per meal.
food_names = list(FOOD_UNIVERSE)

for day_num in range(NUM_DAYS):
    current_date = START_DATE + timedelta(days=day_num)

    # --- A: Generate Meals for the day ---
    # One meal per window, at a uniformly random time inside that window.
    for meal_type, (earliest_hour, latest_hour) in MEAL_WINDOWS.items():
        meal_time = current_date + timedelta(hours=random.uniform(earliest_hour, latest_hour))

        chosen_food_name = random.choice(food_names)
        chosen_food_tags = FOOD_UNIVERSE[chosen_food_name]["tags"]

        all_meals_logs.append({
            # IDs are sequential across the whole run: meal_1, meal_2, ...
            "meal_id": f"meal_{len(all_meals_logs)+1}",
            "user_id": USER_ID,
            "timestamp": meal_time.strftime('%Y-%m-%d %H:%M:%S'),
            "meal_type": meal_type,
            "food_name": chosen_food_name,
            # Tags are flattened into one comma-separated string.
            "food_tags": ", ".join(chosen_food_tags)
        })

    # --- B: Generate Symptoms for the day (Independent of meals) ---
    # A day has symptoms with probability SYMPTOM_PROBABILITY_PER_DAY; the
    # foods eaten that day play no part in the decision.
    if random.random() < SYMPTOM_PROBABILITY_PER_DAY:
        num_symptoms_today = random.randint(1, 2)
        for _ in range(num_symptoms_today):
            # Random time between hour 8 and hour 22 of the day.
            symptom_time = current_date + timedelta(hours=random.uniform(8, 22))
            symptom_type = random.choice(SYMPTOM_UNIVERSE)
            # Random integer severity from 3 to 8 inclusive.
            severity = random.randint(3, 8)

            all_symptoms_logs.append({
                # IDs are sequential across the whole run: symp_1, symp_2, ...
                "symptom_id": f"symp_{len(all_symptoms_logs)+1}",
                "user_id": USER_ID,
                "timestamp": symptom_time.strftime('%Y-%m-%d %H:%M:%S'),
                "symptom": symptom_type,
                "severity": severity
            })


Convert the user record and the collected meal and symptom logs into pandas DataFrames.

In [35]:
# Pin the column layout of each table explicitly. If a log list happens to be
# empty (for example, no symptom was generated), pd.DataFrame([]) would have
# no columns at all, and any later step that refers to "timestamp" would fail
# with a KeyError. With `columns=` an empty log still yields an empty table
# that has the right header.
USER_COLUMNS = ["user_id", "age", "gender", "diagnosis_date"]
MEAL_COLUMNS = ["meal_id", "user_id", "timestamp", "meal_type", "food_name", "food_tags"]
SYMPTOM_COLUMNS = ["symptom_id", "user_id", "timestamp", "symptom", "severity"]

df_user = pd.DataFrame(user_data, columns=USER_COLUMNS)
df_meals = pd.DataFrame(all_meals_logs, columns=MEAL_COLUMNS)
df_symptoms = pd.DataFrame(all_symptoms_logs, columns=SYMPTOM_COLUMNS)

Sort both logs by timestamp: the times are drawn at random, so entries within a day are not guaranteed to be in chronological order.

In [36]:
def _chronological(frame):
    """Return `frame` ordered by its "timestamp" column, with a fresh 0..n-1 index."""
    ordered = frame.sort_values(by="timestamp")
    return ordered.reset_index(drop=True)


df_meals = _chronological(df_meals)
df_symptoms = _chronological(df_symptoms)

Define the output directory and create it if it does not already exist.

In [37]:
# Folder that receives the CSV files; the path is relative, so it resolves
# against the current working directory.
output_dir = "data/raw"

# Create the folder (and any missing parents); do nothing if it is already there.
os.makedirs(name=output_dir, exist_ok=True)

Build the output file path for each DataFrame.

In [38]:
# One CSV path per table, all inside output_dir.
user_path, meals_path, symptoms_path = (
    os.path.join(output_dir, csv_name)
    for csv_name in ("user.csv", "meals.csv", "symptoms.csv")
)

Save each DataFrame as a CSV file in the output directory.

In [39]:
# Write each table to its CSV file, leaving the row index out of the file.
for frame, destination in (
    (df_user, user_path),
    (df_meals, meals_path),
    (df_symptoms, symptoms_path),
):
    frame.to_csv(destination, index=False)