In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from pathlib import Path

In [2]:
# -------------------------
# Generator settings (edit here, then re-run the cells below)
# -------------------------

# Number of synthetic patients to generate.
N_PATIENTS = 12

# Inclusive bounds, in whole days, for the randomly drawn length of stay.
STAY_DAYS_MIN = 3
STAY_DAYS_MAX = 14

# Anchor timestamp (2025-01-01 08:00, naive); each patient's admission is a
# random offset after this anchor.
START_DATE = datetime(year=2025, month=1, day=1, hour=8)

# Directory that receives the generated CSV.
OUTDIR = Path("./")

In [3]:
# Daily meal schedule expressed as offsets from midnight: 08:00, 13:00, 19:00.
MEAL_TIMES = [timedelta(hours=hour) for hour in (8, 13, 19)]

# Seed both random number generators used by the simulation so that every
# run produces the same data.
random.seed(42)
np.random.seed(42)

def _iso(dt: datetime) -> str:
    return dt.strftime("%Y-%m-%d %H:%M:%S")  # naive ISO (no tz)

def _add_row(rows, pid, concept, t: datetime, value):
    rows.append({
        "PatientId": pid,
        "ConceptName": concept,
        "StartTime": _iso(t),
        "EndTime": _iso(t + timedelta(seconds=1)),  # instantaneous events
        "Value": value,
    })

def _simulate_patient(pid: int, base_admit: datetime):
    rows = []

    ROUTES = ["SubCutaneous", "Intravenous"]  # emitted for BASAL_ROUTE / BOLUS_ROUTE

    def _add_route(concept_route: str, ts: datetime):
        _add_row(rows, pid, concept_route, ts, random.choice(ROUTES))

    # Admission time + length of stay
    admit_time = base_admit + timedelta(days=random.randint(0, 20),
                                        hours=random.randint(0, 12))
    los_days = random.randint(STAY_DAYS_MIN, STAY_DAYS_MAX)
    end_time = admit_time + timedelta(days=los_days, hours=random.randint(0, 6))

    # Admission + outcome
    _add_row(rows, pid, "ADMISSION", admit_time, True)
    if random.random() < 0.15:
        _add_row(rows, pid, "DEATH", end_time, True)
    else:
        _add_row(rows, pid, "RELEASE", end_time, True)

    # Diabetes dx (70%) at admission
    if random.random() < 0.70:
        _add_row(rows, pid, "DIABETES_DIAGNOSIS", admit_time, True)

    # Wider value helpers (so you can test ranges/thresholds)
    def clamp(v, lo, hi):
        return lo if v < lo else hi if v > hi else v

    def glucose_cap_value(day_idx: int):
        # broader variance + occasional excursions
        base = 140 + 15 * np.sin((day_idx * np.pi) / 2)  # slower day oscillation
        noise = np.random.normal(0, 50)                  # higher variance
        if random.random() < 0.12:                        # spikes/dips ~12%
            noise += np.random.normal(0, 120)
        return round(float(clamp(base + noise, 30, 600)), 1)

    def glucose_lab_value():
        val = np.random.normal(145, 45)                   # broader lab range
        if random.random() < 0.08:
            val += np.random.normal(0, 100)
        return round(float(clamp(val, 30, 600)), 1)

    def basal_units_value():
        # much wider dosing; clamp to 0..100 U
        val = np.random.normal(24, 12)
        if random.random() < 0.10:
            val += np.random.normal(0, 25)
        return round(float(clamp(val, 0, 100)), 1)

    def bolus_units_value():
        # wider bolus distribution; clamp to 0..50 U
        val = np.random.normal(10, 6)
        if random.random() < 0.12:
            val += np.random.normal(0, 15)
        return round(float(clamp(val, 0, 50)), 1)

    # Per-day timeline within stay
    day = 0
    while True:
        day_start = admit_time + timedelta(days=day)
        if day_start > end_time:
            break

        # Meals (3/day) with meal names as values
        MEAL_LABELS = ["Breakfast", "Lunch", "Dinner"]

        for i, mt in enumerate(MEAL_TIMES):
            ts = day_start.replace(hour=0, minute=0, second=0, microsecond=0) + mt
            if admit_time <= ts <= end_time:
                meal_name = MEAL_LABELS[i % len(MEAL_LABELS)]
                _add_row(rows, pid, "MEAL", ts, meal_name)

        # Capillary glucose every 4h (wider range)
        for h in range(0, 24, 4):
            ts = day_start.replace(hour=h, minute=0, second=0, microsecond=0)
            if admit_time <= ts <= end_time:
                _add_row(rows, pid, "GLUCOSE_CAPILLARY_MEASURE", ts, glucose_cap_value(day))

        # Lab glucose once/day ~06:15 (wider range)
        lab_ts = day_start.replace(hour=6, minute=15, second=0, microsecond=0)
        if admit_time <= lab_ts <= end_time:
            _add_row(rows, pid, "GLUCOSE_LAB_MEASURE", lab_ts, glucose_lab_value())

        # Basal insulin nightly ~21:00 + required BASAL_ROUTE
        basal_ts = day_start.replace(hour=21, minute=0, second=0, microsecond=0)
        if admit_time <= basal_ts <= end_time:
            units = basal_units_value()
            _add_row(rows, pid, "BASAL_DOSAGE", basal_ts, units)
            _add_route("BASAL_ROUTE", basal_ts)  # emit matching route

        # Bolus insulin near meals (most meals get a bolus; jitter) + required BOLUS_ROUTE
        for mt in MEAL_TIMES:
            if random.random() < 0.85:
                ts = day_start.replace(hour=0, minute=0, second=0, microsecond=0) \
                     + mt + timedelta(minutes=random.randint(-10, 20))
                if admit_time <= ts <= end_time:
                    units = bolus_units_value()
                    _add_row(rows, pid, "BOLUS_DOSAGE", ts, units)
                    _add_route("BOLUS_ROUTE", ts)  # emit matching route

        day += 1

    return rows

# -------------------------
# Build the cohort: one simulated stay per patient id, starting at 1000
# -------------------------
all_rows = []
for pid in range(1000, 1000 + N_PATIENTS):
    all_rows += _simulate_patient(pid, START_DATE)

# StartTime is a fixed-width "YYYY-MM-DD HH:MM:SS" string, so sorting it as
# text also orders the rows chronologically within each patient.
df = pd.DataFrame(all_rows)
df = df.sort_values(by=["PatientId", "StartTime", "ConceptName"])
df = df.reset_index(drop=True)

# -------------------------
# Persist the combined table and show a short summary
# -------------------------
OUTDIR.mkdir(parents=True, exist_ok=True)
combined_path = OUTDIR.joinpath("synthetic_input_data.csv")
df.to_csv(combined_path, index=False)

print(f"Wrote combined CSV to: {combined_path.resolve()}")
print(f"Rows: {len(df):,} | Patients: {df['PatientId'].nunique()}")
print("Columns:", list(df.columns))
display(df.head(50))

Wrote combined CSV to: C:\Users\yonat\CodeProjects\Mediator\backend\data\synthetic_input_data.csv
Rows: 1,604 | Patients: 12
Columns: ['PatientId', 'ConceptName', 'StartTime', 'EndTime', 'Value']


Unnamed: 0,PatientId,ConceptName,StartTime,EndTime,Value
0,1000,ADMISSION,2025-01-21 09:00:00,2025-01-21 09:00:01,True
1,1000,DIABETES_DIAGNOSIS,2025-01-21 09:00:00,2025-01-21 09:00:01,True
2,1000,GLUCOSE_CAPILLARY_MEASURE,2025-01-21 12:00:00,2025-01-21 12:00:01,164.8
3,1000,MEAL,2025-01-21 13:00:00,2025-01-21 13:00:01,Lunch
4,1000,BOLUS_DOSAGE,2025-01-21 13:06:00,2025-01-21 13:06:01,8.6
5,1000,BOLUS_ROUTE,2025-01-21 13:06:00,2025-01-21 13:06:01,SubCutaneous
6,1000,GLUCOSE_CAPILLARY_MEASURE,2025-01-21 16:00:00,2025-01-21 16:00:01,133.1
7,1000,MEAL,2025-01-21 19:00:00,2025-01-21 19:00:01,Dinner
8,1000,BOLUS_DOSAGE,2025-01-21 19:12:00,2025-01-21 19:12:01,19.5
9,1000,BOLUS_ROUTE,2025-01-21 19:12:00,2025-01-21 19:12:01,SubCutaneous
