In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from pathlib import Path

In [2]:
# ------------------------------------------------
# Configuration: adjust these to reshape the run
# ------------------------------------------------
# How many synthetic patients to generate.
N_PATIENTS = 12

# Inclusive bounds, in days, for each patient's randomly drawn length of stay.
STAY_DAYS_MIN = 3
STAY_DAYS_MAX = 14

# Anchor timestamp (2025-01-01 08:00); admissions are staggered forward from it.
START_DATE = datetime(year=2025, month=1, day=1, hour=8)

# Directory that receives the generated CSV.
OUTDIR = Path("./")

In [3]:
# Concept vocabulary, grouped by the kind of value each concept carries.

# Concepts whose rows hold a numeric Value (measurements and dosages).
NUMERIC_CONCEPTS = [
    "GLUCOSE_LAB_MEASURE",        # lab glucose
    "GLUCOSE_CAPILLARY_MEASURE",  # capillary glucose
    "BASAL_BITZUA_DOSAGE",        # basal insulin dose
    "BOLUS_BITZUA_DOSAGE",        # bolus insulin dose
    "BMI_MEASURE",                # body-mass index
]

# Concepts whose rows only mark that an event occurred.
DISCRETE_CONCEPTS = ["DIABETES_DIAGNOSIS", "ADMISSION", "RELEASE", "DEATH", "MEAL"]


In [4]:
# Daily meal schedule as offsets from midnight: 08:00, 13:00 and 19:00.
MEAL_TIMES = [timedelta(hours=meal_hour) for meal_hour in (8, 13, 19)]

# Seed both random sources used by the simulation so reruns give the same data.
np.random.seed(42)
random.seed(42)

def _iso(dt: datetime) -> str:
    return dt.strftime("%Y-%m-%d %H:%M:%S")  # naive ISO (no tz)

def _add_row(rows, pid, concept, t: datetime, value):
    """Append one event record to ``rows`` (the list is mutated in place).

    Args:
        rows: List collecting the record dictionaries.
        pid: Patient identifier stored under ``PatientId``.
        concept: Concept name stored under ``ConceptName``.
        t: Event timestamp; becomes ``StartTime``.
        value: Payload stored under ``Value``.

    Every event is treated as instantaneous, so ``EndTime`` is always one
    second after ``StartTime``. Both timestamps are stored as formatted text.
    """
    started = _iso(t)
    finished = _iso(t + timedelta(seconds=1))
    record = {
        "PatientId": pid,
        "ConceptName": concept,
        "StartTime": started,
        "EndTime": finished,
        "Value": value,
    }
    rows.append(record)

def _simulate_patient(pid: int, base_admit: datetime):
    """Simulate one hospital stay and return its event rows.

    The admission time is ``base_admit`` plus a random 0-20 days and 0-12
    hours. The stay lasts a random whole number of days between
    ``STAY_DAYS_MIN`` and ``STAY_DAYS_MAX`` (inclusive) plus 0-6 extra hours.
    Scheduled events (meals, glucose checks, insulin doses) are emitted only
    when their timestamp falls inside ``[admit_time, end_time]``.

    Args:
        pid: Patient identifier written into every row.
        base_admit: Anchor timestamp the admission is staggered from.

    Returns:
        A list of row dictionaries as built by ``_add_row``, in generation
        order (not sorted by time).

    Note:
        Draws from the module-level ``random`` and ``np.random`` generators,
        so results depend on their current state.
    """
    rows = []

    # Admission time + length of stay
    admit_time = base_admit + timedelta(days=random.randint(0, 20),
                                        hours=random.randint(0, 12))
    los_days = random.randint(STAY_DAYS_MIN, STAY_DAYS_MAX)
    end_time = admit_time + timedelta(days=los_days, hours=random.randint(0, 6))

    def in_stay(ts: datetime) -> bool:
        """Return True when ``ts`` lies within the stay (bounds inclusive)."""
        return admit_time <= ts <= end_time

    # Admission + outcome (about 15% of stays end in death, the rest in release)
    _add_row(rows, pid, "ADMISSION", admit_time, True)
    if random.random() < 0.15:
        _add_row(rows, pid, "DEATH", end_time, True)
    else:
        _add_row(rows, pid, "RELEASE", end_time, True)

    # Diabetes dx (70%) at admission
    if random.random() < 0.70:
        _add_row(rows, pid, "DIABETES_DIAGNOSIS", admit_time, True)

    # Walk calendar dates rather than 24-hour steps from admit_time. A stay can
    # end shortly after midnight, on a date where "admit_time + N days" already
    # lies past end_time; stepping from admit_time would skip that date and
    # silently drop its early slots (e.g. the 00:00 glucose check).
    first_midnight = admit_time.replace(hour=0, minute=0, second=0, microsecond=0)
    n_calendar_days = (end_time.date() - admit_time.date()).days + 1

    for day in range(n_calendar_days):
        midnight = first_midnight + timedelta(days=day)

        # Meals (3/day)
        for mt in MEAL_TIMES:
            ts = midnight + mt
            if in_stay(ts):
                _add_row(rows, pid, "MEAL", ts, True)

        # Capillary glucose every 4h
        for h in range(0, 24, 4):
            ts = midnight + timedelta(hours=h)
            if in_stay(ts):
                baseline = 140 + 5 * np.sin((day * np.pi) / 3)  # gentle day-to-day oscillation
                value = max(60, baseline + np.random.normal(0, 15))
                _add_row(rows, pid, "GLUCOSE_CAPILLARY_MEASURE", ts, round(float(value), 1))

        # Lab glucose once/day at 06:15
        lab_ts = midnight + timedelta(hours=6, minutes=15)
        if in_stay(lab_ts):
            lab_val = max(60, 130 + np.random.normal(0, 12))
            _add_row(rows, pid, "GLUCOSE_LAB_MEASURE", lab_ts, round(float(lab_val), 1))

        # Basal insulin nightly at 21:00
        basal_ts = midnight + timedelta(hours=21)
        if in_stay(basal_ts):
            basal_units = max(6, np.random.normal(18, 4))
            _add_row(rows, pid, "BASAL_BITZUA_DOSAGE", basal_ts, round(float(basal_units), 1))

        # Bolus insulin near meals (most meals get a bolus; small jitter).
        # The 85% draw happens for every meal slot, in or out of the stay.
        for mt in MEAL_TIMES:
            if random.random() < 0.85:
                ts = midnight + mt + timedelta(minutes=random.randint(-10, 20))
                if in_stay(ts):
                    units = max(2, np.random.normal(8, 3))
                    _add_row(rows, pid, "BOLUS_BITZUA_DOSAGE", ts, round(float(units), 1))

        # BMI early; optional mid-stay if long
        if day == 0:
            bmi_ts = admit_time + timedelta(hours=random.randint(1, 12))
            bmi_val = max(16, np.random.normal(28, 4))
            _add_row(rows, pid, "BMI_MEASURE", bmi_ts, round(float(bmi_val), 1))
        if day == los_days // 2 and los_days >= 8 and random.random() < 0.5:
            # Offset from the admission time-of-day, not from midnight.
            bmi_ts2 = admit_time + timedelta(days=day) + timedelta(hours=random.randint(9, 15))
            bmi_val2 = max(16, np.random.normal(28, 4))
            _add_row(rows, pid, "BMI_MEASURE", bmi_ts2, round(float(bmi_val2), 1))

    return rows

# -------------------------
# Build the cohort
# -------------------------
# Patient ids are consecutive integers starting at 1000; every patient is
# simulated against the same START_DATE anchor.
all_rows = []
for pid in range(1000, 1000 + N_PATIENTS):
    all_rows += _simulate_patient(pid, START_DATE)

# StartTime is zero-padded text, so sorting it as a string is chronological.
df = pd.DataFrame(all_rows)
df = df.sort_values(by=["PatientId", "StartTime", "ConceptName"])
df = df.reset_index(drop=True)

# -------------------------
# Persist and summarize
# -------------------------
OUTDIR.mkdir(parents=True, exist_ok=True)
combined_path = OUTDIR.joinpath("synthetic_input_data.csv")
df.to_csv(combined_path, index=False)

print(f"Wrote combined CSV to: {combined_path.resolve()}")
print(f"Rows: {len(df):,} | Patients: {df['PatientId'].nunique()}")
print("Columns:", list(df.columns))

# Preview the first 50 rows.
display(df.head(50))

Wrote combined CSV to: C:\Users\yonat\CodeProjects\Mediator\backend\data\synthetic_input_data.csv
Rows: 1,355 | Patients: 12
Columns: ['PatientId', 'ConceptName', 'StartTime', 'EndTime', 'Value']


Unnamed: 0,PatientId,ConceptName,StartTime,EndTime,Value
0,1000,ADMISSION,2025-01-21 09:00:00,2025-01-21 09:00:01,True
1,1000,DIABETES_DIAGNOSIS,2025-01-21 09:00:00,2025-01-21 09:00:01,True
2,1000,BMI_MEASURE,2025-01-21 10:00:00,2025-01-21 10:00:01,34.3
3,1000,GLUCOSE_CAPILLARY_MEASURE,2025-01-21 12:00:00,2025-01-21 12:00:01,147.5
4,1000,MEAL,2025-01-21 13:00:00,2025-01-21 13:00:01,True
5,1000,BOLUS_BITZUA_DOSAGE,2025-01-21 13:07:00,2025-01-21 13:07:01,7.3
6,1000,GLUCOSE_CAPILLARY_MEASURE,2025-01-21 16:00:00,2025-01-21 16:00:01,137.9
7,1000,MEAL,2025-01-21 19:00:00,2025-01-21 19:00:01,True
8,1000,BOLUS_BITZUA_DOSAGE,2025-01-21 19:03:00,2025-01-21 19:03:01,7.3
9,1000,GLUCOSE_CAPILLARY_MEASURE,2025-01-21 20:00:00,2025-01-21 20:00:01,149.7
