In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from pathlib import Path

In [5]:
# -------------------------
# Config (tweak as needed)
# -------------------------
# Cohort size: how many synthetic patients to generate.
N_PATIENTS = 12
# Inclusive bounds, in days, for each patient's hospital stay length.
STAY_DAYS_MIN = 3
STAY_DAYS_MAX = 14
# Anchor timestamp (2025-01-01 08:00); individual admissions stagger from it.
START_DATE = datetime(year=2025, month=1, day=1, hour=8)
# Directory that receives the generated CSVs.
OUTDIR = Path("./")

In [7]:
# Meal schedule: one offset from midnight per daily meal (08:00, 13:00, 19:00)
MEAL_TIMES = [timedelta(hours=hour) for hour in (8, 13, 19)]

# Reproducibility: the simulation draws from both the stdlib and NumPy
# global generators, so each one is seeded.
random.seed(42)
np.random.seed(42)

def _iso(dt: datetime) -> str:
    return dt.strftime("%Y-%m-%d %H:%M:%S")  # naive ISO (no tz)

def _add_row(rows, pid, concept, t: datetime, value):
    """Append one event record to ``rows`` (the list is mutated in place).

    Events are modelled as instantaneous: the record's ``EndTime`` is always
    exactly one second after its ``StartTime``.

    Args:
        rows: List collecting the record dicts.
        pid: Patient identifier stored under ``PatientId``.
        concept: Concept name stored under ``ConceptName``.
        t: Event timestamp; formatted with ``_iso``.
        value: Payload stored unchanged under ``Value``.
    """
    started = _iso(t)
    ended = _iso(t + timedelta(seconds=1))
    record = dict(
        PatientId=pid,
        ConceptName=concept,
        StartTime=started,
        EndTime=ended,
        Value=value,
    )
    rows.append(record)

def _simulate_patient(pid: int, base_admit: datetime):
    """Simulate one synthetic hospital stay and return its event rows.

    Draws a random admission time (0-20 days and 0-12 hours after
    ``base_admit``) and a length of stay between ``STAY_DAYS_MIN`` and
    ``STAY_DAYS_MAX`` days plus 0-6 extra hours, then emits via ``_add_row``:

    * ``ADMISSION`` at admission, and ``DEATH`` (15%) or ``RELEASE`` at the end;
    * ``DIABETES_DIAGNOSIS`` at admission (70%);
    * ``WEIGHT_MEASURE`` / ``BMI_MEASURE`` 1-6 hours after admission (80%);
    * for every calendar day touched by the stay: ``MEAL`` at each
      ``MEAL_TIMES`` offset, ``GLUCOSE_MEASURE`` every 4 hours and at 06:15,
      ``BASAL_DOSAGE`` / ``BASAL_ROUTE`` at 21:00 and, for about 85% of meals,
      ``BOLUS_DOSAGE`` / ``BOLUS_ROUTE`` within -10..+20 minutes of the meal.

    Only timestamps inside ``[admit_time, end_time]`` are emitted.

    Args:
        pid: Patient identifier stored in every row.
        base_admit: Anchor datetime from which the admission is staggered.

    Returns:
        list[dict]: Event rows in emission order (not sorted by time).

    Note:
        Consumes the global ``random`` and ``np.random`` streams, so the
        result depends on their current state.
    """
    rows = []

    ROUTES = ["SubCutaneous", "IntraVenous"]  # Capital V to match XML
    MEAL_LABELS = ["Breakfast", "Lunch", "Dinner"]  # parallel to MEAL_TIMES

    # Wider value helpers (so you can test ranges/thresholds)
    def clamp(v, lo, hi):
        return lo if v < lo else hi if v > hi else v

    def noisy(mean, sd, outlier_prob, outlier_sd):
        # Normal draw plus an occasional extra zero-mean perturbation.
        # Draw order (normal, uniform, optional normal) is fixed so seeded
        # runs stay reproducible.
        val = np.random.normal(mean, sd)
        if random.random() < outlier_prob:
            val += np.random.normal(0, outlier_sd)
        return val

    def glucose_cap_value(day_idx: int):
        # Baseline oscillates with the day index; clamp to 30..600
        base = 140 + 15 * np.sin((day_idx * np.pi) / 2)
        return round(float(clamp(base + noisy(0, 50, 0.12, 120), 30, 600)), 1)

    def glucose_lab_value():
        return round(float(clamp(noisy(145, 45, 0.08, 100), 30, 600)), 1)

    def basal_units_value():
        # much wider dosing; clamp to 0..100 U
        return round(float(clamp(noisy(24, 12, 0.10, 25), 0, 100)), 1)

    def bolus_units_value():
        # wider bolus distribution; clamp to 0..50 U
        return round(float(clamp(noisy(10, 6, 0.12, 15), 0, 50)), 1)

    # Admission time + length of stay
    admit_time = base_admit + timedelta(days=random.randint(0, 20),
                                        hours=random.randint(0, 12))
    los_days = random.randint(STAY_DAYS_MIN, STAY_DAYS_MAX)
    end_time = admit_time + timedelta(days=los_days, hours=random.randint(0, 6))

    def in_stay(ts):
        return admit_time <= ts <= end_time

    # Admission + outcome
    _add_row(rows, pid, "ADMISSION", admit_time, True)
    if random.random() < 0.15:
        _add_row(rows, pid, "DEATH", end_time, True)
    else:
        _add_row(rows, pid, "RELEASE", end_time, True)

    # Diabetes dx (70%) at admission
    if random.random() < 0.70:
        _add_row(rows, pid, "DIABETES_DIAGNOSIS", admit_time, True)

    # Weight and BMI measurements (80% of patients, shortly after admission)
    if random.random() < 0.80:
        measure_time = admit_time + timedelta(hours=random.randint(1, 6))

        # Weight (kg): normal around 75, clamped to [45, 150]
        weight = round(float(clamp(np.random.normal(75, 18), 45, 150)), 1)
        _add_row(rows, pid, "WEIGHT_MEASURE", measure_time, weight)

        # BMI: normal around 26, clamped to [16, 45]; drawn independently of weight
        bmi = round(float(clamp(np.random.normal(26, 5), 16, 45)), 1)
        _add_row(rows, pid, "BMI_MEASURE", measure_time, bmi)

    # Per-day timeline within stay.
    # Iterate over CALENDAR days (midnight to midnight), not 24h steps from the
    # admission time: every timestamp below is built from the day's midnight,
    # and a stay whose end crosses midnight would otherwise never visit its
    # final calendar day (dropping that day's in-window 00:00 reading).
    first_midnight = admit_time.replace(hour=0, minute=0, second=0, microsecond=0)
    day = 0
    midnight = first_midnight
    while midnight <= end_time:
        # Meals with meal names as values
        for i, mt in enumerate(MEAL_TIMES):
            ts = midnight + mt
            if in_stay(ts):
                meal_name = MEAL_LABELS[i % len(MEAL_LABELS)]
                _add_row(rows, pid, "MEAL", ts, meal_name)

        # Capillary glucose every 4h (wider range).
        # NOTE(review): capillary and lab readings are both emitted under
        # "GLUCOSE_MEASURE"; an earlier comment mentioned GLUCOSE_LAB_MEASURE --
        # confirm which concept name the consumer expects.
        for h in range(0, 24, 4):
            ts = midnight.replace(hour=h)
            if in_stay(ts):
                _add_row(rows, pid, "GLUCOSE_MEASURE", ts, glucose_cap_value(day))

        # Lab glucose once/day at 06:15 (wider range)
        lab_ts = midnight.replace(hour=6, minute=15)
        if in_stay(lab_ts):
            _add_row(rows, pid, "GLUCOSE_MEASURE", lab_ts, glucose_lab_value())

        # Emit BASAL_DOSAGE and BASAL_ROUTE at SAME timestamp (for tuple merging)
        basal_ts = midnight.replace(hour=21)
        if in_stay(basal_ts):
            units = basal_units_value()
            route = random.choice(ROUTES)
            _add_row(rows, pid, "BASAL_DOSAGE", basal_ts, units)
            _add_row(rows, pid, "BASAL_ROUTE", basal_ts, route)

        # Emit BOLUS_DOSAGE and BOLUS_ROUTE at SAME timestamp (for tuple merging)
        for mt in MEAL_TIMES:
            if random.random() < 0.85:
                ts = midnight + mt + timedelta(minutes=random.randint(-10, 20))
                if in_stay(ts):
                    units = bolus_units_value()
                    route = random.choice(ROUTES)
                    _add_row(rows, pid, "BOLUS_DOSAGE", ts, units)
                    _add_row(rows, pid, "BOLUS_ROUTE", ts, route)

        day += 1
        midnight = first_midnight + timedelta(days=day)

    return rows

# -------------------------
# Generate cohort
# -------------------------
# Patients are simulated in ascending id order, starting at 1000.
all_rows = []
for pid in range(1000, 1000 + N_PATIENTS):
    all_rows += _simulate_patient(pid, START_DATE)

# StartTime is a "YYYY-MM-DD HH:MM:SS" string, so sorting it as text is
# chronological.
df = pd.DataFrame(all_rows)
df = df.sort_values(by=["PatientId", "StartTime", "ConceptName"])
df = df.reset_index(drop=True)

# -------------------------
# Write outputs
# -------------------------
OUTDIR.mkdir(parents=True, exist_ok=True)
combined_path = OUTDIR / "input_data.csv"
df.to_csv(combined_path, index=False)

print(f"Wrote combined CSV to: {combined_path.resolve()}")
print(f"Rows: {len(df):,} | Patients: {df['PatientId'].nunique()}")
print("Columns:", list(df.columns))

# Show summary of generated measurements
concept_counts = df.groupby("ConceptName").size().sort_values(ascending=False)
print("\nMeasurement counts:")
print(concept_counts)

# Number of distinct patients that received each optional measurement
weight_patients = df[df["ConceptName"] == "WEIGHT_MEASURE"]["PatientId"].nunique()
bmi_patients = df[df["ConceptName"] == "BMI_MEASURE"]["PatientId"].nunique()
print(f"\nPatients with WEIGHT_MEASURE: {weight_patients}/{N_PATIENTS}")
print(f"\nPatients with BMI_MEASURE: {bmi_patients}/{N_PATIENTS}")

display(df.head(50))

Wrote combined CSV to: C:\Users\yonat\CodeProjects\Mediator\backend\data\input_data.csv
Rows: 1,778 | Patients: 12
Columns: ['PatientId', 'ConceptName', 'StartTime', 'EndTime', 'Value']

Measurement counts:
ConceptName
GLUCOSE_MEASURE       696
MEAL                  302
BOLUS_DOSAGE          262
BOLUS_ROUTE           262
BASAL_DOSAGE          101
BASAL_ROUTE           101
ADMISSION              12
RELEASE                12
BMI_MEASURE            10
DIABETES_DIAGNOSIS     10
WEIGHT_MEASURE         10
dtype: int64

Patients with WEIGHT_MEASURE: 10/12

Patients with BMI_MEASURE: 10/12


Unnamed: 0,PatientId,ConceptName,StartTime,EndTime,Value
0,1000,ADMISSION,2025-01-21 09:00:00,2025-01-21 09:00:01,True
1,1000,DIABETES_DIAGNOSIS,2025-01-21 09:00:00,2025-01-21 09:00:01,True
2,1000,GLUCOSE_MEASURE,2025-01-21 12:00:00,2025-01-21 12:00:01,172.4
3,1000,MEAL,2025-01-21 13:00:00,2025-01-21 13:00:01,Lunch
4,1000,BOLUS_DOSAGE,2025-01-21 13:07:00,2025-01-21 13:07:01,14.6
5,1000,BOLUS_ROUTE,2025-01-21 13:07:00,2025-01-21 13:07:01,IntraVenous
6,1000,BMI_MEASURE,2025-01-21 15:00:00,2025-01-21 15:00:01,25.3
7,1000,WEIGHT_MEASURE,2025-01-21 15:00:00,2025-01-21 15:00:01,83.9
8,1000,GLUCOSE_MEASURE,2025-01-21 16:00:00,2025-01-21 16:00:01,216.2
9,1000,MEAL,2025-01-21 19:00:00,2025-01-21 19:00:01,Dinner
