In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [2]:
# Configurable parameters
# Number of synthetic patients; the driver loop generates one record set per id in range(NUM_PATIENTS).
NUM_PATIENTS = 500
# Inclusive (min, max) bounds for the number of random medical events drawn per patient.
EVENTS_RANGE = (50, 500)
# Inclusive (min, max) bounds for a patient's length of stay, in days.
HOSPITALIZATION_DAYS_RANGE = (3, 14)
# First day of the simulation window; admission dates are offsets from this date.
START_DATE = datetime(2023, 1, 1)
# Length of the simulation window in days. The admission offset is drawn from
# [0, TOTAL_DAYS - stay_length], so the maximum stay length must not exceed this value.
TOTAL_DAYS = 14

# Interval concepts that carry a value and last longer than 1 second.
#   *_STATE concepts take one of: High / Medium / Low
#   *_TREND concepts take one of: INC / DEC / Stable
STATES_TRENDS = [
    "GLUCOSE_LAB_MEASURE_STATE",
    "GLUCOSE_CAPILLARY_MEASURE_STATE",
    "BASAL_BITZUA_DOSAGE_STATE",
    "BOLUS_BITZUA_DOSAGE_STATE",
    "BMI_MEASURE_STATE",
    "HEMOGLOBIN_MEASURE_STATE",
    "HEMOGLOBIN_DISTRIBUTION_MEASURE_STATE",
    "HEMOGLOBIN-A1C_MEASURE_STATE",
    "HEMATOCRIT_MEASURE_STATE",
    "MCV_MEASURE_STATE",
    "MCH_MEASURE_STATE",
    "LYMPHOCYTES_MEASURE_STATE",
    "NEUTROPHILS_MEASURE_STATE",
    "MONOCYTES_MEASURE_STATE",
    "RDW_MEASURE_STATE",
    "PLT_MEASURE_STATE",
    "PLT_DISTRIBUTION_MEASURE_STATE",
    "MPV_MEASURE_STATE",
    "CREATININE_SERUM_MEASURE_STATE",
    "CREATININE_URINE_MEASURE_STATE",
    "UREA_MEASURE_STATE",
    "ALBUMIN_SERUM_MEASURE_STATE",
    "ALBUMIN_URINE_MEASURE_STATE",
    "SODIUM_BLOOD_MEASURE_STATE",
    "SODIUM_URINE_MEASURE_STATE",
    "POTASSIUM_MEASURE_STATE",
    "ASPARATE-AMINOTRANSFERASE_MEASURE_STATE",
    "ALANINE-AMINOTRANSFERASE_SERUM_MEASURE_STATE",
    "ALANINE-AMINOTRANSFERASE_BLOOD_MEASURE_STATE",
    "PH_MEASURE_STATE",
    "BICARBONATE_MEASURE_STATE",
    "ALKALINE-PHOSPHATASE_MEASURE_STATE",
    "BLOOD_PRESSURE_MEASURE_STATE",
    "BLOOD_PRESSURE_SYSTOLIC_MEASURE_STATE",
    "BLOOD_PRESSURE_DIASTOLIC_MEASURE_STATE",
    "HEART_RATE_MEASURE_STATE",
    "E-GFR_MEASURE_STATE",
    "CREATINE-KINASE_MEASURE_STATE",
    "CHOLESTEROL_MEASURE_STATE",
    "TROPONIN_SERUM_MEASURE_STATE",
    "TROPONIN_BLOOD_MEASURE_STATE",
    "KETONES_SERUM_MEASURE_STATE",
    "KETONES_URINE_MEASURE_STATE",
    "GLUCOSE_MEASURE_TREND",
    "GLUCOSE_LAB_MEASURE_TREND",
    "GLUCOSE_CAPILLARY_MEASURE_TREND",
    "HEMOGLOBIN_MEASURE_TREND",
    "HEMOGLOBIN_DISTRIBUTION_MEASURE_TREND",
    "HEMOGLOBIN-A1C_MEASURE_TREND",
    "HEMATOCRIT_MEASURE_TREND",
    "MCV_MEASURE_TREND",
    "MCH_MEASURE_TREND",
    "LYMPHOCYTES_MEASURE_TREND",
    "NEUTROPHILS_MEASURE_TREND",
    "MONOCYTES_MEASURE_TREND",
    "RDW_MEASURE_TREND",
    "PLT_MEASURE_TREND",
    "PLT_DISTRIBUTION_MEASURE_TREND",
    "MPV_MEASURE_TREND",
    "CREATININE_SERUM_MEASURE_TREND",
    "CREATININE_URINE_MEASURE_TREND",
    "UREA_MEASURE_TREND",
    "ALBUMIN_SERUM_MEASURE_TREND",
    "ALBUMIN_URINE_MEASURE_TREND",
    "SODIUM_BLOOD_MEASURE_TREND",
    "SODIUM_URINE_MEASURE_TREND",
    "POTASSIUM_MEASURE_TREND",
    "ASPARATE-AMINOTRANSFERASE_MEASURE_TREND",
    "ALANINE-AMINOTRANSFERASE_SERUM_MEASURE_TREND",
    "ALANINE-AMINOTRANSFERASE_BLOOD_MEASURE_TREND",
    "PH_MEASURE_TREND",
    "BICARBONATE_MEASURE_TREND",
    "ALKALINE-PHOSPHATASE_MEASURE_TREND",
    "BLOOD_PRESSURE_MEASURE_TREND",
    "BLOOD_PRESSURE_SYSTOLIC_MEASURE_TREND",
    "BLOOD_PRESSURE_DIASTOLIC_MEASURE_TREND",
    "HEART_RATE_MEASURE_TREND",
    "E-GFR_MEASURE_TREND",
    "CREATINE-KINASE_MEASURE_TREND",
    "CHOLESTEROL_MEASURE_TREND",
    "TROPONIN_SERUM_MEASURE_TREND",
    "TROPONIN_BLOOD_MEASURE_TREND"
]

# Medication-dosage interval concepts: each carries a value
# (High / Medium / Low) and lasts longer than 1 second.
STATE_MEDICATIONS = [
    "STEROIDS_DOSAGE_STATE",
    "ANTIDIABETIC_DRUGS_IV_DOSAGE_STATE",
    "ANTIDIABETIC_DRUGS_PO_DOSAGE_STATE",
]

# Point events: they carry no meaningful value (the generator writes the
# placeholder 'True') and always last exactly 1 second.
EVENT_NAMES = [
    "STATINE_BITZUA_EVENT",
    "PROMETHAZINE_BITZUA_EVENT",
    "MIDAZOLAM_BITZUA_EVENT",
    "DEXTROSE_BITZUA_EVENT",
    "CHLORAMPHENICOL_BITZUA_EVENT",
    "CALCIUM-GLUCONATE_BITZUA_EVENT",
    "BICARBONATE_BITZUA_EVENT",
    "BLOOD_PRESSURE_MEDS_BITZUA_EVENT",
    "MENTAL_HEALTH_MEDS_BITZUA_EVENT",
    "ANTIBIOTIC_DRUGS_BITZUA_EVENT",
    "ASPIRIN_BITZUA_EVENT",
    "NITROGLYCERIN_BITZUA_EVENT",
    "CLOPIDOGREL_BITZUA_EVENT",
    "HEPARIN_BITZUA_EVENT",
    "SODIUM_BITZUA_EVENT",
]

# Outcome events: no meaningful value (the generator writes the placeholder
# 'True') and a fixed 1-second duration. These need to be somewhat distributed
# in the generated data. RELEASE_EVENT and DEATH_EVENT are the two terminal
# outcomes; every other entry is treated as a complication by the generator.
OUTCOMES = [
    "RELEASE_EVENT",
    "DEATH_EVENT",
    "KETOACIDOSIS_EVENT",
    "KIDNEY_DISORDER_EVENT",
    "COMA_EVENT",
    "EYE_DISORDER_EVENT",
    "NERVOUS_SYSTEM_DISORDER_EVENT",
    "VASCULAR_DISORDER_EVENT",
    "OTHER_COMPLICATION_EVENT",
    "DEMENTIA_EVENT",
    "CARDIOVASCULAR_DISORDER_EVENT",
    "ULCER_EVENT",
    "INFECTION_EVENT",
    "MUSCULOSKELETAL_COMPLICATION_EVENT",
    "NEUROVASCULAR_COMPLICATION_EVENT"
]

In [5]:
def generate_patient_data(patient_id, rng=None):
    """Build the synthetic event rows for one hospitalized patient.

    The rows are produced in this order:

    1. one ``ADMISSION_EVENT`` at midnight of the admission day,
    2. three meal events (07:00, 12:00, 17:00) for every day of the stay,
    3. a random number of medical events (bounded by ``EVENTS_RANGE``):
       50% STATE/TREND intervals, 20% medication-dosage intervals and
       30% one-second point events,
    4. zero to three complication outcomes followed by exactly one terminal
       outcome (``DEATH_EVENT`` or ``RELEASE_EVENT``), each one second long
       and placed right after the end of the stay.

    Every random medical event is guaranteed to start and end inside the
    stay, so no measurement or medication interval outlives the
    discharge/death outcome rows.

    Args:
        patient_id: Integer id; rendered as ``P0000``-style ``PatientID``.
        rng: Optional source of randomness exposing ``randint``, ``random``,
            ``choice`` and ``sample`` (e.g. a ``random.Random`` instance).
            Defaults to the module-level ``random`` generator, which keeps
            the original single-argument call working unchanged.

    Returns:
        A list of dicts with the keys ``PatientID``, ``ConceptName``,
        ``StartDateTime``, ``EndDateTime`` and ``Value``.

    Raises:
        ValueError: If the drawn stay length does not fit in ``TOTAL_DAYS``.
    """
    rng = random if rng is None else rng
    patient_label = f'P{patient_id:04d}'
    one_second = timedelta(seconds=1)

    def make_row(concept, start, duration, value='True'):
        # Single place that fixes the column order of every emitted record.
        return {
            'PatientID': patient_label,
            'ConceptName': concept,
            'StartDateTime': start,
            'EndDateTime': start + duration,
            'Value': value,
        }

    stay_length_days = rng.randint(*HOSPITALIZATION_DAYS_RANGE)
    if stay_length_days > TOTAL_DAYS:
        raise ValueError(
            f"stay of {stay_length_days} days does not fit in TOTAL_DAYS={TOTAL_DAYS}"
        )
    # Pick the admission day so that the whole stay fits in the window.
    patient_start = START_DATE + timedelta(days=rng.randint(0, TOTAL_DAYS - stay_length_days))
    stay_seconds = stay_length_days * 86400

    # Admission
    admission_time = patient_start
    rows = [make_row('ADMISSION_EVENT', admission_time, one_second)]

    # Meals per day
    meal_schedule = (("Breakfast", 7), ("Lunch", 12), ("Dinner", 17))
    for day in range(stay_length_days):
        base_date = admission_time + timedelta(days=day)
        for meal, hour in meal_schedule:
            meal_time = base_date.replace(hour=hour, minute=0, second=0)
            rows.append(make_row(f'MEAL_{meal}', meal_time, one_second))

    # Random medical events
    n_events = rng.randint(*EVENTS_RANGE)
    for _ in range(n_events):
        cat = rng.random()
        if cat < 0.5:
            concept = rng.choice(STATES_TRENDS)
            # Match on the suffix rather than a substring so a concept name
            # that merely contains "STATE" cannot be misclassified.
            if concept.endswith('_STATE'):
                value = rng.choice(['High', 'Medium', 'Low'])
            else:
                value = rng.choice(['INC', 'DEC', 'Stable'])
            duration_seconds = rng.randint(360, 7200)  # between 6 min and 2 hours
        elif cat < 0.7:
            concept = rng.choice(STATE_MEDICATIONS)
            value = rng.choice(['Low', 'Medium', 'High'])
            duration_seconds = rng.randint(360, 7200)
        else:
            concept = rng.choice(EVENT_NAMES)
            value = 'True'
            duration_seconds = 1

        # Draw the start only after the duration is known, so the interval
        # always ends no later than the end of the stay.
        start_offset = rng.randint(0, stay_seconds - duration_seconds)
        rows.append(make_row(
            concept,
            admission_time + timedelta(seconds=start_offset),
            timedelta(seconds=duration_seconds),
            value,
        ))

    # Complications
    complications = []
    if rng.random() < 0.55:
        candidates = [o for o in OUTCOMES if o not in ('DEATH_EVENT', 'RELEASE_EVENT')]
        complications = rng.sample(candidates, k=rng.randint(1, 3))
    terminal = 'DEATH_EVENT' if rng.random() < 0.2 else 'RELEASE_EVENT'
    outcome_sequence = complications + [terminal]

    # Outcomes are emitted one second apart, starting just after the stay ends,
    # with the terminal outcome always last.
    outcome_time = patient_start + timedelta(seconds=stay_seconds + 1)
    for i, outcome in enumerate(outcome_sequence):
        rows.append(make_row(outcome, outcome_time + timedelta(seconds=i + 1), one_second))

    return rows

# Seed the module-level RNG that generate_patient_data draws from, so that
# Restart Kernel -> Run All reproduces exactly the same dataset and CSV.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate all patient data: one flat list of event records across patients.
all_rows = []
for pid in range(NUM_PATIENTS):
    all_rows.extend(generate_patient_data(pid))

df = pd.DataFrame(all_rows)

# Sort by PatientID and StartDateTime so each patient's events read chronologically.
df = df.sort_values(by=['PatientID', 'StartDateTime']).reset_index(drop=True)

# Persist the dataset (the positional index is dropped from the file).
df.to_csv("synthetic_diabetes_temporal_data.csv", index=False)

In [7]:
# Preview the first 50 rows of the event table (sorted by PatientID, then StartDateTime).
df.head(50)

Unnamed: 0,PatientID,ConceptName,StartDateTime,EndDateTime,Value
0,P0000,ADMISSION_EVENT,2023-01-10 00:00:00,2023-01-10 00:00:01,True
1,P0000,MONOCYTES_MEASURE_TREND,2023-01-10 00:08:51,2023-01-10 01:38:32,DEC
2,P0000,ANTIDIABETIC_DRUGS_PO_DOSAGE_STATE,2023-01-10 00:22:20,2023-01-10 01:55:01,Low
3,P0000,ANTIBIOTIC_DRUGS_BITZUA_EVENT,2023-01-10 01:33:00,2023-01-10 01:33:01,True
4,P0000,NITROGLYCERIN_BITZUA_EVENT,2023-01-10 01:39:19,2023-01-10 01:39:20,True
5,P0000,MENTAL_HEALTH_MEDS_BITZUA_EVENT,2023-01-10 02:12:36,2023-01-10 02:12:37,True
6,P0000,TROPONIN_BLOOD_MEASURE_TREND,2023-01-10 02:35:39,2023-01-10 03:52:20,INC
7,P0000,CREATINE-KINASE_MEASURE_TREND,2023-01-10 02:38:28,2023-01-10 04:31:31,Stable
8,P0000,ANTIDIABETIC_DRUGS_PO_DOSAGE_STATE,2023-01-10 02:44:28,2023-01-10 04:30:51,High
9,P0000,ALANINE-AMINOTRANSFERASE_SERUM_MEASURE_STATE,2023-01-10 03:27:42,2023-01-10 05:15:08,High
