**Why:** The cohort-filtering and mortality-labeling logic in this notebook is foundational for every other notebook (e.g., training, SHAP, FHIR export). Mock data is sufficient to write, test, and refine this logic before running it on the real tables.

In [1]:
from datetime import datetime, timedelta

import IPython.display as disp
import pandas as pd

**Mock Data**

In [None]:
# Death timestamps reused by both mock tables (patients 1002 and 1003).
_death_1002 = pd.to_datetime('2100-01-08')
_death_1003 = pd.to_datetime('2100-02-20')

# Stand-in for admissions.csv: four admissions, one per subject.
admissions_df = pd.DataFrame(dict(
    subject_id=[1001, 1002, 1003, 1004],
    hadm_id=[2001, 2002, 2003, 2004],
    admittime=pd.to_datetime(
        ['2100-01-01', '2100-01-05', '2100-02-10', '2100-02-15']
    ),
    dischtime=pd.to_datetime(
        ['2100-01-10', '2100-01-07', '2100-02-12', '2100-02-20']
    ),
    # NaT marks admissions with no recorded death time.
    deathtime=[pd.NaT, _death_1002, _death_1003, pd.NaT],
))

# Stand-in for patients.csv: one row per subject, keyed by subject_id.
patients_df = pd.DataFrame(dict(
    subject_id=[1001, 1002, 1003, 1004],
    gender=['M', 'F', 'M', 'F'],
    anchor_age=[65, 74, 55, 80],
    # Date of death; NaT means no death is recorded for the subject.
    dod=[pd.NaT, _death_1002, _death_1003, pd.to_datetime('2100-03-01')],
))


**Cohort Filtering + Labeling**

In [3]:
# Number of calendar days after discharge that still counts as a positive label.
MORTALITY_WINDOW_DAYS = 30

# Merge patient + admission info.
# validate='many_to_one' raises if patients_df has duplicate subject_id rows,
# which would otherwise silently duplicate admissions and then be masked by
# the de-duplication step further down.
merged_df = admissions_df.merge(
    patients_df, on='subject_id', how='left', validate='many_to_one'
)

# Define mortality label: death within 30 days of discharge (inclusive of the
# discharge date itself).
# Compare calendar dates rather than raw timestamps: `.dt.days` floors toward
# negative infinity, so a date-only `dod` (midnight) on the discharge date
# combined with a `dischtime` that carries a clock time would yield -1 and be
# labeled 0.
# NOTE(review): real admissions/patients exports presumably have exactly this
# date-only vs. timestamp shape — confirm against the actual tables.
days_to_death = (
    pd.to_datetime(merged_df['dod']).dt.normalize()
    - pd.to_datetime(merged_df['dischtime']).dt.normalize()
).dt.days

# A missing `dod` gives NaN days; `between` treats NaN as False, so patients
# with no recorded death get label 0.
merged_df['mortality_30d'] = (
    days_to_death.between(0, MORTALITY_WINDOW_DAYS).astype(int)
)

# Optional: Filter for index admissions (e.g., only first per patient).
# hadm_id breaks ties between admissions with identical admittime so the row
# that is kept is deterministic. The original row index is preserved.
merged_df = (
    merged_df
    .sort_values(by=['admittime', 'hadm_id'])
    .drop_duplicates(subset='subject_id', keep='first')
)

# ---- Display Result ----
disp.display(merged_df[['subject_id', 'hadm_id', 'admittime', 'dischtime', 'dod', 'mortality_30d']])


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,dod,mortality_30d
0,1001,2001,2100-01-01,2100-01-10,NaT,0
1,1002,2002,2100-01-05,2100-01-07,2100-01-08,1
2,1003,2003,2100-02-10,2100-02-12,2100-02-20,1
3,1004,2004,2100-02-15,2100-02-20,2100-03-01,1
