In [1]:
import os
from pathlib import Path

import IPython.display as disp
import pandas as pd


In [4]:
# Define base path to real MIMIC data.
# The location can be overridden without editing the notebook by setting the
# MIMIC_DATA_DIR environment variable; when it is unset, the original
# machine-specific location is used so existing runs behave exactly as before.
base_path = Path(
    os.environ.get(
        "MIMIC_DATA_DIR",
        "C:/Users/Rand Sobczak Jr/_rts/mlops/mimiccds/real_mimic/data/mimiciv",
    )
)


In [5]:
# Load datasets: read the patients and admissions tables from the "_hosp"
# folder under base_path, in that order, into their own DataFrames.
patients_df, admissions_df = (
    pd.read_csv(base_path / "_hosp" / csv_name)
    for csv_name in ("patients.csv", "admissions.csv")
)


In [6]:
# Preview the first two rows of the patients table as a quick check of the load.
patients_df.head(2)

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,


In [7]:
# Preview the first two rows of the admissions table as a quick check of the load.
admissions_df.head(2)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0


In [10]:
# (rows, columns) of the patients table.
patients_df.shape

(299712, 6)

In [9]:
# (rows, columns) of the admissions table.
admissions_df.shape

(431231, 16)

In [11]:
# Merge patient + admission info.
# The left join keeps every admission row and attaches the matching patient
# columns. validate='many_to_one' makes pandas raise a MergeError if
# subject_id is not unique in patients_df, instead of silently duplicating
# admission rows.
merged_df = admissions_df.merge(
    patients_df,
    on='subject_id',
    how='left',
    validate='many_to_one',
)

In [12]:
# Ensure datetime formatting: convert the date-of-death and discharge-time
# columns to pandas datetimes. errors='coerce' turns unparseable values into
# NaT rather than raising.
for timestamp_col in ('dod', 'dischtime'):
    merged_df[timestamp_col] = pd.to_datetime(merged_df[timestamp_col], errors='coerce')

In [13]:
# Define mortality label: death within 30 calendar days of discharge
# (a death on the discharge date itself counts as day 0).
#
# Both timestamps are truncated to midnight before subtracting. In the rows
# previewed above `dod` is a date with no time-of-day, whereas `dischtime` is
# a full timestamp. Subtracting them directly makes a death on the discharge
# date a negative timedelta, whose `.dt.days` is -1 (days are floored), so
# those deaths were labelled 0. The same flooring let a death 31 calendar
# days after discharge report 30 days. Comparing calendar dates removes both
# errors.
# NOTE(review): a death dated before the discharge date is still labelled 0;
# confirm whether `hospital_expire_flag` should also feed this label.
MORTALITY_WINDOW_DAYS = 30

days_discharge_to_death = (
    merged_df['dod'].dt.normalize() - merged_df['dischtime'].dt.normalize()
).dt.days

# Rows with a missing dod or dischtime produce NaN days, which `between`
# treats as False, so they are labelled 0.
merged_df['mortality_30d'] = (
    merged_df['dod'].notna()
    & days_discharge_to_death.between(0, MORTALITY_WINDOW_DAYS)
).astype(int)


In [14]:
# Filter to index admission only (first admission per subject).
# Sort by admission time and keep the first row seen for each subject_id.
# A stable sort (mergesort) is used so that rows sharing the same admittime
# keep their existing relative order, which makes the surviving row
# deterministic across runs. The sort is chained rather than done in place so
# the cell is a single expression over its input.
# NOTE(review): admittime is not converted by the datetime cell above, so it
# is presumably still a string here; the 'YYYY-MM-DD HH:MM:SS' layout seen in
# the preview sorts chronologically as text, but confirm the format is uniform.
merged_df = (
    merged_df
    .sort_values(by='admittime', kind='mergesort')
    .drop_duplicates(subset='subject_id', keep='first')
)

In [15]:
# Show final structure: first rows of the identifier, timing and label columns.
preview_columns = ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'dod', 'mortality_30d']
disp.display(merged_df.loc[:, preview_columns].head())


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,dod,mortality_30d
297457,16904137,21081215,2105-10-04 17:26:00,2105-10-12 11:11:00,NaT,0
179462,14178262,24686846,2106-02-06 20:18:00,2106-02-07 09:31:00,NaT,0
86235,12024697,20302177,2109-12-14 22:50:00,2110-01-15 14:53:00,2111-05-24,0
230245,15350437,20383396,2110-01-11 08:02:00,2110-01-12 18:45:00,NaT,0
349959,18106347,24305596,2110-01-11 10:14:00,2110-01-15 17:31:00,NaT,0


In [17]:
# Save for downstream use.
# The output folder can be overridden with the MIMIC_OUTPUT_DIR environment
# variable; when it is unset, the original location is used. The default is
# written with forward slashes so pathlib splits it into components on every
# platform (a backslash-separated literal is only split on Windows).
output_dir = Path(
    os.environ.get(
        "MIMIC_OUTPUT_DIR",
        "C:/Users/Rand Sobczak Jr/_rts/mlops/mimiccds/real_mimic/output",
    )
)
# Create the folder (and any missing parents) if it does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)
# index=False leaves the DataFrame index out of the CSV.
merged_df.to_csv(output_dir / "admissions_labeled.csv", index=False)