In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import data
DATA_PATH = r"D:\ICUDATASET\mimic-iv-3.1"
ADMISSIONS_PATH = f"{DATA_PATH}/hosp/admissions.csv"
PATIENTS_PATH = f"{DATA_PATH}/hosp/patients.csv"
ICU_PATH = f"{DATA_PATH}/icu/icustays.csv"
DIAGNOSES_PATH = f"{DATA_PATH}/hosp/diagnoses_icd.csv"
PRESCRIPTIONS_PATH = f"{DATA_PATH}/hosp/prescriptions.csv"
LABS_PATH = f"{DATA_PATH}/hosp/labevents.csv"
D_LABITEMS_PATH = f"{DATA_PATH}/hosp/d_labitems.csv" # To map lab item IDs to names
CHARTEVENTS_PATH = f"{DATA_PATH}/icu/chartevents.csv" # For vital signs and GCS
D_ITEMS_PATH = f"{DATA_PATH}/icu/d_items.csv" # To map chartevent item IDs to names
INPUTEVENTS_PATH = f"{DATA_PATH}/icu/inputevents.csv" # For IV intake
OUTPUTEVENTS_PATH = f"{DATA_PATH}/icu/outputevents.csv" # For output
PROCEDURES_ICD_PATH = f"{DATA_PATH}/hosp/procedures_icd.csv" # For procedures

admissions = pd.read_csv(ADMISSIONS_PATH)
patients = pd.read_csv(PATIENTS_PATH)
icustays = pd.read_csv(ICU_PATH)
diagnoses = pd.read_csv(DIAGNOSES_PATH)
prescriptions = pd.read_csv(PRESCRIPTIONS_PATH, low_memory = False,nrows=100000)
labevents = pd.read_csv(LABS_PATH, low_memory= False, nrows=100000)
d_labitems = pd.read_csv(D_LABITEMS_PATH)
chartevents = pd.read_csv(CHARTEVENTS_PATH, low_memory=False, nrows=10000000) 
d_items = pd.read_csv(D_ITEMS_PATH)
inputevents = pd.read_csv(INPUTEVENTS_PATH, nrows = 100000)
outputevents = pd.read_csv(OUTPUTEVENTS_PATH, nrows = 100000)
procedures = pd.read_csv(PROCEDURES_ICD_PATH)

In [2]:
print(d_items["itemid"].unique())

[220001 220003 220045 ... 230174 230176 230177]


In [3]:
icustays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266
1,10000690,25860671,37081114,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2150-11-02 19:37:00,2150-11-06 17:03:17,3.893252
2,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535
3,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032
4,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113


In [4]:
admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P39NWO,EMERGENCY ROOM,,,English,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


## COHORT CREATION

In [5]:
#cohort creation
#We selected patients who had at least one diagnostic code for acute ischemic stroke (ICD 10 I63.* or ICD 9 beginning with 433, 434, or 436)
ais_diagnoses = diagnoses[
    ((diagnoses["icd_version"] == 9) & diagnoses["icd_code"].str.startswith(("433", "434", "436"))) |
    ((diagnoses["icd_version"] == 10) & diagnoses["icd_code"].str.startswith("I63"))
]



#merge with icu stays 
ais_icu = pd.merge(ais_diagnoses, icustays, on=["subject_id", "hadm_id"])
# Only the first ICU admission was considered for patients who required multiple ICU admissions during a single hospitalization
ais_icu = ais_icu.sort_values(by="intime")
ais_icu = ais_icu.drop_duplicates(subset=["hadm_id"], keep="first")


#hospitalizations not discharged or died within 48 hours
ais_icu_48h = ais_icu[ais_icu["los"] >= 2]


# add mortality outcome
ais_final = pd.merge(
    ais_icu_48h,
    admissions[["subject_id", "hadm_id", "hospital_expire_flag"]],
    on=["subject_id", "hadm_id"]
)
print("Final cohort size:", len(ais_final))
print("Patients died:", sum(ais_final["hospital_expire_flag"]))
print("Patients survived:", (ais_final['hospital_expire_flag'] == 0).sum())

Final cohort size: 3387
Patients died: 628
Patients survived: 2759


In [6]:
ais_final.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,stay_id,first_careunit,last_careunit,intime,outtime,los,hospital_expire_flag
0,14816979,21650344,1,43411,9,38466660,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2110-01-30 13:15:53,2110-02-03 18:18:26,4.210104,0
1,12264134,25257503,1,I63312,10,37673110,Neuro Intermediate,Neuro Intermediate,2110-03-02 16:04:26,2110-03-07 22:18:13,5.259572,0
2,11993259,23072371,1,43491,9,32269643,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2110-03-27 16:48:00,2110-03-29 22:17:13,2.228623,0
3,17907596,21801758,3,43411,9,34260029,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2110-04-12 18:19:00,2110-05-01 08:34:09,18.593854,0
4,18801749,29949595,1,I63412,10,36111383,Neuro Intermediate,Neuro Intermediate,2110-04-20 16:50:00,2110-04-23 18:12:27,3.057257,0


In [7]:
patients.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000058,F,33,2168,2020 - 2022,
3,10000068,F,19,2160,2008 - 2010,
4,10000084,M,72,2160,2017 - 2019,2161-02-13


## Data preprocessing 

In [8]:
ais_final['intime'] = pd.to_datetime(ais_final['intime'])
ais_final['outtime'] = pd.to_datetime(ais_final['outtime'])
admissions['admittime'] = pd.to_datetime(admissions['admittime'])
admissions['dischtime'] = pd.to_datetime(admissions['dischtime'])
labevents['charttime'] = pd.to_datetime(labevents['charttime'])
prescriptions['starttime'] = pd.to_datetime(prescriptions['starttime'])
prescriptions['stoptime'] = pd.to_datetime(prescriptions['stoptime'])
chartevents['charttime'] = pd.to_datetime(chartevents['charttime'])
inputevents['starttime'] = pd.to_datetime(inputevents['starttime'])
inputevents['endtime'] = pd.to_datetime(inputevents['endtime'])
outputevents['charttime'] = pd.to_datetime(outputevents['charttime'])

base_cohort = ais_final.copy()
base_cohort.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,stay_id,first_careunit,last_careunit,intime,outtime,los,hospital_expire_flag
0,14816979,21650344,1,43411,9,38466660,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2110-01-30 13:15:53,2110-02-03 18:18:26,4.210104,0
1,12264134,25257503,1,I63312,10,37673110,Neuro Intermediate,Neuro Intermediate,2110-03-02 16:04:26,2110-03-07 22:18:13,5.259572,0
2,11993259,23072371,1,43491,9,32269643,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2110-03-27 16:48:00,2110-03-29 22:17:13,2.228623,0
3,17907596,21801758,3,43411,9,34260029,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2110-04-12 18:19:00,2110-05-01 08:34:09,18.593854,0
4,18801749,29949595,1,I63412,10,36111383,Neuro Intermediate,Neuro Intermediate,2110-04-20 16:50:00,2110-04-23 18:12:27,3.057257,0


In [9]:
# DEMOGRAPHIC FEATURES
# Demographic data included age as a continuous variable, and categorical data on sex, language, race, and marital status were represented numerically using one-hot encoding.
demographics = pd.merge(base_cohort,  patients[['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group']], on = ["subject_id"], how="left")
#merge with admissions to get race, martial status and language
demographics = pd.merge(demographics, admissions[['subject_id', 'hadm_id', 'race',  'marital_status','language']], on=['subject_id', 'hadm_id'])

demographics['age'] = demographics['anchor_age']

#one hot encode cat features
demographics_ohe = pd.get_dummies(demographics[['gender', 'marital_status', 'race', 'language']], prefix=['gender', 'marital_status', 'race', 'language']).astype(int)
demographics_features = pd.concat([demographics[['subject_id', 'hadm_id', 'stay_id', 'age']], demographics_ohe], axis=1)
demographics_features.head()

Unnamed: 0,subject_id,hadm_id,stay_id,age,gender_F,gender_M,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SINGLE,marital_status_WIDOWED,...,language_Korean,language_Modern Greek (1453-),language_Other,language_Persian,language_Polish,language_Portuguese,language_Russian,language_Somali,language_Spanish,language_Vietnamese
0,14816979,21650344,38466660,30,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,12264134,25257503,37673110,73,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,11993259,23072371,32269643,76,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,17907596,21801758,34260029,67,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,18801749,29949595,36111383,91,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# HOSPITILIZATION INFORMATION
# Hospitalization information, including admission type (i.e., urgent and surgical same-day admission) and location (i.e., emergency room and transfer from the hospital), was also one-hot encoded.
hospitalization = pd.merge(base_cohort, admissions[['subject_id', 'hadm_id', 'admission_type', 'admission_location']])
hospitalization_ohe = pd.get_dummies(hospitalization[['admission_type', 'admission_location', 'first_careunit', 'last_careunit']], prefix=['adm_type', 'adm_loc', 'first_careunit', 'last_careunit']).astype(int)
hospitalization_features = pd.concat([hospitalization[['subject_id', 'hadm_id', 'stay_id']], hospitalization_ohe], axis=1)

hospitalization_features.head()

Unnamed: 0,subject_id,hadm_id,stay_id,adm_type_DIRECT EMER.,adm_type_DIRECT OBSERVATION,adm_type_ELECTIVE,adm_type_EU OBSERVATION,adm_type_EW EMER.,adm_type_OBSERVATION ADMIT,adm_type_SURGICAL SAME DAY ADMISSION,...,last_careunit_Intensive Care Unit (ICU),last_careunit_Medical Intensive Care Unit (MICU),last_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),last_careunit_Neuro Intermediate,last_careunit_Neuro Stepdown,last_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),last_careunit_PACU,last_careunit_Surgery/Vascular/Intermediate,last_careunit_Surgical Intensive Care Unit (SICU),last_careunit_Trauma SICU (TSICU)
0,14816979,21650344,38466660,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,12264134,25257503,37673110,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,11993259,23072371,32269643,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,17907596,21801758,34260029,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,18801749,29949595,36111383,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [11]:
prescriptions.head()

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route
0,10000032,22595853,12775705,10000032-55,55.0,P85UQ1,2180-05-08 08:00:00,2180-05-07 22:00:00,MAIN,Furosemide,...,8209.0,51079010000.0,40mg Tablet,,40,mg,1.0,TAB,1.0,PO/NG
1,10000032,22595853,18415984,10000032-42,42.0,P23SJA,2180-05-07 02:00:00,2180-05-07 22:00:00,MAIN,Ipratropium Bromide Neb,...,21700.0,487980100.0,2.5mL Vial,,1,NEB,1.0,VIAL,4.0,IH
2,10000032,22595853,23637373,10000032-35,35.0,P23SJA,2180-05-07 01:00:00,2180-05-07 09:00:00,MAIN,Furosemide,...,8208.0,51079010000.0,20mg Tablet,,20,mg,1.0,TAB,1.0,PO/NG
3,10000032,22595853,26862314,10000032-41,41.0,P23SJA,2180-05-07 01:00:00,2180-05-07 01:00:00,MAIN,Potassium Chloride,...,1275.0,245004100.0,10mEq ER Tablet,,40,mEq,4.0,TAB,1.0,PO
4,10000032,22595853,30740602,10000032-27,27.0,P23SJA,2180-05-07 00:00:00,2180-05-07 22:00:00,MAIN,Sodium Chloride 0.9% Flush,...,,0.0,10 mL Syringe,,3,mL,0.3,SYR,3.0,IV


In [12]:
#MEDICATION FEATURES
# For all patients, medication use was included if the patient had at least one prescription or treatment within the first 48 h of admission or before the admission.
prescriptions_filtered = pd.merge(base_cohort[["subject_id", "hadm_id", "intime"]], prescriptions, on = ["subject_id", "hadm_id"])
prescriptions_filtered = prescriptions_filtered[
    (prescriptions_filtered['starttime'] <= prescriptions_filtered['intime'] + pd.Timedelta(hours=48))
]

medication_features_ohe = prescriptions_filtered.groupby(['subject_id', 'hadm_id', 'drug']).size().unstack(fill_value = 0)
medication_features_ohe = (medication_features_ohe > 0).astype(int)
medication_features_ohe.columns = [f"med_{col}" for col in medication_features_ohe.columns]

medication_features = pd.merge(base_cohort[['subject_id', 'hadm_id', 'stay_id']], medication_features_ohe, on=['subject_id', 'hadm_id'], how='left').fillna(0)

In [13]:
medication_features.head()

Unnamed: 0,subject_id,hadm_id,stay_id,med_0.45% Sodium Chloride,med_0.83% Sodium Chloride,med_0.9 % Sodium Chloride,med_0.9% Sodium Chloride,med_5% Dextrose,med_ALPRAZolam,med_Acetaminophen,...,med_Syringe,med_Syringe (Iso-Osmotic Dextrose),med_Thiamine,med_Tiotropium Bromide,med_TraZODone,med_Vancomycin,med_Vasopressin,med_Vial,med_Vitamin D,med_amLODIPine
0,14816979,21650344,38466660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12264134,25257503,37673110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11993259,23072371,32269643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,17907596,21801758,34260029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18801749,29949595,36111383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# LAB RESULTS
labevents_filtered = pd.merge(base_cohort, labevents, on = ['subject_id', 'hadm_id'])
labevents_filtered = labevents_filtered[(labevents_filtered["intime"] <= labevents_filtered["charttime"]) & (labevents_filtered["charttime"] <= labevents_filtered["intime"] + pd.Timedelta(hours = 48))]

#get labitem names
labevents_filtered = pd.merge(labevents_filtered, d_labitems[["itemid", "label"]], on=["itemid"])
labevents_filtered['label'] = labevents_filtered['label'].str.replace(' ', '_').str.lower()

#Summary statistics (i.e., minimum, mean, maximum, standard deviation, initial value, and number of measurements) were derived from the first laboratory results and vital signs recorded upon admission and in 8-h windows, creating 6 (8-h windows within 48 h) total windows
def calculate_summary_stats(group):
    if group.empty:
        return pd.Series({
            'min': np.nan, 'mean': np.nan, 'max': np.nan,
            'std': np.nan, 'initial': np.nan, 'count': 0
        })
    else:
        group_sorted = group.sort_values(by='charttime')
        return pd.Series({
        'min': group_sorted['valuenum'].min(),
        'mean': group_sorted['valuenum'].mean(),
        'max': group_sorted['valuenum'].max(),
        'std': group_sorted['valuenum'].std(),
        'initial': group_sorted['valuenum'].iloc[0], 
        'count': group_sorted['valuenum'].count()
    })

lab_features_list = []
for index, row in base_cohort.iterrows():
    patient_labevents = labevents_filtered[
        (labevents_filtered['subject_id'] == row['subject_id']) &
        (labevents_filtered['hadm_id'] == row['hadm_id']) &
        (labevents_filtered['stay_id'] == row['stay_id'])
    ]

    patient_lab_features = {'subject_id': row['subject_id'], 'hadm_id': row['hadm_id'], 'stay_id': row['stay_id']}

    # Initial measurement (first lab result upon admission within the 48h window)
    # This should be the earliest measurement for each lab item for the patient
    initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
    for _, lab_row in initial_labs.iterrows():
        patient_lab_features[f'lab_{lab_row["label"]}_initial'] = lab_row['valuenum']

    # 8-hour windows (6 windows total for 48 hours)
    for i in range(6): # 0-7h, 8-15h, ..., 40-47h
        start_time = row['intime'] + pd.Timedelta(hours=i * 8)
        end_time = row['intime'] + pd.Timedelta(hours=(i + 1) * 8)

        window_labs = patient_labevents[
            (patient_labevents['charttime'] >= start_time) &
            (patient_labevents['charttime'] < end_time)
        ]

        if not window_labs.empty:
            for lab_name, group in window_labs.groupby('label'):
                stats = calculate_summary_stats(group)
                for stat_name, value in stats.items():
                    patient_lab_features[f'lab_{lab_name}_{stat_name}_interval{i+1}'] = value
    lab_features_list.append(patient_lab_features)

lab_features = pd.DataFrame(lab_features_list)

# Fill NaN values with 0 
lab_features = lab_features.fillna(0)

lab_features.head()

  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_labs = patient_labevents.gr

Unnamed: 0,subject_id,hadm_id,stay_id,lab_%_hemoglobin_a1c_initial,lab_alanine_aminotransferase_(alt)_initial,lab_albumin_initial,lab_alkaline_phosphatase_initial,lab_ammonia_initial,lab_amorphous_crystals_initial,lab_anion_gap_initial,...,lab_white_blood_cells_max_interval6,lab_white_blood_cells_std_interval6,lab_white_blood_cells_initial_interval6,lab_white_blood_cells_count_interval6,lab_yeast_min_interval6,lab_yeast_mean_interval6,lab_yeast_max_interval6,lab_yeast_std_interval6,lab_yeast_initial_interval6,lab_yeast_count_interval6
0,14816979,21650344,38466660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12264134,25257503,37673110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11993259,23072371,32269643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,17907596,21801758,34260029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18801749,29949595,36111383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#Vital Signs and Clinical Assessment Features (e.g., GCS, RASS)
# Similar to lab features, summary statistics for vital signs and clinical assessments
# will be extracted from the first 48 hours, in 8-hour windows.

# Relevant Item IDs for vital signs and clinical assessments (verified with d_items)
relevant_itemids = {
    220739: 'gcs_motor',        # GCS - Motor Response
    223900: 'gcs_verbal',       # GCS - Verbal Response
    223901: 'gcs_eyes',         # GCS - Eyes Open
    227345: 'richmond_rass',    # Richmond-RAS Scale
    # Updated Braden Scale itemids based on user's d_items output
    224054: 'braden_sensory_perception',
    224055: 'braden_moisture',
    224056: 'braden_activity',
    224057: 'braden_mobility',
    224058: 'braden_nutrition',
    224059: 'braden_friction_shear',
    220045: 'heart_rate',       # Heart Rate
    220179: 'sbp',              # Non Invasive Blood Pressure systolic
    220180: 'dbp',              # Non Invasive Blood Pressure diastolic
    220210: 'respiratory_rate', # Respiratory Rate
    223761: 'temperature_f',    # Temperature Fahrenheit
    220277: 'spo2'              # O2 saturation pulseoxymetry
}

# Filter chartevents for relevant itemids first to reduce size
chartevents_relevant_items = chartevents[chartevents['itemid'].isin(relevant_itemids.keys())].copy()
print(f"chartevents_relevant_items shape after itemid filtering: {chartevents_relevant_items.shape}") # Debug print

# Check dtypes before merge
print(f"Dtypes of merge keys in chartevents_relevant_items: {chartevents_relevant_items[['subject_id', 'hadm_id', 'stay_id']].dtypes}")
print(f"Dtypes of merge keys in base_cohort: {base_cohort[['subject_id', 'hadm_id', 'stay_id']].dtypes}")

# Check for overlap in unique IDs
common_subjects = set(chartevents_relevant_items['subject_id']).intersection(set(base_cohort['subject_id']))
print(f"Number of common subject_ids before merge: {len(common_subjects)}")

common_hadms = set(chartevents_relevant_items['hadm_id']).intersection(set(base_cohort['hadm_id']))
print(f"Number of common hadm_ids before merge: {len(common_hadms)}")

common_stays = set(chartevents_relevant_items['stay_id']).intersection(set(base_cohort['stay_id']))
print(f"Number of common stay_ids before merge: {len(common_stays)}")

# Check for missing IDs
print(f"Missing subject_id in chartevents_relevant_items: {chartevents_relevant_items['subject_id'].isnull().sum()}")
print(f"Missing hadm_id in chartevents_relevant_items: {chartevents_relevant_items['hadm_id'].isnull().sum()}")
print(f"Missing stay_id in chartevents_relevant_items: {chartevents_relevant_items['stay_id'].isnull().sum()}")

print(f"Missing subject_id in base_cohort: {base_cohort['subject_id'].isnull().sum()}")
print(f"Missing hadm_id in base_cohort: {base_cohort['hadm_id'].isnull().sum()}")
print(f"Missing stay_id in base_cohort: {base_cohort['stay_id'].isnull().sum()}")


# Now merge with base_cohort on all common IDs: subject_id, hadm_id, stay_id
chartevents_filtered = pd.merge(
    chartevents_relevant_items,
    base_cohort[['subject_id', 'hadm_id', 'stay_id', 'intime']],
    on=['subject_id', 'hadm_id', 'stay_id'], # Merge on all three identifiers
    how='inner'
)
print(f"chartevents_filtered shape after merge on all IDs: {chartevents_filtered.shape}") # Debug print

# Apply time filtering
chartevents_filtered = chartevents_filtered[
    (chartevents_filtered['charttime'] >= chartevents_filtered['intime']) &
    (chartevents_filtered['charttime'] <= chartevents_filtered['intime'] + pd.Timedelta(hours=48))
]
print(f"chartevents_filtered shape after time filtering: {chartevents_filtered.shape}") # Debug print

chartevents_filtered['label'] = chartevents_filtered['itemid'].map(relevant_itemids)
# Drop rows where 'label' is NaN (i.e., itemid was not in relevant_itemids, though pre-filtering should handle most)
chartevents_filtered.dropna(subset=['label'], inplace=True)
print(f"chartevents_filtered shape after mapping labels and dropping NaNs: {chartevents_filtered.shape}") # Debug print


# Function to calculate summary statistics (same as for labs, adapted for vitals)
def calculate_summary_stats_vitals(group):
    if group.empty:
        return pd.Series({
            'min': np.nan, 'mean': np.nan, 'max': np.nan,
            'std': np.nan, 'initial': np.nan, 'count': 0
        })
    group_sorted = group.sort_values(by='charttime')
    return pd.Series({
        'min': group_sorted['valuenum'].min(),
        'mean': group_sorted['valuenum'].mean(),
        'max': group_sorted['valuenum'].max(),
        'std': group_sorted['valuenum'].std(),
        'initial': group_sorted['valuenum'].iloc[0] if not group_sorted['valuenum'].empty else np.nan,
        'count': group_sorted['valuenum'].count()
    })

vitals_clinical_features_list = []

for index, row in base_cohort.iterrows():
    patient_chartevents = chartevents_filtered[
        (chartevents_filtered['subject_id'] == row['subject_id']) &
        (chartevents_filtered['hadm_id'] == row['hadm_id']) &
        (chartevents_filtered['stay_id'] == row['stay_id'])
    ]
    print(f"Patient {row['subject_id']} - events found: {patient_chartevents.shape[0]}")

    patient_vitals_features = {'subject_id': row['subject_id'], 'hadm_id': row['hadm_id'], 'stay_id': row['stay_id']}

    # Initial measurement for each vital/clinical assessment
    # Check if patient_chartevents is empty before grouping
    if not patient_chartevents.empty:
        initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
        for _, vital_row in initial_vitals.iterrows():
            patient_vitals_features[f'vital_{vital_row["label"]}_initial'] = vital_row['valuenum']

    # 8-hour windows
    for i in range(6): # 0-7h, 8-15h, ..., 40-47h
        start_time = row['intime'] + pd.Timedelta(hours=i * 8)
        end_time = row['intime'] + pd.Timedelta(hours=(i + 1) * 8)

        window_vitals = patient_chartevents[
            (patient_chartevents['charttime'] >= start_time) &
            (patient_chartevents['charttime'] < end_time)
        ]

        if not window_vitals.empty:
            for vital_name, group in window_vitals.groupby('label'):
                stats = calculate_summary_stats_vitals(group)
                for stat_name, value in stats.items():
                    patient_vitals_features[f'vital_{vital_name}_{stat_name}_interval{i+1}'] = value
    vitals_clinical_features_list.append(patient_vitals_features)

vitals_clinical_features = pd.DataFrame(vitals_clinical_features_list)
vitals_clinical_features = vitals_clinical_features.fillna(0) # Or appropriate imputation

print("\nVital Signs and Clinical Assessment Features Head (first few vital columns):")
print(vitals_clinical_features.iloc[:, :10].head())
print("Vital Signs and Clinical Assessment Features Shape:", vitals_clinical_features.shape)

chartevents_relevant_items shape after itemid filtering: (1216966, 11)
Dtypes of merge keys in chartevents_relevant_items: subject_id    int64
hadm_id       int64
stay_id       int64
dtype: object
Dtypes of merge keys in base_cohort: subject_id    int64
hadm_id       int64
stay_id       int64
dtype: object
Number of common subject_ids before merge: 66
Number of common hadm_ids before merge: 67
Number of common stay_ids before merge: 67
Missing subject_id in chartevents_relevant_items: 0
Missing hadm_id in chartevents_relevant_items: 0
Missing stay_id in chartevents_relevant_items: 0
Missing subject_id in base_cohort: 0
Missing hadm_id in base_cohort: 0
Missing stay_id in base_cohort: 0
chartevents_filtered shape after merge on all IDs: (71542, 12)
chartevents_filtered shape after time filtering: (22718, 12)
chartevents_filtered shape after mapping labels and dropping NaNs: (22718, 13)
Patient 14816979 - events found: 0
Patient 12264134 - events found: 0
Patient 11993259 - events found:

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 16776268 - events found: 0
Patient 13277521 - events found: 0
Patient 13603956 - events found: 0
Patient 10669284 - events found: 0
Patient 13231077 - events found: 0
Patient 12171742 - events found: 0
Patient 12842941 - events found: 0
Patient 15524969 - events found: 0
Patient 14201054 - events found: 0
Patient 19799018 - events found: 0
Patient 10527738 - events found: 0
Patient 10198197 - events found: 244
Patient 19839681 - events found: 0
Patient 12032603 - events found: 0
Patient 11144093 - events found: 0
Patient 12432545 - events found: 0
Patient 10525033 - events found: 0
Patient 12478513 - events found: 0
Patient 19981702 - events found: 0
Patient 16641884 - events found: 0
Patient 19602094 - events found: 0
Patient 15824630 - events found: 0
Patient 19137460 - events found: 0
Patient 11750330 - events found: 0
Patient 12662064 - events found: 0
Patient 18616437 - events found: 0
Patient 10219697 - events found: 389
Patient 17591424 - events found: 0
Patient 17323774

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 14013197 - events found: 0
Patient 19122448 - events found: 0
Patient 17100571 - events found: 0
Patient 13782883 - events found: 0
Patient 16816038 - events found: 0
Patient 16303744 - events found: 0
Patient 16174267 - events found: 0
Patient 10712203 - events found: 0
Patient 15851487 - events found: 0
Patient 16523929 - events found: 0
Patient 17734473 - events found: 0
Patient 13134081 - events found: 0
Patient 19956383 - events found: 0
Patient 16726295 - events found: 0
Patient 15591338 - events found: 0
Patient 10495509 - events found: 0
Patient 13134081 - events found: 0
Patient 15672796 - events found: 0
Patient 15592846 - events found: 0
Patient 17898246 - events found: 0
Patient 12113838 - events found: 0
Patient 19545447 - events found: 0
Patient 17727745 - events found: 0
Patient 13489427 - events found: 0
Patient 10915432 - events found: 0
Patient 10330990 - events found: 0
Patient 14937825 - events found: 0
Patient 18675328 - events found: 0
Patient 18911483 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 16170639 - events found: 0
Patient 14688093 - events found: 0
Patient 17727047 - events found: 0
Patient 11205160 - events found: 0
Patient 18427416 - events found: 0
Patient 19898324 - events found: 0
Patient 19557789 - events found: 0
Patient 19837286 - events found: 0
Patient 14562173 - events found: 0
Patient 14115052 - events found: 0
Patient 18730144 - events found: 0
Patient 19865290 - events found: 0
Patient 14447426 - events found: 0
Patient 18204749 - events found: 0
Patient 14044609 - events found: 0
Patient 10363123 - events found: 0
Patient 18095375 - events found: 0
Patient 10865646 - events found: 0
Patient 19494859 - events found: 0
Patient 18464758 - events found: 0
Patient 14818953 - events found: 0
Patient 14844543 - events found: 0
Patient 12929759 - events found: 0
Patient 11901601 - events found: 0
Patient 18541229 - events found: 0
Patient 14935287 - events found: 0
Patient 18754526 - events found: 0
Patient 12330753 - events found: 0
Patient 18837678 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 13564316 - events found: 0
Patient 16419524 - events found: 0
Patient 18537209 - events found: 0
Patient 14538785 - events found: 0
Patient 17589545 - events found: 0
Patient 13615123 - events found: 0
Patient 17651267 - events found: 0
Patient 15391573 - events found: 0
Patient 10145374 - events found: 308
Patient 16055197 - events found: 0
Patient 17501806 - events found: 0
Patient 13445415 - events found: 0
Patient 11152914 - events found: 0
Patient 17277379 - events found: 0
Patient 12811828 - events found: 0
Patient 15446133 - events found: 0
Patient 16682527 - events found: 0
Patient 16277409 - events found: 0
Patient 15834756 - events found: 0
Patient 16139798 - events found: 0
Patient 13084414 - events found: 0
Patient 14018478 - events found: 0
Patient 10909478 - events found: 0
Patient 12534193 - events found: 0
Patient 18007614 - events found: 0
Patient 11718182 - events found: 0
Patient 14435153 - events found: 0
Patient 16088068 - events found: 0
Patient 10186976 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 11379140 - events found: 0
Patient 18585153 - events found: 0
Patient 13639158 - events found: 0
Patient 18521722 - events found: 0
Patient 14432496 - events found: 0
Patient 10569728 - events found: 0
Patient 17595158 - events found: 0
Patient 17168019 - events found: 0
Patient 16770849 - events found: 0
Patient 18075452 - events found: 0
Patient 18735396 - events found: 0
Patient 18312795 - events found: 0
Patient 14703317 - events found: 0
Patient 18986354 - events found: 0
Patient 11288013 - events found: 0
Patient 10059406 - events found: 262
Patient 15231987 - events found: 0
Patient 16276883 - events found: 0
Patient 15281401 - events found: 0
Patient 16531118 - events found: 0
Patient 13383454 - events found: 0
Patient 14219090 - events found: 0
Patient 12842017 - events found: 0
Patient 10757372 - events found: 0
Patient 18948896 - events found: 0
Patient 19727371 - events found: 0
Patient 10454463 - events found: 0
Patient 18265876 - events found: 0
Patient 19177306 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 12514413 - events found: 0
Patient 18430220 - events found: 0
Patient 15375392 - events found: 0
Patient 18108905 - events found: 0
Patient 14486669 - events found: 0
Patient 11523647 - events found: 0
Patient 11199155 - events found: 0
Patient 11199155 - events found: 0
Patient 12809207 - events found: 0
Patient 16804908 - events found: 0
Patient 17635417 - events found: 0
Patient 19935488 - events found: 0
Patient 16082951 - events found: 0
Patient 19278591 - events found: 0
Patient 14548428 - events found: 0
Patient 13034551 - events found: 0
Patient 16326056 - events found: 0
Patient 17741087 - events found: 0
Patient 17449089 - events found: 0
Patient 10528589 - events found: 0
Patient 12670899 - events found: 0
Patient 12638255 - events found: 0
Patient 13485675 - events found: 0
Patient 12533588 - events found: 0
Patient 18959097 - events found: 0
Patient 13370388 - events found: 0
Patient 19997293 - events found: 0
Patient 15984332 - events found: 0
Patient 15392686 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 11984647 - events found: 0
Patient 11114858 - events found: 0
Patient 13140465 - events found: 0
Patient 11439189 - events found: 0
Patient 15621459 - events found: 0
Patient 18459383 - events found: 0
Patient 15567057 - events found: 0
Patient 15171708 - events found: 0
Patient 19808173 - events found: 0
Patient 17958680 - events found: 0
Patient 14807966 - events found: 0
Patient 19609079 - events found: 0
Patient 19365113 - events found: 0
Patient 17745354 - events found: 0
Patient 13582085 - events found: 0
Patient 13514750 - events found: 0
Patient 15189782 - events found: 0
Patient 13998126 - events found: 0
Patient 17948734 - events found: 0
Patient 14693832 - events found: 0
Patient 16507681 - events found: 0
Patient 15347301 - events found: 0
Patient 19957675 - events found: 0
Patient 18881392 - events found: 0
Patient 13431454 - events found: 0
Patient 10909143 - events found: 0
Patient 14611912 - events found: 0
Patient 19898116 - events found: 0
Patient 19667160 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 13406846 - events found: 0
Patient 19265807 - events found: 0
Patient 15735569 - events found: 0
Patient 14732303 - events found: 0
Patient 17406211 - events found: 0
Patient 17190362 - events found: 0
Patient 16233501 - events found: 0
Patient 10479570 - events found: 0
Patient 19017919 - events found: 0
Patient 19500019 - events found: 0
Patient 12421716 - events found: 0
Patient 14221290 - events found: 0
Patient 14927693 - events found: 0
Patient 19674707 - events found: 0
Patient 11465422 - events found: 0
Patient 10369528 - events found: 0
Patient 15927594 - events found: 0
Patient 13402281 - events found: 0
Patient 15898810 - events found: 0
Patient 12241660 - events found: 0
Patient 19651727 - events found: 0
Patient 15301471 - events found: 0
Patient 19124341 - events found: 0
Patient 14560475 - events found: 0
Patient 18174227 - events found: 0
Patient 12811067 - events found: 0
Patient 12834991 - events found: 0
Patient 14559115 - events found: 0
Patient 17255902 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 19224634 - events found: 0
Patient 13734813 - events found: 0
Patient 14115733 - events found: 0
Patient 15755023 - events found: 0
Patient 16778843 - events found: 0
Patient 15142740 - events found: 0
Patient 13109270 - events found: 0
Patient 18054522 - events found: 0
Patient 11025341 - events found: 0
Patient 18970829 - events found: 0
Patient 14456271 - events found: 0
Patient 11471353 - events found: 0
Patient 12684822 - events found: 0
Patient 19674615 - events found: 0
Patient 11580096 - events found: 0
Patient 14479229 - events found: 0
Patient 15324792 - events found: 0
Patient 13244557 - events found: 0
Patient 19314496 - events found: 0
Patient 11599754 - events found: 0
Patient 15346162 - events found: 0
Patient 15032560 - events found: 0
Patient 18402020 - events found: 0
Patient 19584694 - events found: 0
Patient 11914188 - events found: 0
Patient 10994222 - events found: 0
Patient 13883230 - events found: 0
Patient 19402743 - events found: 0
Patient 10732078 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 11564670 - events found: 0
Patient 18346399 - events found: 0
Patient 10951555 - events found: 0
Patient 13662941 - events found: 0
Patient 14382290 - events found: 0
Patient 11307376 - events found: 0
Patient 15568298 - events found: 0
Patient 15456129 - events found: 0
Patient 19857331 - events found: 0
Patient 11307376 - events found: 0
Patient 12010644 - events found: 0
Patient 14462044 - events found: 0
Patient 11323513 - events found: 0
Patient 10613134 - events found: 0
Patient 18532454 - events found: 0
Patient 19146637 - events found: 0
Patient 15282725 - events found: 0
Patient 13822447 - events found: 0
Patient 14153132 - events found: 0
Patient 14284435 - events found: 0
Patient 11044485 - events found: 0
Patient 16235306 - events found: 0
Patient 11072440 - events found: 0
Patient 12542274 - events found: 0
Patient 18191490 - events found: 0
Patient 11943854 - events found: 0
Patient 18829665 - events found: 0
Patient 17625365 - events found: 0
Patient 12562823 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 11206254 - events found: 0
Patient 18908844 - events found: 0
Patient 13711875 - events found: 0
Patient 12155780 - events found: 0
Patient 11207789 - events found: 0
Patient 10258176 - events found: 0
Patient 16443507 - events found: 0
Patient 15078862 - events found: 0
Patient 15188030 - events found: 0
Patient 15347754 - events found: 0
Patient 16537903 - events found: 0
Patient 17931543 - events found: 0
Patient 10845539 - events found: 0
Patient 16129824 - events found: 0
Patient 15260995 - events found: 0
Patient 13673557 - events found: 0
Patient 12327683 - events found: 0
Patient 10284297 - events found: 0
Patient 16293224 - events found: 0
Patient 17658387 - events found: 0
Patient 10591447 - events found: 0
Patient 19585137 - events found: 0
Patient 13031876 - events found: 0
Patient 11668547 - events found: 0
Patient 16016038 - events found: 0
Patient 19164907 - events found: 0
Patient 14394051 - events found: 0
Patient 17271314 - events found: 0
Patient 15809638 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 12347959 - events found: 0
Patient 11767827 - events found: 0
Patient 11102841 - events found: 0
Patient 12106079 - events found: 0
Patient 15432525 - events found: 0
Patient 11582827 - events found: 0
Patient 19001252 - events found: 0
Patient 16990937 - events found: 0
Patient 16100569 - events found: 0
Patient 12690798 - events found: 0
Patient 16157611 - events found: 0
Patient 15075807 - events found: 0
Patient 11882869 - events found: 0
Patient 17469613 - events found: 0
Patient 10350119 - events found: 0
Patient 16738523 - events found: 0
Patient 10960177 - events found: 0
Patient 13790647 - events found: 0
Patient 11580826 - events found: 0
Patient 14673876 - events found: 0
Patient 11658100 - events found: 0
Patient 16496655 - events found: 0
Patient 16056209 - events found: 0
Patient 16432173 - events found: 0
Patient 12025783 - events found: 0
Patient 16607052 - events found: 0
Patient 15990162 - events found: 0
Patient 19158552 - events found: 0
Patient 12006413 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 12701112 - events found: 0
Patient 12532271 - events found: 0
Patient 10144378 - events found: 365
Patient 14044574 - events found: 0
Patient 13883022 - events found: 0
Patient 14316386 - events found: 0
Patient 13232894 - events found: 0
Patient 15477638 - events found: 0
Patient 18934262 - events found: 0
Patient 18727820 - events found: 0
Patient 12571738 - events found: 0
Patient 13158770 - events found: 0
Patient 13671107 - events found: 0
Patient 19787228 - events found: 0
Patient 18676404 - events found: 0
Patient 14419707 - events found: 0
Patient 12896279 - events found: 0
Patient 17342098 - events found: 0
Patient 19169261 - events found: 0
Patient 12761308 - events found: 0
Patient 16489178 - events found: 0
Patient 12471831 - events found: 0
Patient 15910730 - events found: 0
Patient 14752715 - events found: 0
Patient 11999982 - events found: 0
Patient 11963546 - events found: 0
Patient 10950332 - events found: 0
Patient 15857793 - events found: 0
Patient 11611556 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 18160220 - events found: 0
Patient 11971516 - events found: 0
Patient 10623490 - events found: 0
Patient 10528056 - events found: 0
Patient 12327925 - events found: 0
Patient 14330819 - events found: 0
Patient 11720931 - events found: 0
Patient 16260041 - events found: 0
Patient 11388315 - events found: 0
Patient 11508828 - events found: 0
Patient 19189423 - events found: 0
Patient 12843938 - events found: 0
Patient 16053405 - events found: 0
Patient 11971516 - events found: 0
Patient 15758468 - events found: 0
Patient 12395220 - events found: 0
Patient 18676703 - events found: 0
Patient 12614979 - events found: 0
Patient 16472043 - events found: 0
Patient 17839628 - events found: 0
Patient 14198265 - events found: 0
Patient 15104113 - events found: 0
Patient 12563706 - events found: 0
Patient 14261068 - events found: 0
Patient 19208673 - events found: 0
Patient 16717030 - events found: 0
Patient 19247572 - events found: 0
Patient 10097938 - events found: 401
Patient 11196453 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 11727202 - events found: 0
Patient 14040376 - events found: 0
Patient 10256586 - events found: 0
Patient 13996450 - events found: 0
Patient 12641758 - events found: 0
Patient 12848359 - events found: 0
Patient 17436451 - events found: 0
Patient 16476768 - events found: 0
Patient 16472682 - events found: 0
Patient 16139035 - events found: 0
Patient 14572141 - events found: 0
Patient 13270933 - events found: 0
Patient 19359066 - events found: 0
Patient 17932630 - events found: 0
Patient 11426065 - events found: 0
Patient 11061972 - events found: 0
Patient 11977213 - events found: 0
Patient 17142353 - events found: 0
Patient 17573377 - events found: 0
Patient 19732174 - events found: 0
Patient 16292532 - events found: 0
Patient 11621526 - events found: 0
Patient 13352372 - events found: 0
Patient 15952421 - events found: 0
Patient 14071703 - events found: 0
Patient 19960105 - events found: 0
Patient 12136570 - events found: 0
Patient 18496140 - events found: 0
Patient 10917306 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 19627403 - events found: 0
Patient 17502587 - events found: 0
Patient 19850946 - events found: 0
Patient 11814192 - events found: 0
Patient 19104785 - events found: 0
Patient 16893981 - events found: 0
Patient 12883382 - events found: 0
Patient 12997267 - events found: 0
Patient 18267541 - events found: 0
Patient 16606885 - events found: 0
Patient 13943586 - events found: 0
Patient 11323895 - events found: 0
Patient 12758544 - events found: 0
Patient 11649157 - events found: 0
Patient 12762605 - events found: 0
Patient 11164018 - events found: 0
Patient 12126537 - events found: 0
Patient 19928591 - events found: 0
Patient 15446875 - events found: 0
Patient 16802997 - events found: 0
Patient 16752897 - events found: 0
Patient 17316181 - events found: 0
Patient 12356016 - events found: 0
Patient 19004499 - events found: 0
Patient 11396012 - events found: 0
Patient 15449623 - events found: 0
Patient 11181705 - events found: 0
Patient 12452180 - events found: 0
Patient 10023708 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 15294652 - events found: 0
Patient 12159227 - events found: 0
Patient 15186511 - events found: 0
Patient 18767294 - events found: 0
Patient 19352186 - events found: 0
Patient 11275654 - events found: 0
Patient 12871416 - events found: 0
Patient 13709820 - events found: 0
Patient 15521529 - events found: 0
Patient 15016626 - events found: 0
Patient 10013310 - events found: 393
Patient 17563813 - events found: 0
Patient 10378032 - events found: 0
Patient 18690372 - events found: 0
Patient 15475850 - events found: 0
Patient 15574754 - events found: 0
Patient 17375120 - events found: 0
Patient 12229037 - events found: 0
Patient 12924058 - events found: 0
Patient 12022798 - events found: 0
Patient 10209431 - events found: 308
Patient 15574754 - events found: 0
Patient 17386513 - events found: 0
Patient 15382744 - events found: 0
Patient 16207072 - events found: 0
Patient 14456616 - events found: 0
Patient 11045637 - events found: 0
Patient 15995107 - events found: 0
Patient 10964292

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 12631125 - events found: 0
Patient 15380869 - events found: 0
Patient 11500793 - events found: 0
Patient 11871004 - events found: 0
Patient 11387260 - events found: 0
Patient 11534955 - events found: 0
Patient 19367488 - events found: 0
Patient 13784236 - events found: 0
Patient 16978803 - events found: 0
Patient 11129835 - events found: 0
Patient 12796402 - events found: 0
Patient 12700221 - events found: 0
Patient 12670557 - events found: 0
Patient 11417617 - events found: 0
Patient 12748404 - events found: 0
Patient 14881084 - events found: 0
Patient 19326654 - events found: 0
Patient 16252158 - events found: 0
Patient 15133454 - events found: 0
Patient 19854727 - events found: 0
Patient 15702544 - events found: 0
Patient 16193308 - events found: 0
Patient 12868157 - events found: 0
Patient 17593617 - events found: 0
Patient 13507232 - events found: 0
Patient 12246903 - events found: 0
Patient 16773578 - events found: 0
Patient 18344335 - events found: 0
Patient 17360195 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 19894874 - events found: 0
Patient 18019825 - events found: 0
Patient 16484898 - events found: 0
Patient 14783167 - events found: 0
Patient 17967970 - events found: 0
Patient 17316016 - events found: 0
Patient 14514730 - events found: 0
Patient 10092227 - events found: 383
Patient 18324403 - events found: 0
Patient 14228775 - events found: 0
Patient 11527001 - events found: 0
Patient 14019847 - events found: 0
Patient 11079993 - events found: 0
Patient 12424554 - events found: 0
Patient 12571007 - events found: 0
Patient 12231709 - events found: 0
Patient 11550765 - events found: 0
Patient 11589090 - events found: 0
Patient 14303994 - events found: 0
Patient 13447017 - events found: 0
Patient 19422986 - events found: 0
Patient 14737220 - events found: 0
Patient 16185236 - events found: 0
Patient 16636658 - events found: 0
Patient 11562844 - events found: 0
Patient 18143542 - events found: 0
Patient 16340856 - events found: 0
Patient 12952402 - events found: 0
Patient 10805306 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 19109639 - events found: 0
Patient 16580436 - events found: 0
Patient 19674536 - events found: 0
Patient 16504032 - events found: 0
Patient 18208545 - events found: 0
Patient 18867536 - events found: 0
Patient 15048648 - events found: 0
Patient 13081604 - events found: 0
Patient 14517899 - events found: 0
Patient 16029235 - events found: 0
Patient 15981908 - events found: 0
Patient 13199582 - events found: 0
Patient 16597308 - events found: 0
Patient 12308099 - events found: 0
Patient 15015119 - events found: 0
Patient 10356799 - events found: 0
Patient 13543681 - events found: 0
Patient 17872505 - events found: 0
Patient 13549627 - events found: 0
Patient 17759174 - events found: 0
Patient 17132849 - events found: 0
Patient 16715089 - events found: 0
Patient 15248985 - events found: 0
Patient 17180800 - events found: 0
Patient 13972095 - events found: 0
Patient 18615099 - events found: 0
Patient 12438112 - events found: 0
Patient 18339706 - events found: 0
Patient 18994071 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 17571389 - events found: 0
Patient 10240704 - events found: 301
Patient 16937639 - events found: 0
Patient 10349498 - events found: 0
Patient 12129130 - events found: 0
Patient 17759799 - events found: 0
Patient 14301172 - events found: 0
Patient 15507215 - events found: 0
Patient 11171390 - events found: 0
Patient 19429517 - events found: 0
Patient 14710659 - events found: 0
Patient 10522893 - events found: 0
Patient 13674989 - events found: 0
Patient 16028666 - events found: 0
Patient 14549065 - events found: 0
Patient 10330086 - events found: 0
Patient 18866492 - events found: 0
Patient 10417160 - events found: 0
Patient 14672225 - events found: 0
Patient 11079388 - events found: 0
Patient 15684247 - events found: 0
Patient 10911585 - events found: 0
Patient 10975341 - events found: 0
Patient 14643554 - events found: 0
Patient 16359014 - events found: 0
Patient 15094735 - events found: 0
Patient 16442963 - events found: 0
Patient 16705209 - events found: 0
Patient 17442462 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 16487303 - events found: 0
Patient 18667813 - events found: 0
Patient 18063505 - events found: 0
Patient 16185669 - events found: 0
Patient 16967676 - events found: 0
Patient 14235483 - events found: 0
Patient 13821845 - events found: 0
Patient 17592920 - events found: 0
Patient 19787095 - events found: 0
Patient 19875779 - events found: 0
Patient 17431704 - events found: 0
Patient 10174850 - events found: 363
Patient 14313245 - events found: 0
Patient 10782385 - events found: 0
Patient 16037702 - events found: 0
Patient 17401129 - events found: 0
Patient 16151261 - events found: 0
Patient 13308279 - events found: 0
Patient 15469636 - events found: 0
Patient 14536480 - events found: 0
Patient 14353309 - events found: 0
Patient 17737924 - events found: 0
Patient 14702995 - events found: 0
Patient 13835259 - events found: 0
Patient 14855461 - events found: 0
Patient 16982583 - events found: 0
Patient 12885965 - events found: 0
Patient 18746757 - events found: 0
Patient 13583876 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 16969456 - events found: 0
Patient 13561377 - events found: 0
Patient 14117749 - events found: 0
Patient 18396814 - events found: 0
Patient 11736157 - events found: 0
Patient 15331463 - events found: 0
Patient 16227072 - events found: 0
Patient 19229148 - events found: 0
Patient 14835491 - events found: 0
Patient 18689536 - events found: 0
Patient 15456479 - events found: 0
Patient 19378973 - events found: 0
Patient 17395391 - events found: 0
Patient 19865386 - events found: 0
Patient 16492632 - events found: 0
Patient 11116316 - events found: 0
Patient 16544339 - events found: 0
Patient 13950979 - events found: 0
Patient 15471999 - events found: 0
Patient 15444023 - events found: 0
Patient 13280145 - events found: 0
Patient 13396394 - events found: 0
Patient 19852737 - events found: 0
Patient 18679317 - events found: 0
Patient 19470049 - events found: 0
Patient 12224662 - events found: 0
Patient 19657463 - events found: 0
Patient 18930355 - events found: 0
Patient 16841093 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 15552398 - events found: 0
Patient 11116402 - events found: 0
Patient 11811707 - events found: 0
Patient 17261054 - events found: 0
Patient 18250302 - events found: 0
Patient 16333645 - events found: 0
Patient 19174686 - events found: 0
Patient 18791883 - events found: 0
Patient 16333645 - events found: 0
Patient 16503150 - events found: 0
Patient 17777581 - events found: 0
Patient 18637983 - events found: 0
Patient 10904369 - events found: 0
Patient 16426580 - events found: 0
Patient 10867600 - events found: 0
Patient 11146837 - events found: 0
Patient 11107643 - events found: 0
Patient 19018059 - events found: 0
Patient 16688665 - events found: 0
Patient 15225520 - events found: 0
Patient 16901707 - events found: 0
Patient 12329855 - events found: 0
Patient 17694396 - events found: 0
Patient 15279149 - events found: 0
Patient 13834663 - events found: 0
Patient 11964706 - events found: 0
Patient 17995051 - events found: 0
Patient 19051786 - events found: 0
Patient 13244678 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 17442138 - events found: 0
Patient 15556698 - events found: 0
Patient 19261159 - events found: 0
Patient 13787390 - events found: 0
Patient 16652205 - events found: 0
Patient 13053494 - events found: 0
Patient 18595852 - events found: 0
Patient 14806715 - events found: 0
Patient 18006408 - events found: 0
Patient 15777991 - events found: 0
Patient 16583373 - events found: 0
Patient 19767516 - events found: 0
Patient 14378870 - events found: 0
Patient 11409716 - events found: 0
Patient 17873103 - events found: 0
Patient 17827226 - events found: 0
Patient 16895456 - events found: 0
Patient 15346363 - events found: 0
Patient 13859166 - events found: 0
Patient 12142361 - events found: 0
Patient 11020841 - events found: 0
Patient 17421663 - events found: 0
Patient 14161008 - events found: 0
Patient 11047741 - events found: 0
Patient 15418295 - events found: 0
Patient 16418338 - events found: 0
Patient 11425407 - events found: 0
Patient 14490374 - events found: 0
Patient 19053146 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 14395135 - events found: 0
Patient 11956852 - events found: 0
Patient 17541389 - events found: 0
Patient 16387818 - events found: 0
Patient 16565191 - events found: 0
Patient 16950272 - events found: 0
Patient 18328540 - events found: 0
Patient 15613151 - events found: 0
Patient 14918161 - events found: 0
Patient 13878192 - events found: 0
Patient 19920078 - events found: 0
Patient 19168809 - events found: 0
Patient 16609129 - events found: 0
Patient 13329429 - events found: 0
Patient 13333021 - events found: 0
Patient 16618220 - events found: 0
Patient 13452498 - events found: 0
Patient 15883568 - events found: 0
Patient 10282939 - events found: 0
Patient 11786916 - events found: 0
Patient 17955142 - events found: 0
Patient 18972794 - events found: 0
Patient 15795691 - events found: 0
Patient 15601158 - events found: 0
Patient 10119770 - events found: 320
Patient 16578317 - events found: 0
Patient 12873124 - events found: 0
Patient 16404747 - events found: 0
Patient 17454242 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 17494417 - events found: 0
Patient 11156725 - events found: 0
Patient 17475607 - events found: 0
Patient 10739214 - events found: 0
Patient 10188428 - events found: 385
Patient 16760180 - events found: 0
Patient 18503226 - events found: 0
Patient 19385269 - events found: 0
Patient 19064155 - events found: 0
Patient 15403852 - events found: 0
Patient 15784583 - events found: 0
Patient 11005665 - events found: 0
Patient 13615021 - events found: 0
Patient 16901518 - events found: 0
Patient 14890817 - events found: 0
Patient 12755484 - events found: 0
Patient 17369232 - events found: 0
Patient 10248673 - events found: 0
Patient 16661055 - events found: 0
Patient 16170098 - events found: 0
Patient 14388050 - events found: 0
Patient 15303387 - events found: 0
Patient 10248673 - events found: 0
Patient 13577351 - events found: 0
Patient 19402245 - events found: 0
Patient 10602808 - events found: 0
Patient 17792682 - events found: 0
Patient 15869093 - events found: 0
Patient 10733134 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 19339500 - events found: 0
Patient 10750883 - events found: 0
Patient 11233081 - events found: 0
Patient 11285165 - events found: 0
Patient 14015052 - events found: 0
Patient 19897902 - events found: 0
Patient 15729444 - events found: 0
Patient 17593758 - events found: 0
Patient 11214284 - events found: 0
Patient 15041543 - events found: 0
Patient 10887781 - events found: 0
Patient 14841663 - events found: 0
Patient 16790473 - events found: 0
Patient 17726141 - events found: 0
Patient 14702698 - events found: 0
Patient 10255052 - events found: 0
Patient 16949150 - events found: 0
Patient 12698059 - events found: 0
Patient 11894530 - events found: 0
Patient 14632617 - events found: 0
Patient 16213371 - events found: 0
Patient 12671679 - events found: 0
Patient 14266723 - events found: 0
Patient 14152034 - events found: 0
Patient 10476871 - events found: 0
Patient 17021943 - events found: 0
Patient 15158950 - events found: 0
Patient 12652899 - events found: 0
Patient 12623497 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 17689755 - events found: 0
Patient 11533462 - events found: 0
Patient 12466651 - events found: 0
Patient 14323503 - events found: 0
Patient 12836931 - events found: 0
Patient 18969221 - events found: 0
Patient 11092156 - events found: 0
Patient 18713965 - events found: 0
Patient 14735340 - events found: 0
Patient 15240836 - events found: 0
Patient 17843659 - events found: 0
Patient 17692815 - events found: 0
Patient 10992265 - events found: 0
Patient 18005486 - events found: 0
Patient 16579421 - events found: 0
Patient 15856423 - events found: 0
Patient 13102263 - events found: 0
Patient 15292609 - events found: 0
Patient 15356016 - events found: 0
Patient 11128068 - events found: 0
Patient 15967699 - events found: 0
Patient 18951987 - events found: 0
Patient 10277119 - events found: 0
Patient 18753289 - events found: 0
Patient 13996865 - events found: 0
Patient 13273041 - events found: 0
Patient 11615050 - events found: 0
Patient 14977613 - events found: 0
Patient 16765466 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 16907496 - events found: 0
Patient 14739658 - events found: 0
Patient 17123098 - events found: 0
Patient 18780840 - events found: 0
Patient 18016578 - events found: 0
Patient 14689365 - events found: 0
Patient 17033783 - events found: 0
Patient 10986184 - events found: 0
Patient 12106600 - events found: 0
Patient 10630083 - events found: 0
Patient 17352692 - events found: 0
Patient 16974395 - events found: 0
Patient 17630793 - events found: 0
Patient 12360599 - events found: 0
Patient 16103368 - events found: 0
Patient 12087353 - events found: 0
Patient 15754765 - events found: 0
Patient 17802616 - events found: 0
Patient 19630197 - events found: 0
Patient 15754765 - events found: 0
Patient 17830656 - events found: 0
Patient 13396234 - events found: 0
Patient 10400376 - events found: 0
Patient 16774869 - events found: 0
Patient 18756147 - events found: 0
Patient 17975155 - events found: 0
Patient 15226983 - events found: 0
Patient 16473254 - events found: 0
Patient 15423840 - e

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 14223179 - events found: 0
Patient 17406430 - events found: 0
Patient 14018555 - events found: 0
Patient 15260609 - events found: 0
Patient 15951517 - events found: 0
Patient 11712698 - events found: 0
Patient 18931949 - events found: 0
Patient 11097412 - events found: 0
Patient 17286315 - events found: 0
Patient 12598727 - events found: 0
Patient 18615743 - events found: 0
Patient 12975227 - events found: 0
Patient 14677796 - events found: 0
Patient 10652536 - events found: 0
Patient 11695792 - events found: 0
Patient 16954495 - events found: 0
Patient 14178020 - events found: 0
Patient 14114995 - events found: 0
Patient 14253390 - events found: 0
Patient 10379868 - events found: 0
Patient 18366346 - events found: 0
Patient 12854615 - events found: 0
Patient 18712508 - events found: 0
Patient 17277688 - events found: 0
Patient 10062774 - events found: 344
Patient 18621664 - events found: 0
Patient 18303336 - events found: 0
Patient 10441831 - events found: 0
Patient 13256955 -

  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])
  initial_vitals = patient_chartevents.groupby('label').apply(lambda x: x.sort_values(by='charttime').iloc[0])


Patient 14816060 - events found: 0
Patient 13383310 - events found: 0
Patient 14454179 - events found: 0
Patient 18399094 - events found: 0
Patient 14875942 - events found: 0
Patient 19508874 - events found: 0
Patient 18369403 - events found: 0
Patient 12684652 - events found: 0
Patient 12844772 - events found: 0

Vital Signs and Clinical Assessment Features Head (first few vital columns):
   subject_id   hadm_id   stay_id  vital_braden_activity_initial  \
0    14816979  21650344  38466660                            0.0   
1    12264134  25257503  37673110                            0.0   
2    11993259  23072371  32269643                            0.0   
3    17907596  21801758  34260029                            0.0   
4    18801749  29949595  36111383                            0.0   

   vital_braden_friction_shear_initial  vital_braden_mobility_initial  \
0                                  0.0                            0.0   
1                                  0.0              

In [16]:
vitals_clinical_features.shape

(3387, 595)

In [17]:
#INPUT AND OUTPUT FEATURES
# Summary statistics and overall sum will be derived in daily intervals for oral and intravenous input
# and output within the first 48 hours.

# Filter inputevents for relevant hadm_ids and time window
inputevents_filtered = pd.merge(base_cohort[['subject_id', 'hadm_id', 'stay_id', 'intime']], inputevents, on=['subject_id', 'hadm_id', 'stay_id'], how='inner')
inputevents_filtered = inputevents_filtered[
    (inputevents_filtered['starttime'] >= inputevents_filtered['intime']) &
    (inputevents_filtered['starttime'] <= inputevents_filtered['intime'] + pd.Timedelta(hours=48))
]

# Filter outputevents for relevant hadm_ids and time window
outputevents_filtered = pd.merge(base_cohort[['subject_id', 'hadm_id', 'stay_id', 'intime']], outputevents, on=['subject_id', 'hadm_id', 'stay_id'], how='inner')
outputevents_filtered = outputevents_filtered[
    (outputevents_filtered['charttime'] >= outputevents_filtered['intime']) &
    (outputevents_filtered['charttime'] <= outputevents_filtered['intime'] + pd.Timedelta(hours=48))
]

input_output_features_list = []

for index, row in base_cohort.iterrows():
    patient_input = inputevents_filtered[
        (inputevents_filtered['subject_id'] == row['subject_id']) &
        (inputevents_filtered['hadm_id'] == row['hadm_id']) &
        (inputevents_filtered['stay_id'] == row['stay_id'])
    ]
    patient_output = outputevents_filtered[
        (outputevents_filtered['subject_id'] == row['subject_id']) &
        (outputevents_filtered['hadm_id'] == row['hadm_id']) &
        (outputevents_filtered['stay_id'] == row['stay_id'])
    ]

    patient_io_features = {'subject_id': row['subject_id'], 'hadm_id': row['hadm_id'], 'stay_id': row['stay_id']}

    # Daily intervals for 48 hours (Day 1: 0-23h, Day 2: 24-47h)
    for i in range(2): # Day 1, Day 2
        start_time_day = row['intime'] + pd.Timedelta(hours=i * 24)
        end_time_day = row['intime'] + pd.Timedelta(hours=(i + 1) * 24)

        # Input
        daily_input = patient_input[
            (patient_input['starttime'] >= start_time_day) &
            (patient_input['starttime'] < end_time_day)
        ]
        patient_io_features[f'input_total_volume_day{i+1}'] = daily_input['amount'].sum()
        patient_io_features[f'input_count_day{i+1}'] = daily_input.shape[0]

        # Output
        daily_output = patient_output[
            (patient_output['charttime'] >= start_time_day) &
            (patient_output['charttime'] < end_time_day)
        ]
        patient_io_features[f'output_total_volume_day{i+1}'] = daily_output['value'].sum()
        patient_io_features[f'output_count_day{i+1}'] = daily_output.shape[0]

    input_output_features_list.append(patient_io_features)

input_output_features = pd.DataFrame(input_output_features_list)
input_output_features = input_output_features.fillna(0) # Fill NaNs (e.g., if no input/output)

input_output_features.head()

Unnamed: 0,subject_id,hadm_id,stay_id,input_total_volume_day1,input_count_day1,output_total_volume_day1,output_count_day1,input_total_volume_day2,input_count_day2,output_total_volume_day2,output_count_day2
0,14816979,21650344,38466660,0.0,0,0.0,0,0.0,0,0.0,0
1,12264134,25257503,37673110,0.0,0,0.0,0,0.0,0,0.0,0
2,11993259,23072371,32269643,0.0,0,0.0,0,0.0,0,0.0,0
3,17907596,21801758,34260029,0.0,0,0.0,0,0.0,0,0.0,0
4,18801749,29949595,36111383,0.0,0,0.0,0,0.0,0,0.0,0


In [18]:
#PROCEDURE FEATURES
# Create binary features indicating whether a patient underwent specific procedures.
# The paper mentions "procedure data from the first 48 h of the hospitalization."


procedures_filtered = pd.merge(base_cohort[['subject_id', 'hadm_id']], procedures, on=['subject_id', 'hadm_id'], how='inner')

procedure_features_ohe = procedures_filtered.groupby(['subject_id', 'hadm_id', 'icd_code']).size().unstack(fill_value=0)
procedure_features_ohe = (procedure_features_ohe > 0).astype(int) # Convert counts to binary (presence/absence)
procedure_features_ohe.columns = [f"proc_{col}" for col in procedure_features_ohe.columns]
procedure_features_ohe = procedure_features_ohe.reset_index()

# Merge with base_cohort to ensure all patients are covered, fill NaN with 0 for patients without procedures
procedure_features = pd.merge(base_cohort[['subject_id', 'hadm_id', 'stay_id']], procedure_features_ohe, on=['subject_id', 'hadm_id'], how='left').fillna(0)

procedure_features.head()

Unnamed: 0,subject_id,hadm_id,stay_id,proc_0012,proc_0014,proc_00160J6,proc_00163J4,proc_00163J6,proc_00164J6,proc_0017,...,proc_X2CS3T7,proc_XW023S6,proc_XW03372,proc_XW033E5,proc_XW033H5,proc_XW043B3,proc_XW043E5,proc_XW043H6,proc_XW0G886,proc_XW0H886
0,14816979,21650344,38466660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12264134,25257503,37673110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11993259,23072371,32269643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,17907596,21801758,34260029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18801749,29949595,36111383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# MERGE ALL FEATURES
# Start with base_cohort and merge all feature sets
final_features_df = base_cohort.copy()

final_features_df = pd.merge(final_features_df, demographics_features, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
final_features_df = pd.merge(final_features_df, hospitalization_features, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
final_features_df = pd.merge(final_features_df, medication_features, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
final_features_df = pd.merge(final_features_df, lab_features, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
final_features_df = pd.merge(final_features_df, vitals_clinical_features, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
final_features_df = pd.merge(final_features_df, input_output_features, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
final_features_df = pd.merge(final_features_df, procedure_features, on=['subject_id', 'hadm_id', 'stay_id'], how='left')



X = final_features_df.drop(columns=['subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'hospital_expire_flag', 'icd_code', 'first_careunit', 'last_careunit'])
y = final_features_df['hospital_expire_flag']


X = X.fillna(0)

print("\nFinal Feature Matrix (X) Head:")
print(X.head())
print("Final Feature Matrix (X) Shape:", X.shape)
print("Target Variable (y) Head:")
print(y.head())
print("Target Variable (y) Shape:", y.shape)



Final Feature Matrix (X) Head:
   seq_num  icd_version        los  age  gender_F  gender_M  \
0        1            9   4.210104   30         0         1   
1        1           10   5.259572   73         1         0   
2        1            9   2.228623   76         1         0   
3        3            9  18.593854   67         0         1   
4        1           10   3.057257   91         1         0   

   marital_status_DIVORCED  marital_status_MARRIED  marital_status_SINGLE  \
0                        0                       1                      0   
1                        0                       0                      0   
2                        0                       0                      0   
3                        0                       1                      0   
4                        0                       0                      0   

   marital_status_WIDOWED  ...  proc_X2CS3T7  proc_XW023S6  proc_XW03372  \
0                       0  ...           0.0      

## Model traning and valid

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Check class imbalance
class_counts = y.value_counts()
print("Class distribution:", class_counts)
class_percentages = y.value_counts(normalize=True) * 100
print("Class percentages (%):", class_percentages)

# Split data with stratified split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Calculate scale_pos_weight for imbalanced data
scale_pos_weight_val = (y_train == 0).sum() / (y_train == 1).sum()

Class distribution: hospital_expire_flag
0    2759
1     628
Name: count, dtype: int64
Class percentages (%): hospital_expire_flag
0    81.458518
1    18.541482
Name: proportion, dtype: float64


In [24]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit, StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.utils import resample
import numpy as np

# Import for Bayesian Optimization
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

# Split data using stratified split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Use 10 folds to tune
tuned_models = {}

# Calculate class weights for imbalanced dataset
class_weight_val = 'balanced'  # Automatically handles imbalanced classes

lr_model = LogisticRegression(
    random_state=42,
    class_weight=class_weight_val,  # Handle imbalanced dataset
    max_iter=1000  # Increase max iterations to ensure convergence
)

# Define the search space for Bayesian Optimization
# Note: We'll use a simpler approach to avoid solver-penalty incompatibilities
lr_search_space = {
    'C': Real(0.001, 100.0, prior='log-uniform'),  # Regularization strength (inverse)
    'penalty': Categorical(['l1', 'l2']),  # Regularization type (avoiding elasticnet for simplicity)
    'solver': Categorical(['liblinear', 'saga']),  # liblinear for l1/l2, saga for all
    'fit_intercept': Categorical([True, False]),  # Whether to fit intercept
    'max_iter': Integer(500, 2000)  # Maximum iterations for convergence
}

scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
n_iter_bayes = 50
model_name = 'Logistic Regression'
model = lr_model
param_grid = lr_search_space

for scoring_strategy in ['f1', 'roc_auc']:
    print(f"  Tuning for {scoring_strategy.upper()} using Bayesian Optimization...")

    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=param_grid,
        n_iter=n_iter_bayes,  # Number of optimization iterations
        scoring=scoring_strategy,
        cv=10,
        verbose=1,
        n_jobs=-1,  # Use all available cores
        random_state=42
    )
    bayes_search.fit(x_train, y_train)

    print(f"  Best parameters for {scoring_strategy.upper()}: {bayes_search.best_params_}")
    print(f"  Best {scoring_strategy.upper()} score: {bayes_search.best_score_:.4f}")

    # Report average metrics for the best estimator from cross-validation
    best_model_cv = bayes_search.best_estimator_
    cv_results = cross_validate(best_model_cv, x_train, y_train, cv=10, scoring=scoring_metrics, n_jobs=-1)

    print(f"  Average CV Metrics for best {model_name} (tuned for {scoring_strategy.upper()}):")
    for metric in scoring_metrics:
        mean_score = cv_results[f'test_{metric}'].mean()
        std_score = cv_results[f'test_{metric}'].std()
        print(f"    {metric.capitalize()}: {mean_score:.4f} (+/- {std_score:.4f})")

    # Store the best model based on F1 score (as it's often preferred for imbalanced data)
    if scoring_strategy == 'f1':
        tuned_models[model_name] = bayes_search.best_estimator_


def evaluate_model_bootstrapped(model, X_train_full, y_train_full, X_test_data, y_test_data, n_bootstraps=1000):
    """
    Trains the model on the full training data and evaluates on bootstrapped test data.
    """
    model.fit(X_train_full, y_train_full)  # Train on entire training set

    test_metrics = {metric: [] for metric in scoring_metrics}

    for i in range(n_bootstraps):
        # Bootstrap sample from the test data
        bootstrap_indices = np.random.choice(len(X_test_data), len(X_test_data), replace=True)
        X_test_bootstrap = X_test_data.iloc[bootstrap_indices]
        y_test_bootstrap = y_test_data.iloc[bootstrap_indices]

        if X_test_bootstrap.empty:
            continue

        y_pred = model.predict(X_test_bootstrap)
        y_proba = model.predict_proba(X_test_bootstrap)[:, 1] if hasattr(model, 'predict_proba') else None

        test_metrics['accuracy'].append(accuracy_score(y_test_bootstrap, y_pred))
        test_metrics['precision'].append(precision_score(y_test_bootstrap, y_pred, zero_division=0))
        test_metrics['recall'].append(recall_score(y_test_bootstrap, y_pred, zero_division=0))
        test_metrics['f1'].append(f1_score(y_test_bootstrap, y_pred, zero_division=0))
        if y_proba is not None:
            # Ensure enough samples of both classes for ROC AUC calculation
            if len(np.unique(y_test_bootstrap)) > 1:
                test_metrics['roc_auc'].append(roc_auc_score(y_test_bootstrap, y_proba))
            else:
                test_metrics['roc_auc'].append(np.nan)  # Cannot calculate if only one class present
        else:
            test_metrics['roc_auc'].append(np.nan)

    # Calculate mean and 95% confidence intervals
    results = {}
    for metric, scores in test_metrics.items():
        # Remove NaNs if any (e.g., from roc_auc when only one class is present in bootstrap sample)
        scores = [s for s in scores if not np.isnan(s)]
        if len(scores) > 0:
            mean_score = np.mean(scores)
            # Use basic percentile for CI, or scipy.stats.bootstrap for more robust CI
            # For simplicity and common practice, using percentile here for 95% CI
            lower_bound = np.percentile(scores, 2.5)
            upper_bound = np.percentile(scores, 97.5)
            results[metric] = f"{mean_score:.4f} (95% CI: {lower_bound:.4f}-{upper_bound:.4f})"
        else:
            results[metric] = "N/A (No valid scores)"
    return results


final_results = {}
for model_name, model in tuned_models.items():
    print(f"\nEvaluating {model_name} on Test Set (Bootstrapped 1000 times):")
    results = evaluate_model_bootstrapped(model, x_train, y_train, x_test, y_test, n_bootstraps=1000)
    final_results[model_name] = results
    for metric, value in results.items():
        print(f"  {metric.capitalize()}: {value}")

print("\n--- Summary of Final Test Set Results ---")
for model_name, results in final_results.items():
    print(f"\n{model_name}:")
    for metric, value in results.items():
        print(f"  {metric.capitalize()}: {value}")

  Tuning for F1 using Bayesian Optimization...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candida

In [25]:
# Create sample input using mean values from training data
sample_input_df = pd.DataFrame(x_train.mean()).T
sample_input_df.columns = x_train.columns

# Get the best logistic regression model
best_lr_model = tuned_models['Logistic Regression']

# Modify specific features for prediction
if 'age' in sample_input_df.columns:
    sample_input_df['age'] = 12

if 'gender_M' in sample_input_df.columns:
    sample_input_df['gender_M'] = 1
    sample_input_df['gender_F'] = 0

if 'lab_blood_urea_nitrogen_mean_interval1' in sample_input_df.columns:
    sample_input_df['lab_blood_urea_nitrogen_mean_interval1'] = 50.0

if 'first_careunit_Neuro_Intermediate' in sample_input_df.columns:
    sample_input_df['first_careunit_Neuro_Intermediate'] = 1

print("=== Logistic Regression Prediction ===")

# Display subset of features for inspection
display_features_subset = [col for col in ['age', 'gender_M', 'gender_F', 'ethnicity_WHITE', 'ethnicity_ASIAN',
                                          'first_careunit_Neuro_Intermediate', 'last_careunit_Neuro_Intermediate',
                                          'vital_gcs_motor_initial', 'vital_richmond_rass_initial',
                                          'lab_blood_urea_nitrogen_mean_interval1'] if col in sample_input_df.columns]
print("Sample Input Features:")
print(sample_input_df[display_features_subset].to_string())
print("")

# Make predictions
predicted_class = best_lr_model.predict(sample_input_df)[0]
predicted_proba = best_lr_model.predict_proba(sample_input_df)[0]

# Display results
print(f"Predicted Class (0=Survived, 1=Died): {predicted_class}")
print(f"Probability of Survival (Class 0): {predicted_proba[0]:.4f}")
print(f"Probability of Mortality (Class 1): {predicted_proba[1]:.4f}")

# Additional logistic regression specific information
print("\n=== Logistic Regression Model Information ===")
print(f"Model Parameters:")
print(f"  C (Regularization): {best_lr_model.C}")
print(f"  Penalty: {best_lr_model.penalty}")
print(f"  Solver: {best_lr_model.solver}")
print(f"  Max Iterations: {best_lr_model.max_iter}")

# Show feature coefficients (if interpretability is needed)
if hasattr(best_lr_model, 'coef_') and len(display_features_subset) > 0:
    print(f"\nTop Feature Coefficients (for interpretability):")
    feature_coefs = []
    for feature in display_features_subset:
        if feature in sample_input_df.columns:
            feature_idx = list(x_train.columns).index(feature)
            coef_value = best_lr_model.coef_[0][feature_idx]
            feature_coefs.append((feature, coef_value))
    
    # Sort by absolute coefficient value
    feature_coefs.sort(key=lambda x: abs(x[1]), reverse=True)
    
    for feature, coef in feature_coefs[:5]:  # Show top 5
        direction = "increases" if coef > 0 else "decreases"
        print(f"  {feature}: {coef:.4f} ({direction} mortality risk)")

=== Logistic Regression Prediction ===
Sample Input Features:
   age  gender_M  gender_F  vital_gcs_motor_initial  vital_richmond_rass_initial
0   12         1         0                 0.062754                     0.066445

Predicted Class (0=Survived, 1=Died): 0
Probability of Survival (Class 0): 0.9437
Probability of Mortality (Class 1): 0.0563

=== Logistic Regression Model Information ===
Model Parameters:
  C (Regularization): 0.6436659709839805
  Penalty: l2
  Solver: liblinear
  Max Iterations: 2000

Top Feature Coefficients (for interpretability):
  gender_F: -0.3701 (decreases mortality risk)
  gender_M: -0.3216 (decreases mortality risk)
  age: 0.0306 (increases mortality risk)
  vital_richmond_rass_initial: 0.0139 (increases mortality risk)
  vital_gcs_motor_initial: -0.0030 (decreases mortality risk)


## Features important

In [26]:
test_input = x_test.iloc[2:3]
test_input

Unnamed: 0,seq_num,icd_version,los,age,gender_F,gender_M,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SINGLE,marital_status_WIDOWED,...,proc_X2CS3T7,proc_XW023S6,proc_XW03372,proc_XW033E5,proc_XW033H5,proc_XW043B3,proc_XW043E5,proc_XW043H6,proc_XW0G886,proc_XW0H886
2380,3,9,3.027141,69,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
test_input = x_test.iloc[2:3]
best_lr_model = tuned_models['Logistic Regression']
predicted_class = best_lr_model.predict(test_input)
print(f"Predicted class for test sample: {predicted_class[0]}")

print("\n--- Logistic Regression Feature Importance ---")
# Get feature coefficients (weights) from logistic regression
feature_coefficients = best_lr_model.coef_[0]  # For binary classification, coef_ is 2D array
feature_importance_df = pd.DataFrame({
    'Feature': x_train.columns,
    'Coefficient': feature_coefficients,
    'Abs_Coefficient': np.abs(feature_coefficients)
})

Predicted class for test sample: 1

--- Logistic Regression Feature Importance ---


In [29]:
feature_importance_df = feature_importance_df.sort_values(by='Abs_Coefficient', ascending=False)

print("Top 20 Most Important Features (by absolute coefficient):")
print(feature_importance_df[['Feature', 'Coefficient', 'Abs_Coefficient']].head(20).to_string())

print(f"\nCoefficient Interpretation:")
print("- Positive coefficient: Increases probability of mortality (Class 1)")
print("- Negative coefficient: Decreases probability of mortality (Class 1)")
print("- Larger absolute value: More influence on prediction")

print("\n--- 2. Ablation Analysis ---")
print("This process involves retraining the model for each ablated set and can be time-consuming.")


Top 20 Most Important Features (by absolute coefficient):
                  Feature  Coefficient  Abs_Coefficient
3324            proc_4311    -2.042371         2.042371
3133             proc_311    -1.856084         1.856084
2646         proc_0DH63UZ    -1.752101         1.752101
3451         proc_5A1955Z     1.718747         1.718747
3610            proc_9672     1.513470         1.513470
1981         proc_009U3ZX    -1.438599         1.438599
2015         proc_00JU3ZZ     1.387720         1.387720
2421         proc_06H03DZ    -1.363169         1.363169
3450         proc_5A1945Z     1.218811         1.218811
3421            proc_5491     1.212404         1.212404
3290         proc_3E0436Z    -1.197947         1.197947
3268            proc_3965     1.194004         1.194004
2794         proc_0JH63XZ    -1.149070         1.149070
2048            proc_0131     1.141138         1.141138
2501         proc_0B978ZZ     1.135916         1.135916
2601         proc_0D9P70Z     1.117391        

In [30]:
    # Define feature groups based on prefixes from feature engineering
feature_groups = {
    'Demographics': [col for col in x_train.columns if col.startswith(('gender_', 'marital_status_', 'ethnicity_', 'language_', 'age'))],
    'Hospitalization Info': [col for col in x_train.columns if col.startswith(('adm_type_', 'adm_loc_', 'first_careunit_', 'last_careunit_'))],
    'Medication': [col for col in x_train.columns if col.startswith('med_')],
    'Laboratory': [col for col in x_train.columns if col.startswith('lab_')],
    'Vital Signs & Clinical Assessment': [col for col in x_train.columns if col.startswith('vital_')],
    'Input/Output': [col for col in x_train.columns if col.startswith(('input_', 'output_'))],
    'Procedure': [col for col in x_train.columns if col.startswith('proc_')]
}

# Ensure 'age' is only in Demographics and not picked up by other prefixes if applicable
feature_groups['Demographics'] = [col for col in feature_groups['Demographics'] if col not in ['age']] + ['age'] # Ensure age is included

# Remove any duplicates that might arise from overlapping prefixes
all_grouped_features = []
for group_name, features in feature_groups.items():
    all_grouped_features.extend(features)

# Verify that all features in X_train are covered and no overlaps
cleaned_feature_groups = {}
processed_features = set()
for group_name, features in feature_groups.items():
    cleaned_features = []
    for feature in features:
        if feature in x_train.columns and feature not in processed_features:
            cleaned_features.append(feature)
            processed_features.add(feature)
    if cleaned_features: # Only add if there are actual features in the group
        cleaned_feature_groups[group_name] = cleaned_features

# Add any features not covered by the defined groups to an 'Other' category
other_features = [col for col in x_train.columns if col not in processed_features]
if other_features:
    cleaned_feature_groups['Other'] = other_features
    print(f"Note: {len(other_features)} features not explicitly categorized were placed in 'Other' group for ablation.")

ablation_results = {}

# Evaluate the full model first for baseline comparison
print("\nEvaluating Full Model (Baseline):")
full_model_results = evaluate_model_bootstrapped(best_lr_model, x_train, y_train, x_test, y_test)
ablation_results['Full Model'] = full_model_results
for metric, value in full_model_results.items():
    print(f"  {metric.capitalize()}: {value}")

for group_name, features_to_ablate in cleaned_feature_groups.items():
    print(f"\nEvaluating Model without {group_name} Features:")

    # Create ablated datasets
    X_train_ablated = x_train.drop(columns=features_to_ablate, errors='ignore')
    X_test_ablated = x_test.drop(columns=features_to_ablate, errors='ignore')

    # Retrain the logistic regression model on the ablated training data
    # Create a new instance of the model with the same best hyperparameters
    ablated_model_params = best_lr_model.get_params().copy()
    
    # Remove parameters that shouldn't be passed to new instance
    excluded_params = ['random_state']
    for param in excluded_params:
        if param in ablated_model_params:
            del ablated_model_params[param]
    
    ablated_model = LogisticRegression(
        random_state=42,
        **ablated_model_params
    )

    # Evaluate the ablated model
    current_ablation_results = evaluate_model_bootstrapped(ablated_model, X_train_ablated, y_train, X_test_ablated, y_test)
    ablation_results[f'Without {group_name}'] = current_ablation_results

    for metric, value in current_ablation_results.items():
        print(f"  {metric.capitalize()}: {value}")

print("\n--- Ablation Analysis Summary ---")
for model_type, results in ablation_results.items():
    print(f"\n{model_type}:")
    for metric, value in results.items():
        print(f"  {metric.capitalize()}: {value}")

# Additional analysis: Show which feature groups have the most impact when removed
print("\n--- Feature Group Impact Analysis ---")
baseline_f1 = float(full_model_results['f1'].split(' ')[0])  # Extract numeric value from CI string

print("Impact of removing each feature group (F1 score change):")
group_impacts = []
for model_type, results in ablation_results.items():
    if model_type != 'Full Model':
        ablated_f1 = float(results['f1'].split(' ')[0])
        f1_drop = baseline_f1 - ablated_f1
        group_name = model_type.replace('Without ', '')
        group_impacts.append((group_name, f1_drop))
        print(f"  {group_name}: F1 drop = {f1_drop:.4f}")

# Sort by impact (largest drop = most important group)
group_impacts.sort(key=lambda x: x[1], reverse=True)
print(f"\nMost impactful feature groups (when removed):")
for i, (group_name, impact) in enumerate(group_impacts[:3], 1):
    print(f"  {i}. {group_name}: F1 drop = {impact:.4f}")

Note: 36 features not explicitly categorized were placed in 'Other' group for ablation.

Evaluating Full Model (Baseline):
  Accuracy: 0.7861 (95% CI: 0.7566-0.8156)
  Precision: 0.4521 (95% CI: 0.3854-0.5198)
  Recall: 0.7313 (95% CI: 0.6576-0.8115)
  F1: 0.5580 (95% CI: 0.4951-0.6211)
  Roc_auc: 0.8372 (95% CI: 0.7990-0.8758)

Evaluating Model without Demographics Features:
  Accuracy: 0.7889 (95% CI: 0.7552-0.8215)
  Precision: 0.4576 (95% CI: 0.3908-0.5246)
  Recall: 0.7434 (95% CI: 0.6667-0.8209)
  F1: 0.5658 (95% CI: 0.5000-0.6299)
  Roc_auc: 0.8303 (95% CI: 0.7883-0.8692)

Evaluating Model without Hospitalization Info Features:
  Accuracy: 0.7793 (95% CI: 0.7448-0.8098)
  Precision: 0.4444 (95% CI: 0.3762-0.5125)
  Recall: 0.7530 (95% CI: 0.6699-0.8291)
  F1: 0.5581 (95% CI: 0.4915-0.6205)
  Roc_auc: 0.8233 (95% CI: 0.7834-0.8623)

Evaluating Model without Medication Features:
  Accuracy: 0.7916 (95% CI: 0.7625-0.8201)
  Precision: 0.4625 (95% CI: 0.3872-0.5311)
  Recall: 0.7475

In [32]:
print("\n--- 3. Fairness Assessment (Logistic Regression) ---")

# To perform fairness assessment, we need the original sensitive attributes (gender, ethnicity)
# merged back to the X_test set.
# Assuming 'admissions' and 'patients' dataframes are loaded from the preprocessing step.
# If not, you might need to load them here or ensure they are passed.

# Merge relevant demographic info to X_test based on subject_id, hadm_id, stay_id
# We need to retain subject_id, hadm_id, stay_id in X_test for this merge.
# The `final_features_df` from the preprocessing script contains these IDs.
# Let's recreate X_test_with_sensitive_attrs directly from base_cohort and original patients/admissions
# to ensure we have the raw categorical values.

# First, ensure base_cohort is available (it should be from preprocessing)
try:
    _ = base_cohort.shape
    _ = patients.shape
    _ = admissions.shape
except NameError:
    print("Error: 'base_cohort', 'patients', or 'admissions' DataFrames not found for fairness assessment.")
    print("Please ensure the 'Data Preprocessing and Feature Engineering' cell has been run completely.")
    # Exit or handle error appropriately if not in an interactive environment
    # For this interactive environment, we'll use dummy data as a fallback.
    print("Using dummy sensitive attributes for fairness assessment.")
    X_test_with_sensitive_attrs = x_test.copy()
    X_test_with_sensitive_attrs['gender'] = np.random.choice(['M', 'F'], size=len(X_test_with_sensitive_attrs))
    X_test_with_sensitive_attrs['ethnicity'] = np.random.choice(['WHITE', 'ASIAN', 'BLACK/AFRICAN AMERICAN', 'OTHER'], size=len(X_test_with_sensitive_attrs))
else:
    # Get the test indices from the train_test_split
    # We need to recreate the split to get the indices
    _, _, _, _, train_idx, test_idx = train_test_split(
        X, y, range(len(X)), test_size=0.2, random_state=42, stratify=y
    )
    
    # Get the original patient and admission info for the test set cohort
    test_cohort_info = base_cohort.iloc[test_idx].copy() # Get the specific rows from base_cohort that went into x_test

    # Merge with patients for gender
    test_cohort_info = pd.merge(test_cohort_info, patients[['subject_id', 'gender']], on='subject_id', how='left')
    # Merge with admissions for ethnicity (using 'ethnicity' from admissions)
    test_cohort_info = pd.merge(test_cohort_info, admissions[['subject_id', 'hadm_id', 'ethnicity']], on=['subject_id', 'hadm_id'], how='left')

    # Now, merge these sensitive attributes back to x_test using its index
    # Ensure x_test has its original index or a common key for merging
    X_test_with_sensitive_attrs = x_test.copy()
    X_test_with_sensitive_attrs = X_test_with_sensitive_attrs.merge(
        test_cohort_info[['subject_id', 'hadm_id', 'stay_id', 'gender', 'ethnicity']],
        left_index=True, right_on=x_test.index.name if x_test.index.name else ['subject_id', 'hadm_id', 'stay_id'], # Use original index if named, else use IDs
        how='left'
    )
    # Drop the merge keys from X_test_with_sensitive_attrs if they were added as columns
    if 'subject_id' in X_test_with_sensitive_attrs.columns and X_test_with_sensitive_attrs.index.name is None:
        X_test_with_sensitive_attrs = X_test_with_sensitive_attrs.drop(columns=['subject_id', 'hadm_id', 'stay_id'])

sensitive_attributes = {
    'gender': X_test_with_sensitive_attrs['gender'].dropna().unique(),
    'ethnicity': X_test_with_sensitive_attrs['ethnicity'].dropna().unique()
}

fairness_results = {}

# Get the best logistic regression model
best_lr_model = tuned_models['Logistic Regression']

for attr, subgroups in sensitive_attributes.items():
    print(f"\nFairness Assessment for {attr.capitalize()}:")
    fairness_results[attr] = {}
    for subgroup in subgroups:
        # Filter test set for the current subgroup
        subgroup_mask = (X_test_with_sensitive_attrs[attr] == subgroup)
        X_test_subgroup = X_test_with_sensitive_attrs[subgroup_mask].drop(columns=[attr, 'ethnicity'] if attr == 'gender' else [attr, 'gender'], errors='ignore') # Drop sensitive cols from features
        y_test_subgroup = y_test[subgroup_mask]

        if X_test_subgroup.empty or len(np.unique(y_test_subgroup)) < 2:
            print(f"  {subgroup}: Not enough data or only one class present for evaluation.")
            fairness_results[attr][subgroup] = "N/A (Insufficient data)"
            continue

        # Evaluate logistic regression model on this subgroup
        y_pred_subgroup = best_lr_model.predict(X_test_subgroup)
        y_proba_subgroup = best_lr_model.predict_proba(X_test_subgroup)[:, 1]

        subgroup_accuracy = accuracy_score(y_test_subgroup, y_pred_subgroup)
        subgroup_precision = precision_score(y_test_subgroup, y_pred_subgroup, zero_division=0)
        subgroup_recall = recall_score(y_test_subgroup, y_pred_subgroup, zero_division=0)
        subgroup_f1 = f1_score(y_test_subgroup, y_pred_subgroup, zero_division=0)
        subgroup_roc_auc = roc_auc_score(y_test_subgroup, y_proba_subgroup) if len(np.unique(y_test_subgroup)) > 1 else np.nan

        print(f"  {subgroup} (N={len(y_test_subgroup)}):")
        print(f"    Accuracy: {subgroup_accuracy:.4f}")
        print(f"    Precision: {subgroup_precision:.4f}")
        print(f"    Recall: {subgroup_recall:.4f}")
        print(f"    F1 Score: {subgroup_f1:.4f}")
        print(f"    ROC AUC: {subgroup_roc_auc:.4f}")

        fairness_results[attr][subgroup] = {
            'Accuracy': subgroup_accuracy,
            'Precision': subgroup_precision,
            'Recall': subgroup_recall,
            'F1 Score': subgroup_f1,
            'ROC AUC': subgroup_roc_auc
        }

print("\n--- Fairness Assessment Summary ---")
for attr, results_by_subgroup in fairness_results.items():
    print(f"\n{attr.capitalize()} Results:")
    for subgroup, metrics in results_by_subgroup.items():
        if isinstance(metrics, dict):
            print(f"  {subgroup}:")
            for metric_name, value in metrics.items():
                print(f"    {metric_name}: {value:.4f}")
        else:
            print(f"  {subgroup}: {metrics}") # For N/A cases

# Additional fairness metrics analysis
print("\n--- Detailed Fairness Metrics ---")

def calculate_fairness_metrics(fairness_results):
    """Calculate fairness disparities between subgroups"""
    fairness_disparities = {}
    
    for attr, results_by_subgroup in fairness_results.items():
        # Get valid subgroups (those with actual numeric results)
        valid_subgroups = {k: v for k, v in results_by_subgroup.items() if isinstance(v, dict)}
        
        if len(valid_subgroups) < 2:
            print(f"\nNot enough valid subgroups for {attr} fairness comparison")
            continue
            
        fairness_disparities[attr] = {}
        
        # Calculate disparities for each metric
        for metric in ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']:
            metric_values = []
            subgroup_names = []
            
            for subgroup, metrics_dict in valid_subgroups.items():
                if not np.isnan(metrics_dict[metric]):
                    metric_values.append(metrics_dict[metric])
                    subgroup_names.append(subgroup)
            
            if len(metric_values) >= 2:
                max_value = max(metric_values)
                min_value = min(metric_values)
                disparity = max_value - min_value
                fairness_disparities[attr][metric] = {
                    'max_value': max_value,
                    'min_value': min_value,
                    'disparity': disparity,
                    'max_subgroup': subgroup_names[metric_values.index(max_value)],
                    'min_subgroup': subgroup_names[metric_values.index(min_value)]
                }
    
    return fairness_disparities

fairness_disparities = calculate_fairness_metrics(fairness_results)

for attr, metrics_disparities in fairness_disparities.items():
    print(f"\n{attr.capitalize()} Fairness Disparities:")
    for metric, disparity_info in metrics_disparities.items():
        print(f"  {metric}:")
        print(f"    Highest: {disparity_info['max_subgroup']} ({disparity_info['max_value']:.4f})")
        print(f"    Lowest: {disparity_info['min_subgroup']} ({disparity_info['min_value']:.4f})")
        print(f"    Disparity: {disparity_info['disparity']:.4f}")
        
        # Interpret disparity level
        if disparity_info['disparity'] < 0.05:
            disparity_level = "Low"
        elif disparity_info['disparity'] < 0.10:
            disparity_level = "Moderate" 
        else:
            disparity_level = "High"
        print(f"    Assessment: {disparity_level} disparity")

# Logistic Regression specific fairness insights
print("\n--- Logistic Regression Fairness Insights ---")
print("Key considerations for LR fairness:")
print("1. Coefficient Analysis: Check if sensitive attribute coefficients are reasonable")
print("2. Calibration: LR probabilities are generally well-calibrated across groups")
print("3. Linear Assumptions: Fairness may be affected if linear assumptions don't hold equally across groups")
print("4. Feature Interactions: Consider if sensitive attributes interact differently with other features")

# Check if sensitive attributes are directly encoded in features
sensitive_feature_check = []
for col in x_train.columns:
    if any(sensitive_attr.lower() in col.lower() for sensitive_attr in ['gender', 'ethnicity', 'race']):
        sensitive_feature_check.append(col)

if sensitive_feature_check:
    print(f"\nNote: The following features may encode sensitive attributes:")
    for feature in sensitive_feature_check[:5]:  # Show first 5
        feature_idx = list(x_train.columns).index(feature)
        coef_value = best_lr_model.coef_[0][feature_idx]
        print(f"  {feature}: coefficient = {coef_value:.4f}")
    print("Consider the ethical implications of these features in your model.")


--- 3. Fairness Assessment (Logistic Regression) ---


KeyError: "['ethnicity'] not in index"