# MIMIC-IV Data Preprocessing


In [1]:
import pandas as pd
from collections import Counter

import dask.array as da 
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

import numpy as np

from multiprocessing import Pool

from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import math

from tqdm import tqdm
import time

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
d_items_data = pd.read_csv('../icu_data/mimic_iv/d_items.csv.gz', compression = 'gzip')
input_events_data = pd.read_csv('../icu_data/mimic_iv/inputevents.csv.gz', compression = 'gzip')
pro_events_data = pd.read_csv('../icu_data/mimic_iv/procedureevents.csv.gz', compression = 'gzip')
output_event_data = pd.read_csv('../icu_data/mimic_iv/outputevents.csv.gz', compression = 'gzip')
ICU_patient_data = pd.read_csv('../icu_data/mimic_iv/icustays.csv.gz', compression = 'gzip')

In [4]:
ICU_patient_data

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266
1,10000690,25860671,37081114,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2150-11-02 19:37:00,2150-11-06 17:03:17,3.893252
2,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535
3,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032
4,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113
...,...,...,...,...,...,...,...,...
94453,19999442,26785317,32336619,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2148-11-19 14:23:43,2148-11-26 13:12:15,6.950370
94454,19999625,25304202,31070865,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2139-10-10 19:18:00,2139-10-11 18:21:28,0.960741
94455,19999828,25744818,36075953,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2149-01-08 18:12:00,2149-01-10 13:11:02,1.790995
94456,19999840,21033226,38978960,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2164-09-12 09:26:28,2164-09-17 16:35:15,5.297766


In [5]:
d_items_data

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,220001,Problem List,Problem List,chartevents,General,,Text,,
1,220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
3,220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
4,220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,
...,...,...,...,...,...,...,...,...,...
4090,230172,Patient Reversed,Patient Reversed,procedureevents,3-Significant Events,,Processes,,
4091,230173,Patient - Fast Track Protocol,Patient - Fast Track Protocol,procedureevents,3-Significant Events,,Processes,,
4092,230174,Nerve block in OR,Nerve block in OR,procedureevents,3-Significant Events,,Processes,,
4093,230176,IUC Stabilization Device,IUC Stabilization Device,chartevents,GI/GU,,Checkbox,,


In [6]:
patient_list = pd.unique(ICU_patient_data["subject_id"])
hos_ad_list = pd.unique(ICU_patient_data["hadm_id"])
icu_ad_list = pd.unique(ICU_patient_data["stay_id"])


[len(patient_list), len(hos_ad_list), len(icu_ad_list)]

[65366, 85242, 94458]

There are 65,366 patients with 85,242 hospital admissions and 94,458 ICU stays.

In [7]:
VitalSigns_id = [220045, 220048, 220179, 220050, 220180, 220051, 220052, 220181, 225312, 220210, 224690, 223761, 223762, 220277]

GCS_score_id = [223901, 223900, 220739]

Vent_para_id = [220339, 224700, 224685, 224684, 224686, 223835, 223848, 223849]

Labs_id = [225624, 226536, 220602, 227464, 226534, 226537, 229761, 220653, 220546, 227466, 227467, 227457, 220274, 223830, 
           220228, 220235, 220224, 226062, 226063, 227456, 226540, 224828, 220635, 220545, 220615, 220621, 220645]

General_id = [224639, 226260, 226512, 226531, 226892, 227428]

ADT_id = [220003, 226228, 226545]

add_id_1 = [224719, 226862, 228878, 225624, 220615, 229761, 227465, 227442, 227443, 
            225651, 225690, 226566, 227489, 226627, 220994, 227519, 227488, 225667, 228699, 228709, 228713, 
            228703, 228704, 228705, 225309, 225310, 220227, 223830, 224688, 224689]

In [8]:
variable_list = VitalSigns_id + GCS_score_id + Vent_para_id + Labs_id + General_id + ADT_id + add_id_1

d_items_data_1 = d_items_data[d_items_data['itemid'].isin(variable_list)].copy()
len(d_items_data_1)

86

In [9]:
d_items_data_1[d_items_data_1['label'] == 'Inspired O2 Fraction']

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
384,223835,Inspired O2 Fraction,FiO2,chartevents,Respiratory,,Numeric,,


In [10]:
item_id_list = d_items_data_1['itemid'].tolist()

In [11]:
# Define column data types explicitly
dtypes = {
    'cgid': 'float64',
    'stay_id': 'float64',
    'error': 'float64',
    'resultstatus': 'object',
    'stopped': 'object',
    'value': 'object',
    'valuenum': 'float64',
    'warning': 'float64',
    'valueuom': 'object',
    'caregiver_id': 'float64'
}

# Read the CSV file using Dask
chart_events = dd.read_csv(
    '../icu_data/mimic_iv/chartevents.csv.gz',
    dtype = dtypes,
    compression = 'gzip',
    assume_missing = True,
    blocksize = None
)

chart_events = chart_events[chart_events.itemid.isin(item_id_list)]

# Compute the Dask DataFrame into a Pandas DataFrame with progress monitoring
try:
    with ProgressBar():
        chart_events_data = chart_events.compute()
    print("Data successfully loaded!")
except Exception as e:
    print(f"Error computing DataFrame: {e}")

[########################################] | 100% Completed | 17m 27s
Data successfully loaded!


In [None]:
# chart_events_data

In [None]:
# chart_events_data.to_csv('../icu_data/mimic_iv/chart_events_data.csv', index = False)

In [None]:
# chart_events_data = pd.read_csv('../icu_data/mimic_iv/chart_events_data.csv')

In [12]:
ICU_unit = ['Medical Intensive Care Unit (MICU)', 
            'Surgical Intensive Care Unit (SICU)', 
            'Medical/Surgical Intensive Care Unit (MICU/SICU)', 
            'Cardiac Vascular Intensive Care Unit (CVICU)', 
            'Coronary Care Unit (CCU)', 
            'Trauma SICU (TSICU)']

ICU_patient_data_test = ICU_patient_data[ICU_patient_data['first_careunit'].isin(ICU_unit)].copy()

In [13]:
chart_events_data_1 = chart_events_data[chart_events_data['stay_id'].isin(ICU_patient_data_test['stay_id'])].copy()

In [14]:
# chart_events_data_1

In [15]:
d_items_data_1['linksto'].value_counts()

chartevents         80
outputevents         4
datetimeevents       1
ingredientevents     1
Name: linksto, dtype: int64

In [16]:
d_items_data_chart = d_items_data_1[d_items_data_1['linksto'] == 'chartevents'].copy()
d_items_data_output = d_items_data_1[d_items_data_1['linksto'] == 'outputevents'].copy()
d_items_data_datetime = d_items_data_1[d_items_data_1['linksto'] == 'datetimeevents'].copy()
d_items_data_ingred = d_items_data_1[d_items_data_1['linksto'] == 'ingredientevents'].copy()

In [17]:
d_items_data_chart

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
5,220048,Heart Rhythm,Heart Rhythm,chartevents,Routine Vital Signs,,Text,,
6,220050,Arterial Blood Pressure systolic,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,90.0,140.0
7,220051,Arterial Blood Pressure diastolic,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,60.0,90.0
8,220052,Arterial Blood Pressure mean,ABPm,chartevents,Routine Vital Signs,mmHg,Numeric,,
...,...,...,...,...,...,...,...,...,...
3116,228705,Nutrition,Nutrition,chartevents,MD Progress Note,,Text,,
3120,228709,Respiratory,Respiratory,chartevents,MD Progress Note,,Text,,
3124,228713,Vascular,Vascular,chartevents,MD Progress Note,,Text,,
3270,228878,PeCO2,PeCO2,chartevents,Respiratory,mmHg,Numeric,,


- Select and mark patients

In [18]:
ICU_unit = ['Medical Intensive Care Unit (MICU)', 
            'Surgical Intensive Care Unit (SICU)', 
            'Medical/Surgical Intensive Care Unit (MICU/SICU)', 
            'Cardiac Vascular Intensive Care Unit (CVICU)', 
            'Coronary Care Unit (CCU)', 
            'Trauma SICU (TSICU)']

ICU_patient_data_test = ICU_patient_data[ICU_patient_data['first_careunit'].isin(ICU_unit)].copy()

In [19]:
ICU_patient_data_s_15 = ICU_patient_data_test.copy()
# ICU_patient_data_s_15 = ICU_patient_data_test[ICU_patient_data_test['los'] <= 15.00].copy()
# ICU_patient_data_s_30 = ICU_patient_data_test[ICU_patient_data_test['los'] <= 30.00].copy()

In [20]:
ICU_patient_data_s_15 = ICU_patient_data_s_15.reset_index(drop = True)

In [21]:
chart_events_data_1 = chart_events_data_1.reset_index(drop = True)

In [22]:
ICU_patient_data_s_15['intime'] = pd.to_datetime(ICU_patient_data_s_15['intime'])
ICU_patient_data_s_15['outtime'] = pd.to_datetime(ICU_patient_data_s_15['outtime'])

ICU_patient_data_s_15['TD_LOS'] = ICU_patient_data_s_15['outtime'] - ICU_patient_data_s_15['intime']

In [23]:
ICU_patient_data_s_15 = ICU_patient_data_s_15.sort_values(by = ['subject_id', 'intime'])

pa_list = pd.unique(ICU_patient_data_s_15['subject_id'])
icu_list = pd.unique(ICU_patient_data_s_15['stay_id'])

# build the readmission list
icu_rd_list = []

for i in range(len(pa_list)):
    sub_data = ICU_patient_data_s_15[ICU_patient_data_s_15['subject_id'] == pa_list[i]]
    if len(pd.unique(sub_data['stay_id'])) > 1:
        icu_rd_list.append(pa_list[i])

ICU_patient_data_rd = ICU_patient_data_s_15[ICU_patient_data_s_15['subject_id'].isin(icu_rd_list)].copy()

pa_list_d_7 = []
pa_list_d_14 = []
pa_list_d_21 = []
pa_list_d_30 = []
pa_list_d_60 = []
pa_list_d_90 = []

icu_rd_7_list = []
icu_rd_14_list = []
icu_rd_21_list = []
icu_rd_30_list = []
icu_rd_60_list = []
icu_rd_90_list = []

dist_fail_7_list = []
dist_fail_14_list = []
dist_fail_21_list = []
dist_fail_30_list = []
dist_fail_60_list = []
dist_fail_90_list = []

for i in tqdm(range(len(icu_rd_list))):
    sub_data = ICU_patient_data_rd[ICU_patient_data_rd['subject_id'] == icu_rd_list[i]]
    
    for j in range(1, len(sub_data)):
        if sub_data['stay_id'].iloc[j] != sub_data['stay_id'].iloc[j-1]:

            if sub_data['intime'].iloc[j] - sub_data['outtime'].iloc[j-1] <= pd.Timedelta('7 days 00:00:00'):
                pa_list_d_7.append(icu_rd_list[i])
                dist_fail_7_list.append(sub_data['stay_id'].iloc[j - 1])
                icu_rd_7_list.append(sub_data['stay_id'].iloc[j])
                
            if sub_data['intime'].iloc[j] - sub_data['outtime'].iloc[j-1] <= pd.Timedelta('14 days 00:00:00'):
                pa_list_d_14.append(icu_rd_list[i])
                dist_fail_14_list.append(sub_data['stay_id'].iloc[j - 1])
                icu_rd_14_list.append(sub_data['stay_id'].iloc[j])

            if sub_data['intime'].iloc[j] - sub_data['outtime'].iloc[j-1] <= pd.Timedelta('21 days 00:00:00'):
                pa_list_d_21.append(icu_rd_list[i])
                dist_fail_21_list.append(sub_data['stay_id'].iloc[j - 1])
                icu_rd_21_list.append(sub_data['stay_id'].iloc[j])

            if sub_data['intime'].iloc[j] - sub_data['outtime'].iloc[j-1] <= pd.Timedelta('30 days 00:00:00'):
                pa_list_d_30.append(icu_rd_list[i])
                dist_fail_30_list.append(sub_data['stay_id'].iloc[j - 1])
                icu_rd_30_list.append(sub_data['stay_id'].iloc[j])

            if sub_data['intime'].iloc[j] - sub_data['outtime'].iloc[j-1] <= pd.Timedelta('60 days 00:00:00'):
                pa_list_d_60.append(icu_rd_list[i])
                dist_fail_60_list.append(sub_data['stay_id'].iloc[j - 1])
                icu_rd_60_list.append(sub_data['stay_id'].iloc[j])

            if sub_data['intime'].iloc[j] - sub_data['outtime'].iloc[j-1] <= pd.Timedelta('90 days 00:00:00'):
                pa_list_d_90.append(icu_rd_list[i])
                dist_fail_90_list.append(sub_data['stay_id'].iloc[j - 1])
                icu_rd_90_list.append(sub_data['stay_id'].iloc[j])      
        
        else:
            print("Error: ", sub_data['stay_id'].iloc[j])          

100%|███████████████████████████████████████████████████████████████████████████| 14517/14517 [00:43<00:00, 337.22it/s]


In [24]:
ICU_patient_data_s_15['discharge_fail_7_day'] = 0
ICU_patient_data_s_15['discharge_fail_14_day'] = 0
ICU_patient_data_s_15['discharge_fail_21_day'] = 0
ICU_patient_data_s_15['discharge_fail_30_day'] = 0
ICU_patient_data_s_15['discharge_fail_60_day'] = 0
ICU_patient_data_s_15['discharge_fail_90_day'] = 0

ICU_patient_data_s_15['readmission_7_day'] = 0
ICU_patient_data_s_15['readmission_14_day'] = 0
ICU_patient_data_s_15['readmission_21_day'] = 0
ICU_patient_data_s_15['readmission_30_day'] = 0
ICU_patient_data_s_15['readmission_60_day'] = 0
ICU_patient_data_s_15['readmission_90_day'] = 0


# Mark discharge failures for each time window
for stay_id in dist_fail_7_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'discharge_fail_7_day'] = 1

for stay_id in dist_fail_14_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'discharge_fail_14_day'] = 1

for stay_id in dist_fail_21_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'discharge_fail_21_day'] = 1

for stay_id in dist_fail_30_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'discharge_fail_30_day'] = 1

for stay_id in dist_fail_60_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'discharge_fail_60_day'] = 1

for stay_id in dist_fail_90_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'discharge_fail_90_day'] = 1


# Mark readmissions for each time window
for stay_id in icu_rd_7_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'readmission_7_day'] = 1

for stay_id in icu_rd_14_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'readmission_14_day'] = 1
    
for stay_id in icu_rd_21_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'readmission_21_day'] = 1

for stay_id in icu_rd_30_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'readmission_30_day'] = 1

for stay_id in icu_rd_60_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'readmission_60_day'] = 1

for stay_id in icu_rd_90_list:
    ICU_patient_data_s_15.loc[ICU_patient_data_s_15['stay_id'] == stay_id, 'readmission_90_day'] = 1

In [25]:
ICU_patient_data_s_15[ICU_patient_data_s_15['subject_id'] == 16133115]

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,TD_LOS,discharge_fail_7_day,...,discharge_fail_21_day,discharge_fail_30_day,discharge_fail_60_day,discharge_fail_90_day,readmission_7_day,readmission_14_day,readmission_21_day,readmission_30_day,readmission_60_day,readmission_90_day
52279,16133115,26364901,32772743,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2117-01-10 13:42:01,2117-01-15 18:32:33,5.201759,5 days 04:50:32,0,...,0,0,0,0,0,0,0,0,0,0
52271,16133115,23529718,30164948,Trauma SICU (TSICU),Trauma SICU (TSICU),2118-07-27 16:05:23,2118-08-02 17:19:04,6.051169,6 days 01:13:41,0,...,0,0,1,1,0,0,0,0,0,0
52281,16133115,27701111,30424582,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2118-09-03 04:27:27,2118-09-04 18:12:56,1.573252,1 days 13:45:29,0,...,0,0,0,0,0,0,0,0,1,1
52276,16133115,24673862,38070632,Trauma SICU (TSICU),Trauma SICU (TSICU),2120-09-08 07:30:10,2120-10-09 18:02:15,31.438947,31 days 10:32:05,1,...,1,1,1,1,0,0,0,0,0,0
52274,16133115,24673862,36863807,Trauma SICU (TSICU),Trauma SICU (TSICU),2120-10-13 12:28:39,2120-10-16 22:24:51,3.414028,3 days 09:56:12,0,...,0,1,1,1,1,1,1,1,1,1
52278,16133115,24673862,38660441,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2120-11-10 00:31:24,2120-11-25 22:17:06,15.906736,15 days 21:45:42,0,...,1,1,1,1,0,0,0,1,1,1
52273,16133115,24673862,32583672,Trauma SICU (TSICU),Trauma SICU (TSICU),2120-12-07 18:26:25,2121-01-16 18:32:02,40.0039,40 days 00:05:37,0,...,1,1,1,1,0,1,1,1,1,1
52272,16133115,24673862,30207372,Trauma SICU (TSICU),Trauma SICU (TSICU),2121-02-06 12:25:33,2121-02-07 21:17:19,1.369282,1 days 08:51:46,1,...,1,1,1,1,0,0,1,1,1,1
52277,16133115,24673862,38271504,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2121-02-09 22:01:15,2121-02-14 18:40:35,4.860648,4 days 20:39:20,1,...,1,1,1,1,1,1,1,1,1,1
52275,16133115,24673862,37990758,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2121-02-18 14:53:49,2121-03-22 23:19:48,32.351377,32 days 08:25:59,0,...,0,0,0,1,1,1,1,1,1,1


In [26]:
admission_data = pd.read_csv('../icu_data/mimic_iv/admissions.csv.gz', compression = 'gzip')
patients_data = pd.read_csv('../icu_data/mimic_iv/patients.csv.gz', compression = 'gzip')

In [27]:
patients_data_select = patients_data.drop(columns = ['anchor_year', 'anchor_year_group'])
admission_data_select = admission_data[['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'race']]

patients_data_select = patients_data_select[patients_data_select['subject_id'].isin(ICU_patient_data_s_15['subject_id'])]
admission_data_select = admission_data_select[admission_data_select['subject_id'].isin(ICU_patient_data_s_15['subject_id'])]

admission_data_select_v1 = admission_data_select[['subject_id', 'race']].copy()
admission_data_select_v1 = admission_data_select_v1.drop_duplicates(subset = ['subject_id'], keep = 'first')

ICU_patient_data_s15_v1 = pd.merge(ICU_patient_data_s_15, admission_data_select_v1, on = 'subject_id', how = 'left')
ICU_patient_data_s15_v2 = pd.merge(ICU_patient_data_s15_v1, patients_data_select, on = 'subject_id', how = 'left')

In [28]:
ICU_patient_data_s15_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85181 entries, 0 to 85180
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype          
---  ------                 --------------  -----          
 0   subject_id             85181 non-null  int64          
 1   hadm_id                85181 non-null  int64          
 2   stay_id                85181 non-null  int64          
 3   first_careunit         85181 non-null  object         
 4   last_careunit          85181 non-null  object         
 5   intime                 85181 non-null  datetime64[ns] 
 6   outtime                85168 non-null  datetime64[ns] 
 7   los                    85168 non-null  float64        
 8   TD_LOS                 85168 non-null  timedelta64[ns]
 9   discharge_fail_7_day   85181 non-null  int64          
 10  discharge_fail_14_day  85181 non-null  int64          
 11  discharge_fail_21_day  85181 non-null  int64          
 12  discharge_fail_30_day  85181 non-null  int64  

In [29]:
ICU_patient_data_s15_v2['dod'] = pd.to_datetime(ICU_patient_data_s15_v2['dod'])
ICU_patient_data_s15_v2['TD_death_disch'] = ICU_patient_data_s15_v2['dod'] - ICU_patient_data_s15_v2['outtime']

ICU_patient_data_s15_v2['death_in_ICU'] = 0

ICU_patient_data_s15_v2['death_out_ICU_7_day'] = 0
ICU_patient_data_s15_v2['death_out_ICU_14_day'] = 0
ICU_patient_data_s15_v2['death_out_ICU_21_day'] = 0
ICU_patient_data_s15_v2['death_out_ICU_30_day'] = 0
ICU_patient_data_s15_v2['death_out_ICU_60_day'] = 0
ICU_patient_data_s15_v2['death_out_ICU_90_day'] = 0


# Mark death in ICU
ICU_patient_data_s15_v2.loc[ICU_patient_data_s15_v2['TD_death_disch'] <= pd.Timedelta(0), 'death_in_ICU'] = 1

# Mark death after ICU discharge within different time windows
ICU_patient_data_s15_v2.loc[(ICU_patient_data_s15_v2['TD_death_disch'] > pd.Timedelta(0)) & 
                           (ICU_patient_data_s15_v2['TD_death_disch'] <= pd.Timedelta(days = 7)), 'death_out_ICU_7_day'] = 1

ICU_patient_data_s15_v2.loc[(ICU_patient_data_s15_v2['TD_death_disch'] > pd.Timedelta(0)) &
                           (ICU_patient_data_s15_v2['TD_death_disch'] <= pd.Timedelta(days = 14)), 'death_out_ICU_14_day'] = 1

ICU_patient_data_s15_v2.loc[(ICU_patient_data_s15_v2['TD_death_disch'] > pd.Timedelta(0)) &
                           (ICU_patient_data_s15_v2['TD_death_disch'] <= pd.Timedelta(days = 21)), 'death_out_ICU_21_day'] = 1

ICU_patient_data_s15_v2.loc[(ICU_patient_data_s15_v2['TD_death_disch'] > pd.Timedelta(0)) &
                           (ICU_patient_data_s15_v2['TD_death_disch'] <= pd.Timedelta(days = 30)), 'death_out_ICU_30_day'] = 1

ICU_patient_data_s15_v2.loc[(ICU_patient_data_s15_v2['TD_death_disch'] > pd.Timedelta(0)) &
                           (ICU_patient_data_s15_v2['TD_death_disch'] <= pd.Timedelta(days = 60)), 'death_out_ICU_60_day'] = 1

ICU_patient_data_s15_v2.loc[(ICU_patient_data_s15_v2['TD_death_disch'] > pd.Timedelta(0)) &
                           (ICU_patient_data_s15_v2['TD_death_disch'] <= pd.Timedelta(days = 90)), 'death_out_ICU_90_day'] = 1

- Mark readmission count

In [30]:
list(ICU_patient_data_s15_v2.columns)

['subject_id',
 'hadm_id',
 'stay_id',
 'first_careunit',
 'last_careunit',
 'intime',
 'outtime',
 'los',
 'TD_LOS',
 'discharge_fail_7_day',
 'discharge_fail_14_day',
 'discharge_fail_21_day',
 'discharge_fail_30_day',
 'discharge_fail_60_day',
 'discharge_fail_90_day',
 'readmission_7_day',
 'readmission_14_day',
 'readmission_21_day',
 'readmission_30_day',
 'readmission_60_day',
 'readmission_90_day',
 'race',
 'gender',
 'anchor_age',
 'dod',
 'TD_death_disch',
 'death_in_ICU',
 'death_out_ICU_7_day',
 'death_out_ICU_14_day',
 'death_out_ICU_21_day',
 'death_out_ICU_30_day',
 'death_out_ICU_60_day',
 'death_out_ICU_90_day']

In [31]:
patient_list = pd.unique(ICU_patient_data_s15_v2['subject_id'])
icu_stay_list = pd.unique(ICU_patient_data_s15_v2['stay_id'])

In [32]:
time_windows = [7, 14, 21, 30, 60, 90]

for w in time_windows:
    ICU_patient_data_s15_v2[f'readmission_count_{w}_day'] = 0

patient_list = ICU_patient_data_s15_v2['subject_id'].unique()

for patient_id in patient_list:
    sub_data = ICU_patient_data_s15_v2.loc[ICU_patient_data_s15_v2['subject_id'] == patient_id]

    prev_counts = {w: 0 for w in time_windows}

    for idx, row in sub_data.iterrows():

        for w in time_windows:

            if row[f'readmission_{w}_day'] == 1:
                current_count = prev_counts[w] + 1
            
            else:
                # current_count = prev_counts[w]
                current_count = 0

            ICU_patient_data_s15_v2.at[idx, f'readmission_count_{w}_day'] = current_count

            prev_counts[w] = current_count

In [33]:
ICU_patient_data_s15_v2[ICU_patient_data_s15_v2['subject_id'] == 16133115][['stay_id', 'discharge_fail_30_day', 'readmission_count_30_day']]

Unnamed: 0,stay_id,discharge_fail_30_day,readmission_count_30_day
52271,32772743,0,0
52272,30164948,0,0
52273,30424582,0,0
52274,38070632,1,0
52275,36863807,1,1
52276,38660441,1,2
52277,32583672,1,3
52278,30207372,1,4
52279,38271504,1,5
52280,37990758,0,6


In [34]:
list(ICU_patient_data_s15_v2.columns)

['subject_id',
 'hadm_id',
 'stay_id',
 'first_careunit',
 'last_careunit',
 'intime',
 'outtime',
 'los',
 'TD_LOS',
 'discharge_fail_7_day',
 'discharge_fail_14_day',
 'discharge_fail_21_day',
 'discharge_fail_30_day',
 'discharge_fail_60_day',
 'discharge_fail_90_day',
 'readmission_7_day',
 'readmission_14_day',
 'readmission_21_day',
 'readmission_30_day',
 'readmission_60_day',
 'readmission_90_day',
 'race',
 'gender',
 'anchor_age',
 'dod',
 'TD_death_disch',
 'death_in_ICU',
 'death_out_ICU_7_day',
 'death_out_ICU_14_day',
 'death_out_ICU_21_day',
 'death_out_ICU_30_day',
 'death_out_ICU_60_day',
 'death_out_ICU_90_day',
 'readmission_count_7_day',
 'readmission_count_14_day',
 'readmission_count_21_day',
 'readmission_count_30_day',
 'readmission_count_60_day',
 'readmission_count_90_day']

In [35]:
len(list(ICU_patient_data_s15_v2.columns))

39

In [36]:
ICU_patient_data_s15_v2[['subject_id', 'stay_id', 'readmission_30_day', 'readmission_count_30_day']].head(50)

Unnamed: 0,subject_id,stay_id,readmission_30_day,readmission_count_30_day
0,10000032,39553978,0,0
1,10000690,37081114,0,0
2,10000980,39765666,0,0
3,10001217,37067082,0,0
4,10001217,34592300,1,1
5,10001725,31205490,0,0
6,10001843,39698942,0,0
7,10001884,37510196,0,0
8,10002013,39060235,0,0
9,10002114,34672098,0,0


- chart_events Data Preprocessing

In [37]:
d_items_data_chart_numeric = d_items_data_chart[d_items_data_chart['param_type'] == 'Numeric']
d_items_data_chart_text = d_items_data_chart[d_items_data_chart['param_type'] == 'Text']
d_items_data_chart_numeric_tag = d_items_data_chart[d_items_data_chart['param_type'] == 'Numeric with tag']
d_items_data_chart_checkbox = d_items_data_chart[d_items_data_chart['param_type'] == 'Checkbox']

In [38]:
d_items_data_chart_select = d_items_data_chart[~d_items_data_chart['label'].isin(['Ventilator Type', 'Ventilator Mode', 
                                                                                  'SaO2 < 90% > 2 min', 'Gender', 
                                                                                  'Race', 'Cardiovascular', 'Musculoskeletal', 
                                                                                  'Neurological', 'Nutrition', 'Respiratory', 
                                                                                  'Vascular', 'Mechanically Ventilated', 
                                                                                  'Re-admit < 48 hours'])]

In [39]:
chart_events_data_1[['subject_id', 'hadm_id', 'stay_id', 'itemid']] = chart_events_data_1[['subject_id', 'hadm_id', 'stay_id', 'itemid']].astype('int64')

In [40]:
chart_events_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78938869 entries, 0 to 78938868
Data columns (total 11 columns):
 #   Column        Dtype  
---  ------        -----  
 0   subject_id    int64  
 1   hadm_id       int64  
 2   stay_id       int64  
 3   caregiver_id  float64
 4   charttime     object 
 5   storetime     object 
 6   itemid        int64  
 7   value         object 
 8   valuenum      float64
 9   valueuom      object 
dtypes: float64(3), int64(4), object(4)
memory usage: 6.5+ GB


In [41]:
chart_events_data_2 = chart_events_data_1[chart_events_data_1['itemid'].isin(d_items_data_chart_select['itemid'])]

In [42]:
chart_events_data_2 = chart_events_data_2.reset_index(drop = True)

In [43]:
chart_events_data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77738078 entries, 0 to 77738077
Data columns (total 11 columns):
 #   Column        Dtype  
---  ------        -----  
 0   subject_id    int64  
 1   hadm_id       int64  
 2   stay_id       int64  
 3   caregiver_id  float64
 4   charttime     object 
 5   storetime     object 
 6   itemid        int64  
 7   value         object 
 8   valuenum      float64
 9   valueuom      object 
dtypes: float64(3), int64(4), object(4)
memory usage: 6.4+ GB


In [44]:
tuple_list = list(zip(d_items_data_chart_select["itemid"], d_items_data_chart_select["label"]))
print(tuple_list)

[(220045, 'Heart Rate'), (220048, 'Heart Rhythm'), (220050, 'Arterial Blood Pressure systolic'), (220051, 'Arterial Blood Pressure diastolic'), (220052, 'Arterial Blood Pressure mean'), (220179, 'Non Invasive Blood Pressure systolic'), (220180, 'Non Invasive Blood Pressure diastolic'), (220181, 'Non Invasive Blood Pressure mean'), (220210, 'Respiratory Rate'), (220224, 'Arterial O2 pressure'), (220227, 'Arterial O2 Saturation'), (220228, 'Hemoglobin'), (220235, 'Arterial CO2 Pressure'), (220274, 'PH (Venous)'), (220277, 'O2 saturation pulseoxymetry'), (220339, 'PEEP set'), (220545, 'Hematocrit (serum)'), (220546, 'WBC'), (220602, 'Chloride (serum)'), (220615, 'Creatinine (serum)'), (220621, 'Glucose (serum)'), (220635, 'Magnesium'), (220645, 'Sodium (serum)'), (220739, 'GCS - Eye Opening'), (223761, 'Temperature Fahrenheit'), (223762, 'Temperature Celsius'), (223830, 'PH (Arterial)'), (223835, 'Inspired O2 Fraction'), (223900, 'GCS - Verbal Response'), (223901, 'GCS - Motor Response'),

In [45]:
chart_events_data_2[chart_events_data_2['itemid'] == 228878]

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
3670,10001884,26184834,37510196,55165.0,2131-01-17 08:00:00,2131-01-17 08:44:00,228878,47,47.0,mmHg,0.0
3678,10001884,26184834,37510196,55165.0,2131-01-17 12:00:00,2131-01-17 13:01:00,228878,45,45.0,mmHg,0.0
121633,10014610,23258342,39959884,27327.0,2173-12-21 04:00:00,2173-12-21 04:44:00,228878,33,33.0,mmHg,0.0
170245,10019003,27525946,35214014,69634.0,2153-04-14 08:43:00,2153-04-14 08:43:00,228878,35,35.0,mmHg,0.0
201582,10021927,24623461,34575919,72896.0,2180-09-24 00:00:00,2180-09-24 04:16:00,228878,26,26.0,mmHg,0.0
...,...,...,...,...,...,...,...,...,...,...,...
77624511,19989783,26984195,32761676,36088.0,2130-07-22 07:00:00,2130-07-22 07:19:00,228878,37,37.0,mmHg,0.0
77624519,19989783,26984195,32761676,36088.0,2130-07-22 11:00:00,2130-07-22 11:56:00,228878,40,40.0,mmHg,0.0
77625188,19989783,26984195,32761676,69514.0,2130-07-22 00:00:00,2130-07-22 00:09:00,228878,45,45.0,mmHg,0.0
77625197,19989783,26984195,32761676,69514.0,2130-07-22 03:00:00,2130-07-22 03:47:00,228878,44,44.0,mmHg,0.0


In [46]:
# Counter(chart_events_data_2[chart_events_data_2['itemid'] == 220048]['value'])

In [47]:
var_delete_list = ['Heart Rhythm', 'PA %O2 Saturation (PA Line)', 'SOFA Score', 'Urine output_ApacheIV']

In [48]:
d_items_data_chart_select = d_items_data_chart_select[~d_items_data_chart_select['label'].isin(var_delete_list)]

In [49]:
d_items_data_chart_select

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
6,220050,Arterial Blood Pressure systolic,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,90.0,140.0
7,220051,Arterial Blood Pressure diastolic,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,60.0,90.0
8,220052,Arterial Blood Pressure mean,ABPm,chartevents,Routine Vital Signs,mmHg,Numeric,,
24,220179,Non Invasive Blood Pressure systolic,NBPs,chartevents,Routine Vital Signs,mmHg,Numeric,,
...,...,...,...,...,...,...,...,...,...
2226,227465,Prothrombin time,PT,chartevents,Labs,,Numeric with tag,,
2227,227466,PTT,PTT,chartevents,Labs,,Numeric with tag,,
2228,227467,INR,INR,chartevents,Labs,,Numeric with tag,,
3270,228878,PeCO2,PeCO2,chartevents,Respiratory,mmHg,Numeric,,


In [50]:
# physio_table_7_day = {'subject_id':[], 'hadm_id':[], 'stay_id':[], 
#                       'time':[],
#                       'icu_starttime':[], 'icu_endtime':[], 'los':[],
#                       'discharge_fail':[], 
#                       'readmission':[], 'readmission_count':[],
#                       'death_in_ICU':[], 'death_out_ICU':[], 
#                       'age':[], 'gender':[], 'race':[]}

# for label in d_items_data_chart_select['label']:
#     physio_table_7_day[label] = []

# physio_table_7_day

In [51]:
[len(pd.unique(chart_events_data_2['stay_id'])), len(pd.unique(ICU_patient_data_s15_v2['stay_id']))]

[85171, 85181]

In [52]:
chart_events_data_3 = chart_events_data_2[chart_events_data_2['stay_id'].isin(ICU_patient_data_s15_v2['stay_id'])].copy()

In [53]:
[len(pd.unique(chart_events_data_3['stay_id'])), len(pd.unique(ICU_patient_data_s15_v2['stay_id']))]

[85171, 85181]

In [54]:
[len(pd.unique(chart_events_data_3['subject_id'])), len(pd.unique(ICU_patient_data_s15_v2['subject_id']))]

[58879, 58879]

In [55]:
drop_patient_list = pd.unique(ICU_patient_data_s15_v2[~ICU_patient_data_s15_v2['stay_id'].isin(chart_events_data_3['stay_id'])]['subject_id'])

In [56]:
drop_patient_list

array([10702059, 11952041, 14030959, 15386471, 15496226, 15711279,
       16316457, 17468902, 18137539, 18223988], dtype=int64)

In [57]:
ICU_patient_data_s15_v2 = ICU_patient_data_s15_v2[~ICU_patient_data_s15_v2['subject_id'].isin(drop_patient_list)]

In [58]:
ICU_patient_data_s15_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85144 entries, 0 to 85180
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype          
---  ------                    --------------  -----          
 0   subject_id                85144 non-null  int64          
 1   hadm_id                   85144 non-null  int64          
 2   stay_id                   85144 non-null  int64          
 3   first_careunit            85144 non-null  object         
 4   last_careunit             85144 non-null  object         
 5   intime                    85144 non-null  datetime64[ns] 
 6   outtime                   85131 non-null  datetime64[ns] 
 7   los                       85131 non-null  float64        
 8   TD_LOS                    85131 non-null  timedelta64[ns]
 9   discharge_fail_7_day      85144 non-null  int64          
 10  discharge_fail_14_day     85144 non-null  int64          
 11  discharge_fail_21_day     85144 non-null  int64          
 12  disc

In [59]:
drop_patient_list = pd.unique(ICU_patient_data_s15_v2[ICU_patient_data_s15_v2['los'].isnull()]['subject_id'])

In [60]:
drop_patient_list

array([10492274, 10882284, 11661851, 11783844, 14330929, 15777534,
       15882332, 16117624, 16348177, 16799689, 17434223, 18717462,
       19526758], dtype=int64)

In [61]:
ICU_patient_data_s15_v2 = ICU_patient_data_s15_v2[~ICU_patient_data_s15_v2['subject_id'].isin(drop_patient_list)]

In [62]:
chart_events_data_3 = chart_events_data_3[chart_events_data_3['stay_id'].isin(ICU_patient_data_s15_v2['stay_id'])].copy()

In [63]:
[len(pd.unique(chart_events_data_3['stay_id'])), len(pd.unique(ICU_patient_data_s15_v2['stay_id']))]

[85129, 85129]

In [64]:
chart_events_data_3 = chart_events_data_3.reset_index(drop = True)

In [65]:
chart_events_data_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77662312 entries, 0 to 77662311
Data columns (total 11 columns):
 #   Column        Dtype  
---  ------        -----  
 0   subject_id    int64  
 1   hadm_id       int64  
 2   stay_id       int64  
 3   caregiver_id  float64
 4   charttime     object 
 5   storetime     object 
 6   itemid        int64  
 7   value         object 
 8   valuenum      float64
 9   valueuom      object 
dtypes: float64(3), int64(4), object(4)
memory usage: 6.4+ GB


In [66]:
ICU_patient_data_s15_v2 = ICU_patient_data_s15_v2.reset_index(drop = True)

In [67]:
ICU_patient_data_s15_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85129 entries, 0 to 85128
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype          
---  ------                    --------------  -----          
 0   subject_id                85129 non-null  int64          
 1   hadm_id                   85129 non-null  int64          
 2   stay_id                   85129 non-null  int64          
 3   first_careunit            85129 non-null  object         
 4   last_careunit             85129 non-null  object         
 5   intime                    85129 non-null  datetime64[ns] 
 6   outtime                   85129 non-null  datetime64[ns] 
 7   los                       85129 non-null  float64        
 8   TD_LOS                    85129 non-null  timedelta64[ns]
 9   discharge_fail_7_day      85129 non-null  int64          
 10  discharge_fail_14_day     85129 non-null  int64          
 11  discharge_fail_21_day     85129 non-null  int64          
 12  disc

In [68]:
icu_stay_list = list(ICU_patient_data_s15_v2['stay_id'])

There are ten ICU admissions without any char_event records.

In [69]:
chart_events_data_3['charttime'] = pd.to_datetime(chart_events_data_3['charttime'])
chart_events_data_3['storetime'] = pd.to_datetime(chart_events_data_3['storetime'])

In [70]:
chart_events_data_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77662312 entries, 0 to 77662311
Data columns (total 11 columns):
 #   Column        Dtype         
---  ------        -----         
 0   subject_id    int64         
 1   hadm_id       int64         
 2   stay_id       int64         
 3   caregiver_id  float64       
 4   charttime     datetime64[ns]
 5   storetime     datetime64[ns]
 6   itemid        int64         
 7   value         object        
 8   valuenum      float64       
 9   valueuom      object        
dtypes: datetime64[ns](2), float64(3), int64(4), object(2)
memory usage: 6.4+ GB


In [71]:
# Counter(ICU_patient_data_s15_v2['readmission_count_7_day'])

In [72]:
# ICU_patient_data_s15_v2['readmission_count_7_day'].to_numpy().max()

In [73]:
def data_select(data, i_1, i_2, i_3):
    sub_data = data.loc[(data['charttime'] >= i_1) & 
                        (data['charttime'] <= i_2) & 
                        (data["itemid"] == i_3)]
    return sub_data

In [74]:
# for i in range(len(icu_stay_list)):
    
#     print("The number of processed ICU stay admissions: ", i)
     
#     index = ICU_patient_data_s15_v2["intime"].iloc[i]
    
#     s_table_id = chart_events_data_3[chart_events_data_3['stay_id'] == icu_stay_list[i]]

#     while index <= ICU_patient_data_s15_v2["outtime"].iloc[i]:
#         physio_table_7_day['subject_id'].append(ICU_patient_data_s15_v2['subject_id'].iloc[i])
#         physio_table_7_day['hadm_id'].append(ICU_patient_data_s15_v2['hadm_id'].iloc[i])
#         physio_table_7_day['stay_id'].append(ICU_patient_data_s15_v2['stay_id'].iloc[i])
#         physio_table_7_day['icu_starttime'].append(ICU_patient_data_s15_v2['intime'].iloc[i])
#         physio_table_7_day['icu_endtime'].append(ICU_patient_data_s15_v2['outtime'].iloc[i]) 
#         physio_table_7_day['los'].append(ICU_patient_data_s15_v2['los'].iloc[i])        
#         physio_table_7_day['discharge_fail'].append(ICU_patient_data_s15_v2['discharge_fail_7_day'].iloc[i])
#         physio_table_7_day['readmission'].append(ICU_patient_data_s15_v2['readmission_7_day'].iloc[i])
#         physio_table_7_day['readmission_count'].append(ICU_patient_data_s15_v2['readmission_count_7_day'].iloc[i])
#         physio_table_7_day['death_in_ICU'].append(ICU_patient_data_s15_v2['death_in_ICU'].iloc[i])
#         physio_table_7_day['death_out_ICU'].append(ICU_patient_data_s15_v2['death_out_ICU_7_day'].iloc[i])
#         physio_table_7_day['age'].append(ICU_patient_data_s15_v2['anchor_age'].iloc[i])
#         physio_table_7_day['gender'].append(ICU_patient_data_s15_v2['gender'].iloc[i])
#         physio_table_7_day['race'].append(ICU_patient_data_s15_v2['race'].iloc[i])
        
#         td = pd.Timedelta('0 days 12:00:00')
#         rd_idx = physio_table_7_day['readmission_count'][-1]
        
#         index_1 = index + td * (0.5**rd_idx)
        
#         if index_1 <= ICU_patient_data_s15_v2["outtime"].iloc[i]:
#             physio_table_7_day['time'].append(index_1)
#         else:
#             index_1 = ICU_patient_data_s15_v2["outtime"].iloc[i]
#             physio_table_7_day['time'].append(index_1)
            
#         for j in range(len(d_items_data_chart_select)):
#             s_table = data_select(s_table_id, 
#                                   index, 
#                                   index_1,
#                                   d_items_data_chart_select["itemid"].iloc[j])

#             n = len(s_table)

#             if n >= 1:
#                 physio_table_7_day[d_items_data_chart_select['label'].iloc[j]].append(s_table['valuenum'].mean())
#                 # physio_table[d_items_data_chart_select['label'].iloc[j]].append(s_table['valuenum'].iloc[-1])

#             else:
#                 physio_table_7_day[d_items_data_chart_select['label'].iloc[j]].append(np.nan)

#         index = index + td * (0.5**rd_idx)

In [75]:
# physio_df_7d = pd.DataFrame.from_dict(physio_table_7_day)
# physio_df_7d.to_csv('../icu_data/mimic_iv/physio_df_7d.csv', index = False)

In [76]:
physio_table_30_day = {'subject_id':[], 'hadm_id':[], 'stay_id':[], 
                      'time':[],
                      'icu_starttime':[], 'icu_endtime':[], 'los':[],
                      'discharge_fail':[], 
                      'readmission':[], 'readmission_count':[],
                      'death_in_ICU':[], 'death_out_ICU':[], 
                      'age':[], 'gender':[], 'race':[]}

for label in d_items_data_chart_select['label']:
    physio_table_30_day[label] = []

physio_table_30_day

{'subject_id': [],
 'hadm_id': [],
 'stay_id': [],
 'time': [],
 'icu_starttime': [],
 'icu_endtime': [],
 'los': [],
 'discharge_fail': [],
 'readmission': [],
 'readmission_count': [],
 'death_in_ICU': [],
 'death_out_ICU': [],
 'age': [],
 'gender': [],
 'race': [],
 'Heart Rate': [],
 'Arterial Blood Pressure systolic': [],
 'Arterial Blood Pressure diastolic': [],
 'Arterial Blood Pressure mean': [],
 'Non Invasive Blood Pressure systolic': [],
 'Non Invasive Blood Pressure diastolic': [],
 'Non Invasive Blood Pressure mean': [],
 'Respiratory Rate': [],
 'Arterial O2 pressure': [],
 'Arterial O2 Saturation': [],
 'Hemoglobin': [],
 'Arterial CO2 Pressure': [],
 'PH (Venous)': [],
 'O2 saturation pulseoxymetry': [],
 'PEEP set': [],
 'Hematocrit (serum)': [],
 'WBC': [],
 'Chloride (serum)': [],
 'Creatinine (serum)': [],
 'Glucose (serum)': [],
 'Magnesium': [],
 'Sodium (serum)': [],
 'GCS - Eye Opening': [],
 'Temperature Fahrenheit': [],
 'Temperature Celsius': [],
 'PH (Arter

In [77]:
ICU_patient_data_s15_v2['readmission_count_30_day'].to_numpy().max()

35

In [78]:
Counter(ICU_patient_data_s15_v2['readmission_count_30_day'])

Counter({0: 72976,
         1: 9411,
         2: 1931,
         3: 512,
         4: 173,
         5: 64,
         6: 21,
         7: 8,
         8: 4,
         9: 2,
         10: 2,
         11: 1,
         12: 1,
         13: 1,
         14: 1,
         15: 1,
         16: 1,
         17: 1,
         18: 1,
         19: 1,
         20: 1,
         21: 1,
         22: 1,
         23: 1,
         24: 1,
         25: 1,
         26: 1,
         27: 1,
         28: 1,
         29: 1,
         30: 1,
         31: 1,
         32: 1,
         33: 1,
         34: 1,
         35: 1})

In [79]:
ICU_patient_data_s15_v2[ICU_patient_data_s15_v2['readmission_count_30_day'] == 35]

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,TD_LOS,discharge_fail_7_day,...,death_out_ICU_21_day,death_out_ICU_30_day,death_out_ICU_60_day,death_out_ICU_90_day,readmission_count_7_day,readmission_count_14_day,readmission_count_21_day,readmission_count_30_day,readmission_count_60_day,readmission_count_90_day
71208,18358138,29844902,32504293,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2120-04-07 21:15:00,2120-04-08 15:20:34,0.753866,0 days 18:05:34,0,...,1,1,1,1,0,1,1,35,35,36


In [80]:
ICU_patient_data_s15_v2[ICU_patient_data_s15_v2['subject_id'] == 18358138]

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,TD_LOS,discharge_fail_7_day,...,death_out_ICU_21_day,death_out_ICU_30_day,death_out_ICU_60_day,death_out_ICU_90_day,readmission_count_7_day,readmission_count_14_day,readmission_count_21_day,readmission_count_30_day,readmission_count_60_day,readmission_count_90_day
71172,18358138,28786556,36640493,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2119-01-20 11:50:00,2119-01-24 15:11:37,4.140012,4 days 03:21:37,0,...,0,0,0,0,0,0,0,0,0,0
71173,18358138,29451022,33083516,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2119-04-16 21:16:00,2119-04-17 19:41:08,0.93412,0 days 22:25:08,1,...,0,0,0,0,0,0,0,0,0,1
71174,18358138,27404457,34488631,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2119-04-22 10:59:00,2119-04-23 16:53:07,1.245914,1 days 05:54:07,0,...,0,0,0,0,1,1,1,1,1,2
71175,18358138,25474193,31417816,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2119-05-01 20:44:36,2119-05-03 21:44:50,2.041829,2 days 01:00:14,0,...,0,0,0,0,0,2,2,2,2,3
71176,18358138,21075980,32603397,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2119-05-25 00:16:00,2119-05-29 20:42:32,4.851759,4 days 20:26:32,1,...,0,0,0,0,0,0,0,3,3,4
71177,18358138,21075980,37652411,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2119-05-30 04:37:48,2119-05-31 20:11:23,1.648322,1 days 15:33:35,1,...,0,0,0,0,1,1,1,4,4,5
71178,18358138,21075980,32054856,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2119-06-01 02:07:21,2119-06-05 20:49:04,4.77897,4 days 18:41:43,0,...,0,0,0,0,2,2,2,5,5,6
71179,18358138,23104977,34368548,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2119-06-29 15:01:00,2119-06-30 21:35:18,1.273819,1 days 06:34:18,1,...,0,0,0,0,0,0,0,6,6,7
71180,18358138,27192918,37231661,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2119-07-05 17:38:00,2119-07-07 21:51:50,2.176273,2 days 04:13:50,0,...,0,0,0,0,1,1,1,7,7,8
71181,18358138,26386811,34115393,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2119-07-19 02:48:00,2119-07-20 21:07:43,1.763692,1 days 18:19:43,0,...,0,0,0,0,0,2,2,8,8,9


In [None]:
for i in range(len(icu_stay_list)):
    
    print("The number of processed ICU stay admissions: ", i)
     
    index = ICU_patient_data_s15_v2["intime"].iloc[i]
    
    s_table_id = chart_events_data_3[chart_events_data_3['stay_id'] == icu_stay_list[i]]

    while index <= ICU_patient_data_s15_v2["outtime"].iloc[i]:
        physio_table_30_day['subject_id'].append(ICU_patient_data_s15_v2['subject_id'].iloc[i])
        physio_table_30_day['hadm_id'].append(ICU_patient_data_s15_v2['hadm_id'].iloc[i])
        physio_table_30_day['stay_id'].append(ICU_patient_data_s15_v2['stay_id'].iloc[i])
        physio_table_30_day['icu_starttime'].append(ICU_patient_data_s15_v2['intime'].iloc[i])
        physio_table_30_day['icu_endtime'].append(ICU_patient_data_s15_v2['outtime'].iloc[i]) 
        physio_table_30_day['los'].append(ICU_patient_data_s15_v2['los'].iloc[i])        
        physio_table_30_day['discharge_fail'].append(ICU_patient_data_s15_v2['discharge_fail_30_day'].iloc[i])
        physio_table_30_day['readmission'].append(ICU_patient_data_s15_v2['readmission_30_day'].iloc[i])
        physio_table_30_day['readmission_count'].append(ICU_patient_data_s15_v2['readmission_count_30_day'].iloc[i])
        physio_table_30_day['death_in_ICU'].append(ICU_patient_data_s15_v2['death_in_ICU'].iloc[i])
        physio_table_30_day['death_out_ICU'].append(ICU_patient_data_s15_v2['death_out_ICU_30_day'].iloc[i])
        physio_table_30_day['age'].append(ICU_patient_data_s15_v2['anchor_age'].iloc[i])
        physio_table_30_day['gender'].append(ICU_patient_data_s15_v2['gender'].iloc[i])
        physio_table_30_day['race'].append(ICU_patient_data_s15_v2['race'].iloc[i])
        
        td = pd.Timedelta('0 days 12:00:00')
        rd_idx = physio_table_30_day['readmission_count'][-1]
        
        if rd_idx <= 4:
        
            index_1 = index + td * (0.5**rd_idx)

            if index_1 <= ICU_patient_data_s15_v2["outtime"].iloc[i]:
                physio_table_30_day['time'].append(index_1)
            else:
                index_1 = ICU_patient_data_s15_v2["outtime"].iloc[i]
                physio_table_30_day['time'].append(index_1)

            for j in range(len(d_items_data_chart_select)):
                s_table = data_select(s_table_id, 
                                      index, 
                                      index_1,
                                      d_items_data_chart_select["itemid"].iloc[j])

                n = len(s_table)

                if n >= 1:
                    physio_table_30_day[d_items_data_chart_select['label'].iloc[j]].append(s_table['valuenum'].mean())
                    # physio_table[d_items_data_chart_select['label'].iloc[j]].append(s_table['valuenum'].iloc[-1])

                else:
                    physio_table_30_day[d_items_data_chart_select['label'].iloc[j]].append(np.nan)

            index = index + td * (0.5**rd_idx)
            
        else:
            rd_idx = 4
            index_1 = index + td * (0.5**rd_idx)

            if index_1 <= ICU_patient_data_s15_v2["outtime"].iloc[i]:
                physio_table_30_day['time'].append(index_1)
            else:
                index_1 = ICU_patient_data_s15_v2["outtime"].iloc[i]
                physio_table_30_day['time'].append(index_1)

            for j in range(len(d_items_data_chart_select)):
                s_table = data_select(s_table_id, 
                                      index, 
                                      index_1,
                                      d_items_data_chart_select["itemid"].iloc[j])

                n = len(s_table)

                if n >= 1:
                    physio_table_30_day[d_items_data_chart_select['label'].iloc[j]].append(s_table['valuenum'].mean())
                    # physio_table[d_items_data_chart_select['label'].iloc[j]].append(s_table['valuenum'].iloc[-1])

                else:
                    physio_table_30_day[d_items_data_chart_select['label'].iloc[j]].append(np.nan)

            index = index + td * (0.5**rd_idx)

The number of processed ICU stay admissions:  0
The number of processed ICU stay admissions:  1
The number of processed ICU stay admissions:  2
The number of processed ICU stay admissions:  3
The number of processed ICU stay admissions:  4
The number of processed ICU stay admissions:  5
The number of processed ICU stay admissions:  6
The number of processed ICU stay admissions:  7
The number of processed ICU stay admissions:  8
The number of processed ICU stay admissions:  9
The number of processed ICU stay admissions:  10
The number of processed ICU stay admissions:  11
The number of processed ICU stay admissions:  12
The number of processed ICU stay admissions:  13
The number of processed ICU stay admissions:  14
The number of processed ICU stay admissions:  15
The number of processed ICU stay admissions:  16
The number of processed ICU stay admissions:  17
The number of processed ICU stay admissions:  18
The number of processed ICU stay admissions:  19
The number of processed ICU st

In [None]:
physio_df_30d = pd.DataFrame.from_dict(physio_table_30_day)

In [None]:
physio_df_30d.info()

In [None]:
physio_df_30d[physio_df_30d['subject_id'] == 16133115]

In [None]:
physio_df_30d[physio_df_30d['subject_id'] == 16133115][['subject_id', 'stay_id', 'discharge_fail', 'readmission', 'readmission_count']].iloc[60:100]

In [None]:
# physio_df_30d.to_csv('../icu_data/mimic_iv/physio_df_30d.csv', index = False)

## Data Preprocess - Part 2

In [None]:
physio_df = physio_df_30d.copy()

In [None]:
physio_df

- Tidal Volume

In [None]:
physio_df['Tidal Volume (set)'] = physio_df['Tidal Volume (set)']/1000
physio_df['Tidal Volume (observed)'] = physio_df['Tidal Volume (observed)']/1000
physio_df['Tidal Volume (spontaneous)'] = physio_df['Tidal Volume (spontaneous)']/1000

- Time information

In [None]:
physio_df['Tidal Volume (set)'] = physio_df['Tidal Volume (set)']/1000
physio_df['Tidal Volume (observed)'] = physio_df['Tidal Volume (observed)']/1000
physio_df['Tidal Volume (spontaneous)'] = physio_df['Tidal Volume (spontaneous)']/1000

- Gender information

In [None]:
gender_dummies = pd.get_dummies(physio_df.gender)
physio_df = pd.concat([physio_df, gender_dummies], axis = 'columns')

In [None]:
physio_df = physio_df.drop(columns = ['gender', 'F'])

- Race information

In [None]:
# race_dummies = pd.get_dummies(physio_df_7d.race, prefix='race')
# physio_df_7d = pd.concat([physio_df_7d, race_dummies], axis='columns')


physio_df = physio_df.drop(columns = ['race'])

- Discharge action

In [None]:
icu_stayid_list = physio_df['stay_id'].unique()

physio_df['discharge_action'] = 0

for i in range(len(icu_stayid_list)):

    time_idx = physio_df[(physio_df['stay_id'] == icu_stayid_list[i])]['time'].iloc[-1]
    
    physio_df.loc[(physio_df['stay_id'] == icu_stayid_list[i]) & (physio_df['time'] == time_idx), 'discharge_action'] = 1

- Blood pressure

In [None]:
def assign_blood_pressure(row):
    if pd.isna(row['Arterial Blood Pressure systolic']) and not pd.isna(row['Non Invasive Blood Pressure systolic']):
        return row['Non Invasive Blood Pressure systolic']
    elif not pd.isna(row['Arterial Blood Pressure systolic']):
        return row['Arterial Blood Pressure systolic']
    elif not pd.isna(row['ART BP Systolic']):
        return row['ART BP Systolic']
    else:
        return np.nan

physio_df['Blood Pressure Systolic'] = physio_df.apply(assign_blood_pressure, axis = 1)

In [None]:
def assign_blood_pressure_diastolic(row):
    if pd.isna(row['Arterial Blood Pressure diastolic']) and not pd.isna(row['Non Invasive Blood Pressure diastolic']):
        return row['Non Invasive Blood Pressure diastolic']
    elif not pd.isna(row['Arterial Blood Pressure diastolic']):
        return row['Arterial Blood Pressure diastolic']
    elif not pd.isna(row['ART BP Diastolic']):
        return row['ART BP Diastolic']
    else:
        return np.nan

physio_df['Blood Pressure Diastolic'] = physio_df.apply(assign_blood_pressure_diastolic, axis = 1)

In [None]:
def assign_blood_pressure_mean(row):
    if pd.isna(row['Arterial Blood Pressure mean']) and not pd.isna(row['Non Invasive Blood Pressure mean']):
        return row['Non Invasive Blood Pressure mean']
    elif not pd.isna(row['Arterial Blood Pressure mean']):
        return row['Arterial Blood Pressure mean']
    elif not pd.isna(row['ART BP Mean']):
        return row['ART BP Mean']
    else:
        return np.nan

physio_df['Blood Pressure Mean'] = physio_df.apply(assign_blood_pressure_mean, axis = 1)

- Temperature

In [None]:
def assign_temperature(row):
    if pd.isna(row['Temperature Celsius']) and not pd.isna(row['Temperature Fahrenheit']):
        return (row['Temperature Fahrenheit']-32) * 5.0/9.0
    elif not pd.isna(row['Temperature Celsius']):
        return row['Temperature Celsius']
    else:
        return np.nan

physio_df['Temperature C'] = physio_df.apply(assign_temperature, axis = 1)

- O2 Saturation

In [None]:
def assign_SaO2(row):
    if pd.isna(row['Arterial O2 Saturation']) and not pd.isna(row['O2 saturation pulseoxymetry']):
        return row['O2 saturation pulseoxymetry']
    elif not pd.isna(row['Arterial O2 Saturation']):
        return row['Arterial O2 Saturation']
    else:
        return np.nan

physio_df['SaO2'] = physio_df.apply(assign_SaO2, axis = 1)

- GCS score

In [None]:
def assign_gcs_score(row):
    return row['GCS - Eye Opening'] + row['GCS - Verbal Response'] + row['GCS - Motor Response']

physio_df['GCS score'] = physio_df.apply(assign_gcs_score, axis = 1)

- PEEP level

In [None]:
def assign_peep_level(row):
    if pd.isna(row['PEEP set']) and not pd.isna(row['Total PEEP Level']):
        return row['Total PEEP Level']
    elif not pd.isna(row['PEEP set']):
        return row['PEEP set']
    else:
        return np.nan

physio_df['PEEP Level'] = physio_df.apply(assign_peep_level, axis = 1)

- Weight

In [None]:
def assign_weight(row):
    if not pd.isna(row['Daily Weight']):
        return row['Daily Weight']
    elif not pd.isna(row['Admission Weight (Kg)']):
        return row['Admission Weight (Kg)']
    elif not pd.isna(row['Admission Weight (lbs.)']):
        return row['Admission Weight (lbs.)'] * 0.453592  # Convert lbs to kg
    else:
        return np.nan

physio_df['Weight'] = physio_df.apply(assign_weight, axis = 1)

In [None]:
physio_df_v1 = physio_df.drop(columns = ['Arterial Blood Pressure systolic', 'Non Invasive Blood Pressure systolic', 'ART BP Systolic', 'Arterial Blood Pressure diastolic', 
                                         'Non Invasive Blood Pressure diastolic', 'ART BP Diastolic', 'Arterial Blood Pressure mean', 'Non Invasive Blood Pressure mean', 
                                         'ART BP Mean', 'Temperature Celsius', 'Temperature Fahrenheit', 'Arterial O2 Saturation', 'O2 saturation pulseoxymetry', 
                                         'GCS - Eye Opening', 'GCS - Verbal Response', 'GCS - Motor Response', 'PEEP set', 'Total PEEP Level', 'Admission Weight (lbs.)', 'Admission Weight (Kg)', 'Daily Weight'])

- Filter out abnormal values

In [None]:
physio_df_v2 = physio_df_v1.drop(columns = ['Direct Bilirubin', 'PeCO2', 'Creatinine (whole blood)'])

In [None]:
pro_events_data.columns

In [None]:
pro_events_data_weight = pro_events_data[['stay_id', 'patientweight']]
pro_events_data_weight = pro_events_data_weight.drop_duplicates(subset = ['stay_id'], keep = 'first')

physio_df_v2 = pd.merge(physio_df_v2, pro_events_data_weight, on = 'stay_id', how = 'left')

In [None]:
def assign_weight_2(row):
    if not pd.isna(row['Weight']):
        return row['Weight']
    elif not pd.isna(row['patientweight']):
        return row['patientweight']
    else:
        return np.nan

physio_df_v2['weight'] = physio_df_v2.apply(assign_weight_2, axis = 1)
physio_df_v2 = physio_df_v2.drop(columns = ['Weight', 'patientweight'])

In [None]:
names_var = ['age',
             'Heart Rate', 'Respiratory Rate', 'Arterial O2 pressure', 'Hemoglobin',
             'Arterial CO2 Pressure', 'PH (Venous)', 'Hematocrit (serum)', 'WBC',
             'Chloride (serum)', 'Creatinine (serum)', 'Glucose (serum)',
             'Magnesium', 'Sodium (serum)', 'PH (Arterial)', 'Inspired O2 Fraction',
             'Tidal Volume (set)', 'Tidal Volume (observed)',
             'Tidal Volume (spontaneous)', 'Respiratory Rate (Set)',
             'Respiratory Rate (spontaneous)', 'Respiratory Rate (Total)',
             'Arterial Base Excess', 'BUN', 'Ionized Calcium', 'Total Bilirubin',
             'Venous CO2 Pressure', 'Venous O2 Pressure', 'Sodium (whole blood)',
             'Chloride (whole blood)', 'Glucose (whole blood)',
             'Hematocrit (whole blood - calc)', 'Potassium (serum)', 'HCO3 (serum)',
             'Albumin', 'Platelet Count', 'Potassium (whole blood)',
             'Prothrombin time', 'PTT', 'INR', 'M',
             'Blood Pressure Systolic', 'Blood Pressure Diastolic',
             'Blood Pressure Mean', 'Temperature C', 'SaO2', 'GCS score', 'PEEP Level', 'weight']

len(names_var)

In [None]:
abv_data = physio_df_v2[names_var]

In [None]:
inspect_col = ['Heart Rate', 'Respiratory Rate', 'Arterial O2 pressure', 'Hemoglobin',
               'Arterial CO2 Pressure', 'PH (Venous)', 'Hematocrit (serum)', 'WBC',
               'Chloride (serum)', 'Creatinine (serum)', 'Glucose (serum)',
               'Magnesium', 'Sodium (serum)', 'PH (Arterial)', 
               'Tidal Volume (observed)',
               'Tidal Volume (spontaneous)', 'Respiratory Rate (Set)',
               'Respiratory Rate (spontaneous)', 'Respiratory Rate (Total)',
               'Arterial Base Excess', 'BUN', 'Ionized Calcium', 'Total Bilirubin',
               'Venous CO2 Pressure', 'Venous O2 Pressure', 'Sodium (whole blood)',
               'Chloride (whole blood)', 'Glucose (whole blood)',
               'Hematocrit (whole blood - calc)', 'Potassium (serum)', 'HCO3 (serum)',
               'Albumin', 'Platelet Count', 'Potassium (whole blood)',
               'Prothrombin time', 'PTT', 'INR', 
               'Blood Pressure Systolic', 'Blood Pressure Diastolic',
               'Blood Pressure Mean', 'Temperature C', 'SaO2', 'weight']

In [None]:
ab_data_sub = abv_data[inspect_col]

In [None]:
# lim = np.logical_or(ab_data_sub >= ab_data_sub.quantile(0.999),
#                     ab_data_sub <= ab_data_sub.quantile(0.001))

q1 = ab_data_sub.quantile(0.25)
q3 = ab_data_sub.quantile(0.75)

iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [None]:
physio_df_v2_abn = physio_df_v2.copy()

In [None]:
for column in ab_data_sub.columns:
    physio_df_v2_abn.loc[physio_df_v2_abn[column] > upper_bound[column], column] = np.nan
    physio_df_v2_abn.loc[physio_df_v2_abn[column] < lower_bound[column], column] = np.nan

In [None]:
physio_df_v2_abn.info()

In [None]:
abv_data = physio_df_v2_abn[names_var]
abv_data.describe().T

In [None]:
id_delete_list = list(physio_df_v2_abn[physio_df_v2_abn['Inspired O2 Fraction'] > 100]['subject_id'])

In [None]:
id_delete_list

In [None]:
physio_df_v2_abn = physio_df_v2_abn[~physio_df_v2_abn['subject_id'].isin(id_delete_list)]
physio_df_v2 = physio_df_v2_abn.copy()

In [None]:
abv_data = physio_df_v2[names_var]
abv_data.describe().T

## Data Imputation

In [None]:
drop_list = []

for i in names_var:
    if (physio_df_v2[i].isnull().sum()/len(physio_df_v2)) > 0.75:
        drop_list.append(i)

In [None]:
middle_list = []

for i in names_var:
    if ((physio_df_v2[i].isnull().sum()/len(physio_df_v2)) <= 0.75) & ((physio_df_v2[i].isnull().sum()/len(physio_df_v2)) >= 0.10):
        middle_list.append(i)

In [None]:
knn_list = []

for i in names_var:
    if (physio_df_v2[i].isnull().sum()/len(physio_df_v2)) < 0.10:
        knn_list.append(i)

- Forward fill

In [None]:
for i in range(len(drop_list)):
    physio_df_v2[drop_list[i]] = physio_df_v2.groupby(by = ['stay_id', 'readmission_count'])[drop_list[i]].ffill()

for i in range(len(middle_list)):
    physio_df_v2[middle_list[i]] = physio_df_v2.groupby(by = ['stay_id', 'readmission_count'])[middle_list[i]].ffill()

for i in range(len(knn_list)):
    physio_df_v2[knn_list[i]] = physio_df_v2.groupby(by = ['stay_id', 'readmission_count'])[knn_list[i]].ffill()

In [None]:
drop_list = []

for i in names_var:
    if (physio_df_v2[i].isnull().sum()/len(physio_df_v2)) > 0.75:
        drop_list.append(i)

In [None]:
middle_list = []

for i in names_var:
    if ((physio_df_v2[i].isnull().sum()/len(physio_df_v2)) <= 0.75) & ((physio_df_v2[i].isnull().sum()/len(physio_df_v2)) >= 0.10):
        middle_list.append(i)

In [None]:
knn_list = []

for i in names_var:
    if (physio_df_v2[i].isnull().sum()/len(physio_df_v2)) < 0.10:
        knn_list.append(i)

In [None]:
physio_df_v3 = physio_df_v2.drop(columns = drop_list)

- Linear interpolation

In [None]:
feature_list = middle_list + knn_list

In [None]:
for i in range(len(feature_list)):
    physio_df_v3[feature_list[i]] = physio_df_v3.groupby(by = ['stay_id', 'readmission_count'])[feature_list[i]].apply(lambda x: x.interpolate(method = 'linear'))

In [None]:
physio_df_v3.columns

In [None]:
physio_df_v3.info()

In [None]:
physio_df_v4 = physio_df_v3.drop(columns = ['Venous CO2 Pressure', 'Venous O2 Pressure', 
                                            'Sodium (whole blood)', 'Chloride (whole blood)', 
                                            'Hematocrit (whole blood - calc)', 'Albumin',
                                            'Potassium (whole blood)', 'PEEP Level'])

In [None]:
physio_df_v4.info()

In [None]:
summary_stats = physio_df_v4.describe().T

summary_stats.insert(0, "Category", "Clinical Information")

latex_table = summary_stats.to_latex(
    index = True,
    columns = ["mean", "std", "min", "max"],
    header = ["Mean", "SD", "Min", "Max"],
    float_format = "%.2f",  
    column_format = "llcccc",  
    caption = "Summary statistics of the study samples.",
    label = "tab:summary_stats",
    longtable = False,
    escape = False  
)

print(latex_table)

- KNN Imputation

In [None]:
import os
from threadpoolctl import threadpool_limits
from joblib import Parallel, delayed
from tqdm import tqdm

In [None]:
num_threads = os.cpu_count()
print(f"Available CPU threads: {num_threads}")

In [None]:
imputer = KNNImputer(n_neighbors = 5)

In [None]:
physio_df_v4.info()

In [None]:
physio_df_v4 = physio_df_v4.reset_index(drop = True)

In [None]:
summary_stats = physio_df_v4.describe().T

summary_stats.insert(0, "Category", "Clinical Information")

latex_table = summary_stats.to_latex(
    index = True,
    columns = ["mean", "std", "min", "max"],
    header = ["Mean", "SD", "Min", "Max"],
    float_format = "%.2f",  
    column_format = "llcccc",  
    caption = "Summary statistics of the study samples.",
    label = "tab:summary_stats",
    longtable = False,
    escape = False  
)

print(latex_table)

In [None]:
# physio_df_v4.to_csv('physio_df_v4.csv', index = False)

In [None]:
columns_with_missing_values = physio_df_v4.columns[physio_df_v4.isnull().any()].tolist()
physio_df_v4_pre = physio_df_v4[columns_with_missing_values].copy()

In [None]:
columns_with_missing_values

In [None]:
def process_chunk(chunk, imputer):
    
    chunk_imputed = imputer.fit_transform(chunk)  
    
    return chunk_imputed

In [None]:
physio_df_v4.info()

In [None]:
scaler = MinMaxScaler()

In [None]:
physio_df_v4_pre[columns_with_missing_values] = scaler.fit_transform(physio_df_v4_pre[columns_with_missing_values])

In [None]:
len(physio_df_v4_pre)

In [None]:
chunk_size = 20000  
chunks = [physio_df_v4_pre.iloc[i:i + chunk_size] for i in range(0, len(physio_df_v4_pre), chunk_size)]

In [None]:
results = Parallel(n_jobs = 60)(
    delayed(process_chunk)(chunk, imputer) 
    for chunk in tqdm(chunks, desc = "KNN Imputation Progress")
)

In [None]:
physio_df_v4_pre = pd.concat(
    [pd.DataFrame(result, columns = columns_with_missing_values) for result in results],
    ignore_index = True
)

In [None]:
physio_df_v4_pre

In [None]:
physio_df_v4_pre[columns_with_missing_values] = scaler.inverse_transform(physio_df_v4_pre[columns_with_missing_values])

In [None]:
columns_with_missing_values

In [None]:
physio_df_v4 = physio_df_v4.reset_index(drop = True)

In [None]:
physio_df_v4.info()

In [None]:
physio_df_v4[columns_with_missing_values] = physio_df_v4_pre[columns_with_missing_values]

In [None]:
# scaler = MinMaxScaler()

# with threadpool_limits(limits = 100):
#     physio_df_v4_pre[columns_with_missing_values] = scaler.fit_transform(physio_df_v4_pre[columns_with_missing_values])
    
#     physio_df_v4_pre[columns_with_missing_values] = imputer.fit_transform(physio_df_v4_pre[columns_with_missing_values])
    
#     physio_df_v4_pre[columns_with_missing_values] = scaler.inverse_transform(physio_df_v4_pre[columns_with_missing_values])

# physio_df_v4[columns_with_missing_values] = physio_df_v4_pre[columns_with_missing_values]

In [None]:
# scaler = MinMaxScaler()

# columns_with_missing_values = physio_df_v3.columns[
#     physio_df_v3.isnull().any()
# ].tolist()

# X = physio_df_v3[columns_with_missing_values].values  
# # X.shape = (n_rows, n_cols_with_nan)

# X_scaled = scaler.fit_transform(X)      
# X_imputed = imputer.fit_transform(X_scaled) 
# X_restored = scaler.inverse_transform(X_imputed) 

# physio_df_v3.loc[:, columns_with_missing_values] = X_restored

In [None]:
physio_df_v4_pre.info()

In [None]:
physio_df_v4.info()

In [None]:
Counter(physio_df_v4[physio_df_v4['subject_id'] == 16133115]['readmission_count'])

In [None]:
physio_df_v4[physio_df_v4['subject_id'] == 16133115][['subject_id', 'stay_id', 'discharge_fail', 'readmission', 'readmission_count']].iloc[60:100]

In [None]:
physio_df_v4.to_csv('../icu_data/mimic_iv/physio_df_v5.csv', index = False)