# Dataset Preprocessing - MIMIC-IV

## Case: Discharge Decision-Making

In [None]:
import sys
# NOTE(review): machine-specific absolute path. Presumably this is the project
# root that contains the `Data` package imported by the next cell -- confirm and
# adjust when running on another machine.
sys.path.append('C:/Users/ge28pih/PycharmProjects/ICU-Decision Making-OCRL')

In [None]:
from Data.mimic_iv_icu_discharge.data_preprocess import *

In [None]:
# All raw MIMIC-IV tables used here sit in the same directory. Keeping that
# location in one variable means relocating the data only requires editing
# this single line instead of eight separate string literals.
mimic_iv_dir = 'C:/Users/ge28pih/PycharmProjects/discharge_decision_making/icu_data/mimic_iv'

# Register the location of every source table; the files are gzip-compressed CSVs.
discharge_data_preprocess = ICUDataInput(
    d_items_path = f'{mimic_iv_dir}/d_items.csv.gz',
    input_events_path = f'{mimic_iv_dir}/inputevents.csv.gz',
    pro_events_path = f'{mimic_iv_dir}/procedureevents.csv.gz',
    output_events_path = f'{mimic_iv_dir}/outputevents.csv.gz',
    icu_patient_path = f'{mimic_iv_dir}/icustays.csv.gz',
    d_labitems_path = f'{mimic_iv_dir}/d_labitems.csv.gz',
    admission_path = f'{mimic_iv_dir}/admissions.csv.gz',
    all_patients_path = f'{mimic_iv_dir}/patients.csv.gz',
    data_compression = 'gzip'
)

In [None]:
# Run the loader. Later cells read attributes of this object such as
# `d_items_data`, `ICU_patient_data`, `admission_data`, `patients_data` and
# `pro_events_data` (presumably populated by this call -- confirm).
discharge_data_preprocess.load_data()

In [None]:
# Display the item dictionary table for inspection.
discharge_data_preprocess.d_items_data

In [None]:
# Display only the dictionary rows whose `linksto` value is 'chartevents'.
d_items_table = discharge_data_preprocess.d_items_data
d_items_table[d_items_table['linksto'] == 'chartevents']

- Search Variables

In [None]:
# Search helper that matches against the `label` column of the item dictionary.
search_variable = VariableSearch(
    d_items_table = discharge_data_preprocess.d_items_data,
    search_column = 'label',
)

In [None]:
# Look up the item ids for the keyword 'heart rate'. Plain (non-regex)
# matching is requested and every translation option is switched off.
search_variable.search_by_keyword(
    keyword = 'heart rate',
    output_column = 'itemid',
    use_regex = False,
    enable_translation = False,
    target_languages = None,
    translation_service = 'openai',
    api_key = None,
    custom_translator = None,
)

In [None]:
# Display the dictionary row for item id 220045.
d_items_table = discharge_data_preprocess.d_items_data
d_items_table[d_items_table['itemid'] == 220045]

- Select Variables 

In [None]:
# Item ids to extract, grouped by category so that each group can be
# reviewed and maintained on its own.
vital_signs_ids = [
    220045, 220048, 220179, 220050, 220180, 220051, 220052,
    220181, 225312, 220210, 224690, 223761, 223762, 220277,
]

gcs_score_ids = [223901, 223900, 220739]

ventilator_params_ids = [
    220339, 224700, 224685, 224684,
    224686, 223835, 223848, 223849,
]

laboratory_ids = [
    225624, 226536, 220602, 227464, 226534, 226537, 229761,
    220653, 220546, 227466, 227467, 227457, 220274, 223830,
    220228, 220235, 220224, 226062, 226063, 227456, 226540,
    224828, 220635, 220545, 220615, 220621, 220645,
]

general_ids = [224639, 226260, 226512, 226531, 226892, 227428]

adt_ids = [220003, 226228, 226545]

additional_ids = [
    224719, 226862, 228878, 227465, 227442, 227443, 225651,
    225690, 226566, 227489, 226627, 220994, 227519, 227488,
    225667, 228699, 228709, 228713, 228703, 228704, 228705,
    225309, 225310, 220227, 224688, 224689,
]

# Merge all categories into one de-duplicated list; sorting gives an
# ordering that is the same on every run.
items_id_list = sorted(
    set().union(
        vital_signs_ids,
        gcs_score_ids,
        ventilator_params_ids,
        laboratory_ids,
        general_ids,
        adt_ids,
        additional_ids,
    )
)

print(f"Total unique item IDs: {len(items_id_list)}")
print(f"First 10 IDs: {items_id_list[:10]}")

In [None]:
# Restrict the item dictionary to the ids collected in `items_id_list`.
variable_selection = VariableSelect(
    d_items_table = discharge_data_preprocess.d_items_data,
    item_id_list = items_id_list,
)

variable_selection.select_variables()

In [None]:
# Display the result of the selection for a visual check.
variable_selection.get_selected_data()

In [None]:
# Run the per-event-table selection steps (chart, output, datetime and
# ingredient events). The chart-events result is read in later cells as
# `variable_selection.d_items_data_chart`.
variable_selection.select_data_chartEvents()
variable_selection.select_data_outputEvents()
variable_selection.select_data_datetimeEvents()
variable_selection.select_data_ingredientEvents()

In [None]:
# Display the chart-events subset of the selected items.
variable_selection.d_items_data_chart

- Chart events data preprocessing

In [None]:
# Source file for chart events and the ids of every selected item.
chart_events_file = 'C:/Users/ge28pih/PycharmProjects/discharge_decision_making/icu_data/mimic_iv/chartevents.csv.gz'
selected_item_ids = variable_selection.d_items_data['itemid'].tolist()

chart_events_process = ChartEventsProcess(
    chart_events_path = chart_events_file,
    item_id_list = selected_item_ids,
)

In [None]:
# Explicit column -> dtype mapping handed to the loader as `var_type`.
chart_events_dtypes = {
    'cgid': 'float64',
    'stay_id': 'float64',
    'error': 'float64',
    'resultstatus': 'object',
    'stopped': 'object',
    'value': 'object',
    'valuenum': 'float64',
    'warning': 'float64',
    'valueuom': 'object',
    'caregiver_id': 'float64',
}

chart_events_process.load_and_filter_data(var_type = chart_events_dtypes)

In [None]:
# Wrap the loaded ICU patient table for cohort filtering and outcome labelling.
icu_patient_data_process = PatientDataProcess(ICU_patient_data = discharge_data_preprocess.ICU_patient_data)

In [None]:
# ICU unit names passed to `filter_ICU_patients` to define the cohort.
ICU_unit = ['Medical Intensive Care Unit (MICU)', 
            'Surgical Intensive Care Unit (SICU)', 
            'Medical/Surgical Intensive Care Unit (MICU/SICU)', 
            'Cardiac Vascular Intensive Care Unit (CVICU)', 
            'Coronary Care Unit (CCU)', 
            'Trauma SICU (TSICU)']

# One observation window (in days) shared by every outcome label below, so the
# readmission, death and readmission-count steps cannot drift apart.
readmission_observation_days = 30

icu_patient_data_process.filter_ICU_patients(icu_unit_list = ICU_unit)
icu_patient_data_process.calculate_los()
icu_patient_data_process.denote_readmission_cases(readmission_observation_days = readmission_observation_days)
icu_patient_data_process.denote_death_cases(admission_data = discharge_data_preprocess.admission_data, 
                                            patients_data = discharge_data_preprocess.patients_data, 
                                            readmission_observation_days = readmission_observation_days)
icu_patient_data_process.denote_readmission_count(readmission_observation_days = readmission_observation_days)

In [None]:
# Bring together the filtered chart events, the chart item dictionary and
# the processed ICU patient table.
gen_dataset = GenerateDataSet(
    chart_events_data = chart_events_process.chart_events_data,
    d_items_data_chart = variable_selection.d_items_data_chart,
    icu_patient_data = icu_patient_data_process.ICU_patient_data,
)

In [None]:
# Item labels handed to `prepare_chart_events_data` as `items_delete_list`.
items_delete_list = [
    'Ventilator Type',
    'Ventilator Mode',
    'SaO2 < 90% > 2 min',
    'Gender',
    'Race',
    'Cardiovascular',
    'Musculoskeletal',
    'Neurological',
    'Nutrition',
    'Respiratory',
    'Vascular',
    'Mechanically Ventilated',
    'Re-admit < 48 hours',
    'Heart Rhythm',
    'PA %O2 Saturation (PA Line)',
    'SOFA Score',
    'Urine output_ApacheIV',
]

gen_dataset.prepare_chart_events_data(items_delete_list = items_delete_list)

In [None]:
# Fixed identifier, timing, outcome and demographic columns, each starting
# out as its own empty list.
fixed_columns = [
    'subject_id', 'hadm_id', 'stay_id',
    'time',
    'icu_starttime', 'icu_endtime', 'los',
    'discharge_fail',
    'readmission', 'readmission_count',
    'death_in_ICU', 'death_out_ICU',
    'age', 'gender', 'race',
]
physio_table_30_day = {column: [] for column in fixed_columns}

# Add one empty column per chart item label.
for label in gen_dataset.d_items_data_chart['label']:
    physio_table_30_day[label] = []

physio_table_30_day

In [None]:
# Generate the dataset using the empty column template built in the previous cell.
gen_dataset.dataset_generation(physio_table = physio_table_30_day)

In [None]:
# Display the generated dataset for inspection.
gen_dataset.generated_dataset

In [None]:
# Columns passed as `drop_columns`. Presumably these raw measurements are
# superseded by the combined columns referenced in later cells
# ('Blood Pressure Systolic', 'Temperature C', 'SaO2', ...) -- confirm
# against `process_gen_data`.
raw_columns_to_drop = [
    'Arterial Blood Pressure systolic',
    'Non Invasive Blood Pressure systolic',
    'ART BP Systolic',
    'Arterial Blood Pressure diastolic',
    'Non Invasive Blood Pressure diastolic',
    'ART BP Diastolic',
    'Arterial Blood Pressure mean',
    'Non Invasive Blood Pressure mean',
    'ART BP Mean',
    'Temperature Celsius',
    'Temperature Fahrenheit',
    'Arterial O2 Saturation',
    'O2 saturation pulseoxymetry',
    'GCS - Eye Opening',
    'GCS - Verbal Response',
    'GCS - Motor Response',
    'PEEP set',
    'Total PEEP Level',
    'Admission Weight (lbs.)',
    'Admission Weight (Kg)',
    'Daily Weight',
]

gen_dataset.process_gen_data(
    pro_events_data = discharge_data_preprocess.pro_events_data,
    drop_columns = raw_columns_to_drop,
)

In [None]:
# Print the summary reported by `info()` for the processed dataset.
gen_dataset.generated_dataset.info()

In [None]:
# Temporarily rename 'patientweight' to 'weight' (in place) to avoid a conflict
# with the original column; later cells refer to the column as 'weight'.
gen_dataset.generated_dataset.rename(columns = {'patientweight': 'weight'}, inplace = True)

In [None]:
# Columns passed to the abnormal-value filter as `abnormal_var_list`.
outlier_check_columns = [
    'Heart Rate', 'Respiratory Rate', 'Arterial O2 pressure', 'Hemoglobin',
    'Arterial CO2 Pressure', 'PH (Venous)', 'Hematocrit (serum)', 'WBC',
    'Chloride (serum)', 'Creatinine (serum)', 'Glucose (serum)',
    'Magnesium', 'Sodium (serum)', 'PH (Arterial)',
    'Tidal Volume (observed)', 'Tidal Volume (spontaneous)',
    'Respiratory Rate (Set)', 'Respiratory Rate (spontaneous)',
    'Respiratory Rate (Total)',
    'Arterial Base Excess', 'BUN', 'Ionized Calcium', 'Total Bilirubin',
    'Venous CO2 Pressure', 'Venous O2 Pressure', 'Sodium (whole blood)',
    'Chloride (whole blood)', 'Glucose (whole blood)',
    'Hematocrit (whole blood - calc)', 'Potassium (serum)', 'HCO3 (serum)',
    'Albumin', 'Platelet Count', 'Potassium (whole blood)',
    'Prothrombin time', 'PTT', 'INR',
    'Blood Pressure Systolic', 'Blood Pressure Diastolic',
    'Blood Pressure Mean', 'Temperature C', 'SaO2', 'weight',
]

# The 'iqr' method is selected; `z_threshold` is still passed explicitly
# (presumably only relevant to a z-score method -- confirm).
gen_dataset.abnormal_data_filter(
    method = 'iqr',
    iqr_factor = 3.0,
    z_threshold = 3.0,
    abnormal_var_list = outlier_check_columns,
)

In [None]:
# Checkpoint (v1): write the dataset as it stands before imputation to a CSV
# in the current working directory, without the index column.
gen_dataset.generated_dataset.to_csv('./mimic_iv_icu_discharge_dataset_v1.csv', index = False)

In [None]:
# Hand the generated dataset to the imputation helper used in the cells below.
patient_data_imputer = PatientDataImputation(generated_dataset = gen_dataset.generated_dataset)

In [None]:
# List the available column names (reference for the variable list below).
gen_dataset.generated_dataset.columns

In [None]:
# Variables whose missingness is assessed and imputed in the following cells.
names_var = [
    'age',
    'Heart Rate',
    'Respiratory Rate',
    'Arterial O2 pressure',
    'Hemoglobin',
    'Arterial CO2 Pressure',
    'PH (Venous)',
    'Hematocrit (serum)',
    'WBC',
    'Chloride (serum)',
    'Creatinine (serum)',
    'Glucose (serum)',
    'Magnesium',
    'Sodium (serum)',
    'PH (Arterial)',
    'Inspired O2 Fraction',
    'Tidal Volume (set)',
    'Tidal Volume (observed)',
    'Tidal Volume (spontaneous)',
    'Respiratory Rate (Set)',
    'Respiratory Rate (spontaneous)',
    'Respiratory Rate (Total)',
    'Arterial Base Excess',
    'BUN',
    'Ionized Calcium',
    'Total Bilirubin',
    'Venous CO2 Pressure',
    'Venous O2 Pressure',
    'Sodium (whole blood)',
    'Chloride (whole blood)',
    'Glucose (whole blood)',
    'Hematocrit (whole blood - calc)',
    'Potassium (serum)',
    'HCO3 (serum)',
    'Albumin',
    'Platelet Count',
    'Potassium (whole blood)',
    'Prothrombin time',
    'PTT',
    'INR',
    'M',
    'Blood Pressure Systolic',
    'Blood Pressure Diastolic',
    'Blood Pressure Mean',
    'SaO2',
    'Temperature C',
    'GCS Score',
    'PEEP Level',
    'weight',
]

In [None]:
# Split the variables into three groups using the two missing-rate thresholds.
drop_list, middle_list, knn_list = patient_data_imputer.classify_missing_columns(
    var_list = names_var,
    missing_threshold_1 = 0.75,
    missing_threshold_2 = 0.10,
)

In [None]:
# Inspect the first group returned by the classification.
drop_list

In [None]:
# patient_data_imputer.drop_columns(var_list = drop_list)

In [None]:
# Forward-fill all three groups. `drop_list` is included because the
# `drop_columns` step in the previous cell is commented out.
patient_data_imputer.forward_fill_missing_values(var_list = drop_list)
patient_data_imputer.forward_fill_missing_values(var_list = middle_list)
patient_data_imputer.forward_fill_missing_values(var_list = knn_list)

In [None]:
# Re-run the classification with the same thresholds now that forward
# filling has been applied.
drop_list, middle_list, knn_list = patient_data_imputer.classify_missing_columns(
    var_list = names_var,
    missing_threshold_1 = 0.75,
    missing_threshold_2 = 0.10,
)

In [None]:
# patient_data_imputer.drop_columns(var_list = drop_list)

In [None]:
# Linear imputation is applied to the second and third groups only;
# `drop_list` is left out of this step.
feature_list = middle_list + knn_list
patient_data_imputer.linear_impute_missing_values(var_list = feature_list)

In [None]:
### Check the missing values of the dataset again

In [None]:
# patient_data_imputer.drop_columns(var_list = drop_list)

In [None]:
# KNN imputation with 5 neighbours, a MinMaxScaler and chunks of 10000
# (presumably rows per chunk -- confirm).
# NOTE(review): `num_jobs = 60` is hard-coded and looks sized for a specific
# machine; presumably a worker count -- confirm and lower it on smaller hosts.
patient_data_imputer.knn_impute_missing_values(num_neigh = 5, 
                                               scaler = MinMaxScaler(),
                                               chunk_size = 10000, 
                                               num_jobs = 60)

In [None]:
# Checkpoint (v2): write the dataset after the imputation steps to a CSV in
# the current working directory.
patient_data_imputer.save_to_csv(file_path = './mimic_iv_icu_discharge_dataset_v2.csv', 
                                 index = False)

In [None]:
# Build the state-space tables from the imputer's dataset.
state_space_builder = StateSpaceBuilder(generated_dataset = patient_data_imputer.generated_dataset)

In [None]:
# Remove duplicate rows before any further manipulation.
state_space_builder.drop_duplicate_rows()

In [None]:
# NOTE(review): presumably derives or renames columns that are selected later
# but do not appear in earlier cells (e.g. 'RR', 'TV', 'GCS score', 'epoch',
# 'discharge_action') -- confirm against StateSpaceBuilder.
state_space_builder.columns_manipulation()

In [None]:
# Select the discharge data using a length-of-stay threshold of 15.0
# (presumably days -- confirm the unit of `los`).
state_space_builder.icu_discharge_data_selection(los_threshold = 15.0)

In [None]:
# Identifier, timing and outcome columns passed to `table_split` as `var_outcome`.
var_outcome_list = [
    'subject_id', 'hadm_id', 'stay_id', 'time', 'discharge_action', 'epoch',
    'icu_starttime', 'icu_endtime', 'los',
    'discharge_fail',
    'readmission', 'readmission_count', 'death_in_ICU', 'death_out_ICU',
]

# Columns passed to `table_split` as `var_physio`.
# NOTE(review): 'GCS score' is spelled 'GCS Score' in the imputation variable
# list; confirm the column really has this casing at this point. Note also
# that 'readmission_count' appears in both lists.
var_physio_list = [
    'age', 'M', 'weight', 'Heart Rate', 'Arterial O2 pressure', 'Hemoglobin',
    'Arterial CO2 Pressure', 'PH (Venous)', 'Hematocrit (serum)', 'WBC',
    'Chloride (serum)', 'Creatinine (serum)', 'Glucose (serum)',
    'Magnesium', 'Sodium (serum)', 'PH (Arterial)', 'Inspired O2 Fraction',
    'Arterial Base Excess', 'BUN', 'Ionized Calcium', 'Total Bilirubin',
    'Glucose (whole blood)', 'Potassium (serum)', 'HCO3 (serum)',
    'Platelet Count', 'Prothrombin time', 'PTT', 'INR',
    'Blood Pressure Systolic', 'Blood Pressure Diastolic',
    'Blood Pressure Mean', 'Temperature C',
    'SaO2', 'GCS score', 'RR', 'TV', 'readmission_count',
]

In [None]:
# Split the dataset into outcome columns and physiological columns.
state_space_builder.table_split(var_outcome = var_outcome_list, var_physio = var_physio_list)

In [None]:
# Set the discharge cost, supplying a fresh MinMaxScaler instance.
state_space_builder.discharge_cost_set(scaler = MinMaxScaler())

In [None]:
# Split into train / validation / test with a fixed seed for reproducibility.
# NOTE(review): `val_prop = 0.5` is presumably a fraction of the held-out
# part rather than of the whole dataset -- confirm.
state_space_builder.train_val_test_split(
    scaler = MinMaxScaler(),
    test_prop = 0.2,
    val_prop = 0.5,
    random_seed = 42,
)

In [None]:
# (attribute of `state_space_builder`, destination CSV) pairs, written in the
# order listed: unscaled state tables, scaled state tables, outcome tables.
csv_exports = [
    ('rl_cont_state_table', '../model_output/state_table.csv'),
    ('rl_table_train', '../model_output/state_table_train.csv'),
    ('rl_table_val', '../model_output/state_table_val.csv'),
    ('rl_table_test', '../model_output/state_table_test.csv'),

    ('rl_cont_state_table_scaled', '../model_output/state_table_scaled.csv'),
    ('rl_table_train_scaled', '../model_output/state_table_scaled_train.csv'),
    ('rl_table_val_scaled', '../model_output/state_table_scaled_val.csv'),
    ('rl_table_test_scaled', '../model_output/state_table_scaled_test.csv'),

    ('state_id_table', '../model_output/outcome_table.csv'),
    ('id_table_train', '../model_output/outcome_table_train.csv'),
    ('id_table_val', '../model_output/outcome_table_val.csv'),
    ('id_table_test', '../model_output/outcome_table_test.csv'),
]

# Each attribute is looked up just before its own write, so a missing table
# stops the export at that point and earlier files are still written.
for table_attr, csv_path in csv_exports:
    state_space_builder.save_to_csv(dataset = getattr(state_space_builder, table_attr),
                                    file_path = csv_path)