In [3]:
import sys
import os
import importlib
from pathlib import Path

# Folders that hold the project's own modules. The entries are relative paths,
# so they resolve against the directory the notebook kernel was started in.
module_paths = [
    'preprocessing/day_intervals_preproc',
    'utils',
    'preprocessing/hosp_module_preproc',
    'model'
]
# Register each folder on the import path exactly once, so re-running this cell
# does not pile up duplicate entries.
for module_path in module_paths:
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

# Standard library imports
import pandas as pd
import pickle

# External library imports
import ipywidgets as widgets
from sklearn.metrics import classification_report

# Custom module imports
import day_intervals_cohort
import day_intervals_cohort_v2
import data_generation_icu
import data_generation
import evaluation
import feature_selection_hosp
import feature_selection_icu
import ml_models
import dl_train
import tokenization
import fairness
import callibrate_output

# Re-execute every project module that was imported above so that edits made to
# the .py files since the kernel started are picked up without a restart.
# This runs before the `from ... import *` lines that follow, so the
# star-imported names are taken from the freshly reloaded modules.
custom_modules = [
    day_intervals_cohort,
    day_intervals_cohort_v2,
    data_generation_icu,
    data_generation,
    feature_selection_hosp,
    feature_selection_icu,
    ml_models,
    dl_train,
    tokenization,
    fairness,
    callibrate_output,
    evaluation
]

# importlib.reload re-runs the module body in place; the module objects keep
# their identity, so the `import x` bindings above stay valid.
for module in custom_modules:
    importlib.reload(module)

# Optional: import everything from specific modules, if needed
from day_intervals_cohort import *
from day_intervals_cohort_v2 import *
from feature_selection_hosp import *
from feature_selection_icu import *
from ml_models import *
from dl_train import *
from tokenization import *

# Welcome to your MIMIC-IV Project

This repository explains the steps to download and clean MIMIC-IV dataset for analysis.
The repository is compatible with MIMIC-IV v1.0 and MIMIC-IV v2.0

Please go to:
- https://physionet.org/content/mimiciv/1.0/ for v1.0
- https://physionet.org/content/mimiciv/2.0/ for v2.0

Follow instructions to get access to MIMIC-IV dataset.

Download the files using your terminal: 
- wget -r -N -c -np --user <your-physionet-username> --ask-password https://physionet.org/files/mimiciv/1.0/ or
- wget -r -N -c -np --user <your-physionet-username> --ask-password https://physionet.org/files/mimiciv/2.0/
        

Save downloaded files in the parent directory of this github repo. 

The structure should look like below for v1.0-
- mimiciv/1.0/core
- mimiciv/1.0/hosp
- mimiciv/1.0/icu

The structure should look like below for v2.0-
- mimiciv/2.0/hosp
- mimiciv/2.0/icu

## 1. DATA EXTRACTION
Please run below cell to select option for cohort selection.
The cohort will be saved in **./data/cohort/**

In [4]:
# Dataset-version selector. It offers a single, pre-selected option, so it is
# informational only; `version` is not read by any later cell visible in this notebook.
# NOTE(review): the message says "Version 2.2" while the extraction cell sets
# version_path = "mimiciv/2.0" - confirm which release is actually intended.
print("This Code is tailored for Version 2.2 of the MIMIC Data Set")
version = widgets.RadioButtons(options=['Version 2'],value='Version 2')
display(version)

# Prediction-task selector (single option). Its value is read later as the
# task label that is passed to the cohort extraction.
print("This Code is Tailored for Mortality Prediction")
radio_input4 = widgets.RadioButtons(options=['Mortality'],value='Mortality')
display(radio_input4)


This Code is tailored for Version 2.2 of the MIMIC Data Set


RadioButtons(options=('Version 2',), value='Version 2')

This Code is Tailored for Mortality Prediction


RadioButtons(options=('Mortality',), value='Mortality')

### Refining Cohort and Prediction Task Definition

Based on your current selection following block will provide option to further refine prediction task and cohort associated with it:

- First you will refine the prediction task choosing from following options -
    - **Length of Stay** - You can select from two predefined options or enter a custom number of days to predict a length of stay greater than that number of days.

    - **Readmission** - You can select from two predefined options or enter custom number of days to predict readmission after "number of days" after previous admission.

    - **Phenotype Prediction** - You can select from four major chronic diseases to predict its future outcome

        - Heart failure
        - CAD (Coronary Artery Disease)
        - CKD (Chronic Kidney Disease)
        - COPD (Chronic obstructive pulmonary disease)

- Second, you will choose whether to perform the above task using ICU or non-ICU admissions data

- Third, you can refine the cohort selection for any of the above chosen prediction tasks by including only the admission samples admitted with a particular chronic disease - 
    - Heart failure
    - CAD (Coronary Artery Disease)
    - CKD (Chronic Kidney Disease)
    - COPD (Chronic obstructive pulmonary disease)
    
**Please run below cell to extract the cohort for selected options**

In [5]:
import ipywidgets as widgets
from IPython.display import display

# Task selector, re-created here with 'Mortality' as its only option. This
# replaces the widget object that the previous cell bound to the same name.
radio_input4 = widgets.RadioButtons(options=['Mortality'], value='Mortality')

# Display the option for Mortality (could be omitted if you always use 'Mortality')
display(radio_input4)

print("Extract Data")
print("Please select below if you want to work with ICU or Non-ICU data ?")

# Data-source selector. Only 'ICU' is offered, so every later `data_icu` test
# in this notebook evaluates to True and the non-ICU branches are not exercised.
radio_input1 = widgets.RadioButtons(options=['ICU'], value='ICU')
display(radio_input1)

print("Please select if you want to perform chosen prediction task for a specific disease.")

# Disease-filter selector (single option). The extraction cell below does not
# read this widget; it sets icd_code to the same string directly.
radio_input3 = widgets.RadioButtons(options=['No Disease Filter'], value='No Disease Filter')
display(radio_input3)

RadioButtons(options=('Mortality',), value='Mortality')

Extract Data
Please select below if you want to work with ICU or Non-ICU data ?


RadioButtons(options=('ICU',), value='ICU')

Please select if you want to perform chosen prediction task for a specific disease.


RadioButtons(options=('No Disease Filter',), value='No Disease Filter')

In [6]:
# Build the cohort for the task and data source chosen in the widgets above.
# Relies on `radio_input1` (data source) and `radio_input4` (task) from earlier cells.

# Fixed values for the arguments that the mortality task does not vary.
time = 0  # Set to a default value that makes sense for your mortality task
icd_code = 'No Disease Filter'  # Default ICD code, adjust as necessary
disease_label = ''  # Empty or a default value as needed
label=radio_input4.value
data_mort=label=="Mortality"  # True when the selected task is mortality prediction

# NOTE(review): version_path is assigned but not passed to extract_data or read
# by any other cell visible here - confirm whether it is still needed.
version_path = "mimiciv/2.0"

# Run the version-2 cohort extraction and keep its return value for the later steps.
# NOTE(review): `root_dir` is not assigned anywhere in the visible notebook; it
# presumably comes from one of the star imports or from a cell that was since
# removed - confirm it is defined on a fresh kernel (Restart & Run All).
cohort_output = day_intervals_cohort_v2.extract_data(radio_input1.value, label, time, icd_code, root_dir, disease_label)

EXTRACTING FOR: | ICU | MORTALITY | 0 |
[ MORTALITY LABELS FINISHED ]
[ COHORT SUCCESSFULLY SAVED ]
[ SUMMARY SUCCESSFULLY SAVED ]
Mortality FOR ICU DATA
# Admission Records: 73181
# Patients: 50920
# Positive cases: 4889
# Negative cases: 68292


## 2. FEATURE SELECTION
Features available for ICU data -
- Diagnosis (https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/)
- Procedures (https://mimic.mit.edu/docs/iv/modules/icu/procedureevents/)
- Medications (https://mimic.mit.edu/docs/iv/modules/icu/inputevents/)
- Output Events (https://mimic.mit.edu/docs/iv/modules/icu/outputevents/)
- Chart Events (https://mimic.mit.edu/docs/iv/modules/icu/chartevents/)

Features available for non-ICU data -
- Diagnosis (https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/)
- Procedures (https://mimic.mit.edu/docs/iv/modules/hosp/procedures_icd/)
- Medications (https://mimic.mit.edu/docs/iv/modules/hosp/prescriptions/)
- Lab Events (https://mimic.mit.edu/docs/iv/modules/hosp/labevents/)

All features will be saved in **./data/features/**

**Please run below cell to select features**

In [7]:
# Re-derive the task settings and boolean flags used by the remaining cells.
# (disease_label, time, label and data_mort repeat the values set in the
# cohort-extraction cell.)
disease_label=""
time=0
label=radio_input4.value

# One boolean per data source / task, computed by comparing the widget values
# with the literal option strings.
data_icu=radio_input1.value=="ICU"
data_mort=label=="Mortality"
data_admn=label=='Readmission'
data_los=label=='Length of Stay'

In [8]:
# Offer one pre-ticked checkbox per feature family. Which families are offered
# depends on whether the cohort is ICU or non-ICU.
print("Feature Selection")
feature_labels = (
    ['Diagnosis', 'Output Events', 'Chart Events(Labs and Vitals)', 'Procedures', 'Medications']
    if data_icu
    else ['Diagnosis', 'Labs', 'Procedures', 'Medications']
)
print("Which Features you want to include for cohort?")
feature_boxes = [widgets.Checkbox(description=text, value=True) for text in feature_labels]
for box in feature_boxes:
    display(box)

# Keep the numbered names that the following cells read. The non-ICU list has
# one entry fewer, so check_input5 is only bound for ICU data.
if data_icu:
    check_input1, check_input2, check_input3, check_input4, check_input5 = feature_boxes
else:
    check_input1, check_input2, check_input3, check_input4 = feature_boxes
print("**Please run below cell to extract selected features**")


Feature Selection
Which Features you want to include for cohort?


Checkbox(value=True, description='Diagnosis')

Checkbox(value=True, description='Output Events')

Checkbox(value=True, description='Chart Events(Labs and Vitals)')

Checkbox(value=True, description='Procedures')

Checkbox(value=True, description='Medications')

**Please run below cell to extract selected features**


In [10]:
# Read the checkbox states into named flags. The mapping follows the ICU
# checkbox order (Diagnosis, Output Events, Chart Events, Procedures, Medications).
# NOTE(review): this cell only works for ICU data - on the non-ICU branch the
# checkboxes have different meanings and check_input5 is never created.
diag_flag=check_input1.value
out_flag=check_input2.value
chart_flag=check_input3.value
proc_flag=check_input4.value
med_flag=check_input5.value

# Extract the selected feature families for the cohort built earlier.
feature_icu(cohort_output, diag_flag,out_flag,chart_flag,proc_flag,med_flag)

Attempting to open: data/mimiciv/2.0/hosp/diagnoses_icd.csv.gz
[EXTRACTING DIAGNOSIS DATA]
# unique ICD-9 codes 6610
# unique ICD-10 codes 9978
# unique ICD-10 codes (After converting ICD-9 to ICD-10) 10270
# unique ICD-10 codes (After clinical gruping ICD-10 codes) 1520
# Admissions:   73159
Total rows 1297458
[SUCCESSFULLY SAVED DIAGNOSIS DATA]
[EXTRACTING OUPTPUT EVENTS DATA]
# Unique Events:   71
# Admissions:   71111
Total rows 4234967
[SUCCESSFULLY SAVED OUPTPUT EVENTS DATA]
[EXTRACTING CHART EVENTS DATA]


32it [07:05, 13.28s/it]


# Unique Events:   454
# Admissions:   73173
Total rows 76942772
[SUCCESSFULLY SAVED CHART EVENTS DATA]
[EXTRACTING PROCEDURES DATA]
# Unique Events:   157
# Admissions:   72711
Total rows 678994
[SUCCESSFULLY SAVED PROCEDURES DATA]
[EXTRACTING MEDICATIONS DATA]
# of unique type of drug:  195
# Admissions:   68945
# Total rows 4820051
[SUCCESSFULLY SAVED MEDICATIONS DATA]


## 3. CLINICAL GROUPING
Below you will have option to clinically group diagnosis and medications.
Grouping medical codes will reduce dimensional space of features.

Default options selected below will group medical codes to reduce feature dimension space.

**Please run below cell to select preprocessing for different features**

In [21]:
# Ask how diagnosis codes should be grouped; the default is the most aggressive
# option (convert ICD-9 to ICD-10, then group ICD-10 codes).
# NOTE(review): this rebinds `radio_input4`, which earlier held the prediction
# task selector. Any earlier cell that reads radio_input4.value as the task
# label will get the grouping option instead if it is re-run after this cell.
if diag_flag:
    print("Do you want to group ICD 10 DIAG codes?")
    radio_input4 = widgets.RadioButtons(
        options=['Keep both ICD-9 and ICD-10 codes', 'Convert ICD-9 to ICD-10 codes', 'Convert ICD-9 to ICD-10 and group ICD-10 codes'],
        value='Convert ICD-9 to ICD-10 and group ICD-10 codes',
        layout={'width': '100%'}
    )
    display(radio_input4)
print("**Please run below cell to perform feature preprocessing**")

Do you want to group ICD 10 DIAG codes?


RadioButtons(index=2, layout=Layout(width='100%'), options=('Keep both ICD-9 and ICD-10 codes', 'Convert ICD-9…

**Please run below cell to perform feature preprocessing**


In [23]:
# Diagnosis preprocessing only.
# group_diag stays False unless diagnosis features were selected, in which case
# it becomes the option string chosen in the grouping selector above.
group_diag=False

if data_icu:
    if diag_flag:
        group_diag=radio_input4.value
    # The trailing False/False/False/0/0 fill the chart-event positions of the
    # call; presumably (chart flag, clean, impute, right threshold, left threshold),
    # judging by the chart-cleaning call later in this notebook - TODO confirm
    # against the function signature.
    preprocess_features_icu(cohort_output, diag_flag, group_diag,False,False,False,0,0)

[PROCESSING DIAGNOSIS DATA]
Total number of rows 1228579
[SUCCESSFULLY SAVED DIAGNOSIS DATA]


### 4. SUMMARY OF FEATURES

This step will generate summary of all features extracted so far.<br>
It will save summary files in **./data/summary/**<br>
- These files provide summary about **mean frequency** of medical codes per admission.<br>
- It also provides **total occurrence count** of each medical code.<br>
- For labs and chart events it will also provide <br>**missing %** which tells how many rows for a certain medical code has missing value.

Please use this information to further refine your cohort by selecting <br>which medical codes in each feature you want to keep and <br>which codes you would like to remove for downstream analysis tasks.

**Please run below cell to generate summary files**

In [25]:
# Write the per-feature summary files. Note the flag order in this call is
# diag, proc, med, out, chart - different from the order used for feature_icu.
generate_summary_icu(diag_flag, proc_flag, med_flag, out_flag, chart_flag)

[GENERATING FEATURE SUMMARY]
[SUCCESSFULLY SAVED FEATURE SUMMARY]


## 5. Feature Selection

Based on the files generated in the previous step and other information gathered by you,<br>
Please select which medical codes you want to include in this study.

Please run the cell below to select the features for which you want to perform feature selection.

- Select **Yes** if you want to select a subset of medical codes for that feature and<br> **edit** the corresponding feature file for it.
- Select **No** if you want to keep all the codes in a feature.

In [27]:
def _ask_feature_selection(feature_label, summary_file):
    """Print the Yes/No question for one feature type and show its selector.

    feature_label is the wording used in the question; summary_file is the
    file name under ./data/summary/ that the user is told to edit.
    Returns the RadioButtons widget (pre-set to 'No') so the caller can keep it.
    """
    print(f"Do you want to do Feature Selection for {feature_label} \n (If yes, please edit list of codes in ./data/summary/{summary_file})")
    selector = widgets.RadioButtons(options=['Yes', 'No'], value='No')
    display(selector)
    return selector


# One selector per feature family that was extracted. The numbered widget names
# are kept because other cells refer to them.
if diag_flag:
    radio_input4 = _ask_feature_selection('Diagnosis', 'diag_features.csv')

if med_flag:
    radio_input5 = _ask_feature_selection('Medication', 'med_features.csv')

if proc_flag:
    radio_input6 = _ask_feature_selection('Procedures', 'proc_features.csv')

if out_flag:
    radio_input7 = _ask_feature_selection('Output event', 'out_features.csv')

if chart_flag:
    radio_input8 = _ask_feature_selection('Chart events', 'chart_features.csv')

print("**Please run below cell to perform feature selection**")


Do you want to do Feature Selection for Diagnosis 
 (If yes, please edit list of codes in ./data/summary/diag_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Medication 
 (If yes, please edit list of codes in ./data/summary/med_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Procedures 
 (If yes, please edit list of codes in ./data/summary/proc_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Output event 
 (If yes, please edit list of codes in ./data/summary/out_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Chart events 
 (If yes, please edit list of codes in ./data/summary/chart_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

**Please run below cell to perform feature selection**


In [29]:
# Feature selection is switched on for every feature family.
# NOTE(review): these flags are hard-coded. The Yes/No selectors displayed by
# the previous cell (all defaulting to 'No') are never read, so the choice made
# there has no effect - confirm this is intended.
select_diag = True
select_med = True
select_proc = True
select_out = True
select_chart = True

# Apply the selection to each extracted feature family.
features_selection_icu(cohort_output, diag_flag, proc_flag, med_flag, out_flag, chart_flag, select_diag, select_med, select_proc, select_out, select_chart)

[FEATURE SELECTION DIAGNOSIS DATA]
Total number of rows 1228579
[SUCCESSFULLY SAVED DIAGNOSIS DATA]
[FEATURE SELECTION MEDICATIONS DATA]
Total number of rows 4820051
[SUCCESSFULLY SAVED MEDICATIONS DATA]
[FEATURE SELECTION PROCEDURES DATA]
Total number of rows 678994
[SUCCESSFULLY SAVED PROCEDURES DATA]
[FEATURE SELECTION OUTPUT EVENTS DATA]
Total number of rows 4234967
[SUCCESSFULLY SAVED OUTPUT EVENTS DATA]
[FEATURE SELECTION CHART EVENTS DATA]
Total number of rows 76942666
[SUCCESSFULLY SAVED CHART EVENTS DATA]


## 6. CLEANING OF FEATURES
Below you will have the option to clean lab and chart events by performing outlier removal and unit conversion.

Outlier removal is performed to remove values higher than selected **right threshold** percentile and lower than selected **left threshold** percentile among all values for each itemid. 

**Please run below cell to select preprocessing for different features**

In [31]:
# Controls for cleaning chart-event values: what to do with outliers, and the
# right/left percentile thresholds. Only shown when chart events were selected.
# NOTE(review): this rebinds `radio_input5`, which the feature-selection cell
# used for the medication Yes/No selector.
if chart_flag:
    print("Outlier removal in values of chart events ?")
    layout = widgets.Layout(width='100%', height='40px')  # set width and height

    # Outlier strategy; defaults to doing nothing.
    radio_input5 = widgets.RadioButtons(
        options=['No outlier detection', 'Impute Outlier (default:98)', 'Remove outliers (default:98)'],
        value='No outlier detection', layout=layout)
    display(radio_input5)
    # Right (upper) threshold slider: 90-99, default 98.
    outlier = widgets.IntSlider(
        value=98,
        min=90,
        max=99,
        step=1,
        disabled=False, layout={'width': '100%'}
    )
    # Left (lower) threshold slider: 0-10, default 0.
    left_outlier = widgets.IntSlider(
        value=0,
        min=0,
        max=10,
        step=1,
        disabled=False, layout={'width': '100%'}
    )
    # Each slider is shown next to a fixed-width text label.
    display(widgets.HBox([widgets.Label('Right Outlier Threshold', layout={'width': '150px'}), outlier]))
    display(widgets.HBox([widgets.Label('Left Outlier Threshold', layout={'width': '150px'}), left_outlier]))

print("**Please run below cell to perform feature preprocessing**")

Outlier removal in values of chart events ?


RadioButtons(layout=Layout(height='40px', width='100%'), options=('No outlier detection', 'Impute Outlier (def…

HBox(children=(Label(value='Right Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=98, layou…

HBox(children=(Label(value='Left Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=0, layout=…

**Please run below cell to perform feature preprocessing**


In [32]:
# Chart-event cleaning, driven by the outlier controls from the previous cell.
if chart_flag:
    # Cleaning is on for either outlier option; imputation only for the 'Impute' one.
    clean_chart = radio_input5.value != 'No outlier detection'
    impute_outlier_chart = radio_input5.value == 'Impute Outlier (default:98)'
    thresh = outlier.value            # right threshold slider value
    left_thresh = left_outlier.value  # left threshold slider value

    # The two leading False values occupy the diagnosis positions used by the
    # earlier diagnosis-only call, so only chart events are processed here.
    preprocess_features_icu(cohort_output, False, False, chart_flag, clean_chart, impute_outlier_chart, thresh, left_thresh)

## 7. Time-Series Representation
In this section, please choose how you want to process and represent time-series data.

- First option is to select the length of time-series data you want to include for this study. (Default is 72 hours)

- Second option is to select bucket size which tells in what size time windows you want to divide your time-series.<br>
For example, if you select a bucket size of **2**, it will aggregate data for every 2 hours and <br>a time-series of length 24 hours will be represented as a time-series with 12 time-windows <br>where data for every 2 hours is aggregated from the original raw time-series.

During this step, we will also save the time-series data in data dictionaries in the format that can be directly used for following deep learning analysis.

### Imputation
You can also choose if you want to impute lab/chart values. The imputation will be done by forward fill and mean or median imputation.<br>
Values will be forward filled first and if no value exists for that admission we will use mean or median value for the patient.

The data dictionaries will be saved in **./data/dict/**

Please refer the readme to know the structure of data dictionaries.

**Please run below cell to select time-series representation**

In [36]:
# Force the mortality flag on, overriding whatever earlier cells derived from
# the task label. The next cell uses it to decide whether to show the
# prediction-window controls.
data_mort = True

In [37]:
# Controls for the time-series representation. Each setting is a radio group of
# presets plus a slider that is only used when 'Custom' is selected.
# NOTE(review): radio_input6/7/8 were previously bound to the feature-selection
# Yes/No selectors; they are rebound here with a different meaning.
print("=======Time-series Data Representation=======")

# How many hours of data to include (preset default: first 24 hours).
print("Length of data to be included for time-series prediction ?")
radio_input8 = widgets.RadioButtons(options=['First 72 hours', 'First 48 hours', 'First 24 hours', 'Custom'], value='First 24 hours')
display(radio_input8)
# Custom length slider: 24-72 hours.
text2 = widgets.IntSlider(
    value=72,
    min=24,
    max=72,
    step=1,
    description='First',
    disabled=False
)
display(widgets.HBox([widgets.Label('First (in hours):', layout={'width': '150px'}), text2]))

# Width of each aggregation bucket (preset default: 2 hours).
print("What time bucket size you want to choose ?")
radio_input7 = widgets.RadioButtons(options=['1 hour', '2 hour', '3 hour', '4 hour', '5 hour', 'Custom'], value='2 hour')
display(radio_input7)
# Custom bucket slider: 1-6 hours.
text1 = widgets.IntSlider(
    value=1,
    min=1,
    max=6,
    step=1,
    disabled=False
)
display(widgets.HBox([widgets.Label('Bucket Size (in hours):', layout={'width': '150px'}), text1]))

# Imputation strategy for lab/chart values (default: none).
print("Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?")
radio_impute = widgets.RadioButtons(options=['No Imputation', 'forward fill and mean', 'forward fill and median'], value='No Imputation')
display(radio_impute)

# Prediction-window controls are only created for the mortality task, so
# radio_input6/text3 keep their previous bindings when data_mort is False.
if data_mort:
    print("If you have chosen mortality prediction task, then what prediction window length do you want to keep?")
    radio_input6 = widgets.RadioButtons(options=['2 hours', '4 hours', '6 hours', '8 hours', 'Custom'], value='8 hours')
    display(radio_input6)
    # Custom prediction-window slider: 2-8 hours.
    text3 = widgets.IntSlider(
        value=2,
        min=2,
        max=8,
        step=1,
        disabled=False
    )
    display(widgets.HBox([widgets.Label('Prediction window (in hours)', layout={'width': '180px'}), text3]))

print("**Please run below cell to perform time-series representation and save in data dictionaries**")


Length of data to be included for time-series prediction ?


RadioButtons(index=2, options=('First 72 hours', 'First 48 hours', 'First 24 hours', 'Custom'), value='First 2…

HBox(children=(Label(value='First (in hours):', layout=Layout(width='150px')), IntSlider(value=72, description…

What time bucket size you want to choose ?


RadioButtons(index=1, options=('1 hour', '2 hour', '3 hour', '4 hour', '5 hour', 'Custom'), value='2 hour')

HBox(children=(Label(value='Bucket Size (in hours):', layout=Layout(width='150px')), IntSlider(value=1, max=6,…

Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?


RadioButtons(options=('No Imputation', 'forward fill and mean', 'forward fill and median'), value='No Imputati…

If you have chosen mortality prediction task, then what prediction window length do you want to keep?


RadioButtons(index=3, options=('2 hours', '4 hours', '6 hours', '8 hours', 'Custom'), value='8 hours')

HBox(children=(Label(value='Prediction window (in hours)', layout=Layout(width='180px')), IntSlider(value=2, m…

**Please run below cell to perform time-series representation and save in data dictionaries**


In [41]:
# Echo the current widget selections so the settings used for data generation
# are recorded in the notebook output.
# Order: included length, bucket size, prediction window, imputation.
print(radio_input8.value)
print(radio_input7.value)
print(radio_input6.value)
print(radio_impute.value)

First 24 hours
2 hour
8 hours
No Imputation


In [42]:
# Translate the widget selections into the settings used for data generation:
#   predW   - prediction window in hours
#   bucket  - time-bucket size in hours
#   include - hours of data to include
#   impute  - 'Mean', 'Median' or False
# Preset labels look like "8 hours" / "2 hour" / "First 24 hours". The number is
# taken as a whole whitespace-separated token rather than as the first
# character, so a multi-digit preset (e.g. "12 hours") is not silently
# truncated to its first digit.
if radio_input6.value == 'Custom':
    predW = int(text3.value)
else:
    predW = int(radio_input6.value.split()[0])

if radio_input7.value == 'Custom':
    bucket = int(text1.value)
else:
    bucket = int(radio_input7.value.split()[0])

if radio_input8.value == 'Custom':
    include = int(text2.value)
else:
    # "First <N> hours" -> the second token is the number of hours.
    include = int(radio_input8.value.split()[1])

if radio_impute.value == 'forward fill and mean':
    impute = 'Mean'
elif radio_impute.value == 'forward fill and median':
    impute = 'Median'
else:
    impute = False

In [44]:
# Build the time-series data dictionaries for the ICU cohort using the settings
# parsed in the previous cell.
# NOTE(review): the two cells that follow repeat this same Generator call; the
# recorded outputs show each run taking several hours, so one call is presumably
# enough - confirm and drop the duplicates.
if data_icu:
    gen=data_generation_icu.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,proc_flag,out_flag,chart_flag,med_flag,impute,include,bucket,predW)

[ READ COHORT ]


16it [10:41, 40.09s/it]


[ READ ALL FEATURES ]
include_time 24
[ PROCESSED TIME SERIES TO EQUAL LENGTH  ]


100%|██████████| 12/12 [00:13<00:00,  1.11s/it]


bucket 2
[ PROCESSED TIME SERIES TO EQUAL TIME INTERVAL ]
12


100%|██████████| 47581/47581 [6:26:56<00:00,  2.05it/s]   


[ SUCCESSFULLY SAVED DATA DICTIONARIES ]


In [45]:
# Build the data dictionaries, using the ICU or the non-ICU generator depending
# on the data source. For ICU data this repeats the call made in the previous cell.
if data_icu:
    gen=data_generation_icu.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,proc_flag,out_flag,chart_flag,med_flag,impute,include,bucket,predW)
    # (Earlier experiments that called Generator with fewer feature flags were
    # left here as commented-out code; they are not part of the pipeline.)
else:
    # NOTE(review): `lab_flag` is not assigned anywhere in the visible notebook,
    # so this branch would raise NameError if data_icu were ever False.
    gen=data_generation.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,lab_flag,proc_flag,med_flag,impute,include,bucket,predW)

[ READ COHORT ]


16it [05:16, 19.77s/it]


[ READ ALL FEATURES ]
include_time 24
[ PROCESSED TIME SERIES TO EQUAL LENGTH  ]


100%|██████████| 12/12 [00:04<00:00,  2.49it/s]


bucket 2
[ PROCESSED TIME SERIES TO EQUAL TIME INTERVAL ]
12


100%|██████████| 47581/47581 [5:26:46<00:00,  2.43it/s]   


[ SUCCESSFULLY SAVED DATA DICTIONARIES ]


In [46]:
# Re-parse the widget selections and regenerate the data dictionaries in one cell.
# NOTE(review): this repeats both the parsing cell and the Generator call made
# in the preceding cells - confirm whether this cell is still needed.
#
# Preset labels look like "8 hours" / "2 hour" / "First 24 hours". The number is
# taken as a whole whitespace-separated token rather than as the first
# character, so a multi-digit preset is not silently truncated.
if radio_input6.value == 'Custom':
    predW = int(text3.value)
else:
    predW = int(radio_input6.value.split()[0])

if radio_input7.value == 'Custom':
    bucket = int(text1.value)
else:
    bucket = int(radio_input7.value.split()[0])

if radio_input8.value == 'Custom':
    include = int(text2.value)
else:
    print("Value of radio_input8:", radio_input8.value)
    # "First <N> hours" -> the second token is the number of hours.
    include = int(radio_input8.value.split()[1])

if radio_impute.value == 'forward fill and mean':
    impute = 'Mean'
elif radio_impute.value == 'forward fill and median':
    impute = 'Median'
else:
    impute = False

if data_icu:
    gen=data_generation_icu.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,proc_flag,out_flag,chart_flag,med_flag,impute,include,bucket,predW)

Value of radio_input8: First 24 hours
[ READ COHORT ]


16it [09:05, 34.08s/it]


[ READ ALL FEATURES ]
include_time 24
[ PROCESSED TIME SERIES TO EQUAL LENGTH  ]


100%|██████████| 12/12 [00:07<00:00,  1.55it/s]


bucket 2
[ PROCESSED TIME SERIES TO EQUAL TIME INTERVAL ]
12


100%|██████████| 47581/47581 [6:00:54<00:00,  2.20it/s]  


[ SUCCESSFULLY SAVED DATA DICTIONARIES ]


# END OF ORIGINAL PIPELINE

# Group Minority Ethnicities into "OTHER"

In [54]:
def load_demo_data(stay_ids, base_path='./data/csv/'):
    """Load the per-stay demographic files into one DataFrame.

    Reads ``{base_path}{stay_id}/demo.csv`` for every id in ``stay_ids`` and
    stacks the results in iteration order.

    Parameters
    ----------
    stay_ids : iterable
        Stay identifiers; each one names a sub-folder of ``base_path``.
    base_path : str
        Folder prefix for the per-stay folders. It is concatenated as-is, so it
        must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        All rows from the files read, with an added ``stay_id`` column and a
        fresh 0..n-1 index. An empty DataFrame when ``stay_ids`` is empty.

    Raises
    ------
    FileNotFoundError
        If a stay's ``demo.csv`` does not exist (raised by ``pd.read_csv``).
    """
    frames = []
    for stay_id in stay_ids:
        stay_demo = pd.read_csv(f'{base_path}{stay_id}/demo.csv')
        # Tag the rows so they can be traced back to (and saved under) their stay.
        stay_demo['stay_id'] = stay_id
        frames.append(stay_demo)

    # pd.concat rejects an empty list, so keep the "no stays -> empty frame" result explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end: growing a DataFrame inside the loop re-copies
    # every row already collected on each iteration (quadratic in the number of stays).
    return pd.concat(frames, ignore_index=True)

# labels.csv supplies the list of stay ids (its 'stay_id' column) that have data
# under ./data/csv/.
labels = pd.read_csv('./data/csv/labels.csv')

# Read every listed stay's demo.csv into one DataFrame (one file read per stay).
demo_data = load_demo_data(labels['stay_id'])

In [55]:
# Frequency of each ethnicity value across the loaded stays (most common first).
# NOTE(review): the recorded output already shows only the six grouped
# categories, which suggests the demo.csv files had been overwritten by the
# save step at the end of this notebook before this cell was run - confirm.
ethnicity_counts = demo_data['ethnicity'].value_counts()
print(ethnicity_counts)

# Same for gender.
gender_counts = demo_data['gender'].value_counts()
print(gender_counts)

ethnicity
WHITE              19777
UNKNOWN             3490
BLACK               3025
OTHER               1525
HISPANIC/LATINO     1077
ASIAN                755
Name: count, dtype: int64
gender
M    16743
F    12906
Name: count, dtype: int64


In [45]:
# Collapse detailed ethnicity strings into their top-level category: any value
# that contains the keyword is replaced by the category name.
# The pairs are applied in order, each one to the result of the previous one
# (so a value already collapsed by an earlier keyword is tested as its new
# category name).
ethnicity_keywords = [
    ('WHITE', 'WHITE'),
    ('BLACK', 'BLACK'),
    ('ASIAN', 'ASIAN'),
    ('HISPANIC', 'HISPANIC/LATINO'),
    ('LATINO', 'HISPANIC/LATINO'),
]
for keyword, category in ethnicity_keywords:
    # regex=False makes this a plain substring test (same as the `in` operator);
    # na=False leaves missing values untouched instead of raising TypeError.
    has_keyword = demo_data['ethnicity'].str.contains(keyword, regex=False, na=False)
    demo_data.loc[has_keyword, 'ethnicity'] = category

# Verify the change by getting the distribution of the 'ethnicity' variable again
ethnicity_counts_updated = demo_data['ethnicity'].value_counts()
print(ethnicity_counts_updated)

ethnicity
WHITE                                        19777
BLACK                                         3025
UNKNOWN                                       3022
OTHER                                         1377
HISPANIC/LATINO                                948
ASIAN                                          755
UNABLE TO OBTAIN                               343
PORTUGUESE                                     129
PATIENT DECLINED TO ANSWER                     125
AMERICAN INDIAN/ALASKA NATIVE                   58
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER       40
SOUTH AMERICAN                                  29
MULTIPLE RACE/ETHNICITY                         21
Name: count, dtype: int64


In [48]:
# Fold the remaining small or non-informative categories into the main groups
# using one lookup table. Values that are not listed are left unchanged.
ethnicity_regrouping = {
    # Counted together with the Hispanic/Latino group
    'PORTUGUESE': 'HISPANIC/LATINO',
    # No usable answer recorded -> UNKNOWN
    'PATIENT DECLINED TO ANSWER': 'UNKNOWN',
    'UNABLE TO OBTAIN': 'UNKNOWN',
    # Small groups -> OTHER
    'AMERICAN INDIAN/ALASKA NATIVE': 'OTHER',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'OTHER',
    'SOUTH AMERICAN': 'OTHER',
    'MULTIPLE RACE/ETHNICITY': 'OTHER',
}
demo_data['ethnicity'] = demo_data['ethnicity'].replace(ethnicity_regrouping)

# Show the distribution again to confirm only the grouped categories remain.
ethnicity_counts_updated = demo_data['ethnicity'].value_counts()
print(ethnicity_counts_updated)

ethnicity
WHITE              19777
UNKNOWN             3490
BLACK               3025
OTHER               1525
HISPANIC/LATINO     1077
ASIAN                755
Name: count, dtype: int64


In [49]:
# Root folder that contains one `<stay_id>/demo.csv` per stay. The value is
# concatenated directly with the stay id, so it must end with a '/'.
base_path = './data/csv/'

def save_updated_data(demo_data, base_path='./data/csv/'):
    """Write the (modified) demographic rows back to each stay's demo.csv.

    For every distinct ``stay_id`` in ``demo_data``, all rows of that stay are
    written to ``{base_path}{stay_id}/demo.csv`` without the ``stay_id`` column
    and without the index. Existing files are overwritten in place.

    Parameters
    ----------
    demo_data : pandas.DataFrame
        Demographic rows with a ``stay_id`` column, e.g. as returned by
        ``load_demo_data``.
    base_path : str
        Folder prefix for the per-stay folders; must end with a path separator.
        Defaults to the same location ``load_demo_data`` reads from.

    Raises
    ------
    OSError
        If a stay's folder does not exist (folders are not created here).
    """
    # Group by stay rather than iterating row by row:
    #  * the stay id keeps its original dtype (iterrows upcasts ints to float
    #    when all columns are numeric, which would produce paths like "123.0/"),
    #  * a stay with several rows is written once with all of its rows instead
    #    of each row overwriting the previous one.
    for stay_id, stay_rows in demo_data.groupby('stay_id', sort=False):
        file_path = f'{base_path}{stay_id}/demo.csv'
        stay_rows.drop(columns=['stay_id']).to_csv(file_path, index=False)

# Persist the regrouped ethnicity values.
# WARNING: this overwrites each stay's original demo.csv in place, so the
# ungrouped values can only be recovered by regenerating the data dictionaries.
save_updated_data(demo_data)