## Imports

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported when their source changes and the kernel need not be restarted.
# Caveat (from the original note): this does not work with `from x import y`.
%load_ext autoreload
%autoreload 2

In [2]:
# Import functions
import pandas as pd
import numpy as np
from pathlib import Path
from mimic_constants import *
from sklearn.ensemble import HistGradientBoostingClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [4]:
# Import cleaned master dataframe
# `get_master_df` is not defined in this notebook; presumably it comes from the
# star import of `mimic_constants`.
# NOTE(review): the meaning of `idp=True` is not visible here - confirm in that module.
df_master_cleaned = get_master_df(idp=True)

## Removing columns with too many NaNs

In [5]:
# A column is dropped when it has more than this many missing values.
MAX_NAN_COUNT = 1000

# Feature families in a fixed order; later cells index into the per-group
# results by position (0 = chart means, 3 = lab means).
og_col_groups = [chart_labels_mean_cols, chart_labels_max_cols, chart_labels_min_cols, lab_labels_mean_cols, lab_labels_max_cols, lab_labels_min_cols]

df = df_master_cleaned
select_col_groups = []   # per group: columns kept
removed_col_groups = []  # per group: columns dropped for having too many NaNs
for col_group in og_col_groups:
    nan_counts = df_master_cleaned[col_group].isna().sum()
    too_many_nans = nan_counts[nan_counts > MAX_NAN_COUNT].index.tolist()
    df = df.drop(columns=too_many_nans)

    # Keep the survivors in their original order. A set difference would order
    # them by string hash, which is randomised per Python process, so the
    # feature order (and every table built from it) would change between
    # kernel restarts. dict.fromkeys de-duplicates while preserving order.
    dropped = set(too_many_nans)
    kept_cols = [col for col in dict.fromkeys(col_group) if col not in dropped]

    select_col_groups.append(kept_cols)
    removed_col_groups.append(too_many_nans)

In [6]:
# Preview the retained columns of group 0 (derived from chart_labels_mean_cols).
df[select_col_groups[0]].head(2)

Unnamed: 0,RR_mean,HR_mean,SpO2_mean,PlateletCount_mean,Sodium_mean,NBPs_mean,NBPm_mean,NBPd_mean,Temp(F)_mean
0,19.044444,70.8,97.648649,136.2,136.5,129.176471,90.529412,74.669683,98.98913
1,15.185185,94.214286,96.730769,251.333333,138.333333,110.5,75.4,64.5,97.5


In [7]:
# Preview the retained columns of group 3 (derived from lab_labels_mean_cols).
df[select_col_groups[3]].head(2)

Unnamed: 0,Creatinine_mean,Glucose_mean,Lactate_mean,Hematocrit_mean,ALT_mean,Magnesium_mean,PTT_mean,Urea_Nitrogren_mean,Potassium_mean,Phosphate_mean,Bilirubin,Alkaline_Phosphatase_mean,Calcium_Total_mean,Chloride_mean
0,0.905556,149.444444,1.8,29.436842,521.636364,2.194444,58.061765,25.611111,4.638889,3.172222,0.372727,56.727273,8.994444,95.777778
1,1.0,181.75,2.82,31.085714,,2.1,26.3,15.6,4.214286,,,,,102.142857


In [8]:
# Shape the frame would have if every row containing at least one NaN were
# discarded (complete cases only); `df` itself is not modified.
df.dropna().shape

(1304, 61)

In [9]:
# Missing-value counts for columns of `df` that still contain NaNs.
# The counts are computed once instead of three times. The upper bound is
# inclusive to mirror the drop rule above (columns are removed only when they
# have *more than* 1000 NaNs), so a retained column with exactly 1000 NaNs is
# not hidden from this summary.
remaining_nan_counts = df.isna().sum()
remaining_nan_counts[(remaining_nan_counts > 0) & (remaining_nan_counts <= 1000)]

NBPs_mean                     20
NBPd_mean                     22
NBPm_mean                     18
SpO2_mean                      2
Sodium_mean                   68
Temp(F)_mean                  27
PlateletCount_mean            88
NBPs_max                      20
NBPd_max                      22
NBPm_max                      18
Sodium_max                    68
NBPs_min                      20
NBPd_min                      22
NBPm_min                      18
SpO2_min                       2
Sodium_min                    68
Lactate_mean                 734
ALT_mean                     858
Alkaline_Phosphatase_mean    872
Bilirubin                    854
Calcium_Total_mean            45
Chloride_mean                 13
Creatinine_mean               13
Glucose_mean                  13
Magnesium_mean                15
Phosphate_mean                40
Potassium_mean                13
Urea_Nitrogren_mean           13
Hematocrit_mean               13
PTT_mean                     204
Potassium_

In [10]:
# Last rows of the group-0 (chart mean) columns that were dropped for having
# too many NaNs, read from the unfiltered master frame.
df_master_cleaned[removed_col_groups[0]].tail(2)

Unnamed: 0,ABPm_mean,ABPd_mean,ABPs_mean,FiO2_mean,PH_mean,PCO2_mean,SaO2_mean,Albumin_mean,Cholesterol_mean,PO2_mean
2660,,,,,,,,2.4,,
2661,76.461538,54.538462,114.692308,60.555556,,60.666667,98.0,,,130.0


In [11]:
# Last rows of the group-3 (lab mean) columns that were dropped for having
# too many NaNs, read from the unfiltered master frame.
df_master_cleaned[removed_col_groups[3]].tail(2)

Unnamed: 0,Tidal_Volume_mean,Troponin-T_mean,Hemoglobin_mean,WBC_mean,Fibrinogen
2660,,3.237778,8.085714,9.0,288.0
2661,400.0,,12.0,,


HEART Score, PERC Score, Canadian C-Spine/Head Injury rules: regressions used in the clinic to rule out certain diseases.

Wells Criteria, Ottawa Ankle Rules, PERC: useful for deciding whether or not imaging is needed.

- Lactate: pneumonia
- Tidal volume: lung pathologies
- Troponin-T: heart attacks. In the acute setting (about 1 week) troponin goes up, which can give an early indication of a higher chance of a heart attack.
- WBC: infections

## Prep data for Cardiomegaly

In [28]:
# Drop the name bound to the NaN-filtered master frame; the cells below read
# pre-split CSV files instead.
del df
# Name of the target column selected from each split as the label.
label = 'Cardiomegaly'

In [29]:
# Directory holding the train/val/test CSVs for this experiment.
# NOTE(review): absolute, user-specific path - consider a configurable data root.
SPLIT_DIR = Path('/home/ays124/mimic/cardiomegaly/cross-val/densenet-xray-age_chloride_rr_urea_nitrogren_magnesium_glucose_phosphate_hematocrit-idp')

df_train = pd.read_csv(SPLIT_DIR / 'train.csv')
df_val = pd.read_csv(SPLIT_DIR / 'val.csv')
df_test = pd.read_csv(SPLIT_DIR / 'test.csv')

# Print every split size explicitly. A bare expression is echoed only when it
# is the last statement of a cell, so the train and val counts were previously
# computed and thrown away.
print(f'Number of Total Train Samples: {len(df_train)}')
print(f'Number of Total Val Samples: {len(df_val)}')
print(f'Number of Total Test Samples: {len(df_test)}')

'Number of Total Test Samples: 537'

In [30]:
# Integer codes assigned to the categorical demographic values.
RACE_CODES = {'White': 0, 'Asian': 1, 'Black': 2, 'Hispanic/Latino': 3, 'Other': 4}
SEX_CODES = {'M': 0, 'F': 1}

for df in [df_train, df_val, df_test]:
    # Age divided by 100 and capped at 1 (vectorised form of min(x / 100, 1)).
    df['age_label'] = (df['anchor_age'] / 100).clip(upper=1)
    df = standardize_mimic_ethnicity(df)

    # Values not listed in the code tables are left untouched, exactly as with
    # the original one-assignment-per-value form.
    df['race_label'] = df['ethnicity']
    for race_name, race_code in RACE_CODES.items():
        df.loc[df['race_label'] == race_name, 'race_label'] = race_code

    df['sex_label'] = df['gender']
    for sex_name, sex_code in SEX_CODES.items():
        df.loc[df['sex_label'] == sex_name, 'sex_label'] = sex_code

# `df` is rebound to the return value of standardize_mimic_ethnicity inside the
# loop. If that helper ever returns a copy instead of the frame it was given,
# the race/sex columns would be written to the copy and silently lost, so
# verify that they actually landed on the split frames.
for split_name, split_df in [('train', df_train), ('val', df_val), ('test', df_test)]:
    missing_labels = [col for col in ('age_label', 'race_label', 'sex_label') if col not in split_df.columns]
    assert not missing_labels, f'{split_name} split is missing {missing_labels}'

In [31]:
# Demographic columns that are prepended to the chart/lab features when the
# design matrices are selected from each split.
demographic_cols = ['age_label', 'race_label', 'sex_label']

In [32]:
# Features: demographics + retained chart means (group 0) + retained lab means
# (group 3). The target is kept as a single-column frame.
feature_cols = demographic_cols + select_col_groups[0] + select_col_groups[3]

X_train, Y_train = df_train[feature_cols], df_train[[label]]
X_val, Y_val = df_val[feature_cols], df_val[[label]]
X_test, Y_test = df_test[feature_cols], df_test[[label]]

In [33]:
# 2. Mean Imputation for NaNs, fitted on X_train only and re-applied to X_test
# NOTE(review): 'race_label' and 'sex_label' are categorical codes but are
# mean-imputed together with the numeric features; a missing value there would
# become a fractional "category". Confirm these columns never contain NaNs.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# 3. One-Hot Encoding for 'race_label'
# The baseline category is chosen on the training split only (drop_first=True).
# The test split is expanded into *all* of its categories and then aligned to
# the training columns. Applying drop_first to the test split on its own would
# drop whichever category happens to sort first in that split; if that differs
# from the training baseline, the reindex would zero-fill a real category and
# silently recode those rows as the baseline.
X_train_encoded = pd.get_dummies(X_train_imputed, columns=['race_label'], drop_first=True, dtype=float)
X_test_encoded = pd.get_dummies(X_test_imputed, columns=['race_label'], dtype=float)

# Ensure that the train and test sets have the same columns after encoding
# (drops the baseline/unseen categories, adds all-zero columns for absent ones).
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

X_train_encoded = sm.add_constant(X_train_encoded)  # Add intercept term
# has_constant='add' forces the intercept column. The default ('skip') returns
# the frame unchanged when it already holds a constant non-zero column, which
# would leave the test matrix without the intercept the model was fitted with.
X_test_encoded = sm.add_constant(X_test_encoded, has_constant='add')
# Final alignment so the test matrix has exactly the training columns, in order.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns)

X_train_encoded = X_train_encoded.reset_index(drop=True)
X_test_encoded = X_test_encoded.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

## Logistic Regression

In [34]:
# 4. Logistic Regression using Statsmodels
# Endogenous variable: the single-column label frame Y_train.
# Exogenous variables: the imputed, one-hot-encoded training matrix, including
# the intercept column added by sm.add_constant.
logit_model = sm.Logit(Y_train, X_train_encoded)
# Fit with statsmodels' default settings; convergence info is printed to stdout.
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.555417
         Iterations 10


In [35]:
# Display the fit statistics and the per-coefficient table of the fitted model.
result.summary2()

0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,Cardiomegaly,Pseudo R-squared:,0.094
Date:,2024-09-04 14:45,AIC:,2113.9316
No. Observations:,1849,BIC:,2279.6036
Df Model:,29,Log-Likelihood:,-1027.0
Df Residuals:,1819,LL-Null:,-1133.9
Converged:,1.0000,LLR p-value:,4.2698e-30
No. Iterations:,10.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-2.4407,8.9968,-0.2713,0.7862,-20.0741,15.1927
age_label,2.2058,0.4088,5.3959,0.0000,1.4046,3.0070
sex_label,0.0972,0.1144,0.8498,0.3954,-0.1270,0.3214
RR_mean,0.0361,0.0170,2.1211,0.0339,0.0027,0.0695
HR_mean,-0.0160,0.0044,-3.6180,0.0003,-0.0246,-0.0073
SpO2_mean,0.0284,0.0252,1.1292,0.2588,-0.0209,0.0777
PlateletCount_mean,-0.0000,0.0000,-0.1770,0.8595,-0.0000,0.0000
Sodium_mean,0.0001,0.0002,0.4188,0.6753,-0.0003,0.0004
NBPs_mean,-0.0100,0.0052,-1.9221,0.0546,-0.0202,0.0002


In [36]:
# Z-scores and p-values of the coefficients. The summary is built once and both
# columns are selected from the same coefficient table.
coef_table = result.summary2().tables[1]
z_scores = coef_table[['z', 'P>|z|']]

# Predictions and evaluations
# Predicted probabilities on the test split, turned into class predictions
# with a 0.5 threshold.
Y_test_pred_prob = result.predict(X_test_encoded)
Y_test_pred = (Y_test_pred_prob > 0.5).astype(int)

# Score against the label column explicitly rather than the whole Y_test frame:
# a later cell adds prediction columns to Y_test, after which passing the full
# frame would make this cell fail when re-run.
Y_test_true = Y_test[label]
auc_score = roc_auc_score(Y_test_true, Y_test_pred_prob)
f1 = f1_score(Y_test_true, Y_test_pred)
conf_matrix = confusion_matrix(Y_test_true, Y_test_pred)

In [37]:
# Report the test-set metrics, one item per line.
report_lines = [
    f'AUC: {auc_score}',
    f'F1 Score: {f1}',
    f'Confusion Matrix:\n{conf_matrix}',
]
print('\n'.join(report_lines))

# From here on, render floats in pandas output with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

AUC: 0.7429210875331564
F1 Score: 0.8331388564760793
Confusion Matrix:
[[ 37 123]
 [ 20 357]]


In [38]:
# add model predictions
# Per-model test predictions are read from the CSV paths listed in
# k_fold_test_pred_csv_path (keys: 'xray', 'noise', 'blank').
df_pred_xray = pd.read_csv(k_fold_test_pred_csv_path['xray'])
df_pred_noise = pd.read_csv(k_fold_test_pred_csv_path['noise'])
df_pred_blank = pd.read_csv(k_fold_test_pred_csv_path['blank'])

# Column assignment aligns on the index, so a prediction file with a different
# number of rows would be padded with NaN or truncated without any error.
# Fail loudly instead.
# NOTE(review): this checks length only; it assumes the prediction files are in
# the same row order as the test split - confirm.
for pred_name, df_pred in [('xray', df_pred_xray), ('noise', df_pred_noise), ('blank', df_pred_blank)]:
    assert len(df_pred) == len(Y_test), (
        f'{pred_name} predictions have {len(df_pred)} rows, expected {len(Y_test)}'
    )

Y_test['target'] = df_pred_xray['target_0']
Y_test['xray_prob'] = df_pred_xray['class_0']
Y_test['noise_prob'] = df_pred_noise['class_0']
Y_test['blank_prob'] = df_pred_blank['class_0']
Y_test['lr_prob'] = pd.Series(Y_test_pred_prob)

In [40]:
# Show the test labels next to the prediction columns added to Y_test.
Y_test

Unnamed: 0,Cardiomegaly,target,xray_prob,noise_prob,blank_prob,lr_prob
0,0,0.0000,0.9044,0.5408,0.3556,0.5741
1,1,1.0000,1.0000,0.9636,0.8258,0.8079
2,1,1.0000,0.9973,0.8662,0.8625,0.7560
3,1,1.0000,0.9987,0.9227,0.8623,0.5657
4,0,0.0000,0.1533,0.5533,0.3473,0.6202
...,...,...,...,...,...,...
532,1,1.0000,0.9686,0.5765,0.7308,0.6252
533,0,0.0000,0.2282,0.5633,0.7388,0.4227
534,1,1.0000,0.9830,0.5703,0.6527,0.2020
535,0,0.0000,0.1033,0.5389,0.5747,0.2920
