## Imports

In [1]:
# autoreload import module on change (does not work with from x import y)
%load_ext autoreload
%autoreload 2

In [1]:
# Import functions
import pandas as pd
import numpy as np
from pathlib import Path
from mimic_constants import *
from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [3]:
# Import cleaned master dataframe
df_master_cleaned = get_master_df(idp=True)

## Removing columns with too many NaNs

In [4]:
df = df_master_cleaned
select_col_groups = []
removed_col_groups = []
og_col_groups = [chart_labels_mean_cols, chart_labels_max_cols, chart_labels_min_cols, lab_labels_mean_cols, lab_labels_max_cols, lab_labels_min_cols]
for col_group in og_col_groups:
    nan_counts = df_master_cleaned[col_group].isna().sum()
    too_many_nans = nan_counts[nan_counts > 500].index.tolist()
    df = df.drop(too_many_nans, axis=1)
    col_group = list(set(col_group) - set(too_many_nans))
    select_col_groups.append(col_group)
    removed_col_groups.append(too_many_nans)

In [5]:
df[select_col_groups[0]].head(2)

Unnamed: 0,Temp(F)_mean,Sodium_mean,PlateletCount_mean,HR_mean,NBPm_mean,NBPs_mean,RR_mean,SpO2_mean,NBPd_mean
0,98.98913,136.5,136.2,70.8,90.529412,129.176471,19.044444,97.648649,74.669683
1,97.5,138.333333,251.333333,94.214286,75.4,110.5,15.185185,96.730769,64.5


In [6]:
df[select_col_groups[3]].head(2)

Unnamed: 0,Glucose_mean,Creatinine_mean,Chloride_mean,Magnesium_mean,PTT_mean,Calcium_Total_mean,Phosphate_mean,Urea_Nitrogren_mean,Hematocrit_mean,Potassium_mean
0,149.444444,0.905556,95.777778,2.194444,58.061765,8.994444,3.172222,25.611111,29.436842,4.638889
1,181.75,1.0,102.142857,2.1,26.3,,,15.6,31.085714,4.214286


In [7]:
df.dropna().shape

(2265, 57)

In [8]:
df.isna().sum()[(df.isna().sum() > 0) & (df.isna().sum() < 1000)]

NBPs_mean               20
NBPd_mean               22
NBPm_mean               18
SpO2_mean                2
Sodium_mean             68
Temp(F)_mean            27
PlateletCount_mean      88
NBPs_max                20
NBPd_max                22
NBPm_max                18
Sodium_max              68
NBPs_min                20
NBPd_min                22
NBPm_min                18
SpO2_min                 2
Sodium_min              68
Calcium_Total_mean      45
Chloride_mean           13
Creatinine_mean         13
Glucose_mean            13
Magnesium_mean          15
Phosphate_mean          40
Potassium_mean          13
Urea_Nitrogren_mean     13
Hematocrit_mean         13
PTT_mean               204
Potassium_max           13
Potassium_min           13
CTR                     36
CPAR                    74
dtype: int64

In [9]:
df_master_cleaned[removed_col_groups[0]].tail(2)

Unnamed: 0,ABPm_mean,ABPd_mean,ABPs_mean,FiO2_mean,PH_mean,PCO2_mean,SaO2_mean,Albumin_mean,Cholesterol_mean,PO2_mean
2660,,,,,,,,2.4,,
2661,76.461538,54.538462,114.692308,60.555556,,60.666667,98.0,,,130.0


In [10]:
df_master_cleaned[removed_col_groups[3]].tail(2)

Unnamed: 0,Tidal_Volume_mean,Alkaline_Phosphatase_mean,Lactate_mean,Bilirubin,Troponin-T_mean,Hemoglobin_mean,ALT_mean,WBC_mean,Fibrinogen
2660,,414.083333,1.37619,0.9,3.237778,8.085714,88.083333,9.0,288.0
2661,400.0,,0.8,,,12.0,,,


Heart Score, PERC Score, Canadian C-Spine/Head Injuries: Regressions used in clinic to rule out certain diseases

WELLS Criteria, Ottawa Ankle, PERC: useful to know whether or not I need to get imaging 

Lactate - Pneumonia
Tidal - Lung pathologies
Troponin-T - Heart Attacks Acute setting (1 week) trop up — can lead to early findings of higher heart attacks chances 
WBC - infections

## Prep data for Cardiomegaly

In [11]:
del df
label = 'Cardiomegaly'

In [12]:
df_train = pd.read_csv('/home/ays124/mimic/cardiomegaly/cross-val/densenet-xray-age_chloride_rr_urea_nitrogren_magnesium_glucose_phosphate_hematocrit-idp/train.csv')
df_val = pd.read_csv('/home/ays124/mimic/cardiomegaly/cross-val/densenet-xray-age_chloride_rr_urea_nitrogren_magnesium_glucose_phosphate_hematocrit-idp/val.csv')
df_test = pd.read_csv('/home/ays124/mimic/cardiomegaly/cross-val/densenet-xray-age_chloride_rr_urea_nitrogren_magnesium_glucose_phosphate_hematocrit-idp/test.csv')
f'Number of Total Train Samples: {len(df_train)}'
f'Number of Total Val Samples: {len(df_val)}'
f'Number of Total Test Samples: {len(df_test)}'

'Number of Total Test Samples: 537'

In [13]:
for df in [df_train, df_val, df_test]:
    df['age_label'] = df['anchor_age'].apply(lambda x: min(x / 100, 1))
    df = standardize_mimic_ethnicity(df)
    df['race_label'] = df['ethnicity']
    df.loc[df['race_label'] == 'White', 'race_label'] = 0
    df.loc[df['race_label'] == 'Asian', 'race_label'] = 1
    df.loc[df['race_label'] == 'Black', 'race_label'] = 2
    df.loc[df['race_label'] == 'Hispanic/Latino', 'race_label'] = 3
    df.loc[df['race_label'] == 'Other', 'race_label'] = 4

    df['sex_label'] = df['gender']
    df.loc[df['sex_label'] == 'M', 'sex_label'] = 0
    df.loc[df['sex_label'] == 'F', 'sex_label'] = 1

In [14]:
demographic_cols = ['age_label', 'race_label', 'sex_label']

In [15]:
X_train = df_train[demographic_cols + select_col_groups[0] + select_col_groups[3]]
Y_train = df_train[[label]]
X_val   = df_val[demographic_cols + select_col_groups[0] + select_col_groups[3]]
Y_val   = df_val[[label]]
X_test  = df_test[demographic_cols + select_col_groups[0] + select_col_groups[3]]
Y_test  = df_test[[label]]

In [16]:
# 2. Mean Imputation for NaNs in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# 3. One-Hot Encoding for 'race_label'
X_train_encoded = pd.get_dummies(X_train_imputed, columns=['race_label'], drop_first=True, dtype=float)
X_test_encoded = pd.get_dummies(X_test_imputed, columns=['race_label'], drop_first=True, dtype=float)

# Ensure that the train and test sets have the same columns after encoding
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

X_train_encoded = sm.add_constant(X_train_encoded)  # Add intercept term
X_test_encoded = sm.add_constant(X_test_encoded)

X_train_encoded = X_train_encoded.reset_index(drop=True)
X_test_encoded = X_test_encoded.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [17]:
filtered_mean_columns = select_col_groups[0] + select_col_groups[3]

In [18]:
filtered_mean_columns

['Temp(F)_mean',
 'Sodium_mean',
 'PlateletCount_mean',
 'HR_mean',
 'NBPm_mean',
 'NBPs_mean',
 'RR_mean',
 'SpO2_mean',
 'NBPd_mean',
 'Glucose_mean',
 'Creatinine_mean',
 'Chloride_mean',
 'Magnesium_mean',
 'PTT_mean',
 'Calcium_Total_mean',
 'Phosphate_mean',
 'Urea_Nitrogren_mean',
 'Hematocrit_mean',
 'Potassium_mean']

In [19]:
len(filtered_mean_columns)

19

## Logistic Regression

In [20]:
# 4. Logistic Regression using Statsmodels
logit_model = sm.Logit(Y_train, X_train_encoded)
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.557245
         Iterations 10


In [21]:
result.summary2()

0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,Cardiomegaly,Pseudo R-squared:,0.091
Date:,2024-09-07 22:05,AIC:,2112.6929
No. Observations:,1849,BIC:,2256.2753
Df Model:,25,Log-Likelihood:,-1030.3
Df Residuals:,1823,LL-Null:,-1133.9
Converged:,1.0000,LLR p-value:,1.2574e-30
No. Iterations:,10.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-2.9762,8.9330,-0.3332,0.7390,-20.4845,14.5322
age_label,2.3335,0.4042,5.7737,0.0000,1.5414,3.1257
sex_label,0.1096,0.1141,0.9601,0.3370,-0.1141,0.3332
Temp(F)_mean,-0.0144,0.0857,-0.1681,0.8665,-0.1824,0.1536
Sodium_mean,0.0001,0.0002,0.4169,0.6768,-0.0003,0.0004
PlateletCount_mean,-0.0000,0.0000,-0.2017,0.8401,-0.0000,0.0000
HR_mean,-0.0163,0.0044,-3.7068,0.0002,-0.0249,-0.0077
NBPm_mean,0.0056,0.0095,0.5918,0.5540,-0.0130,0.0242
NBPs_mean,-0.0097,0.0051,-1.8993,0.0575,-0.0197,0.0003


In [22]:
# Z-scores of the coefficients
z_scores = pd.concat([result.summary2().tables[1]['z'], result.summary2().tables[1]['P>|z|']], axis=1)

# Predictions and evaluations
Y_test_pred_prob = result.predict(X_test_encoded)
Y_test_pred = (Y_test_pred_prob > 0.5).astype(int)

auc_score = roc_auc_score(Y_test, Y_test_pred_prob)
f1 = f1_score(Y_test, Y_test_pred)
conf_matrix = confusion_matrix(Y_test, Y_test_pred)

In [23]:
# Outputs
print(f'AUC: {auc_score}', f'F1 Score: {f1}', f'Confusion Matrix:\n{conf_matrix}', sep='\n')
pd.set_option('display.float_format', '{:.4f}'.format)

AUC: 0.738262599469496
F1 Score: 0.8344988344988346
Confusion Matrix:
[[ 37 123]
 [ 19 358]]


In [24]:
# add model predictions
df_pred_xray = pd.read_csv(k_fold_test_pred_csv_path['xray'])
df_pred_noise = pd.read_csv(k_fold_test_pred_csv_path['noise'])
df_pred_blank = pd.read_csv(k_fold_test_pred_csv_path['blank'])
Y_test['target'] = df_pred_xray['target_0'] 
Y_test['xray_prob'] = df_pred_xray['class_0']
Y_test['noise_prob'] = df_pred_noise['class_0']
Y_test['blank_prob'] = df_pred_blank['class_0']
Y_test['lr_prob'] = pd.Series(Y_test_pred_prob)

In [25]:
Y_test

Unnamed: 0,Cardiomegaly,target,xray_prob,noise_prob,blank_prob,lr_prob
0,0,0.0000,0.9044,0.5408,0.3556,0.5766
1,1,1.0000,1.0000,0.9636,0.8258,0.8000
2,1,1.0000,0.9973,0.8662,0.8625,0.7398
3,1,1.0000,0.9987,0.9227,0.8623,0.5683
4,0,0.0000,0.1533,0.5533,0.3473,0.6183
...,...,...,...,...,...,...
532,1,1.0000,0.9686,0.5765,0.7308,0.6373
533,0,0.0000,0.2282,0.5633,0.7388,0.4275
534,1,1.0000,0.9830,0.5703,0.6527,0.4156
535,0,0.0000,0.1033,0.5389,0.5747,0.4049
