## Imports

In [1]:
# autoreload import module on change (does not work with from x import y)
%load_ext autoreload
%autoreload 2

In [2]:
# Import functions
import pandas as pd
import numpy as np
from pathlib import Path
from mimic_constants import *
from sklearn.ensemble import HistGradientBoostingClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [4]:
# Import cleaned master dataframe
df_master_cleaned = get_master_df(idp=True)

## Removing columns with too many NaNs

In [5]:
df = df_master_cleaned
select_col_groups = []
removed_col_groups = []
og_col_groups = [chart_labels_mean_cols, chart_labels_max_cols, chart_labels_min_cols, lab_labels_mean_cols, lab_labels_max_cols, lab_labels_min_cols]
for col_group in og_col_groups:
    nan_counts = df_master_cleaned[col_group].isna().sum()
    too_many_nans = nan_counts[nan_counts > 1000].index.tolist()
    df = df.drop(too_many_nans, axis=1)
    col_group = list(set(col_group) - set(too_many_nans))
    select_col_groups.append(col_group)
    removed_col_groups.append(too_many_nans)

In [6]:
df[select_col_groups[0]].head(2)

Unnamed: 0,PlateletCount_mean,SpO2_mean,RR_mean,NBPd_mean,HR_mean,Temp(F)_mean,Sodium_mean,NBPs_mean,NBPm_mean
0,136.2,97.648649,19.044444,74.669683,70.8,98.98913,136.5,129.176471,90.529412
1,251.333333,96.730769,15.185185,64.5,94.214286,97.5,138.333333,110.5,75.4


In [7]:
df[select_col_groups[3]].head(2)

Unnamed: 0,Bilirubin,Potassium_mean,Lactate_mean,Phosphate_mean,Glucose_mean,Hematocrit_mean,Calcium_Total_mean,PTT_mean,Alkaline_Phosphatase_mean,Creatinine_mean,Urea_Nitrogren_mean,ALT_mean,Magnesium_mean,Chloride_mean
0,0.372727,4.638889,1.8,3.172222,149.444444,29.436842,8.994444,58.061765,56.727273,0.905556,25.611111,521.636364,2.194444,95.777778
1,,4.214286,2.82,,181.75,31.085714,,26.3,,1.0,15.6,,2.1,102.142857


In [8]:
df.dropna().shape

(1304, 61)

In [9]:
df.isna().sum()[(df.isna().sum() > 0) & (df.isna().sum() < 1000)]

NBPs_mean                     20
NBPd_mean                     22
NBPm_mean                     18
SpO2_mean                      2
Sodium_mean                   68
Temp(F)_mean                  27
PlateletCount_mean            88
NBPs_max                      20
NBPd_max                      22
NBPm_max                      18
Sodium_max                    68
NBPs_min                      20
NBPd_min                      22
NBPm_min                      18
SpO2_min                       2
Sodium_min                    68
Lactate_mean                 734
ALT_mean                     858
Alkaline_Phosphatase_mean    872
Bilirubin                    854
Calcium_Total_mean            45
Chloride_mean                 13
Creatinine_mean               13
Glucose_mean                  13
Magnesium_mean                15
Phosphate_mean                40
Potassium_mean                13
Urea_Nitrogren_mean           13
Hematocrit_mean               13
PTT_mean                     204
Potassium_

In [10]:
df_master_cleaned[removed_col_groups[0]].tail(2)

Unnamed: 0,ABPm_mean,ABPd_mean,ABPs_mean,FiO2_mean,PH_mean,PCO2_mean,SaO2_mean,Albumin_mean,Cholesterol_mean,PO2_mean
2660,,,,,,,,2.4,,
2661,76.461538,54.538462,114.692308,60.555556,,60.666667,98.0,,,130.0


In [11]:
df_master_cleaned[removed_col_groups[3]].tail(2)

Unnamed: 0,Tidal_Volume_mean,Troponin-T_mean,Hemoglobin_mean,WBC_mean,Fibrinogen
2660,,3.237778,8.085714,9.0,288.0
2661,400.0,,12.0,,


Heart Score, PERC Score, Canadian C-Spine/Head Injuries: Regressions used in clinic to rule out certain diseases

WELLS Criteria, Ottawa Ankle, PERC: useful to know whether or not I need to get imaging 

Lactate - Pneumonia
Tidal - Lung pathologies
Troponin-T - Heart Attacks Acute setting (1 week) trop up — can lead to early findings of higher heart attacks chances 
WBC - infections

## Prep data for Cardiomegaly

In [12]:
label = 'Cardiomegaly'
df = df[df[label].isin([0, 1])]  # pick only positive and negative samples from label
f'Number of Total Samples: {len(df)}'

'Number of Total Samples: 2662'

In [13]:
df['age_label'] = df['anchor_age'].apply(lambda x: min(x / 100, 1))
df = standardize_mimic_ethnicity(df)
df['race_label'] = df['ethnicity']
df.loc[df['race_label'] == 'White', 'race_label'] = 0
df.loc[df['race_label'] == 'Asian', 'race_label'] = 1
df.loc[df['race_label'] == 'Black', 'race_label'] = 2
df.loc[df['race_label'] == 'Hispanic/Latino', 'race_label'] = 3
df.loc[df['race_label'] == 'Other', 'race_label'] = 4

df['sex_label'] = df['gender']
df.loc[df['sex_label'] == 'M', 'sex_label'] = 0
df.loc[df['sex_label'] == 'F', 'sex_label'] = 1

In [14]:
demographic_cols = ['age_label', 'race_label', 'sex_label']

In [15]:
X = df[demographic_cols + select_col_groups[0] + select_col_groups[3]]
Y = df[[label]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y[label])

In [16]:
# 2. Mean Imputation for NaNs in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# 3. One-Hot Encoding for 'race_label'
X_train_encoded = pd.get_dummies(X_train_imputed, columns=['race_label'], drop_first=True, dtype=float)
X_test_encoded = pd.get_dummies(X_test_imputed, columns=['race_label'], drop_first=True, dtype=float)

# Ensure that the train and test sets have the same columns after encoding
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

X_train_encoded = sm.add_constant(X_train_encoded)  # Add intercept term
X_test_encoded = sm.add_constant(X_test_encoded)

X_train_encoded = X_train_encoded.reset_index(drop=True)
X_test_encoded = X_test_encoded.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

## Logistic Regression

In [17]:
# 4. Logistic Regression using Statsmodels
logit_model = sm.Logit(Y_train, X_train_encoded)
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.548031
         Iterations 10


In [18]:
result.summary2()

0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,Cardiomegaly,Pseudo R-squared:,0.104
Date:,2024-09-04 18:37,AIC:,2393.5167
No. Observations:,2129,BIC:,2563.4189
Df Model:,29,Log-Likelihood:,-1166.8
Df Residuals:,2099,LL-Null:,-1302.5
Converged:,1.0000,LLR p-value:,3.4040e-41
No. Iterations:,10.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,2.8440,5.5694,0.5107,0.6096,-8.0718,13.7599
age_label,2.6437,0.3820,6.9214,0.0000,1.8951,3.3923
sex_label,0.0776,0.1084,0.7160,0.4740,-0.1349,0.2902
PlateletCount_mean,-0.0000,0.0000,-0.3121,0.7550,-0.0000,0.0000
SpO2_mean,-0.0006,0.0008,-0.7422,0.4580,-0.0023,0.0010
RR_mean,0.0121,0.0123,0.9868,0.3237,-0.0120,0.0362
NBPd_mean,0.0003,0.0011,0.2981,0.7656,-0.0019,0.0025
HR_mean,-0.0172,0.0040,-4.3307,0.0000,-0.0250,-0.0094
Temp(F)_mean,-0.0280,0.0534,-0.5250,0.5996,-0.1326,0.0766


In [19]:
# Z-scores of the coefficients
z_scores = pd.concat([result.summary2().tables[1]['z'], result.summary2().tables[1]['P>|z|']], axis=1)

# Predictions and evaluations
Y_test_pred_prob = result.predict(X_test_encoded)
Y_test_pred = (Y_test_pred_prob > 0.5).astype(int)

auc_score = roc_auc_score(Y_test, Y_test_pred_prob)
f1 = f1_score(Y_test, Y_test_pred)
conf_matrix = confusion_matrix(Y_test, Y_test_pred)

In [20]:
# Outputs
print(f'AUC: {auc_score}', f'F1 Score: {f1}', f'Confusion Matrix:\n{conf_matrix}', sep='\n')
pd.set_option('display.float_format', '{:.4f}'.format)

AUC: 0.7284349865951742
F1 Score: 0.8215568862275449
Confusion Matrix:
[[ 41 119]
 [ 30 343]]


In [21]:
z_scores.sort_values(by='P>|z|', ascending=True).head(20)

Unnamed: 0,z,P>|z|
age_label,6.9214,0.0
race_label_2.0,5.518,0.0
HR_mean,-4.3307,0.0
race_label_3.0,3.9312,0.0001
Urea_Nitrogren_mean,3.2444,0.0012
Magnesium_mean,2.7175,0.0066
Chloride_mean,-2.2647,0.0235
NBPs_mean,-2.0976,0.0359
PTT_mean,1.6054,0.1084
NBPm_mean,1.5806,0.114


In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, f1_score, matthews_corrcoef
def compute_metrics(y_true, y_pred, y_prob):
    """Compute various classification metrics for binary classification."""
    # y_true: Actual binary labels (0 or 1)
    # y_pred: Predicted binary labels (0 or 1), thresholding applied
    # y_prob: Raw probabilities for the positive class

    auc = roc_auc_score(y_true, y_prob)
    avg_precision = average_precision_score(y_true, y_prob)
    
    # Convert y_pred to binary predictions if it's probabilities
    y_pred_binary = (y_pred > 0.5).astype(int)
    
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred_binary)
    
    # Metrics calculations
    tpr = cm[1, 1] / (cm[1, 1] + cm[1, 0]) if (cm[1, 1] + cm[1, 0]) > 0 else 0  # Sensitivity, Recall
    tnr = cm[0, 0] / (cm[0, 0] + cm[0, 1]) if (cm[0, 0] + cm[0, 1]) > 0 else 0  # Specificity
    ppv = cm[1, 1] / (cm[1, 1] + cm[0, 1]) if (cm[1, 1] + cm[0, 1]) > 0 else 0  # Precision
    npv = cm[0, 0] / (cm[0, 0] + cm[1, 0]) if (cm[0, 0] + cm[1, 0]) > 0 else 0  # Negative Predictive Value
    f1 = f1_score(y_true, y_pred_binary)
    mcc = matthews_corrcoef(y_true, y_pred_binary)

    return {
        'AUC': auc,
        'Average Precision': avg_precision,
        'TPR': tpr,
        'TNR': tnr,
        'PPV': ppv,
        'NPV': npv,
        'F1': f1,
        'MCC': mcc
    }
df_fold_metrics = pd.DataFrame({'test_metrics': compute_metrics(Y_test, Y_test_pred, Y_test_pred_prob)})
test_metrics_df = df_fold_metrics['test_metrics'].apply(pd.Series).add_prefix('test_')

In [26]:
test_metrics_df

Unnamed: 0,test_0
AUC,0.7284
Average Precision,0.8479
TPR,0.9196
TNR,0.2562
PPV,0.7424
NPV,0.5775
F1,0.8216
MCC,0.2372


In [33]:
test_metrics_df.T.reset_index(drop=True)

Unnamed: 0,AUC,Average Precision,TPR,TNR,PPV,NPV,F1,MCC
0,0.7284,0.8479,0.9196,0.2562,0.7424,0.5775,0.8216,0.2372
