## Imports

In [25]:
# Automatically re-import modules whenever their source files change.
# Caveat (from the original note): this does not work with names pulled in via `from x import y`.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Import functions
import pandas as pd
import numpy as np
from pathlib import Path
from mimic_constants import *
from sklearn.ensemble import HistGradientBoostingClassifier

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [28]:
# Import cleaned master dataframe.
# NOTE(review): get_master_df is not defined in this notebook; presumably it is provided by the
# `from mimic_constants import *` star import — confirm.
df_master_cleaned = get_master_df()

## Removing columns with too many NaNs

In [29]:
# Drop every measurement column that has too many missing values, one column group at a time.
#
# Results:
#   df                 -- df_master_cleaned without the dropped columns
#   select_col_groups  -- per group, the retained column names (original order preserved)
#   removed_col_groups -- per group, the dropped column names
MAX_NANS_PER_COLUMN = 1000  # a column with more NaNs than this is dropped

og_col_groups = [chart_labels_mean_cols, chart_labels_max_cols, chart_labels_min_cols, lab_labels_mean_cols, lab_labels_max_cols, lab_labels_min_cols]
select_col_groups = []
removed_col_groups = []
df = df_master_cleaned
for col_group in og_col_groups:
    # NaN counts are taken from the untouched master frame, so the order in which
    # groups are processed does not matter.
    nan_counts = df_master_cleaned[col_group].isna().sum()
    too_many_nans = nan_counts[nan_counts > MAX_NANS_PER_COLUMN].index.tolist()
    df = df.drop(columns=too_many_nans)  # returns a new frame; the master frame is not modified

    # Keep the retained columns in their original order. The previous set difference
    # produced an arbitrary order that could change from one kernel session to the next,
    # which reshuffled the feature columns and every table displayed downstream.
    dropped = set(too_many_nans)
    kept_cols = [col for col in dict.fromkeys(col_group) if col not in dropped]

    select_col_groups.append(kept_cols)
    removed_col_groups.append(too_many_nans)

In [30]:
# Preview the retained columns of group 0 (chart_labels_mean_cols in og_col_groups).
df[select_col_groups[0]].head(2)

Unnamed: 0,NBPm_mean,PlateletCount_mean,NBPd_mean,Temp(F)_mean,RR_mean,HR_mean,Sodium_mean,NBPs_mean,SpO2_mean
0,62.3,,54.1,98.9667,20.7,96.5,132.0,88.9,96.3
1,97.5455,,83.2727,98.1333,20.5455,73.6364,,142.4545,98.9091


In [31]:
# Preview the retained columns of group 3 (lab_labels_mean_cols in og_col_groups).
df[select_col_groups[3]].head(2)

Unnamed: 0,Chloride_mean,Glucose_mean,Magnesium_mean,Hematocrit_mean,Calcium_Total_mean,Phosphate_mean,Urea_Nitrogren_mean,Creatinine_mean,Potassium_mean
0,100.3333,114.3333,2.1333,33.45,9.0333,2.3333,32.0,0.4333,4.9667
1,106.0,139.2857,2.25,26.7833,9.3,4.7833,45.2857,2.0714,4.3143


In [32]:
# Shape the frame would have if every row containing at least one NaN were dropped
# (display only; df itself is not modified).
df.dropna().shape

(5029, 68)

In [33]:
# Per-column NaN counts for the columns that still have some, but fewer than 1000, missing values.
# The counts are computed once instead of three times inside the same expression.
nan_per_column = df.isna().sum()
nan_per_column[(nan_per_column > 0) & (nan_per_column < 1000)]

HR_mean                  6
NBPs_mean              223
NBPd_mean              225
NBPm_mean              211
RR_mean                 11
SpO2_mean               36
Sodium_mean            722
Temp(F)_mean           329
PlateletCount_mean     865
HR_max                   6
NBPs_max               223
NBPd_max               225
NBPm_max               211
RR_max                  11
Sodium_max             722
HR_min                   6
NBPs_min               223
NBPd_min               225
NBPm_min               211
RR_min                  11
SpO2_min                36
Sodium_min             722
Calcium_Total_mean     360
Chloride_mean          208
Creatinine_mean        205
Glucose_mean           213
Magnesium_mean         260
Phosphate_mean         354
Potassium_mean         207
Urea_Nitrogren_mean    209
Hematocrit_mean        226
Potassium_max          207
Potassium_min          207
dtype: int64

In [34]:
# Last rows of the group-0 (chart_labels_mean_cols) columns that were dropped for having too many NaNs.
df_master_cleaned[removed_col_groups[0]].tail(2)

Unnamed: 0,ABPm_mean,ABPd_mean,ABPs_mean,FiO2_mean,PH_mean,PCO2_mean,SaO2_mean,Albumin_mean,Cholesterol_mean,PO2_mean
22980,95.8108,82.8056,127.4722,42.0,,34.25,,3.9,,167.75
22981,,,,39.9,,40.0,,,166.0,91.5


In [35]:
# Last rows of the group-3 (lab_labels_mean_cols) columns that were dropped for having too many NaNs.
df_master_cleaned[removed_col_groups[3]].tail(2)

Unnamed: 0,Tidal_Volume_mean,Alkaline_Phosphatase_mean,Lactate_mean,Bilirubin,Troponin-T_mean,Hemoglobin_mean,ALT_mean,PTT_mean,WBC_mean,Fibrinogen
22980,,50.0,2.2667,0.5,,,37.6667,26.14,58.0,
22981,400.0,93.75,,1.4667,1.2567,,59.0,32.7,136.0,


Heart Score, PERC Score, Canadian C-Spine/Head Injuries: Regressions used in clinic to rule out certain diseases

WELLS Criteria, Ottawa Ankle, PERC: useful to know whether or not I need to get imaging 

- Lactate: pneumonia
- Tidal volume: lung pathologies
- Troponin-T: heart attacks — in the acute setting (about 1 week) troponin is elevated, which can give an early indication of a higher chance of a heart attack
- WBC: infections

## Prep data for Cardiomegaly

In [36]:
label = 'Cardiomegaly'
# Keep only rows with a definite negative (0) or positive (1) value for the label.
# .copy() makes the result an independent frame: the following cells add columns to `df`,
# and assigning into a boolean-filtered slice would otherwise raise SettingWithCopyWarning
# and leave it ambiguous whether the write reaches the intended object.
df = df[df[label].isin([0, 1])].copy()
f'Number of Total Samples: {len(df)}'

'Number of Total Samples: 11423'

In [37]:
# Derive the patient's age at the time of the study.
# NOTE(review): the division by 10000 assumes StudyDate is a numeric YYYYMMDD value — confirm.
study_year = np.floor(df['StudyDate'] / 10000)
# Years elapsed between the anchor year and the study year.
delta_years = study_year - df['anchor_year']
df['age'] = df['anchor_age'] + delta_years
# Age divided by 100, used as the model feature.
df['age_label'] = df['age'] / 100

In [38]:
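# TODO FIX: one-hot encode the 'race_label' column. The draft below introduces NaNs (e.g. in age):
# pd.concat(axis=1) aligns rows on the index, and race_encoded_df gets a fresh RangeIndex while df
# keeps its filtered, non-contiguous index. Building race_encoded_df with index=df.index avoids this.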
#from sklearn.preprocessing import OneHotEncoder
#encoder = OneHotEncoder(drop='first')
#race_encoded = encoder.fit_transform(df[['race_label']]).toarray()
#race_encoded_df = pd.DataFrame(race_encoded, columns=encoder.get_feature_names_out(['race_label']))
#df = pd.concat([df, race_encoded_df], axis=1)
#df = df.drop(columns=['race_label'])

In [39]:
# Age divided by 100, used as the model feature.
df['age_label'] = df['age'] / 100

# Integer codes for the categorical demographics. Values that do not appear in a
# mapping are left exactly as they are in the source column.
race_codes = {'White': 0, 'Asian': 1, 'Black': 2, 'Hispanic/Latino': 3, 'Other': 4}
sex_codes = {'M': 0, 'F': 1}

df['race_label'] = df['ethnicity']
for race_name, race_code in race_codes.items():
    df.loc[df['race_label'] == race_name, 'race_label'] = race_code

df['sex_label'] = df['gender']
for sex_name, sex_code in sex_codes.items():
    df.loc[df['sex_label'] == sex_name, 'sex_label'] = sex_code

In [40]:
# Demographic feature columns that are included in the design matrix alongside the measurements.
demographic_cols = ['age_label', 'race_label', 'sex_label']

In [42]:
# Assemble features (demographics + retained group-0 and group-3 columns) and target.
# 'split' is carried along only so rows can be partitioned, then discarded.
X = df[['split', *demographic_cols, *select_col_groups[0], *select_col_groups[3]]]
Y = df[['split', label]]

X_train = X.loc[X['split'] == 'train'].drop(columns='split')
X_test = X.loc[X['split'] == 'test'].drop(columns='split')

# Targets as float Series named after the label.
Y_train = Y.loc[Y['split'] == 'train', label].astype(float)
Y_test = Y.loc[Y['split'] == 'test', label].astype(float)

In [43]:
# 1. Adjust Train-Test Split to 90/10
# Number of samples to move from the train split into the test split.
additional_test_samples = 916

# Split X and Y in ONE call so the moved rows stay aligned by construction, instead of
# relying on two separate calls sharing the same random_state.
# The move count is passed as an integer `train_size`: the previous float ratio
# (11110 - 916) / 11110 was only correct when the train split held exactly 11110 rows
# and depended on floating-point rounding inside train_test_split.
# NOTE: this cell overwrites X_train / Y_train / X_test / Y_test, so re-running it moves
# another batch of samples; re-run the cell that builds the splits first.
X_train_to_test, X_train, Y_train_to_test, Y_train = train_test_split(
    X_train,
    Y_train,
    train_size=additional_test_samples,
    random_state=42,
    stratify=Y_train,
)

# Concatenate the sampled data to the test sets
X_test = pd.concat([X_test, X_train_to_test], axis=0)
Y_test = pd.concat([Y_test, Y_train_to_test], axis=0)

In [44]:
# 2. Mean Imputation for NaNs in X_train
# The imputer is fitted on the training rows only and then applied to the test rows.
# NOTE(review): the mean is also applied to the coded categorical columns
# ('race_label', 'sex_label'); if those can contain NaNs this yields fractional
# category codes — confirm they are complete.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# 3. One-Hot Encoding for 'race_label'
# The category list is fixed from the TRAINING data and reused for the test data.
# Encoding each frame independently with drop_first=True drops whichever category
# happens to sort first in that frame, so a test set lacking the baseline category
# would silently drop a different one and end up with wrongly zero-filled dummies.
race_categories = sorted(X_train_imputed['race_label'].unique())


def _one_hot_race(features):
    """Return `features` with 'race_label' replaced by dummies over `race_categories`.

    The first training category is the dropped baseline. Values not seen during
    training become all-zero dummy rows.
    """
    typed = features.assign(
        race_label=pd.Categorical(features['race_label'], categories=race_categories)
    )
    return pd.get_dummies(typed, columns=['race_label'], drop_first=True, dtype=float)


X_train_encoded = _one_hot_race(X_train_imputed)
X_test_encoded = _one_hot_race(X_test_imputed)

# Ensure that the train and test sets have the same columns after encoding
assert list(X_test_encoded.columns) == list(X_train_encoded.columns), 'train/test feature columns differ'

X_train_encoded = sm.add_constant(X_train_encoded)  # Add intercept term
# has_constant='add' forces the intercept column on the test frame even if one of its
# features happens to be constant (the default 'skip' would then omit it); selecting the
# training columns afterwards guarantees an identical design-matrix layout.
X_test_encoded = sm.add_constant(X_test_encoded, has_constant='add')[X_train_encoded.columns]

# Put features and targets on the same 0..n-1 index so they align row by row.
X_train_encoded = X_train_encoded.reset_index(drop=True)
X_test_encoded = X_test_encoded.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

## Logistic Regression

In [45]:
# 4. Logistic Regression using Statsmodels
# Fit the label on the encoded training design matrix (the intercept is the 'const' column
# added by sm.add_constant); fit() prints the optimizer's convergence report.
logit_model = sm.Logit(Y_train, X_train_encoded)
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.622207
         Iterations 11


In [46]:
# Display the fit statistics and the per-coefficient table of the fitted model.
result.summary2()

0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,Cardiomegaly,Pseudo R-squared:,0.087
Date:,2024-09-02 15:28,AIC:,12735.5506
No. Observations:,10194,BIC:,12916.2894
Df Model:,24,Log-Likelihood:,-6342.8
Df Residuals:,10169,LL-Null:,-6946.7
Converged:,1.0000,LLR p-value:,5.0847e-240
No. Iterations:,11.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,0.2801,1.0344,0.2708,0.7866,-1.7474,2.3075
age_label,2.1478,0.1400,15.3406,0.0000,1.8734,2.4222
sex_label,0.0980,0.0444,2.2074,0.0273,0.0110,0.1851
NBPm_mean,0.0000,0.0002,0.0232,0.9815,-0.0003,0.0003
PlateletCount_mean,0.0000,0.0000,0.3171,0.7512,-0.0000,0.0000
NBPd_mean,0.0003,0.0004,0.7186,0.4724,-0.0005,0.0011
Temp(F)_mean,0.0014,0.0074,0.1906,0.8488,-0.0131,0.0160
RR_mean,0.0744,0.0067,11.1641,0.0000,0.0613,0.0874
HR_mean,-0.0019,0.0016,-1.1874,0.2351,-0.0050,0.0012


In [47]:
# z statistics and p-values of the coefficients, taken from the summary's coefficient table
coefficient_table = result.summary2().tables[1]
z_scores = coefficient_table[['z', 'P>|z|']].copy()

# Predicted probabilities on the test set, and hard labels using a 0.5 cut-off
Y_test_pred_prob = result.predict(X_test_encoded)
Y_test_pred = (Y_test_pred_prob > 0.5).astype(int)

# AUC is computed from the probabilities; F1 and the confusion matrix from the hard labels
auc_score = roc_auc_score(Y_test, Y_test_pred_prob)
f1 = f1_score(Y_test, Y_test_pred)
conf_matrix = confusion_matrix(Y_test, Y_test_pred)

In [48]:
# Report the test-set metrics, one per line
print(f'AUC: {auc_score}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Show floats with four decimals from here on (this is a global pandas display setting)
pd.set_option('display.float_format', '{:.4f}'.format)

# Coefficients with the largest positive z statistics first
z_scores.sort_values(by='z', ascending=False).head(20)

AUC: 0.7190507663739653
F1 Score: 0.5647840531561462
Confusion Matrix:
[[485 138]
 [255 255]]


Unnamed: 0,z,P>|z|
age_label,15.3406,0.0
RR_mean,11.1641,0.0
Urea_Nitrogren_mean,6.8814,0.0
Magnesium_mean,5.1887,0.0
Phosphate_mean,3.3469,0.0008
race_label_2.0,2.3746,0.0176
sex_label,2.2074,0.0273
Calcium_Total_mean,1.2097,0.2264
Potassium_mean,1.1029,0.2701
race_label_4.0,0.7979,0.4249


In [49]:
# Coefficients ordered by p-value, smallest first (regardless of the sign of z).
z_scores.sort_values(by='P>|z|', ascending=True).head(20)

Unnamed: 0,z,P>|z|
age_label,15.3406,0.0
Chloride_mean,-11.3091,0.0
RR_mean,11.1641,0.0
Urea_Nitrogren_mean,6.8814,0.0
Magnesium_mean,5.1887,0.0
Glucose_mean,-3.7193,0.0002
Phosphate_mean,3.3469,0.0008
Hematocrit_mean,-3.2741,0.0011
race_label_2.0,2.3746,0.0176
sex_label,2.2074,0.0273
