## Imports

In [11]:
# autoreload import module on change (does not work with from x import y)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Import functions
import pandas as pd
import numpy as np
from pathlib import Path
from mimic_constants import *
from sklearn.ensemble import HistGradientBoostingClassifier

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [13]:
# Import cleaned master dataframe
df_master_cleaned = get_master_df()

## Removing columns with too many NaNs

In [14]:
df = df_master_cleaned
select_col_groups = []
removed_col_groups = []
og_col_groups = [chart_labels_mean_cols, chart_labels_max_cols, chart_labels_min_cols, lab_labels_mean_cols, lab_labels_max_cols, lab_labels_min_cols]
for col_group in og_col_groups:
    nan_counts = df_master_cleaned[col_group].isna().sum()
    too_many_nans = nan_counts[nan_counts > 1000].index.tolist()
    df = df.drop(too_many_nans, axis=1)
    col_group = list(set(col_group) - set(too_many_nans))
    select_col_groups.append(col_group)
    removed_col_groups.append(too_many_nans)

In [15]:
df[select_col_groups[0]].head(2)

Unnamed: 0,Sodium_mean,NBPm_mean,SpO2_mean,Temp(F)_mean,NBPs_mean,RR_mean,HR_mean,NBPd_mean,PlateletCount_mean
0,132.0,62.3,96.3,98.966667,88.9,20.7,96.5,54.1,
1,,97.545455,98.909091,98.133333,142.454545,20.545455,73.636364,83.272727,


In [16]:
df[select_col_groups[3]].head(2)

Unnamed: 0,Creatinine_mean,Hematocrit_mean,Magnesium_mean,Chloride_mean,Urea_Nitrogren_mean,Calcium_Total_mean,Potassium_mean,Phosphate_mean,Glucose_mean
0,0.433333,33.45,2.133333,100.333333,32.0,9.033333,4.966667,2.333333,114.333333
1,2.071429,26.783333,2.25,106.0,45.285714,9.3,4.314286,4.783333,139.285714


In [17]:
df.dropna().shape

(5029, 68)

In [18]:
df.isna().sum()[(df.isna().sum() > 0) & (df.isna().sum() < 1000)]

HR_mean                  6
NBPs_mean              223
NBPd_mean              225
NBPm_mean              211
RR_mean                 11
SpO2_mean               36
Sodium_mean            722
Temp(F)_mean           329
PlateletCount_mean     865
HR_max                   6
NBPs_max               223
NBPd_max               225
NBPm_max               211
RR_max                  11
Sodium_max             722
HR_min                   6
NBPs_min               223
NBPd_min               225
NBPm_min               211
RR_min                  11
SpO2_min                36
Sodium_min             722
Calcium_Total_mean     360
Chloride_mean          208
Creatinine_mean        205
Glucose_mean           213
Magnesium_mean         260
Phosphate_mean         354
Potassium_mean         207
Urea_Nitrogren_mean    209
Hematocrit_mean        226
Potassium_max          207
Potassium_min          207
dtype: int64

In [19]:
df_master_cleaned[removed_col_groups[0]].tail(2)

Unnamed: 0,ABPm_mean,ABPd_mean,ABPs_mean,FiO2_mean,PH_mean,PCO2_mean,SaO2_mean,Albumin_mean,Cholesterol_mean,PO2_mean
22980,95.810811,82.805556,127.472222,42.0,,34.25,,3.9,,167.75
22981,,,,39.9,,40.0,,,166.0,91.5


In [20]:
df_master_cleaned[removed_col_groups[3]].tail(2)

Unnamed: 0,Tidal_Volume_mean,Alkaline_Phosphatase_mean,Lactate_mean,Bilirubin,Troponin-T_mean,Hemoglobin_mean,ALT_mean,PTT_mean,WBC_mean,Fibrinogen
22980,,50.0,2.266667,0.5,,,37.666667,26.14,58.0,
22981,400.0,93.75,,1.466667,1.256667,,59.0,32.7,136.0,


Heart Score, PERC Score, Canadian C-Spine/Head Injuries: Regressions used in clinic to rule out certain diseases

WELLS Criteria, Ottawa Ankle, PERC: useful to know whether or not I need to get imaging 

Lactate - Pneumonia
Tidal - Lung pathologies
Troponin-T - Heart Attacks Acute setting (1 week) trop up — can lead to early findings of higher heart attacks chances 
WBC - infections

## Prep data for Cardiomegaly

In [83]:
label = 'Cardiomegaly'
df = df_master_cleaned[df_master_cleaned[label].isin([0, 1])]  # pick only positive and negative samples from label
f'Number of Total Samples: {len(df)}'

'Number of Total Samples: 11423'

In [84]:
study_year = np.floor(df['StudyDate'] / 10000)
delta_years = study_year - df['anchor_year']
df['age'] = df['anchor_age'] + delta_years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age'] = df['anchor_age'] + delta_years


In [23]:
# TODO FIX: introduces NaNs in age. One-hot encode the 'race_label' column
#from sklearn.preprocessing import OneHotEncoder
#encoder = OneHotEncoder(drop='first')
#race_encoded = encoder.fit_transform(df[['race_label']]).toarray()
#race_encoded_df = pd.DataFrame(race_encoded, columns=encoder.get_feature_names_out(['race_label']))
#df = pd.concat([df, race_encoded_df], axis=1)
#df = df.drop(columns=['race_label'])

In [85]:
df['age_label'] = df['age'] / 100

df['race_label'] = df['ethnicity']
df.loc[df['race_label'] == 'White', 'race_label'] = 0
df.loc[df['race_label'] == 'Asian', 'race_label'] = 1
df.loc[df['race_label'] == 'Black', 'race_label'] = 2
df.loc[df['race_label'] == 'Hispanic/Latino', 'race_label'] = 3
df.loc[df['race_label'] == 'Other', 'race_label'] = 4

df['sex_label'] = df['gender']
df.loc[df['sex_label'] == 'M', 'sex_label'] = 0
df.loc[df['sex_label'] == 'F', 'sex_label'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age_label'] = df['age'] / 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['race_label'] = df['ethnicity']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex_label'] = df['gender']


In [86]:
demographic_cols = ['age_label', 'race_label', 'sex_label']

In [91]:
X = df[['split'] + demographic_cols + chart_labels_mean_cols + lab_labels_mean_cols]
Y = df[['split'] + [label]]
X_train = X[X['split'] == 'train'].drop(['split'], axis=1)
Y_train = Y[Y['split'] == 'train'].drop(['split'], axis=1)[label].astype(float)
X_test = X[X['split'] == 'test'].drop(['split'], axis=1)
Y_test = Y[Y['split'] == 'test'].drop(['split'], axis=1)[label].astype(float)

In [92]:
# 1. Adjust Train-Test Split to 90/10
# Calculate the number of samples to move from train to test
additional_test_samples = 916

# Sampling additional samples from X_train and Y_train
X_train_to_test, X_train = train_test_split(X_train, test_size=(11110 - additional_test_samples) / 11110, random_state=42, stratify=Y_train)
Y_train_to_test, Y_train = train_test_split(Y_train, test_size=(11110 - additional_test_samples) / 11110, random_state=42, stratify=Y_train)

# Concatenate the sampled data to the test sets
X_test = pd.concat([X_test, X_train_to_test], axis=0)
Y_test = pd.concat([Y_test, Y_train_to_test], axis=0)

In [93]:
# 2. Mean Imputation for NaNs in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# 3. One-Hot Encoding for 'race_label'
X_train_encoded = pd.get_dummies(X_train_imputed, columns=['race_label'], drop_first=True, dtype=float)
X_test_encoded = pd.get_dummies(X_test_imputed, columns=['race_label'], drop_first=True, dtype=float)

# Ensure that the train and test sets have the same columns after encoding
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

X_train_encoded = sm.add_constant(X_train_encoded)  # Add intercept term
X_test_encoded = sm.add_constant(X_test_encoded)

X_train_encoded = X_train_encoded.reset_index(drop=True)
X_test_encoded = X_test_encoded.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

## Logistic Regression

In [94]:
# 4. Logistic Regression using Statsmodels
logit_model = sm.Logit(Y_train, X_train_encoded)
result = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.616624
         Iterations 12


In [97]:
result.summary2()

0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,Cardiomegaly,Pseudo R-squared:,0.095
Date:,2024-09-01 21:03,AIC:,12661.7308
No. Observations:,10194,BIC:,12987.0608
Df Model:,44,Log-Likelihood:,-6285.9
Df Residuals:,10149,LL-Null:,-6946.7
Converged:,1.0000,LLR p-value:,3.3170e-248
No. Iterations:,12.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-10.4816,3.7532,-2.7927,0.0052,-17.8377,-3.1254
age_label,2.1295,0.1450,14.6871,0.0000,1.8453,2.4137
sex_label,0.1206,0.0454,2.6587,0.0078,0.0317,0.2095
HR_mean,-0.0018,0.0016,-1.1323,0.2575,-0.0050,0.0013
SpO2_mean,-0.0014,0.0023,-0.6367,0.5243,-0.0058,0.0030
Temp(F)_mean,0.0009,0.0075,0.1150,0.9084,-0.0138,0.0155
RR_mean,0.0714,0.0068,10.4790,0.0000,0.0580,0.0847
ABPm_mean,0.0016,0.0026,0.6175,0.5369,-0.0034,0.0066
ABPd_mean,0.0007,0.0023,0.3150,0.7528,-0.0037,0.0051


In [98]:
# Z-scores of the coefficients
z_scores = result.summary2().tables[1]['z']

# Predictions and evaluations
Y_test_pred_prob = result.predict(X_test_encoded)
Y_test_pred = (Y_test_pred_prob > 0.5).astype(int)

auc_score = roc_auc_score(Y_test, Y_test_pred_prob)
f1 = f1_score(Y_test, Y_test_pred)
conf_matrix = confusion_matrix(Y_test, Y_test_pred)

In [104]:
# Outputs
print(f'AUC: {auc_score}', f'F1 Score: {f1}', f'Confusion Matrix:\n{conf_matrix}', sep='\n')
z_scores.sort_values(ascending=False).head(20)

AUC: 0.7257860447549805
F1 Score: 0.5817782656421515
Confusion Matrix:
[[487 136]
 [245 265]]


age_label              14.687090
RR_mean                10.479048
Urea_Nitrogren_mean     6.551532
PTT_mean                4.690020
Magnesium_mean          4.077638
PCO2_mean               4.032119
Phosphate_mean          3.452199
FiO2_mean               3.119107
PH_mean                 2.859992
sex_label               2.658730
race_label_2.0          2.420022
ABPs_mean               1.759333
Potassium_mean          1.303123
Calcium_Total_mean      1.217547
Tidal_Volume_mean       1.023537
Albumin_mean            0.971907
race_label_4.0          0.924165
WBC_mean                0.670425
ABPm_mean               0.617488
NBPd_mean               0.570965
Name: z, dtype: float64