In [1]:
import pandas as pd
import pickle
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

In [6]:
# Unpickling results files
# with open('model_results_immune_90.pkl', 'rb') as f:
#     model_results_immune_90 = pickle.load(f)

# with open('model_results_immune_30.pkl', 'rb') as f:
#     model_results_immune_30 = pickle.load(f)

with open('model_results_all_90.pkl', 'rb') as f:
    model_results_all_90 = pickle.load(f)

with open('model_results_all_30.pkl', 'rb') as f:
    model_results_all_30 = pickle.load(f)

with open ('model_results_labs_90.pkl', 'rb') as f:
    model_results_labs_90 = pickle.load(f)

with open ('model_results_labs_30.pkl', 'rb') as f:
    model_results_labs_30 = pickle.load(f)

**90-day mortality; immune features**

In [7]:
# Printing results
for model in model_results_immune_90.keys():
    print(model)
    print(model_results_immune_90[model]['metrics'])

XGB
{'accuracy': 0.5728643216080402, 'f1': 0.41379310344827586, 'precision': 0.273972602739726, 'recall': 0.8450704225352113, 'rel_impr_accuracy': -0.30275229357798156, 'rel_impr_f1': 413793103448.2759}
KNN
{'accuracy': 0.7437185929648241, 'f1': 0.23880597014925373, 'precision': 0.25396825396825395, 'recall': 0.22535211267605634, 'rel_impr_accuracy': -0.09480122324159015, 'rel_impr_f1': 238805970149.25372}
SVC Linear
{'accuracy': 0.7864321608040201, 'f1': 0.41379310344827586, 'precision': 0.40540540540540543, 'recall': 0.4225352112676056, 'rel_impr_accuracy': -0.04281345565749227, 'rel_impr_f1': 413793103448.2759}
SVC RBF
{'accuracy': 0.7160804020100503, 'f1': 0.45410628019323673, 'precision': 0.34558823529411764, 'recall': 0.6619718309859155, 'rel_impr_accuracy': -0.12844036697247696, 'rel_impr_f1': 454106280193.23676}
SVC Poly
{'accuracy': 0.6633165829145728, 'f1': 0.45528455284552843, 'precision': 0.32, 'recall': 0.7887323943661971, 'rel_impr_accuracy': -0.19266055045871555, 'rel_im

In [10]:
# SVC Polynomial had the best f1 Score
my_model = model_results_immune_90['SVC Poly']
best_model= my_model['best_model']
X_test = my_model['X_test_raw']
y_test = my_model['y_test']

# Compute permutation importance
results = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)
print(len(results.importances_mean))

# Combine feature names and importance
importances = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': results.importances_mean,
    'importance_std': results.importances_std
}).sort_values(by='importance_mean', ascending=False)

print(importances)

4
                     feature  importance_mean  importance_std
2  Absolute Lymphocyte Count         0.112563        0.021841
1    Absolute Monocyte Count         0.038693        0.022951
3  Absolute Neutrophil Count         0.035176        0.009059
0                       SIRI        -0.002261        0.008291


In [None]:
# XGB had the best recall
my_model = model_results_immune_90['XGB']
best_model= my_model['best_model']

xgb_clf = best_model.named_steps['model']
xgb_importances = xgb_clf.feature_importances_

transformer = best_model.named_steps['preprocessor']
feature_names = transformer.get_feature_names_out()

import pandas as pd

feat_imp_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': xgb_importances
    })
    .sort_values(by='importance', ascending=False)
)

print(feat_imp_df.head())

                          feature  importance
2  num__Absolute Lymphocyte Count    0.318746
0                       num__SIRI    0.284477
3  num__Absolute Neutrophil Count    0.217734
1    num__Absolute Monocyte Count    0.179043


**90-day mortality; all features**

In [7]:
# Printing results
for model in model_results_all_90.keys():
    print(model)
    print(model_results_all_90[model]['metrics'])

XGB
{'accuracy': 0.7866323907455013, 'f1': 0.4713375796178344, 'precision': 0.37, 'recall': 0.6491228070175439, 'rel_impr_accuracy': -0.07831325301204818, 'rel_impr_f1': 471337579617.8344}
KNN
{'accuracy': 0.8354755784061697, 'f1': 0.3191489361702128, 'precision': 0.40540540540540543, 'recall': 0.2631578947368421, 'rel_impr_accuracy': -0.02108433734939754, 'rel_impr_f1': 319148936170.21277}
SVC Linear
{'accuracy': 0.7352185089974294, 'f1': 0.4607329842931937, 'precision': 0.3283582089552239, 'recall': 0.7719298245614035, 'rel_impr_accuracy': -0.13855421686746983, 'rel_impr_f1': 460732984293.1937}
SVC RBF
{'accuracy': 0.7789203084832905, 'f1': 0.47560975609756095, 'precision': 0.3644859813084112, 'recall': 0.6842105263157895, 'rel_impr_accuracy': -0.0873493975903614, 'rel_impr_f1': 475609756097.561}
SVC Poly
{'accuracy': 0.7249357326478149, 'f1': 0.4623115577889447, 'precision': 0.323943661971831, 'recall': 0.8070175438596491, 'rel_impr_accuracy': -0.15060240963855429, 'rel_impr_f1': 46

In [9]:
# XGB had the best f1 score
my_model = model_results_all_90['XGB']
best_model= my_model['best_model']

xgb_clf = best_model.named_steps['model']
xgb_importances = xgb_clf.feature_importances_

transformer = best_model.named_steps['preprocessor']
feature_names = transformer.get_feature_names_out()

import pandas as pd

feat_imp_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': xgb_importances
    })
    .sort_values(by='importance', ascending=False)
)

print(feat_imp_df.head(20))

                                               feature  importance
367  cat__last_careunit_Cardiac Vascular Intensive ...    0.115260
356  cat__first_careunit_Cardiac Vascular Intensive...    0.095967
136                      cat__admit_provider_id_P40N2M    0.022631
108                      cat__admit_provider_id_P32CZ5    0.015582
7                                       num__Anion Gap    0.015047
1                                             num__los    0.013789
190                      cat__admit_provider_id_P605EW    0.012887
29                        cat__admission_type_EW EMER.    0.012745
3                       num__Absolute Eosinophil Count    0.012212
93                       cat__admit_provider_id_P23NUR    0.011854
117                      cat__admit_provider_id_P35C58    0.011757
143                      cat__admit_provider_id_P42LYZ    0.011726
279                      cat__admit_provider_id_P95BQY    0.011369
297                            cat__insurance_Medicare    0.01

In [11]:
# SVC Poly had the best recall
my_model = model_results_all_90['SVC Poly']
best_model= my_model['best_model']
X_test = my_model['X_test_raw']
y_test = my_model['y_test']

# Compute permutation importance
results = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)
print(len(results.importances_mean))

# Combine feature names and importance
importances = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': results.importances_mean,
    'importance_std': results.importances_std
}).sort_values(by='importance_mean', ascending=False)

print(importances.head(20))

35
                      feature  importance_mean  importance_std
27                    Lactate         0.025450        0.009920
30                        RDW         0.010540        0.009853
5              marital_status         0.005398        0.005070
3                   insurance         0.004627        0.005490
32                       SIRI         0.004113        0.005167
18                Base Excess         0.003599        0.005293
29             Platelet Count         0.002571        0.003042
8                  anchor_age         0.002314        0.004938
25      Immature Granulocytes         0.002057        0.004271
0              admission_type         0.002057        0.001924
23                          I         0.002057        0.003410
19                Bicarbonate         0.001542        0.003847
17                  Anion Gap         0.001028        0.007198
7                      gender         0.000257        0.003138
12    Absolute Basophil Count         0.000000      

**90-day mortality; lab + racial features only**

In [14]:
# Printing results
for model in model_results_labs_90.keys():
    print(model)
    print(model_results_all_30[model]['metrics'])

XGB
{'accuracy': 0.7763496143958869, 'f1': 0.5492227979274611, 'precision': 0.40458015267175573, 'recall': 0.8548387096774194, 'rel_impr_accuracy': -0.07645259938837917, 'rel_impr_f1': 549222797927.4611}
KNN
{'accuracy': 0.8020565552699229, 'f1': 0.29357798165137616, 'precision': 0.3404255319148936, 'recall': 0.25806451612903225, 'rel_impr_accuracy': -0.04587155963302745, 'rel_impr_f1': 293577981651.37616}
SVC Linear
{'accuracy': 0.7634961439588689, 'f1': 0.5208333333333334, 'precision': 0.38461538461538464, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.09174311926605502, 'rel_impr_f1': 520833333333.3334}
SVC RBF
{'accuracy': 0.7866323907455013, 'f1': 0.546448087431694, 'precision': 0.4132231404958678, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.06422018348623851, 'rel_impr_f1': 546448087431.69403}
SVC Poly
{'accuracy': 0.7557840616966581, 'f1': 0.5128205128205128, 'precision': 0.37593984962406013, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.1009174311926604

In [18]:
# SVC RBF had the best f1 score
my_model = model_results_labs_90['SVC RBF']
best_model= my_model['best_model']
X_test = my_model['X_test_raw']
y_test = my_model['y_test']

# Compute permutation importance
results = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)
print(len(results.importances_mean))

# Combine feature names and importance
importances = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': results.importances_mean,
    'importance_std': results.importances_std
}).sort_values(by='importance_mean', ascending=False)

print(importances.head(20))


35
                      feature  importance_mean  importance_std
28                        PTT     3.856041e-03        0.005423
20                 Creatinine     1.542416e-03        0.003847
0              admission_type     0.000000e+00        0.000000
2          admission_location     0.000000e+00        0.000000
3                   insurance     0.000000e+00        0.000000
4                    language     0.000000e+00        0.000000
9              first_careunit     0.000000e+00        0.000000
10              last_careunit     0.000000e+00        0.000000
11                        los     0.000000e+00        0.000000
1           admit_provider_id     0.000000e+00        0.000000
27                    Lactate    -3.330669e-17        0.005268
12    Absolute Basophil Count    -7.712082e-04        0.001646
14  Absolute Lymphocyte Count    -7.712082e-04        0.002008
23                          I    -2.570694e-03        0.003042
34                        pO2    -2.570694e-03      

In [15]:
# XGB had the best recall
my_model = model_results_labs_30['XGB']
best_model= my_model['best_model']

xgb_clf = best_model.named_steps['model']
xgb_importances = xgb_clf.feature_importances_

transformer = best_model.named_steps['preprocessor']
feature_names = transformer.get_feature_names_out()

import pandas as pd

feat_imp_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': xgb_importances
    })
    .sort_values(by='importance', ascending=False)
)

print(feat_imp_df.head(20))


                           feature  importance
6                   num__Anion Gap    0.098623
2   num__Absolute Eosinophil Count    0.093011
9                  num__Creatinine    0.078009
22              num__Urea Nitrogen    0.051491
21                       num__SIRI    0.049507
8                 num__Bicarbonate    0.047929
19                        num__RDW    0.041315
23                        num__pO2    0.037169
14      num__Immature Granulocytes    0.032046
15                          num__L    0.031876
12                          num__I    0.028180
26      cat__marital_status_SINGLE    0.027394
16                    num__Lactate    0.026611
10                          num__H    0.026554
0                  num__anchor_age    0.026495
7                 num__Base Excess    0.023540
17                        num__PTT    0.023463
3   num__Absolute Lymphocyte Count    0.023237
28     cat__marital_status_missing    0.022128
4     num__Absolute Monocyte Count    0.020313


**30-day mortality; immune features**

In [None]:
# TBD once/if model runs

**30-day mortality; all features**

In [12]:
# Printing results
for model in model_results_all_30.keys():
    print(model)
    print(model_results_all_30[model]['metrics'])

XGB
{'accuracy': 0.7763496143958869, 'f1': 0.5492227979274611, 'precision': 0.40458015267175573, 'recall': 0.8548387096774194, 'rel_impr_accuracy': -0.07645259938837917, 'rel_impr_f1': 549222797927.4611}
KNN
{'accuracy': 0.8020565552699229, 'f1': 0.29357798165137616, 'precision': 0.3404255319148936, 'recall': 0.25806451612903225, 'rel_impr_accuracy': -0.04587155963302745, 'rel_impr_f1': 293577981651.37616}
SVC Linear
{'accuracy': 0.7634961439588689, 'f1': 0.5208333333333334, 'precision': 0.38461538461538464, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.09174311926605502, 'rel_impr_f1': 520833333333.3334}
SVC RBF
{'accuracy': 0.7866323907455013, 'f1': 0.546448087431694, 'precision': 0.4132231404958678, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.06422018348623851, 'rel_impr_f1': 546448087431.69403}
SVC Poly
{'accuracy': 0.7557840616966581, 'f1': 0.5128205128205128, 'precision': 0.37593984962406013, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.1009174311926604

In [16]:
# XGB had the best recall and f1 score
my_model = model_results_all_30['XGB']
best_model= my_model['best_model']

xgb_clf = best_model.named_steps['model']
xgb_importances = xgb_clf.feature_importances_

transformer = best_model.named_steps['preprocessor']
feature_names = transformer.get_feature_names_out()

import pandas as pd

feat_imp_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': xgb_importances
    })
    .sort_values(by='importance', ascending=False)
)

print(feat_imp_df.head(20))

                                               feature  importance
360  cat__first_careunit_Cardiac Vascular Intensive...    0.132714
372  cat__last_careunit_Cardiac Vascular Intensive ...    0.116118
7                                       num__Anion Gap    0.052229
3                       num__Absolute Eosinophil Count    0.045678
9                                     num__Bicarbonate    0.036352
10                                     num__Creatinine    0.034134
1                                             num__los    0.026426
15                          num__Immature Granulocytes    0.018899
23                                  num__Urea Nitrogen    0.018295
17                                        num__Lactate    0.017244
20                                            num__RDW    0.017117
22                                           num__SIRI    0.015295
14                                        num__INR(PT)    0.015056
0                                      num__anchor_age    0.01

**30-day mortality; lab + racial features only**

In [19]:
# Printing results
for model in model_results_labs_30.keys():
    print(model)
    print(model_results_all_30[model]['metrics'])

XGB
{'accuracy': 0.7763496143958869, 'f1': 0.5492227979274611, 'precision': 0.40458015267175573, 'recall': 0.8548387096774194, 'rel_impr_accuracy': -0.07645259938837917, 'rel_impr_f1': 549222797927.4611}
KNN
{'accuracy': 0.8020565552699229, 'f1': 0.29357798165137616, 'precision': 0.3404255319148936, 'recall': 0.25806451612903225, 'rel_impr_accuracy': -0.04587155963302745, 'rel_impr_f1': 293577981651.37616}
SVC Linear
{'accuracy': 0.7634961439588689, 'f1': 0.5208333333333334, 'precision': 0.38461538461538464, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.09174311926605502, 'rel_impr_f1': 520833333333.3334}
SVC RBF
{'accuracy': 0.7866323907455013, 'f1': 0.546448087431694, 'precision': 0.4132231404958678, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.06422018348623851, 'rel_impr_f1': 546448087431.69403}
SVC Poly
{'accuracy': 0.7557840616966581, 'f1': 0.5128205128205128, 'precision': 0.37593984962406013, 'recall': 0.8064516129032258, 'rel_impr_accuracy': -0.1009174311926604

In [22]:
# XGB had the best recall and f1 score
my_model = model_results_labs_30['XGB']
best_model= my_model['best_model']

xgb_clf = best_model.named_steps['model']
xgb_importances = xgb_clf.feature_importances_

transformer = best_model.named_steps['preprocessor']
feature_names = transformer.get_feature_names_out()

import pandas as pd

feat_imp_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': xgb_importances
    })
    .sort_values(by='importance', ascending=False)
)

print(feat_imp_df.head(20))

                           feature  importance
6                   num__Anion Gap    0.098623
2   num__Absolute Eosinophil Count    0.093011
9                  num__Creatinine    0.078009
22              num__Urea Nitrogen    0.051491
21                       num__SIRI    0.049507
8                 num__Bicarbonate    0.047929
19                        num__RDW    0.041315
23                        num__pO2    0.037169
14      num__Immature Granulocytes    0.032046
15                          num__L    0.031876
12                          num__I    0.028180
26      cat__marital_status_SINGLE    0.027394
16                    num__Lactate    0.026611
10                          num__H    0.026554
0                  num__anchor_age    0.026495
7                 num__Base Excess    0.023540
17                        num__PTT    0.023463
3   num__Absolute Lymphocyte Count    0.023237
28     cat__marital_status_missing    0.022128
4     num__Absolute Monocyte Count    0.020313
