In [2]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import shap

In [3]:
data = pd.read_csv('mimic_commentary.csv').drop(columns=['Unnamed: 0'])

X = data.drop(columns=['los', 'feature_pc']).astype(float)
y = data['los'].astype(int)

data

Unnamed: 0,feature_pc,los,gender,age_senior,ELECTIVE,EMERGENCY,URGENT,amiodarone_y,ampicillinsulbactam_y,atropine_y,...,chloride,PEEP set,tidal volume,troponin,anion gap,Neutrophils,Blood pressure systolic,Blood pressure diatolic,Pulseoxymetry,O2 Fraction
0,0.730769,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,107.059701,4.981567,392.086735,,,,,,,
1,0.692308,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,107.500000,4.250000,609.800000,,,,,,,
2,0.461538,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,105.000000,,,,,,,,,
3,0.653846,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,89.476190,,,,,,,,,
4,0.615385,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,103.500000,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,0.846154,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,105.333333,5.000000,424.222222,0.01,10.000000,,110.660714,64.107143,97.266667,48.333333
108,0.730769,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,117.000000,,,0.16,9.000000,,139.529412,63.176471,96.416667,
109,0.730769,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,104.125000,,,0.01,11.714286,,136.036232,74.239130,96.619048,
110,0.807692,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,107.857143,4.000000,575.000000,,10.714286,,99.041667,53.441667,96.293233,40.833333


In [4]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

aucs = []
accs = []
for fold_nr, (train_ix, test_ix) in enumerate(skf.split(X, y)):
    X_train = X.iloc[train_ix, :]
    X_test = X.iloc[test_ix, :]
    
    y_train = y.iloc[train_ix]
    y_test = y.iloc[test_ix]
    
    X_test = X_test.fillna(X_train.median(axis=0))
    X_train = X_train.fillna(X_train.median(axis=0))
    
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)
    
    print(X_train.shape, X_test.shape)
    
    rf = GridSearchCV(
        RandomForestClassifier(random_state=42),
        {'max_depth': [5, 10, None], 'n_estimators': [100, 250, 500, 1000]}
    )
    rf.fit(X_train, y_train)
    print(rf.best_params_)
    
    auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
    aucs.append(auc)
    print(f'Fold #{fold_nr + 1} Test AUC = {np.around(auc, 4)}')
    acc = accuracy_score(y_test, rf.predict(X_test))
    accs.append(acc)
    print(f'Fold #{fold_nr + 1} Test ACC = {np.around(acc, 4)}')
    
#     print(f'Fold #{fold_nr + 1} Train AUC = {np.around(roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1]), 4)}')

(182, 74) (12, 74)
{'max_depth': 10, 'n_estimators': 100}
Fold #1 Test AUC = 0.2273
Fold #1 Test ACC = 0.9167
(182, 74) (12, 74)
{'max_depth': 10, 'n_estimators': 250}
Fold #2 Test AUC = 0.9091
Fold #2 Test ACC = 0.9167
(184, 74) (11, 74)
{'max_depth': 10, 'n_estimators': 100}
Fold #3 Test AUC = 0.9
Fold #3 Test ACC = 0.9091
(184, 74) (11, 74)
{'max_depth': 10, 'n_estimators': 100}
Fold #4 Test AUC = 1.0
Fold #4 Test ACC = 0.9091
(184, 74) (11, 74)
{'max_depth': 10, 'n_estimators': 500}
Fold #5 Test AUC = 1.0
Fold #5 Test ACC = 1.0
(184, 74) (11, 74)
{'max_depth': 10, 'n_estimators': 1000}
Fold #6 Test AUC = 1.0
Fold #6 Test ACC = 0.9091
(184, 74) (11, 74)
{'max_depth': 10, 'n_estimators': 500}
Fold #7 Test AUC = 0.9
Fold #7 Test ACC = 0.9091
(184, 74) (11, 74)
{'max_depth': 10, 'n_estimators': 100}
Fold #8 Test AUC = 1.0
Fold #8 Test ACC = 0.9091
(184, 74) (11, 74)
{'max_depth': 10, 'n_estimators': 100}
Fold #9 Test AUC = 1.0
Fold #9 Test ACC = 0.9091
(184, 74) (11, 74)
{'max_depth': 

In [5]:
np.mean(aucs), np.std(aucs)

(0.8936363636363636, 0.22632712374381636)

In [6]:
np.mean(accs), np.std(accs)

(0.9196969696969696, 0.026933922476713917)

In [7]:
from collections import Counter
Counter(y), 102/112

(Counter({1: 10, 0: 102}), 0.9107142857142857)

In [8]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

aucs = []
for fold_nr, (train_ix, test_ix) in enumerate(skf.split(X, y)):
    X_train = X.iloc[train_ix, :]
    X_test = X.iloc[test_ix, :]
    
    y_train = y.iloc[train_ix]
    y_test = y.iloc[test_ix]
    
    X_test = X_test.fillna(X_train.median(axis=0))
    X_train = X_train.fillna(X_train.median(axis=0))
    
    rf = GridSearchCV(
        RandomForestClassifier(random_state=42),
        {'max_depth': [5, 10, None], 'n_estimators': [100, 250, 500, 1000]}
    )
    rf.fit(X_train, y_train)
    print(rf.best_params_)
    
    auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
    aucs.append(auc)
    
    print(f'Fold #{fold_nr + 1} Test AUC = {np.around(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]), 4)}')
#     print(f'Fold #{fold_nr + 1} Train AUC = {np.around(roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1]), 4)}')

{'max_depth': 5, 'n_estimators': 100}
Fold #1 Test AUC = 0.4545
{'max_depth': 5, 'n_estimators': 100}
Fold #2 Test AUC = 0.8182
{'max_depth': 5, 'n_estimators': 100}
Fold #3 Test AUC = 1.0
{'max_depth': 5, 'n_estimators': 100}
Fold #4 Test AUC = 0.8
{'max_depth': 5, 'n_estimators': 100}
Fold #5 Test AUC = 1.0
{'max_depth': 5, 'n_estimators': 100}
Fold #6 Test AUC = 1.0
{'max_depth': 5, 'n_estimators': 100}
Fold #7 Test AUC = 0.8
{'max_depth': 5, 'n_estimators': 100}
Fold #8 Test AUC = 0.8
{'max_depth': 5, 'n_estimators': 100}
Fold #9 Test AUC = 1.0
{'max_depth': 5, 'n_estimators': 100}
Fold #10 Test AUC = 1.0


In [9]:
np.mean(aucs), np.std(aucs)

(0.8672727272727272, 0.16565422516208148)

In [10]:
from sklearn.model_selection import KFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

X = X.fillna(X.median(axis=0))
    
X, y = SMOTE().fit_resample(X, y)

aucs = []
for fold_nr, (train_ix, test_ix) in enumerate(skf.split(X, y)):
    X_train = X.iloc[train_ix, :]#[selected]
    X_test = X.iloc[test_ix, :]#[selected]
    
    y_train = y.iloc[train_ix]
    y_test = y.iloc[test_ix]
    
    rf = RandomForestClassifier(random_state=42, n_estimators=500, max_depth=None)
    rf.fit(X_train, y_train)
    
    aucs.append(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))
    print(f'Fold #{fold_nr + 1} Test AUC = {np.around(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]), 4)}')
#     print(f'Fold #{fold_nr + 1} Train AUC = {np.around(roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1]), 4)}')

Fold #1 Test AUC = 1.0
Fold #2 Test AUC = 1.0
Fold #3 Test AUC = 1.0
Fold #4 Test AUC = 1.0
Fold #5 Test AUC = 0.98
Fold #6 Test AUC = 1.0
Fold #7 Test AUC = 1.0
Fold #8 Test AUC = 1.0
Fold #9 Test AUC = 1.0
Fold #10 Test AUC = 1.0


In [11]:
np.mean(aucs), np.std(aucs)

(0.998, 0.0060000000000000045)