In [2]:

# basic modules
import os
import time
import random as rn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: apply matplotlib's seaborn "notebook" style sheet first,
# then let seaborn switch the axes style to a dark grid.
plt.style.use('seaborn-v0_8-notebook')
sns.set_style('darkgrid')

# pandas display limits, so wide/long frames and long strings render in full
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 3)

# preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# models
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# metrics & utilities 
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, auc
from sklearn.metrics import average_precision_score, precision_recall_curve, PrecisionRecallDisplay
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils import resample

# warnings
import warnings
warnings.filterwarnings("ignore")



# user defined functions
from utility_functions import process_max_glu_serum, process_A1Cresult, process_medical_specialty, process_race
from utility_functions import process_diag_codes, process_age, process_discharge_disposition_id, process_admission_type_id
from utility_functions import process_admission_source_ID, process_readmitted, process_diabetesMed_and_change, preprocess_df
from utility_functions import get_previous_encounters, aggregate_previous_encounters, get_last_encounter, aggregate_encounters
from utility_functions import stratified_split, get_train_val_scores, get_performance_metrics, get_results_df, plot_performance_metrics
from utility_functions import plot_ROC_curves, plot_PR_curves, plot_var_imp, plot_shap_values, plot_LIME

from imblearn.over_sampling import RandomOverSampler

In [3]:
# Load the pickled, already-transformed feature matrices and their targets.
# NOTE: pickle files can execute arbitrary code when loaded; only read files
# that were produced by this project.
(X_train_transformed,
 X_test_transformed,
 y_train,
 y_test) = map(pd.read_pickle, (
    'picklefiles/X_train_transformed.pkl',
    'picklefiles/X_test_transformed.pkl',
    'picklefiles/y_train.pkl',
    'picklefiles/y_test.pkl',
))

In [4]:
results = list()  # accumulator for per-model performance metrics

### Random Forest model:

In our quest to optimize the Random Forest model, we employed GridSearchCV, a systematic approach to tuning hyperparameters. This method involved experimenting with a multitude of parameter combinations. For clarity and brevity in this demonstration, we have included only a select few of these combinations in the subsequent code snippet.

To navigate effectively towards the optimal set of parameters, we adopted two approaches:

1. Logging the runs to wandb. Wandb is a service that lets users log their ML training runs; it records the hyperparameters and the resulting metrics of each run. The wandb runs are not shared here, to avoid authentication errors.
2. Visualizing ROCAUC for each parameter across variations of the other parameters.


This graphical representation provided us with valuable insights into the trends and patterns of the model's performance. It made the grid search more efficient, because we updated the parameter ranges based on the results of the previous runs and plots.


### Findings:
While optimizing for ROC AUC, we found the following:


1. n_estimators=800 was a good spot; increasing the number of estimators beyond this made the models take much longer to fit, and the gain in ROC AUC was not worth it.
2. We found that entropy as a criterion stood out against gini for our data.
3. A random state has been chosen to make the results reproducible.
4. A max depth of about 18 worked best (the best run recorded in the code cell below used 19); for our data this is not very deep for a random forest model, suggesting that more generalizable models work better.



- The validation ROC AUC ended up being 73.2%.
- The test ROC AUC ended up being 71.6%.

In [None]:

# Best parameters found so far (Val ROCAUC=73.2, Test ROCAUC=71.6):
# {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy',
#  'max_depth': 19, 'max_features': 'sqrt', 'max_leaf_nodes': None,
#  'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2,
#  'min_weight_fraction_leaf': 0.0, 'n_estimators': 800, 'n_jobs': -1,
#  'oob_score': False, 'random_state': 109, 'warm_start': False}

# Hyperparameter grid; the trailing comments list values explored in earlier runs.
params = {'random_state': [109],
          'n_estimators': [800],                #tried 1000, 100, 500
          'criterion': ['entropy'],             #tried Gini
          'max_depth': [17,18,19],              #tried 10, 25, 50
          # An integer min_samples_split must be >= 2. The previous grid also listed 1,
          # which scikit-learn rejects: those candidates failed to fit and only showed
          # up as NaN scores in the results table.
          'min_samples_split': [2,3],
          'min_samples_leaf': [2],              #tried 3,4,1,10
          'min_weight_fraction_leaf': [0.0],
          'max_features': ['sqrt'],             #tried log2, 0.5, 0.3
          'max_leaf_nodes': [None],
          'min_impurity_decrease': [0.0],       #tried 0.5, 0.3
          'bootstrap': [True],
          'oob_score': [False],
          # NOTE(review): the forest and the grid search both request all cores
          # (n_jobs=-1); confirm this does not oversubscribe the machine.
          'n_jobs': [-1],
          'warm_start': [False],
          'class_weight': [None],               #tried balanced, Smote (using pipeline), random undersampled (using pipeline)
          'ccp_alpha': [0.0]
         }


# 10-fold CV over the grid, scored by ROC AUC; refit=True retrains the best
# candidate on the full training set so `rf` can be used directly as a model.
rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=params,
    cv=10,
    scoring='roc_auc',
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=2
)

# perf_counter is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()

# fit the model
rf.fit(X_train_transformed, y_train)

end_time = time.perf_counter()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

print("\nBest parameters:", rf.best_params_, "\n\nAUCROC:", round(rf.best_score_, 3))

In [31]:
# Tabulate train/validation scores per hyperparameter combination, keeping
# only the combinations that actually received a validation score.
pd.options.display.precision = 3
get_train_val_scores(rf, params=params).loc[lambda scores: scores['mean_test_score'].notna()]


Unnamed: 0,mean_test_score,std_test_score,mean_train_score,param_random_state,param_n_estimators,param_criterion,param_max_depth,param_min_samples_split,param_min_samples_leaf,param_min_weight_fraction_leaf,param_max_features,param_max_leaf_nodes,param_min_impurity_decrease,param_bootstrap,param_oob_score,param_n_jobs,param_warm_start,param_class_weight,param_ccp_alpha
6,0.7,0.0,1.0,109,800,entropy,19,2,2,0.0,sqrt,,0.0,True,False,-1,False,,0.0
7,0.7,0.0,1.0,109,800,entropy,19,3,2,0.0,sqrt,,0.0,True,False,-1,False,,0.0
0,0.7,0.0,1.0,109,800,entropy,17,2,2,0.0,sqrt,,0.0,True,False,-1,False,,0.0
1,0.7,0.0,1.0,109,800,entropy,17,3,2,0.0,sqrt,,0.0,True,False,-1,False,,0.0
3,0.7,0.0,1.0,109,800,entropy,18,2,2,0.0,sqrt,,0.0,True,False,-1,False,,0.0
4,0.7,0.0,1.0,109,800,entropy,18,3,2,0.0,sqrt,,0.0,True,False,-1,False,,0.0


In [7]:
# Bundle the train/test partitions, compute the metrics for the fitted search
# object, and display them as a table. `results` is rebuilt from scratch here,
# so re-running this cell does not accumulate duplicate entries.
data = (X_train_transformed, X_test_transformed, y_train, y_test)
results = [get_performance_metrics(rf, 'Random Forest', data)]
get_results_df(results, model='Random Forest')

Unnamed: 0,model,partition,metric,value
0,Random Forest,Train,Readmitted-Rate-Observed,5.0
1,Random Forest,Train,Readmitted-Rate-Predicted,5.0
2,Random Forest,Train,Naive-Accuracy,95.0
3,Random Forest,Train,Accuracy,95.1
4,Random Forest,Train,AUC-ROC,99.1
5,Random Forest,Train,AUC-PR,93.5
6,Random Forest,Train,F1-Score,3.7
7,Random Forest,Train,Recall-Sensitivity,1.9
8,Random Forest,Train,Specificity,100.0
9,Random Forest,Train,Precision,100.0
