### Random Forest with all dummy features

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
# Read the dataset
data = pd.read_csv("./data_all_dfeatures.csv")
data.shape

(99340, 131)

In [3]:
data.head()

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady
0,0,5,5,5.779898,-1.431088,-1.138455,-0.097257,-0.783548,-1.850433,-0.40111,...,1,0,0,1,0,0,0,0,0,0
1,0,15,1,-0.317652,0.648827,-0.464686,0.820652,-0.783548,0.249657,-0.40111,...,0,0,1,1,0,0,0,0,0,0
2,0,25,1,-0.317652,0.648827,-0.80157,-1.627106,2.152858,-0.368016,2.15797,...,1,0,0,1,0,0,0,0,0,0
3,1,35,1,-0.317652,0.648827,-0.80157,0.055727,-0.196267,0.002588,-0.40111,...,0,0,1,1,0,0,0,0,0,0
4,1,45,1,-0.317652,0.648827,-1.138455,0.412692,-0.783548,-0.98569,-0.40111,...,0,1,0,1,0,0,0,0,0,0


In [4]:
# SMOTE balancing technique is applied
from imblearn.over_sampling import SMOTE
# Importing test_train_split from sklearn library
from sklearn.model_selection import train_test_split

# get the predictors and target variable from dataset
X = data.drop('readmitted', axis=1)
y = data['readmitted']
oversample = SMOTE()

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=101, stratify=y)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [5]:
# Random Forest

# Let's first fit a random forest model with default hyperparameters.
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [6]:
# Running the random forest with default parameters.
rfc = RandomForestClassifier()

In [7]:
# fit the model
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [8]:
# predict
predictions = rfc.predict(X_test)

In [9]:
# Printing confusion matrix
print(confusion_matrix(y_test,predictions))

[[17462   143]
 [ 2189    74]]


In [10]:
print(accuracy_score(y_test,predictions))

0.882625327159251


In [11]:
# Let's check the report of our default model
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94     17605
           1       0.34      0.03      0.06      2263

    accuracy                           0.88     19868
   macro avg       0.61      0.51      0.50     19868
weighted avg       0.83      0.88      0.84     19868



In [12]:
from sklearn.model_selection import RandomizedSearchCV

In [13]:
# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
param_grid = {'max_features': [4, 8, 14, 20],
             'n_estimators': range(100, 1500, 400),
             'max_depth': range(2, 20, 5),
             'min_samples_leaf': range(100, 400, 50),
             'min_samples_split': range(200, 500, 50),
             }
# instantiate the model 
rf = RandomForestClassifier()


# fit tree on training data
rf = RandomizedSearchCV(rf, param_grid, 
                  cv=n_folds, 
                  scoring="accuracy",
                  return_train_score=True,
                  n_jobs = -1,verbose = 1)
rf.fit(X_train, y_train)

best_parameters = rf.best_params_
print(best_parameters)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 900, 'min_samples_split': 300, 'min_samples_leaf': 100, 'max_features': 20, 'max_depth': 17}


In [14]:
scores = rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,10.52241,0.35666,0.386081,0.01949,100,350,350,8,7,"{'n_estimators': 100, 'min_samples_split': 350...",...,0.804782,0.07212,7,0.861457,0.795222,0.807524,0.799625,0.792916,0.811349,0.025545
1,97.346365,0.947468,4.650152,0.201077,1300,300,250,4,12,"{'n_estimators': 1300, 'min_samples_split': 30...",...,0.833573,0.082707,4,0.895121,0.827856,0.826136,0.825248,0.826535,0.840179,0.027484
2,2.853763,0.180202,0.196871,0.009361,100,200,150,4,2,"{'n_estimators': 100, 'min_samples_split': 200...",...,0.768046,0.057479,10,0.802934,0.765037,0.757389,0.768687,0.755844,0.769978,0.017149
3,202.087237,1.203264,4.475873,0.165657,1300,200,150,14,7,"{'n_estimators': 1300, 'min_samples_split': 20...",...,0.801402,0.069724,8,0.863534,0.798257,0.796084,0.795747,0.797353,0.810195,0.026685
4,3.04004,0.058157,0.203712,0.002017,100,300,100,4,2,"{'n_estimators': 100, 'min_samples_split': 300...",...,0.776353,0.061593,9,0.822344,0.768986,0.769299,0.770755,0.771118,0.780501,0.020938


In [15]:
# Get the best parameters
best_parameters = rf.best_params_
print(best_parameters)

{'n_estimators': 900, 'min_samples_split': 300, 'min_samples_leaf': 100, 'max_features': 20, 'max_depth': 17}


In [16]:
# Build the Random Forest with the best_parameters
rf_best = RandomForestClassifier(n_estimators=best_parameters['n_estimators'],
                                 min_samples_split=best_parameters['min_samples_split'],
                                 min_samples_leaf=best_parameters['min_samples_leaf'],
                                 max_features=best_parameters['max_features'],
                                 max_depth=best_parameters['max_depth'],
                                 criterion='gini')
# fit the model
rf_best.fit(X_train, y_train)

# predict
best_predictions = rf_best.predict(X_test)

In [17]:
# evaluation metrics
print(classification_report(y_test,best_predictions))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92     17605
           1       0.23      0.12      0.16      2263

    accuracy                           0.85     19868
   macro avg       0.56      0.53      0.54     19868
weighted avg       0.82      0.85      0.83     19868



In [18]:
# Let's check the overall accuracy over the test dataset
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score
print("Accuracy_score: ", metrics.accuracy_score(y_test, best_predictions))

print("AUC_accu =", metrics.roc_auc_score(y_test, best_predictions ))
print("Precision_score = ",precision_score(y_test, best_predictions))
print("recall_score = ",recall_score(y_test, best_predictions))


Accuracy_score:  0.8545902959532917
AUC_accu = 0.5344005533116558
Precision_score =  0.2320205479452055
recall_score =  0.11975254087494476
