In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_curve, precision_recall_curve, classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.combine import SMOTETomek
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
# Silence every warning category for the rest of the session
# (this also hides deprecation notices raised by the libraries imported above).
warnings.filterwarnings('ignore')
# Let pandas show up to 100 columns and 100 rows before it truncates a display.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the preprocessed (pre-feature-selection) dataset from the mounted Drive and preview the first rows.
csv_path = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/Diabetes_Preprocessed_Before_Feature_Selection.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,0,5,Not Available,Referral,1,41,0,1,Diabetes,Not Required,Not Required,1,,,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
1,Caucasian,0,15,Discharged to home,Emergency,3,59,0,18,"Endocrine, Nutritional, Metabolic, Immunity",Diabetes,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,-2,-2,-2,-2,1,1,0,0,1,insulin_only
2,AfricanAmerican,0,25,Discharged to home,Emergency,2,11,5,13,"Pregnancy, Childbirth",Diabetes,External causes of injury,6,,,-2,-2,-2,0,-2,-2,-2,-2,1,0,3,0,other_meds
3,Caucasian,1,35,Discharged to home,Emergency,2,44,1,16,Infectious and Parasitic,Diabetes,Circulatory,7,,,-2,-2,-2,-2,-2,-2,-2,1,1,0,0,1,insulin_only
4,Caucasian,1,45,Discharged to home,Emergency,1,51,0,8,Neoplasms,Neoplasms,Diabetes,5,,,-2,-2,-2,0,-2,-2,-2,0,1,0,0,0,insulin_combo


In [None]:
# Quick sanity check: (rows, columns) of the loaded frame.
df.shape

(97070, 28)

In [None]:
# Separate the target from the predictors. The column is named through `columns=`
# because pandas 2.x no longer accepts the axis as a second positional argument.
X = df.drop(columns = 'readmitted')
y = df['readmitted']
# One-hot encode the categorical predictors; drop_first removes one level per
# feature so the resulting dummy columns are not perfectly collinear.
X_dum = pd.get_dummies(X, drop_first = True)
# 70/30 split, stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size = 0.3, random_state = 0, stratify = y)

In [None]:
# Fit a default random forest on every encoded feature, then show
# (importance, feature) pairs ordered from most to least important.
model = RandomForestClassifier(random_state = 0, n_jobs = -1)
model.fit(X_train, y_train)
importance_pairs = zip(model.feature_importances_, X_train.columns)
sorted(importance_pairs, reverse = True)

[(0.09878159999880749, 'num_lab_procedures'),
 (0.08720190393977335, 'num_medications'),
 (0.06318743211903118, 'time_in_hospital'),
 (0.05554712572719704, 'preceding_year_visits'),
 (0.05145485596828784, 'age'),
 (0.04438641347711193, 'num_procedures'),
 (0.03973634017483778, 'number_diagnoses'),
 (0.022522569138021604, 'gender'),
 (0.02090317188290997, 'insulin'),
 (0.017551457662588057, 'admission_source_id_Referral'),
 (0.01702549936800917, 'race_Caucasian'),
 (0.016753609539938923, 'diag_2_Circulatory'),
 (0.016741771574289275, 'diag_3_Circulatory'),
 (0.015543278385913326, 'diag_1_Circulatory'),
 (0.012561560461283984, 'diag_3_Diabetes'),
 (0.012350080565701596, 'number_changes'),
 (0.012299510396421817, 'metformin'),
 (0.012045603077577184,
  'discharge_disposition_id_Transferred to another medical facility'),
 (0.011635478446431223, 'glipizide'),
 (0.010516778044156827, 'glyburide'),
 (0.010163062655697398, 'diag_2_Diabetes'),
 (0.01010559628942244, 'diag_2_Respiratory'),
 (0.0

In [None]:
# Same forest, but with class_weight='balanced' so classes are weighted inversely
# to their frequency; importances are listed from most to least important.
model_cw1 = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced')
model_cw1.fit(X_train, y_train)
importance_pairs = zip(model_cw1.feature_importances_, X_train.columns)
sorted(importance_pairs, reverse = True)

[(0.09722876488991944, 'num_lab_procedures'),
 (0.08517285276620576, 'num_medications'),
 (0.062411255902758415, 'time_in_hospital'),
 (0.06111154118857542, 'preceding_year_visits'),
 (0.05103688212488469, 'age'),
 (0.04399676244160737, 'num_procedures'),
 (0.042068541788467646, 'number_diagnoses'),
 (0.02081980606729452, 'gender'),
 (0.020719372659299306, 'insulin'),
 (0.017292437898349142,
  'discharge_disposition_id_Transferred to another medical facility'),
 (0.017149048558068244, 'admission_source_id_Referral'),
 (0.016908302300800246, 'diag_3_Circulatory'),
 (0.016702707688939996, 'race_Caucasian'),
 (0.016596775493678294, 'diag_2_Circulatory'),
 (0.015091184322786404, 'diag_1_Circulatory'),
 (0.01272820745092543, 'diag_3_Diabetes'),
 (0.012615762984400113, 'metformin'),
 (0.012368744244885727, 'number_changes'),
 (0.011584394380125533, 'glipizide'),
 (0.010820693664102368,
  'discharge_disposition_id_Discharged to home with home health service'),
 (0.010622844091598032, 'diag_2_

In [None]:
# Same forest with class_weight='balanced_subsample' (weights recomputed on each
# tree's bootstrap sample); importances are listed from most to least important.
model_cw2 = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced_subsample')
model_cw2.fit(X_train, y_train)
importance_pairs = zip(model_cw2.feature_importances_, X_train.columns)
sorted(importance_pairs, reverse = True)

[(0.09671993472854003, 'num_lab_procedures'),
 (0.08548228657373078, 'num_medications'),
 (0.062446399779969713, 'time_in_hospital'),
 (0.06042540819753317, 'preceding_year_visits'),
 (0.05120547544362624, 'age'),
 (0.04415096538210633, 'num_procedures'),
 (0.041227826263442145, 'number_diagnoses'),
 (0.020495090825902097, 'gender'),
 (0.0201503462093601, 'insulin'),
 (0.017333868624548127,
  'discharge_disposition_id_Transferred to another medical facility'),
 (0.016921752064277085, 'diag_3_Circulatory'),
 (0.01687463532578772, 'admission_source_id_Referral'),
 (0.01661287811728064, 'diag_2_Circulatory'),
 (0.016412853715092886, 'race_Caucasian'),
 (0.015584348159338826, 'diag_1_Circulatory'),
 (0.012845175947945077, 'diag_3_Diabetes'),
 (0.012753774968281301, 'metformin'),
 (0.012294583089840683, 'number_changes'),
 (0.0117224876851911, 'glipizide'),
 (0.010831088323488567, 'diag_1_Respiratory'),
 (0.010619770473868244, 'diag_2_Respiratory'),
 (0.010508531378868785, 'diag_2_Diabetes'

In [None]:
# Put the importances of the three forests side by side, one row per feature,
# so a threshold can later be applied to any of the three columns.
importance_columns = {
    'Features' : X_train.columns,
    'No_Class_Weight' : model.feature_importances_,
    'Balanced' : model_cw1.feature_importances_,
    'Balanced_Subsample' : model_cw2.feature_importances_,
}
fi_df = pd.DataFrame(importance_columns)
fi_df

Unnamed: 0,Features,No_Class_Weight,Balanced,Balanced_Subsample
0,gender,0.022523,0.02082,0.020495
1,age,0.051455,0.051037,0.051205
2,time_in_hospital,0.063187,0.062411,0.062446
3,num_lab_procedures,0.098782,0.097229,0.09672
4,num_procedures,0.044386,0.043997,0.044151
5,num_medications,0.087202,0.085173,0.085482
6,number_diagnoses,0.039736,0.042069,0.041228
7,metformin,0.0123,0.012616,0.012754
8,repaglinide,0.00362,0.002939,0.003152
9,glimepiride,0.006575,0.006472,0.006629


### Without Class Weight

In [None]:
# Baseline forest (no class weights) on the full feature set: impurity-based
# importances are never negative, so the `>= 0` cut keeps every feature.
keep = fi_df['No_Class_Weight'] >= 0
imp = fi_df.loc[keep, 'Features']
print("Number of features:", len(imp))
model = RandomForestClassifier(n_jobs = -1, random_state = 0)

# Wall-clock time of the fit on the selected columns.
start_time = time.time()
model.fit(X_train[imp], y_train)
end_time = time.time()
print("Training Time:", end_time - start_time)

y_train_pred = model.predict(X_train[imp])
y_train_prob = model.predict_proba(X_train[imp])[:, 1]

# Only the hard-label prediction on the test set is timed.
start_time = time.time()
y_test_pred = model.predict(X_test[imp])
end_time = time.time()
print("Prediction Time:", end_time - start_time)
y_test_prob = model.predict_proba(X_test[imp])[:, 1]

# One tuple per print() call; the empty tuple prints the blank line between the
# train and test sections.
report = [
    ("Train Accuracy Score:", accuracy_score(y_train, y_train_pred)),
    ("Train Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred), '\n'),
    ("Train F1 Score:", f1_score(y_train, y_train_pred)),
    ("Train Precision Score:", precision_score(y_train, y_train_pred)),
    ("Train Recall Score:", recall_score(y_train, y_train_pred)),
    ("Train ROC_AUC Score:", roc_auc_score(y_train, y_train_prob)),
    (),
    ("Test Accuracy Score:", accuracy_score(y_test, y_test_pred)),
    ("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred), '\n'),
    ("Test F1 Score:", f1_score(y_test, y_test_pred)),
    ("Test Precision Score:", precision_score(y_test, y_test_pred)),
    ("Test Recall Score:", recall_score(y_test, y_test_pred)),
    ("Test ROC_AUC Score:", roc_auc_score(y_test, y_test_prob)),
]
for fields in report:
    print(*fields)

Number of features: 95
Training Time: 13.621416091918945
Prediction Time: 1.024709701538086
Train Accuracy Score: 0.9999411323198281
Train Confusion Matrix:
 [[60165     0]
 [    4  7780]] 

Train F1 Score: 0.9997429966589566
Train Precision Score: 1.0
Train Recall Score: 0.9994861253854059
Train ROC_AUC Score: 0.999999998932364

Test Accuracy Score: 0.8855465128257958
Test Confusion Matrix:
 [[25778     7]
 [ 3326    10]] 

Test F1 Score: 0.005964807634953773
Test Precision Score: 0.5882352941176471
Test Recall Score: 0.002997601918465228
Test ROC_AUC Score: 0.6344802982512187


In [None]:
# Baseline forest (no class weights) restricted to the features whose
# unweighted importance is at least 0.009.
keep = fi_df['No_Class_Weight'] >= 0.009
imp = fi_df.loc[keep, 'Features']
print("Number of features:", len(imp))
model = RandomForestClassifier(n_jobs = -1, random_state = 0)

# Wall-clock time of the fit on the selected columns.
start_time = time.time()
model.fit(X_train[imp], y_train)
end_time = time.time()
print("Training Time:", end_time - start_time)

y_train_pred = model.predict(X_train[imp])
y_train_prob = model.predict_proba(X_train[imp])[:, 1]

# Only the hard-label prediction on the test set is timed.
start_time = time.time()
y_test_pred = model.predict(X_test[imp])
end_time = time.time()
print("Prediction Time:", end_time - start_time)
y_test_prob = model.predict_proba(X_test[imp])[:, 1]

# One tuple per print() call; the empty tuple prints the blank line between the
# train and test sections.
report = [
    ("Train Accuracy Score:", accuracy_score(y_train, y_train_pred)),
    ("Train Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred), '\n'),
    ("Train F1 Score:", f1_score(y_train, y_train_pred)),
    ("Train Precision Score:", precision_score(y_train, y_train_pred)),
    ("Train Recall Score:", recall_score(y_train, y_train_pred)),
    ("Train ROC_AUC Score:", roc_auc_score(y_train, y_train_prob)),
    (),
    ("Test Accuracy Score:", accuracy_score(y_test, y_test_pred)),
    ("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred), '\n'),
    ("Test F1 Score:", f1_score(y_test, y_test_pred)),
    ("Test Precision Score:", precision_score(y_test, y_test_pred)),
    ("Test Recall Score:", recall_score(y_test, y_test_pred)),
    ("Test ROC_AUC Score:", roc_auc_score(y_test, y_test_prob)),
]
for fields in report:
    print(*fields)

Number of features: 30
Training Time: 7.889331102371216
Prediction Time: 0.7116963863372803
Train Accuracy Score: 0.9999411323198281
Train Confusion Matrix:
 [[60165     0]
 [    4  7780]] 

Train F1 Score: 0.9997429966589566
Train Precision Score: 1.0
Train Recall Score: 0.9994861253854059
Train ROC_AUC Score: 0.9999999989323639

Test Accuracy Score: 0.8855808523058961
Test Confusion Matrix:
 [[25782     3]
 [ 3329     7]] 

Test F1 Score: 0.0041841004184100415
Test Precision Score: 0.7
Test Recall Score: 0.0020983213429256594
Test ROC_AUC Score: 0.6317506785729065


### With Class Weight

#### Balanced

In [None]:
# Forest with class_weight='balanced' on the full feature set: impurity-based
# importances are never negative, so the `>= 0` cut keeps every feature.
keep = fi_df['Balanced'] >= 0
imp = fi_df.loc[keep, 'Features']
print("Number of features:", len(imp))
model = RandomForestClassifier(n_jobs = -1, random_state = 0, class_weight = 'balanced')

# Wall-clock time of the fit on the selected columns.
start_time = time.time()
model.fit(X_train[imp], y_train)
end_time = time.time()
print("Training Time:", end_time - start_time)

y_train_pred = model.predict(X_train[imp])
y_train_prob = model.predict_proba(X_train[imp])[:, 1]

# Only the hard-label prediction on the test set is timed.
start_time = time.time()
y_test_pred = model.predict(X_test[imp])
end_time = time.time()
print("Prediction Time:", end_time - start_time)
y_test_prob = model.predict_proba(X_test[imp])[:, 1]

# One tuple per print() call; the empty tuple prints the blank line between the
# train and test sections.
report = [
    ("Train Accuracy Score:", accuracy_score(y_train, y_train_pred)),
    ("Train Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred), '\n'),
    ("Train F1 Score:", f1_score(y_train, y_train_pred)),
    ("Train Precision Score:", precision_score(y_train, y_train_pred)),
    ("Train Recall Score:", recall_score(y_train, y_train_pred)),
    ("Train ROC_AUC Score:", roc_auc_score(y_train, y_train_prob)),
    (),
    ("Test Accuracy Score:", accuracy_score(y_test, y_test_pred)),
    ("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred), '\n'),
    ("Test F1 Score:", f1_score(y_test, y_test_pred)),
    ("Test Precision Score:", precision_score(y_test, y_test_pred)),
    ("Test Recall Score:", recall_score(y_test, y_test_pred)),
    ("Test ROC_AUC Score:", roc_auc_score(y_test, y_test_prob)),
]
for fields in report:
    print(*fields)

Number of features: 95
Training Time: 12.926573753356934
Prediction Time: 0.922868013381958
Train Accuracy Score: 0.9999411323198281
Train Confusion Matrix:
 [[60164     1]
 [    3  7781]] 

Train F1 Score: 0.999743029680072
Train Precision Score: 0.9998714983294783
Train Recall Score: 0.9996145940390545
Train ROC_AUC Score: 0.9999996615593517

Test Accuracy Score: 0.8856838707461969
Test Confusion Matrix:
 [[25779     6]
 [ 3323    13]] 

Test F1 Score: 0.00774962742175857
Test Precision Score: 0.6842105263157895
Test Recall Score: 0.0038968824940047962
Test ROC_AUC Score: 0.6365730277906819


In [None]:
# Forest with class_weight='balanced', restricted to the features whose
# 'Balanced' importance is at least 0.009.
keep = fi_df['Balanced'] >= 0.009
imp = fi_df.loc[keep, 'Features']
print("Number of features:", len(imp))
model = RandomForestClassifier(n_jobs = -1, random_state = 0, class_weight = 'balanced')

# Wall-clock time of the fit on the selected columns.
start_time = time.time()
model.fit(X_train[imp], y_train)
end_time = time.time()
print("Training Time:", end_time - start_time)

y_train_pred = model.predict(X_train[imp])
y_train_prob = model.predict_proba(X_train[imp])[:, 1]

# Only the hard-label prediction on the test set is timed.
start_time = time.time()
y_test_pred = model.predict(X_test[imp])
end_time = time.time()
print("Prediction Time:", end_time - start_time)
y_test_prob = model.predict_proba(X_test[imp])[:, 1]

# One tuple per print() call; the empty tuple prints the blank line between the
# train and test sections.
report = [
    ("Train Accuracy Score:", accuracy_score(y_train, y_train_pred)),
    ("Train Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred), '\n'),
    ("Train F1 Score:", f1_score(y_train, y_train_pred)),
    ("Train Precision Score:", precision_score(y_train, y_train_pred)),
    ("Train Recall Score:", recall_score(y_train, y_train_pred)),
    ("Train ROC_AUC Score:", roc_auc_score(y_train, y_train_prob)),
    (),
    ("Test Accuracy Score:", accuracy_score(y_test, y_test_pred)),
    ("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred), '\n'),
    ("Test F1 Score:", f1_score(y_test, y_test_pred)),
    ("Test Precision Score:", precision_score(y_test, y_test_pred)),
    ("Test Recall Score:", recall_score(y_test, y_test_pred)),
    ("Test ROC_AUC Score:", roc_auc_score(y_test, y_test_prob)),
]
for fields in report:
    print(*fields)

Number of features: 30
Training Time: 8.294874429702759
Prediction Time: 0.8141505718231201
Train Accuracy Score: 0.9999558492398711
Train Confusion Matrix:
 [[60164     1]
 [    2  7782]] 

Train F1 Score: 0.9998072846405859
Train Precision Score: 0.999871514840036
Train Recall Score: 0.999743062692703
Train ROC_AUC Score: 0.9999997533760575

Test Accuracy Score: 0.8854778338655953
Test Confusion Matrix:
 [[25781     4]
 [ 3331     5]] 

Test F1 Score: 0.0029895366218236174
Test Precision Score: 0.5555555555555556
Test Recall Score: 0.001498800959232614
Test ROC_AUC Score: 0.6252348673707921


#### Balanced Subsample

In [None]:
# Forest with class_weight='balanced_subsample' on the full feature set:
# impurity-based importances are never negative, so the `>= 0` cut keeps every feature.
keep = fi_df['Balanced_Subsample'] >= 0
imp = fi_df.loc[keep, 'Features']
print("Number of features:", len(imp))
model = RandomForestClassifier(n_jobs = -1, random_state = 0, class_weight = 'balanced_subsample')

# Wall-clock time of the fit on the selected columns.
start_time = time.time()
model.fit(X_train[imp], y_train)
end_time = time.time()
print("Training Time:", end_time - start_time)

y_train_pred = model.predict(X_train[imp])
y_train_prob = model.predict_proba(X_train[imp])[:, 1]

# Only the hard-label prediction on the test set is timed.
start_time = time.time()
y_test_pred = model.predict(X_test[imp])
end_time = time.time()
print("Prediction Time:", end_time - start_time)
y_test_prob = model.predict_proba(X_test[imp])[:, 1]

# One tuple per print() call; the empty tuple prints the blank line between the
# train and test sections.
report = [
    ("Train Accuracy Score:", accuracy_score(y_train, y_train_pred)),
    ("Train Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred), '\n'),
    ("Train F1 Score:", f1_score(y_train, y_train_pred)),
    ("Train Precision Score:", precision_score(y_train, y_train_pred)),
    ("Train Recall Score:", recall_score(y_train, y_train_pred)),
    ("Train ROC_AUC Score:", roc_auc_score(y_train, y_train_prob)),
    (),
    ("Test Accuracy Score:", accuracy_score(y_test, y_test_pred)),
    ("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred), '\n'),
    ("Test F1 Score:", f1_score(y_test, y_test_pred)),
    ("Test Precision Score:", precision_score(y_test, y_test_pred)),
    ("Test Recall Score:", recall_score(y_test, y_test_pred)),
    ("Test ROC_AUC Score:", roc_auc_score(y_test, y_test_prob)),
]
for fields in report:
    print(*fields)

Number of features: 95
Training Time: 14.563291072845459
Prediction Time: 0.9247901439666748
Train Accuracy Score: 0.9999411323198281
Train Confusion Matrix:
 [[60164     1]
 [    3  7781]] 

Train F1 Score: 0.999743029680072
Train Precision Score: 0.9998714983294783
Train Recall Score: 0.9996145940390545
Train ROC_AUC Score: 0.9999996594240794

Test Accuracy Score: 0.885443494385495
Test Confusion Matrix:
 [[25777     8]
 [ 3328     8]] 

Test F1 Score: 0.00477326968973747
Test Precision Score: 0.5
Test Recall Score: 0.002398081534772182
Test ROC_AUC Score: 0.6343718277268819


In [None]:
# Forest with class_weight='balanced_subsample', restricted to the features whose
# 'Balanced_Subsample' importance is at least 0.009.
keep = fi_df['Balanced_Subsample'] >= 0.009
imp = fi_df.loc[keep, 'Features']
print("Number of features:", len(imp))
model = RandomForestClassifier(n_jobs = -1, random_state = 0, class_weight = 'balanced_subsample')

# Wall-clock time of the fit on the selected columns.
start_time = time.time()
model.fit(X_train[imp], y_train)
end_time = time.time()
print("Training Time:", end_time - start_time)

y_train_pred = model.predict(X_train[imp])
y_train_prob = model.predict_proba(X_train[imp])[:, 1]

# Only the hard-label prediction on the test set is timed.
start_time = time.time()
y_test_pred = model.predict(X_test[imp])
end_time = time.time()
print("Prediction Time:", end_time - start_time)
y_test_prob = model.predict_proba(X_test[imp])[:, 1]

# One tuple per print() call; the empty tuple prints the blank line between the
# train and test sections.
report = [
    ("Train Accuracy Score:", accuracy_score(y_train, y_train_pred)),
    ("Train Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred), '\n'),
    ("Train F1 Score:", f1_score(y_train, y_train_pred)),
    ("Train Precision Score:", precision_score(y_train, y_train_pred)),
    ("Train Recall Score:", recall_score(y_train, y_train_pred)),
    ("Train ROC_AUC Score:", roc_auc_score(y_train, y_train_prob)),
    (),
    ("Test Accuracy Score:", accuracy_score(y_test, y_test_pred)),
    ("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred), '\n'),
    ("Test F1 Score:", f1_score(y_test, y_test_pred)),
    ("Test Precision Score:", precision_score(y_test, y_test_pred)),
    ("Test Recall Score:", recall_score(y_test, y_test_pred)),
    ("Test ROC_AUC Score:", roc_auc_score(y_test, y_test_prob)),
]
for fields in report:
    print(*fields)

Number of features: 30
Training Time: 9.56050419807434
Prediction Time: 0.7116415500640869
Train Accuracy Score: 0.9999411323198281
Train Confusion Matrix:
 [[60164     1]
 [    3  7781]] 

Train F1 Score: 0.999743029680072
Train Precision Score: 0.9998714983294783
Train Recall Score: 0.9996145940390545
Train ROC_AUC Score: 0.9999998644102134

Test Accuracy Score: 0.8853404759451942
Test Confusion Matrix:
 [[25778     7]
 [ 3332     4]] 

Test F1 Score: 0.0023902001792650133
Test Precision Score: 0.36363636363636365
Test Recall Score: 0.001199040767386091
Test ROC_AUC Score: 0.6283022215153997


### Sampling Techniques

#### Under Sampling

In [None]:
# Balance the training data by randomly discarding majority-class rows, fit an
# unweighted forest on the result, and score it on the untouched test set.
rus = RandomUnderSampler(random_state = 0)
X_rus, y_rus = rus.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1).fit(X_rus, y_rus)

X_eval = X_test[imp]
pred = model.predict(X_eval)
prob = model.predict_proba(X_eval)[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[15582 10203]
 [ 1377  1959]]
              precision    recall  f1-score   support

           0       0.92      0.60      0.73     25785
           1       0.16      0.59      0.25      3336

    accuracy                           0.60     29121
   macro avg       0.54      0.60      0.49     29121
weighted avg       0.83      0.60      0.67     29121

ROC_AUC: 0.6322889216259336


In [None]:
# Random under-sampling of the training data combined with a forest that also
# uses class_weight='balanced_subsample'; scored on the untouched test set.
rus = RandomUnderSampler(random_state = 0)
X_rus, y_rus = rus.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced_subsample').fit(X_rus, y_rus)

X_eval = X_test[imp]
pred = model.predict(X_eval)
prob = model.predict_proba(X_eval)[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[15623 10162]
 [ 1387  1949]]
              precision    recall  f1-score   support

           0       0.92      0.61      0.73     25785
           1       0.16      0.58      0.25      3336

    accuracy                           0.60     29121
   macro avg       0.54      0.60      0.49     29121
weighted avg       0.83      0.60      0.68     29121

ROC_AUC: 0.6346809754058301


#### Over Sampling

In [None]:
# Balance the training data by randomly duplicating minority-class rows, fit an
# unweighted forest on the result, and score it on the untouched test set.
ros = RandomOverSampler(random_state = 0)
X_ros, y_ros = ros.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1).fit(X_ros, y_ros)

X_eval = X_test[imp]
pred = model.predict(X_eval)
prob = model.predict_proba(X_eval)[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[25689    96]
 [ 3278    58]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     25785
           1       0.38      0.02      0.03      3336

    accuracy                           0.88     29121
   macro avg       0.63      0.51      0.49     29121
weighted avg       0.83      0.88      0.83     29121

ROC_AUC: 0.6280131392268384


In [None]:
# Random over-sampling of the training data combined with a forest that also
# uses class_weight='balanced_subsample'; scored on the untouched test set.
ros = RandomOverSampler(random_state = 0)
X_ros, y_ros = ros.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced_subsample').fit(X_ros, y_ros)

X_eval = X_test[imp]
pred = model.predict(X_eval)
prob = model.predict_proba(X_eval)[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[25692    93]
 [ 3284    52]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     25785
           1       0.36      0.02      0.03      3336

    accuracy                           0.88     29121
   macro avg       0.62      0.51      0.48     29121
weighted avg       0.83      0.88      0.83     29121

ROC_AUC: 0.6252679589894111


#### SMOTE

In [None]:
# Balance the training data with SMOTE (synthetic minority-class rows), fit an
# unweighted forest on the result, and score it on the untouched test set.
# NOTE(review): SMOTE's n_jobs argument is deprecated in newer imbalanced-learn
# releases — confirm against the installed version.
sm = SMOTE(random_state = 0, n_jobs = -1)
X_sm, y_sm = sm.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1).fit(X_sm, y_sm)

X_eval = X_test[imp]
pred = model.predict(X_eval)
prob = model.predict_proba(X_eval)[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[23574  2211]
 [ 2998   338]]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90     25785
           1       0.13      0.10      0.11      3336

    accuracy                           0.82     29121
   macro avg       0.51      0.51      0.51     29121
weighted avg       0.80      0.82      0.81     29121

ROC_AUC: 0.5673032719839253


In [None]:
# SMOTE-balanced training data combined with a forest that also uses
# class_weight='balanced_subsample'; scored on the untouched test set.
# NOTE(review): SMOTE's n_jobs argument is deprecated in newer imbalanced-learn
# releases — confirm against the installed version.
sm = SMOTE(random_state = 0, n_jobs = -1)
X_sm, y_sm = sm.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced_subsample').fit(X_sm, y_sm)

X_eval = X_test[imp]
pred = model.predict(X_eval)
prob = model.predict_proba(X_eval)[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[23575  2210]
 [ 2995   341]]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90     25785
           1       0.13      0.10      0.12      3336

    accuracy                           0.82     29121
   macro avg       0.51      0.51      0.51     29121
weighted avg       0.80      0.82      0.81     29121

ROC_AUC: 0.5661021793385536


#### SMOTETomek

In [None]:
# Balance the training data with SMOTETomek (SMOTE over-sampling followed by
# Tomek-link cleaning), fit an unweighted forest, and score it on the test set.
smtmk = SMOTETomek(random_state = 0)
# Resample through `smtmk` itself: going through a plain SMOTE sampler here would
# skip the Tomek-link step and simply repeat the SMOTE experiment.
X_smtmk, y_smtmk = smtmk.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1)
model.fit(X_smtmk, y_smtmk)
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[23574  2211]
 [ 2998   338]]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90     25785
           1       0.13      0.10      0.11      3336

    accuracy                           0.82     29121
   macro avg       0.51      0.51      0.51     29121
weighted avg       0.80      0.82      0.81     29121

ROC_AUC: 0.5673032719839253


In [None]:
# SMOTETomek-balanced training data (SMOTE over-sampling followed by Tomek-link
# cleaning) combined with a forest using class_weight='balanced_subsample'.
smtmk = SMOTETomek(random_state = 0)
# Resample through `smtmk` itself: going through a plain SMOTE sampler here would
# skip the Tomek-link step and simply repeat the SMOTE experiment.
X_smtmk, y_smtmk = smtmk.fit_resample(X_train[imp], y_train)
model = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced_subsample')
model.fit(X_smtmk, y_smtmk)
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[23575  2210]
 [ 2995   341]]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90     25785
           1       0.13      0.10      0.12      3336

    accuracy                           0.82     29121
   macro avg       0.51      0.51      0.51     29121
weighted avg       0.80      0.82      0.81     29121

ROC_AUC: 0.5661021793385536


### Hyper-Parameter Tuning

In [None]:
# Shape of the under-sampled training matrix and the sum of its labels
# (the number of positive rows when the target is coded 0/1).
X_rus.shape, y_rus.sum()

((15568, 25), 7784)

In [None]:
# Coarse random search (10 candidates, 5-fold CV) over forest hyper-parameters on
# the under-sampled training data, selecting the candidate with the best recall.
model = RandomForestClassifier(random_state = 0, n_jobs = -1)
# scipy's randint(low, high) samples integers from low up to high - 1.
params = {'n_estimators' : randint(100, 1000),
          'criterion' : ['gini', 'entropy'],
          'max_depth' : randint(1, 50),
          'min_samples_split' : randint(2, 25),
          'min_samples_leaf' : randint(1, 25),
          'max_features' : randint(1, 25),
          # Real booleans are required: the strings 'True' and 'False' are both
          # truthy, so bootstrap=False would never actually be tried (and newer
          # scikit-learn versions validate this parameter as a bool).
          'bootstrap' : [True, False]}
rsearch = RandomizedSearchCV(model, params, n_iter = 10, scoring = 'recall', n_jobs = -1,
                             cv = 5, verbose = 2, random_state = 0)
rsearch.fit(X_rus, y_rus)

In [None]:
# Best mean cross-validated score (recall, per the search's `scoring`) and the
# hyper-parameters that achieved it.
rsearch.best_score_, rsearch.best_params_

(0.622302546093354,
 {'bootstrap': 'False',
  'criterion': 'gini',
  'max_depth': 12,
  'max_features': 19,
  'min_samples_leaf': 24,
  'min_samples_split': 4,
  'n_estimators': 228})

In [None]:
# Finer random search (50 candidates, 5-fold CV) over narrower hyper-parameter
# ranges, again on the under-sampled training data and scored by recall.
model = RandomForestClassifier(random_state = 0, n_jobs = -1)
# scipy's randint(low, high) samples integers from low up to high - 1.
params = {'n_estimators' : randint(200, 300),
          'criterion' : ['gini', 'entropy'],
          'max_depth' : randint(5, 20),
          'min_samples_split' : randint(2, 10),
          'min_samples_leaf' : randint(20, 50),
          'max_features' : randint(10, 25),
          # Real booleans are required: the strings 'True' and 'False' are both
          # truthy, so bootstrap=False would never actually be tried (and newer
          # scikit-learn versions validate this parameter as a bool).
          'bootstrap' : [True, False]}
rsearch = RandomizedSearchCV(model, params, n_iter = 50, scoring = 'recall', n_jobs = -1,
                             cv = 5, verbose = 1, random_state = 0)
rsearch.fit(X_rus, y_rus)
# Best mean cross-validated recall and the hyper-parameters that achieved it.
rsearch.best_score_, rsearch.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 15.6min finished


(0.6248720844416046,
 {'bootstrap': 'True',
  'criterion': 'gini',
  'max_depth': 16,
  'max_features': 10,
  'min_samples_leaf': 34,
  'min_samples_split': 5,
  'n_estimators': 253})

In [None]:
# Evaluate the tuned forest on the untouched test set.
# RandomizedSearchCV (default refit=True) has already refit the best configuration
# on all of X_rus / y_rus, so best_estimator_ is used as-is rather than being
# trained a second time on the same data.
model = rsearch.best_estimator_
# NOTE(review): `imp` must be the same feature set X_rus was built from. The
# recorded output shows X_rus with 25 columns while the preceding selection cells
# report 30 features — confirm the cell order before a fresh Run All.
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))

[[15400 10385]
 [ 1246  2090]]
              precision    recall  f1-score   support

           0       0.93      0.60      0.73     25785
           1       0.17      0.63      0.26      3336

    accuracy                           0.60     29121
   macro avg       0.55      0.61      0.50     29121
weighted avg       0.84      0.60      0.67     29121

ROC_AUC: 0.6543836774675664
