In [None]:
import numpy as np
import pandas as pd
from scipy.stats import uniform, randint
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so later cells can read from it.
# NOTE(review): google.colab is only importable inside Google Colab; this cell fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the preprocessed (pre-feature-selection) diabetes data from the mounted Drive folder.
csv_path = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/Diabetes_Preprocessed_Before_Feature_Selection.csv'
df = pd.read_csv(csv_path)
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,glyburide-metformin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,0,5,Not Available,Referral,1,41,0,1,Diabetes,Not Required,Not Required,1,,,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
1,Caucasian,0,15,Discharged to home,Emergency,3,59,0,18,"Endocrine, Nutritional, Metabolic, Immunity",Diabetes,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
2,AfricanAmerican,0,25,Discharged to home,Emergency,2,11,5,13,"Pregnancy, Childbirth",Diabetes,External causes of injury,6,,,-2,-2,-2,-2,0,-2,-2,-2,-2,-2,1,0,3,0,other_meds
3,Caucasian,1,35,Discharged to home,Emergency,2,44,1,16,Infectious and Parasitic,Diabetes,Circulatory,7,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
4,Caucasian,1,45,Discharged to home,Emergency,1,51,0,8,Neoplasms,Neoplasms,Diabetes,5,,,-2,-2,-2,-2,0,-2,-2,-2,0,-2,1,0,0,0,insulin_combo


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(97070, 30)

In [None]:
# Separate the predictors from the `readmitted` target.
# `columns=` replaces the positional axis argument (`df.drop('readmitted', 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
X = df.drop(columns = 'readmitted')
y = df['readmitted']
# One-hot encode the non-numeric columns; drop_first removes one redundant level per feature.
X_dum = pd.get_dummies(X, drop_first = True)
# Stratified 70/30 hold-out split so both parts keep the class ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size = 0.3, random_state = 0, stratify = y)

In [None]:
# Fit a default AdaBoost model on every encoded feature to obtain feature importances.
model = AdaBoostClassifier(random_state = 0).fit(X_train, y_train)
# Pair each importance with its column name and list the pairs in descending order.
importance_by_feature = zip(model.feature_importances_, X_train.columns)
sorted(importance_by_feature, reverse = True)

[(0.16, 'preceding_year_visits'),
 (0.08, 'time_in_hospital'),
 (0.08, 'age'),
 (0.06, 'num_medications'),
 (0.02, 'number_diagnoses'),
 (0.02, 'num_procedures'),
 (0.02, 'num_lab_procedures'),
 (0.02, 'metformin'),
 (0.02, 'insulin_treatment_insulin_only'),
 (0.02, 'insulin'),
 (0.02, 'gender'),
 (0.02, 'discharge_disposition_id_Transferred to another medical facility'),
 (0.02, 'discharge_disposition_id_Still patient/referred to this institution'),
 (0.02, 'discharge_disposition_id_Not Available'),
 (0.02, 'discharge_disposition_id_Left AMA'),
 (0.02,
  'discharge_disposition_id_Discharged to home with home health service'),
 (0.02, 'diag_3_Pregnancy, Childbirth'),
 (0.02, 'diag_3_Neoplasms'),
 (0.02, 'diag_3_External causes of injury'),
 (0.02, 'diag_3_Endocrine, Nutritional, Metabolic, Immunity'),
 (0.02, 'diag_3_Circulatory'),
 (0.02, 'diag_2_Pregnancy, Childbirth'),
 (0.02, 'diag_2_Neoplasms'),
 (0.02, 'diag_2_Diabetes'),
 (0.02, 'diag_1_Sense Organs'),
 (0.02, 'diag_1_Respirator

In [None]:
# Tabulate the fitted model's importances next to their column names, in training-column order.
fi_df = pd.DataFrame(dict(Features = X_train.columns,
                          Importances = model.feature_importances_))
fi_df

Unnamed: 0,Features,Importances
0,gender,0.02
1,age,0.08
2,time_in_hospital,0.08
3,num_lab_procedures,0.02
4,num_procedures,0.02
5,num_medications,0.06
6,number_diagnoses,0.02
7,metformin,0.02
8,repaglinide,0.0
9,nateglinide,0.0


In [None]:
# Baseline: select features whose importance is >= 0 and score AdaBoost on the hold-out split.
imp = fi_df.loc[fi_df['Importances'] >= 0, 'Features']
print("Number of features:", len(imp))

model = AdaBoostClassifier(random_state = 0).fit(X_train[imp], y_train)

# Hard labels for the confusion matrix / report, positive-class probability for ROC AUC.
X_test_selected = X_test[imp]
pred = model.predict(X_test_selected)
prob = model.predict_proba(X_test_selected)[:, 1]

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))

Number of features: 97
[[25783     2]
 [ 3334     2]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     25785
           1       0.50      0.00      0.00      3336

    accuracy                           0.89     29121
   macro avg       0.69      0.50      0.47     29121
weighted avg       0.84      0.89      0.83     29121

ROC_AUC: 0.6560870849568164


In [None]:
# Keep only the features with strictly positive importance, then refit and re-score.
has_importance = fi_df['Importances'] > 0
imp = fi_df[has_importance]['Features']
print("Number of features:", len(imp))

model = AdaBoostClassifier(random_state = 0)
model.fit(X_train[imp], y_train)

# Evaluate on the untouched hold-out split restricted to the selected columns.
X_test_selected = X_test[imp]
pred = model.predict(X_test_selected)
prob = model.predict_proba(X_test_selected)[:, 1]

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))

Number of features: 35
[[25783     2]
 [ 3334     2]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     25785
           1       0.50      0.00      0.00      3336

    accuracy                           0.89     29121
   macro avg       0.69      0.50      0.47     29121
weighted avg       0.84      0.89      0.83     29121

ROC_AUC: 0.6560870849568164


### Sampling Techniques

#### Under Sampling

In [None]:
# Random under-sampling is applied to the training split only; the test split is left as is.
rus = RandomUnderSampler(random_state = 0)
X_rus, y_rus = rus.fit_resample(X_train[imp], y_train)

model = AdaBoostClassifier(random_state = 0).fit(X_rus, y_rus)

# Score on the original hold-out split.
X_test_selected = X_test[imp]
pred = model.predict(X_test_selected)
prob = model.predict_proba(X_test_selected)[:, 1]

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[16566  9219]
 [ 1406  1930]]
              precision    recall  f1-score   support

           0       0.92      0.64      0.76     25785
           1       0.17      0.58      0.27      3336

    accuracy                           0.64     29121
   macro avg       0.55      0.61      0.51     29121
weighted avg       0.84      0.64      0.70     29121

ROC_AUC: 0.6531558406561545


#### Over Sampling

In [None]:
# Random over-sampling is applied to the training split only; the test split is left as is.
ros = RandomOverSampler(random_state = 0)
X_ros, y_ros = ros.fit_resample(X_train[imp], y_train)

model = AdaBoostClassifier(random_state = 0).fit(X_ros, y_ros)

# Score on the original hold-out split.
X_test_selected = X_test[imp]
pred = model.predict(X_test_selected)
prob = model.predict_proba(X_test_selected)[:, 1]

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[16488  9297]
 [ 1390  1946]]
              precision    recall  f1-score   support

           0       0.92      0.64      0.76     25785
           1       0.17      0.58      0.27      3336

    accuracy                           0.63     29121
   macro avg       0.55      0.61      0.51     29121
weighted avg       0.84      0.63      0.70     29121

ROC_AUC: 0.6563384022276071


#### SMOTE

In [None]:
# SMOTE: synthesise minority-class samples on the training split only.
# `n_jobs` is no longer passed: imbalanced-learn deprecated that argument on SMOTE in 0.10,
# and it only affected parallelism, not the resampled data.
# NOTE(review): X_train[imp] appears to mix one-hot dummy columns with numeric ones; SMOTE
# interpolates between neighbours, so SMOTENC may be the better fit here — confirm.
sm = SMOTE(random_state = 0)
X_sm, y_sm = sm.fit_resample(X_train[imp], y_train)
model = AdaBoostClassifier(random_state = 0)
model.fit(X_sm, y_sm)
# Evaluate on the original (non-resampled) hold-out split.
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[19730  6055]
 [ 2506   830]]
              precision    recall  f1-score   support

           0       0.89      0.77      0.82     25785
           1       0.12      0.25      0.16      3336

    accuracy                           0.71     29121
   macro avg       0.50      0.51      0.49     29121
weighted avg       0.80      0.71      0.75     29121

ROC_AUC: 0.5220423893578564


#### SMOTETomek

In [None]:
# SMOTETomek: SMOTE over-sampling followed by Tomek-link cleaning, on the training split only.
smtmk = SMOTETomek(random_state = 0)
# Resample with `smtmk` itself. Calling `sm.fit_resample` here would silently re-run plain
# SMOTE and reproduce the SMOTE cell's results instead of evaluating SMOTETomek.
X_smtmk, y_smtmk = smtmk.fit_resample(X_train[imp], y_train)
model = AdaBoostClassifier(random_state = 0)
model.fit(X_smtmk, y_smtmk)
# Evaluate on the original (non-resampled) hold-out split.
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[19730  6055]
 [ 2506   830]]
              precision    recall  f1-score   support

           0       0.89      0.77      0.82     25785
           1       0.12      0.25      0.16      3336

    accuracy                           0.71     29121
   macro avg       0.50      0.51      0.49     29121
weighted avg       0.80      0.71      0.75     29121

ROC_AUC: 0.5220423893578564


### Hyper-Parameter Tuning

In [None]:
# Compare the (rows, columns) of the under-sampled and over-sampled training sets.
X_rus.shape, X_ros.shape

((15568, 35), (120330, 35))

In [None]:
# Preview the candidate learning rates: 100 log-spaced values from 10**-1 to 10**1.
np.logspace(start = -1, stop = 1, num = 100)

array([ 0.1       ,  0.10476158,  0.10974988,  0.1149757 ,  0.12045035,
        0.12618569,  0.13219411,  0.13848864,  0.14508288,  0.15199111,
        0.15922828,  0.16681005,  0.17475284,  0.18307383,  0.19179103,
        0.2009233 ,  0.21049041,  0.22051307,  0.23101297,  0.24201283,
        0.25353645,  0.26560878,  0.27825594,  0.29150531,  0.30538555,
        0.31992671,  0.33516027,  0.35111917,  0.36783798,  0.38535286,
        0.40370173,  0.42292429,  0.44306215,  0.46415888,  0.48626016,
        0.5094138 ,  0.53366992,  0.55908102,  0.58570208,  0.61359073,
        0.64280731,  0.67341507,  0.70548023,  0.7390722 ,  0.77426368,
        0.81113083,  0.84975344,  0.89021509,  0.93260335,  0.97700996,
        1.02353102,  1.07226722,  1.12332403,  1.17681195,  1.23284674,
        1.29154967,  1.35304777,  1.41747416,  1.48496826,  1.55567614,
        1.62975083,  1.70735265,  1.78864953,  1.87381742,  1.96304065,
        2.05651231,  2.15443469,  2.25701972,  2.36448941,  2.47

In [None]:
# Randomized search for AdaBoost combined with random over-sampling.
#
# The sampler lives inside an imblearn Pipeline and the search is fitted on the original
# training split, so over-sampling is applied to the training part of each CV fold only.
# Cross-validating on data that was over-sampled beforehand puts copies of the same minority
# rows in both the training and validation folds and inflates the scores.
#
# Scoring uses ROC AUC (the metric reported throughout this notebook): recall alone is
# maximised by a classifier that labels every row positive.
model = AdaBoostClassifier(random_state = 0)
ros_pipeline = Pipeline([('sampler', RandomOverSampler(random_state = 0)),
                         ('model', model)])
# Parameter names carry the `model__` prefix because they address the pipeline's final step.
params = {'model__n_estimators' : np.arange(100, 1000),
          'model__learning_rate' : np.logspace(-1, 1, 100)}
rsearch = RandomizedSearchCV(ros_pipeline, params, n_iter = 50, scoring = 'roc_auc', n_jobs = -1,
                             cv = 5, verbose = 1, random_state = 0)
rsearch.fit(X_train[imp], y_train)
rsearch.best_score_, rsearch.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 122.8min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 163.3min finished


In [None]:
# Randomized search for AdaBoost on the under-sampled training data.
#
# Scoring uses ROC AUC (the metric reported throughout this notebook) rather than recall:
# recall alone is maximised by a classifier that labels every row positive, so a
# recall-driven search can select a degenerate model with a "perfect" score of 1.0.
model = AdaBoostClassifier(random_state = 0)
params = {'n_estimators' : np.arange(100, 1000),
          'learning_rate' : np.logspace(-1, 1, 100)}
rsearch = RandomizedSearchCV(model, params, n_iter = 50, scoring = 'roc_auc', n_jobs = -1,
                             cv = 5, verbose = 1, random_state = 0)
rsearch.fit(X_rus, y_rus)
rsearch.best_score_, rsearch.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 18.9min finished


(1.0, {'learning_rate': 6.280291441834256, 'n_estimators': 471})

In [None]:
# Best mean cross-validated score found by the most recently fitted randomized search.
rsearch.best_score_

1.0

In [None]:
# Evaluate the tuned model on the untouched hold-out split.
# RandomizedSearchCV refits the best configuration on the full data passed to `rsearch.fit`
# (refit defaults to True), so `best_estimator_` is already trained and needs no second fit.
model = rsearch.best_estimator_
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]
print(confusion_matrix(y_test, pred))
# zero_division = 0 reports 0.0 for a class that is never predicted, without the
# undefined-metric warning.
print(classification_report(y_test, pred, zero_division = 0))
print("ROC_AUC:", roc_auc_score(y_test, prob))

[[    0 25785]
 [    0  3336]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     25785
           1       0.11      1.00      0.21      3336

    accuracy                           0.11     29121
   macro avg       0.06      0.50      0.10     29121
weighted avg       0.01      0.11      0.02     29121

ROC_AUC: 0.5819535471099561


  _warn_prf(average, modifier, msg_start, len(result))
