In [None]:
import numpy as np
import pandas as pd
from scipy.stats import uniform, randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)



In [None]:
# Mount Google Drive inside the Colab VM; the dataset loaded in the next cell
# is read from a path under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the preprocessed (pre-feature-selection) dataset from the mounted Drive
# and preview the first rows.
DATA_PATH = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/Diabetes_Preprocessed_Before_Feature_Selection.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,glyburide-metformin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,0,5,Not Available,Referral,1,41,0,1,Diabetes,Not Required,Not Required,1,,,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
1,Caucasian,0,15,Discharged to home,Emergency,3,59,0,18,"Endocrine, Nutritional, Metabolic, Immunity",Diabetes,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
2,AfricanAmerican,0,25,Discharged to home,Emergency,2,11,5,13,"Pregnancy, Childbirth",Diabetes,External causes of injury,6,,,-2,-2,-2,-2,0,-2,-2,-2,-2,-2,1,0,3,0,other_meds
3,Caucasian,1,35,Discharged to home,Emergency,2,44,1,16,Infectious and Parasitic,Diabetes,Circulatory,7,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
4,Caucasian,1,45,Discharged to home,Emergency,1,51,0,8,Neoplasms,Neoplasms,Diabetes,5,,,-2,-2,-2,-2,0,-2,-2,-2,0,-2,1,0,0,0,insulin_combo


In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(97070, 30)

In [None]:
# Separate the predictors from the target column `readmitted`.
# `columns=` is used instead of a positional axis argument: passing `axis`
# positionally to DataFrame.drop was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
X = df.drop(columns = 'readmitted')
y = df['readmitted']

# One-hot encode the categorical predictors; drop_first removes one level per
# categorical column so the dummy columns are not perfectly collinear.
X_dum = pd.get_dummies(X, drop_first = True)

# 70/30 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size = 0.3, random_state = 0, stratify = y)

In [None]:
# Fit a default gradient-boosting model on every encoded feature; it is only
# used here to obtain feature importances.
model = GradientBoostingClassifier(random_state = 0).fit(X_train, y_train)

# Pair each importance with its column name and list them from most to least important.
importance_pairs = zip(model.feature_importances_, X_train.columns)
sorted(importance_pairs, reverse = True)

[(0.4399437961887246, 'preceding_year_visits'),
 (0.14916716517337786,
  'discharge_disposition_id_Transferred to another medical facility'),
 (0.03675321674542318, 'age'),
 (0.03235826126558023, 'number_diagnoses'),
 (0.028345531759157275, 'num_medications'),
 (0.024792976799512024, 'time_in_hospital'),
 (0.02413955481848056, 'insulin_treatment_insulin_only'),
 (0.022673720381530982, 'diag_1_External causes of injury'),
 (0.01781331252490117, 'num_lab_procedures'),
 (0.017286799536575477,
  'discharge_disposition_id_Still patient/referred to this institution'),
 (0.012095674943428605, 'diag_2_Neoplasms'),
 (0.011103871875848827, 'diag_3_Neoplasms'),
 (0.010302500664576027, 'diag_2_Diabetes'),
 (0.007182684718835615, 'diag_1_Respiratory'),
 (0.007082551679241033, 'diag_1_Diabetes'),
 (0.006951132874845997, 'diag_1_Musculoskeletal System and Connective Tissue'),
 (0.006440853440397732, 'diag_1_Circulatory'),
 (0.00626544984663948,
  'discharge_disposition_id_Discharged to home with home

In [None]:
# Tabulate the fitted model's importances next to the training column names
# (same order as X_train.columns) so they can be filtered by threshold later.
importance_table = {
    'Features' : X_train.columns,
    'Importances' : model.feature_importances_,
}
fi_df = pd.DataFrame(importance_table)
fi_df

Unnamed: 0,Features,Importances
0,gender,0.000668
1,age,0.036753
2,time_in_hospital,0.024793
3,num_lab_procedures,0.017813
4,num_procedures,0.004263
5,num_medications,0.028346
6,number_diagnoses,0.032358
7,metformin,0.005385
8,repaglinide,0.004177
9,nateglinide,0.0


In [None]:
# Reference run: an importance threshold of 0 keeps the features unfiltered
# (97 columns in the saved output below).
keep_mask = fi_df['Importances'] >= 0
imp = fi_df.loc[keep_mask, 'Features']
print("Number of features:", len(imp))

# Refit on the selected columns and score on the held-out split.
model = GradientBoostingClassifier(random_state = 0).fit(X_train[imp], y_train)
X_test_sel = X_test[imp]
pred = model.predict(X_test_sel)
prob = model.predict_proba(X_test_sel)[:, 1]  # probability of class 1

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))

Number of features: 97
[[25775    10]
 [ 3317    19]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     25785
           1       0.66      0.01      0.01      3336

    accuracy                           0.89     29121
   macro avg       0.77      0.50      0.48     29121
weighted avg       0.86      0.89      0.83     29121

ROC_AUC: 0.6631890357405757


In [None]:
# Keep only features whose baseline importance is at least 0.01
# (13 columns in the saved output below); `imp` is reused by the later cells.
keep_mask = fi_df['Importances'] >= 0.01
imp = fi_df.loc[keep_mask, 'Features']
print("Number of features:", len(imp))

# Refit on the reduced feature set and score on the held-out split.
model = GradientBoostingClassifier(random_state = 0).fit(X_train[imp], y_train)
X_test_sel = X_test[imp]
pred = model.predict(X_test_sel)
prob = model.predict_proba(X_test_sel)[:, 1]  # probability of class 1

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))

Number of features: 13
[[25775    10]
 [ 3318    18]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     25785
           1       0.64      0.01      0.01      3336

    accuracy                           0.89     29121
   macro avg       0.76      0.50      0.48     29121
weighted avg       0.86      0.89      0.83     29121

ROC_AUC: 0.6562559144075083


### Sampling Techniques

#### Under Sampling

In [None]:
# Balance the training split by random undersampling (test split is left untouched).
rus = RandomUnderSampler(random_state = 0)
X_rus, y_rus = rus.fit_resample(X_train[imp], y_train)

# Train on the resampled data, evaluate on the original test distribution.
model = GradientBoostingClassifier(random_state = 0).fit(X_rus, y_rus)
X_test_sel = X_test[imp]
pred = model.predict(X_test_sel)
prob = model.predict_proba(X_test_sel)[:, 1]  # probability of class 1

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[15700 10085]
 [ 1277  2059]]
              precision    recall  f1-score   support

           0       0.92      0.61      0.73     25785
           1       0.17      0.62      0.27      3336

    accuracy                           0.61     29121
   macro avg       0.55      0.61      0.50     29121
weighted avg       0.84      0.61      0.68     29121

ROC_AUC: 0.6539001085344638


#### Over Sampling

In [None]:
# Balance the training split by random oversampling (test split is left untouched).
ros = RandomOverSampler(random_state = 0)
X_ros, y_ros = ros.fit_resample(X_train[imp], y_train)

# Train on the resampled data, evaluate on the original test distribution.
model = GradientBoostingClassifier(random_state = 0).fit(X_ros, y_ros)
X_test_sel = X_test[imp]
pred = model.predict(X_test_sel)
prob = model.predict_proba(X_test_sel)[:, 1]  # probability of class 1

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[15816  9969]
 [ 1303  2033]]
              precision    recall  f1-score   support

           0       0.92      0.61      0.74     25785
           1       0.17      0.61      0.27      3336

    accuracy                           0.61     29121
   macro avg       0.55      0.61      0.50     29121
weighted avg       0.84      0.61      0.68     29121

ROC_AUC: 0.6547157038766892


#### SMOTE

In [None]:
# Balance the training split with SMOTE (synthetic minority samples).
# `n_jobs` is intentionally not passed: it has been deprecated on SMOTE since
# imbalanced-learn 0.10 and is rejected by newer releases. It only affected the
# speed of the nearest-neighbour search, not the resampled data.
# NOTE(review): the selected features appear to include one-hot dummy columns;
# plain SMOTE interpolates them to fractional values. SMOTENC (already imported)
# may be the better fit - confirm against the feature list.
sm = SMOTE(random_state = 0)
X_sm, y_sm = sm.fit_resample(X_train[imp], y_train)

# Train on the resampled data, evaluate on the original test distribution.
model = GradientBoostingClassifier(random_state = 0)
model.fit(X_sm, y_sm)
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[13928 11857]
 [ 1754  1582]]
              precision    recall  f1-score   support

           0       0.89      0.54      0.67     25785
           1       0.12      0.47      0.19      3336

    accuracy                           0.53     29121
   macro avg       0.50      0.51      0.43     29121
weighted avg       0.80      0.53      0.62     29121

ROC_AUC: 0.5256856701956643


#### SMOTETomek

In [None]:
# Balance the training split with SMOTETomek (SMOTE oversampling combined with
# Tomek-link cleaning).
# Bug fix: the resampling must be done by `smtmk`. The previous code called
# `sm.fit_resample`, i.e. the plain SMOTE sampler from the cell above, so this
# cell silently repeated the SMOTE experiment. The saved output below was
# produced before the fix (it is identical to the SMOTE cell); re-run to refresh.
smtmk = SMOTETomek(random_state = 0)
X_smtmk, y_smtmk = smtmk.fit_resample(X_train[imp], y_train)

# Train on the resampled data, evaluate on the original test distribution.
model = GradientBoostingClassifier(random_state = 0)
model.fit(X_smtmk, y_smtmk)
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))



[[13928 11857]
 [ 1754  1582]]
              precision    recall  f1-score   support

           0       0.89      0.54      0.67     25785
           1       0.12      0.47      0.19      3336

    accuracy                           0.53     29121
   macro avg       0.50      0.51      0.43     29121
weighted avg       0.80      0.53      0.62     29121

ROC_AUC: 0.5256856701956643


### Hyper-Parameter Tuning

In [None]:
# Compare the sizes of the undersampled and oversampled training sets.
X_rus.shape, X_ros.shape

((15568, 13), (120330, 13))

In [None]:
# Preview the log-spaced grid (100 values from 1e-2 to 1e0) that the first
# randomized search uses as candidate learning rates.
np.logspace(-2, 0, 100)

array([0.01      , 0.01047616, 0.01097499, 0.01149757, 0.01204504,
       0.01261857, 0.01321941, 0.01384886, 0.01450829, 0.01519911,
       0.01592283, 0.01668101, 0.01747528, 0.01830738, 0.0191791 ,
       0.02009233, 0.02104904, 0.02205131, 0.0231013 , 0.02420128,
       0.02535364, 0.02656088, 0.02782559, 0.02915053, 0.03053856,
       0.03199267, 0.03351603, 0.03511192, 0.0367838 , 0.03853529,
       0.04037017, 0.04229243, 0.04430621, 0.04641589, 0.04862602,
       0.05094138, 0.05336699, 0.0559081 , 0.05857021, 0.06135907,
       0.06428073, 0.06734151, 0.07054802, 0.07390722, 0.07742637,
       0.08111308, 0.08497534, 0.08902151, 0.09326033, 0.097701  ,
       0.1023531 , 0.10722672, 0.1123324 , 0.1176812 , 0.12328467,
       0.12915497, 0.13530478, 0.14174742, 0.14849683, 0.15556761,
       0.16297508, 0.17073526, 0.17886495, 0.18738174, 0.19630407,
       0.20565123, 0.21544347, 0.22570197, 0.23644894, 0.24770764,
       0.25950242, 0.27185882, 0.28480359, 0.29836472, 0.31257

In [None]:
# Randomized hyper-parameter search on the undersampled training data,
# optimising recall.
model = GradientBoostingClassifier(random_state = 0)

# Candidate values; RandomizedSearchCV draws 10 random combinations from these.
params = dict(
    n_estimators = np.arange(100, 500),
    learning_rate = np.logspace(-2, 0, 100),
    min_samples_split = np.arange(2, 50),
    min_samples_leaf = np.arange(1, 50),
    max_depth = np.arange(3, 30),
    max_features = np.arange(1, 13),
)

rsearch = RandomizedSearchCV(
    estimator = model,
    param_distributions = params,
    n_iter = 10,
    scoring = 'recall',
    cv = 5,
    n_jobs = -1,
    verbose = 1,
    random_state = 0,
)
rsearch.fit(X_rus, y_rus)
rsearch.best_score_, rsearch.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.5min finished


(0.5874881330354829,
 {'learning_rate': 0.011497569953977356,
  'max_depth': 25,
  'max_features': 9,
  'min_samples_leaf': 37,
  'min_samples_split': 10,
  'n_estimators': 368})

In [None]:
# Randomized hyper-parameter search with random oversampling, optimising recall.
#
# Fix: the search used to be fitted on the already-oversampled X_ros / y_ros.
# Random oversampling duplicates minority rows, so copies of the same row ended
# up in both the training and validation part of each CV fold. That leakage
# inflated the CV recall (0.9988 in the saved output below, versus 0.10 on the
# held-out test split). The sampler is now a step of an imblearn pipeline and
# the search is fitted on the original training split, so oversampling is
# applied to the training part of every fold only and validation folds keep
# the natural class distribution. Re-run to refresh the saved output.
model = GradientBoostingClassifier(random_state = 0)
pipeline = ImbPipeline([('sampler', RandomOverSampler(random_state = 0)),
                        ('model', model)])

# Same search space as before; the 'model__' prefix routes each parameter to
# the classifier step of the pipeline.
params = {'model__n_estimators' : np.arange(300, 500),
          'model__learning_rate' : np.logspace(-3, -1, 100),
          'model__min_samples_split' : np.arange(5, 15),
          'model__min_samples_leaf' : np.arange(30, 60),
          'model__max_depth' : np.arange(20, 40),
          'model__max_features' : np.arange(1, 13),}

rsearch = RandomizedSearchCV(pipeline, params, n_iter = 10, scoring = 'recall', n_jobs = -1,
                             cv = 5, verbose = 1, random_state = 0)
rsearch.fit(X_train[imp], y_train)
rsearch.best_score_, rsearch.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 103.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 110.4min finished


(0.9988199119089172,
 {'learning_rate': 0.05994842503189412,
  'max_depth': 34,
  'max_features': 11,
  'min_samples_leaf': 40,
  'min_samples_split': 11,
  'n_estimators': 499})

In [None]:
# Evaluate the tuned model on the held-out test split.
# RandomizedSearchCV (refit=True by default) has already refit best_estimator_
# on the full data passed to rsearch.fit, so it is used as-is. The previous
# extra model.fit(...) call retrained the same configuration from scratch,
# adding a long retrain for no change in the resulting model.
model = rsearch.best_estimator_
pred = model.predict(X_test[imp])
prob = model.predict_proba(X_test[imp])[:, 1]  # probability of class 1
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC_AUC:", roc_auc_score(y_test, prob))

[[24168  1617]
 [ 2995   341]]
              precision    recall  f1-score   support

           0       0.89      0.94      0.91     25785
           1       0.17      0.10      0.13      3336

    accuracy                           0.84     29121
   macro avg       0.53      0.52      0.52     29121
weighted avg       0.81      0.84      0.82     29121

ROC_AUC: 0.5750269185466054
