In [5]:
import warnings 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV, validation_curve

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [6]:
pd.set_option('display.max_columns',None)
warnings.simplefilter(action = 'ignore', category= Warning)

In [9]:
df = pd.read_csv('diabetes.csv')

In [10]:
y = df['Outcome']
X = df.drop(['Outcome'], axis = 1)

In [11]:
rf_model = RandomForestClassifier(random_state=17)
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 17,
 'verbose': 0,
 'warm_start': False}

In [13]:
cv_results = cross_validate(rf_model, X, y, cv = 10, scoring=['accuracy', 'f1','roc_auc'])
cv_results['test_roc_auc'].mean()

0.8233960113960114

In [12]:
rf_params = {'max_depth' : [5, 8, None],
             'max_features' : [3, 5, 7, 'auto'],
             'min_samples_split' : [2, 5, 8, 15, 20],
             'n_estimators': [100, 200, 500]}

In [15]:
rf_best_grid = GridSearchCV(rf_model, rf_params, cv=5, n_jobs=-1, verbose=True).fit(X,y)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


In [16]:
rf_best_grid.best_params_

{'max_depth': None,
 'max_features': 5,
 'min_samples_split': 8,
 'n_estimators': 500}

In [20]:
rf_final = rf_model.set_params(**rf_best_grid.best_params_, random_state=17).fit(X,y)

In [21]:
cv_results = cross_validate(rf_final, X, y, cv=10, scoring=['accuracy','f1','roc_auc'])

In [22]:
print(cv_results['test_accuracy'].mean())
print(cv_results['test_f1'].mean())
print(cv_results['test_roc_auc'].mean())

0.766848940533151
0.6447777811143756
0.8271054131054132


In [23]:
gbm_model = GradientBoostingClassifier(random_state=17)


In [24]:
gbm_model.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 17,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [25]:
cv_results = cross_validate(gbm_model, X, y, cv=5, scoring = ['accuracy','f1','roc_auc'])

In [26]:
print(cv_results['test_accuracy'].mean())
print(cv_results['test_f1'].mean())
print(cv_results['test_roc_auc'].mean())

0.7591715474068416
0.634235802826363
0.8257494758909854


In [27]:
gbm_params = {'learning_rate' : [0.01, 0.1],
              'max_depth': [3, 8, 10],
              'n_estimators' : [100, 500, 1000], 
              'subsample' : [1, 0.5, 0.7]}

In [None]:
gbm_best_grid = GridSearchCV(gbm_model, gbm_params, cv = 5, random_state =17).fit(X,y)

In [28]:
xgboost_model = XGBClassifier(random_state =17)
cv_results = cross_validate(xgboost_model, X, y, cv=5, scoring = ['accuracy','f1','roc_auc'])
print(cv_results['test_accuracy'].mean())
print(cv_results['test_roc_auc'].mean())
print(cv_results['test_f1'].mean())

0.7526525761819879
0.7987134870719776
0.6317893713482235


In [31]:
xgboost_params = {'learning_rate' : [0.1, 0.01],
           'max_depth' : [5, 8, None],
           'n_estimators' : [100, 500, 1000],
           'colsample_bytree' : [None, 0.7, 1]}

In [32]:
xgboost_best_grid = GridSearchCV(xgboost_model, xgboost_params, cv =5, n_jobs = -1, verbose = True).fit(X,y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [33]:
xgboost_final = xgboost_model.set_params(**xgboost_best_grid.best_params_ , random_state = 17).fit(X,y)

In [34]:
cv_results = cross_validate(xgboost_final, X, y, cv = 5, scoring= ['accuracy','f1','roc_auc'])

In [35]:
lgm_model = LGBMClassifier(random_state=17)

In [36]:
lgm_model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': 17,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [37]:
cv_results = cross_validate(lgm_model, X, y, cv= 5, scoring=['accuracy','f1','roc_auc'])
print(cv_results['test_accuracy'].mean())
print(cv_results['test_roc_auc'].mean())
print(cv_results['test_f1'].mean())

[LightGBM] [Info] Number of positive: 214, number of negative: 400
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348534 -> initscore=-0.625489
[LightGBM] [Info] Start training from score -0.625489
[LightGBM] [Info] Number of positive: 214, number of negative: 400
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 662
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348534 -> initscore=-0.625489
[LightGBM] [Info] Start training from score -0.625489
[LightGBM] [Info] Number of positive: 214, number of negative: 400
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 667
[LightGBM] [In

In [38]:
lgm_params = {'learning_rate' : [0.01, 0.1],
               'n_estimators' : [100, 300, 500, 1000],
               'colsample_bytree' : [0.5, 0.7, 1]}

In [39]:
lgm_best_grid = GridSearchCV(lgm_model, lgm_params, cv = 5, n_jobs= -1, verbose=True).fit(X,y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[LightGBM] [Info] Number of positive: 268, number of negative: 500
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 768, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348958 -> initscore=-0.623621
[LightGBM] [Info] Start training from score -0.623621


In [40]:
lgm_final = lgm_model.set_params(**lgm_best_grid.best_params_, random_state = 17).fit(X,y)

[LightGBM] [Info] Number of positive: 268, number of negative: 500
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 768, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348958 -> initscore=-0.623621
[LightGBM] [Info] Start training from score -0.623621


In [41]:
cv_results =cross_validate(lgm_final, X, y, cv = 5, scoring=['accuracy','f1','roc_auc'])
print(cv_results['test_accuracy'].mean())
print(cv_results['test_roc_auc'].mean())
print(cv_results['test_f1'].mean())

[LightGBM] [Info] Number of positive: 214, number of negative: 400
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348534 -> initscore=-0.625489
[LightGBM] [Info] Start training from score -0.625489
[LightGBM] [Info] Number of positive: 214, number of negative: 400
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 662
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348534 -> initscore=-0.625489
[LightGBM] [Info] Start training from score -0.625489
[LightGBM] [Info] Number of positive: 214, number of negative: 400
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 614, number of used 

In [42]:
catboost_model = CatBoostClassifier(random_state = 17)

cv_results = cross_validate(catboost_model, X, y, cv = 5, scoring=['accuracy','f1','roc_auc'])

print(cv_results['test_accuracy'].mean())
print(cv_results['test_roc_auc'].mean())
print(cv_results['test_roc_auc'].mean())

Learning rate set to 0.008365
0:	learn: 0.6886231	total: 160ms	remaining: 2m 40s
1:	learn: 0.6842253	total: 163ms	remaining: 1m 21s
2:	learn: 0.6800051	total: 164ms	remaining: 54.7s
3:	learn: 0.6759062	total: 166ms	remaining: 41.4s
4:	learn: 0.6719895	total: 168ms	remaining: 33.4s
5:	learn: 0.6675294	total: 169ms	remaining: 28s
6:	learn: 0.6637191	total: 171ms	remaining: 24.2s
7:	learn: 0.6605811	total: 172ms	remaining: 21.3s
8:	learn: 0.6571010	total: 173ms	remaining: 19.1s
9:	learn: 0.6540252	total: 175ms	remaining: 17.3s
10:	learn: 0.6499686	total: 176ms	remaining: 15.8s
11:	learn: 0.6461533	total: 177ms	remaining: 14.6s
12:	learn: 0.6428399	total: 178ms	remaining: 13.5s
13:	learn: 0.6394652	total: 180ms	remaining: 12.6s
14:	learn: 0.6356107	total: 181ms	remaining: 11.9s
15:	learn: 0.6326628	total: 182ms	remaining: 11.2s
16:	learn: 0.6294287	total: 183ms	remaining: 10.6s
17:	learn: 0.6261973	total: 185ms	remaining: 10.1s
18:	learn: 0.6229005	total: 186ms	remaining: 9.59s
19:	learn: 

In [44]:
catboost_params = {'iterations' : [200, 500],
                   'learning_rate' : [0.01, 0.1],
                   'depth' : [3,6]}

In [45]:
catboost_best_grid = GridSearchCV(catboost_model, catboost_params, cv = 5, 
                                  n_jobs= -1, verbose= True).fit(X,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
0:	learn: 0.6893609	total: 1.04ms	remaining: 518ms
1:	learn: 0.6850996	total: 2.01ms	remaining: 501ms
2:	learn: 0.6813698	total: 2.9ms	remaining: 481ms
3:	learn: 0.6786691	total: 3.92ms	remaining: 487ms
4:	learn: 0.6749818	total: 4.89ms	remaining: 484ms
5:	learn: 0.6710180	total: 5.87ms	remaining: 484ms
6:	learn: 0.6671597	total: 6.85ms	remaining: 483ms
7:	learn: 0.6643789	total: 7.7ms	remaining: 473ms
8:	learn: 0.6617727	total: 8.6ms	remaining: 469ms
9:	learn: 0.6591806	total: 9.48ms	remaining: 465ms
10:	learn: 0.6555457	total: 10.4ms	remaining: 462ms
11:	learn: 0.6518575	total: 11.4ms	remaining: 463ms
12:	learn: 0.6481689	total: 12.4ms	remaining: 465ms
13:	learn: 0.6453643	total: 13.4ms	remaining: 465ms
14:	learn: 0.6416217	total: 14.4ms	remaining: 467ms
15:	learn: 0.6383526	total: 15.4ms	remaining: 466ms
16:	learn: 0.6354100	total: 16.4ms	remaining: 465ms
17:	learn: 0.6329683	total: 17.2ms	remaining: 461ms
18:	learn: 0.6308

In [46]:
catboost_final = catboost_model.set_params(**catboost_best_grid.best_params_, random_state = 17).fit(X,y)

0:	learn: 0.6893609	total: 1.39ms	remaining: 691ms
1:	learn: 0.6850996	total: 2.9ms	remaining: 722ms
2:	learn: 0.6813698	total: 3.88ms	remaining: 642ms
3:	learn: 0.6786691	total: 5.04ms	remaining: 625ms
4:	learn: 0.6749818	total: 6.03ms	remaining: 597ms
5:	learn: 0.6710180	total: 7.29ms	remaining: 600ms
6:	learn: 0.6671597	total: 8.37ms	remaining: 590ms
7:	learn: 0.6643789	total: 9.5ms	remaining: 584ms
8:	learn: 0.6617727	total: 10.7ms	remaining: 583ms
9:	learn: 0.6591806	total: 11.7ms	remaining: 575ms
10:	learn: 0.6555457	total: 12.7ms	remaining: 563ms
11:	learn: 0.6518575	total: 13.6ms	remaining: 553ms
12:	learn: 0.6481689	total: 14.5ms	remaining: 544ms
13:	learn: 0.6453643	total: 15.5ms	remaining: 537ms
14:	learn: 0.6416217	total: 16.4ms	remaining: 531ms
15:	learn: 0.6383526	total: 17.4ms	remaining: 527ms
16:	learn: 0.6354100	total: 18.4ms	remaining: 522ms
17:	learn: 0.6329683	total: 19.6ms	remaining: 524ms
18:	learn: 0.6308805	total: 20.6ms	remaining: 521ms
19:	learn: 0.6284278	tot

In [49]:
cv_results = cross_validate(catboost_final, X, y, cv = 5, scoring=['accuracy', 'f1','roc_auc'])
print(cv_results['test_accuracy'].mean())
print(cv_results['test_roc_auc'].mean())
print(cv_results['test_roc_auc'].mean())

0:	learn: 0.6897881	total: 1.13ms	remaining: 562ms
1:	learn: 0.6861516	total: 2.35ms	remaining: 585ms
2:	learn: 0.6830288	total: 4.04ms	remaining: 669ms
3:	learn: 0.6798794	total: 5.55ms	remaining: 688ms
4:	learn: 0.6763495	total: 6.7ms	remaining: 664ms
5:	learn: 0.6732158	total: 7.69ms	remaining: 633ms
6:	learn: 0.6689040	total: 8.68ms	remaining: 611ms
7:	learn: 0.6657336	total: 9.61ms	remaining: 591ms
8:	learn: 0.6618963	total: 10.5ms	remaining: 573ms
9:	learn: 0.6582027	total: 11.7ms	remaining: 574ms
10:	learn: 0.6542115	total: 12.7ms	remaining: 563ms
11:	learn: 0.6503329	total: 13.9ms	remaining: 566ms
12:	learn: 0.6461895	total: 14.9ms	remaining: 559ms
13:	learn: 0.6428322	total: 15.9ms	remaining: 554ms
14:	learn: 0.6400593	total: 17ms	remaining: 549ms
15:	learn: 0.6370495	total: 17.9ms	remaining: 542ms
16:	learn: 0.6341663	total: 18.9ms	remaining: 536ms
17:	learn: 0.6315598	total: 19.8ms	remaining: 531ms
18:	learn: 0.6286342	total: 20.8ms	remaining: 526ms
19:	learn: 0.6262759	tota

In [51]:
rf_model = RandomForestClassifier(random_state=17)