In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('new_dataframe.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,-0.23958,0.528651,-0.441381,0.894329,-1.024037,0.179171,-1.731629,1.258507,1
1,-1.676951,-1.185363,0.710054,0.641256,0.944704,0.789176,-0.781667,-0.955768,0
2,-1.676951,0.750838,-3.344118,-1.32006,-1.024037,1.584834,0.775403,-0.03092,1
3,-1.676951,0.433428,-0.095951,1.337207,1.22525,1.332875,-0.079025,-0.823902,1
4,-0.770071,0.560392,-0.441381,1.273938,1.490019,1.1207,0.522458,-1.237634,0


In [4]:
x = df.drop('Outcome', axis=1).copy()
y = df.Outcome.copy()

## <font color='red'>Model Selection</font>

### <font color='green'>Logistic Regression</font>

In [5]:
from sklearn.linear_model import LogisticRegression
classifier_lr = LogisticRegression()
score = cross_val_score(classifier_lr, x, y, cv=10)
score.mean()

0.7541853900218918

### <font color='green'>Decision Tree</font>

In [6]:
from sklearn.tree import DecisionTreeClassifier
classifier_dt = DecisionTreeClassifier()
score = cross_val_score(classifier_dt, x, y, cv=10)
score.mean()

0.9790888927295771

### <font color='green'>Random Forest</font>

In [7]:
from sklearn.ensemble import RandomForestClassifier
classifer_rf = RandomForestClassifier()
score = cross_val_score(classifer_rf, x, y, cv=10)
score.mean()

0.9889733840304183

### <font color='green'>Gradient Boosting</font>

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
classifier_gb = GradientBoostingClassifier()
score = cross_val_score(classifier_gb, x, y, cv=10)
score.mean()

0.8818527480124437

### <font color='green'>AdaBoost</font>

In [9]:
from sklearn.ensemble import AdaBoostClassifier
classifer_ab = AdaBoostClassifier()
score = cross_val_score(classifer_ab, x, y, cv=10)
score.mean()

0.8248689365134233

### <font color='green'>XGBoost</font>

In [10]:
from xgboost import XGBClassifier
classifier_xb = XGBClassifier()
score = cross_val_score(classifier_xb, x, y, cv=10)
score.mean()

0.9897338403041825

### <font color='green'>SVM</font>

In [11]:
from sklearn.svm import SVC
classifier_svm = SVC()
score = cross_val_score(classifier_svm, x, y, cv=10)
score.mean()

0.8469106463878328

It is seen that xgboost is performing best

### <font color='green'>Check If our XGBoost model is overfitting</font>

In [12]:
# Split the data into train & test data

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [13]:
classifier_xb.fit(x_train, y_train)
train_score = classifier_xb.score(x_train, y_train)
print(train_score)
test_score = classifier_xb.score(x_test, y_test)
print(test_score)

1.0
0.9905123339658444


XGBoost is working fine in testing also.

## <font color='red'>Hyperparameter Tuning</font>

In [14]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [15]:
params = {
    'n_estimators'     : [100, 200, 300, 500, 700, 1000],
    'learning_rate'    : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth'        : [3, 4, 5, 6, 8, 10, 12, 15],
    'min_child_weight' : [1, 3, 5, 7, 9, 11, 13, 15],
    'gamma'            : [0.1, 0.2, 0.4],
    'colsample_byte'   : [0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.3]
}

In [16]:
rsv = RandomizedSearchCV(estimator=classifier_xb, param_distributions=params, n_iter=100, cv=5, 
                  n_jobs=-1, scoring='accuracy', verbose=3)
# scoring: ['accuracy', 'precision']

In [17]:
rsv.fit(x, y)

Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=0,
                                           num_pa...
                                           scale_pos_weight=1, subsample=1,
                                           tree_method='exac

In [19]:
rsv.best_params_

{'n_estimators': 200,
 'min_child_weight': 3,
 'max_depth': 15,
 'learning_rate': 0.25,
 'gamma': 0.1,
 'colsample_byte': 0.9}

In [20]:
rsv.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_byte=0.9, colsample_bytree=1,
              gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.25, max_delta_step=0,
              max_depth=15, min_child_weight=3, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [21]:
# Lets check for scoring
score = cross_val_score(rsv.best_estimator_, x, y, cv=10)
score.mean()

Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bin

0.9893536121673003

In [24]:
params_g = {
    'n_estimators': [rsv.best_params_['n_estimators']-100,
                    rsv.best_params_['n_estimators'],
                    rsv.best_params_['n_estimators']+100
                    ],
     'min_child_weight': [rsv.best_params_['min_child_weight'],
                         rsv.best_params_['min_child_weight'],
                         rsv.best_params_['min_child_weight']
                         ],
     'max_depth': [rsv.best_params_['max_depth'],
                  rsv.best_params_['max_depth'],
                  rsv.best_params_['max_depth']
                  ],
     'learning_rate': [rsv.best_params_['learning_rate']-0.1,
                      rsv.best_params_['learning_rate'],
                      rsv.best_params_['learning_rate']+0.1
                      ],
     'gamma': [rsv.best_params_['gamma'],
              rsv.best_params_['gamma']],
     'colsample_byte': [rsv.best_params_['colsample_byte'],
                        rsv.best_params_['colsample_byte']+0.1
                       ]
}

In [25]:
# Let's try it with GridSearchCV

gsv = GridSearchCV(estimator=classifier_xb, param_grid=params_g, scoring='accuracy', n_jobs=-1
                   , verbose=3)

In [27]:
gsv.fit(x, y)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed: 13.5min finished


Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact', validate_parameters=1,
                            

In [28]:
model2 = gsv.best_estimator_

In [29]:
gsv.best_params_

{'colsample_byte': 0.3,
 'gamma': 0.1,
 'learning_rate': 0.3,
 'max_depth': 8,
 'min_child_weight': 7,
 'n_estimators': 600}

In [30]:
# Lets check for scoring
score = cross_val_score(model2, x, y, cv=10)
score.mean()

Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bin

0.9889733840304183

***Both Are giving same result, So we'll consider RandomizedSearchCV***

## <font color='red'>Train & Test the model</font>

In [22]:
rsv.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_byte=0.9, colsample_bytree=1,
              gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.25, max_delta_step=0,
              max_depth=15, min_child_weight=3, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [25]:
xgb_classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_byte=0.9, colsample_bytree=1,
              gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.25, max_delta_step=0,
              max_depth=15, min_child_weight=3,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1)

In [26]:
xgb_classifier.fit(x_train, y_train)

Parameters: { colsample_byte } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_byte=0.9, colsample_bytree=1,
              gamma=0.1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.25, max_delta_step=0,
              max_depth=15, min_child_weight=3, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [27]:
y_pred = xgb_classifier.predict(x_test)

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [29]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[255   4]
 [  3 265]]
0.9867172675521821
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       259
           1       0.99      0.99      0.99       268

    accuracy                           0.99       527
   macro avg       0.99      0.99      0.99       527
weighted avg       0.99      0.99      0.99       527



## <font color='red'>Dump the model</font>

In [31]:
# Let's import pickle to store our model

import pickle

In [32]:
# Let's dump our model In order to use it in future

f_in = open('model.pkl', 'wb')
pickle.dump(xgb_classifier, f_in)

In [33]:
type(xgb_classifier)

xgboost.sklearn.XGBClassifier