# Python - Week 6 - Day 2 - HW
## Shahin Mammadov

You will use the Pima Indian diabetes dataset. The dataset poses a classification
problem: predict whether a person will suffer from diabetes, given the 8 features
in the dataset. You are asked to use this data to build an XGBoost classifier model
using scikit-learn, and to tune its parameters using Grid Search and Randomized Search
strategies.

In [1]:
from datetime import datetime
import time

def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Called without a (truthy) ``start_time``, returns the current
    ``datetime`` so the caller can keep it as the starting point.
    Called with a previously returned ``start_time``, prints the elapsed
    wall-clock time as hours, minutes and seconds and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        # Peel off whole hours first, then whole minutes; the rest is seconds.
        thour, remainder = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        return None
    return datetime.now()

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the diabetes dataset; the CSV is expected in the current working directory.
data = pd.read_csv('diabetes.csv')

In [4]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Count missing values per column.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
# Show rows that exactly duplicate an earlier row (an empty frame means there are none).
data[data.duplicated()]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [7]:
# Gather every column except the last one (the `Outcome` target) as a list of Series.
x_list = [data[column] for column in data.columns[:-1]]

In [8]:
# Feature matrix: every column except the target. Selecting straight from the frame
# keeps the feature names and the original integer/float dtypes, which the previous
# np.c_ + transpose round trip discarded (columns became 0..7, everything float64).
X = data.drop(columns=['Outcome'])
# Target vector: the `Outcome` column.
y = data['Outcome']

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed random_state makes the split, and
# every accuracy figure computed from it, reproducible across kernel restarts
# (random_state=None produced a different split on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same transform to the
# test rows so no test-set statistics leak into training.
st_x = StandardScaler()
X_train = st_x.fit_transform(X_train)
X_test = st_x.transform(X_test)

## XGBoost without parameter tuning

In [10]:
from xgboost import XGBClassifier
from sklearn import metrics
from xgboost import cv
import xgboost as xgb

In [11]:
# Baseline: an XGBoost classifier with default settings, fitted on the training split.
XGB = XGBClassifier().fit(X_train, y_train)
# Predicted class labels for the hold-out rows.
y_pred_xgb = XGB.predict(X_test)

In [12]:
# Hold-out accuracy of the untuned baseline model.
metrics.accuracy_score(y_test, y_pred_xgb)

0.7532467532467533

## XGBoost with hyper parameter tuning with Grid Search

In [13]:
from sklearn.model_selection import GridSearchCV

In [27]:
# Candidate values for each tuned hyper-parameter (5 * 3 * 5 * 3 * 3 * 3 = 2025
# combinations). Key order matters: code that enumerates the grid positionally
# sees the values in exactly this order.
# NOTE(review): learning_rate=1.5 appears to lie outside the [0, 1] range the XGBoost
# docs give for eta -- confirm it is intentional.
params = dict(
    learning_rate=[0.1, 0.2, 0.4, 0.8, 1.5],
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [15]:
# Exhaustive grid search over `params`, scored by accuracy with 10-fold cross-validation
# on all available cores; training-fold scores are kept as well.
# NOTE(review): the search is fitted on the full X / y rather than on X_train / y_train,
# so best_score_ is a CV estimate over all rows and is not directly comparable with the
# hold-out accuracy computed from X_test.
xgb_grid = GridSearchCV(
    estimator=XGB,
    param_grid=params,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    return_train_score=True,
)

start_time = timer()  # start the stopwatch
xgb_grid_fit = xgb_grid.fit(X, y)
timer(start_time)  # print the elapsed time



 Time taken: 0 hours 7 minutes and 3.63 seconds.


In [16]:
# Tabulate the search results: one row per parameter combination, with fit/score
# timings, the parameter values and the per-split train/test scores.
pd.DataFrame.from_dict(xgb_grid_fit.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_min_child_weight,param_subsample,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.305996,0.117018,0.009226,0.004222,0.6,0.5,0.1,3,1,0.6,...,0.890014,0.890014,0.885673,0.875543,0.887120,0.872648,0.880058,0.881503,0.881366,0.006204
1,0.080330,0.005295,0.004250,0.000639,0.6,0.5,0.1,3,1,0.8,...,0.888567,0.884226,0.881331,0.874096,0.876990,0.882779,0.894509,0.881503,0.883535,0.005487
2,0.073071,0.005594,0.005536,0.001544,0.6,0.5,0.1,3,1,1.0,...,0.872648,0.882779,0.887120,0.874096,0.881331,0.879884,0.885838,0.874277,0.879919,0.004927
3,0.080920,0.013248,0.006401,0.003769,0.6,0.5,0.1,3,5,0.6,...,0.856729,0.843705,0.848046,0.849493,0.850941,0.843705,0.861272,0.845376,0.849391,0.005459
4,0.099846,0.021978,0.005684,0.002849,0.6,0.5,0.1,3,5,0.8,...,0.853835,0.863965,0.868307,0.853835,0.855282,0.865412,0.862717,0.868497,0.862702,0.005787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020,0.170993,0.025412,0.005947,0.001184,1.0,5,1.5,5,5,0.8,...,0.907381,0.927641,0.914616,0.924747,0.914616,0.904486,0.930636,0.906069,0.916811,0.008664
2021,0.169707,0.020864,0.005402,0.000863,1.0,5,1.5,5,5,1.0,...,0.837916,0.861071,0.858177,0.833575,0.848046,0.835022,0.852601,0.874277,0.851704,0.012403
2022,0.124391,0.013371,0.007442,0.004779,1.0,5,1.5,5,10,0.6,...,0.840810,0.830680,0.827786,0.817656,0.824891,0.833575,0.822254,0.825145,0.828705,0.007029
2023,0.146517,0.020625,0.006520,0.004588,1.0,5,1.5,5,10,0.8,...,0.882779,0.856729,0.876990,0.890014,0.874096,0.863965,0.880058,0.861272,0.871239,0.010511


In [17]:
# Parameter combination selected by the grid search.
xgb_grid.best_params_

{'colsample_bytree': 0.6,
 'gamma': 5,
 'learning_rate': 0.2,
 'max_depth': 4,
 'min_child_weight': 1,
 'subsample': 0.6}

In [18]:
# Mean cross-validated accuracy of the selected combination.
xgb_grid.best_score_

0.7773581681476418

## XGBoost with hyper parameter tuning with Randomized Search

In [19]:
# A grid with nothing to vary: used to cross-validate the estimator exactly as configured.
params_empty = dict()

In [20]:
# Same 10-fold accuracy-scored search set-up as the tuned run, but with an empty grid,
# which gives a cross-validated score for the estimator without any tuning.
xgb_grid_test = GridSearchCV(
    estimator=XGB,
    param_grid=params_empty,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    return_train_score=True,
)

start_time = timer()  # start the stopwatch
xgb_grid_fit_test = xgb_grid_test.fit(X, y)
timer(start_time)  # print the elapsed time


 Time taken: 0 hours 0 minutes and 0.71 seconds.


In [21]:
# Mean 10-fold cross-validated accuracy from the empty-grid (untuned) search.
xgb_grid_test.best_score_

0.7357142857142857

In [22]:
import itertools as it

In [28]:
# Cartesian product of all candidate lists in `params`. Each resulting tuple holds one
# value per parameter, in the dict's key order.
allNames = params
combinations = it.product(*params.values())
comb_params = list(combinations)

In [29]:
# Manual sweep: fit one model per combination on the training split and record its
# accuracy on the hold-out split, so scores[i] belongs to comb_params[i].
# The keyword arguments are built from the keys of `params` (the same key order that
# produced the tuples) instead of hard-coded positions x[0]..x[5], so reordering or
# extending the grid can no longer pass a value to the wrong parameter.
# NOTE(review): this visits every combination (it is not a randomized search) and picks
# the winner on the test split, so max(scores) is an optimistic estimate.
param_names = list(params)
scores = []
start_time = timer(None) # timing starts from this point for "start_time" variable
for combo in comb_params:
    XGB = XGBClassifier(**dict(zip(param_names, combo)))
    XGB.fit(X_train, y_train)
    y_pred_xgb = XGB.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred_xgb))
timer(start_time) # timing ends here for "start_time" variable


 Time taken: 0 hours 1 minutes and 53.58 seconds.


In [30]:
# Highest hold-out accuracy reached by any combination in the manual sweep.
max(scores)

0.8008658008658008

In [31]:
# Position of the first combination that reached that highest accuracy.
scores.index(max(scores))

80

In [32]:
# Look the winner up from the scores themselves rather than a hard-coded index copied
# from an earlier run's output, which goes stale as soon as the scores change.
# Tuple order: (learning_rate, min_child_weight, gamma, subsample, colsample_bytree, max_depth).
comb_params[scores.index(max(scores))]

(0.1, 1, 1.5, 1.0, 1.0, 5)

In [33]:
# Show the GridSearchCV selection again, for comparison with the manual-sweep winner.
xgb_grid.best_params_

{'colsample_bytree': 0.6,
 'gamma': 5,
 'learning_rate': 0.2,
 'max_depth': 4,
 'min_child_weight': 1,
 'subsample': 0.6}