In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [64]:
data = pd.read_csv('pima-indians-diabetes.csv')
data.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure (mm Hg),Triceps skinfold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age,Is Diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [65]:
data.columns

Index(['Number of times pregnant', 'Plasma glucose concentration',
       'Diastolic blood pressure (mm Hg)', 'Triceps skinfold thickness (mm)',
       '2-Hour serum insulin (mu U/ml)',
       'Body mass index (weight in kg/(height in m)^2)',
       'Diabetes pedigree function', 'Age', 'Is Diabetic'],
      dtype='object')

In [66]:
cols = ['Plasma glucose concentration',
       'Diastolic blood pressure (mm Hg)', 'Triceps skinfold thickness (mm)',
       '2-Hour serum insulin (mu U/ml)',
       'Body mass index (weight in kg/(height in m)^2)',
       'Diabetes pedigree function', 'Age']

In [67]:
for col in cols:
    data[col] = data[col].replace(0,np.nan)

In [68]:
data.isna().sum()

Number of times pregnant                            0
Plasma glucose concentration                        5
Diastolic blood pressure (mm Hg)                   35
Triceps skinfold thickness (mm)                   227
2-Hour serum insulin (mu U/ml)                    374
Body mass index (weight in kg/(height in m)^2)     11
Diabetes pedigree function                          0
Age                                                 0
Is Diabetic                                         0
dtype: int64

In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Number of times pregnant                        768 non-null    int64  
 1   Plasma glucose concentration                    763 non-null    float64
 2   Diastolic blood pressure (mm Hg)                733 non-null    float64
 3   Triceps skinfold thickness (mm)                 541 non-null    float64
 4   2-Hour serum insulin (mu U/ml)                  394 non-null    float64
 5   Body mass index (weight in kg/(height in m)^2)  757 non-null    float64
 6   Diabetes pedigree function                      768 non-null    float64
 7   Age                                             768 non-null    int64  
 8   Is Diabetic                                     768 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 54.1 K

In [70]:
data.shape

(768, 9)

In [71]:
data.describe()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure (mm Hg),Triceps skinfold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age,Is Diabetic
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [72]:
# imputing the missing values
data['Plasma glucose concentration']=data['Plasma glucose concentration'].fillna(data['Plasma glucose concentration'].mode()[0])
data['Diastolic blood pressure (mm Hg)']=data['Diastolic blood pressure (mm Hg)'].fillna(data['Diastolic blood pressure (mm Hg)'].mode()[0])
data['Triceps skinfold thickness (mm)']=data['Triceps skinfold thickness (mm)'].fillna(data['Triceps skinfold thickness (mm)'].mean())
data['2-Hour serum insulin (mu U/ml)']=data['2-Hour serum insulin (mu U/ml)'].fillna(data['2-Hour serum insulin (mu U/ml)'].mean())
data['Body mass index (weight in kg/(height in m)^2)']=data['Body mass index (weight in kg/(height in m)^2)'].fillna(data['Body mass index (weight in kg/(height in m)^2)'].mean())

In [73]:
data.describe()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure (mm Hg),Triceps skinfold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age,Is Diabetic
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.539062,72.295573,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.49066,12.106756,8.790942,85.021108,6.875151,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.15342,155.548223,32.4,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [74]:
data.isna().sum()
# clearly now there are no null values

Number of times pregnant                          0
Plasma glucose concentration                      0
Diastolic blood pressure (mm Hg)                  0
Triceps skinfold thickness (mm)                   0
2-Hour serum insulin (mu U/ml)                    0
Body mass index (weight in kg/(height in m)^2)    0
Diabetes pedigree function                        0
Age                                               0
Is Diabetic                                       0
dtype: int64

In [75]:
# separting the features and the target variable
X = data.drop(columns='Is Diabetic',axis=1)
y = data['Is Diabetic']

In [76]:
# we can see that the values are lot bigger and it's better to standarize so we will scale them
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
scaled_data = ss.fit_transform(X)

In [77]:
type(X)

pandas.core.frame.DataFrame

In [78]:
x_train,x_test,y_train,y_test = train_test_split(scaled_data,y,test_size=0.3,random_state=42)

In [79]:
model = XGBClassifier(objective='binary:logistic')
model.fit(x_train,y_train)

In [82]:
y_pred = model.predict(x_train)
preiction_y = [round(value) for value in y_pred]
accuracy = accuracy_score(preiction_y,y_train)
accuracy
# the accuracy here is the acuracy for training dataset

1.0

In [83]:
y_pred = model.predict(x_test)
preiction_y = [round(value) for value in y_pred]
accuracy = accuracy_score(preiction_y,y_test)
accuracy
# the accuracy here is the acuracy for testing dataset

0.7142857142857143

In [84]:
x_test[0]

array([ 0.63994726, -0.77251205, -1.18156252,  0.43784695,  0.40547846,
        0.22451019, -0.1264714 ,  0.83038113])

In [87]:
# now to increase the accuracy we will do hyperparamaeter tuning
grid_param = {
    'learning_rate':[1,0.5,0.1,0.01,0.001],
    'max_depth':[5,10,15,20],
    'n_estimators':[10,50,100,200]
}

In [90]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(XGBClassifier(objective='binary:logistic'),param_grid=grid_param,verbose=3)

In [91]:
grid.fit(x_train,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.787 total time=   0.0s
[CV 2/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.769 total time=   0.0s
[CV 3/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.748 total time=   0.0s
[CV 4/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.710 total time=   0.0s
[CV 5/5] END learning_rate=1, max_depth=5, n_estimators=10;, score=0.776 total time=   0.0s
[CV 1/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.769 total time=   0.0s
[CV 2/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.759 total time=   0.0s
[CV 3/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.766 total time=   0.0s
[CV 4/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.710 total time=   0.0s
[CV 5/5] END learning_rate=1, max_depth=5, n_estimators=50;, score=0.738 total time=   0.0s
[CV 1/5] END learn

[CV 2/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.750 total time=   0.0s
[CV 3/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.776 total time=   0.0s
[CV 4/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.701 total time=   0.0s
[CV 5/5] END learning_rate=0.5, max_depth=5, n_estimators=100;, score=0.710 total time=   0.0s
[CV 1/5] END learning_rate=0.5, max_depth=5, n_estimators=200;, score=0.778 total time=   0.0s
[CV 2/5] END learning_rate=0.5, max_depth=5, n_estimators=200;, score=0.769 total time=   0.0s
[CV 3/5] END learning_rate=0.5, max_depth=5, n_estimators=200;, score=0.776 total time=   0.0s
[CV 4/5] END learning_rate=0.5, max_depth=5, n_estimators=200;, score=0.710 total time=   0.1s
[CV 5/5] END learning_rate=0.5, max_depth=5, n_estimators=200;, score=0.720 total time=   0.0s
[CV 1/5] END learning_rate=0.5, max_depth=10, n_estimators=10;, score=0.759 total time=   0.0s
[CV 2/5] END learning_rate=0.5, max_depth=10, n_es

[CV 5/5] END learning_rate=0.1, max_depth=5, n_estimators=200;, score=0.720 total time=   0.1s
[CV 1/5] END learning_rate=0.1, max_depth=10, n_estimators=10;, score=0.750 total time=   0.0s
[CV 2/5] END learning_rate=0.1, max_depth=10, n_estimators=10;, score=0.796 total time=   0.0s
[CV 3/5] END learning_rate=0.1, max_depth=10, n_estimators=10;, score=0.720 total time=   0.0s
[CV 4/5] END learning_rate=0.1, max_depth=10, n_estimators=10;, score=0.720 total time=   0.0s
[CV 5/5] END learning_rate=0.1, max_depth=10, n_estimators=10;, score=0.729 total time=   0.0s
[CV 1/5] END learning_rate=0.1, max_depth=10, n_estimators=50;, score=0.796 total time=   0.0s
[CV 2/5] END learning_rate=0.1, max_depth=10, n_estimators=50;, score=0.722 total time=   0.1s
[CV 3/5] END learning_rate=0.1, max_depth=10, n_estimators=50;, score=0.766 total time=   0.0s
[CV 4/5] END learning_rate=0.1, max_depth=10, n_estimators=50;, score=0.720 total time=   0.0s
[CV 5/5] END learning_rate=0.1, max_depth=10, n_es

[CV 2/5] END learning_rate=0.01, max_depth=10, n_estimators=100;, score=0.787 total time=   0.1s
[CV 3/5] END learning_rate=0.01, max_depth=10, n_estimators=100;, score=0.710 total time=   0.1s
[CV 4/5] END learning_rate=0.01, max_depth=10, n_estimators=100;, score=0.720 total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=10, n_estimators=100;, score=0.766 total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.778 total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.769 total time=   0.1s
[CV 3/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.766 total time=   0.1s
[CV 4/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.710 total time=   0.1s
[CV 5/5] END learning_rate=0.01, max_depth=10, n_estimators=200;, score=0.748 total time=   0.1s
[CV 1/5] END learning_rate=0.01, max_depth=15, n_estimators=10;, score=0.648 total time=   0.0s
[CV 2/5] END learning_rate=0.01

[CV 2/5] END learning_rate=0.001, max_depth=10, n_estimators=200;, score=0.648 total time=   0.1s
[CV 3/5] END learning_rate=0.001, max_depth=10, n_estimators=200;, score=0.654 total time=   0.1s
[CV 4/5] END learning_rate=0.001, max_depth=10, n_estimators=200;, score=0.654 total time=   0.1s
[CV 5/5] END learning_rate=0.001, max_depth=10, n_estimators=200;, score=0.645 total time=   0.1s
[CV 1/5] END learning_rate=0.001, max_depth=15, n_estimators=10;, score=0.648 total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=15, n_estimators=10;, score=0.648 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=15, n_estimators=10;, score=0.654 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=15, n_estimators=10;, score=0.654 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=15, n_estimators=10;, score=0.645 total time=   0.0s
[CV 1/5] END learning_rate=0.001, max_depth=15, n_estimators=50;, score=0.648 total time=   0.0s
[CV 2/5] END learning_rate

In [92]:
grid.best_params_

{'learning_rate': 1, 'max_depth': 5, 'n_estimators': 10}

In [93]:
xgb_hyper = XGBClassifier(learning_rate=1,max_depth=5,n_estimators=10)

In [94]:
xgb_hyper.fit(x_train,y_train)

In [95]:
# accuracy after parameter's tuning
y_pred_new = xgb_hyper.predict(x_test)
prediction_y = [round(values) for values in y_pred_new]
accuracy = accuracy_score(y_pred_new,y_test)
accuracy

0.7186147186147186

In [98]:
import pickle
filename = 'xgboost_model.pickle'
pickle.dump(xgb_hyper, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))

In [100]:
# we'll save the scaler object as well for prediction
filename_scaler = 'scaler_model.pickle'
pickle.dump(ss, open(filename_scaler, 'wb'))

scaler_model = pickle.load(open(filename_scaler, 'rb'))

In [101]:
d=scaler_model.transform([[6,148,72,35,80,33.6,0.627,50]])
pred=loaded_model.predict(d)
print('This data belongs to class :',pred[0])

This data belongs to class : 1


