# Evaluate Gradient Boosting Models with XGBoost in Python

## 0. Introduction

This notebook contains:
  1. XGBoost model evaluation using train and test sets
  2. XGBoost model evaluation using K-Fold cross validation

## 1. Evaluation using train and test sets

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [4]:
# URL of the raw Pima Indians diabetes CSV hosted in the jbrownlee/Datasets
# GitHub repository; loading it requires network access.
pima = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"

In [5]:
# header=None: treat the first line of the file as data rather than column
# names, so pandas labels the columns with integers.
data = pd.read_csv(pima, header=None)
# Preview the first rows as a quick sanity check of the load.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(768, 9)

In [7]:
# Print a per-column summary: non-null counts, dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       768 non-null    int64  
 1   1       768 non-null    int64  
 2   2       768 non-null    int64  
 3   3       768 non-null    int64  
 4   4       768 non-null    int64  
 5   5       768 non-null    float64
 6   6       768 non-null    float64
 7   7       768 non-null    int64  
 8   8       768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
# Features are every column except the last; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBoost classifier with its default hyperparameters on the training rows.
model = XGBClassifier()
model.fit(X_train, y_train)

# Score the held-out rows: fraction of predictions that match the true labels.
y_preds = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_preds)
print(f"Accuracy: {score*100:.2f}%")

Accuracy: 68.83%


## 2. Evaluation using K-Fold cross validation

In [10]:
# Re-derive features/target from the frame so this cell depends only on `data`.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Five shuffled folds; the fixed seed keeps the fold assignment repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# A fresh, unfitted classifier: cross_val_score clones and fits it once per fold.
model = XGBClassifier()

# One accuracy value per fold. error_score='raise' surfaces fit failures
# instead of recording a placeholder score; n_jobs=-1 uses all available cores.
score = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# Report the mean accuracy across the folds.
print(f"Accuracy: {np.mean(score)*100:.2f}%")

Accuracy: 74.22%
