In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # for plotting facilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb # import XGBoost
from xgboost import cv
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# XGBoost classification demonstration

I used the steps in this article to help with this demo: https://www.kaggle.com/code/prashant111/xgboost-k-fold-cv-feature-importance/notebook.

The dataset for this demo is the diabetes.csv file we used this week, which is also included in the GitHub repository.

This demo uses the xgboost library for the model instead of scikit-learn, so you will need to install it. For convenience, I included the environment file `xgboost-env.yml` in the GitHub repository so you can easily create the required Anaconda environment. From the command line, run `conda env create --file xgboost-env.yml` to create the environment. See the conda cheat sheet for reference: https://docs.conda.io/projects/conda/en/4.6.0/_downloads/52a95608c49671267e40c689e0bc00ca/conda-cheatsheet.pdf.

If you prefer to use scikit-learn, there is also a demo in the scikit-learn docs linked in the exercise instructions here: https://scikit-learn.org/stable/modules/tree.html.

### EDA

In [None]:
# Load the diabetes dataset into a DataFrame. The path is relative, so
# diabetes.csv must sit in the notebook's working directory.
diabetes = pd.read_csv(r"diabetes.csv")

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
diabetes.shape

(768, 9)

In [None]:
# Preview the first rows of the data (head() shows five rows by default)
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Structural summary: index range, column names, non-null counts, dtypes
# and memory usage
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
# Descriptive statistics per column: count, mean, std, min, quartiles and max
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Count missing entries per column. isnull() only flags NaN/None values.
# NOTE(review): the summary statistics show a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI. Those zeros may be
# placeholders for missing measurements, which this check would not catch.
# Confirm against the data source.
diabetes.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### Declare feature vector and target variable

In [None]:
# Separate the label from the predictors: "Outcome" is the classification
# target, and every remaining column is used as a feature.
y = diabetes["Outcome"]
X = diabetes.drop(columns="Outcome")

In [None]:
# Preview the first rows of the feature matrix X (the Outcome column was dropped above)
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [None]:
# Preview the first rows of the target Series y (the Outcome column)
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [None]:
# XGBoost's native API operates on its own DMatrix container rather than on
# pandas objects, so wrap the full feature matrix and its labels in one.
# NOTE(review): the scikit-learn style classifier below is fitted on the
# DataFrames directly; data_dmatrix is presumably consumed by a later cell
# (e.g. cross-validation) - confirm.
data_dmatrix = xgb.DMatrix(X, label=y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


### Split data into separate training and test set

In [None]:
# Hold out 30% of the rows as a test set. random_state=0 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Train the XGBoost Classifier

In [None]:
# Hyperparameters, kept in a dict so they can be unpacked into the estimator
params = dict(
    objective='binary:logistic',  # binary classification, logistic output
    max_depth=4,                  # maximum depth of each tree
    alpha=10,                     # L1 regularization term on the weights
    learning_rate=1.0,            # step-size shrinkage applied per boosting round
    n_estimators=100,             # number of boosting rounds (trees)
)

# Build the scikit-learn style classifier from those settings
xgb_clf = XGBClassifier(**params)

# Train on the training split only. This is left as the last expression so the
# notebook displays the fitted estimator.
xgb_clf.fit(X_train, y_train)





### Make predictions with XGBoost Classifier

In [None]:
# Predict class labels for the held-out test features with the fitted classifier

y_pred = xgb_clf.predict(X_test)

### Check accuracy score

In [None]:
# Accuracy on the test split (share of predictions that match the true label),
# printed to four decimal places.
print(
    'XGBoost model accuracy score: {0:0.4f}'.format(
        accuracy_score(y_test, y_pred)
    )
)

XGBoost model accuracy score: 0.7879


### Create confusion matrix

In [None]:
# Confusion matrix for the test split. Rows are the true classes and columns
# are the predicted classes, so for labels 0/1 the layout is
# [[true negatives, false positives], [false negatives, true positives]].
confusion_matrix(y_test, y_pred)

array([[142,  15],
       [ 34,  40]], dtype=int64)

### Check Matthews correlation coefficient

In [None]:
# Matthews correlation coefficient on the test split. It ranges from -1 to +1:
# +1 is perfect prediction, 0 is no better than chance, -1 is total disagreement.
matthews_corrcoef(y_test, y_pred)

0.4875140970634592