# XGBoost
- Optimized gradient-boosting machine learning library
- Speed and performance
- Core algorithm is parallelizable


## When to use XGBoost
- You have a large number of training samples and relatively few features
- Mixture of categorical and numeric features

## When not to use XGBoost
- Image recognition
- Computer vision
- NLP
- Small number of training samples

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Preparation (Breast Cancer)

In [5]:
import sklearn.datasets
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset that ships with scikit-learn and pull out
# the feature matrix and the target vector.
breast_cancer = sklearn.datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

## Importing and using the XGBoost Classifier

In [6]:
import xgboost as xgb

# Gradient-boosted classifier with a logistic objective for the binary target,
# limited to 10 boosting rounds. `random_state` is used instead of the
# deprecated `seed` keyword of the scikit-learn wrapper; the seed value itself
# is unchanged.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=10,
    random_state=123,
)

# Fit on the training split
xgb_clf.fit(X_train, y_train)

# Evaluate on the held-out test split (shown as the cell output)
xgb_clf.score(X_test, y_test)

0.9707602339181286

# Classification using CART (Classification and Regression Trees) model with XGBoost

## Basic Decision Tree without boosting

In [7]:
# import library
from sklearn.tree import DecisionTreeClassifier

# Single depth-limited tree used as a non-boosted baseline.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits could otherwise resolve
# differently from run to run and change the score below.
dt = DecisionTreeClassifier(max_depth=4, random_state=123)

# fit using same data (breast cancer)
dt.fit(X_train, y_train)

# Evaluate on the held-out test split (shown as the cell output)
dt.score(X_test, y_test)

0.9590643274853801

## Boosting
- Boosting can be applied to a machine learning model
- Boosting converts a collection of weak learners into a strong learner
- Strong learners = good performance

## Boosting with Cross Validation

In [9]:
# Wrap the full feature matrix and labels in XGBoost's DMatrix container
dmatrix = xgb.DMatrix(data=X, label=y)

# Booster parameters. The target is binary and the metric below is the
# classification error, so use the binary classification objective (the same
# one passed to XGBClassifier earlier) rather than the regression-flavoured
# "reg:logistic".
params = {"objective": "binary:logistic", "max_depth": 3}

# 3-fold cross-validation over 5 boosting rounds, tracking the classification
# error; as_pandas=True asks for the per-round results as a DataFrame.
result = xgb.cv(
    dtrain=dmatrix,
    params=params,
    nfold=3,
    num_boost_round=5,
    metrics='error',
    as_pandas=True,
    seed=123,
)

# display results
result

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.02548,0.002451,0.066824,0.019564
1,0.021969,0.001257,0.061524,0.013876
2,0.014945,0.006589,0.056252,0.010004
3,0.012306,0.0033,0.052734,0.011418
4,0.010549,0.004314,0.054497,0.012485


## Measuring AUC

In [13]:
# Run the 3-fold cross-validation again with the same DMatrix and parameters,
# this time tracking AUC as the evaluation metric.
result_auc = xgb.cv(
    params=params,
    dtrain=dmatrix,
    num_boost_round=5,
    nfold=3,
    metrics='auc',
    seed=123,
    as_pandas=True,
)

# Show the per-round AUC table
result_auc

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.987225,0.001301,0.961473,0.02476
1,0.993244,0.004295,0.969078,0.022616
2,0.995224,0.003751,0.972491,0.024377
3,0.997125,0.002042,0.971354,0.025405
4,0.99761,0.001871,0.974002,0.026527


# Fine-tuning Model with Hyperparameters

In [17]:
# Model-selection utilities
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate tree depths to search over
grid_params = {'max_depth': [1, 3, 5, 10]}

# Fresh classifier with default settings as the base estimator
xgb_clf = xgb.XGBClassifier()

# Exhaustive search over the grid using 4-fold cross-validation
grid_clf = GridSearchCV(
    estimator=xgb_clf,
    param_grid=grid_params,
    cv=4,
)
grid_clf.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score
print("Best Parameter: {}".format(grid_clf.best_params_))
print("Best Score: {}".format(grid_clf.best_score_))

Best Parameter: {'max_depth': 5}
Best Score: 0.9723484848484849


# Pipeline

In [18]:
# import library
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [20]:
# Start again from the full dataset and re-split it, this time holding out
# 20% of the samples for testing (same random_state as the earlier split).
X = breast_cancer.data
y = breast_cancer.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [22]:
# Two-step pipeline: standardize the features, then fit an XGBoost classifier
steps = [
    ('scaler', StandardScaler()),
    ('xgb_model', xgb.XGBClassifier()),
]
clf_pipline = Pipeline(steps)

# Search space; the 'xgb_model__' prefix routes each entry to the pipeline
# step with that name.
grid_params = {
    'xgb_model__max_depth': [1, 3, 5, 10],
    'xgb_model__learning_rate': [0.01, 0.1, 0.5, 0.9],
    'xgb_model__subsample': [0.3, 0.5, 0.9],
    'xgb_model__n_estimators': np.arange(50, 200, 50),  # 50, 100, 150
}

# Grid search over the whole pipeline: 5-fold CV, candidates ranked by ROC AUC
pipeline_cv = GridSearchCV(
    estimator=clf_pipline,
    param_grid=grid_params,
    scoring='roc_auc',
    cv=5,
)
pipeline_cv.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score
print("Best Parameter: {}".format(pipeline_cv.best_params_))
print("Best Score: {}".format(pipeline_cv.best_score_))

Best Parameter: {'xgb_model__learning_rate': 0.1, 'xgb_model__max_depth': 5, 'xgb_model__n_estimators': 150, 'xgb_model__subsample': 0.5}
Best Score: 0.9948435163539099


In [23]:
# BEST SCORE (mean cross-validated ROC AUC from the grid search): 0.99484351 (~99.48 %)