# XGboost & Optuna Example

Example showing how to use the Optuna library (https://optuna.readthedocs.io/en/stable/) for Bayesian hyperparameter optimization (via tree of parzen estimator)

In [1]:
%load_ext watermark
%watermark -p scikit-learn,optuna,xgboost

scikit-learn: 1.0
optuna      : 2.10.0
xgboost     : 1.5.0



## Dataset

In [2]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import datasets


data = datasets.load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 398 80 171


## Optuna

In [6]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np
import optuna


def optimization_objective(trial, X_train, y_train, cv=5):

    
    params =  {
            "n_estimators": trial.suggest_categorical("n_estimators", [30, 50, 100, 300]),
            "learning_rate": trial.suggest_categorical("learning_rate", [0.01]),
            "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
            "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
    }
    

    cv_iterator = StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)

    cv_scores = np.zeros(cv)
    for idx, (train_sub_idx, valid_idx) in enumerate(cv_iterator.split(X_train, y_train)):
        
        X_train_sub, X_valid = X_train[train_sub_idx], X_train[valid_idx]
        y_train_sub, y_valid = y_train[train_sub_idx], y_train[valid_idx]
        

        model = XGBClassifier(**params, random_state=123, use_label_encoder=False)
        
        model.fit(
            X_train_sub,
            y_train_sub,
            verbose=False,
            eval_set=[(X_valid, y_valid)],
            eval_metric="auc",
            early_stopping_rounds=100,
        )
        
        preds = model.score(X_valid, y_valid)
        
        cv_scores[idx] = preds

    return np.mean(cv_scores)

In [7]:
study = optuna.create_study(direction="maximize", study_name="XGBoost Classifier")

def func(trial):
    return optimization_objective(trial, X_train, y_train)

study.optimize(func, n_trials=50);

[32m[I 2021-10-29 15:13:35,852][0m A new study created in memory with name: XGBoost Classifier[0m
[32m[I 2021-10-29 15:13:36,083][0m Trial 0 finished with value: 0.9421835443037974 and parameters: {'n_estimators': 50, 'learning_rate': 0.01, 'lambda': 9.12943426076228e-08, 'alpha': 0.0016724501964558564}. Best is trial 0 with value: 0.9421835443037974.[0m
[32m[I 2021-10-29 15:13:36,268][0m Trial 1 finished with value: 0.9396518987341771 and parameters: {'n_estimators': 50, 'learning_rate': 0.01, 'lambda': 1.5845448248138875e-08, 'alpha': 4.798603930208439e-05}. Best is trial 0 with value: 0.9421835443037974.[0m
[32m[I 2021-10-29 15:13:36,635][0m Trial 2 finished with value: 0.9447151898734176 and parameters: {'n_estimators': 100, 'learning_rate': 0.01, 'lambda': 5.826200068630577e-05, 'alpha': 0.05751334422453705}. Best is trial 2 with value: 0.9447151898734176.[0m
[32m[I 2021-10-29 15:13:37,378][0m Trial 3 finished with value: 0.9547468354430381 and parameters: {'n_estima

In [8]:
print(f"Best CV accuracy: {study.best_value:.5f}")
print("Best params:")

for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

Best CV accuracy: 0.96231
Best params:
	n_estimators: 300
	learning_rate: 0.01
	lambda: 0.00010123319508156144
	alpha: 0.6910907278209559


In [11]:
model = XGBClassifier(**study.best_params, random_state=123, use_label_encoder=False)
model.fit(
    X_train,
    y_train,
    verbose=False,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    early_stopping_rounds=100,
)

XGBClassifier(alpha=0.6910907278209559, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              lambda=0.00010123319508156144, learning_rate=0.01,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=123,
              reg_alpha=0.691090703, reg_lambda=0.000101233192,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)

In [12]:
print(f"Training Accuracy: {model.score(X_train, y_train):0.2f}")
print(f"Test Accuracy: {model.score(X_test, y_test):0.2f}")

Training Accuracy: 0.99
Test Accuracy: 0.95
