# Decision Tree & Optuna Example

Example showing how to use the Optuna library (https://optuna.readthedocs.io/en/stable/) for Bayesian hyperparameter optimization (via tree of parzen estimator)

In [1]:
%load_ext watermark
%watermark -p scikit-learn,optuna

scikit-learn: 1.0
optuna      : 2.10.0



## Dataset

In [2]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import datasets


data = datasets.load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 398 80 171


## Hyperopt

In [3]:
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
import hyperopt.pyll.stochastic

Some random sampling examples:

In [4]:
hyperopt.pyll.stochastic.sample(hp.loguniform('test', 1e-5, 1)) # range e^{low} to e^{high}

2.083641453103286

In [5]:
hyperopt.pyll.stochastic.sample(hp.qloguniform('test', 1e-5, 1, 0.1)) # rounded to 0.1

1.1

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np
import optuna


def optimization_objective(trial, X_train, y_train, cv=5):

    
    params =  {
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_impurity_decrease': trial.suggest_uniform('min_impurity_decrease', 0.0, 0.5),
        'max_depth': trial.suggest_categorical('max_depth', [6, 16, None])
    }

    cv_iterator = StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)

    cv_scores = np.zeros(cv)
    for idx, (train_sub_idx, valid_idx) in enumerate(cv_iterator.split(X_train, y_train)):
        
        X_train_sub, X_valid = X_train[train_sub_idx], X_train[valid_idx]
        y_train_sub, y_valid = y_train[train_sub_idx], y_train[valid_idx]
        

        model = DecisionTreeClassifier(**params, random_state=123)
        model.fit(X_train_sub, y_train_sub)
        preds = model.score(X_valid, y_valid)
        
        cv_scores[idx] = preds

    return np.mean(cv_scores)

In [7]:
study = optuna.create_study(direction="maximize", study_name="DT Classifier")

def func(trial):
    return optimization_objective(trial, X_train, y_train)

study.optimize(func, n_trials=50);

[32m[I 2021-10-29 14:35:16,809][0m A new study created in memory with name: DT Classifier[0m
[32m[I 2021-10-29 14:35:16,818][0m Trial 0 finished with value: 0.8969303797468354 and parameters: {'min_samples_split': 8, 'min_impurity_decrease': 0.10833033823891053, 'max_depth': 6}. Best is trial 0 with value: 0.8969303797468354.[0m
[32m[I 2021-10-29 14:35:16,831][0m Trial 1 finished with value: 0.9346518987341772 and parameters: {'min_samples_split': 3, 'min_impurity_decrease': 0.004444131707066423, 'max_depth': None}. Best is trial 1 with value: 0.9346518987341772.[0m
[32m[I 2021-10-29 14:35:16,837][0m Trial 2 finished with value: 0.6281645569620252 and parameters: {'min_samples_split': 10, 'min_impurity_decrease': 0.3816433707073382, 'max_depth': None}. Best is trial 1 with value: 0.9346518987341772.[0m
[32m[I 2021-10-29 14:35:16,845][0m Trial 3 finished with value: 0.8969303797468354 and parameters: {'min_samples_split': 4, 'min_impurity_decrease': 0.26864271459806505, 'm

In [8]:
print(f"Best CV accuracy: {study.best_value:.5f}")
print("Best params:")

for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

Best CV accuracy: 0.93972
Best params:
	min_samples_split: 2
	min_impurity_decrease: 0.006252798709964645
	max_depth: None


In [9]:
model = DecisionTreeClassifier(random_state=123, **study.best_params)
model.fit(X_train, y_train)

DecisionTreeClassifier(min_impurity_decrease=0.006252798709964645,
                       random_state=123)

In [10]:
print(f"Training Accuracy: {model.score(X_train, y_train):0.2f}")
print(f"Test Accuracy: {model.score(X_test, y_test):0.2f}")

Training Accuracy: 0.99
Test Accuracy: 0.94
