In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# Date: January 2024
# License: MIT

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random number generator so that anything drawing
# from np.random is repeatable across runs
np.random.seed(42)

In [3]:
from sklearn.datasets import load_iris

# Load the Iris dataset as pandas objects, already split into
# the feature table (X) and the target labels (y)
X, y = load_iris(return_X_y=True, as_frame=True)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out a test split; stratifying on y keeps the class proportions
# the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

Grid Search

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the grid is their full
# cross-product (2 criteria x 10 depths x 10 leaf sizes = 200 candidates)
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 11),
    min_samples_leaf=np.arange(1, 11),
)

# Exhaustive search over the grid, scored with 3-fold cross-validation
# and parallelised across all available cores
clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1)

grid_search.fit(X_train, y_train)

In [6]:
# Parameter setting the grid search selected as best
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1}

In [7]:
# Best score found by the grid search during cross-validation,
# shown with four decimals
print(f'Best score: {grid_search.best_score_:.4f}')

Best score: 0.9374


In [8]:
# Evaluate the fitted grid search on the held-out test split
test_accuracy = grid_search.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')

Test accuracy: 0.9211


In [9]:
# Tabulate the cross-validation outcome of every candidate in the grid,
# keeping only the parameter setting and its mean/std test score
summary_columns = ['params', 'mean_test_score', 'std_test_score']
df = pd.DataFrame(grid_search.cv_results_)[summary_columns]

# Lift the column-width limit so each params dict is shown in full
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,params,mean_test_score,std_test_score
0,"{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1}",0.660740,0.011216
1,"{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 2}",0.660740,0.011216
2,"{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 3}",0.660740,0.011216
3,"{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 4}",0.660740,0.011216
4,"{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5}",0.660740,0.011216
...,...,...,...
195,"{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 6}",0.937411,0.013089
196,"{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 7}",0.937411,0.013089
197,"{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 8}",0.937411,0.013089
198,"{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 9}",0.937411,0.013089


Random Search

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Same hyperparameter space as the grid search; here only a random
# subset of the combinations is evaluated
param_dist = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 11),
    min_samples_leaf=np.arange(1, 11),
)

# Sample 20 candidates, each scored with 3-fold cross-validation;
# the fixed seed makes the sampled candidates repeatable
clf = DecisionTreeClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

In [11]:
# Parameter setting the random search selected as best
random_search.best_params_

{'min_samples_leaf': 6, 'max_depth': 10, 'criterion': 'gini'}

In [12]:
# Best score found by the random search during cross-validation,
# shown with four decimals
print(f'Best score: {random_search.best_score_:.4f}')

Best score: 0.9374


In [13]:
# Evaluate the model chosen by the *random* search on the held-out test
# split. (This cell previously scored `grid_search`, so the accuracy it
# reported belonged to the grid-search model instead.)
test_accuracy = random_search.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')

Test accuracy: 0.9211


Bayesian Optimization

In [14]:
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer

# Define the search space for hyperparameters.
# skopt's Integer(low, high) includes BOTH endpoints, so Integer(1, 10)
# covers 1..10 -- the same range as np.arange(1, 11) used for the grid and
# random searches. (Integer(1, 11) would also try the value 11, making the
# three methods search different spaces.)
search_space = {
    'criterion': Categorical(['gini', 'entropy']),
    'max_depth': Integer(1, 10),
    'min_samples_leaf': Integer(1, 10)
}

# Create a BayesSearchCV instance: 20 evaluations, each scored with
# 3-fold cross-validation; the fixed seed makes the search repeatable
clf = DecisionTreeClassifier(random_state=42)
bayes_search = BayesSearchCV(clf, search_space, n_iter=20, cv=3, random_state=42, n_jobs=-1)

# Perform the Bayesian optimization search
bayes_search.fit(X_train, y_train)

In [15]:
# Parameter setting the Bayesian search selected as best
bayes_search.best_params_

OrderedDict([('criterion', 'gini'),
             ('max_depth', 8),
             ('min_samples_leaf', 10)])

In [16]:
# Best score found by the Bayesian search during cross-validation,
# shown with four decimals
print(f'Best score: {bayes_search.best_score_:.4f}')

Best score: 0.9374


In [17]:
# Evaluate the fitted Bayesian search on the held-out test split
test_accuracy = bayes_search.score(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')

Test accuracy: 0.9211
