In [None]:
'''
                         ,/(/*.                 
                   (((((((((((((((((            
                  /((   (((((((((((##           
                  /(((((((((((((#####           
             ,/////////(/(((((####### .....     
          ((((((((((((((((((######### ........  
         (((((((((((((((((########### ........  
        .(((((((((((((((###########* .......... 
        /((((((((((*             .............. 
         ((((((((( ............................ 
         (((((((( ............................  
           (((((#  ........................,.   
                  ...................           
                  ..............  ...           
                   ............   .,,           
                     ..,..........     
             
                Python Tip Friday: 11/15/2024
            Hyperparameter Autotuning with Optuna
    ----------------------------------------------------
    Stu Sztukowski | https://linkedin.com/in/StatsGuy
                   | https://github.com/stu-code
'''

''' Hyperparameter autotuning can be a bit of a manual process in Python, where you typically 
    choose some values in a CV grid search and run through them to find the best combination.
    Optuna helps make this process easier and more efficient with Bayesian optimization.
    It's much easier to use: you can specify things like a minimum and a maximum for your
    hyperparameters and run a study choosing values of them to determine a good combination. 
    Optuna performs an automated search, leaving much of the guess-work out.
'''

In [1]:
import pandas as pd
import optuna
import optuna.visualization as vis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Load the HMEQ dataset straight from the python-tips GitHub repository.
HMEQ_URL = 'https://raw.githubusercontent.com/stu-code/python-tips/main/data/hmeq.csv'
df_hmeq = pd.read_csv(HMEQ_URL)

# BAD is the target column. The predictors are every other column except the
# two categoricals (JOB, REASON), which are left out purely for simplicity so
# that no one-hot encoding is required.
TARGET = 'BAD'
y = df_hmeq[TARGET]
X = df_hmeq.drop(columns=[TARGET, 'JOB', 'REASON'])

# Hold out 30% of the rows as the test (AKA validation) set; the fixed
# random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

**Define an Objective**

Define the hyperparameters we wish to autotune and return an average cross-validation score.

In [2]:
def objective(trial):
    """Score one Optuna trial of random-forest hyperparameters.

    Samples ``n_estimators``, ``max_depth`` and ``min_samples_split`` from the
    trial, builds a ``RandomForestClassifier`` with them, and evaluates it with
    10-fold cross-validation on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: The Optuna trial object used to draw hyperparameter values.

    Returns:
        The mean of the 10 cross-validation scores (the classifier's default
        scorer), which the study is expected to maximize.
    """

    # Parameter search space (inclusive integer ranges)
    n_estimators      = trial.suggest_int('n_estimators', 50, 300)
    max_depth         = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)

    # Pin the forest's seed so a given set of hyperparameters always produces
    # the same score. Without it the sampler compares noisy evaluations and the
    # notebook's results change on every run. 101 mirrors the seed used for
    # the train/test split.
    model = RandomForestClassifier(
        n_estimators      = n_estimators,
        max_depth         = max_depth,
        min_samples_split = min_samples_split,
        random_state      = 101
    )

    # Use cross-validation to evaluate
    score = cross_val_score(model, X_train, y_train, cv=10).mean()
    return score

**Run a study**

Create a study that tries to maximize the cross-validation accuracy.

In [7]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# after a kernel restart. TPESampler is the sampler Optuna already uses by
# default; only the seed is new (101 mirrors the train/test split seed).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=101),
)

# Evaluate 10 hyperparameter combinations, one after another.
study.optimize(objective, n_trials=10)

[I 2024-11-15 14:31:49,441] A new study created in memory with name: no-name-26941c5a-f44a-47fd-84b1-dd3a0d8b244a
[I 2024-11-15 14:32:06,645] Trial 0 finished with value: 0.9101207072619417 and parameters: {'n_estimators': 278, 'max_depth': 26, 'min_samples_split': 6}. Best is trial 0 with value: 0.9101207072619417.
[I 2024-11-15 14:32:15,099] Trial 1 finished with value: 0.9113191743256113 and parameters: {'n_estimators': 131, 'max_depth': 39, 'min_samples_split': 6}. Best is trial 1 with value: 0.9113191743256113.
[I 2024-11-15 14:32:32,999] Trial 2 finished with value: 0.909400135394077 and parameters: {'n_estimators': 279, 'max_depth': 42, 'min_samples_split': 3}. Best is trial 1 with value: 0.9113191743256113.
[I 2024-11-15 14:32:47,764] Trial 3 finished with value: 0.9110805135795669 and parameters: {'n_estimators': 195, 'max_depth': 33, 'min_samples_split': 8}. Best is trial 1 with value: 0.9113191743256113.
[I 2024-11-15 14:33:16,030] Trial 4 finished with value: 0.910361662822

**Identify the best hyperparameters**

We can print the best parameters it found and use them in our final model.

In [8]:
# Pull the winning trial's hyperparameters and score out of the study first,
# so they are available by name for the final model, then report them.
best_params = study.best_params
best_accuracy = study.best_value
print("Best Parameters: ", best_params, '\nBest Accuracy:', best_accuracy)

Best Parameters:  {'n_estimators': 57, 'max_depth': 27, 'min_samples_split': 2} 
Best Accuracy: 0.9113197480293277


**Visualize the Autotune History**

What's really cool about Optuna is that you can visualize the autotune history with Plotly to see how it changed between iterations.

**Plot 1:** How much each hyperparameter affects performance

In [9]:
# Build the hyperparameter-importance figure for the study, then render it.
importance_fig = vis.plot_param_importances(study)
importance_fig.show()

**Plot 2:** Score improvement over time

In [10]:
# Build the optimization-history figure for the study, then render it.
history_fig = vis.plot_optimization_history(study)
history_fig.show()

**Plot 3:** Parameter interactions

In [12]:
# Build the parallel-coordinate figure for the study, then render it.
parallel_fig = vis.plot_parallel_coordinate(study)
parallel_fig.show()