In [1]:
# Import necessary libraries
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Pima Indian Diabetes dataset from sklearn
# Note: Scikit-learn's built-in 'load_diabetes' is a regression dataset.
# We will load the actual diabetes dataset from an external source
import pandas as pd

# Load the Pima Indian Diabetes dataset (from UCI repository)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the dataset
df = pd.read_csv(url, names=columns)

df.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
577,2,118,80,0,0,42.9,0.693,21,1
757,0,123,72,0,0,36.3,0.258,52,1
242,3,139,54,0,0,25.6,0.402,22,1
267,2,128,64,42,0,40.0,1.101,24,0
224,1,100,66,15,56,23.6,0.666,26,0


In [2]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [3]:
import numpy as np

# Replace zero values with NaN, in columns where zero is not a valid value
cols_with_zero_vals = ['Glucose', 'BloodPressure',
                       'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero_vals] = df[cols_with_zero_vals].replace(0, np.nan)

# Impute the missing values with the mean of the respective column
df.fillna(df.mean(), inplace=True)

# Check if there are any remaining missing values
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
# Split the dataset into features and target variable
X = df.drop('Outcome', axis=1)
y = df['Outcome']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((537, 8), (231, 8), (537,), (231,))

## Tuning hyperparams using Optuna

### Step 1: Define Objective Function
The objective function tells Optuna:
- What hyperparameters to tune
- How to evaluate performance


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


def objective(trial):
    # Suggest values for the hyperparameters to tune
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 20)

    # Create a RandomForestClassifier with the suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )

    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return score.mean()

### Step 2: Create and Optimize the Study

In [6]:
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)

[I 2025-06-05 16:27:10,883] A new study created in memory with name: no-name-dcd6001c-8491-47ed-ae41-3d2b9be2631e
[I 2025-06-05 16:27:14,000] Trial 0 finished with value: 0.7559190031152648 and parameters: {'n_estimators': 117, 'max_depth': 12}. Best is trial 0 with value: 0.7559190031152648.
[I 2025-06-05 16:27:18,593] Trial 1 finished with value: 0.7634302526825891 and parameters: {'n_estimators': 162, 'max_depth': 7}. Best is trial 1 with value: 0.7634302526825891.
[I 2025-06-05 16:27:21,517] Trial 2 finished with value: 0.7633956386292835 and parameters: {'n_estimators': 94, 'max_depth': 14}. Best is trial 1 with value: 0.7634302526825891.
[I 2025-06-05 16:27:25,286] Trial 3 finished with value: 0.7671512634129456 and parameters: {'n_estimators': 146, 'max_depth': 16}. Best is trial 3 with value: 0.7671512634129456.
[I 2025-06-05 16:27:27,395] Trial 4 finished with value: 0.7615437867774315 and parameters: {'n_estimators': 92, 'max_depth': 7}. Best is trial 3 with value: 0.76715126

### Step 3: Review the Best Parameters

In [7]:
print("Best hyperparameters: ", study.best_params)
print("Best accuracy: ", study.best_value)

Best hyperparameters:  {'n_estimators': 177, 'max_depth': 17}
Best accuracy:  0.770889581169955


In [8]:
from sklearn.metrics import accuracy_score

best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy:.4f}")

Test accuracy: 0.7446


## Samplers in Optuna
Other samplers in Optuna are:
- `optuna.samplers.RandomSampler()`
    - Samples hyperparameters completely randomly from the defined distribution.
- `optuna.samplers.GridSampler(search_space)`
    - just brute-force; Tries every combination from the grid exactly once.
    - `search_space`: param_grid Dict

## Optuna Visualization

In [9]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [10]:
# 1. Optimization History
plot_optimization_history(study).show()

In [11]:
# 2. Parallel Coordinates Plot
plot_parallel_coordinate(study).show()

In [12]:
# 3. Slice Plot
plot_slice(study).show()

In [13]:
# 4. Contour Plot
plot_contour(study).show()

In [14]:
# 5. Hyperparameter Importance
plot_param_importances(study).show()

## Optimizing Multiple ML Model

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [18]:
def objective(trial):
    classifier_name = trial.suggest_categorical(
        'classifier', ['SVM', 'RandomForest', 'GradientBoosting'])

    if classifier_name == 'SVM':
        # Suggest hyperparameters for SVM
        C = trial.suggest_float('C', 0.1, 100.0, log=True)
        kernel = trial.suggest_categorical(
            'kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(C=C, kernel=kernel, random_state=42)
    elif classifier_name == 'RandomForest':

        n_estimator = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimator,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42
        )
    elif classifier_name == 'GradientBoosting':
        n_estimator = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float(
            'learning_rate', 0.01, 0.3, log=True)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = GradientBoostingClassifier(
            n_estimators=n_estimator,
            learning_rate=learning_rate,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return score.mean()

#### Note: Optuna also supports ***Distributed Computing*** for faster computations
- set `n_jobs = -1` in optimize

In [29]:
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100, n_jobs=-1)

[I 2025-06-05 16:43:42,362] A new study created in memory with name: no-name-fa00f0e4-2735-4b0d-90f9-809420ed2840
[I 2025-06-05 16:43:43,231] Trial 2 finished with value: 0.7205607476635514 and parameters: {'classifier': 'SVM', 'C': 96.25147346813682, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 2 with value: 0.7205607476635514.
[I 2025-06-05 16:43:43,316] Trial 4 finished with value: 0.6851678781585323 and parameters: {'classifier': 'SVM', 'C': 49.60645079629018, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 2 with value: 0.7205607476635514.
[I 2025-06-05 16:43:43,327] Trial 1 finished with value: 0.7745932848736587 and parameters: {'classifier': 'SVM', 'C': 2.2592275147269696, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 1 with value: 0.7745932848736587.
[I 2025-06-05 16:43:43,351] Trial 3 finished with value: 0.7764278296988578 and parameters: {'classifier': 'SVM', 'C': 1.815133123469913, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 3 with value: 0.7764278296

In [21]:
print("Best hyperparameters: ", study.best_params)
print("Best accuracy: ", study.best_value)
print("Best trial number: ", study.best_trial.number)

Best hyperparameters:  {'classifier': 'GradientBoosting', 'n_estimators': 250, 'learning_rate': 0.030157313229321154, 'min_samples_split': 8, 'min_samples_leaf': 1, 'bootstrap': False}
Best accuracy:  0.7876600899965386
Best trial number:  62


In [23]:
study.trials_dataframe().sample(5)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
10,10,0.754136,2025-06-05 16:30:58.171587,2025-06-05 16:31:00.842974,0 days 00:00:02.671387,,True,RandomForest,,,,3.0,1.0,10.0,130.0,COMPLETE
30,30,0.765317,2025-06-05 16:32:25.589318,2025-06-05 16:32:27.411950,0 days 00:00:01.822632,,True,RandomForest,,,,10.0,2.0,5.0,57.0,COMPLETE
86,86,0.767151,2025-06-05 16:34:38.616205,2025-06-05 16:34:40.027631,0 days 00:00:01.411426,,False,GradientBoosting,,,0.020584,,1.0,7.0,156.0,COMPLETE
27,27,0.761596,2025-06-05 16:32:15.951634,2025-06-05 16:32:21.158593,0 days 00:00:05.206959,,False,GradientBoosting,,,0.081878,,1.0,7.0,187.0,COMPLETE
67,67,0.77089,2025-06-05 16:34:02.153466,2025-06-05 16:34:05.344634,0 days 00:00:03.191168,,False,GradientBoosting,,,0.044621,,1.0,9.0,252.0,COMPLETE


In [24]:
study.trials_dataframe()['params_classifier'].value_counts()

params_classifier
GradientBoosting    73
SVM                 18
RandomForest         9
Name: count, dtype: int64

In [28]:
study.trials_dataframe().groupby('params_classifier')['value'].mean()

params_classifier
GradientBoosting    0.772632
RandomForest        0.763646
SVM                 0.761793
Name: value, dtype: float64