# Optuna Hyperparameter Optimization for RandomForestRegressor
This notebook demonstrates how to use Optuna to optimize hyperparameters for a Random Forest model predicting life expectancy based on expenditure data.

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
from optuna.visualization import plot_optimization_history
# Optional: for matplotlib-based plots
from optuna.visualization.matplotlib import plot_optimization_history as plot_optimization_history_matplotlib

## 1. Load and Explore Dataset

In [75]:
healthexp = sns.load_dataset("healthexp")

healthexp = pd.get_dummies(healthexp)
x = healthexp.drop(['Life_Expectancy'], axis=1)
y = healthexp['Life_Expectancy']

## 2. Preprocess Data

In [76]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=19)


## 3. Train Baseline Model

In [77]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state = 13)
rfr.fit(x_train, y_train)

y_pred = rfr.predict(x_test)

In [78]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mean_absolute_error(y_test, y_pred)

0.25916363636361917

In [79]:
mean_squared_error(y_test, y_pred)  

0.10221141818181628

In [80]:
r2_score(y_test, y_pred)

0.9910457602615238

## 4. Define Optuna Objective Function

In [81]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    n_stimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 32)

    model = RandomForestRegressor(n_estimators=n_stimators,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    random_state=98)
    
    score = cross_val_score(model, x, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    return score

In [82]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=42))


[I 2025-09-04 16:21:34,458] A new study created in memory with name: no-name-11e669cf-1a93-49b8-850b-c4f30bfcf61b


## 5. Run Optuna Study

In [None]:
study.optimize(objective, n_trials=200)

[I 2025-09-04 16:21:40,836] Trial 0 finished with value: -4.553684112850672 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20}. Best is trial 0 with value: -4.553684112850672.
[I 2025-09-04 16:21:45,129] Trial 1 finished with value: -5.231423631784025 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 28}. Best is trial 0 with value: -4.553684112850672.
[I 2025-09-04 16:21:47,160] Trial 2 finished with value: -5.620449440125375 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 32}. Best is trial 0 with value: -4.553684112850672.
[I 2025-09-04 16:21:49,921] Trial 3 finished with value: -3.3435473286896005 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 7}. Best is trial 3 with value: -3.3435473286896005.
[I 2025-09-04 16:21:51,040] Trial 4 finished with value: -3.905587333298098 and parame

## 6. Visualize Optimization Results

In [None]:
import sys
print(sys.executable)

import plotly
print(plotly.__version__)

c:\Users\patrick\Documents\Optuna\.venv\Scripts\python.exe
6.3.0


In [None]:
import optuna.visualization
optuna.visualization.plot_optimization_history

<function optuna.visualization._optimization_history.plot_optimization_history(study: 'Study | Sequence[Study]', *, target: 'Callable[[FrozenTrial], float] | None' = None, target_name: 'str' = 'Objective Value', error_bar: 'bool' = False) -> "'go.Figure'">

In [None]:
best_params = study.best_params

print(plotly.__version__)

optuna.visualization.plot_optimization_history(study)

6.3.0


ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [None]:
optuna.visualization.plot_parallel_coordinate(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

## 7. Evaluate Best Model

In [None]:
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

In [None]:
best_model = RandomForestRegressor(n_estimators=study.best_params['n_estimators'],
                                    max_depth=study.best_params['max_depth'],   
                                    min_samples_split=study.best_params['min_samples_split'],
                                    min_samples_leaf=study.best_params['min_samples_leaf'])

best_model.fit(x_train, y_train)

0,1,2
,n_estimators,940
,criterion,'squared_error'
,max_depth,45
,min_samples_split,3
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
y_pred = best_model.predict(x_test)

In [None]:
mean_absolute_error(y_test, y_pred)

0.3105838311269472

In [None]:
mean_squared_error(y_test, y_pred)

0.13930985317088607

In [None]:
r2_score(y_test, y_pred)

0.9877957487977996