#### Import the required libraries

In [1]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
import matplotlib.pyplot as plt

#### Get the data

In [2]:
# Load the dataset.
df = sns.load_dataset(name='healthexp')

df.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [3]:
# Get the dummies for the data.
df_model = pd.get_dummies(data=df)

df_model.head()

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True


In [4]:
# Get the model and target data.
X = df_model.drop(labels=['Life_Expectancy'], axis=1)
y = df_model['Life_Expectancy']

# Split the data into train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=20240717)

X_train.head()
# X_test.head()
# y_train.head()
# y_test .head()

Unnamed: 0,Year,Spending_USD,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
204,2009,3880.842,False,True,False,False,False,False
116,1994,1420.271,False,False,False,False,True,False
164,2002,2065.133,False,False,False,False,True,False
171,2003,5726.538,False,False,False,False,False,True
261,2018,10451.386,False,False,False,False,False,True


#### Build the base model.

In [5]:
# Define the model.
rfr_base = RandomForestRegressor(random_state=20240717)

# Fit the data to the model.
rfr_base.fit(X=X_train, y=y_train)

# Make predictions on the test data.
y_pred = rfr_base.predict(X=X_test)

# Get the metrics.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2s = r2_score(y_true=y_test, y_pred=y_pred)

print(f'mean_absolute_error : {mae}\nmean_squared_error : {mse}\nr2_score : {r2s}')

mean_absolute_error : 0.7211863636363536
mean_squared_error : 0.9606070863636212
r2_score : 0.9136726372978682


#### Hypertune and build model with 'Optuna'.

In [6]:
# Define the 'objective' function.
def objective(trial):
    # Define the hyperparameter space.
    n_estimators = trial.suggest_int(name='n_estimators', low=100, high=1000)
    max_depth = trial.suggest_int(name='max_depth', low=10, high=50)
    min_samples_split = trial.suggest_int(name='min_sample_split', low=2, high=32)
    min_samples_leaf = trial.suggest_int(name='min_sample_leaf', low=1, high=32)

    # Build the model.
    rfr_optuna = RandomForestRegressor(n_estimators=n_estimators, 
                                       max_depth=max_depth,
                                       min_samples_split=min_samples_split, 
                                       min_samples_leaf=min_samples_leaf#, random_state=20240717
                                       )
    
    # Get cross-val score
    score_crossval = cross_val_score(estimator=rfr_optuna,
                                     X=X_train, y=y_train,
                                     cv=5, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    return score_crossval 
    

In [10]:
# Create study for the above 'objective'
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=20240717),
                            direction='maximize')

study.optimize(func=objective, n_trials=100)

[I 2024-06-18 20:05:22,644] A new study created in memory with name: no-name-237a29c8-72f7-4330-8d8b-3835b57bc226
[I 2024-06-18 20:05:24,177] Trial 0 finished with value: -9.499977667792638 and parameters: {'n_estimators': 722, 'max_depth': 26, 'min_sample_split': 22, 'min_sample_leaf': 27}. Best is trial 0 with value: -9.499977667792638.
[I 2024-06-18 20:05:25,165] Trial 1 finished with value: -2.111051299838278 and parameters: {'n_estimators': 409, 'max_depth': 34, 'min_sample_split': 11, 'min_sample_leaf': 6}. Best is trial 1 with value: -2.111051299838278.
[I 2024-06-18 20:05:26,714] Trial 2 finished with value: -4.97057486261202 and parameters: {'n_estimators': 757, 'max_depth': 25, 'min_sample_split': 28, 'min_sample_leaf': 4}. Best is trial 1 with value: -2.111051299838278.
[I 2024-06-18 20:05:28,769] Trial 3 finished with value: -9.433956695991252 and parameters: {'n_estimators': 958, 'max_depth': 36, 'min_sample_split': 13, 'min_sample_leaf': 27}. Best is trial 1 with value: -

In [18]:
study.best_params

{'n_estimators': 923,
 'max_depth': 48,
 'min_sample_split': 5,
 'min_sample_leaf': 2}

In [14]:
# Do some plots.
optuna.visualization.plot_optimization_history(study=study)

In [15]:
optuna.visualization.plot_parallel_coordinate(study=study)

In [16]:
optuna.visualization.plot_slice(study=study, params=['n_estimators', 'max_depth', 'min_sample_split', 'min_sample_leaf'])



In [17]:
optuna.visualization.plot_param_importances(study=study)

In [23]:
# Finally train the best model again.
# Get the best parameters
best_params = study.best_params
# print(best_params)

best_model = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                                   max_depth=best_params['max_depth'],
                                   min_samples_split=best_params['min_sample_split'],
                                   min_samples_leaf=best_params['min_sample_leaf'])

best_model.fit(X=X_train, y=y_train)

y_pred = best_model.predict(X=X_test)

print(f'mean_absolute_error : {mae}\nmean_squared_error : {mse}\nr2_score : {r2s}')

mean_absolute_error : 0.7211863636363536
mean_squared_error : 0.9606070863636212
r2_score : 0.9136726372978682


In [21]:
best_params['n_estimators']

923

#### Resources

1. [sklearn - model evaluation and scoring](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)
2. [Optuna - trial.suggest_int](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html)
3. [Optuna - create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html#optuna.study.create_study)
4. [Optuna - visualization](https://optuna.readthedocs.io/en/stable/reference/visualization/index.html)