<a href="https://colab.research.google.com/github/rozaxa/Artificial-Intelligence-Workshop-II/blob/Optuna/Healthexp_tuning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install --quiet optuna

In [22]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

In [23]:
healthexp = sns.load_dataset('healthexp')
healthexp.head(100)

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9
...,...,...,...,...
95,1991,Canada,1805.209,77.6
96,1991,France,1558.033,77.2
97,1991,Great Britain,842.797,75.9
98,1991,Japan,1166.430,79.1


In [24]:
healthexp = pd.get_dummies(healthexp)

In [25]:
X = healthexp.drop(['Life_Expectancy'], axis=1)
y = healthexp['Life_Expectancy']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

In [27]:
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

In [28]:
print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))
print("MSE: ",mean_squared_error(y_test, y_pred))
print("R2 score: ", r2_score(y_test, y_pred))

Mean absolute error:  0.25916363636361917
MSE:  0.10221141818181628
R2 score:  0.9910457602615238


In [29]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 100, 1000)
  max_depth = trial.suggest_int('max_depth', 10, 50)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)

  model = RandomForestRegressor(n_estimators=n_estimators,
  max_depth=max_depth,
  min_samples_split=min_samples_split,
  min_samples_leaf=min_samples_leaf)

  score = cross_val_score(model, X, y, n_jobs=-1, cv=5, scoring='neg_mean_squared_error').mean()

  return score

In [30]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=42)) # Default is random Search

[I 2024-11-01 12:00:57,970] A new study created in memory with name: no-name-fcbf668d-b723-400b-9618-2f9e8bd41553


In [31]:
study.optimize(objective, n_trials=100)


[I 2024-11-01 12:01:09,641] Trial 0 finished with value: -4.532277039659695 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20}. Best is trial 0 with value: -4.532277039659695.
[I 2024-11-01 12:01:14,498] Trial 1 finished with value: -5.161468585182294 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 28}. Best is trial 0 with value: -4.532277039659695.
[I 2024-11-01 12:01:23,584] Trial 2 finished with value: -5.610463141060634 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 32}. Best is trial 0 with value: -4.532277039659695.
[I 2024-11-01 12:01:34,094] Trial 3 finished with value: -3.0271015029200674 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 3 with value: -3.0271015029200674.
[I 2024-11-01 12:01:36,429] Trial 4 finished with value: -3.7797016204640004 and param

In [32]:
best_params = study.best_params
best_score = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_score:.3f}")

Best Hyperparameters: {'n_estimators': 358, 'max_depth': 34, 'min_samples_split': 2, 'min_samples_leaf': 2}
Best Accuracy: -1.868


In [33]:
optuna.visualization.plot_optimization_history(study)

In [34]:
optuna.visualization.plot_parallel_coordinate(study)

In [35]:
optuna.visualization.plot_slice(study, params=['n_estimators', 'max_depth', 'min_samples_leaf', 'min_samples_split'])

In [36]:
optuna.visualization.plot_param_importances(study)

#### Model afer tuning

In [37]:
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

In [38]:
best_model = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf)

best_model.fit(X_train, y_train)

In [39]:
y_pred = best_model.predict(X_test)

In [40]:
print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))
print("MSE: ",mean_squared_error(y_test, y_pred))
print("R2 score: ", r2_score(y_test, y_pred))

Mean absolute error:  0.3032262084312665
MSE:  0.1354429860758154
R2 score:  0.9881345060092933
