In [1]:
import seaborn as sns

In [2]:
import pandas as pd

In [13]:
df = sns.load_dataset('healthexp')

In [5]:
df.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             274 non-null    int64  
 1   Country          274 non-null    object 
 2   Spending_USD     274 non-null    float64
 3   Life_Expectancy  274 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 8.7+ KB


In [14]:
df = pd.get_dummies(df)

In [15]:
df.head()

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True


In [16]:
X = df.drop('Life_Expectancy',axis=1)
y = df['Life_Expectancy']

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=54)

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
rfc = RandomForestRegressor(random_state=34)

In [22]:
rfc.fit(X_train,y_train)

In [23]:
y_pred = rfc.predict(X_test)

In [25]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [26]:
mean_absolute_error(y_test,y_pred)

0.3095636363636187

In [29]:
import numpy as np
mean_squared_error(y_test,y_pred)

0.15382676363635506

In [28]:
r2_score(y_test,y_pred)

0.9837812737664462

In [30]:
import optuna

In [38]:
from sklearn.model_selection import cross_val_score

### Define objective

In [48]:
def objective(trial):
    #define parametrs to tune
    n_estimators = trial.suggest_int('n_estimators',100,1000)
    max_depth = trial.suggest_int('max_depth',10,50)
    min_samples_split = trial.suggest_int('min_samples_split',2,32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',2,32)
    
    #define model
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  min_samples_leaf=min_samples_leaf,
                                  min_samples_split=min_samples_split)
    #cross val score
    score = cross_val_score(model, X_train, y_train, cv = 5, scoring='neg_mean_squared_error',n_jobs= -1)
    
    return score.mean()
    

### Create Study

In [49]:
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.RandomSampler(seed=42),study_name='Optuna_basics')

[I 2024-08-08 23:50:57,563] A new study created in memory with name: Optuna_basics


In [69]:
study.optimize(objective,n_trials=200)

[I 2024-08-09 00:15:13,943] Trial 700 finished with value: -3.230440027846071 and parameters: {'n_estimators': 814, 'max_depth': 47, 'min_samples_split': 31, 'min_samples_leaf': 31}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:14,515] Trial 701 finished with value: -1.380677198214156 and parameters: {'n_estimators': 569, 'max_depth': 50, 'min_samples_split': 25, 'min_samples_leaf': 7}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:15,044] Trial 702 finished with value: -2.3535282917745493 and parameters: {'n_estimators': 529, 'max_depth': 39, 'min_samples_split': 9, 'min_samples_leaf': 21}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:15,616] Trial 703 finished with value: -2.5947981303536944 and parameters: {'n_estimators': 700, 'max_depth': 16, 'min_samples_split': 19, 'min_samples_leaf': 25}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:15,798] Trial 704 finished with value: -1.918313

[I 2024-08-09 00:15:21,496] Trial 736 finished with value: -2.1960729508747243 and parameters: {'n_estimators': 783, 'max_depth': 14, 'min_samples_split': 17, 'min_samples_leaf': 17}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:21,631] Trial 737 finished with value: -2.472437638090591 and parameters: {'n_estimators': 432, 'max_depth': 48, 'min_samples_split': 27, 'min_samples_leaf': 23}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:21,865] Trial 738 finished with value: -2.4564844810791966 and parameters: {'n_estimators': 743, 'max_depth': 28, 'min_samples_split': 30, 'min_samples_leaf': 23}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:22,088] Trial 739 finished with value: -2.7919771353443696 and parameters: {'n_estimators': 756, 'max_depth': 45, 'min_samples_split': 10, 'min_samples_leaf': 27}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:22,172] Trial 740 finished with value: -3.330

[I 2024-08-09 00:15:28,325] Trial 772 finished with value: -3.2536501005212854 and parameters: {'n_estimators': 625, 'max_depth': 20, 'min_samples_split': 15, 'min_samples_leaf': 31}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:28,465] Trial 773 finished with value: -2.409142165536796 and parameters: {'n_estimators': 459, 'max_depth': 44, 'min_samples_split': 7, 'min_samples_leaf': 22}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:28,760] Trial 774 finished with value: -2.074419591404686 and parameters: {'n_estimators': 980, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 15}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:28,820] Trial 775 finished with value: -2.0828050195479144 and parameters: {'n_estimators': 183, 'max_depth': 40, 'min_samples_split': 30, 'min_samples_leaf': 15}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:28,929] Trial 776 finished with value: -1.017870

[I 2024-08-09 00:15:34,790] Trial 808 finished with value: -1.4718797427509887 and parameters: {'n_estimators': 321, 'max_depth': 12, 'min_samples_split': 14, 'min_samples_leaf': 9}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:34,883] Trial 809 finished with value: -2.2382094521741247 and parameters: {'n_estimators': 297, 'max_depth': 49, 'min_samples_split': 21, 'min_samples_leaf': 19}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:35,026] Trial 810 finished with value: -2.4554660804487094 and parameters: {'n_estimators': 474, 'max_depth': 27, 'min_samples_split': 18, 'min_samples_leaf': 23}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:35,252] Trial 811 finished with value: -2.0054440491451784 and parameters: {'n_estimators': 732, 'max_depth': 17, 'min_samples_split': 17, 'min_samples_leaf': 14}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:35,554] Trial 812 finished with value: -0.926

[I 2024-08-09 00:15:41,671] Trial 844 finished with value: -2.4470963596859874 and parameters: {'n_estimators': 785, 'max_depth': 17, 'min_samples_split': 4, 'min_samples_leaf': 23}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:41,824] Trial 845 finished with value: -1.5687011927305865 and parameters: {'n_estimators': 431, 'max_depth': 27, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:41,870] Trial 846 finished with value: -2.2671895121279784 and parameters: {'n_estimators': 130, 'max_depth': 46, 'min_samples_split': 9, 'min_samples_leaf': 19}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:41,915] Trial 847 finished with value: -3.002866722840981 and parameters: {'n_estimators': 135, 'max_depth': 37, 'min_samples_split': 12, 'min_samples_leaf': 29}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:42,175] Trial 848 finished with value: -3.337347

[I 2024-08-09 00:15:48,021] Trial 880 finished with value: -0.801924196930881 and parameters: {'n_estimators': 306, 'max_depth': 29, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:48,100] Trial 881 finished with value: -2.593473259994125 and parameters: {'n_estimators': 250, 'max_depth': 16, 'min_samples_split': 32, 'min_samples_leaf': 25}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:48,386] Trial 882 finished with value: -2.674017118360123 and parameters: {'n_estimators': 971, 'max_depth': 28, 'min_samples_split': 10, 'min_samples_leaf': 26}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:48,519] Trial 883 finished with value: -1.0077360144344967 and parameters: {'n_estimators': 393, 'max_depth': 22, 'min_samples_split': 9, 'min_samples_leaf': 6}. Best is trial 334 with value: -0.263504030750139.
[I 2024-08-09 00:15:48,628] Trial 884 finished with value: -0.90348480

In [71]:
study.best_params

{'n_estimators': 279,
 'max_depth': 37,
 'min_samples_split': 4,
 'min_samples_leaf': 2}

In [72]:
best_params = study.best_params

In [73]:
optuna.visualization.plot_optimization_history(study)

In [74]:
optuna.visualization.plot_parallel_coordinate(study)

In [75]:
optuna.visualization.plot_slice(study)

In [76]:
optuna.visualization.plot_param_importances(study)

In [77]:
#get best paramerts
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

### New Model

In [78]:
best_model = RandomForestRegressor(n_estimators = best_n_estimators,
                                   max_depth = best_max_depth,
                                   min_samples_split = best_min_samples_split,
                                   min_samples_leaf = best_min_samples_leaf)

In [79]:
best_model.fit(X_train,y_train)

In [80]:
pred = best_model.predict(X_test)

In [81]:
mean_absolute_error(y_test,pred)

0.377423645052676

In [82]:
mean_squared_error(y_test,pred)

0.21248631708833884