In [1]:

# Importing the Packages:
import optuna
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn import ensemble
from sklearn import datasets
from sklearn import model_selection
from sklearn.datasets import load_breast_cancer


In [17]:
# Load the breast-cancer dataset and assemble the features and the label
# into one DataFrame whose last column is named 'target'.
cancer = load_breast_cancer()
df = pd.DataFrame(
    data=np.column_stack((cancer.data, cancer.target)),
    columns=np.concatenate((cancer.feature_names, ['target'])),
)

In [18]:
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns=['target'])

In [19]:
# Step 1. Define an objective function to be maximized.
def objective(trial):
    """Score one hyperparameter configuration proposed by Optuna.

    The trial first chooses a classifier family and then samples only the
    hyperparameters that belong to that family, so the search space is
    conditional on the ``"classifier"`` choice.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the classifier name and its
        hyperparameters.

    Returns
    -------
    float
        Mean of the 3-fold cross-validation scores of the chosen classifier
        on the notebook-level ``X`` and ``y``.
    """
    classifier_name = trial.suggest_categorical("classifier", ["LogReg", "RandomForest"])

    # Step 2. Setup values for the hyperparameters:
    if classifier_name == 'LogReg':
        # Regularization strength is searched on a log scale over 20 decades.
        # NOTE(review): the features are not scaled here, so the default solver
        # may stop at its iteration limit before converging -- confirm whether
        # scaling or a larger max_iter is wanted.
        logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
        classifier_obj = linear_model.LogisticRegression(C=logreg_c)
    else:
        rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000)
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        # Pin the forest's seed: without it the same hyperparameters yield a
        # different score on every call, which makes the objective noisy and
        # lets a lucky draw be reported as the "best" trial.
        classifier_obj = ensemble.RandomForestClassifier(
            max_depth=rf_max_depth,
            n_estimators=rf_n_estimators,
            random_state=0,
        )

    # Step 3: Scoring method:
    # cross_val_score uses the estimator's default scorer; the per-fold scores
    # are averaged into the single value that the study maximizes.
    score = model_selection.cross_val_score(classifier_obj, X, y, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy

# Step 4: Running it
# Pass an explicitly seeded TPE sampler so that the sequence of suggested
# hyperparameters can be repeated after a kernel restart; the default
# sampler draws a fresh seed for every study.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2020-08-19 00:48:54,077] Trial 0 finished with value: 0.9384943841084192 and parameters: {'classifier': 'LogReg', 'logreg_c': 129.95410756932998}. Best is trial 0 with value: 0.9384943841084192.
[I 2020-08-19 00:48:54,658] Trial 1 finished with value: 0.9367399981435068 and parameters: {'classifier': 'LogReg', 'logreg_c': 71120416.5516676}. Best is trial 0 with value: 0.9384943841084192.
[I 2020-08-19 00:48:57,396] Trial 2 finished with value: 0.9543117051888982 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 995, 'rf_max_depth': 12}. Best is trial 2 with value: 0.9543117051888982.
[I 2020-08-19 00:48:59,445] Trial 3 finished with value: 0.9437668244685788 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 639, 'rf_max_depth': 2}. Best is trial 2 with value: 0.9543117051888982.
[I 2020-08-19 00:49:00,489] Trial 4 finished with value: 0.9490485472941613 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 396, 'rf_max_depth': 3}. Best is 

In [20]:
# Getting the best trial:
# (study.best_trial is printed in full: its number, value, timestamps, params and distributions.)
print(f"The best trial is : \n{study.best_trial}")
# >> Sample output (captured from an earlier run; the values differ from the
# >> output recorded below this cell, which comes from a later run):
#The best trial is : 
#FrozenTrial(number=18, value=0.9631114824097281, datetime_start=datetime.datetime(2020, 8, 16, 14, 24, 37, 407344), datetime_complete=datetime.datetime(2020, 8, 16, 14, 24, 37, 675114), params={'classifier': 'RandomForest', 'rf_n_estimators': 153, 'rf_max_depth': 21},
#distributions={'classifier': CategoricalDistribution(choices=('LogReg', 'RandomForest')), 'rf_n_estimators': IntUniformDistribution(high=1000, low=10, step=1), 'rf_max_depth': IntLogUniformDistribution(high=32, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=18, state=TrialState.COMPLETE)

# Getting the best score:
# (the objective value of the best trial, i.e. the highest mean CV score found.)
print(f"The best value is : \n{study.best_value}")
# >> Sample output (earlier run):
# 0.9631114824097281

# Getting the best parameters:
# (only the parameters that the best trial actually sampled.)
print(f"The best parameters are : \n{study.best_params}")
# >> Sample output (earlier run):
# {'classifier': 'RandomForest', 'rf_n_estimators': 153, 'rf_max_depth': 21}

The best trial is : 
FrozenTrial(number=14, value=0.9648380209783718, datetime_start=datetime.datetime(2020, 8, 19, 0, 49, 4, 92808), datetime_complete=datetime.datetime(2020, 8, 19, 0, 49, 4, 171497), params={'classifier': 'RandomForest', 'rf_n_estimators': 14, 'rf_max_depth': 32}, distributions={'classifier': CategoricalDistribution(choices=('LogReg', 'RandomForest')), 'rf_n_estimators': IntUniformDistribution(high=1000, low=10, step=1), 'rf_max_depth': IntLogUniformDistribution(high=32, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=14, state=TrialState.COMPLETE)
The best value is : 
0.9648380209783718
The best parameters are : 
{'classifier': 'RandomForest', 'rf_n_estimators': 14, 'rf_max_depth': 32}


In [21]:
# Show all trials as a table: one row per trial with its value, timing,
# sampled parameters (empty for parameters the trial did not use) and state.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_classifier,params_logreg_c,params_rf_max_depth,params_rf_n_estimators,state
0,0,0.938494,2020-08-19 00:48:52.475049,2020-08-19 00:48:54.076317,00:00:01.601268,LogReg,1.299541e+02,,,COMPLETE
1,1,0.936740,2020-08-19 00:48:54.081693,2020-08-19 00:48:54.658704,00:00:00.577011,LogReg,7.112042e+07,,,COMPLETE
2,2,0.954312,2020-08-19 00:48:54.660463,2020-08-19 00:48:57.395732,00:00:02.735269,RandomForest,,12.0,995.0,COMPLETE
3,3,0.943767,2020-08-19 00:48:57.398756,2020-08-19 00:48:59.445647,00:00:02.046891,RandomForest,,2.0,639.0,COMPLETE
4,4,0.949049,2020-08-19 00:48:59.448047,2020-08-19 00:49:00.489483,00:00:01.041436,RandomForest,,3.0,396.0,COMPLETE
...,...,...,...,...,...,...,...,...,...,...
95,95,0.957830,2020-08-19 00:50:12.118107,2020-08-19 00:50:13.260875,00:00:01.142768,RandomForest,,22.0,450.0,COMPLETE
96,96,0.963111,2020-08-19 00:50:13.263478,2020-08-19 00:50:13.769116,00:00:00.505638,RandomForest,,5.0,195.0,COMPLETE
97,97,0.950794,2020-08-19 00:50:13.771643,2020-08-19 00:50:14.276593,00:00:00.504950,RandomForest,,5.0,190.0,COMPLETE
98,98,0.954312,2020-08-19 00:50:14.278669,2020-08-19 00:50:14.634611,00:00:00.355942,RandomForest,,4.0,108.0,COMPLETE
