In [4]:
from sklearn.datasets import load_breast_cancer
import optuna
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# -- Get the dataset
X, y = load_breast_cancer(return_X_y=True)

# -- Define the objective function
def objective(trial):
    # -- Instantiate scaler
    # (a) List scalers to chose from
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])

    # (b) Define your scalers
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()

    # -- Instantiate dimensionality reduction
     # (a) List all dimensionality reduction options
    dim_red = trial.suggest_categorical("dim_red", ["PCA", None])

    # (b) Define the PCA algorithm and its hyperparameters
    if dim_red == "PCA":
        pca_n_components=trial.suggest_int("pca_n_components", 2, 30) # suggest an integer from 2 to 30
        dimen_red_algorithm=PCA(n_components=pca_n_components)
    # (c) No dimensionality reduction option
    else:
        dimen_red_algorithm='passthrough'

    # -- Instantiate estimator model
    knn_n_neighbors=trial.suggest_int("knn_n_neighbors", 1, 19, 2)
    knn_metric=trial.suggest_categorical("knn_metric", ['euclidean', 'manhattan', 'minkowski'])
    knn_weights=trial.suggest_categorical("knn_weights", ['uniform', 'distance'])

    estimator=KNeighborsClassifier(n_neighbors=knn_n_neighbors, metric=knn_metric, weights=knn_weights)

    # -- Make a pipeline
    pipeline = make_pipeline(scaler, dimen_red_algorithm, estimator)

    # -- Evaluate the score by cross-validation
    score = cross_val_score(pipeline, X, y, scoring='f1')
    f1 = score.mean() # calculate the mean of scores
    return f1

study = optuna.create_study(direction="maximize") # maximise the score during tuning
study.optimize(objective, n_trials=100) # run the objective function 100 times

print(study.best_trial) # print the best performing pipeline


[I 2024-11-25 17:12:23,227] A new study created in memory with name: no-name-fb38e9c8-2708-4c7d-b1be-b57badbf9725
  knn_n_neighbors=trial.suggest_int("knn_n_neighbors", 1, 19, 2)
[I 2024-11-25 17:12:23,264] Trial 0 finished with value: 0.9667700701135352 and parameters: {'scalers': 'standard', 'dim_red': 'PCA', 'pca_n_components': 5, 'knn_n_neighbors': 5, 'knn_metric': 'minkowski', 'knn_weights': 'uniform'}. Best is trial 0 with value: 0.9667700701135352.
  knn_n_neighbors=trial.suggest_int("knn_n_neighbors", 1, 19, 2)
[I 2024-11-25 17:12:23,290] Trial 1 finished with value: 0.9766010241340217 and parameters: {'scalers': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 24, 'knn_n_neighbors': 9, 'knn_metric': 'minkowski', 'knn_weights': 'distance'}. Best is trial 1 with value: 0.9766010241340217.
  knn_n_neighbors=trial.suggest_int("knn_n_neighbors", 1, 19, 2)
[I 2024-11-25 17:12:23,312] Trial 2 finished with value: 0.9524738248147026 and parameters: {'scalers': 'standard', 'dim_red': 'P

FrozenTrial(number=50, state=1, values=[0.9794225452738171], datetime_start=datetime.datetime(2024, 11, 25, 17, 12, 24, 712543), datetime_complete=datetime.datetime(2024, 11, 25, 17, 12, 24, 763540), params={'scalers': 'robust', 'dim_red': 'PCA', 'pca_n_components': 14, 'knn_n_neighbors': 7, 'knn_metric': 'minkowski', 'knn_weights': 'uniform'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'scalers': CategoricalDistribution(choices=('minmax', 'standard', 'robust')), 'dim_red': CategoricalDistribution(choices=('PCA', None)), 'pca_n_components': IntDistribution(high=30, log=False, low=2, step=1), 'knn_n_neighbors': IntDistribution(high=19, log=False, low=1, step=2), 'knn_metric': CategoricalDistribution(choices=('euclidean', 'manhattan', 'minkowski')), 'knn_weights': CategoricalDistribution(choices=('uniform', 'distance'))}, trial_id=50, value=None)


In [8]:
study.best_trial.params

{'scalers': 'robust',
 'dim_red': 'PCA',
 'pca_n_components': 14,
 'knn_n_neighbors': 7,
 'knn_metric': 'minkowski',
 'knn_weights': 'uniform'}

In [12]:
study.best_trial.value

0.9794225452738171