# Hyperparameter tuning

In [1]:
from pprint import pformat
from typing import Any

import optuna
import pandas as pd
from rich.console import Console
from sklearn.metrics import balanced_accuracy_score

from cpt_to_soiltype.train_eval_funcs import xgb_native_pipeline, train_predict
from cpt_to_soiltype.preprocess_funcs import get_dataset, split_drillhole_data

  from .autonotebook import tqdm as notebook_tqdm


Objective function to optimise

In [2]:
def objective(
    trial: optuna.Trial,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    oversample_level: int,
    undersample_level: int,
) -> float:
    
    console = Console()

    # Defining the hyperparameters to be optimised for KNN
    model_params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 20),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "metric": trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski", "chebyshev"]),
        "p": trial.suggest_int("p", 1, 5),  # Only used if metric is "minkowski"
    }

    console.print(f"\nSuggested hyperparameters: \n{pformat(trial.params)}")

    # Call the train_predict function for KNN
    y_pred = train_predict(
        model_name="knn",
        model_params=model_params,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        undersample_level=undersample_level,
        oversample_level=oversample_level,
    )

    # Evaluate the performance using balanced accuracy (or you can use any other metric)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    console.print(f"Balanced accuracy: {balanced_accuracy}")

    return balanced_accuracy

In [3]:
def run_optimisation(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    oversample_level: int,
    undersample_level: int,
    n_trials: int = 100,
    study_name: str = "xgboost_hyperparameter_optimisation",
) -> Any:
    sampler = optuna.samplers.TPESampler()
    study = optuna.create_study(direction="maximize", study_name=study_name,sampler=sampler)
    study.optimize(
        lambda trial: objective(
            trial, X_train, X_test, y_train, y_test, oversample_level, undersample_level
        ),
        n_trials=n_trials,
    )
    return study

Get training dataset

In [4]:
FEATURES = ['Depth (m)', 'qc (MPa)', 'fs (kPa)','Rf (%)', 'σ,v (kPa)', 'u0 (kPa)',"σ',v (kPa)", 'Qtn (-)', 'Fr (%)']
LABELS = ['Oberhollenzer_classes']

df = get_dataset("../data/model_ready/dataset_train.csv")
train_df, test_df = split_drillhole_data(df, id_column="ID",train_fraction=0.75)
X_train = train_df[FEATURES]
X_test = test_df[FEATURES]
y_train = train_df[LABELS]
y_test = test_df[LABELS]

In [5]:
y_train.value_counts()

Oberhollenzer_classes
5.0                      111833
2.0                      107003
6.0                       96985
7.0                       90632
4.0                       56218
1.0                       45593
3.0                        1447
Name: count, dtype: int64

Run optimisation for 5 trials

In [13]:
study = run_optimisation(X_train, X_test, y_train, y_test, oversample_level=60000, undersample_level=90000, n_trials=10)

[I 2024-11-14 09:16:35,114] A new study created in memory with name: xgboost_hyperparameter_optimisation


[I 2024-11-14 09:16:38,038] Trial 0 finished with value: 0.3389911443991851 and parameters: {'n_neighbors': 6, 'weights': 'distance', 'metric': 'euclidean', 'p': 2}. Best is trial 0 with value: 0.3389911443991851.


[I 2024-11-14 09:16:39,602] Trial 1 finished with value: 0.32605042834903764 and parameters: {'n_neighbors': 1, 'weights': 'distance', 'metric': 'chebyshev', 'p': 3}. Best is trial 0 with value: 0.3389911443991851.


[I 2024-11-14 09:16:46,325] Trial 2 finished with value: 0.35025260431027405 and parameters: {'n_neighbors': 13, 'weights': 'uniform', 'metric': 'euclidean', 'p': 1}. Best is trial 2 with value: 0.35025260431027405.


[I 2024-11-14 09:16:49,439] Trial 3 finished with value: 0.3428936172193498 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'metric': 'euclidean', 'p': 5}. Best is trial 2 with value: 0.35025260431027405.


[I 2024-11-14 09:17:06,778] Trial 4 finished with value: 0.3568126471511241 and parameters: {'n_neighbors': 20, 'weights': 'uniform', 'metric': 'minkowski', 'p': 3}. Best is trial 4 with value: 0.3568126471511241.


[I 2024-11-14 09:17:14,702] Trial 5 finished with value: 0.35025260431027405 and parameters: {'n_neighbors': 13, 'weights': 'uniform', 'metric': 'euclidean', 'p': 3}. Best is trial 4 with value: 0.3568126471511241.


[I 2024-11-14 09:17:17,656] Trial 6 finished with value: 0.352058498017872 and parameters: {'n_neighbors': 13, 'weights': 'distance', 'metric': 'chebyshev', 'p': 4}. Best is trial 4 with value: 0.3568126471511241.


[I 2024-11-14 09:17:23,341] Trial 7 finished with value: 0.32605042834903764 and parameters: {'n_neighbors': 1, 'weights': 'uniform', 'metric': 'chebyshev', 'p': 4}. Best is trial 4 with value: 0.3568126471511241.


[I 2024-11-14 09:17:25,871] Trial 8 finished with value: 0.3456138287182665 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'metric': 'chebyshev', 'p': 2}. Best is trial 4 with value: 0.3568126471511241.


[I 2024-11-14 09:17:34,886] Trial 9 finished with value: 0.3385292792147011 and parameters: {'n_neighbors': 6, 'weights': 'distance', 'metric': 'minkowski', 'p': 5}. Best is trial 4 with value: 0.3568126471511241.


In [17]:
trial = study.best_trial
for key, value in trial.params.items():
    print(f"{key}: {value}")

n_neighbors: 20
weights: uniform
metric: minkowski
p: 3


In [14]:
import optuna.visualization as vis

# Show the optimization history plot
vis.plot_optimization_history(study).show()


In [15]:

# Show the parallel coordinate plot to understand relationships between parameters
vis.plot_parallel_coordinate(study).show()


In [16]:

# Show the parameter importance plot
vis.plot_param_importances(study).show()


In [18]:

# Show the slice plot to see the effect of parameters on objective value
vis.plot_slice(study).show()


In [19]:

# Show an interactive contour plot to explore the parameter relationships
vis.plot_contour(study).show()


In [20]:

# (Optional) Show the EDF (empirical distribution function) of the objective values
vis.plot_edf(study).show()


For Xgboos

def objective(
    trial: optuna.Trial,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    oversample_level: int,
    undersample_level: int,
) -> float:
    
    console = Console()

    # Defining the hyperparameters to be optimised
    model_params = {
        "objective": "multi:softmax",
        "device": "gpu",
        "random_state": 42,
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
    }

    console.print(f"\nSuggested hyperparameters: \n{pformat(trial.params)}")

    # Call the pipeline function
    y_pred = xgb_native_pipeline(
        X_train,
        X_test,
        y_train,
        y_test,
        model_params,
        oversample_level,
        undersample_level,
    )

    # Evaluate the performance using balanced accuracy (or you can use any other metric)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    console.print(f"Balanced accuracy: {balanced_accuracy}")

    return balanced_accuracy