#### Overview

In this section, we will set up the necessary environment for training and optimizing a machine learning model using an XGBoost classifier and **Bayesian Search**. We will:

1. **Import Essential Libraries** – Load key Python libraries for data handling, model training, evaluation, and hyperparameter tuning.
2. **Load and Prepare the Dataset** – Read the dataset from a CSV file, remove unnecessary columns, and split the data into features (`X`) and target labels (`y`).
3. **Perform Data Splitting** – Divide the dataset into training and testing sets to ensure the model generalizes well to unseen data.
4. **Define the Hyperparameter Search Space** – Specify a range of values for key hyperparameters of the XGBoost model to optimize performance.
5. **Optimize Model with Bayesian Search** – Utilize Bayesian optimization via an Optuna study to efficiently search for the best hyperparameters.
6. **Evaluate the Model** – Assess the model's performance using accuracy and a classification report.

The entire process reports its progress with console `print` messages to enhance readability and provide real-time updates.

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Load dataset
print("Loading Data...")
df: pd.DataFrame = pd.read_csv("../data/csv/dataset.csv")
print("Data Loaded Successfully!")

# Drop non-training columns, then separate the features from the target label
df = df.drop(columns=["date", "home_team", "away_team"])
X: pd.DataFrame = df.drop(columns="winning_team")
y: pd.Series = df["winning_team"]

# Split dataset BEFORE oversampling. SMOTE synthesizes new rows by
# interpolating between existing ones, so resampling the full dataset first
# would leak information from test rows into training and would fill the test
# set with synthetic samples. `stratify=y` keeps the original class ratio in
# both splits, so the test set reflects the real (imbalanced) distribution.
print("Splitting the dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to oversample the minority `winning_team` class, using the
# training portion only. The test split stays untouched, real data.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Downstream code trains on X_train / y_train, so point them at the balanced data
X_train, y_train = X_resampled, y_resampled


# Optuna objective
def objective(trial: "optuna.Trial") -> float:
    """Score one Optuna trial by cross-validated accuracy.

    Draws an XGBoost hyperparameter configuration from ``trial`` and
    evaluates it with 5-fold cross-validation on the module-level
    ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to suggest each tuned hyperparameter.

    Returns:
        Mean accuracy over the 5 cross-validation folds.
    """
    # Tuned hyperparameters and the ranges they are suggested from.
    search_space: dict[str, int | float] = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.5),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 20),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    # Settings held constant across every trial.
    fixed_settings: dict[str, int | str] = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 42,
    }

    candidate: XGBClassifier = XGBClassifier(**search_space, **fixed_settings)

    fold_scores: np.ndarray = cross_val_score(
        candidate, X_train, y_train, scoring="accuracy", cv=5, n_jobs=-1
    )
    return fold_scores.mean()


# Create the study (higher objective value is better) and run the search
print("Starting Optuna hyperparameter tuning...")
study: optuna.Study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000, n_jobs=-1)

# Report the best trial: its objective value, then one line per tuned parameter
winner = study.best_trial
report_lines: list[str] = [
    "\nBest Trial:",
    f"  Value: {winner.value:.4f}",
    "  Params:",
]
report_lines.extend(f"    {key}: {value}" for key, value in winner.params.items())
print("\n".join(report_lines))

# Train final model: the best trial's tuned hyperparameters combined with the
# same fixed settings (objective, eval metric, seed) used inside `objective`.
best_params: dict[str, int | float | str] = {
    **study.best_trial.params,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
}

final_model: XGBClassifier = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Evaluate: predict on the held-out test split and report accuracy as a percentage
y_pred: np.ndarray = final_model.predict(X_test)
accuracy: float = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nFinal Model Accuracy on Test Set: {accuracy * 100:.2f}%")

# Visualization 1: Optimization History
# Objective value of every trial, in the order the study stores them.
opt_history: list[float] = [trial.value for trial in study.trials]

history_fig, history_ax = plt.subplots(figsize=(8, 6))
history_ax.plot(opt_history)
history_ax.set_xlabel("Trial", fontsize=14)
history_ax.set_ylabel("Accuracy", fontsize=14)
history_ax.set_title("Optuna Optimization History", fontsize=16)
history_ax.grid(True)
plt.show()

# Visualization 2: Hyperparameter Importance
# The scores come from Optuna's importance evaluator, which estimates how much
# each hyperparameter influenced the objective value across the study's
# completed trials. The best trial's raw parameter values (e.g. the chosen
# n_estimators) are settings, not importances, and must not be plotted here.
param_importance: dict[str, float] = optuna.importance.get_param_importances(study)

# Optuna returns the scores most-important first; `barh` draws the first entry
# at the bottom, so reverse the order to put the most important bar on top.
params, importances = zip(*reversed(param_importance.items()))

plt.figure(figsize=(10, 6))
plt.barh(params, importances)
plt.xlabel("Importance", fontsize=14)
plt.ylabel("Hyperparameters", fontsize=14)
plt.title("Hyperparameter Importance", fontsize=16)
plt.grid(True)
plt.show()