#### XGBoost with Optuna Hyperparameter Tuning

#### Overview
This notebook implements a machine learning pipeline to predict NBA game outcomes using XGBoost with automated hyperparameter optimization via Optuna.

#### Methodology

##### 1. Data Preparation
- Load NBA game dataset from CSV
- Filter games from October 19, 2021 onwards
- Remove non-predictive features (date, team names)
- Separate features (X) and target variable (winning_team)

##### 2. Class Imbalance Handling
- Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance the training split only, after the train/test split, so the test set contains no synthetic samples
- Ensures the model doesn't become biased toward the majority class

##### 3. Hyperparameter Optimization
- Use the Optuna framework for Bayesian hyperparameter optimization
- Search space includes:
  - Tree structure: `n_estimators`, `max_depth`, `min_child_weight`
  - Learning dynamics: `learning_rate`, `subsample`, `colsample_bytree`
  - Regularization: `gamma`, `reg_alpha`, `reg_lambda`
- 500 trials with 5-fold cross-validation
- Maximize the mean cross-validated accuracy

##### 4. Model Training & Evaluation
- Train final XGBoost classifier with optimal parameters
- Evaluate on held-out test set (20% split)
- Visualize optimization history and hyperparameter importance

#### Expected Outputs
1. Best hyperparameters and cross-validation score
2. Final model test accuracy
3. Optimization history plot
4. Hyperparameter importance visualization

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from pathlib import Path
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from optuna.pruners import MedianPruner
from optuna.importance import get_param_importances

# Only show Optuna messages at WARNING level or above
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Read the raw game table from disk
print("Loading Data...")
df: pd.DataFrame = pd.read_csv("../data/csv/dataset.csv")
print("Data Loaded Successfully!")

# Parse the date column; unparseable values are coerced to NaT, which fails
# the cutoff comparison below, so those rows are dropped by the filter
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Restrict to games played on or after 2021-10-19 and renumber the rows
df = df.loc[lambda frame: frame["date"] >= "2021-10-19"].reset_index(drop=True)

# Remove the columns that are not used for training, then separate the
# target (winning_team) from the feature matrix
df = df.drop(columns=["date", "home_team", "away_team"])
y: pd.Series = df["winning_team"]
X: pd.DataFrame = df.drop(columns="winning_team")

# Hold out 20% of the games as a stratified test set. The split happens
# before any oversampling so that no synthetic sample reaches the test data.
print("Splitting the dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the minority class with SMOTE, fitted on the training split only
print("Applying SMOTE to balance training data...")
smote: SMOTE = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the class counts before and after resampling
original_counts: dict = y_train.value_counts().to_dict()
resampled_counts: dict = pd.Series(y_train_resampled).value_counts().to_dict()
print(f"Original training set: {original_counts}")
print(f"Resampled training set: {resampled_counts}")


# Optuna objective
def objective(trial) -> float:
    """
    Optuna objective function for optimizing an XGBoost classifier.

    Samples one hyperparameter configuration from the search space, evaluates
    it with five-fold cross-validation and returns the mean accuracy, which
    the study maximizes.

    SMOTE is applied *inside* each cross-validation split through an
    imbalanced-learn ``Pipeline``: the sampler is fitted on the training
    folds only and the validation fold is scored untouched. Cross-validating
    data that was oversampled beforehand would let synthetic points derived
    from validation rows leak into the training folds and would score the
    model on synthetic rows, inflating the accuracy the search optimizes.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Optuna trial object used to suggest hyperparameter values.

    Returns
    -------
    float
        Mean cross-validated accuracy score across all folds.

    Notes
    -----
    - Five-fold cross-validation is used to evaluate each trial.
    - Accuracy is used as the optimization metric.
    - The un-resampled training split (`X_train`, `y_train`) is expected to
      be available in the enclosing scope.
    - The model objective is binary classification using logistic loss.
    - The folds are evaluated in parallel via `n_jobs=-1`.
    """
    # Local import: imbalanced-learn is already a dependency of this file
    # (SMOTE), and importing here keeps the function self-contained.
    from imblearn.pipeline import Pipeline

    params: dict[str, int | float | str] = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 20),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 42,
    }

    # The imbalanced-learn Pipeline runs the sampler during fit only, so each
    # validation fold is predicted on its original, un-resampled rows.
    pipeline = Pipeline(
        steps=[
            ("smote", SMOTE(random_state=42)),
            ("model", XGBClassifier(**params)),
        ]
    )

    fold_scores: np.ndarray = cross_val_score(
        pipeline, X_train, y_train, scoring="accuracy", cv=5, n_jobs=-1
    )
    return float(np.mean(fold_scores))


# Create and run the study
print("\nStarting Optuna hyperparameter tuning...")
study: optuna.Study = optuna.create_study(
    direction="maximize",
    # Seed the sampler so the search is repeatable, in line with the fixed
    # random_state=42 used for the split, SMOTE and the model.
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE: MedianPruner only acts on intermediate values reported through
    # trial.report(); the objective in this file reports none, so no trial
    # is actually pruned. Kept so pruning starts working if reporting is added.
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10),
)
# Run the trials one at a time: each trial already evaluates its CV folds in
# parallel (n_jobs=-1 inside the objective), so running trials concurrently
# as well would oversubscribe the CPU, and a seeded sampler only gives
# repeatable suggestions when trials are executed sequentially.
study.optimize(objective, n_trials=500, n_jobs=1, show_progress_bar=True)

# Summarize the best trial: its CV accuracy followed by one line per tuned
# hyperparameter, framed by 60-character rules
best_trial = study.best_trial
rule: str = "=" * 60
report_lines: list[str] = [
    "\n" + rule,
    "BEST TRIAL RESULTS",
    rule,
    f"Best Cross-Validation Accuracy: {best_trial.value:.4f}",
    "\nOptimal Hyperparameters:",
    *(f"  {key:20s}: {value}" for key, value in best_trial.params.items()),
    rule,
]
print("\n".join(report_lines))

# Refit a single model on the resampled training data using the tuned values
print("\nTraining final model with optimal hyperparameters...")

# Tuned hyperparameters first, then the fixed settings that were not part of
# the search space
best_params: dict[str, int | float | str] = {
    **study.best_trial.params,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
}

final_model: XGBClassifier = XGBClassifier(**best_params)
final_model.fit(X_train_resampled, y_train_resampled)

# Score the held-out test set: hard labels plus the predicted probability of
# the second class (column 1 of predict_proba)
y_pred_proba: np.ndarray = final_model.predict_proba(X_test)[:, 1]
y_pred: np.ndarray = final_model.predict(X_test)

test_accuracy: float = accuracy_score(y_test, y_pred)
test_roc_auc: float = roc_auc_score(y_test, y_pred_proba)
# NOTE(review): the target names assume the labels sort as Away Win, then
# Home Win — confirm against the dataset's encoding of winning_team.
test_report: str = classification_report(
    y_test, y_pred, target_names=["Away Win", "Home Win"]
)

# Print the metrics between two 60-character rules
rule: str = "=" * 60
print("\n" + rule)
print("TEST SET EVALUATION")
print(rule)
print(f"Accuracy: {test_accuracy * 100:.2f}%")
print(f"ROC-AUC Score: {test_roc_auc:.4f}")
print("\nClassification Report:")
print(test_report)
print(rule)

# Set style for better-looking plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Every figure below is saved under models/. Create the directory up front so
# the first savefig does not fail with FileNotFoundError on a fresh checkout.
Path("models").mkdir(parents=True, exist_ok=True)

# Visualization 1: Optimization History
# Objective value of every trial in execution order, with the best value
# drawn as a dashed reference line
print("\nGenerating visualizations...")
plt.figure(figsize=(12, 6))
opt_history: list[float] = [trial.value for trial in study.trials]
plt.plot(opt_history, linewidth=2, alpha=0.7)
plt.axhline(
    y=study.best_trial.value,
    color="r",
    linestyle="--",
    label=f"Best: {study.best_trial.value:.4f}",
)
plt.xlabel("Trial", fontsize=14)
plt.ylabel("Cross-Validation Accuracy", fontsize=14)
plt.title("Optuna Optimization History", fontsize=16, fontweight="bold")
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("models/optimization_history.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 2: Hyperparameter Importance
# Horizontal bar chart of the importances Optuna computes for the study
param_importance: dict[str, float] = get_param_importances(study)
params: list[str] = list(param_importance)
importances: list[float] = list(param_importance.values())

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(params, importances, color="steelblue", alpha=0.8)
ax.set_xlabel("Importance", fontsize=14)
ax.set_ylabel("Hyperparameters", fontsize=14)
ax.set_title("Hyperparameter Importance", fontsize=16, fontweight="bold")
ax.grid(True, alpha=0.3, axis="x")
fig.tight_layout()
fig.savefig("models/hyperparameter_importance.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 3: Confusion Matrix
# Annotated heatmap of test-set counts (rows: true label, columns: predicted)
cm: np.ndarray = confusion_matrix(y_test, y_pred)
class_labels: list[str] = ["Away Win", "Home Win"]

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={"label": "Count"},
    ax=ax,
)
ax.set_title("Confusion Matrix", fontsize=16, fontweight="bold")
ax.set_ylabel("True Label", fontsize=14)
ax.set_xlabel("Predicted Label", fontsize=14)
fig.tight_layout()
fig.savefig("models/confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 4: ROC Curve
# Test-set ROC curve with its AUC in the legend, plotted against the
# diagonal of a random classifier
roc_auc: float = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.4f})")
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random Classifier")
ax.set_xlim([0.0, 1.0])
# Small headroom above 1.0 so the curve does not sit on the top border
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate", fontsize=14)
ax.set_ylabel("True Positive Rate", fontsize=14)
ax.set_title("Receiver Operating Characteristic (ROC) Curve", fontsize=16, fontweight="bold")
ax.legend(loc="lower right", fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig("models/roc_curve.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 5: Feature Importance
# Horizontal bar chart of the final model's largest feature importances
feature_importance: np.ndarray = final_model.feature_importances_
feature_names: pd.Index = X_train.columns

# Indices of the (up to) 20 largest importances, most important first
indices: np.ndarray = np.argsort(feature_importance)[::-1][:20]
bar_positions: np.ndarray = np.arange(len(indices))

plt.figure(figsize=(12, 10))
plt.barh(
    bar_positions,
    feature_importance[indices],
    color="teal",
    alpha=0.8,
)
plt.yticks(bar_positions, feature_names[indices].tolist())
plt.xlabel("Feature Importance", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.title("Top 20 Most Important Features", fontsize=16, fontweight="bold")
# Flip the axis so the most important feature is drawn at the top
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3, axis="x")
plt.tight_layout()
plt.savefig("models/feature_importance.png", dpi=300, bbox_inches="tight")
plt.show()

print("\nAll visualizations saved to models/ directory")
# The check mark had been stored as mojibake (UTF-8 bytes decoded as cp1252)
print("\n✓ Pipeline completed successfully!")