#### XGBoost with Optuna Hyperparameter Tuning

#### Overview
This notebook implements a machine learning pipeline to predict NBA game outcomes using XGBoost with automated hyperparameter optimization via Optuna.

#### Methodology

##### 1. Data Preparation
- Load NBA game dataset from CSV
- Split chronologically at October 1, 2024: earlier games form the training set, later games (the 2024-25 season) form the test set
- Remove non-predictive features (date, team names)
- Separate features (X) and target variable (winning_team)

##### 2. Hyperparameter Optimization
- Use Optuna framework for Bayesian optimization
- Search space includes:
  - Tree structure: `n_estimators`, `max_depth`, `min_child_weight`
  - Learning dynamics: `learning_rate`, `subsample`, `colsample_bytree`
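  - Regularization: `gamma`, `reg_alpha`, `reg_lambda`
  - Class balance: `scale_pos_weight` (searched within ±20% of the away-win / home-win ratio of the training set)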
- 500 trials with 5-fold cross-validation
- Minimize cross-validated log loss (implemented as maximizing scikit-learn's `neg_log_loss` score)

##### 3. Model Training & Evaluation
- Train final XGBoost classifier with optimal parameters
- Evaluate on the held-out 2024-25 season (games on or after October 1, 2024)
- Visualize optimization history and hyperparameter importance

#### Expected Outputs
1. Best hyperparameters and cross-validation score
2. Final model test accuracy
3. Optimization history plot
4. Hyperparameter importance visualization

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    log_loss,
    brier_score_loss,
)
from optuna.pruners import MedianPruner
from optuna.importance import get_param_importances
from sklearn.calibration import calibration_curve

# Suppress Optuna logging (only warnings and errors are emitted during the search)
optuna.logging.set_verbosity(optuna.logging.WARNING)

print("=" * 60)
print("NBA GAME PREDICTOR - FULL HYPERPARAMETER OPTIMIZATION")
print("=" * 60)
print("Strategy: Train on 2000-01 to 2023-24 seasons")
print("          Test on 2024-25 season (unseen data)")
print("=" * 60 + "\n")

# Load dataset
print("Loading Data...")
df: pd.DataFrame = pd.read_csv("../data/csv/dataset.csv")
print("Data Loaded Successfully!")

# Convert date column to datetime; errors="coerce" turns unparseable values into NaT
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# NaT compares False against the cutoff in both directions, so rows without a
# usable date land in neither split. Report them instead of losing them silently.
n_missing_dates = int(df["date"].isna().sum())
if n_missing_dates:
    print(
        f"WARNING: {n_missing_dates} rows have a missing or unparseable date "
        "and are excluded from both the training and test sets"
    )

# Split chronologically: everything before the cutoff trains the model,
# everything on or after it is held out for testing.
print("\nSplitting data by season...")
split_date = "2024-10-01"
train_df = df[df["date"] < split_date].reset_index(drop=True)
test_df = df[df["date"] >= split_date].reset_index(drop=True)

print(f"Training set: {len(train_df)} games (2000-01 to 2023-24)")
print(f"Test set: {len(test_df)} games (2024-25 season)")
print(f"Training date range: {train_df['date'].min()} to {train_df['date'].max()}")
print(f"Test date range: {test_df['date'].min()} to {test_df['date'].max()}")

# Columns that identify a game and are removed before modelling
id_columns = ["date", "home_team", "away_team"]

# Prepare training data: features are every remaining column except the target
train_df = train_df.drop(columns=id_columns)
X_train: pd.DataFrame = train_df.drop(columns="winning_team")
y_train: pd.Series = train_df["winning_team"]

# Prepare test data the same way so both frames share one feature layout
test_df = test_df.drop(columns=id_columns)
X_test: pd.DataFrame = test_df.drop(columns="winning_team")
y_test: pd.Series = test_df["winning_team"]

# Calculate class imbalance for scale_pos_weight.
# Label convention used throughout this script: 0 = away win, 1 = home win
# (the classification report names the classes "Away Win", "Home Win" in that order).
class_counts = y_train.value_counts()
away_wins = class_counts.loc[0]
home_wins = class_counts.loc[1]

# Ratio of negative (away-win) to positive (home-win) examples
scale_pos_weight_value = away_wins / home_wins

print(f"\nTraining set distribution: {class_counts.to_dict()}")
print(f"Class imbalance ratio: {scale_pos_weight_value:.3f}")
# The home win rate must use the count of label 1; label 0 is the away-win count.
print(f"Home win rate: {home_wins / len(y_train) * 100:.1f}%")

print(f"\nTest set distribution: {y_test.value_counts().to_dict()}")


# Optuna objective - scores one XGBoost configuration by cross-validated log loss
def objective(trial) -> float:
    """
    Evaluate one sampled XGBoost configuration for NBA game prediction.

    The search targets probability quality rather than raw accuracy: every
    configuration is scored with log loss, which rewards predicted win
    probabilities that match observed outcomes.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Optuna trial object used to suggest hyperparameter values.

    Returns
    -------
    float
        Mean ``neg_log_loss`` over the cross-validation folds. This is the
        negated log loss, so higher (closer to zero) is better and the study
        has to maximize it.

    Notes
    -----
    - Runs 5-fold cross-validation on the module-level ``X_train`` / ``y_train``.
    - No resampling is done; class imbalance is addressed only through
      ``scale_pos_weight``, searched within +/-20% of the module-level
      ``scale_pos_weight_value``.
    - NOTE(review): XGBoost's documentation cautions that re-weighting classes
      affects predicted probabilities; confirm that tuning ``scale_pos_weight``
      is wanted when log loss is the optimization target.
    - Hyperparameters are suggested in a fixed order; the order determines how
      they appear in ``trial.params``.
    """
    # Tree structure
    n_estimators = trial.suggest_int("n_estimators", 200, 800)
    max_depth = trial.suggest_int("max_depth", 4, 8)

    # Step size, sampled on a log scale
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2, log=True)

    # Row and column sampling per tree
    subsample = trial.suggest_float("subsample", 0.7, 0.95)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 0.95)

    # Regularization: split-gain threshold plus L1 / L2 penalties
    gamma = trial.suggest_float("gamma", 0, 3)
    reg_alpha = trial.suggest_float("reg_alpha", 0.001, 3, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 0.1, 15, log=True)

    # Minimum sum of instance weight (hessian) required in a child node
    min_child_weight = trial.suggest_int("min_child_weight", 3, 10)

    # Positive-class weight, searched around the observed class ratio
    weight_low = scale_pos_weight_value * 0.8
    weight_high = scale_pos_weight_value * 1.2
    scale_pos_weight = trial.suggest_float("scale_pos_weight", weight_low, weight_high)

    classifier: XGBClassifier = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        min_child_weight=min_child_weight,
        scale_pos_weight=scale_pos_weight,
        # Fixed settings, identical for every trial
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
    )

    # scikit-learn scorers follow "greater is better", hence the negated log loss
    fold_scores: np.ndarray = cross_val_score(
        classifier,
        X_train,
        y_train,
        scoring="neg_log_loss",
        cv=5,
        n_jobs=-1,
    )

    return float(fold_scores.mean())


# Create and run the Optuna study
print("\nStarting Optuna hyperparameter tuning...")
print("Optimizing on 2000-2024 training data (5-fold CV)...\n")

# NOTE(review): objective() never reports intermediate values to the trial, so
# this pruner has nothing to act on and no trial is expected to be pruned.
median_pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=10)

# The objective returns negative log loss, so maximizing it minimizes log loss
study: optuna.Study = optuna.create_study(direction="maximize", pruner=median_pruner)
study.optimize(
    objective,
    n_trials=500,
    n_jobs=-1,
    show_progress_bar=True,
)

# Report the winning trial; the sign is flipped back to a plain log loss
banner = "=" * 60
print("\n" + banner)
print("BEST TRIAL RESULTS")
print(banner)
print(f"Best Cross-Validation Log Loss: {-study.best_trial.value:.4f}")
print("\nOptimal Hyperparameters:")
for param_name, param_value in study.best_trial.params.items():
    print(f"  {param_name:20s}: {param_value}")
print(banner)

# Fit the final model on the complete training set using the best trial's settings
print("\nTraining final model with optimal hyperparameters...")
print("Using all 2000-2024 training data...")

# Tuned values from the best trial plus the same fixed settings used during the search
best_params: dict[str, int | float | str] = {
    **study.best_trial.params,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
}

final_model: XGBClassifier = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Score the held-out season, which played no part in tuning or fitting
print("\nEvaluating on 2024-25 season (unseen test data)...")
y_pred: np.ndarray = final_model.predict(X_test)
class_probabilities: np.ndarray = final_model.predict_proba(X_test)
y_pred_proba: np.ndarray = class_probabilities[:, 1]  # second column: class 1 probability

# Hard-label metric
test_accuracy = accuracy_score(y_test, y_pred)
# Probability-based metrics
test_roc_auc = roc_auc_score(y_test, y_pred_proba)
test_log_loss = log_loss(y_test, y_pred_proba)
test_brier = brier_score_loss(y_test, y_pred_proba)

# Summary table
rule = "=" * 60
print("\n" + rule)
print("2024-25 SEASON TEST SET EVALUATION (UNSEEN DATA)")
print(rule)
print(f"Accuracy: {test_accuracy * 100:.2f}%")
print(f"ROC-AUC Score: {test_roc_auc:.4f}")
print(f"Log Loss: {test_log_loss:.4f}")
print(f"Brier Score: {test_brier:.4f} (lower is better)")
print("\nClassification Report:")
report_text = classification_report(
    y_test, y_pred, target_names=["Away Win", "Home Win"]
)
print(report_text)
print(rule)

# Probability calibration analysis: for each band of predicted probability,
# show how often the positive class (label 1) actually occurred.
print("\nProbability Calibration Analysis (2024-25 Test Set):")
print("-" * 60)
prob_bins = [0, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
y_test_values = y_test.to_numpy()  # positional array, matches y_pred_proba element-wise
top_edge = prob_bins[-1]
for lower, upper in zip(prob_bins[:-1], prob_bins[1:]):
    # Bins are half-open [lower, upper), except the last one, which is closed
    # so that a predicted probability of exactly 1.0 is still counted.
    if upper == top_edge:
        mask = (y_pred_proba >= lower) & (y_pred_proba <= upper)
    else:
        mask = (y_pred_proba >= lower) & (y_pred_proba < upper)

    n_in_bin = int(mask.sum())
    if n_in_bin == 0:
        continue  # nothing predicted in this band

    actual_rate = y_test_values[mask].mean()
    print(f"Predicted {lower:.1f}-{upper:.1f}: "
          f"Actual win rate = {actual_rate:.2%} (n={n_in_bin})")

# Set style for better-looking plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Visualization 1: Optimization History
print("\nGenerating visualizations...")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Only completed trials carry a value; failed or pruned trials have value None
# and would break the sign flip below.
completed_trials = study.get_trials(
    deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
)
trial_numbers: list[int] = [t.number for t in completed_trials]
best_cv_log_loss = -study.best_trial.value

# Left plot: Log loss over trials (objective values are negated log loss)
opt_history: list[float] = [-t.value for t in completed_trials]
ax1.plot(trial_numbers, opt_history, linewidth=2, alpha=0.7)
ax1.axhline(
    y=best_cv_log_loss,
    color="r",
    linestyle="--",
    label=f"Best: {best_cv_log_loss:.4f}",
)
ax1.set_xlabel("Trial", fontsize=14)
ax1.set_ylabel("Cross-Validation Log Loss", fontsize=14)
ax1.set_title("Optuna Optimization History", fontsize=16, fontweight="bold")
ax1.legend(fontsize=12)
ax1.grid(True, alpha=0.3)

# Right plot: Best value progression (running minimum, computed in one pass)
best_values = np.minimum.accumulate(opt_history).tolist()
ax2.plot(trial_numbers, best_values, linewidth=2, alpha=0.7, color='green')
ax2.set_xlabel("Trial", fontsize=14)
ax2.set_ylabel("Best Log Loss So Far", fontsize=14)
ax2.set_title("Best Value Progression", fontsize=16, fontweight="bold")
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("optimization_history.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 2: Hyperparameter Importance
# Mapping of hyperparameter name -> importance score as computed by Optuna
param_importance: dict[str, float] = get_param_importances(study)
params: list[str] = list(param_importance)
importances: list[float] = [param_importance[name] for name in params]

importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(params, importances, color="steelblue", alpha=0.8)
importance_ax.set_title("Hyperparameter Importance", fontsize=16, fontweight="bold")
importance_ax.set_xlabel("Importance", fontsize=14)
importance_ax.set_ylabel("Hyperparameters", fontsize=14)
importance_ax.grid(True, alpha=0.3, axis="x")
importance_fig.tight_layout()
importance_fig.savefig("hyperparameter_importance.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 3: Confusion Matrix
# Rows are true labels, columns are predicted labels
cm: np.ndarray = confusion_matrix(y_test, y_pred)
outcome_labels = ["Away Win", "Home Win"]

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=outcome_labels,
    yticklabels=outcome_labels,
    cbar_kws={"label": "Count"},
    ax=cm_ax,
)
cm_ax.set_title("Confusion Matrix (2024-25 Test Season)", fontsize=16, fontweight="bold")
cm_ax.set_ylabel("True Label", fontsize=14)
cm_ax.set_xlabel("Predicted Label", fontsize=14)
cm_fig.tight_layout()
cm_fig.savefig("confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 4: ROC Curve
# The thresholds returned by roc_curve are not needed for the plot
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_label = f"ROC curve (AUC = {test_roc_auc:.4f})"
roc_ax.plot(fpr, tpr, color="darkorange", lw=2, label=roc_label)
# Diagonal reference line for a classifier with no ranking ability
roc_ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random Classifier")
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel("False Positive Rate", fontsize=14)
roc_ax.set_ylabel("True Positive Rate", fontsize=14)
roc_ax.set_title("ROC Curve (2024-25 Test Season)", fontsize=16, fontweight="bold")
roc_ax.legend(loc="lower right", fontsize=12)
roc_ax.grid(True, alpha=0.3)
roc_fig.tight_layout()
roc_fig.savefig("roc_curve.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 5: Calibration Curve (Critical for betting!)
calib_fig, calib_ax = plt.subplots(figsize=(10, 8))

# Observed positive rate vs. mean predicted probability in 10 equal-width bins
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test,
    y_pred_proba,
    n_bins=10,
    strategy='uniform',
)

# Model curve, labelled with the Brier score computed on the same test set
calib_ax.plot(
    mean_predicted_value,
    fraction_of_positives,
    "s-",
    label=f"XGBoost (Brier: {test_brier:.4f})",
    linewidth=2,
    markersize=8,
)
# Diagonal: predicted probability equals observed frequency
calib_ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration", linewidth=2)

calib_ax.set_xlabel("Mean Predicted Probability", fontsize=14)
calib_ax.set_ylabel("Fraction of Positives (Actual)", fontsize=14)
calib_ax.set_title("Probability Calibration Curve (2024-25 Test)", fontsize=16, fontweight="bold")
calib_ax.legend(loc="upper left", fontsize=12)
calib_ax.grid(True, alpha=0.3)
calib_fig.tight_layout()
calib_fig.savefig("calibration_curve.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 6: Feature Importance
feature_importance: np.ndarray = final_model.feature_importances_
feature_names: pd.Index = X_train.columns

# Positions of the 20 largest importances, largest first
indices: np.ndarray = np.argsort(feature_importance)[::-1][:20]
bar_positions = range(len(indices))
top_feature_labels = [feature_names[i] for i in indices]

feat_fig, feat_ax = plt.subplots(figsize=(12, 10))
feat_ax.barh(
    bar_positions,
    feature_importance[indices],
    color="teal",
    alpha=0.8,
)
feat_ax.set_yticks(bar_positions)
feat_ax.set_yticklabels(top_feature_labels)
feat_ax.set_xlabel("Feature Importance", fontsize=14)
feat_ax.set_ylabel("Features", fontsize=14)
feat_ax.set_title("Top 20 Most Important Features", fontsize=16, fontweight="bold")
# Flip the axis so the most important feature is drawn at the top
feat_ax.invert_yaxis()
feat_ax.grid(True, alpha=0.3, axis="x")
feat_fig.tight_layout()
feat_fig.savefig("feature_importance.png", dpi=300, bbox_inches="tight")
plt.show()

# Visualization 7: Prediction Confidence Distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Histogram settings shared by every series drawn below
hist_style = {"bins": 30, "alpha": 0.7, "edgecolor": 'black'}

# Left panel: all predicted probabilities, with the 0.5 cut-off marked
ax1.hist(y_pred_proba, color='steelblue', **hist_style)
ax1.axvline(0.5, color='red', linestyle='--', linewidth=2, label='Decision Threshold')
ax1.set_title("Distribution of Predicted Probabilities", fontsize=16, fontweight="bold")
ax1.set_xlabel("Predicted Probability (Home Win)", fontsize=14)
ax1.set_ylabel("Frequency", fontsize=14)
ax1.legend(fontsize=12)
ax1.grid(True, alpha=0.3)

# Right panel: the same probabilities split by whether the hard prediction was right
correct_mask = (y_pred == y_test)
proba_when_correct = y_pred_proba[correct_mask]
proba_when_wrong = y_pred_proba[~correct_mask]
ax2.hist(proba_when_correct, color='green', label='Correct Predictions', **hist_style)
ax2.hist(proba_when_wrong, color='red', label='Incorrect Predictions', **hist_style)
ax2.axvline(0.5, color='black', linestyle='--', linewidth=2)
ax2.set_title("Prediction Confidence: Correct vs Incorrect", fontsize=16, fontweight="bold")
ax2.set_xlabel("Predicted Probability (Home Win)", fontsize=14)
ax2.set_ylabel("Frequency", fontsize=14)
ax2.legend(fontsize=12)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("prediction_confidence.png", dpi=300, bbox_inches="tight")
plt.show()

# Closing summary: restate the headline test metrics and suggested follow-ups
summary_rule = "=" * 60

print("\nAll visualizations saved successfully!")
print("\n" + summary_rule)
print("KEY INSIGHTS FOR NBA BETTING")
print(summary_rule)
print(f"• Model trained on 24 years (2000-2024): {len(X_train)} games")
print(f"• Test accuracy on 2024-25 season: {test_accuracy*100:.1f}%")
print(f"• ROC-AUC of {test_roc_auc:.3f} shows ranking ability")
print(f"• Brier score of {test_brier:.4f} indicates probability calibration")
# The remaining lines have no placeholders, so plain strings are sufficient
print("  (Lower Brier = better calibrated probabilities)")
print("\nNext Steps:")
print("• This model is now ready for 2025-26 predictions")
print("• Monitor calibration - your probabilities should match reality")
print("• For betting: focus on games with 60%+ confidence")
print(summary_rule)

print("\n✓ Pipeline completed successfully!")