# In [None]:
"""
XGBoost Hyperparameter Tuning with Optuna
Uses final_train_v3.csv and final_test_v3.csv
Outputs:
 - best_xgb_model.json
 - optuna_xgb_study.pkl
 - tuned_predictions_plot.png
 - tuned_metrics.txt
"""

import pandas as pd
import numpy as np
import optuna
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt
import pickle

# Every input and output artifact lives in the same sibling data directory.
_DATA_DIR = Path("../data")
TRAIN_PATH = _DATA_DIR / "final_train_v3.csv"
TEST_PATH = _DATA_DIR / "final_test_v3.csv"
MODEL_OUT = _DATA_DIR / "best_xgb_model.json"
STUDY_OUT = _DATA_DIR / "optuna_xgb_study.pkl"
PLOT_OUT = _DATA_DIR / "tuned_predictions_plot.png"
METRIC_OUT = _DATA_DIR / "tuned_metrics.txt"

print("Loading data...")
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# The target plus the blacklisted columns are kept out of the feature matrix;
# every other column of the training frame is used as a feature.
TARGET = "Total_Purchases"
blacklist = ["Date", "products_grouped", "Product_Category", "Country"]
_excluded = {*blacklist, TARGET}
features = [col for col in train.columns if col not in _excluded]

# The test frame is indexed with the feature list derived from the training
# frame, so both matrices share the same columns in the same order.
X_train, y_train = train[features], train[TARGET]
X_test, y_test = test[features], test[TARGET]

# xgb.train (native API) consumes DMatrix objects rather than DataFrames.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

print(f"Training rows: {len(X_train)}, Test rows: {len(X_test)}")
print(f"Feature count: {len(features)}")

# -----------------------------------------------------
# Optuna objective
# -----------------------------------------------------

def objective(trial):
    """Train one XGBoost candidate and return its MAE for Optuna to minimize.

    Hyperparameters (learning rate, tree depth, row/column subsampling and the
    ``lambda``/``alpha`` regularization terms) are sampled from ``trial``. The
    booster is trained on the module-level ``dtrain`` for up to 2000 rounds,
    with early stopping (patience 100) monitored on ``dtest``, and is scored
    against ``y_test``.

    NOTE(review): ``dtest`` drives both early stopping and the score returned
    to Optuna, so the tuned MAE is selected on the same rows it is later
    reported on and will be optimistic. Consider carving a separate validation
    split out of the training data.

    Args:
        trial: The Optuna trial used to sample this candidate's parameters.

    Returns:
        Mean absolute error on ``dtest``, computed with the trees up to and
        including the best early-stopping iteration.
    """
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "tree_method": "hist",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "lambda": trial.suggest_float("lambda", 0.0, 5.0),
        "alpha": trial.suggest_float("alpha", 0.0, 5.0),
    }

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtest, "valid")],
        early_stopping_rounds=100,
        verbose_eval=False,
    )

    # With early stopping the returned booster still holds the trees trained
    # after the best round. Pin prediction to the best iteration explicitly so
    # the score does not depend on the installed XGBoost version's default.
    best_rounds = model.best_iteration + 1
    preds = model.predict(dtest, iteration_range=(0, best_rounds))
    return mean_absolute_error(y_test, preds)

# -----------------------------------------------------
# Run Optuna Study
# -----------------------------------------------------
print("Starting Optuna tuning... this may take time.")

# Lower MAE is better, so the study minimizes the objective's return value.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=40, show_progress_bar=True)

print("Best MAE:", study.best_value)
print("Best params:", study.best_params)

# Persist the whole study object so the trial history can be reloaded later.
with STUDY_OUT.open("wb") as f:
    pickle.dump(study, f)

# -----------------------------------------------------
# Train final model using best params
# -----------------------------------------------------
# study.best_params only holds the sampled values; the fixed training settings
# used inside the objective have to be added back before retraining.
best_params = {
    **study.best_params,
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "tree_method": "hist",
}

print("Training final XGBoost model with best parameters...")
# Same early-stopping setup as during tuning, with a higher round ceiling.
best_model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=3000,
    evals=[(dtest, "valid")],
    early_stopping_rounds=100,
    verbose_eval=False,
)

best_model.save_model(str(MODEL_OUT))

# -----------------------------------------------------
# Evaluate final model
# -----------------------------------------------------
# The booster was trained with early stopping, so it still contains the trees
# built after the best round. Predict with the best iteration explicitly so the
# reported metrics describe the early-stopped model on any XGBoost version.
preds = best_model.predict(
    dtest, iteration_range=(0, best_model.best_iteration + 1)
)
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
# "Accuracy" is 1 - MAE / mean(target), floored at 0. The small epsilon keeps
# the division defined when the target mean is zero.
accuracy = max(0.0, 1 - (mae / (y_test.mean() + 1e-9)))

print(f"Final Tuned MAE: {mae:.4f}")
print(f"Final Tuned RMSE: {rmse:.4f}")
print(f"Final Tuned Accuracy: {accuracy:.4f}")

# Metrics are written at full precision; the console output above is rounded.
with open(METRIC_OUT, "w", encoding="utf-8") as f:
    f.write(f"MAE: {mae}\nRMSE: {rmse}\nAccuracy: {accuracy}\n")

# -----------------------------------------------------
# Plot Predictions
# -----------------------------------------------------
# Overlay actual and predicted values for the first 200 test rows only, which
# keeps the line plot readable.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.values[:200], label="Actual")
ax.plot(preds[:200], label="Predicted")
ax.legend()
ax.set_title("Tuned XGBoost: Actual vs Predicted (first 200)")
ax.set_xlabel("Index")
ax.set_ylabel("Total Purchases")
fig.tight_layout()
fig.savefig(PLOT_OUT)

# The check mark was mis-decoded (UTF-8 read as cp1252) into "âœ”"; restored.
print("Tuned XGBoost training complete ✔")
print(f"Saved best model -> {MODEL_OUT}")
print(f"Saved study -> {STUDY_OUT}")
print(f"Saved metrics -> {METRIC_OUT}")
print(f"Saved plot -> {PLOT_OUT}")