# In [None]:
"""
XGBoost Training Script
Uses final_train_v3.csv and final_test_v3.csv
Outputs:
 - model_xgb.json
 - xgb_metrics.txt
 - xgb_feature_importance.csv
 - actual_vs_pred_xgb.png
"""

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt

# Input datasets read by this script.
TRAIN_PATH, TEST_PATH = map(
    Path,
    ("../data/final_train_v3.csv", "../data/final_test_v3.csv"),
)

# Artifacts written by this script: model, metrics, importances, plot.
MODEL_OUT, METRIC_OUT, FI_OUT, PLOT_OUT = map(
    Path,
    (
        "../data/model_xgb.json",
        "../data/xgb_metrics.txt",
        "../data/xgb_feature_importance.csv",
        "../data/actual_vs_pred_xgb.png",
    ),
)

# Read both splits up front; every later step works off these two frames.
print("Loading train/test...")
train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

TARGET = "Total_Purchases"
# Columns kept out of the feature matrix (the target is excluded as well).
blacklist = ["Date", "products_grouped", "Product_Category", "Country"]
non_features = {TARGET, *blacklist}
# The feature list is derived from the training frame's columns, in order.
features = [col for col in train.columns if col not in non_features]

# The same feature list is applied to both splits.
X_train, y_train = train[features], train[TARGET]
X_test, y_test = test[features], test[TARGET]

print(f"Training rows: {len(X_train)}, Test rows: {len(X_test)}")
print(f"Features used: {len(features)}")

# -----------------------
# XGBoost Parameters
# -----------------------
# Keyword arguments later unpacked into the XGBRegressor constructor.
params = dict(
    objective="reg:squarederror",
    eval_metric="mae",
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.9,
    n_estimators=1500,
    tree_method="hist",
)

# -----------------------
# Train Model
# -----------------------
print("Training XGBoost...")
# early_stopping_rounds is an estimator setting: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0 (fit() raises TypeError there),
# so it is supplied to the constructor instead. Requires xgboost >= 1.6.
model = xgb.XGBRegressor(**params, early_stopping_rounds=100)
# Training stops once the eval-set metric has not improved for 100 rounds;
# verbose=50 logs the eval metric every 50 boosting rounds.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so metrics reported on it afterwards are optimistically biased. Consider
# holding out part of the training data for eval_set instead.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50)

# -----------------------
# Predictions
# -----------------------
preds = model.predict(X_test)

# Error metrics on the test split.
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)

# "Accuracy" here is 1 minus MAE relative to the mean target, floored at 0.
# The small epsilon guards against division by an exactly-zero mean.
mean_target = y_test.mean() + 1e-9
accuracy = max(0.0, 1 - (mae / mean_target))

print(f"Accuracy: {accuracy:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

# Persist the unrounded values, one metric per line.
METRIC_OUT.write_text(f"Accuracy: {accuracy}\nMAE: {mae}\nRMSE: {rmse}\n")

# -----------------------
# Feature Importance
# -----------------------
# Pair each feature name with the fitted model's importance score.
fi = pd.DataFrame(
    {"feature": features, "importance": model.feature_importances_}
)
# Highest importance first.
fi = fi.sort_values(by="importance", ascending=False)
fi.to_csv(FI_OUT, index=False)

# -----------------------
# Actual vs Predicted Plot
# -----------------------
# Overlay the first 200 test targets and their predictions on one axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.values[:200], label='Actual')
ax.plot(preds[:200], label='Predicted')
ax.set_title('XGBoost: Actual vs Predicted (first 200)')
ax.set_xlabel('Index')
ax.set_ylabel('Total Purchases')
ax.legend()
fig.tight_layout()
fig.savefig(PLOT_OUT)

# -----------------------
# Save model
# -----------------------
# save_model takes a filename; the Path is converted to str explicitly.
model.save_model(str(MODEL_OUT))
# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
# It is written as an escape so the source stays ASCII and cannot be
# re-garbled by a future encoding round-trip.
print("XGBoost training complete \u2714")
print(f"Saved -> {MODEL_OUT}")
print(f"Saved metrics -> {METRIC_OUT}")
print(f"Saved feature importance -> {FI_OUT}")
print(f"Saved plot -> {PLOT_OUT}")