XGBOOST

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
  accuracy_score, classification_report, roc_auc_score,
  roc_curve, f1_score
)

In [None]:
def run_xgboost_booster_pipeline(
  X_train, y_train,
  X_val, y_val,
  X_test, y_test,
  feature_names,
  learning_rate=0.05,
  max_depth=5,
  num_boost_round=100,
  early_stopping_rounds=10,
  seed=42
):
  """Train a binary XGBoost booster, tune a decision threshold, and evaluate it.

  The pipeline:
    1. wraps the three splits in ``xgb.DMatrix`` objects,
    2. trains with ``objective='binary:logistic'`` and ``eval_metric='auc'``,
       early-stopping on the validation split (the last entry of ``evals``),
    3. predicts probabilities using only the trees up to the best iteration,
    4. scans 101 thresholds in [0, 1] and keeps the one with the highest
       validation F1 (ties resolve to the lowest threshold via ``np.argmax``),
    5. reports accuracy, a classification report and ROC-AUC on the test
       split, shows a ROC curve, and prints gain-based feature importances.

  Args:
    X_train, X_val, X_test: Feature matrices. Objects exposing ``.values``
      (e.g. pandas DataFrames) are unwrapped to their underlying array.
    y_train, y_val, y_test: Binary labels for the matching split; objects
      exposing ``.values`` are unwrapped the same way.
    feature_names: Feature names forwarded to every ``xgb.DMatrix``.
    learning_rate: Value of the ``learning_rate`` booster parameter.
    max_depth: Value of the ``max_depth`` booster parameter.
    num_boost_round: Maximum number of boosting rounds.
    early_stopping_rounds: Forwarded to ``xgb.train`` as the early-stopping
      patience on the validation metric.
    seed: Value of the ``seed`` booster parameter.

  Returns:
    A ``(model_xgb, results_xgb)`` tuple. ``model_xgb`` is the trained
    ``xgb.Booster``. ``results_xgb`` is a dict with keys ``val_probs``,
    ``test_probs``, ``test_preds``, ``accuracy``, ``auc``,
    ``best_threshold``, ``fpr``, ``tpr``, ``report_df`` and
    ``feature_importance``.

  Side effects:
    Prints metrics to stdout and displays a matplotlib figure via
    ``plt.show()``.
  """
  def _unwrap(data):
    # pandas objects expose the raw array as ``.values``; anything else is
    # passed through untouched.
    return data.values if hasattr(data, "values") else data

  # === Convert to NumPy (if needed) ===
  X_train_np, y_train_np = _unwrap(X_train), _unwrap(y_train)
  X_val_np, y_val_np = _unwrap(X_val), _unwrap(y_val)
  X_test_np, y_test_np = _unwrap(X_test), _unwrap(y_test)

  # === Create DMatrix ===
  dtrain = xgb.DMatrix(X_train_np, label=y_train_np, feature_names=feature_names)
  dval = xgb.DMatrix(X_val_np, label=y_val_np, feature_names=feature_names)
  dtest = xgb.DMatrix(X_test_np, label=y_test_np, feature_names=feature_names)

  # === XGBoost Parameters ===
  params = {
      'objective': 'binary:logistic',
      'eval_metric': 'auc',
      'learning_rate': learning_rate,
      'max_depth': max_depth,
      'seed': seed,
      'verbosity': 1
  }

  # === Train XGBoost model with Early Stopping ===
  # The validation set must stay last in ``evals``: XGBoost early-stops on
  # the final entry of that list.
  model_xgb = xgb.train(
      params,
      dtrain,
      num_boost_round=num_boost_round,
      evals=[(dtrain, 'train'), (dval, 'val')],
      early_stopping_rounds=early_stopping_rounds
  )

  # === Predict Probabilities ===
  # ``xgb.train`` may hand back the booster from the *last* round rather than
  # the best one (XGBoost < 2.0), so restrict prediction to the trees up to
  # and including the best iteration. ``getattr`` with a default covers
  # boosters that do not expose ``best_iteration`` (e.g. early stopping
  # disabled), in which case all trees are used as before.
  best_iteration = getattr(model_xgb, "best_iteration", None)
  predict_kwargs = {}
  if best_iteration is not None:
    predict_kwargs["iteration_range"] = (0, int(best_iteration) + 1)

  val_probs = model_xgb.predict(dval, **predict_kwargs)
  test_probs = model_xgb.predict(dtest, **predict_kwargs)

  # === Threshold Search (F1 optimized) ===
  # The threshold is tuned on the validation split only, so the test metrics
  # below are not biased by the search. ``zero_division=0`` keeps F1 at 0.0
  # for thresholds that predict no positives without emitting a warning.
  thresholds = np.linspace(0, 1, 101)
  f1s = [
      f1_score(y_val_np, (val_probs >= t).astype(int), zero_division=0)
      for t in thresholds
  ]
  best_threshold = thresholds[np.argmax(f1s)]
  print(f"\n✅ Best threshold (F1): {best_threshold:.2f}")

  # === Evaluate ===
  test_preds = (test_probs >= best_threshold).astype(int)
  accuracy = accuracy_score(y_test_np, test_preds)
  auc = roc_auc_score(y_test_np, test_probs)
  report_str = classification_report(y_test_np, test_preds, target_names=["Class 0", "Class 1"])
  report_df = pd.DataFrame(classification_report(y_test_np, test_preds, output_dict=True)).T

  print(f"\n🔎 Test Accuracy: {accuracy:.2f}")
  print("Classification Report:")
  print(report_str)
  print(f"AUC-ROC (Test): {auc:.2f}")

  # === ROC Curve ===
  fpr, tpr, _ = roc_curve(y_test_np, test_probs)
  plt.figure(figsize=(6, 4))
  plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
  plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
  plt.xlabel("False Positive Rate")
  plt.ylabel("True Positive Rate")
  plt.title("ROC Curve - XGBoost Model")
  plt.legend()
  plt.grid(True)
  plt.tight_layout()
  plt.show()

  # === Feature Importance (Gain) ===
  # ``get_score`` returns a {feature: score} dict; it is turned into a
  # two-column frame sorted from most to least important.
  importance_dict = model_xgb.get_score(importance_type='gain')
  feature_importance = pd.DataFrame({
      'Feature': list(importance_dict.keys()),
      'Importance': list(importance_dict.values())
  }).sort_values(by='Importance', ascending=False)

  print("\n📊 Top 10 Important Features (XGBoost):")
  print(feature_importance.head(10))

  # === Return results ===
  results_xgb = {
      'val_probs': val_probs,
      'test_probs': test_probs,
      'test_preds': test_preds,
      'accuracy': accuracy,
      'auc': auc,
      'best_threshold': best_threshold,
      'fpr': fpr,
      'tpr': tpr,
      'report_df': report_df,
      'feature_importance': feature_importance
  }

  return model_xgb, results_xgb
