# In [None]:

# # SHAP Comparison: sklearn vs XGBoost Gradient Boosting on Peacock Renewal Data

# 📦 Install dependencies
# NOTE: the leading "!" is notebook shell-escape syntax, so this line only runs
# inside an IPython/Jupyter/Colab session, not as a plain Python script.
# No versions are pinned, so whatever releases are current at install time
# are the ones the rest of the notebook runs against.
!pip install shap xgboost scikit-learn pandas matplotlib seaborn

# 📥 Upload the dataset: peacock_user_data_with_renewed_and_propensity.csv
# Requires the Google Colab runtime, since `google.colab` is imported here.
# The return value is kept in `uploaded` but is not referenced again in the
# visible code; the CSV is later read from disk by file name, so the uploaded
# file must be named exactly as above.
# NOTE(review): presumably files.upload() also writes the file into the
# working directory -- confirm against the Colab documentation.
from google.colab import files
uploaded = files.upload()

# 📊 Load data
import pandas as pd
import shap
import xgboost as xgb
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# Read the renewal dataset from the working directory.
df = pd.read_csv("peacock_user_data_with_renewed_and_propensity.csv")

# Target column: the label both classifiers are trained to predict.
y = df["renewed"]

# Every remaining column is a predictor except the ones listed here, which
# are removed so that neither the label itself nor the id / promo / propensity
# columns are fed to the models.
non_feature_columns = ["user_id", "assigned_promo", "renewed", "propensity_score"]
X = df.drop(columns=non_feature_columns)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Preprocessing
# Columns that are standardised (zero mean, unit variance) by StandardScaler.
numeric_features = ["tenure_months", "prior_engagement_score", "weekly_watch_hours", "num_devices"]
# Columns that are one-hot encoded. handle_unknown="ignore" makes categories
# that appear only in the test split encode as all-zeros instead of raising.
categorical_features = ["device_type", "payment_method", "account_type", "region", "has_kids_profile", "promo_eligible"]

preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    # OneHotEncoder emits a sparse matrix, and by default ColumnTransformer
    # returns sparse or dense output depending on the overall density of the
    # stacked result. Force a dense array so the container type does not
    # depend on how many categories the data happens to contain; the SHAP
    # steps further down slice and mask this matrix as a dense array.
    sparse_threshold=0,
)

# Fit on the training split only, then apply the fitted transform to the test
# split so no test-set statistics leak into scaling or encoding.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Column names of the transformed matrix, in output order; used to label the
# SHAP plots.
feature_names = preprocessor.get_feature_names_out()

# ✅ Train sklearn GradientBoostingClassifier
# Both models use the same number of trees, tree depth and seed so the
# comparison is between implementations rather than between configurations.
sk_gbm = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=0)
sk_gbm.fit(X_train_proc, y_train)

# ✅ Train XGBoost
# `use_label_encoder` is intentionally not passed: it is deprecated in current
# XGBoost releases and only triggers an "unused parameter" warning there.
xgb_gbm = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    eval_metric="logloss",
    random_state=0,
)
xgb_gbm.fit(X_train_proc, y_train)

# 🎯 Evaluate both
# Same report for each fitted model: per-class precision/recall/F1 from the
# hard predictions, then ROC AUC from the second predict_proba column.
for heading, fitted_model in (("Sklearn GBM:", sk_gbm), ("\nXGBoost GBM:", xgb_gbm)):
    print(heading)
    predicted_labels = fitted_model.predict(X_test_proc)
    print(classification_report(y_test, predicted_labels))
    positive_scores = fitted_model.predict_proba(X_test_proc)[:, 1]
    print("AUC:", roc_auc_score(y_test, positive_scores))

# 🔍 SHAP for sklearn GBM
# Passing the bound predict_proba function (rather than the model object)
# makes SHAP explain the predicted probabilities, using the processed test
# matrix as background data.
explainer_sk = shap.Explainer(sk_gbm.predict_proba, X_test_proc, feature_names=feature_names)
shap_values_sk = explainer_sk(X_test_proc)

# predict_proba returns one column per class, so the Explanation carries an
# extra output axis (samples x features x classes). The beeswarm plot only
# accepts a 2-D (samples x features) Explanation, so select a single class
# first: index 1, the same probability column used for the AUC above.
shap_values_sk_positive = shap_values_sk[:, :, 1]

# show=False defers rendering so a title can be added before plt.show().
shap.plots.beeswarm(shap_values_sk_positive, max_display=10, show=False)
plt.title("SHAP Summary - Sklearn GBM")
plt.show()

# 🔍 SHAP for XGBoost
# The fitted model object itself is handed to SHAP here, with the processed
# test matrix as background data and the transformed column names as labels.
explainer_xgb = shap.Explainer(
    xgb_gbm,
    X_test_proc,
    feature_names=feature_names,
)
shap_values_xgb = explainer_xgb(X_test_proc)

# Draw the ten highest-ranked features; show=False defers rendering so the
# title can be attached before the figure is displayed.
shap.plots.beeswarm(
    shap_values_xgb,
    show=False,
    max_display=10,
)
plt.title("SHAP Summary - XGBoost")
plt.show()
