In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import shap
import numpy as np

# Assumes `df` is an already-loaded DataFrame that contains the columns
# referenced below.
# Encode the target column Supporter as 0/1 (adjust the labels to the actual
# values in the data). The mapping also accepts already-encoded 0/1 values so
# that re-running this cell does not turn the whole column into NaN.
supporter_mapping = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
supporter_encoded = df['Supporter'].map(supporter_mapping)

# Fail early on labels the mapping does not know, instead of silently
# producing NaN targets that only surface later in the split or the fit.
unmapped_labels = df.loc[
    supporter_encoded.isna() & df['Supporter'].notna(), 'Supporter'
]
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected values in 'Supporter': "
        f"{sorted(unmapped_labels.astype(str).unique())}"
    )
df['Supporter'] = supporter_encoded

# Features and target
feature_cols = [
    'Region', 'Digital Only', 'Email Opt-Out',
    'Previous Year Total Information Calls',
    'Previous Year Total Value Calls',
    'Previous Year Total Meetings Virtual',
    'Previous Year Total Meetings in Person',
    'Previous Year Total Group Meetings Virtual',
    'Previous Year Total Group Meetings in Person',
    'Previous Year Total Sales Emails',
    'purchase_recency', 'redemption_recency',
    'redemption_frequency', 'purchase_frequency'
]

X = df[feature_cols]
y = df['Supporter']

# Feature groups: the listed columns are treated as categorical, every other
# feature column as numerical.
categorical_cols = ['Region', 'Digital Only', 'Email Opt-Out']
numerical_cols = [col for col in feature_cols if col not in categorical_cols]


In [None]:
# Stratified 80/20 split on the target; only the training part is used below,
# the held-out part is discarded.
X_train, _, y_train, _ = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Preprocessing + model pipeline: one-hot encode the categorical columns and
# pass every remaining column through unchanged.
# sparse_threshold=0 makes the transformer always return a dense array. With
# the default threshold the output type (dense vs. SciPy sparse) depends on
# how many zeros the data contains; a sparse result would make XGBoost treat
# unstored zeros as missing values and would make the boolean-mask row
# selection applied to the transformed matrix later on unreliable.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

pipeline.fit(X_train, y_train)

# Transformed training matrix and the matching output feature names, taken
# from the fitted preprocessing step.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_train_transformed = fitted_preprocessor.transform(X_train)
feature_names = fitted_preprocessor.get_feature_names_out()

# Build the explainer for the fitted classifier (using the transformed
# training data as background) and compute SHAP values for the same rows.
explainer = shap.Explainer(pipeline.named_steps['clf'], X_train_transformed)
shap_values = explainer(X_train_transformed)


In [None]:
# Dictionary: region -> features ranked by mean absolute SHAP value.
region_shap_importance = {}

# Reset to a positional index so rows line up with the rows of shap_values.
X_train_original = X_train.reset_index(drop=True)

# Missing regions are skipped: comparing against NaN matches no rows, which
# would yield an empty slice and an all-NaN ranking.
for region in X_train_original['Region'].dropna().unique():
    # Plain NumPy boolean mask, so the row selection does not depend on how
    # the SHAP result object handles pandas objects.
    mask = (X_train_original['Region'] == region).to_numpy()
    shap_region = shap_values[mask]

    # Mean absolute SHAP value per feature, largest first.
    shap_mean = np.abs(shap_region.values).mean(axis=0)
    shap_series = pd.Series(shap_mean, index=feature_names).sort_values(ascending=False)

    region_shap_importance[region] = shap_series

# Example: print the top 10 features for Ontario, if that region is present
# in the training split (a direct lookup would raise KeyError otherwise).
example_region = 'Ontario'
if example_region in region_shap_importance:
    print("Ontario Region - Top 10 Features by SHAP Impact")
    print(region_shap_importance[example_region].head(10))
else:
    print(
        f"Region {example_region!r} not found in the training data; "
        f"available regions: {sorted(region_shap_importance, key=str)}"
    )


In [None]:
import matplotlib.pyplot as plt

# Draw one horizontal bar chart per region with its ten top-ranked SHAP features.
for region, shap_series in region_shap_importance.items():
    top_features = shap_series.head(10)

    fig, ax = plt.subplots(figsize=(8, 5))
    top_features.plot(kind='barh', ax=ax)
    ax.set_title(f"{region} - Top 10 SHAP Features")
    ax.set_xlabel("Mean |SHAP Value|")
    # Flip the y-axis so the first entry of the series is drawn at the top.
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()
