### Using SHAP for Feature Drift Analysis
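**Description**: Use SHapley Additive exPlanations (SHAP) values to find out which features
have drifted between a baseline dataset and a current dataset. A classifier is trained to tell
the two datasets apart, and the features with the largest SHAP values are the ones whose
distributions differ most between them.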

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import shap
import matplotlib.pyplot as plt

# Demo data only: in practice, load your own reference data into 'baseline_df'
# and your recent data into 'current_df' instead of sampling them here.
# Seeding the global NumPy generator makes the synthetic draws reproducible.
np.random.seed(42)
n_samples = 1000

# Baseline sample; 'is_current' = 0 marks every row as coming from the baseline.
baseline_df = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_samples),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_samples),
    'feature_3': np.random.rand(n_samples),
}).assign(is_current=0)

# Current sample, drawn with shifted means/scales so that there is drift to
# detect; 'is_current' = 1 marks every row as coming from the current data.
current_df = pd.DataFrame({
    'feature_1': np.random.normal(loc=0.5, scale=1.2, size=n_samples),
    'feature_2': np.random.normal(loc=-0.2, scale=0.8, size=n_samples),
    'feature_3': np.random.rand(n_samples) + 0.1,
}).assign(is_current=1)

# Stack baseline rows on top of current rows under a fresh 0..N-1 index.
combined_df = pd.concat([baseline_df, current_df], ignore_index=True)

# Features (X) are every column except the dataset label; the label is y.
X = combined_df.drop(columns='is_current')
y = combined_df['is_current']

# Adversarial-validation style detector: fit a classifier whose only job is to
# predict whether a row belongs to the baseline (0) or the current (1) dataset.
# The fixed random_state keeps the fitted model reproducible between runs.
model = GradientBoostingClassifier(random_state=42).fit(X, y)

# Build a tree-based SHAP explainer for the fitted model, then attribute the
# model's output to the individual features for every row of X.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Visualize the SHAP values to understand feature importance in distinguishing datasets.
# show=False keeps the figure open, so the title below is drawn on the summary
# plot itself instead of on a new, empty figure created after the plot was
# already displayed. The colour bar keeps its default label because colour
# encodes the feature's value; the SHAP value is what the x-axis shows.
shap.summary_plot(shap_values, X, class_names=['Baseline', 'Current'], show=False)
plt.title('SHAP Summary Plot for Dataset Distinction')
plt.tight_layout()
plt.show()

# Mean absolute SHAP value per feature gives a single overall importance score:
# the larger it is, the more that feature helps separate baseline from current.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Mean_Abs_SHAP': mean_abs_shap})
    .sort_values(by='Mean_Abs_SHAP', ascending=False)
    .reset_index(drop=True)
)

print("\nFeature Importance based on Mean Absolute SHAP Values:")
print(feature_importance_df)

# Further analysis: Look at SHAP values for baseline vs. current data separately (optional)
baseline_X = X[y == 0]
current_X = X[y == 1]

shap_values_baseline = explainer.shap_values(baseline_X)
shap_values_current = explainer.shap_values(current_X)

# Compare, feature by feature, how SHAP values relate to feature values in the
# baseline rows (left panel) versus the current rows (right panel).
for i, feature in enumerate(X.columns):
    fig, (ax_baseline, ax_current) = plt.subplots(1, 2, figsize=(10, 6))

    # ax= draws into our panel instead of a figure that dependence_plot would
    # otherwise create itself, and show=False defers rendering until both
    # panels are complete. display_features must have the same rows as the
    # SHAP values being plotted, so each panel passes its own subset.
    shap.dependence_plot(
        i,
        shap_values_baseline,
        baseline_X,
        display_features=baseline_X,
        ax=ax_baseline,
        show=False,
    )
    ax_baseline.set_title(f'SHAP Dependence Plot for {feature} (Baseline)')

    shap.dependence_plot(
        i,
        shap_values_current,
        current_X,
        display_features=current_X,
        ax=ax_current,
        show=False,
    )
    ax_current.set_title(f'SHAP Dependence Plot for {feature} (Current)')

    fig.tight_layout()
    plt.show()

ModuleNotFoundError: No module named 'shap'