# Phase 4: Predictive Modeling 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
import shap
import joblib

In [None]:
# Read the Phase 3 output (customers with their cluster assignment) into `df`.
# Later cells read the 'Cluster' and 'ID' columns from this frame.
df = pd.read_csv(filepath_or_buffer='clustered_customers.csv')

## SHAP Analysis 

In [None]:
# Generate SHAP values: per-feature contributions for every row of X_test,
# computed with the tree-specific explainer built around the fitted model.
explainer = shap.TreeExplainer(best_rf)
shap_values = explainer.shap_values(X_test)

# Reduce the classifier output to the 2-D matrix for class 1 (positive
# response). Two layouts are handled:
#   * a list holding one array per class  -> take element 1
#   * a single 3-D array with the class on the last axis -> slice class 1
# np.ndim is used instead of `.shape` so the check never touches an attribute
# that a plain list does not have.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, 1]

# Verify shapes match. Raised explicitly (not via `assert`) so the check is
# still performed when Python runs with optimisations enabled.
if shap_values.shape != X_test.shape:
    raise ValueError(
        f"SHAP values shape {shap_values.shape} doesn't match X_test shape {X_test.shape}"
    )

# Global feature importance: bar-style SHAP summary plot, written to a PNG.
# `show=False` keeps the figure open so a title can be added before saving.
plt.figure()
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="bar",
    show=False,
)
plt.gca().set_title("Top Features Driving Campaign Response", fontsize=12)
plt.tight_layout()
plt.savefig(
    'shap_feature_importance.png',
    dpi=300,
    bbox_inches='tight',
)
# Close the figure so it is not rendered again or reused by later plots.
plt.close()

# Individual Prediction Explanation (HTML)
sample_idx = 0  # First test case
shap.initjs()  # Initialize JS visualization

# Pick the base value that matches the class-1 SHAP values. When the explainer
# reports one expected value per class, use index 1; if it reports a single
# scalar (NOTE(review): presumably possible for single-output explainers /
# some SHAP versions - confirm), use that value instead of failing on `[1]`.
base_values = np.ravel(explainer.expected_value)
base_value = base_values[1] if base_values.size > 1 else base_values[0]

# Build the interactive (JavaScript) force plot for the chosen test row.
force_plot = shap.force_plot(
    base_value,
    shap_values[sample_idx, :],
    X_test.iloc[sample_idx, :],
    feature_names=X_test.columns.tolist(),
    matplotlib=False
)

# Save interactive plot
shap.save_html('shap_force_plot.html', force_plot)

In [None]:
# Default SHAP summary plot over the test set (the "beeswarm" view), saved as
# a PNG; `show=False` leaves the figure open for the title and the save.
plt.figure()
shap.summary_plot(shap_values, X_test, show=False)
plt.gca().set_title("Feature Impact on Campaign Response", fontsize=12)
plt.tight_layout()
plt.savefig(
    'shap_beeswarm.png',
    dpi=300,
    bbox_inches='tight',
)
plt.close()

# Report the SHAP artefact file names (one line per file).
print(
    "SHAP outputs saved successfully:",
    "- Global feature importance: shap_feature_importance.png",
    "- Detailed impact analysis: shap_beeswarm.png",
    "- Interactive explanation: shap_force_plot.html",
    sep="\n",
)

## Response Probability by Segment

In [None]:
# Score every row of X with the fitted model and keep column 1 of the
# probability matrix as the response probability.
# NOTE(review): assumes column 1 corresponds to the positive class and that
# X is row-aligned with df - confirm against the training cell.
class_probabilities = best_rf.predict_proba(X)
df['Response_Probability'] = class_probabilities[:, 1]

# Cluster ids in ascending order; reused for the box order, tick positions and
# tick labels so the three always agree.
segment_order = sorted(df['Cluster'].unique())
# NOTE(review): the "+1" presumably turns 0-based cluster ids into 1-based
# segment names - confirm the cluster ids are integers starting at 0.
segment_labels = [f'Segment {i+1}' for i in segment_order]

# Visualize: one box per cluster showing the distribution of probabilities.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(
    data=df,
    x='Cluster',
    y='Response_Probability',
    order=segment_order,
    palette='viridis',
)
ax.set_title('Response Probability by Customer Segment', fontsize=14)
ax.set_xlabel('Segment')
ax.set_ylabel('Response Probability')
plt.xticks(ticks=range(len(segment_order)), labels=segment_labels)
plt.savefig(
    'response_by_segment.png',
    dpi=300,
    bbox_inches='tight',
)
plt.close()

## Save Results

In [None]:
# Export per-customer predictions: identifier, cluster, the model features and
# the predicted response probability.
output_cols = ['ID', 'Cluster'] + features + ['Response_Probability']
df[output_cols].to_csv('customers_with_predictions.csv', index=False)

# Segment-level insights: mean / median / standard deviation / row count of
# the predicted probability within each cluster, with report-friendly names.
segment_stats = (
    df.groupby('Cluster')['Response_Probability']
    .agg(['mean', 'median', 'std', 'count'])
    .reset_index()
    .rename(
        columns={
            'Cluster': 'Segment',
            'mean': 'Avg_Prob',
            'median': 'Median_Prob',
            'std': 'Std_Prob',
            'count': 'Count',
        }
    )
)
segment_stats.to_csv('segment_response_stats.csv', index=False)

# Summary of the artefacts produced by the notebook.
# NOTE(review): this cell does not itself write rf_model_with_segments.pkl;
# presumably the model is saved in another cell - confirm.
print(
    "\nOutput Files Created:",
    "- rf_model_with_segments.pkl: Trained model",
    "- shap_*.png: SHAP interpretation plots",
    "- response_by_segment.png: Segment comparison",
    "- customers_with_predictions.csv: Full dataset with predictions",
    "- segment_response_stats.csv: Summary statistics by segment",
    sep="\n",
)