In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import shap
import matplotlib.pyplot as plt
import joblib
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../src')
from utils.data_loader import load_data

Load Processed Datasets

In [2]:
# Load the preprocessed datasets
fraud_data = load_data('../data/processed/fraud_data_processed.csv')
creditcard_data = load_data('../data/processed/creditcard_processed.csv')


Data loaded successfully from ../data/processed/fraud_data_processed.csv
Data loaded successfully from ../data/processed/creditcard_processed.csv


Load Saved Best Model

In [3]:
fraud_rf_model = joblib.load('../models/fraud_data_rf_model.pkl')
creditcard_rf_model = joblib.load('../models/creditcard_rf_model.pkl')

Preprocess Data(just in case)

In [4]:

# Preprocess function from Task 2
def preprocess_data(df, target_col):
    """Preprocess data to ensure all features are numeric"""
    df = df.copy()
    
    for col in df.select_dtypes(include=['datetime64', 'object']).columns:
        if col != target_col:
            if 'time' in col.lower():
                df[col] = pd.to_datetime(df[col])
                df[f'{col}_hour'] = df[col].dt.hour
                df[f'{col}_day'] = df[col].dt.dayofweek
                df[f'{col}_month'] = df[col].dt.month
                df = df.drop(columns=[col])
            else:
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))
    
    return df


In [None]:

# Prepare data function 
def prepare_data(df, target_col):
    """Prepare features and target, perform train-test split"""
    df = preprocess_data(df, target_col)
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X = X.select_dtypes(include=[np.number])
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


Generate SHAP PLOTS

In [None]:

def generate_shap_plots(model, X_test, dataset_name, max_samples=1000):
    """Generate SHAP Summary and Force plots that produce proper output"""
    print(f"\nGenerating SHAP plots for {dataset_name}...")
    
    # Subsample for SHAP calculations
    if len(X_test) > max_samples:
        X_test_subsample = X_test.sample(n=max_samples, random_state=42)
    else:
        X_test_subsample = X_test.copy()
    
    print(f"Using {len(X_test_subsample)} samples for SHAP analysis")
    
    # Initialize SHAP explainer
    print("Initializing SHAP explainer...")
    explainer = shap.TreeExplainer(model)
    
    # Calculate SHAP values - using Explanation object
    print("Calculating SHAP values...")
    try:
        # Get SHAP values as Explanation object
        shap_explanation = explainer(X_test_subsample)
        
        # Handle binary classification
        if len(shap_explanation.shape) == 3 and shap_explanation.shape[-1] == 2:
            print("Binary classification detected - using SHAP values for class 1")
            shap_values = shap_explanation[..., 1]
            expected_value = explainer.expected_value[1]
        else:
            shap_values = shap_explanation.values
            expected_value = explainer.expected_value
        
        print(f"SHAP values shape: {shap_values.shape}")
    except Exception as e:
        print(f"Error calculating SHAP values: {e}")
        return
    
    # Create plots directory if it doesn't exist
    os.makedirs('../plots', exist_ok=True)
    
    # Summary Plot (Bar)
    print("Creating summary plot (bar)...")
    try:
        plt.figure(figsize=(12, 8))
        
        # Create bar plot using the Explanation object
        shap.plots.bar(
            shap_explanation[..., 1] if len(shap_explanation.shape) == 3 else shap_explanation,
            show=False
        )
        plt.title(f'SHAP Feature Importance - {dataset_name}')
        plt.tight_layout()
        plt.savefig(f'../plots/shap_summary_bar_{dataset_name}.png', bbox_inches='tight', dpi=300)
        plt.close()
        print("Summary bar plot saved successfully")
    except Exception as e:
        print(f"Error creating bar plot: {e}")
    
    # Summary Plot (Beeswarm)
    print("Creating summary plot (beeswarm)...")
    try:
        plt.figure(figsize=(12, 8))
        shap.plots.beeswarm(
            shap_explanation[..., 1] if len(shap_explanation.shape) == 3 else shap_explanation,
            show=False,
            max_display=20
        )
        plt.title(f'SHAP Value Distribution - {dataset_name}')
        plt.tight_layout()
        plt.savefig(f'../plots/shap_summary_beeswarm_{dataset_name}.png', bbox_inches='tight', dpi=300)
        plt.close()
        print("Summary beeswarm plot saved successfully")
    except Exception as e:
        print(f"Error creating beeswarm plot: {e}")
    
    # Force Plot
    print("Creating force plot...")
    try:
        plt.figure(figsize=(12, 4))
        shap.plots.force(
            expected_value,
            shap_values[0],  # First instance
            feature_names=X_test_subsample.columns,
            matplotlib=True,
            show=False
        )
        plt.title(f'SHAP Force Plot - First Instance ({dataset_name})')
        plt.tight_layout()
        plt.savefig(f'../plots/shap_force_{dataset_name}.png', bbox_inches='tight', dpi=300)
        plt.close()
        print("Force plot saved successfully")
    except Exception as e:
        print(f"Error creating force plot: {e}")

Analyze Model Explainability

In [None]:


def analyze_model_explainability(max_samples=300):
    print("Starting SHAP analysis...")
    # Prepare data
    X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = prepare_data(fraud_data, 'class')
    X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard = prepare_data(creditcard_data, 'Class')
    
    # Load saved Random Forest models
    print("Loading models...")
    fraud_model = joblib.load('../models/fraud_data_rf_model.pkl')
    creditcard_model = joblib.load('../models/creditcard_rf_model.pkl')
    print("Models loaded.")
    
    # Generate SHAP plots for both datasets with sample limit
    generate_shap_plots(fraud_model, X_test_fraud, 'Fraud_Data', max_samples)
    generate_shap_plots(creditcard_model, X_test_creditcard, 'creditcard', max_samples)
    print("SHAP analysis complete.")

In [8]:
# Run analysis
analyze_model_explainability()

Starting SHAP analysis...
Loading models...
Models loaded.

Generating SHAP plots for Fraud_Data...
Using 300 samples for SHAP analysis
Initializing SHAP explainer...
Calculating SHAP values...
Binary classification detected - using SHAP values for class 1
SHAP values shape: (300, 25)
Creating summary plot (bar)...
Summary bar plot saved successfully
Creating summary plot (beeswarm)...
Summary beeswarm plot saved successfully
Creating force plot...
Error creating force plot: visualize() can only display Explanation objects (or arrays of them)!

Generating SHAP plots for creditcard...
Using 300 samples for SHAP analysis
Initializing SHAP explainer...
Calculating SHAP values...
Binary classification detected - using SHAP values for class 1
SHAP values shape: (300, 32)
Creating summary plot (bar)...
Summary bar plot saved successfully
Creating summary plot (beeswarm)...
Summary beeswarm plot saved successfully
Creating force plot...
Error creating force plot: visualize() can only display 

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>