In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import shap
import matplotlib.pyplot as plt
import joblib
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../src')
from utils.data_loader import load_data

In [2]:
# Load the preprocessed datasets
fraud_data = load_data('../data/processed/fraud_data_processed.csv')
creditcard_data = load_data('../data/processed/creditcard_processed.csv')


Data loaded successfully from ../data/processed/fraud_data_processed.csv
Data loaded successfully from ../data/processed/creditcard_processed.csv


In [3]:
fraud_rf_model = joblib.load('../models/fraud_data_rf_model.pkl')
creditcard_rf_model = joblib.load('../models/creditcard_rf_model.pkl')

In [4]:

# Preprocess function from Task 2
def preprocess_data(df, target_col):
    """Preprocess data to ensure all features are numeric"""
    df = df.copy()
    
    for col in df.select_dtypes(include=['datetime64', 'object']).columns:
        if col != target_col:
            if 'time' in col.lower():
                df[col] = pd.to_datetime(df[col])
                df[f'{col}_hour'] = df[col].dt.hour
                df[f'{col}_day'] = df[col].dt.dayofweek
                df[f'{col}_month'] = df[col].dt.month
                df = df.drop(columns=[col])
            else:
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))
    
    return df


In [5]:

# Prepare data function from Task 2
def prepare_data(df, target_col):
    """Prepare features and target, perform train-test split"""
    df = preprocess_data(df, target_col)
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X = X.select_dtypes(include=[np.number])
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:

# def generate_shap_plots(model, X_test, dataset_name, max_samples=1000):
#     """Generate SHAP Summary and Force plots with version compatibility"""
#     print(f"\nGenerating SHAP plots for {dataset_name}...")
    
#     # Subsample for SHAP calculations
#     if len(X_test) > max_samples:
#         X_test_subsample = X_test.sample(n=max_samples, random_state=42)
#     else:
#         X_test_subsample = X_test.copy()
    
#     print(f"Using {len(X_test_subsample)} samples for SHAP analysis")
    
#     # Initialize SHAP explainer
#     print("Initializing SHAP explainer...")
#     explainer = shap.TreeExplainer(model)
    
#     # Calculate SHAP values
#     print("Calculating SHAP values...")
#     try:
#         shap_values = explainer.shap_values(X_test_subsample)
        
#         # Handle different model types
#         if isinstance(shap_values, list):
#             if len(shap_values) == 2:  # Binary classification
#                 print("Binary classification detected")
#                 shap_values_plot = shap_values[1]
#                 expected_value = explainer.expected_value[1]
#             else:  # Multiclass
#                 print(f"Multiclass classification with {len(shap_values)} classes detected")
#                 shap_values_plot = shap_values[0]  # Using first class by default
#                 expected_value = explainer.expected_value[0]
#         else:  # Regression
#             print("Regression detected")
#             shap_values_plot = shap_values
#             expected_value = explainer.expected_value
        
#         print(f"SHAP values shape: {np.array(shap_values_plot).shape}")
#     except Exception as e:
#         print(f"Error calculating SHAP values: {e}")
#         return
    
#     # Create plots directory if it doesn't exist
#     os.makedirs('../plots', exist_ok=True)
    
#     # Summary Plot (Bar)
#     print("Creating summary plot (bar)...")
#     try:
#         plt.figure(figsize=(10, 6))
#         shap.summary_plot(shap_values_plot, X_test_subsample, plot_type="bar", show=False)
#         plt.title(f'SHAP Feature Importance - {dataset_name}')
#         plt.tight_layout()
#         plt.savefig(f'../plots/shap_summary_bar_{dataset_name}.png', bbox_inches='tight')
#         plt.close()
#         print("Summary bar plot saved successfully")
#     except Exception as e:
#         print(f"Error creating bar plot: {e}")
    
#     # Summary Plot (Beeswarm)
#     print("Creating summary plot (beeswarm)...")
#     try:
#         plt.figure(figsize=(10, 6))
#         shap.summary_plot(shap_values_plot, X_test_subsample, show=False)
#         plt.title(f'SHAP Value Distribution - {dataset_name}')
#         plt.tight_layout()
#         plt.savefig(f'../plots/shap_summary_beeswarm_{dataset_name}.png', bbox_inches='tight')
#         plt.close()
#         print("Summary beeswarm plot saved successfully")
#     except Exception as e:
#         print(f"Error creating beeswarm plot: {e}")
    
#     # Force Plot with version compatibility
#     print("Creating force plot...")
#     try:
#         # Select first instance
#         instance_idx = 0
#         instance_shap = shap_values_plot[instance_idx]
#         instance_features = X_test_subsample.iloc[instance_idx]
        
#         # Create figure
#         plt.figure(figsize=(10, 4))
        
#         # Try different API versions
#         try:
#             # New API (shap >= 0.40)
#             shap.plots.force(
#                 base_value=expected_value,
#                 shap_values=instance_shap,
#                 features=instance_features,
#                 matplotlib=True,
#                 show=False
#             )
#         except TypeError:
#             # Fallback to older API
#             shap.force_plot(
#                 base_value=expected_value,
#                 shap_values=instance_shap,
#                 features=instance_features,
#                 matplotlib=True,
#                 show=False
#             )
        
#         plt.title(f'SHAP Force Plot - {dataset_name} (First Instance)')
#         plt.tight_layout()
#         plt.savefig(f'../plots/shap_force_{dataset_name}.png', bbox_inches='tight')
#         plt.close()
#         print("Force plot saved successfully")
#     except Exception as e:
#         print(f"Error creating force plot: {e}")
#         import traceback
#         traceback.print_exc()

def generate_shap_plots(model, X_test, dataset_name, max_samples=1000):
    """Generate SHAP Summary and Force plots that produce the first chart style"""
    print(f"\nGenerating SHAP plots for {dataset_name}...")
    
    # Subsample for SHAP calculations
    if len(X_test) > max_samples:
        X_test_subsample = X_test.sample(n=max_samples, random_state=42)
    else:
        X_test_subsample = X_test.copy()
    
    print(f"Using {len(X_test_subsample)} samples for SHAP analysis")
    
    # Initialize SHAP explainer
    print("Initializing SHAP explainer...")
    explainer = shap.TreeExplainer(model)
    
    # Calculate SHAP values - CRITICAL CHANGE HERE
    print("Calculating SHAP values...")
    try:
        # Get both raw SHAP values and expected value
        shap_values = explainer(X_test_subsample)
        expected_value = explainer.expected_value
        
        # Handle binary classification
        if isinstance(expected_value, np.ndarray) and len(expected_value) == 2:
            print("Binary classification detected - using SHAP values for class 1")
            shap_values_plot = shap_values[..., 1]  # Get values for positive class
            expected_value = expected_value[1]
        else:
            shap_values_plot = shap_values.values  # Get the raw SHAP values
        
        print(f"SHAP values shape: {shap_values_plot.shape}")
    except Exception as e:
        print(f"Error calculating SHAP values: {e}")
        return
    
    # Create plots directory if it doesn't exist
    os.makedirs('../plots', exist_ok=True)
    
    # Summary Plot (Bar) - CRITICAL IMPROVEMENT HERE
    print("Creating summary plot (bar)...")
    try:
        plt.figure(figsize=(12, 8))
        
        # Calculate mean absolute SHAP values for each feature
        mean_abs_shap = np.abs(shap_values_plot).mean(axis=0)
        feature_order = np.argsort(mean_abs_shap)[::-1]  # Sort descending
        
        # Create bar plot manually for better control
        plt.barh(
            range(len(feature_order)),
            mean_abs_shap[feature_order],
            align='center',
            color='#1f77b4'
        )
        
        # Set y-ticks to feature names
        plt.yticks(
            range(len(feature_order)),
            [X_test_subsample.columns[i] for i in feature_order]
        )
        
        plt.gca().invert_yaxis()  # Most important at top
        plt.xlabel('mean(|SHAP value|) (average impact on model output magnitude)')
        plt.title(f'SHAP Feature Importance - {dataset_name}')
        plt.tight_layout()
        plt.savefig(f'../plots/shap_summary_bar_{dataset_name}.png', bbox_inches='tight', dpi=300)
        plt.close()
        print("Summary bar plot saved successfully")
    except Exception as e:
        print(f"Error creating bar plot: {e}")
    
    # Summary Plot (Beeswarm)
    print("Creating summary plot (beeswarm)...")
    try:
        plt.figure(figsize=(12, 8))
        shap.summary_plot(
            shap_values_plot, 
            X_test_subsample,
            plot_type="dot",
            show=False,
            max_display=20  # Limit number of features shown
        )
        plt.title(f'SHAP Value Distribution - {dataset_name}')
        plt.tight_layout()
        plt.savefig(f'../plots/shap_summary_beeswarm_{dataset_name}.png', bbox_inches='tight', dpi=300)
        plt.close()
        print("Summary beeswarm plot saved successfully")
    except Exception as e:
        print(f"Error creating beeswarm plot: {e}")
    
    # Force Plot
    print("Creating force plot...")
    try:
        plt.figure(figsize=(12, 4))
        shap.plots.force(
            expected_value,
            shap_values_plot[0],  # First instance
            X_test_subsample.iloc[0],
            matplotlib=True,
            show=False
        )
        plt.title(f'SHAP Force Plot - First Instance ({dataset_name})')
        plt.tight_layout()
        plt.savefig(f'../plots/shap_force_{dataset_name}.png', bbox_inches='tight', dpi=300)
        plt.close()
        print("Force plot saved successfully")
    except Exception as e:
        print(f"Error creating force plot: {e}")

In [7]:
# # Main analysis function
# def analyze_model_explainability():
#     print("Starting SHAP analysis...")
#     # Prepare data
#     X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = prepare_data(fraud_data, 'class')
#     X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard = prepare_data(creditcard_data, 'Class')
    
#     # Load saved Random Forest models
#     print("Loading models...")
#     fraud_model = joblib.load('../models/fraud_data_rf_model.pkl')
#     creditcard_model = joblib.load('../models/creditcard_rf_model.pkl')
#     print("Models loaded.")
    
#     # Generate SHAP plots for both datasets
#     generate_shap_plots(fraud_model, X_test_fraud, 'Fraud_Data')
#     generate_shap_plots(creditcard_model, X_test_creditcard, 'creditcard')
#     print("SHAP analysis complete.")

def analyze_model_explainability(max_samples=300):
    print("Starting SHAP analysis...")
    # Prepare data
    X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = prepare_data(fraud_data, 'class')
    X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard = prepare_data(creditcard_data, 'Class')
    
    # Load saved Random Forest models
    print("Loading models...")
    fraud_model = joblib.load('../models/fraud_data_rf_model.pkl')
    creditcard_model = joblib.load('../models/creditcard_rf_model.pkl')
    print("Models loaded.")
    
    # Generate SHAP plots for both datasets with sample limit
    generate_shap_plots(fraud_model, X_test_fraud, 'Fraud_Data', max_samples)
    generate_shap_plots(creditcard_model, X_test_creditcard, 'creditcard', max_samples)
    print("SHAP analysis complete.")

In [8]:
# Run analysis
analyze_model_explainability()

Starting SHAP analysis...
Loading models...
Models loaded.

Generating SHAP plots for Fraud_Data...
Using 300 samples for SHAP analysis
Initializing SHAP explainer...
Calculating SHAP values...
Regression detected
SHAP values shape: (300, 25, 2)
Creating summary plot (bar)...
Summary bar plot saved successfully
Creating summary plot (beeswarm)...
Summary beeswarm plot saved successfully
Creating force plot...
Error creating force plot: In v0.20, force plot now requires the base value as the first parameter! Try shap.plots.force(explainer.expected_value, shap_values) or for multi-output models try shap.plots.force(explainer.expected_value[0], shap_values[..., 0]).

Generating SHAP plots for creditcard...
Using 300 samples for SHAP analysis
Initializing SHAP explainer...
Calculating SHAP values...


Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_12584\3568945207.py", line 172, in generate_shap_plots
    shap.plots.force(
    ~~~~~~~~~~~~~~~~^
        base_value=expected_value,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<3 lines>...
        show=False
        ^^^^^^^^^^
    )
    ^
  File "d:\projects\TenAcademy\week8n9\KAIM_WEEK_8_N_9\venv\Lib\site-packages\shap\plots\_force.py", line 130, in force
    raise TypeError(emsg)
TypeError: In v0.20, force plot now requires the base value as the first parameter! Try shap.plots.force(explainer.expected_value, shap_values) or for multi-output models try shap.plots.force(explainer.expected_value[0], shap_values[..., 0]).

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_12584\3568945207.py", line 181, in generate_shap_plots
    shap.force_plot(
    ~~~~~~~~~~~~~~~^
        base_value=expected_value

Regression detected
SHAP values shape: (300, 32, 2)
Creating summary plot (bar)...
Summary bar plot saved successfully
Creating summary plot (beeswarm)...
Summary beeswarm plot saved successfully
Creating force plot...
Error creating force plot: In v0.20, force plot now requires the base value as the first parameter! Try shap.plots.force(explainer.expected_value, shap_values) or for multi-output models try shap.plots.force(explainer.expected_value[0], shap_values[..., 0]).
SHAP analysis complete.


Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_12584\3568945207.py", line 172, in generate_shap_plots
    shap.plots.force(
    ~~~~~~~~~~~~~~~~^
        base_value=expected_value,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<3 lines>...
        show=False
        ^^^^^^^^^^
    )
    ^
  File "d:\projects\TenAcademy\week8n9\KAIM_WEEK_8_N_9\venv\Lib\site-packages\shap\plots\_force.py", line 130, in force
    raise TypeError(emsg)
TypeError: In v0.20, force plot now requires the base value as the first parameter! Try shap.plots.force(explainer.expected_value, shap_values) or for multi-output models try shap.plots.force(explainer.expected_value[0], shap_values[..., 0]).

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_12584\3568945207.py", line 181, in generate_shap_plots
    shap.force_plot(
    ~~~~~~~~~~~~~~~^
        base_value=expected_value

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x400 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x400 with 0 Axes>