In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import shap
import matplotlib.pyplot as plt
import joblib
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../src')
from utils.data_loader import load_data

In [2]:
# Load the two preprocessed datasets through the project's `load_data` helper
# (imported from utils.data_loader). Paths are relative to the notebook's
# working directory.
# NOTE(review): `load_data` is not visible here; later cells call `.copy()`,
# `.select_dtypes()` and `.drop()` on these objects, so they are presumably
# pandas DataFrames — confirm against utils/data_loader.py.
# The target columns used later are 'class' (fraud data) and 'Class' (credit card).
fraud_data = load_data('../data/processed/fraud_data_processed.csv')
creditcard_data = load_data('../data/processed/creditcard_processed.csv')


Data loaded successfully from ../data/processed/fraud_data_processed.csv
Data loaded successfully from ../data/processed/creditcard_processed.csv


In [3]:
# Load the persisted models from disk (the file names suggest Random Forests).
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): in the cells visible here these two globals are never read —
# analyze_model_explainability() loads the same files again into local names.
fraud_rf_model = joblib.load('../models/fraud_data_rf_model.pkl')
creditcard_rf_model = joblib.load('../models/creditcard_rf_model.pkl')

In [4]:

# Preprocess function from Task 2
def preprocess_data(df, target_col):
    """Return a copy of ``df`` with its datetime/object columns made numeric.

    Every column of dtype ``datetime64`` or ``object`` other than
    ``target_col`` is handled according to its name:

    * names containing ``"time"`` (case-insensitive) are parsed with
      ``pd.to_datetime`` and replaced by three derived columns,
      ``<name>_hour``, ``<name>_day`` (day of week, Monday = 0) and
      ``<name>_month``, appended at the end of the frame;
    * all other such columns are cast to ``str`` and label-encoded in place
      with a fresh ``LabelEncoder`` per column.

    The input frame is not modified.

    Args:
        df: Input DataFrame.
        target_col: Name of the label column; it is never transformed.

    Returns:
        The transformed copy of ``df``.
    """
    frame = df.copy()

    # Snapshot the columns to convert once, before the frame starts changing.
    candidates = [
        name
        for name in frame.select_dtypes(include=['datetime64', 'object']).columns
        if name != target_col
    ]
    time_columns = [name for name in candidates if 'time' in name.lower()]
    category_columns = [name for name in candidates if 'time' not in name.lower()]

    # Timestamp-like columns: expand into calendar parts, then discard the source.
    for name in time_columns:
        stamps = pd.to_datetime(frame[name])
        frame[f'{name}_hour'] = stamps.dt.hour
        frame[f'{name}_day'] = stamps.dt.dayofweek
        frame[f'{name}_month'] = stamps.dt.month
        frame = frame.drop(columns=[name])

    # Remaining text-like columns: integer codes, one independent encoder each.
    if category_columns:
        # Imported lazily so sklearn is only required when encoding is needed.
        from sklearn.preprocessing import LabelEncoder
        for name in category_columns:
            frame[name] = LabelEncoder().fit_transform(frame[name].astype(str))

    return frame


In [5]:

# Prepare data function from Task 2
def prepare_data(df, target_col):
    """Build a stratified 80/20 train/test split of numeric features.

    The frame is first passed through ``preprocess_data``; the target column
    is then separated out and any remaining non-numeric feature columns are
    discarded.

    Args:
        df: Raw input DataFrame containing features and the target.
        target_col: Name of the label column.

    Returns:
        The result of ``train_test_split``: ``X_train, X_test, y_train, y_test``,
        produced with ``test_size=0.2``, ``random_state=42`` and stratification
        on the target.
    """
    processed = preprocess_data(df, target_col)

    # Features: everything except the target, restricted to numeric dtypes.
    features = processed.drop(columns=[target_col]).select_dtypes(include=[np.number])
    labels = processed[target_col]

    return train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )


In [6]:

def _positive_class_explanation(shap_values, expected_value):
    """Extract the positive-class (index 1) SHAP matrix and base value.

    Handles both return layouts of ``TreeExplainer.shap_values`` for
    classifiers: the legacy list with one ``(n_samples, n_features)`` array
    per class, and the 3-D array ``(n_samples, n_features, n_classes)``
    returned by newer SHAP releases. A plain 2-D array (single output) is
    passed through unchanged.

    Returns:
        ``(values, base_value)`` where ``values`` is 2-D and ``base_value``
        is a float.
    """
    if isinstance(shap_values, list):
        # Legacy layout: pick the array for class 1 when it exists.
        class_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
    else:
        class_values = np.asarray(shap_values)
        if class_values.ndim == 3:
            # Newer layout: classes live on the last axis, so indexing with
            # [1] on the first axis would select a sample, not a class.
            class_index = 1 if class_values.shape[2] > 1 else 0
            class_values = class_values[:, :, class_index]

    # expected_value may be a scalar or hold one entry per class.
    base_values = np.ravel(expected_value)
    base_value = base_values[1] if base_values.size > 1 else base_values[0]
    return class_values, float(base_value)


def generate_shap_plots(model, X_test, dataset_name, max_samples=500, random_state=42):
    """Generate and save SHAP summary and force plots for a tree model.

    Side effects:
        * writes the raw SHAP values to
          ``../shap_values/shap_values_<dataset_name.lower()>.pkl``;
        * writes ``../plots/shap_summary_<...>.png`` and
          ``../plots/shap_force_<...>.png``;
        * prints progress messages.

    Args:
        model: Fitted tree-based classifier accepted by ``shap.TreeExplainer``.
        X_test: DataFrame of features to explain.
        dataset_name: Label used in plot titles and (lower-cased) in file names.
        max_samples: Upper bound on the number of rows explained. When
            ``X_test`` has more rows, a random sample of this size is used,
            because computing tree SHAP values over a full test set can take
            a very long time. Pass ``None`` to explain every row.
        random_state: Seed for the row sampling, so runs are reproducible.

    Raises:
        ValueError: If ``X_test`` is empty or ``max_samples`` is less than 1.
    """
    if len(X_test) == 0:
        raise ValueError("X_test is empty; nothing to explain.")
    if max_samples is not None and max_samples < 1:
        raise ValueError("max_samples must be a positive integer or None.")

    # Actually take the sample that the original comment promised.
    if max_samples is not None and len(X_test) > max_samples:
        X_explain = X_test.sample(n=max_samples, random_state=random_state)
    else:
        X_explain = X_test

    file_tag = dataset_name.lower()
    os.makedirs('../shap_values', exist_ok=True)
    os.makedirs('../plots', exist_ok=True)

    print(f"Computing SHAP values for {dataset_name}...")
    print(f"Explaining {len(X_explain)} of {len(X_test)} rows.")

    # Use TreeExplainer for Random Forest
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_explain)

    # Save SHAP values to avoid recomputation (stored exactly as SHAP returned them)
    joblib.dump(shap_values, f'../shap_values/shap_values_{file_tag}.pkl')
    print(f"SHAP values for {dataset_name} saved.")

    class_values, base_value = _positive_class_explanation(
        shap_values, explainer.expected_value
    )

    # Summary Plot (global feature importance)
    print(f"Generating Summary Plot for {dataset_name}...")
    summary_fig = plt.figure()
    shap.summary_plot(class_values, X_explain, show=False)
    plt.title(f'SHAP Summary Plot - {dataset_name}')
    plt.tight_layout()
    summary_fig.savefig(f'../plots/shap_summary_{file_tag}.png')
    plt.close(summary_fig)
    print(f"Summary Plot for {dataset_name} saved.")

    # Force Plot for first explained instance (local feature importance).
    # force_plot(matplotlib=True) opens its own figure, so no blank figure is
    # created beforehand (that one would never be closed).
    print(f"Generating Force Plot for {dataset_name}...")
    shap.force_plot(base_value, class_values[0], X_explain.iloc[0], matplotlib=True, show=False)
    force_fig = plt.gcf()
    plt.title(f'SHAP Force Plot - First Instance ({dataset_name})')
    plt.tight_layout()
    force_fig.savefig(f'../plots/shap_force_{file_tag}.png')
    plt.close(force_fig)
    print(f"Force Plot for {dataset_name} saved.")


In [7]:
# Main analysis function
def analyze_model_explainability():
    """Run the SHAP explainability workflow for both datasets.

    Splits the notebook-level ``fraud_data`` and ``creditcard_data`` frames
    with ``prepare_data``, loads the two saved models from ``../models`` and
    hands each model, together with its test features, to
    ``generate_shap_plots``. Progress is reported with ``print``.
    """
    print("Starting SHAP analysis...")

    # prepare_data returns (X_train, X_test, y_train, y_test); only X_test is used.
    fraud_split = prepare_data(fraud_data, 'class')
    creditcard_split = prepare_data(creditcard_data, 'Class')

    # Load saved models, fraud model first.
    print("Loading models...")
    model_files = (
        '../models/fraud_data_rf_model.pkl',
        '../models/creditcard_rf_model.pkl',
    )
    loaded_models = [joblib.load(model_file) for model_file in model_files]
    print("Models loaded.")

    # One SHAP run per dataset, in the same order as the models were loaded.
    test_features = (fraud_split[1], creditcard_split[1])
    labels = ('Fraud_Data', 'creditcard')
    for estimator, features, label in zip(loaded_models, test_features, labels):
        generate_shap_plots(estimator, features, label)

    print("SHAP analysis complete.")


In [None]:
# Run the full SHAP analysis for both datasets. This writes SHAP values under
# ../shap_values and summary/force plots under ../plots.
# NOTE(review): the recorded output below stops at "Computing SHAP values for
# Fraud_Data..." and the cell has no execution count, so this run appears not
# to have finished — confirm the cell completes on a fresh kernel.
analyze_model_explainability()

Starting SHAP analysis...
Loading models...
Models loaded.
Computing SHAP values for Fraud_Data...
