# Cell 1: Add Project Root to Python Path

In [1]:
# notebooks/model_explainability.ipynb - Cell 1: Add project root to path

import os
import sys

# Get the current working directory of the notebook (e.g., 'fraud_detection_project/notebooks/')
current_dir = os.getcwd()
# Get the parent directory (which is 'fraud_detection_project/')
project_root = os.path.dirname(current_dir)

# Add the project root to sys.path so Python can find 'src'
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path.")
else:
    print(f"'{project_root}' already in sys.path.")

Added 'c:\Users\skibret\Downloads\KAIM\Week 8\Project\fraud_detection_project' to sys.path.


# Cell 2: Import Statements

In [2]:
# notebooks/model_explainability.ipynb - Cell 2: Import Statements

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap # Import shap library

# Import functions from our custom scripts
from src.model_training import load_processed_data, split_data, handle_imbalance, evaluate_model
from src.model_explainability import explain_model_shap, plot_shap_summary, plot_shap_dependence, plot_shap_force

# Import scikit-learn preprocessing and models (needed to load preprocessor and potentially the model)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SklearnPipeline # Alias for sklearn's Pipeline

# Configure plot styles
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

  from .autonotebook import tqdm as notebook_tqdm


# Cell 3: Load Processed Data and Re-run Preprocessing (for consistency)

In [None]:
# notebooks/model_explainability.ipynb - Cell 3: Load Data & Re-run Preprocessing

# Ensure necessary imports are at the top of this cell or in Cell 2
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SklearnPipeline # Alias for sklearn's Pipeline

# Import functions from src/model_training (assuming Cell 1 has added project_root to sys.path)
from src.model_training import load_processed_data, split_data, handle_imbalance

# Define paths to your processed data, using project_root for absolute paths
ecommerce_filepath = os.path.join(project_root, 'data', 'processed', 'processed_ecommerce_fraud.csv')
creditcard_filepath = os.path.join(project_root, 'data', 'processed', 'processed_creditcard_fraud.csv')

print("--- Loading Processed E-commerce Fraud Data ---")
ecommerce_df = load_processed_data(ecommerce_filepath)

print("\n--- Loading Processed Credit Card Fraud Data ---")
creditcard_df = load_processed_data(creditcard_filepath)

# --- Re-run E-commerce Data Preparation & Preprocessing Pipeline ---
ecommerce_X_train_processed = None
ecommerce_X_test_processed = None
ecommerce_y_train = None
ecommerce_y_test = None
ecommerce_preprocessor_fitted = None # Will be fitted below
ecommerce_feature_names = [] # Initialize list for feature names

if ecommerce_df is not None:
    print("\n--- Re-running E-commerce Data Preparation & Preprocessing Pipeline ---")

    # Pass the original full dataframe (ecommerce_df) to split_data.
    # split_data will handle dropping the 'class' column internally to create X.
    ecommerce_X_train_raw, ecommerce_X_test_raw, ecommerce_y_train, ecommerce_y_test, \
    _, _ = \
        split_data(ecommerce_df, target_column='class', test_size=0.3, random_state=42)

    # Now, from the X_train_raw and X_test_raw returned by split_data,
    # drop the *other* non-feature columns that are not relevant for modeling.
    cols_to_drop_ecommerce_from_X_after_split = [
        'ip_address', 'lower_bound_ip_address', 'upper_bound_ip_address', 'ip_address_int',
        'signup_time', 'purchase_time', 'time_diff_prev_transaction'
    ]
    ecommerce_X_train = ecommerce_X_train_raw.drop(columns=cols_to_drop_ecommerce_from_X_after_split, errors='ignore')
    ecommerce_X_test = ecommerce_X_test_raw.drop(columns=cols_to_drop_ecommerce_from_X_after_split, errors='ignore')

    # Re-identify numerical and categorical columns *after* dropping the non-feature columns.
    # This ensures the preprocessor uses the correct, final column lists.
    ecommerce_numerical_cols_for_preprocessor = ecommerce_X_train.select_dtypes(include=np.number).columns.tolist()
    ecommerce_categorical_cols_for_preprocessor = ecommerce_X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"E-commerce Numerical Columns for Preprocessor (after feature selection): {ecommerce_numerical_cols_for_preprocessor}")
    print(f"E-commerce Categorical Columns for Preprocessor (after feature selection): {ecommerce_categorical_cols_for_preprocessor}")

    # Create and fit the preprocessing pipeline using ColumnTransformer
    ecommerce_preprocessor_fitted = ColumnTransformer(
        transformers=[
            ('num', SklearnPipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')), # Impute NaNs in numerical columns
                ('scaler', StandardScaler())                    # Scale numerical features
            ]), ecommerce_numerical_cols_for_preprocessor),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ecommerce_categorical_cols_for_preprocessor) # One-hot encode categorical features
        ],
        remainder='passthrough' # Keep any other columns not specified (e.g., if you added more later)
    ).fit(ecommerce_X_train) # Fit the preprocessor ONLY on the training data to prevent data leakage

    # Transform both training and testing sets using the fitted preprocessor
    ecommerce_X_train_processed = ecommerce_preprocessor_fitted.transform(ecommerce_X_train)
    ecommerce_X_test_processed = ecommerce_preprocessor_fitted.transform(ecommerce_X_test)

    # Get feature names after one-hot encoding for plotting with SHAP
    ecommerce_feature_names = ecommerce_preprocessor_fitted.get_feature_names_out()
    print(f"Total E-commerce features after preprocessing: {len(ecommerce_feature_names)}")
    print(f"E-commerce X_train_processed shape: {ecommerce_X_train_processed.shape}")
    print(f"E-commerce X_test_processed shape: {ecommerce_X_test_processed.shape}")
else:
    print("E-commerce data not loaded. Skipping preprocessing.")

# --- Re-run Credit Card Data Preparation & Preprocessing Pipeline ---
creditcard_X_train_processed = None
creditcard_X_test_processed = None
creditcard_y_train = None
creditcard_y_test = None
creditcard_preprocessor_fitted = None # Will be fitted below
creditcard_feature_names = [] # Initialize list for feature names

if creditcard_df is not None:
    print("\n--- Re-running Credit Card Data Preparation & Preprocessing Pipeline ---")
    # Pass the original full dataframe (creditcard_df) to split_data.
    # split_data will handle dropping the 'Class' column internally to create X.
    creditcard_X_train_raw, creditcard_X_test_raw, creditcard_y_train, creditcard_y_test, \
    _, _ = \
        split_data(creditcard_df, target_column='Class', test_size=0.3, random_state=42)

    # For credit card data, typically no extra columns to drop from X_train/X_test
    # as V-features, Time, Amount are all relevant.
    # So, creditcard_X_train and creditcard_X_test are already clean for preprocessing here.
    creditcard_X_train = creditcard_X_train_raw.copy()
    creditcard_X_test = creditcard_X_test_raw.copy()


    creditcard_numerical_cols_for_preprocessor = creditcard_X_train.select_dtypes(include=np.number).columns.tolist()
    creditcard_categorical_cols_for_preprocessor = creditcard_X_train.select_dtypes(include=['object', 'category']).columns.tolist() # Should be empty

    print(f"Credit Card Numerical Columns for Preprocessor (after feature selection): {creditcard_numerical_cols_for_preprocessor}")
    print(f"Credit Card Categorical Columns for Preprocessor (after feature selection): {creditcard_categorical_cols_for_preprocessor}")

    # Create and fit the preprocessing pipeline using ColumnTransformer
    # Only numerical features are expected for credit card data, so only a numerical pipeline
    creditcard_preprocessor_fitted = ColumnTransformer(
        transformers=[
            ('num', SklearnPipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')), # Impute NaNs in numerical columns
                ('scaler', StandardScaler())                    # Scale numerical features
            ]), creditcard_numerical_cols_for_preprocessor)
            # No 'cat' transformer needed if no categorical columns are present
        ],
        remainder='passthrough'
    ).fit(creditcard_X_train) # Fit the preprocessor ONLY on the training data

    # Transform both training and testing sets
    creditcard_X_train_processed = creditcard_preprocessor_fitted.transform(creditcard_X_train)
    creditcard_X_test_processed = creditcard_preprocessor_fitted.transform(creditcard_X_test)

    # Get feature names after preprocessing for plotting with SHAP
    creditcard_feature_names = creditcard_preprocessor_fitted.get_feature_names_out()
    print(f"Total Credit Card features after preprocessing: {len(creditcard_feature_names)}")
    print(f"Credit Card X_train_processed shape: {creditcard_X_train_processed.shape}")
    print(f"Credit Card X_test_processed shape: {creditcard_X_test_processed.shape}")
else:
    print("Credit Card data not loaded. Skipping preprocessing.")

--- Loading Processed E-commerce Fraud Data ---


2025-07-26 21:27:21,040 - INFO - Successfully loaded processed data from c:\Users\skibret\Downloads\KAIM\Week 8\Project\fraud_detection_project\data\processed\processed_ecommerce_fraud.csv. Shape: (151112, 19)



--- Loading Processed Credit Card Fraud Data ---


2025-07-26 21:27:22,790 - INFO - Successfully loaded processed data from c:\Users\skibret\Downloads\KAIM\Week 8\Project\fraud_detection_project\data\processed\processed_creditcard_fraud.csv. Shape: (283726, 31)



--- Re-running E-commerce Data Preparation & Preprocessing Pipeline ---


KeyError: "['class'] not found in axis"

# Cell 4: Identify Best Model

In [None]:
# notebooks/model_explainability.ipynb - Cell 4: Identify and Load Best Model

from sklearn.ensemble import RandomForestClassifier # Import the specific best model

# --- For E-commerce Fraud Detection ---
best_ecommerce_model_name = "Random Forest"
# Re-initialize the Random Forest model with the same default parameters used in training
# (n_estimators=100, random_state=42, n_jobs=-1)
best_ecommerce_model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
print(f"Selected best E-commerce model: {best_ecommerce_model_name}")

# --- For Credit Card Fraud Detection ---
best_creditcard_model_name = "Random Forest"
# Re-initialize the Random Forest model with the same default parameters used in training
# (n_estimators=100, random_state=42, n_jobs=-1)
best_creditcard_model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
print(f"Selected best Credit Card model: {best_creditcard_model_name}")

# Note: In a real scenario, you would likely save and load your best model
# using joblib or pickle to avoid re-training here.
# import joblib
# best_ecommerce_model = joblib.load('path/to/best_ecommerce_model.pkl')

# Cell 5: Imbalance Handling (for fitting best model in this notebook)

In [None]:
# notebooks/model_explainability.ipynb - Cell 5: Imbalance Handling (for fitting best model)

ecommerce_X_train_resampled_explain = None
ecommerce_y_train_resampled_explain = None
creditcard_X_train_resampled_explain = None
creditcard_y_train_resampled_explain = None

if ecommerce_X_train_processed is not None and ecommerce_y_train is not None:
    print("\n--- Handling Imbalance for E-commerce Training Data (SMOTE) for Explainability ---")
    ecommerce_X_train_resampled_explain, ecommerce_y_train_resampled_explain = \
        handle_imbalance(ecommerce_X_train_processed, ecommerce_y_train, strategy='SMOTE', random_state=42)
else:
    print("E-commerce processed training data not available for imbalance handling for explainability.")

if creditcard_X_train_processed is not None and creditcard_y_train is not None:
    print("\n--- Handling Imbalance for Credit Card Training Data (SMOTE_and_Undersample) for Explainability ---")
    creditcard_X_train_resampled_explain, creditcard_y_train_resampled_explain = \
        handle_imbalance(creditcard_X_train_processed, creditcard_y_train, strategy='SMOTE_and_Undersample', random_state=42)
else:
    print("Credit Card processed training data not available for imbalance handling for explainability.")

# Cell 6: Fit Best Models

In [None]:
# notebooks/model_explainability.ipynb - Cell 6: Fit Best Models

if best_ecommerce_model is not None and ecommerce_X_train_resampled_explain is not None:
    print(f"\n--- Fitting {best_ecommerce_model_name} for E-commerce Explainability ---")
    best_ecommerce_model.fit(ecommerce_X_train_resampled_explain, ecommerce_y_train_resampled_explain)
    print(f"{best_ecommerce_model_name} for E-commerce fitted.")
else:
    print("E-commerce best model or resampled data not available for fitting.")

if best_creditcard_model is not None and creditcard_X_train_resampled_explain is not None:
    print(f"\n--- Fitting {best_creditcard_model_name} for Credit Card Explainability ---")
    best_creditcard_model.fit(creditcard_X_train_resampled_explain, creditcard_y_train_resampled_explain)
    print(f"{best_creditcard_model_name} for Credit Card fitted.")
else:
    print("Credit Card best model or resampled data not available for fitting.")

# Cell 7: SHAP Explanations for E-commerce Fraud

In [None]:
# notebooks/model_explainability.ipynb - Cell 7: SHAP Explanations for E-commerce Fraud

ecommerce_explainer = None
ecommerce_shap_values = None

if best_ecommerce_model is not None and ecommerce_X_test_processed is not None and ecommerce_feature_names is not None:
    print("\n--- Generating SHAP Explanations for E-commerce Model ---")
    # For KernelExplainer (e.g. for MLP), you might need a background dataset from X_train_processed
    # For TreeExplainer, X_data can be X_test_processed directly
    ecommerce_explainer, ecommerce_shap_values = explain_model_shap(
        best_ecommerce_model,
        ecommerce_X_test_processed, # Use test set for explanation
        feature_names=ecommerce_feature_names
    )

    if ecommerce_explainer is not None and ecommerce_shap_values is not None:
        print("\n--- E-commerce SHAP Summary Plot ---")
        plot_shap_summary(
            ecommerce_shap_values,
            ecommerce_feature_names,
            title=f"SHAP Summary Plot for E-commerce Fraud ({best_ecommerce_model_name})"
        )
    else:
        print("Failed to generate E-commerce SHAP values.")
else:
    print("E-commerce best model, test data, or feature names not available for SHAP.")

# Cell 8: SHAP Explanations for E-commerce Fraud

In [None]:
# notebooks/model_explainability.ipynb - Cell 8: SHAP Force Plot for E-commerce Fraud (Example)

if ecommerce_explainer is not None and ecommerce_shap_values is not None and ecommerce_X_test_processed is not None:
    print("\n--- E-commerce SHAP Force Plot (Example for a single prediction) ---")
    # Find an example of a fraudulent transaction in the test set (if any)
    fraud_indices = ecommerce_y_test[ecommerce_y_test == 1].index
    if not fraud_indices.empty:
        example_index = fraud_indices[0] # Take the first fraudulent example
        # Need to get the position of this index within the X_test_processed array
        # This is tricky because X_test_processed is a NumPy array/sparse matrix
        # and doesn't retain original DataFrame indices.
        # A simpler approach for demonstration is to pick a row directly from the processed test set:
        row_to_explain_idx = 0 # Example: first row in the processed test set
        print(f"Displaying force plot for row {row_to_explain_idx} (original index might differ).")

        plot_shap_force(
            ecommerce_explainer,
            ecommerce_shap_values,
            ecommerce_X_test_processed,
            ecommerce_feature_names,
            row_index=row_to_explain_idx, # Use the index within the processed array
            title=f"SHAP Force Plot for E-commerce Fraud ({best_ecommerce_model_name})"
        )
    else:
        print("No fraudulent transactions found in E-commerce test set to plot a force plot example.")
else:
    print("E-commerce SHAP explainer, values, or test data not available for force plot.")

# Cell 9: SHAP Explanations for Credit Card Fraud

In [None]:
# notebooks/model_explainability.ipynb - Cell 9: SHAP Explanations for Credit Card Fraud

creditcard_explainer = None
creditcard_shap_values = None

if best_creditcard_model is not None and creditcard_X_test_processed is not None and creditcard_feature_names is not None:
    print("\n--- Generating SHAP Explanations for Credit Card Model ---")
    creditcard_explainer, creditcard_shap_values = explain_model_shap(
        best_creditcard_model,
        creditcard_X_test_processed, # Use test set for explanation
        feature_names=creditcard_feature_names
    )

    if creditcard_explainer is not None and creditcard_shap_values is not None:
        print("\n--- Credit Card SHAP Summary Plot ---")
        plot_shap_summary(
            creditcard_shap_values,
            creditcard_feature_names,
            title=f"SHAP Summary Plot for Credit Card Fraud ({best_creditcard_model_name})"
        )
    else:
        print("Failed to generate Credit Card SHAP values.")
else:
    print("Credit Card best model, test data, or feature names not available for SHAP.")

# Cell 10: SHAP Explanations for Credit Card Fraud 

In [None]:
# notebooks/model_explainability.ipynb - Cell 10: SHAP Force Plot for Credit Card Fraud (Example)

if creditcard_explainer is not None and creditcard_shap_values is not None and creditcard_X_test_processed is not None:
    print("\n--- Credit Card SHAP Force Plot (Example for a single prediction) ---")
    fraud_indices = creditcard_y_test[creditcard_y_test == 1].index
    if not fraud_indices.empty:
        example_index = fraud_indices[0] # Take the first fraudulent example
        row_to_explain_idx = 0 # Example: first row in the processed test set
        print(f"Displaying force plot for row {row_to_explain_idx} (original index might differ).")

        plot_shap_force(
            creditcard_explainer,
            creditcard_shap_values,
            creditcard_X_test_processed,
            creditcard_feature_names,
            row_index=row_to_explain_idx,
            title=f"SHAP Force Plot for Credit Card Fraud ({best_creditcard_model_name})"
        )
    else:
        print("No fraudulent transactions found in Credit Card test set to plot a force plot example.")
else:
    print("Credit Card SHAP explainer, values, or test data not available for force plot.")