# **Explainability**

#### Import necessary libraries

In [10]:
import shap
import joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, RobustScaler
import pandas as pd

#### Load your model

In [11]:
# Location of the persisted model pipeline on disk
model_path = 'pipeline_rf_rus_model.pkl'

# Restore the pipeline object from the file.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
pipeline_rf_rus = joblib.load(model_path)

# Retrieve the estimator stored under the pipeline's "model" step
model_step_name = "model"
rf_trained = pipeline_rf_rus.named_steps[model_step_name]

#### Load your train and test datasets

In [None]:
# Paths to the training and test CSV files.
# Replace both placeholders with your own files. They are kept in separate
# variables (and use distinct placeholder names) so the two datasets cannot
# accidentally be read from the same path.
train_file_path = 'your_train.csv'
test_file_path = 'your_test.csv'

# Read the CSV file with the train data into a DataFrame
X_train = pd.read_csv(train_file_path)

# Read the CSV file with the test data into a DataFrame
X_test = pd.read_csv(test_file_path)

#### One-Hot Encoding Categorical Features in Training and Test Sets

In [None]:
# Identify categorical columns in X_train by selecting non-numeric columns
# `select_dtypes(exclude=['number'])` selects every column whose dtype is not numeric
non_numeric_cols = X_train.select_dtypes(exclude=['number']).columns

# Initialize the OneHotEncoder
# handle_unknown="ignore": categories that appear only in the test set are encoded as all zeros
# sparse_output=False: the encoder returns a dense array instead of a sparse matrix
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)


def _encode_categoricals(frame, fitted_encoder, categorical_cols):
    """Return `frame` with `categorical_cols` replaced by their one-hot encoding.

    The remaining (numeric) columns are passed through unchanged and come first,
    followed by the one-hot columns named by `get_feature_names_out()`.
    The original row index is preserved so both parts align on concatenation.
    """
    one_hot = pd.DataFrame(
        fitted_encoder.transform(frame[categorical_cols]),
        columns=fitted_encoder.get_feature_names_out(),
        index=frame.index,
    )
    passthrough = frame.drop(columns=categorical_cols)
    return pd.concat([passthrough, one_hot], axis=1)


if len(non_numeric_cols) > 0:
    # Fit on the categorical columns of the training set only; previously the
    # encoder was fitted on the whole frame, which also one-hot encoded every
    # numeric column and left `non_numeric_cols` unused.
    encoder.fit(X_train[non_numeric_cols])

    # Apply the same fitted encoder to both sets (the test set is never used for fitting)
    X_train_encoded = _encode_categoricals(X_train, encoder, non_numeric_cols)
    X_test_encoded = _encode_categoricals(X_test, encoder, non_numeric_cols)
else:
    # Nothing to encode: the encoder cannot be fitted on zero columns
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

# NOTE(review): the resulting column order (numeric columns first, then one-hot
# columns) must match the feature order the model was trained on — confirm
# against the training pipeline.

#### Explaining Model Predictions with SHAP for a Sample of the Training Data

In [15]:
# Initialize a SHAP TreeExplainer for the trained model (rf_trained)
# It is used below to compute Shapley values for the model's predictions
explainer = shap.TreeExplainer(rf_trained)

# Sample a small subset (0.075%) of the encoded training data for SHAP value computation
# The sample size is clamped to at least one row (and at most the rows available),
# because `int(0.00075 * n_rows)` is 0 for small datasets and would give an empty sample
# `random_state=42` ensures reproducibility of the random sample
n_rows = len(X_train_encoded)
sample_size = min(n_rows, max(1, int(0.00075 * n_rows)))
X_sample = X_train_encoded.sample(n=sample_size, random_state=42)

# Compute Shapley values for the sampled rows with the explainer created above
# `check_additivity=False` skips SHAP's check that the values sum to the model output
shap_values = explainer(X_sample, check_additivity=False)

# Plot a summary of SHAP values for class index 1
# `shap_values.values[:, :, 1]` selects index 1 along the last axis
# (assumes the values array is samples x features x classes — TODO confirm for your SHAP version)
# `plot_size=None` keeps the figure size set here; the default "auto" would resize the figure
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values.values[:, :, 1], X_sample, plot_size=None)
