Step 1: Install SHAP

In [None]:
!pip install shap

Step 2: Load Model and Training Data

In [None]:
import shap
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split

# --- Configuration ---
# Everything a reader may need to tune lives here instead of being buried
# in the calls below. Paths are relative to the notebook's working directory.
MODEL_PATH = 'random_forest_model.pkl'
DATA_PATH = 'balanced_dataset.csv'
TARGET_COLUMN = 'Substances_Used'
# Columns removed from the feature matrix: the target itself and
# 'substances_used_label' (presumably an encoded copy of the target - confirm).
NON_FEATURE_COLUMNS = ['Substances_Used', 'substances_used_label']
TEST_SIZE = 0.2
RANDOM_STATE = 42  # must match the seed used when the model was trained

# --- Load Your Model ---
# NOTE: joblib files are pickle-based, and unpickling can execute arbitrary
# code. Only load model files that come from a source you trust.
model = joblib.load(MODEL_PATH)

# --- Load Your Data ---
# We need the training data (X) to create the explainer.
# Load the *balanced* dataset you used for training.
df = pd.read_csv(DATA_PATH)

# --- Recreate your Train/Test Split ---
# This yields an 'X_train' that SHAP can use as a reference (background) set.

# Define features (X) and target (y)
X = df.drop(columns=NON_FEATURE_COLUMNS)
y = df[TARGET_COLUMN]

# Make sure X has exactly the columns (and column order) the model was fitted
# on. scikit-learn estimators fitted on a DataFrame expose them through
# `feature_names_in_`; tree explainers address features by position, so a
# mismatch would otherwise produce misleading attributions.
model_features = getattr(model, 'feature_names_in_', None)
if model_features is not None:
    missing_features = [col for col in model_features if col not in X.columns]
    if missing_features:
        # Leave X untouched rather than guessing; make the mismatch visible.
        print(f"WARNING: dataset is missing model features: {missing_features}")
    else:
        X = X[list(model_features)]

# Use the same random_state (and stratification) as during training so the
# split reproduces the original train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

print("Model, SHAP, and Data are ready.")

Step 3: Create the SHAP Explainer

In [None]:
# Enable SHAP's JavaScript-based visualizations for this notebook session.
shap.initjs()

# Build the tree explainer. The training features are handed over as the
# reference (background) data set the explainer works against.
background_data = X_train
explainer = shap.TreeExplainer(model, data=background_data)

# Compute SHAP values for every row of the held-out *test* set.
# Depending on data size this step may take a while.
shap_values = explainer.shap_values(X_test)

print("SHAP values calculated.")

Step 4: Global Interpretability (Which features matter most overall?)

In [None]:
# We explain class '1' ("Yes"). Adjust POSITIVE_CLASS if your positive class
# has a different index.
POSITIVE_CLASS = 1

# The return format of `explainer.shap_values` depends on the SHAP version:
#   * older releases: a list with one (n_samples, n_features) array per class
#   * newer releases: one array of shape (n_samples, n_features, n_classes)
# Blindly indexing with [1] on the newer format would select *sample* 1
# instead of *class* 1, so pick the class slice explicitly.
if isinstance(shap_values, list):
    shap_values_positive = shap_values[POSITIVE_CLASS]
elif getattr(shap_values, "ndim", 2) == 3:
    shap_values_positive = shap_values[:, :, POSITIVE_CLASS]
else:
    # Single-output explainer: the values are already 2-D.
    shap_values_positive = shap_values

# This cell previously called Streamlit (`st.*`), which is never imported in
# this notebook and does not render inside Jupyter. Use plain notebook output
# and let SHAP display the figure itself (show=True is the default).
print("Global Feature Importance")
print("Which features have the most impact on the prediction?")

# Bar chart of the mean absolute SHAP value per feature
# (drop plot_type="bar" to get the beeswarm variant instead).
shap.summary_plot(shap_values_positive, X_test, plot_type="bar")

Step 5: Local Interpretability (Why did this one person get this score?)

In [None]:
# Explain the prediction for one person from the test set.
# Change `row_index` to inspect a different row; class 1 ("Yes") is explained.
row_index = 0
positive_class = 1

# Double brackets keep the selection a one-row DataFrame.
X_sample = X_test.iloc[[row_index]]

# SHAP values for this single sample. The return format depends on the SHAP
# version (list of per-class arrays vs. one array with a trailing class
# axis), so select the class slice explicitly instead of indexing with [1].
sample_shap = explainer.shap_values(X_sample)
if isinstance(sample_shap, list):
    shap_values_sample = sample_shap[positive_class]
elif getattr(sample_shap, "ndim", 2) == 3:
    shap_values_sample = sample_shap[:, :, positive_class]
else:
    # Single-output explainer: already (1, n_features).
    shap_values_sample = sample_shap

# The model's base value (the average prediction): one entry per class for
# multi-output explainers, a plain scalar otherwise.
expected_value = explainer.expected_value
if hasattr(expected_value, "__len__"):
    base_value = expected_value[positive_class]
else:
    base_value = expected_value

# This cell previously called Streamlit (`st.*`), which is never imported in
# this notebook and does not render inside Jupyter; print instead.
print("---")
print("Explaining Prediction for a Single User")

# Bundle everything the waterfall plot needs. The feature values are passed
# as a plain array so the plot can index them by position regardless of the
# DataFrame's column labels.
explanation = shap.Explanation(
    values=shap_values_sample[0],
    base_values=base_value,
    data=X_sample.iloc[0].to_numpy(),
    feature_names=X_test.columns.tolist()
)

# Waterfall plot; SHAP shows the figure itself (show=True is the default).
shap.waterfall_plot(explanation)