In [28]:
!pip install git+https://github.com/dsbowen/gshap.git

Collecting git+https://github.com/dsbowen/gshap.git
  Cloning https://github.com/dsbowen/gshap.git to /tmp/pip-req-build-kc3x8514
  Running command git clone --filter=blob:none --quiet https://github.com/dsbowen/gshap.git /tmp/pip-req-build-kc3x8514
  Resolved https://github.com/dsbowen/gshap.git to commit e5f32319eb810f98c091a674b3c4dd97fa04c3a6
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
!pip install datasets
import numpy as np
import pandas as pd
import shap
import gshap
from gshap.intergroup import IntergroupDifference
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# File names available under the hf://datasets/imodels/compas-recidivism/ path.
splits = dict(train='train.csv', test='test.csv')
# Only the training file is read into `df`.
df = pd.read_csv(f"hf://datasets/imodels/compas-recidivism/{splits['train']}")



In [None]:
# Keep only the rows flagged as African-American or Caucasian.
df = df.loc[
    df['race:African-American'].eq(1) | df['race:Caucasian'].eq(1)
].copy()

# Feature matrix without the target and without either race flag,
# plus the group indicator (1 = African-American) and the target itself.
X = df.drop(['is_recid', 'race:African-American', 'race:Caucasian'], axis=1)
group = df['race:African-American']
y = df['is_recid']

In [None]:
# Hold out 20% of the rows; features, target and group flag share one shuffle.
(X_train, X_test,
 y_train, y_test,
 group_train, group_test) = train_test_split(X, y, group, test_size=0.20, random_state=42)

# Learn the standardization from the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def build_mlp(input_dim):
    """Build and compile a small MLP for binary classification.

    Architecture: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) ->
    Dense(1, sigmoid), compiled with the Adam optimizer, binary
    cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input with an explicit Input object: passing
        # `input_shape` to the first layer is deprecated in recent Keras.
        tf.keras.Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Seed NumPy and TensorFlow before the model is built so that random weight
# initialization and the other random ops used in training are repeatable
# across Restart & Run All (same seed value as the train/test split).
np.random.seed(42)
tf.random.set_seed(42)

model = build_mlp(X_train.shape[1])

print("Model training...")
# Note: validation_data is the held-out test split, so the reported
# validation metrics are test-set metrics.
model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, validation_data=(X_test_scaled, y_test))
print("Model training completed!")

In [None]:
# Predicted probability for every held-out row, flattened to 1-D.
y_pred_test = model.predict(X_test_scaled).flatten()

# One row per test sample: readable race label plus the model's prediction.
race_labels = group_test.replace({1: 'Black', 0: 'White'}).to_numpy()
df_pred = pd.DataFrame({'race': race_labels, 'y_pred': y_pred_test})
df_pred = df_pred.assign(id=df_pred.index)

# Long format so seaborn can use the variable name as hue.
df_melt = pd.melt(df_pred, id_vars=['id', 'race'], value_vars=['y_pred'])

# Mean predicted probability per race group.
ax = sns.barplot(data=df_melt, x='race', y='value', hue='variable')
ax.set_ylabel('Predicted probability of recidivism')
ax.set_title('Model Predictions by Race (Black vs White)')
plt.tight_layout()
plt.show()

In [None]:
# Test features, group flag and predictions side by side; indexes are reset so
# the rows line up positionally with the prediction array.
df_corr = pd.concat([X_test.reset_index(drop=True), group_test.reset_index(drop=True)], axis=1)
df_corr['black'] = df_corr['race:African-American']
df_corr['y_pred'] = y_pred_test

corr_matrix = df_corr.corr()
corr_subset = (
    corr_matrix[['black', 'y_pred']]
    # 'black', its duplicate 'race:African-American' and 'y_pred' are the
    # reference variables, not features: their self-correlation is trivially 1
    # and would otherwise occupy the top of the ranking.
    .drop(index=['black', 'race:African-American', 'y_pred'])
    .abs()
    .sort_values(by='black', ascending=False)
    .head(20)
)
corr_subset = corr_subset.reset_index().rename(columns={'index': 'feature'})

# Long format: one row per (feature, reference variable) pair.
corr_melt = corr_subset.melt(id_vars='feature', value_vars=['black', 'y_pred'], var_name='variable', value_name='correlation')

sns.barplot(x='correlation', y='feature', hue='variable', data=corr_melt, palette='Set2')
plt.xlabel('Correlation')
plt.title('Correlation of Features with Race and Prediction (Black vs White)')
plt.tight_layout()
plt.show()

In [None]:
def predict_fn(x):
    """Return the trained model's inference-mode predictions as a flat NumPy array."""
    return model(x, training=False).numpy().flatten()


# First 100 test rows (with their group flags) are explained against the
# first 50 training rows as background — presumably to limit runtime.
sample_X = X_test_scaled[:100]
sample_group = group_test.iloc[:100]
background_X = X_train_scaled[:50]

# G-SHAP attribution of the relative mean difference in predictions between groups.
g = IntergroupDifference(group=sample_group, distance="relative_mean_distance")
explainer = gshap.KernelExplainer(predict_fn, background_X, g)
gshap_values = explainer.gshap_values(sample_X, nsamples=50)

df_importance = (
    pd.DataFrame({'Features': X.columns, 'G-SHAP': gshap_values})
    .sort_values('G-SHAP', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_importance.head(20), x='G-SHAP', y='Features', color='steelblue', ax=ax)
ax.set_title("Top 20 Features by G-SHAP Importance")
plt.tight_layout()
plt.show()

In [None]:
# Start the attack from a copy of the trained network:
# same architecture, then the learned weights are copied across.
attack_model = build_mlp(input_dim=X_train.shape[1])
attack_model.set_weights(model.get_weights())

# Adam optimizer with a learning rate of 0.001.
optimizer = optimizers.Adam(learning_rate=0.001)

def fool_gshap(model, background_X, sample_X, sample_group, epochs=50, batch_size=32, optimizer=None):
    """Fine-tune ``model`` so its mean prediction is the same for both groups.

    Every epoch performs one full-batch gradient step on ``sample_X`` that
    minimizes ``|mean(pred | group == 1) - mean(pred | group == 0)|``.
    Progress is printed every 5 epochs.

    Args:
        model: Keras model called as ``model(sample_X, training=True)``; its
            trainable variables are updated in place.
        background_X: Unused; kept so existing calls keep working.
        sample_X: Feature rows the loss is computed on.
        sample_group: Per-row group flag aligned with ``sample_X``
            (1 for one group, 0 for the other).
        epochs: Number of gradient steps.
        batch_size: Unused; each step uses all of ``sample_X``.
        optimizer: Optional Keras optimizer. When omitted, a fresh
            ``Adam(learning_rate=0.001)`` is created for this call, so
            repeated calls do not share optimizer state.

    Raises:
        ValueError: If ``sample_group`` has no rows equal to 1 or no rows
            equal to 0 (the mean of an empty group would make the loss NaN).
    """
    # The masks do not change between epochs, so build them once up front.
    group_flags = np.asarray(sample_group)
    black_mask = group_flags == 1
    white_mask = group_flags == 0
    if not black_mask.any() or not white_mask.any():
        raise ValueError("sample_group must contain at least one row for each group (values 1 and 0)")

    if optimizer is None:
        optimizer = optimizers.Adam(learning_rate=0.001)

    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            # training=True keeps the Dropout layer active during the update.
            preds = model(sample_X, training=True)
            # Flatten to one prediction per row so the masks index rows.
            preds = tf.reshape(preds, [-1])

            black_preds = tf.boolean_mask(preds, black_mask)
            white_preds = tf.boolean_mask(preds, white_mask)

            loss = tf.abs(tf.reduce_mean(black_preds) - tf.reduce_mean(white_preds))

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        if epoch % 5 == 0:
            print(f"Epoch {epoch}: Fooling Loss (mean diff) = {loss.numpy():.4f}")

# Run the attack on the copied model, using the same sample that G-SHAP explains.
print("\nG-SHAP fooling..")
fool_gshap(
    model=attack_model,
    background_X=background_X,
    sample_X=sample_X,
    sample_group=sample_group,
    epochs=50,
)
print("G-SHAP fooling completed")

In [None]:
def attack_predict_fn(x):
    """Return the attacked model's inference-mode predictions as a flat NumPy array."""
    return attack_model(x, training=False).numpy().flatten()


# Same G-SHAP setup as for the original model, now explaining the attacked model.
g = IntergroupDifference(group=sample_group, distance="relative_mean_distance")
explainer = gshap.KernelExplainer(attack_predict_fn, background_X, g)
gshap_values_fool = explainer.gshap_values(sample_X, nsamples=50)

df_importance_fool = (
    pd.DataFrame({'Features': X.columns, 'G-SHAP (Fool)': gshap_values_fool})
    .sort_values('G-SHAP (Fool)', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_importance_fool.head(20), x='G-SHAP (Fool)', y='Features', color='coral', ax=ax)
ax.set_title("Top 20 Features by Fooled G-SHAP Importance")
plt.tight_layout()
plt.show()

In [None]:
# Line up each feature's G-SHAP value before and after the attack.
comparison = pd.merge(df_importance, df_importance_fool, on="Features", how="inner")
comparison["G-SHAP Difference (Fool - Original)"] = comparison["G-SHAP (Fool)"] - comparison["G-SHAP"]
# Ascending sort puts the largest decreases first, so head(20) below shows them.
comparison = comparison.sort_values("G-SHAP Difference (Fool - Original)")

plt.figure(figsize=(12, 8))
sns.barplot(x="G-SHAP Difference (Fool - Original)", y="Features", data=comparison.head(20), palette="coolwarm")
plt.axvline(0, color='black', linestyle='--')
plt.title("Features with the Greatest Decrease in G-SHAP Value After Fooling")
plt.tight_layout()
plt.show()

# Summary: change in the mean G-SHAP value across all features.
mean_gshap_before = df_importance["G-SHAP"].mean()
mean_gshap_after = df_importance_fool["G-SHAP (Fool)"].mean()
mean_diff = mean_gshap_after - mean_gshap_before
# A zero baseline has no meaningful relative change: report NaN instead of
# dividing by zero.
if mean_gshap_before == 0:
    percent_change = float("nan")
else:
    percent_change = (mean_diff / abs(mean_gshap_before)) * 100

print(f"Average G-SHAP Change After Fooling: {mean_diff:.6f}")
# The percent sign follows the number.
print(f"Average Percentage Change in G-SHAP After Fooling: {percent_change:.2f}%")