In [12]:
import numpy as np
import pandas as pd
import shap
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("hf://datasets/scikit-learn/adult-census-income/adult.csv")

df = df[['age', 'workclass', 'education', 'education.num', 'marital.status',
         'occupation', 'relationship', 'race', 'sex', 'capital.gain',
         'capital.loss', 'hours.per.week', 'native.country', 'income']]

df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)

y = df['income']
group = (df['sex'] == 'Male').astype(int)
X = df.drop(columns=['income', 'sex'])

X_encoded = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    X_encoded, y, group, test_size=0.20, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def build_mlp(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = build_mlp(X_train.shape[1])

print("Model training...")
model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, validation_data=(X_test_scaled, y_test))
print("Model training completed!")

Model training...
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8029 - loss: 0.4142 - val_accuracy: 0.8465 - val_loss: 0.3345
Epoch 2/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8494 - loss: 0.3292 - val_accuracy: 0.8503 - val_loss: 0.3261
Epoch 3/20
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8560 - loss: 0.3136 - val_accuracy: 0.8517 - val_loss: 0.3216
Epoch 4/20
[1m786/814[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8558 - loss: 0.3141

In [None]:
predict_fn = lambda x: model(x, training=False).numpy().flatten()

explainer = shap.KernelExplainer(predict_fn, X_train_scaled[:50])
shap_values = explainer.shap_values(X_test_scaled[:100], nsamples=50)

mean_shap = np.abs(shap_values).mean(axis=0)
df_shap_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Mean_SHAP': mean_shap
})

def simplify_feature(name):
    if '_' in name:
        return name.split('_')[0]
    else:
        return name

df_shap_importance['Simplified_Feature'] = df_shap_importance['Feature'].apply(simplify_feature)

df_simplified = df_shap_importance.groupby('Simplified_Feature').agg({'Mean_SHAP': 'mean'}).reset_index()
df_simplified = df_simplified.sort_values('Mean_SHAP', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Mean_SHAP', y='Simplified_Feature', data=df_simplified, palette='Blues_r')
plt.title("Main Feature Importance Based on Normal SHAP Values")
plt.xlabel("Mean SHAP Value")
plt.tight_layout()
plt.show()

In [None]:
attack_model = build_mlp(X_train.shape[1])
attack_model.set_weights(model.get_weights())

optimizer = optimizers.Adam(learning_rate=0.001)

def fool_shap(model, sample_X, sample_group, epochs=50):
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            preds = model(sample_X, training=True)
            preds = tf.squeeze(preds)

            male_preds = tf.boolean_mask(preds, sample_group == 1)
            female_preds = tf.boolean_mask(preds, sample_group == 0)

            male_mean = tf.reduce_mean(male_preds)
            female_mean = tf.reduce_mean(female_preds)

            loss = tf.abs(male_mean - female_mean)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        if epoch % 5 == 0:
            print(f"Epoch {epoch}: Fooling Loss = {loss.numpy():.4f}")

print("\nNormal SHAP fooling starting...")
fool_shap(attack_model, X_test_scaled[:100], group_test[:100], epochs=50)
print("Fooling completed!")

In [None]:
explainer_fool = shap.KernelExplainer(lambda x: attack_model(x, training=False).numpy().flatten(), X_train_scaled[:50])
shap_values_fool = explainer_fool.shap_values(X_test_scaled[:100], nsamples=50)

mean_shap_fool = np.abs(shap_values_fool).mean(axis=0)
df_shap_importance_fool = pd.DataFrame({
    'Feature': X_train.columns,
    'Mean_SHAP_Fool': mean_shap_fool
})

df_shap_importance_fool['Simplified_Feature'] = df_shap_importance_fool['Feature'].apply(simplify_feature)

df_simplified_fool = df_shap_importance_fool.groupby('Simplified_Feature').agg({'Mean_SHAP_Fool': 'mean'}).reset_index()
df_simplified_fool = df_simplified_fool.sort_values('Mean_SHAP_Fool', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Mean_SHAP_Fool', y='Simplified_Feature', data=df_simplified_fool, palette='Reds_r')
plt.title("Original SHAP-Based Feature Importance (Post-Fooling)")
plt.xlabel("Mean SHAP Value (Fool)")
plt.tight_layout()
plt.show()

In [None]:
mean_shap_before = df_simplified["Mean_SHAP"].mean()

mean_shap_after = df_simplified_fool["Mean_SHAP_Fool"].mean()

mean_diff = mean_shap_after - mean_shap_before

percent_change = (mean_diff / abs(mean_shap_before)) * 100

print(f"Average SHAP Change After Fooling: {mean_diff:.6f}")
print(f"Average Percentage Change in SHAP After Fooling: %{percent_change:.2f}")