<a href="https://colab.research.google.com/github/sherylmatthew/fairness-aware-RLHF/blob/main/Fairness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import tensorflow as tf
import xgboost as xgb
from dowhy import CausalModel
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from tensorflow.keras.optimizers import Adam

# ✅ Upload Dataset (for Google Colab)
# `files.upload()` opens Colab's upload widget; the result is used here as a
# mapping whose keys are the uploaded file names.
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # An empty result (e.g. the upload was cancelled) would otherwise surface
    # as an opaque IndexError on the file-name lookup below.
    raise RuntimeError("No file was uploaded; upload the dataset CSV and re-run this cell.")

# Only the first uploaded file is used; any additional files are ignored.
file_name = next(iter(uploaded))
data = pd.read_csv(file_name)

# ✅ Data Preprocessing Function
def preprocess_data(df):
    """Return a cleaned, fully numeric copy of ``df``.

    The input frame is never modified. Two passes are applied to the copy:

    * every ``object``-dtype column is cast to ``str`` and label-encoded to
      integer codes (a fresh ``LabelEncoder`` per column; the encoders are
      not retained);
    * every column that was numeric on entry is coerced with
      ``pd.to_numeric(errors='coerce')`` and its missing values are replaced
      by that column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.
    """
    df = df.copy()
    # Both column lists are captured up front, so the freshly encoded
    # categorical columns are not picked up by the numeric pass below.
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()

    for col in categorical_cols:
        # Casting to str first means missing values get their own category
        # instead of making the encoder fail on mixed types.
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place form is chained assignment on an intermediate object,
        # emits a FutureWarning, and stops updating `df` in pandas 3.0.
        df[col] = df[col].fillna(df[col].median())

    return df

# ✅ Apply Preprocessing
data = preprocess_data(data)

# ✅ Define Features & Target
# The target column is dropped from the feature matrix along with the listed
# identifier/location columns; `errors='ignore'` already tolerates columns
# that are absent, so no manual filtering is needed.
columns_to_drop = ['Customer ID', 'Purchase Amount (USD)', 'Location']
X = data.drop(columns=columns_to_drop, errors='ignore')
y = data['Purchase Amount (USD)']

# ✅ Polynomial Features
# interaction_only=True adds pairwise products but no squared terms.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)
feature_names = poly.get_feature_names_out(X.columns)

# ✅ Train/Test Split
# Split BEFORE scaling so that the scaler never sees test rows. With the same
# number of rows and the same random_state the row partition is unchanged.
X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

# ✅ Standardization
# Fit the mean/variance on the training rows only, then apply that same
# transform to the test rows; fitting on the full matrix leaks test-set
# statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_poly)
X_test = scaler.transform(X_test_poly)
# Full matrix scaled with the train-fitted scaler, kept for any code that
# still refers to `X_scaled`.
X_scaled = scaler.transform(X_poly)

# ✅ XGBoost Models
xgb_model1 = xgb.XGBRegressor(n_estimators=150, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model1.fit(X_train, y_train)

xgb_model2 = xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=4, subsample=0.8, random_state=42)
xgb_model2.fit(X_train, y_train)

# ✅ Voting Regressor
# Averages the predictions of the two XGBoost configurations.
voting_regressor = VotingRegressor([
    ('xgb1', xgb_model1),
    ('xgb2', xgb_model2)
])
voting_regressor.fit(X_train, y_train)

# ✅ Stacked Features for Neural Network
# The meta-learner must be trained on predictions the base models made for
# rows they were NOT fitted on. Predicting X_train with models fitted on
# X_train gives over-optimistic meta-features that do not match what the
# network sees at test time. cross_val_predict fits clones on each fold, so
# the fitted `xgb_model1` / `voting_regressor` above are left untouched.
STACKING_FOLDS = 5
stacked_train = np.column_stack((
    cross_val_predict(xgb_model1, X_train, y_train, cv=STACKING_FOLDS),
    cross_val_predict(voting_regressor, X_train, y_train, cv=STACKING_FOLDS),
))
# Test rows were never used for fitting, so the fully fitted models are used.
stacked_test = np.column_stack((xgb_model1.predict(X_test), voting_regressor.predict(X_test)))

# ✅ Neural Network Model (Lightweight)
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Small MLP meta-learner on top of the stacked base-model predictions.
# An explicit Input layer replaces `input_shape=` on the first Dense layer,
# which Keras 3 deprecates for Sequential models.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(stacked_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.compile(optimizer=Adam(learning_rate=0.0003), loss='mse', metrics=['mae'])

# ✅ Train NN with Early Stopping
# Training stops after 2 epochs without val_loss improvement and the best
# weights seen so far are restored.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(stacked_train, y_train, epochs=15, batch_size=64, validation_split=0.2, callbacks=[early_stop], verbose=1)

# ✅ Evaluation
# Score the stacked model on the held-out test rows with standard regression
# metrics. The former "Accuracy" figure (85 + 10 * R²) was removed: it was a
# made-up number with a built-in 85% floor, not a measurement, and accuracy
# is not defined for a regression target.
y_pred = model.predict(stacked_test)
mse = mean_squared_error(y_test, y_pred)
rmse = float(np.sqrt(mse))  # same units as the target, easier to read than MSE
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n📊 Results:")
print(f"✅ MSE: {mse:.3f}")
print(f"✅ RMSE: {rmse:.3f}")
print(f"✅ MAE: {mae:.3f}")
print(f"✅ R²: {r2:.2f}")

# ✅ Causal Inference (Promo Code Impact)
# Estimates the effect of the promo-code column on the purchase amount with
# DoWhy, adjusting for the two listed common causes via propensity score
# matching. The analysis is skipped when any of the needed columns is absent.
promo_treatment = 'Promo Code Used'
promo_common_causes = ['Previous Purchases', 'Review Rating']

if not {promo_treatment, *promo_common_causes}.issubset(data.columns):
    print("\n⚠️ Skipping causal analysis: Required columns not found.")
else:
    causal_model = CausalModel(
        data=data,
        treatment=promo_treatment,
        outcome='Purchase Amount (USD)',
        common_causes=promo_common_causes,
    )
    identified_estimand = causal_model.identify_effect()
    estimate = causal_model.estimate_effect(
        identified_estimand,
        method_name="backdoor.propensity_score_matching",
    )
    print(f"\n🔍 Estimated Causal Effect of Promo Code: {estimate.value:.2f} USD")

# ✅ SHAP Explanations (Vertical Feature Importance)

# ✅ SHAP Beeswarm Plot (Classic Vertical Summary)

# Create SHAP-compatible model prediction function
def model_predict(X_input):
    """Run the full stacked pipeline on already-transformed features.

    Wraps the two-stage model as a single callable so SHAP can treat it as a
    black box: the XGBoost model and the voting regressor each predict from
    ``X_input``, and their two prediction columns are fed to the neural
    network.

    Parameters
    ----------
    X_input : array-like
        Feature rows in the same space the base models were trained on
        (polynomial-expanded and scaled).

    Returns
    -------
    numpy.ndarray
        Flattened 1-D array of the network's predictions, one per input row.
    """
    xgb1_pred = xgb_model1.predict(X_input)
    voting_pred = voting_regressor.predict(X_input)
    stacked_input = np.column_stack((xgb1_pred, voting_pred))
    # verbose=0: SHAP calls this function many times per explained row, and
    # the default Keras progress bar would flood the cell output on each call.
    return model.predict(stacked_input, verbose=0).flatten()

# Explain a small slice only: the permutation explainer re-evaluates the whole
# stacked pipeline many times per row, so the cost grows quickly.
N_SHAP_SAMPLES = 70
shap_background = X_train[:N_SHAP_SAMPLES]   # background data for the explainer
shap_samples = X_test[:N_SHAP_SAMPLES]       # rows whose predictions are explained

explainer = shap.Explainer(model_predict, shap_background)

# The permutation explainer rejects max_evals below 2 * num_features + 1.
# Derive the bound from the data so it stays valid when the number of
# (polynomial) features changes, instead of hard-coding one run's value.
shap_max_evals = 2 * shap_samples.shape[1] + 1
shap_values = explainer(shap_samples, max_evals=shap_max_evals)

# ✅ Classic vertical beeswarm SHAP plot
shap.summary_plot(
    shap_values,
    features=shap_samples,        # the same rows the SHAP values were computed for
    feature_names=feature_names,  # names produced by PolynomialFeatures
    plot_type="dot"               # the default, stated explicitly for clarity
)

Saving preprocessed_shopping_data.csv to preprocessed_shopping_data (3).csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Epoch 1/15
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.9746 - mae: 0.8516 - val_loss: 0.7494 - val_mae: 0.7427
Epoch 2/15
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7327 - mae: 0.7306 - val_loss: 0.5269 - val_mae: 0.5989
Epoch 3/15
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.5088 - mae: 0.5858 - val_loss: 0.3509 - val_mae: 0.4751
Epoch 4/15
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3398 - mae: 0.4676 - val_loss: 0.2715 - val_mae: 0.4145
Epoch 5/15
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.2891 - mae: 0.4246 - val_loss: 0.2423 - val_mae: 0.3886
Epoch 6/15
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.2437 - mae: 0.3915 - val_loss: 0.2321 - val_mae: 0.3789
Epoch 7/15
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.2511 

PermutationExplainer explainer:   1%|▏         | 1/70 [00:00<?, ?it/s]

[1m534/534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


PermutationExplainer explainer:   4%|▍         | 3/70 [00:33<09:07,  8.18s/it]

[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


PermutationExplainer explainer:   6%|▌         | 4/70 [00:51<13:39, 12.41s/it]

[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


PermutationExplainer explainer:   7%|▋         | 5/70 [01:07<14:41, 13.56s/it]

[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m477/477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


PermutationExplainer explainer:   9%|▊         | 6/70 [01:23<17:49, 16.71s/it]


ValueError: max_evals=2999 is too low for the Permutation explainer, it must be at least 2 * num_features + 1 = 3009!

In [None]:
%pip install dowhy

Collecting dowhy
  Downloading dowhy-0.12-py3-none-any.whl.metadata (18 kB)
Collecting causal-learn>=0.1.3.0 (from dowhy)
  Downloading causal_learn-0.1.4.3-py3-none-any.whl.metadata (4.6 kB)
Collecting cython<3.0 (from dowhy)
  Downloading Cython-0.29.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Collecting momentchi2 (from causal-learn>=0.1.3.0->dowhy)
  Downloading momentchi2-0.1.8-py3-none-any.whl.metadata (6.1 kB)
Downloading dowhy-0.12-py3-none-any.whl (398 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m398.4/398.4 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading causal_learn-0.1.4.3-py3-none-any.whl (192 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.0/193.0 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Cython-0.29.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━