In [1]:
import sage
from sklearn.model_selection import train_test_split


# Load the credit dataset shipped with the sage package.
df = sage.datasets.credit()

# Every column except the last is treated as a feature; the last column is
# the label (it is sliced off as the target further down).
feature_names = df.columns.tolist()[:-1]

# Columns handed to CatBoost as categorical features.
# NOTE(review): 'Credit Amount' looks like a numeric quantity but is listed
# as categorical here -- confirm this is intended.
categorical_columns = [
    'Checking Status',
    'Credit History',
    'Purpose',
    'Credit Amount',
    'Savings Account/Bonds',
    'Employment Since',
    'Personal Status',
    'Debtors/Guarantors',
    'Property Type',
    'Other Installment Plans',
    'Housing Ownership',
    'Job',
    'Telephone',
    'Foreign Worker',
]
categorical_inds = list(map(feature_names.index, categorical_columns))

# Carve out test and validation sets, each sized at 10% of the full dataset.
# The same seed is used for both splits so the partition is reproducible.
holdout_size = int(0.1 * len(df.values))
train, test = train_test_split(df.values, test_size=holdout_size, random_state=0)
train, val = train_test_split(train, test_size=holdout_size, random_state=0)

# Separate the integer labels (last column) from the feature columns.
Y_train, Y_val, Y_test = (
    split[:, -1].copy().astype(int) for split in (train, val, test)
)
train, val, test = (split[:, :-1].copy() for split in (train, val, test))

In [2]:
import os

from catboost import CatBoostClassifier


model_filename = "../credit_model.cbm"

# Train and cache the classifier only when no saved model exists yet;
# otherwise reuse the saved one so the notebook re-runs quickly.
if not os.path.isfile(model_filename):
    model = CatBoostClassifier(iterations=50, learning_rate=0.3, depth=3)
    model = model.fit(
        train,
        Y_train,
        cat_features=categorical_inds,
        eval_set=(val, Y_val),
        verbose=False,
    )
    model.save_model(model_filename)
else:
    model = CatBoostClassifier()
    model.load_model(model_filename)

In [3]:
import numpy as np
from sklearn.metrics import log_loss


# Baseline predictor: the empirical class frequencies of the training labels,
# ordered by sorted class value.
class_counts = np.unique(Y_train, return_counts=True)[1]
p = class_counts / len(Y_train)

# Cross entropy on the test set when every row is predicted with the base
# rates, versus the cross entropy of the fitted model's probabilities.
base_rate_pred = np.tile(p, (len(test), 1))
base_ce = log_loss(Y_test.astype(int), base_rate_pred)
ce = log_loss(Y_test.astype(int), model.predict_proba(test))

print('Base rate cross entropy = {:.3f}'.format(base_ce))
print('Model cross entropy = {:.3f}'.format(ce))

Base rate cross entropy = 0.602
Model cross entropy = 0.457


In [4]:
from sklearn.metrics import roc_auc_score

# Rank test rows by the second predict_proba column (probability of class 1)
# and report the area under the ROC curve.
positive_scores = model.predict_proba(test)[:, 1]
test_auc = roc_auc_score(Y_test, positive_scores)
print("ROC AUC Score = {:.3f}".format(test_auc))

ROC AUC Score = 0.849


In [5]:
from sklearn.metrics import confusion_matrix

# Hard class predictions: the class with the highest predicted probability.
pred_test = np.argmax(model.predict_proba(test), axis=1)

# Flatten the 2x2 confusion matrix (label order fixed to [0, 1]) into its
# four cells.
tn, fp, fn, tp = confusion_matrix(Y_test, pred_test, labels=[0, 1]).ravel()

# False negative rate = FN / (TP + FN); reported as 0 when the test set
# contains no positive examples, which avoids a division by zero.
p = tp + fn
if p > 0.0:
    fnr = fn / p
else:
    fnr = np.float64(0.0)

print("False negative rate = {:.3f}".format(fnr))

False negative rate = 0.042


In [6]:
# Turn the false negative rate back into a whole number of positive test
# examples by scaling it with the sum of the test labels.
n_missed_positives = int(round(Y_test.sum() * fnr))
print("Classifier got {} positive examples wrong".format(n_missed_positives))

Classifier got 3 positive examples wrong


# Global FPR

In [8]:
# Setup and calculate with custom fairness-related loss function
imputer = sage.MarginalImputer(model, train[:512])
estimator_fpr = sage.PermutationEstimator(imputer, 'fpr')
%time sage_values_fpr = estimator_fpr(test, Y_test, verbose=True)

# Print results
print("SAGE values using false positive rate as loss:", sage_values_fnr)

  0%|          | 0/1 [00:00<?, ?it/s]

fn = 0, p = 366, fnr = 0.0
===> Iteration 0, Prev loss = 0.0 [outer loop]
fn = 0, p = 366, fnr = 0.0
	Loss = 0.0 [inner loop]
fn = 0, p = 366, fnr = 0.0
	Loss = 0.0 [inner loop]
fn = 0, p = 366, fnr = 0.0
	Loss = 0.0 [inner loop]
fn = 1, p = 366, fnr = 0.00273224043715847
	Loss = 0.00273224043715847 [inner loop]
fn = 1, p = 366, fnr = 0.00273224043715847
	Loss = 0.00273224043715847 [inner loop]
fn = 1, p = 366, fnr = 0.00273224043715847
	Loss = 0.00273224043715847 [inner loop]
fn = 2, p = 366, fnr = 0.00546448087431694
	Loss = 0.00546448087431694 [inner loop]
fn = 1, p = 366, fnr = 0.00273224043715847
	Loss = 0.00273224043715847 [inner loop]
fn = 2, p = 366, fnr = 0.00546448087431694
	Loss = 0.00546448087431694 [inner loop]
fn = 2, p = 366, fnr = 0.00546448087431694
	Loss = 0.00546448087431694 [inner loop]
fn = 4, p = 366, fnr = 0.01092896174863388
	Loss = 0.01092896174863388 [inner loop]
fn = 4, p = 366, fnr = 0.01092896174863388
	Loss = 0.01092896174863388 [inner loop]
fn = 6, p = 36

KeyboardInterrupt: 

NameError: name 'sage_values_fnr' is not defined

In [None]:
# Plot the global SAGE values, labelled with the feature names.
sage_values_fpr.plot(
    feature_names,
    title='Feature Importance with respect to Global False Positive Rate',
)

# Young group FPR

In [None]:
# Column position of the "Age" feature; used below to index the feature
# arrays when splitting rows into age groups.
age_col = feature_names.index("Age")

In [None]:
# Boolean row masks selecting the "young" group: age of 30 or below.
where_young_train = train[:, age_col] <= 30
where_young_test = test[:, age_col] <= 30

# Test features and labels restricted to the young group.
test_young, Y_test_young = test[where_young_test], Y_test[where_young_test]

In [None]:
# Setup and calculate with custom fairness-related loss function
imputer_young = sage.MarginalImputer(model, train[where_young_train])
estimator_young_fpr = sage.PermutationEstimator(imputer_young, 'fpr')
%time sage_values_young_fpr = estimator_young_fpr(test_young, Y_test_young, verbose=True)

# Print results
print("SAGE values using false positive rate as loss:", sage_values_young_fpr)

In [None]:
# Plot the young-group SAGE values, labelled with the feature names.
sage_values_young_fpr.plot(
    feature_names,
    title='Feature Importance with respect to Young Group\'s False Positive Rate',
)

# Old group FPR

In [None]:
# Boolean row masks selecting the "old" group: age strictly above 30.
# Both masks are computed directly from the age column so that train and
# test are filtered by the same rule and this cell only depends on
# `age_col`, not on the young-group masks having been built first.
where_old_train = train[:, age_col] > 30
where_old_test = test[:, age_col] > 30

# Test features and labels restricted to the old group.
test_old = test[where_old_test]
Y_test_old = Y_test[where_old_test]

In [None]:
# Setup and calculate with custom fairness-related loss function
imputer_old = sage.MarginalImputer(model, train[where_old_train])
estimator_old_fpr = sage.PermutationEstimator(imputer_old, 'fpr')
%time sage_values_old_fnr = estimator_old_fr(test_old, Y_test_old, verbose=True)

# Print results
print("SAGE values using false negative rate as loss:", sage_values_old_fnr)

In [None]:
# Plot the old-group SAGE values, labelled with the feature names.
# The values were estimated with the 'fpr' loss, so the title says
# "False Positive Rate" (the variable name is a leftover from an FNR version).
sage_values_old_fnr.plot(
    feature_names,
    title='Feature Importance with respect to Old Group\'s False Positive Rate',
)