# CTGAN as a sampling technique for LIME and SHAP

* Fooling LIME and SHAP: https://github.com/dylan-slack/Fooling-LIME-SHAP/blob/master/COMPAS_Example.ipynb
* CTGAN: https://github.com/sdv-dev/CTGAN

### Progress

* If CTGAN is trained properly, we can identify about 43% of biased predictions (out of 1500 test samples from COMPAS)

In [1]:
import os

import torch
import sys
torch.manual_seed(42)

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from copy import deepcopy
sys.path.append('/experiments/Fooling-LIME-SHAP/')

from adversarial_models import Adversarial_Lime_Model
from utils import one_hot_encode
import pandas as pd
import lime
import lime.lime_tabular

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

sys.path.append('/experiments/CTGAN')

from ctgan import load_demo
from tqdm import tqdm

data = load_demo()
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [2]:
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income'
]

In [3]:
# from ctgan import CTGANSynthesizer

# ctgan = CTGANSynthesizer(batch_size=500, 
#                          gen_dim=[256,256],
#                          dis_dim=[256,256],
#                          l2scale=0, gen_lr=2e-8, dis_lr=2e-8)
# ctgan.fit(data, discrete_columns, epochs=30)

In [4]:
# ctgan.sample(1000)

# Always getting nans on adult dataset, try with the COMPAS dataset

In [5]:
import sys
sys.path.append('../experiments/')
from dataset_utils import get_and_preprocess_compas_data

data = get_and_preprocess_compas_data()

In [6]:
X, y, cols = data['data'], data['target'], data['cols']
cols

In [7]:
print(X.shape)
print(y.shape)

(6172, 9)
(6172,)


In [8]:
X

Unnamed: 0_level_0,age,two_year_recid,priors_count,length_of_stay,c_charge_degree_F,c_charge_degree_M,sex_Female,sex_Male,race
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,69,0,0,0,1,0,0,1,0
3,34,1,0,10,1,0,0,1,1
4,24,1,4,1,1,0,0,1,1
7,44,0,0,1,0,1,0,1,0
8,41,1,14,6,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
10996,23,0,0,1,1,0,0,1,1
10997,23,0,0,1,1,0,0,1,1
10999,57,0,0,1,1,0,0,1,0
11000,33,0,3,1,0,1,1,0,1


# Create an explainer around the ctgan sampler

In [92]:
sys.path.append('../faster_lime/')

def ridge_solve(tup):
    data_synthetic_onehot, model_pred, weights = tup
    solver = Ridge(alpha=1, fit_intercept=True)
    solver.fit(data_synthetic_onehot,
               model_pred,
               sample_weight=weights.ravel())
    
    # Get explanations
    importance = solver.coef_[
        data_synthetic_onehot[0].toarray().ravel() == 1].ravel()
    return importance

class NumpyTabularExplainer:

    def __init__(self, training_data, ctgan_sampler, feature_names=None,
                 categorical_feature_idxes=None,
                 qs=[25, 50, 75], **kwargs):
        """
        Args:
            training_data:
            feature_names:
            categorical_feature_idxes:
            qs:
            **kwargs:

        Assumptions:
            * Data only contains categorical and/or numerical data
            * Categorical data is already converted to ordinal labels (e.g. via scikit-learn's
                OrdinalEncoder)

        """
        self.training_data = training_data
        self.num_features = self.training_data.shape[1]
        self.ctgan_sampler = ctgan_sampler

        # Parse columns
        if feature_names is not None:
            # TODO input validation
            self.feature_names = list(feature_names)
        else:
            self.feature_names = list(range(self.num_features))
        self.categorical_feature_idxes = categorical_feature_idxes
        if self.categorical_feature_idxes:
            self.categorical_features = [self.feature_names[i] for i in
                                         self.categorical_feature_idxes]
            self.numerical_features = list(set(self.feature_names) - set(self.categorical_features))
            self.numerical_feature_idxes = [idx for idx in range(self.num_features) if
                                            idx not in self.categorical_feature_idxes]
        else:
            self.categorical_features = []
            self.numerical_features = self.feature_names
            self.numerical_feature_idxes = list(range(self.num_features))

        # Some book-keeping: keep track of the original indices of each feature
        self.dict_feature_to_idx = {feature: idx for (idx, feature) in
                                    enumerate(self.feature_names)}
        self.list_reorder = [self.dict_feature_to_idx[feature] for feature in
                             self.numerical_features + self.categorical_features]

        # Get training data statistics
        # Numerical feature statistics
        if self.numerical_features:
            training_data_num = self.training_data[:, self.numerical_feature_idxes]
            self.sc = StandardScaler(with_mean=False)
            self.sc.fit(training_data_num)
            self.qs = qs
            self.all_bins_num = np.percentile(training_data_num, self.qs, axis=0).T

        # Categorical feature statistics
        if self.categorical_features:
            training_data_cat = self.training_data[:, self.categorical_feature_idxes]
            self.dict_categorical_hist = {
                feature: np.bincount(training_data_cat[:, idx]) / self.training_data.shape[0] for
                (idx, feature) in enumerate(self.categorical_features)
            }

    def kernel_fn(self, distances, kernel_width):
        return np.sqrt(np.exp(-(distances ** 2) / kernel_width ** 2))

    def discretize(self, X, qs=[25, 50, 75], all_bins=None):
        if all_bins is None:
            all_bins = np.percentile(X, qs, axis=0).T
        return (np.array([np.digitize(a, bins)
                          for (a, bins) in zip(X.T, all_bins)]).T, all_bins)
    
    def explain_instance(self, data_row, predict_fn, num_estimators=10, label=0, num_samples=5000, num_features=10,
                         kernel_width=None, **kwargs):
        # Scale the data
        data_row = data_row.reshape((1, -1))
        
        # Sample data using the CTGAN
        data_samples = None
        while data_samples is None:
            try:
                data_samples = self.ctgan_sampler.sample(int(num_estimators) * int(num_samples)).values
            except:
                data_samples = None
                
        for batch_idx in range(num_estimators):
            data_samples[batch_idx * num_samples] = data_row.ravel()

        # Split data into numerical and categorical data and process
        list_orig = []
        list_disc = []
        if self.numerical_features:
            data_num_synthetic = data_samples[:, self.numerical_feature_idxes]
            # Discretize
            data_synthetic_num_disc, _ = self.discretize(data_num_synthetic, self.qs,
                                                         self.all_bins_num)
            list_disc.append(data_synthetic_num_disc)
            list_orig.append(data_num_synthetic)

        if self.categorical_features:
            # Sample from training distribution for each categorical feature
            data_cat_synthetic = data_samples[:,self.categorical_feature_idxes]
            list_disc.append(data_cat_synthetic)
            list_orig.append(data_cat_synthetic)

        # Concatenate the data and reorder the columns
        data_synthetic_original = np.concatenate(list_orig, axis=1)
        data_synthetic_disc = np.concatenate(list_disc, axis=1)
        data_synthetic_original = data_synthetic_original[:, self.list_reorder]
        data_synthetic_disc = data_synthetic_disc[:, self.list_reorder]

        # Get model predictions (i.e. groundtruth)
        model_pred = predict_fn(data_synthetic_original)

        # Get distances between original sample and neighbors
#         if self.numerical_features:
#             distances = cdist(data_num_synthetic[:1], data_num_synthetic).reshape(-1, 1)
#         else:
        distances = cdist(data_synthetic_disc[:1], data_synthetic_disc).reshape(-1, 1)

        # Weight distances according to some kernel (e.g. Gaussian)
        if kernel_width is None:
            kernel_width = np.sqrt(data_row.shape[1]) * 0.75
        weights = self.kernel_fn(distances, kernel_width=kernel_width).ravel()

        # Turn discretized data into onehot
        data_synthetic_onehot = OneHotEncoder().fit_transform(data_synthetic_disc)

        batch_size = num_samples
        importances = []

        iterator = ((data_synthetic_onehot[batch_idx * batch_size:(batch_idx + 1) * batch_size],
                     model_pred[batch_idx * batch_size:(batch_idx + 1) * batch_size, label],
                     weights[batch_idx * batch_size:(batch_idx + 1) * batch_size]) for batch_idx
                    in range(num_estimators))

        for tup in iterator:
            # Solve
            importance = ridge_solve(tup)
            importances.append(importance)
        
        importances = np.mean(np.stack(importances), axis=0)
#         # Solve
#         solver = Ridge(alpha=1, fit_intercept=True)
#         solver.fit(data_synthetic_onehot, model_pred[:, label], sample_weight=weights)

#         # Get explanations
#         importances = solver.coef_[data_synthetic_onehot[0].toarray().ravel() == 1]
        explanations = sorted(list(zip(self.feature_names, importances)),
                              key=lambda x: x[1], reverse=True)[:num_features]
        return explanations

In [10]:
X

Unnamed: 0_level_0,age,two_year_recid,priors_count,length_of_stay,c_charge_degree_F,c_charge_degree_M,sex_Female,sex_Male,race
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,69,0,0,0,1,0,0,1,0
3,34,1,0,10,1,0,0,1,1
4,24,1,4,1,1,0,0,1,1
7,44,0,0,1,0,1,0,1,0
8,41,1,14,6,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
10996,23,0,0,1,1,0,0,1,1
10997,23,0,0,1,1,0,0,1,1
10999,57,0,0,1,1,0,0,1,0
11000,33,0,3,1,0,1,1,0,1


## Follow along with what the attack did

In [11]:
# Add a random column -- this is what we'll have LIME/SHAP explain.
X['unrelated_column'] = np.random.choice([0,1],size=X.shape[0])
features = [c for c in X]

categorical_feature_name = ['two_year_recid', 'c_charge_degree_F', 'c_charge_degree_M',\
                            'sex_Female', 'sex_Male', 'race', 'unrelated_column']

categorical_feature_indcs = [features.index(c) for c in categorical_feature_name]

race_indc = features.index('race')
unrelated_indcs = features.index('unrelated_column')

In [12]:
### Train the ctgan model

from ctgan import CTGANSynthesizer

ctgan = CTGANSynthesizer()
ctgan.fit(X, categorical_feature_name, epochs=100)

Epoch 1, Loss G: 0.6620, Loss D: -0.0594
Epoch 2, Loss G: 0.5143, Loss D: -0.1774
Epoch 3, Loss G: 0.4983, Loss D: -0.4373
Epoch 4, Loss G: 0.5133, Loss D: -0.4857
Epoch 5, Loss G: 0.7596, Loss D: -0.8454
Epoch 6, Loss G: 0.6868, Loss D: -0.5359
Epoch 7, Loss G: 0.5844, Loss D: -0.1019
Epoch 8, Loss G: 0.7323, Loss D: 0.0850
Epoch 9, Loss G: 1.1881, Loss D: 0.1828
Epoch 10, Loss G: 1.9654, Loss D: -0.6315
Epoch 11, Loss G: 2.3582, Loss D: -1.7862
Epoch 12, Loss G: 1.8220, Loss D: -2.3012
Epoch 13, Loss G: 1.5083, Loss D: -1.7117
Epoch 14, Loss G: 1.0188, Loss D: -1.0087
Epoch 15, Loss G: 0.9358, Loss D: -0.6952
Epoch 16, Loss G: 1.2328, Loss D: -0.9130
Epoch 17, Loss G: 1.3974, Loss D: -0.6183
Epoch 18, Loss G: 1.4387, Loss D: -0.6377
Epoch 19, Loss G: 1.6189, Loss D: -0.4488
Epoch 20, Loss G: 1.6137, Loss D: -0.8429
Epoch 21, Loss G: 1.6490, Loss D: -1.1369
Epoch 22, Loss G: 1.3284, Loss D: -1.1116
Epoch 23, Loss G: 0.8969, Loss D: -0.8481
Epoch 24, Loss G: 0.3866, Loss D: -0.8487
Epo

In [13]:
ctgan.sample(1000)

Unnamed: 0,age,two_year_recid,priors_count,length_of_stay,c_charge_degree_F,c_charge_degree_M,sex_Female,sex_Male,race,unrelated_column
0,19,1,1,-11,0,1,0,1,0,1
1,40,1,0,0,1,1,0,0,1,1
2,18,1,2,2,1,0,1,1,0,0
3,19,0,2,3,1,1,0,1,1,0
4,49,0,0,0,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
995,40,1,8,5,1,0,0,1,0,1
996,60,0,15,0,1,1,0,0,0,1
997,39,1,10,-3,1,0,0,1,0,1
998,25,1,0,4,1,1,0,1,0,1


In [14]:
class racist_model_f:
    # Decision rule: classify negatively if race is black
    def predict(self,X):
        return np.array([0 if x[race_indc] > 0 else 1 for x in X])

    def predict_proba(self, X): 
        return one_hot_encode(self.predict(X))

    def score(self, X,y):
        return np.sum(self.predict(X)==y) / len(X)
    
class innocuous_model_psi:
    # Decision rule: classify according to randomly drawn column 'unrelated column'
    def predict(self,X):
        return np.array([0 if x[unrelated_indcs] > 0 else 1 for x in X])

    def predict_proba(self, X): 
        return one_hot_encode(self.predict(X))

    def score(self, X,y):
        return np.sum(self.predict(X)==y) / len(X)

In [15]:
if type(X) is pd.DataFrame:
    X = X.values

# Split the data and normalize
xtrain,xtest,ytrain,ytest = train_test_split(X,y)
xtest_not_normalized = deepcopy(xtest)
# ss = StandardScaler().fit(xtrain)
# xtrain = ss.transform(xtrain)
# xtest = ss.transform(xtest)

# Train the adversarial model for LIME with f and psi 
adv_lime = Adversarial_Lime_Model(racist_model_f(), innocuous_model_psi()).\
            train(xtrain, ytrain, feature_names=features, categorical_features=categorical_feature_indcs)

In [16]:
# Let's just look at a the first example in the test set
ex_indc = np.random.choice(xtest.shape[0])


# Now, lets look at the explanations on the adversarial model 
adv_explainer = lime.lime_tabular.LimeTabularExplainer(xtrain,feature_names=adv_lime.get_column_names(), 
                                                       discretize_continuous=False,
                                                       categorical_features=categorical_feature_indcs)

adv_exp = adv_explainer.explain_instance(xtest[ex_indc], adv_lime.predict_proba, labels=(0,1)).as_list(1)

print ("Explanation on adversarial model:\n",adv_exp[:3],"\n")

Explanation on adversarial model:
 [('unrelated_column=0', 0.9982571500602809), ('sex_Male=1', 0.0007581075797357132), ('length_of_stay', -0.0003172374916087674)] 



In [93]:
numpy_explainer = NumpyTabularExplainer(
    training_data=xtrain,
    ctgan_sampler=ctgan,
    feature_names=features,
    categorical_feature_idxes=categorical_feature_indcs
)

In [94]:
# %timeit numpy_explainer.explain_instance(xtest[ex_indc], adv_lime.predict_proba, label=1, num_samples=1000, num_features=3)

In [95]:
numpy_explainer.explain_instance(xtest[ex_indc], adv_lime.predict_proba, label=1, num_samples=5000, num_features=3)

In [98]:
list_exp = []
for idx in tqdm(range(xtest.shape[0]), total=xtest.shape[0]):
    exp = numpy_explainer.explain_instance(xtest[idx], adv_lime.predict_proba, label=1, num_samples=5000, num_features=3)
    top = exp[0][0]
    list_exp.append(top)

100%|██████████| 1543/1543 [36:36<00:00,  1.42s/it]


In [26]:
print(len([a for a in list_exp if a == 'race']) / len(list_exp))


0.4387556707712249
