In [1]:
import warnings
warnings.filterwarnings('ignore')

import acv_explainers
from acv_explainers import ACVTree
import shap

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier, XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import random 
import time 
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import comb

random.seed(2021)
np.random.seed(2021)

from experiments.exp_syn import *

# Fixed the seed and plot env
random.seed(1)
np.random.seed(1)

plt.style.use(['ggplot'])

## Create synthetic dataset and train a RandomForest

In [2]:
p = 0.00
n = 50000
d = 8
C = [[]]

# mean 
mean = np.zeros(d)

# Determinitist covariance
cov = p*np.ones(shape=(d, d)) + 20*np.eye(d)

# Random covariance
# b = np.random.randn(d, d)
# cov = np.dot(b, b.T)

model_type = 'syn4'

for i in range(21): # the loops is for the seed, to have the same results
    coefs = 4*np.random.randn(d)
    exp = ExperimentsLinear(mean=mean, cov=cov, n=n, C=C, data_type=model_type)
    logit = exp.y_train[:, 1]
    exp.y_train = np.argmax(exp.y_train, axis=1)
    exp.y_test = np.argmax(exp.y_test, axis=1)
    model = RandomForestClassifier(n_estimators=10, max_depth=d)
    model.fit(exp.data, exp.y_train)
print('ROC on Test = {}'.format(roc_auc_score(model.predict(exp.data_test), exp.y_test)))

ROC on Test = 0.9034516469589279


## Build ACVTree

In [3]:
acvtree = ACVTree(model, exp.data)

100%|██████████| 10/10 [00:00<00:00, 41.36it/s]


In [4]:
nb = 100
X_samples = exp.data[:nb]
y_samples = exp.y_train[:nb]

## Compute the Active Sets  $S^\star$ of X_samples

In [5]:
# We compute S^\star with Monte Carlo estimator
nb = 100
X_samples = exp.data[:nb]
sdp_global, sdp_index, len_sdp, sdp = importance_sdp_clf_true(X_samples, global_proba=0.9, tree=acvtree, mean=mean, cov=cov, N_samples=100000, minimal=1)

 38%|███▊      | 3/8 [22:38<37:44, 452.94s/it]


In [6]:
## Here, we can compute them with Leaf estimator
# sdp_global_r, sdp_index_r, len_sdp_r, sdp_r = acvtree.importance_sdp_clf(X_samples, global_proba=0.9, minimal=1, data=exp.data)

## Compute SV with the different estimators and Active SV

In [7]:
sv_exact_true = tree_sv_exact_true(X=X_samples, yX=y_samples, C=[[]], tree=acvtree, mean=mean, cov=cov, N=10000)

100%|██████████| 8/8 [08:48<00:00, 66.12s/it]


In [8]:
sv_leaf = acvtree.py_shap_values_notoptimized(X_samples, exp.data)

100%|██████████| 8/8 [08:44<00:00, 65.54s/it]


In [9]:
explainer_observational = shap.TreeExplainer(model, feature_perturbation='observational')
sv_shap = explainer_observational.shap_values(X_samples)

In [10]:
sv_exact = tree_sv_exact(X=X_samples, C=[[]], tree=acvtree, mean=mean, cov=cov, N=10000)

100%|██████████| 8/8 [18:12<00:00, 136.54s/it]


In [11]:
s_star_all, n_star_all = acv_explainers.utils.get_null_coalition(sdp_index, len_sdp)
s_star_l, n_star_l = acv_explainers.utils.get_active_null_coalition_list(sdp_index, len_sdp)

sv_acv_exact = tree_sv_acv(X_samples, tree=acvtree, S_star=s_star_l, N_star=n_star_l, mean=mean, cov=cov, N=10000)

100%|██████████| 100/100 [00:17<00:00,  5.68it/s]


## Examples used in 4. Focusing on influential variables with Same Decision Probabilities

In [12]:
# a = np.random.randint(0, 100)
a = 45

In [13]:
print('x = {}'.format(X_samples[a]))

x = [-1.57996208 -4.15155403 -5.82807128 -5.90184745 -3.23170965  0.71162005
 -1.75919892 -1.27221698]


In [14]:
print('$S^\star$ = {}'.format(s_star_l[a]))

$S^\star$ = [0, 1, 4]


In [15]:
print('SDP of S^\star = {}'.format(sdp[45]))

SDP of S^\star = 0.95904


In [16]:
print('sv of the generative model = {}'.format(sv_exact_true[a, :, 1]))

sv of the generative model = [-2.07293214e-01 -2.04734762e-01 -4.08577381e-02 -4.28197619e-02
  5.47619048e-06  5.45714286e-04 -6.83095238e-04  7.99523810e-04]


In [17]:
print('sv with leaf estimator= {}'.format(100*sv_leaf[a, :, 1]))

sv with leaf estimator= [-2.91481894 -3.86466664 -3.35362075 -3.53585874  0.90373176 -0.10657908
  0.04346478 -0.24145652]


In [18]:
print('sv tree shap estimator = {}'.format(100*sv_shap[1][a]))

sv tree shap estimator = [-2.94875784 -3.81203468 -3.34194448 -3.52361971  0.86986683 -0.13874777
  0.04514599 -0.24564648]


In [19]:
print('sv exact with monte carlo estimator = {}'.format(100*sv_exact[a, :, 1]))

sv exact with monte carlo estimator = [-2.88630298 -3.85182425 -3.24036626 -3.4908592   0.88103758 -0.15895333
  0.05437585 -0.22226488]


In [20]:
print('active sv with monte carlo estimator = {}'.format(100*sv_acv_exact[a, :, 1]))

active sv with monte carlo estimator = [-7.14368164 -7.35435343  0.          0.          1.3781321   0.
  0.          0.        ]
