In [1]:
import warnings
warnings.filterwarnings('ignore')

import shap
from acv_explainers import ACVTree
from acv_explainers.utils import *
from experiments.exp_linear_gmm import *

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import seaborn as sns

import random 
import numpy as np
import pandas as pd

# Seed both the stdlib and the NumPy global RNGs; the observation explained
# further down is drawn with np.random, so this fixes which row gets picked.
random.seed(212)
np.random.seed(212)

In [2]:
import random 
# from sklearn.tree import DecisionTreeClassifier
# Re-applies the same seed (212) as the first cell to both global RNGs.
random.seed(212)
np.random.seed(212)

### Load data and model

In [4]:
# NOTE(review): absolute, machine-specific path -- point it at your local copy
# of lucas0_train.csv before running.
X = pd.read_csv('/home/samoukou/Documents/ACV/data/lucas0_train.csv')

# Separate the target column from the features without mutating the frame in place.
y = X.Lung_cancer.values
X = X.drop(columns=['Lung_cancer'])

n_estimators = 1
num_features = X.shape[1]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# Model
tree = DecisionTreeClassifier(min_samples_leaf=20, random_state=212)
tree.fit(X_train, y_train)

# Fit a one-tree forest only to obtain a fitted RandomForestClassifier, then
# swap its single estimator for the decision tree trained above.
forest = RandomForestClassifier(n_estimators=n_estimators, min_samples_leaf=10, random_state=212)
forest.fit(X_train, y_train)
forest.estimators_[0] = tree

# Score both models the same way. The values are ROC AUC scores even though the
# variables and the printed label say "acc"; after the loop accuracy_train /
# accuracy_test hold the forest's scores.
for clf_name, clf in (('TREE', tree), ('FOREST', forest)):
    accuracy_train = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
    accuracy_test = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('[* {} *]'.format(clf_name))
    print('train acc = {} --- test acc = {}'.format(accuracy_train, accuracy_test))

[* TREE *]
train acc = 0.9230919439227646 --- test acc = 0.9127666666666667
[* FOREST *]
train acc = 0.9230919439227646 --- test acc = 0.9127666666666667


## Build ACVTree for explanations

In [5]:
# Build the explainer from the fitted forest and the training features
# (passed as a plain NumPy array via .values).
acvtree = ACVTree(forest, X_train.values)

### Let's choose an observation to explain

In [6]:
# Draw the position of a random test row to explain. The upper bound follows
# the actual size of the test split instead of a hard-coded 400, so the cell
# keeps working (and can reach every row) if the data or split ratio changes.
idx = np.random.randint(0, len(X_test))
# idx = 174
ind = X_test.values[idx]
y_ind = y_test[idx]
# predict expects a 2-D input, hence the added leading axis; [0] unwraps the
# single prediction.
fx = forest.predict(np.expand_dims(ind, 0))[0]

In [7]:
# idx = np.random.randint(0, 400)
# ind = X_test.values[idx]
# y_ind = y_test[idx]
# fx = forest.predict(np.expand_dims(ind, 0))[0]

In [8]:
# One-row frame for the chosen observation (positional slice keeps the column headers).
X_test.iloc[idx:idx + 1]

Unnamed: 0,Smoking,Yellow_Fingers,Anxiety,Peer_Pressure,Genetics,Attention_Disorder,Born_an_Even_Day,Car_Accident,Fatigue,Allergy,Coughing
1945,0,0,1,0,0,1,0,1,1,1,1


In [9]:
# Compare the forest's predicted class for the chosen row with its true label.
print('f(x) = {}, y_ind = {}'.format(fx, y_ind))

f(x) = 0, y_ind = 0


In [10]:
from scipy.special import comb

def powerset(iterable):
    """
    Iterate over every subset of ``iterable``.

    Subsets are yielded as tuples, ordered by increasing size (the empty tuple
    first, the full set last); within a size they follow the order produced by
    ``itertools.combinations``.

    Args:
        iterable: any iterable; it is materialised into a list once

    Returns:
        iterator of tuples: the 2**n subsets of the n input elements
    """
    # Imported here because the notebook never imports itertools explicitly;
    # without this the function only works if a wildcard import happens to
    # expose the name.
    import itertools

    items = list(iterable)
    return itertools.chain.from_iterable(
        itertools.combinations(items, size) for size in range(len(items) + 1)
    )

from scipy.special import comb

def vote_exp_clf(x, fx, tx, pi, forest, S, data, algo="plugin"):
    """
    Turn the value computed by ``cond_sdp_forest_clf`` for subset S into a 0/1 vote.

    Args:
        x: observation, forwarded to ``cond_sdp_forest_clf``
        fx: forwarded to ``cond_sdp_forest_clf``
        tx: forwarded to ``cond_sdp_forest_clf``
        pi: level the computed value must reach for the vote to be 1
        forest: model, forwarded to ``cond_sdp_forest_clf``
        S: subset of variable indices; an empty subset always votes 0
        data: forwarded to ``cond_sdp_forest_clf``
        algo (string): forwarded to ``cond_sdp_forest_clf``

    Returns:
        int: 1 if the computed value is >= pi, otherwise 0
    """
    # An empty subset short-circuits: no estimate is computed at all.
    if not len(S):
        return 0

    reaches_level = cond_sdp_forest_clf(x, fx, tx, forest, S, data=data, algo=algo) >= pi
    return int(reaches_level)


def tree_shap_vote_clf(tree, x,  tx, fx, pi, algo, data=None, C=[[]]):
    """
    Compute Shapley-weighted attributions of x where the value of a subset S
    is the 0/1 vote returned by ``vote_exp_clf``.

    A "player" is either a single feature index or one of the coalitions in C.
    For each player i, every subset S of the remaining players contributes
    ``(vote(S + i) - vote(S)) / comb(m - 1, len(S))``; the accumulated sums are
    divided by m (the number of players) before being returned.

    The inner loop enumerates the full powerset of the remaining players, so
    the number of ``vote_exp_clf`` evaluations grows exponentially with m.

    Args:
        tree (DecisionTreeClassifier): model, passed to ``vote_exp_clf`` as its
            ``forest`` argument
        x (array): observation; ``len(x)`` gives the number of features
        tx: forwarded unchanged to ``vote_exp_clf``
        fx: forwarded unchanged to ``vote_exp_clf``
        pi: forwarded to ``vote_exp_clf``, which uses it as the level its
            estimate must reach for a vote of 1
        algo (string): name of the estimators, recommended 'plugin'
        data (array): data used to compute the Shapley values
        C (list[list]): list of the different coalition of variables by their
            index. Must be non-empty because ``C[0]`` is read; the default
            ``[[]]`` means "no coalition". The default list is never mutated
            here.

    Returns:
        array: attributions of x, one entry per feature (length ``len(x)``)
    """
    # m counts players; it starts as the number of features and is adjusted
    # below when coalitions are supplied.
    m = len(x)
    va_id = list(range(m))
    # Untouched copy of all feature indices, used to rebuild "the others" per player.
    va_id_buffer = va_id.copy()

    if C[0] != []:
        # Remove every coalition member from the list of individual players...
        for c in C:
            m -= len(c)
            va_id = list(set(va_id) - set(c))
        # ...then count each coalition as one player and append it as a single
        # (list-valued) entry.
        m += len(C)
        for c in C:
            va_id += [c]

    phi = np.zeros(len(x))
    
    # NOTE(review): convert_list and chain_l are not defined in this notebook;
    # presumably convert_list wraps a scalar index in a list and chain_l
    # flattens nested index lists -- confirm in the module they come from.
    for i in va_id:
        if C[0] != []:
            # Start from all features except player i's own indices.
            Sm = list(set(va_id_buffer) - set(convert_list(i)))
            # Strip the members of every other coalition...
            for c in C:
                if c != convert_list(i):
                    Sm = list(set(Sm) - set(c))
            # ...and add those coalitions back as single list-valued players.
            for c in C:
                if c != convert_list(i):
                    Sm += [c]
        else:
            Sm = list(set(va_id_buffer) - set(convert_list(i)))

        # Accumulate the weighted marginal vote of player i over every subset
        # S of the other players (with vs. without i's indices).
        for S in tqdm(powerset(Sm)):
            phi[chain_l(i)] +=\
                comb(m-1, len(S))**(-1)*(vote_exp_clf(x=x, fx=fx, tx=tx, pi=pi, forest=tree, S=chain_l(S)+convert_list(i), data=data, algo=algo) - \
                vote_exp_clf(x=x, fx=fx, tx=tx, pi=pi, forest=tree, S=chain_l(S), data=data, algo=algo))
    # Final 1/m factor of the Shapley weighting.
    return phi/m
            
#     for p in tqdm(itertools.permutations(va_id)):
#         for i in range(m):
#             phi[chain_l(p[i])] += \
#                 vote_exp(x=x, fx=fx, tx=tx, pi=pi, forest=tree, S=chain_l(p[:i+1]), data=data, algo=algo) - \
#                 vote_exp(x=x, fx=fx, tx=tx, pi=pi, forest=tree, S=chain_l(p[:i]), data=data, algo=algo)
#     return phi / math.factorial(m)

In [11]:
# Empty list; nothing is appended to it in this cell.
pis = []
# shap_values_swing_clf returns three values; only the first one (the array
# displayed below) is kept. C=[[]] passes no variable coalition.
sv, _, _ = acvtree.shap_values_swing_clf(x=ind, fx=fx, tx=0.5, threshold=0.95, data=X_train.values, C=[[]])
sv

array([[0.33333333, 0.33333333],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.33333333, 0.33333333],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.33333333, 0.33333333],
       [0.        , 0.        ]])