In [1]:

from sklearn.model_selection import train_test_split
from sklearn import datasets, svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import numpy as np
from math import sqrt
from shapleycomposition import ShapleyExplainer, fig_3D_ilr_space
from composition_stats import ilr, sbp_basis
from bifurc_tree import create_tree_from_sbp, init_graph, build_graph
import plotly.graph_objects as go

In [2]:
K = 10         # Index (within the test split) of the instance to explain
N_class = 4    # Number of classes kept; the probability simplex therefore has dimension N_class-1
N_feat  = 6    # Number of PCA components kept (the digits dataset has 64 raw features, which is quite large)

# Load the digits dataset and keep only the samples whose label is below N_class.
X, Y = datasets.load_digits(return_X_y=True)
subset_i = np.where(Y < N_class)
X = X[subset_i]
Y = Y[subset_i]

# Standardise the features, then split into a training and a testing set.
# NOTE(review): scaling is computed on the full subset *before* the split, so
# test-set statistics influence the training features. Kept as-is to preserve
# the published numbers; consider fitting the scaler on X_train only.
X = scale(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Reduce the number of features to N_feat with a PCA fitted on the training set only.
pca = PCA(n_components=N_feat)
X_train = pca.fit_transform(X_train)
X_test  = pca.transform(X_test)

# Train an SVM classifier with probability outputs.
# random_state pins the internal data shuffling used for the probability
# estimates, so predict_proba (and the explanations built on it) is
# reproducible across runs.
# NOTE: despite its name, `svc_linear` uses an RBF kernel; the name is kept
# because later cells reference it.
svc_linear = svm.SVC(kernel='rbf', probability=True, random_state=0)
svc_linear.fit(X_train, Y_train)

SVC(probability=True)

In [3]:

# Explain the model prediction for the K-th instance of the test set.
explainer = ShapleyExplainer(svc_linear.predict_proba, X_train, N_class)
instance = np.array(X_test[K])
shapley, base = explainer.explain_instance(instance)

print("True label of the tested instance: ", Y_test[K], sep="")

# Model prediction for the same instance; the sample is reshaped into a
# single-row 2-D array before being passed to predict_proba.
pred = svc_linear.predict_proba(X_test[K].reshape(1,-1))
print("Prediction on the simplex: ", pred, sep="")

# Same prediction expressed through the ILR transform.
ilr_pred = ilr(pred)
print("Prediction in the ILR space: ", ilr_pred, sep="")

# Sanity check: in the ILR space, the base distribution plus the sum of the
# Shapley compositions should be equal to the predicted probability distribution.
shapley_total = np.asarray(shapley).sum(axis=0)
sum_shap_base = shapley_total + base
print("Sum of the shapley composition and the base distribution in the ILR space: ", sum_shap_base, sep="")

True label of the tested instance: 0
Prediction on the simplex: [[9.95427008e-01 6.49890738e-04 2.31482529e-03 1.60827639e-03]]
Prediction in the ILR space: [5.18600798 1.95695955 1.69916086]
Sum of the shapley composition and the base distribution in the ILR space: [5.18600791 1.95695953 1.69916083]


In [4]:
# Summarize the explanation with norms, cosines and inner products.
# The call returns three values, unpacked below; the names suggest the norm of
# each Shapley composition, the cosines between Shapley compositions and class
# vectors, and the cosines between pairs of Shapley compositions.
norm_shapley, cos_shap_class, cos_shap_shap = explainer.summarize()


List of the features sorted by their Shapley strength (norm of their Shapley composition):
	 feature n.1: 4.4740958
	 feature n.3: 1.7095308
	 feature n.2: 0.8776221
	 feature n.4: 0.3363169
	 feature n.5: 0.1862424
	 feature n.6: 0.1521287

Cosine between the Shapley compositions and the class vectors:
		feat. n.1 	feat. n.2 	feat. n.3 	feat. n.4 	feat. n.5 	feat. n.6 	
class 1:  	0.9662640	0.4311435	0.5714654	0.7552058	0.8699537	0.9519177	
class 2:  	-0.2675725	-0.9943497	0.0286992	-0.4430406	-0.4470729	-0.6049444	
class 3:  	-0.5542691	0.2739673	0.3425054	0.3528324	0.1675225	-0.1507584	
class 4:  	-0.1444224	0.2892389	-0.9426701	-0.6649976	-0.5904032	-0.1962148	

Cosine between each Shapley compositions:
		feat. n.1 	feat. n.2 	feat. n.3 	feat. n.4 	feat. n.5 	feat. n.6 	
feat. n. 1:	1.0000000	0.3667762	0.3681074	0.5615626	0.7144832	0.8951766	
feat. n. 2:	0.3667762	1.0000000	0.0292687	0.5028456	0.5210624	0.6854127	
feat. n. 3:	0.3681074	0.0292687	1.0000000	0.8749346	0.8236880	0.4949

In [14]:
# Plot the 3D ILR space spanned by the ILR components listed in `balances`.
# Every axis covers the range [-lim, lim].
# With shapley_sum=True, the Shapley vectors are summed from the base
# distribution to the prediction.
ilr_plot_options = {"lim": 6, "figsize": 750}

# First figure: default display.
fig = explainer.plot_ilr_space(balances=[1, 2, 3], **ilr_plot_options)

# Second figure: Shapley vectors summed from the base distribution to the prediction.
fig = explainer.plot_ilr_space(balances=[1, 2, 3], shapley_sum=True, **ilr_plot_options)
