In [11]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Generate a reproducible synthetic binary-classification problem:
# 3000 rows and 10 features, of which 5 are informative and 2 are redundant.
X, y = make_classification(
    n_samples=3000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_classes=2,
    shuffle=True,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame; column i of X is labelled X_A ... X_J in order.
column_labels = ['X_' + letter for letter in 'ABCDEFGHIJ']
df = pd.DataFrame({label: X[:, position] for position, label in enumerate(column_labels)})

# X_df is a second name bound to the same DataFrame object (no copy is made).
X_df = df

In [12]:
# Preview the first three rows of the feature frame (rendered as the cell's output).
X_df.head(3)

Unnamed: 0,X_A,X_B,X_C,X_D,X_E,X_F,X_G,X_H,X_I,X_J
0,1.41058,-0.049361,-0.697806,-1.181848,-2.309817,-1.561517,0.096659,0.388382,2.027015,-1.954503
1,-0.133019,-1.106122,-2.035283,-1.478108,0.71302,0.939141,-2.566454,-2.824373,0.837529,-0.118245
2,-0.707181,-0.66671,-2.948409,-0.478965,-0.658751,-0.149534,-0.978965,-1.178977,0.04019,1.567323


In [13]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.20,
    random_state=42,
)

In [14]:
from sklearn.metrics import classification_report

# Train the "oracle" model: a small, class-balanced random forest.
# NOTE(review): oob_score=True asks scikit-learn to compute an out-of-bag estimate during fit;
# it is not read anywhere in the visible cells, and with only 5 trees scikit-learn may warn
# that some samples have no OOB prediction.
clf = RandomForestClassifier(
    n_estimators=5,
    class_weight="balanced",
    oob_score=True,
    random_state=42,
)
clf.fit(X_train, y_train)

# Hard-label predictions on both splits (kept under the original variable names).
y_hat_train = clf.predict(X_train)
y_hat = clf.predict(X_test)

target_names = ['class 0', 'class 1']


def print_split_report(split_name, y_true, y_pred, class_names):
    """Print a titled per-class precision/recall/F1 report for one data split.

    Parameters
    ----------
    split_name : str
        Label inserted into the banner line (e.g. "Train" or "Test").
    y_true : array-like
        Ground-truth labels for the split.
    y_pred : array-like
        Labels predicted for the same rows.
    class_names : list of str
        Display names passed to ``classification_report`` as ``target_names``.
    """
    print("\n--------{} dataset classification report----------\n".format(split_name))
    print(classification_report(y_true, y_pred, target_names=class_names))


# Same two reports as before, produced by one helper instead of copy-pasted code.
print_split_report("Train", y_train, y_hat_train, target_names)
print_split_report("Test", y_test, y_hat, target_names)


--------Train dataset classification report----------

             precision    recall  f1-score   support

    class 0       0.99      0.99      0.99      1207
    class 1       0.99      0.99      0.99      1193

avg / total       0.99      0.99      0.99      2400


--------Test dataset classification report----------

             precision    recall  f1-score   support

    class 0       0.83      0.84      0.83       292
    class 1       0.85      0.83      0.84       308

avg / total       0.84      0.84      0.84       600



In [15]:
# Capture the feature names once (reused by the interpretation cells) and
# show the distinct class labels present in the training targets.
feature_names = X_train.columns.tolist()
class_labels = np.unique(y_train)

print("Classes: {}".format(class_labels))
print("Features: {}".format(feature_names))

Classes: [0 1]
Features: ['X_A', 'X_B', 'X_C', 'X_D', 'X_E', 'X_F', 'X_G', 'X_H', 'X_I', 'X_J']


## Quick Global Evaluation

#### 1. Visualizing decision boundaries

In [39]:
%matplotlib inline
from skater.core.visualizer import decision_boundary as db

# Uncomment the function call below to generate the decision-boundary plot.
# _, _ = db.plot_decision_boundary(clf, X0=X_train.iloc[:, 3], X1=X_train.iloc[:, 9], Y=y_train, 
#                                  x0_label=feature_names[3], x1_label=feature_names[9],
#                                  mode='interactive', height=6, width=10, file_name='iplot')

#### 2. Using pruned Tree Surrogates

In [21]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
from skater.util.dataops import show_in_notebook
from skater.util.logger import _INFO, _DEBUG

target_names = ['class 0', 'class 1']

# Interpretation object built over the training features.
interpreter = Interpretation(X_train, feature_names=feature_names)

# Wrap the forest's hard-label `predict` function so skater can query it as the oracle.
# probability=False because `clf.predict` returns class labels rather than probabilities;
# unique_values lists the labels it can emit.
model_inst = InMemoryModel(
    clf.predict,
    examples=X_train,
    model_type='classifier',
    unique_values=[0, 1],
    probability=False,
    feature_names=feature_names,
    target_names=target_names,
    log_level=_INFO,
)

In [22]:
# Fit a TreeSurrogate (a decision tree) with no pruning (prune=None). An unpruned tree
# will most likely overfit the training data.
# With use_oracle=True the surrogate is trained on the labels predicted by the oracle
# (here, the previously trained RandomForest wrapped in `model_inst`).
# NOTE(review): judging by the logged scores, the value `fit` returns looks like
# (oracle score - surrogate score) -- confirm against the skater documentation.
surrogate_explainer = interpreter.tree_surrogate(oracle=model_inst, seed=5)
surrogate_explainer.fit(
    X_train,
    y_train,
    use_oracle=True,
    prune=None,
    scorer_type='f1',
)

2018-09-24 00:44:22,662 - skater.core.global_interpretation.tree_surrogate - INFO - No pruning applied ...
2018-09-24 00:44:22,687 - skater.core.global_interpretation.tree_surrogate - INFO - Done generating prediction using the surrogate, shape (2400,)
2018-09-24 00:44:22,689 - skater.core.global_interpretation.tree_surrogate - INFO - Done scoring, surrogate score 1.0; oracle score 0.987


-0.013

In [23]:
# Evaluate the surrogate tree against the true test labels.
# Note: this rebinds `y_hat`, which previously held the random forest's test predictions.
target_names = ['class 0', 'class 1']
y_hat = surrogate_explainer.predict(X_test)

report_text = classification_report(y_test, y_hat, target_names=target_names)
print("\n--------Test dataset classification report----------\n")
print(report_text)


--------Test dataset classification report----------

             precision    recall  f1-score   support

    class 0       0.83      0.82      0.82       292
    class 1       0.83      0.84      0.83       308

avg / total       0.83      0.83      0.83       600



In [24]:
%matplotlib inline
# Draw the surrogate's global decision tree using one colour per class.
# The call's return value (a pydotplus Dot graph) is shown as the cell output;
# presumably the image is also saved under `file_name` -- confirm against skater docs.
surrogate_explainer.plot_global_decisions(
    colors=['lightsteelblue', 'darkkhaki'],
    file_name='tree_experiment.png',
    show_img=False,
)

<pydotplus.graphviz.Dot at 0x7fce8ad7f978>

In [40]:
#show_in_notebook('tree_experiment.png', width=800, height=400)

In [26]:
# A custom hyper-parameter grid can be supplied when the default settings do not give
# acceptable results. Pre-pruning currently selects among these candidates via cross-validation.
params = {
    "criterion": ['gini', 'entropy'],
    "min_samples_leaf": [4, 6],
    "max_leaf_nodes": [12, 20],
    "max_depth": [10, 12],
}

# Re-fits the same `surrogate_explainer` object, this time with pre-pruning enabled.
surrogate_explainer.fit(
    X_train,
    y_train,
    use_oracle=True,
    prune='pre',
    scorer_type='f1',
    param_grid=params,
)

2018-09-24 00:45:51,432 - skater.core.global_interpretation.tree_surrogate - INFO - pre pruning applied ...
2018-09-24 00:45:51,433 - skater.core.global_interpretation.tree_surrogate - INFO - Scorer used f1-score
2018-09-24 00:45:52,514 - skater.core.global_interpretation.tree_surrogate - INFO - Done generating prediction using the surrogate, shape (2400,)
2018-09-24 00:45:52,517 - skater.core.global_interpretation.tree_surrogate - INFO - Done scoring, surrogate score 0.846; oracle score 0.987


0.141

In [27]:
# Evaluate the re-fitted surrogate against the true test labels.
# `y_hat` is rebound here and is read again by the local-evaluation cells further down.
target_names = ['class 0', 'class 1']
y_hat = surrogate_explainer.predict(X_test)

report_text = classification_report(y_test, y_hat, target_names=target_names)
print("\n--------Test dataset classification report----------\n")
print(report_text)


--------Test dataset classification report----------

             precision    recall  f1-score   support

    class 0       0.77      0.89      0.83       292
    class 1       0.88      0.74      0.81       308

avg / total       0.83      0.82      0.82       600



In [30]:
# Inspect the estimator the surrogate used for final scoring; its repr (displayed as the
# cell output) lists the hyper-parameters in effect.
surrogate_explainer.estimator_

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
            max_depth=10, max_features=None, max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=5,
            splitter='best')

In [31]:
# Draw the pruned surrogate's global decision tree using one colour per class.
# The call's return value (a pydotplus Dot graph) is shown as the cell output;
# presumably the image is also saved under `file_name` -- confirm against skater docs.
surrogate_explainer.plot_global_decisions(
    colors=['lightsteelblue', 'darkkhaki'],
    file_name='tree_pruned.png',
    show_img=False,
)

<pydotplus.graphviz.Dot at 0x7fce6b987748>

In [41]:
#show_in_notebook('tree_pruned.png', width=600, height=600)

In [42]:
# Print the surrogate tree's global decision rules as nested if/else text
# (one branch per feature threshold, ending in a predicted label).
surrogate_explainer.decisions_as_txt()

if [0;32;49mX_I <= -1.6024528741836548[0;30;49m {
   [1;34;49m Predicted Label: 1
[0;30;49m} else {
  if [0;32;49mX_J <= 1.2718250751495361[0;30;49m {
    if [0;32;49mX_D <= -0.6693952679634094[0;30;49m {
      if [0;32;49mX_C <= 1.4347039461135864[0;30;49m {
        if [0;32;49mX_I <= -0.0651244968175888[0;30;49m {
           [1;34;49m Predicted Label: 1
        [0;30;49m} else {
          if [0;32;49mX_C <= 0.4285590648651123[0;30;49m {
             [1;34;49m Predicted Label: 0
          [0;30;49m} else {
             [1;34;49m Predicted Label: 0
          [0;30;49m}
        [0;30;49m}
      [0;30;49m} else {
         [1;34;49m Predicted Label: 1
      [0;30;49m}
    [0;30;49m} else {
      if [0;32;49mX_C <= -0.25395074486732483[0;30;49m {
        if [0;32;49mX_D <= 0.4669700264930725[0;30;49m {
           [1;34;49m Predicted Label: 0
        [0;30;49m} else {
          if [0;32;49mX_E <= 0.15138494968414307[0;30;49m {
             [1;34;49m Predict

## Quick Local Evaluation

In [43]:
# Find the test-set positions where the surrogate's prediction differs from the true label.
# The predictions are recomputed here rather than read from the global `y_hat`, which is
# rebound by several earlier cells and therefore depends on the order the cells were run in.
surrogate_test_pred = surrogate_explainer.predict(X_test)

# np.where on a boolean mask returns a 1-tuple containing the array of matching positions.
in_correct_prediction = np.where(surrogate_test_pred != y_test)

In [44]:
# Display the np.where result: a 1-tuple holding the array of test-set positions
# where the prediction and the true label disagree.
in_correct_prediction

(array([  3,   7,  21,  31,  32,  36,  43,  63,  65,  70,  77,  82,  83,
         98,  99, 102, 105, 106, 113, 119, 128, 129, 130, 131, 139, 140,
        145, 161, 164, 171, 173, 184, 192, 194, 195, 196, 198, 199, 205,
        211, 212, 214, 217, 219, 240, 251, 253, 258, 263, 277, 279, 282,
        284, 289, 294, 299, 306, 307, 312, 314, 323, 339, 342, 349, 354,
        360, 361, 369, 371, 377, 381, 387, 391, 393, 414, 416, 424, 428,
        438, 439, 444, 452, 463, 465, 466, 471, 476, 478, 488, 492, 493,
        495, 498, 500, 505, 517, 521, 530, 534, 536, 539, 542, 549, 550,
        553, 565, 576, 585, 593, 595]),)

In [45]:
# Pick one misclassified test row to inspect; 306 is one of the positions listed in
# `in_correct_prediction`. It is a position within the test split, hence `.iloc`.
sample_index = 306
sample_row = X_test.iloc[sample_index]
sample_target = y_test[sample_index]

print("Data row: {}".format(sample_row))
print("-----------------------------------------------")
print("Target Label: {}".format(sample_target))

Data row: X_A    1.998863
X_B    0.611233
X_C   -3.723808
X_D    0.251678
X_E    2.958946
X_F    3.354935
X_G   -4.038143
X_H    0.865037
X_I   -1.696806
X_J    2.361284
Name: 1437, dtype: float64
-----------------------------------------------
Target Label: 0


In [46]:
# Compare the ground truth, the oracle (RandomForest) and the tree surrogate on one test row.
# Indexing with a list (`iloc[[i]]`) yields a one-row DataFrame, so both models receive the
# same column names they were fitted with instead of a bare, name-less ndarray.
sample_frame = X_test.iloc[[sample_index]]

# The colon now sits before the value in every label (it was previously printed after it).
print("Ground Truth : {}".format(y_test[sample_index]))
print("Oracle Model: {}".format(clf.predict(sample_frame)))
print("Using TreeSurrogate: {}".format(surrogate_explainer.predict(sample_frame)))

Ground Truth : 0
Oracle Model [0]:
Using TreeSurrogate [1]:


In [47]:
# Explain a single prediction: with scope="local" and one test row passed as X, the output
# lists the feature comparisons evaluated for that row and the resulting predicted label.
surrogate_explainer.decisions_as_txt(scope="local", X=X_test.iloc[sample_index])

As [0;32;49mX_I[-1.6968063342595445] <= -1.6024528741836548[0;30;49m then,
   [1;34;49m Predicted Label: 1
