# Tabular Explainer in pure Numpy and Scikit-learn

We know now that Numpy explainer is much faster than LIME for numerical data. We also found that JAX is not that fast especially when mixed with original numpy. What about tabular datasets?

## Steps:

* Get training data statistics
    * For numerical -> bins
    * For categorical -> distributions 
* Create synthetic neighborhood
    * For numerical -> add unit Gaussian noise to scaled data
    * For categorical -> sample from distribution
    * Strategy for more efficient synthetic data generation: Split the data into categorical and numerical, apply sampling separately, the concatenate again
* Get model predictions
* Solve
* Explain

In [2]:
import scipy
import numpy as np
import pandas as pd

import sklearn
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import classification_report

from lime.lime_tabular import LimeTabularExplainer

# Prepare the dataset and train a model

In [3]:
df = pd.read_csv('../data/german_credit_data.csv')
print(df.shape)
df = df.fillna('None')
target_col = 'Risk'
df[df[target_col] == 'good'][target_col] = 1
df[df[target_col] == 'bad'][target_col] = 0

print(df[target_col].value_counts())

numerical_features = ['Age', 'Credit amount', 'Duration']
categorical_features = ['Sex', 'Job', 'Housing', 'Saving accounts', 
                        'Checking account', 'Purpose']
feature_names = list(df.columns)[:-1]
X, y = df[df.columns[:-1]], df[target_col]

dict_le = {}
for cat_col in categorical_features:
    le = LabelEncoder()
    X[cat_col] = le.fit_transform(X[cat_col])
    dict_le[cat_col] = le

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
clf = RandomForestClassifier(n_estimators=100, max_depth=5)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print(classification_report(clf.predict(X_test), y_test))

(1000, 10)
good    700
bad     300
Name: Risk, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.76
              precision    recall  f1-score   support

         bad       0.29      1.00      0.45        10
        good       1.00      0.73      0.85        90

    accuracy                           0.76       100
   macro avg       0.65      0.87      0.65       100
weighted avg       0.93      0.76      0.81       100



In [115]:
feature_names

['Age',
 'Sex',
 'Job',
 'Housing',
 'Saving accounts',
 'Checking account',
 'Credit amount',
 'Duration',
 'Purpose']

In [4]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,skilled,own,,little,1169,6,radio/TV,good
1,22,female,skilled,own,little,moderate,5951,48,radio/TV,bad
2,49,male,unskilled_and_resident,own,little,,2096,12,education,good
3,45,male,skilled,free,little,little,7882,42,furniture/equipment,good
4,53,male,skilled,free,little,little,4870,24,car,bad


In [5]:
X_train.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
459,32,1,1,1,1,0,4594,18,5
795,22,0,1,2,2,0,2301,9,4
727,25,0,1,2,1,1,1882,18,5
542,31,1,1,1,0,1,6350,30,4
970,22,1,1,1,2,2,1514,15,6


In [48]:
X_test.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
5,35,1,3,0,0,0,9055,36,3
447,35,1,1,1,1,2,2576,7,5
543,34,1,3,1,1,3,2864,18,4
46,39,1,1,1,3,0,2299,36,5
718,31,1,1,1,0,3,3148,24,5


In [49]:
y_test.head()

5      good
447    good
543     bad
46     good
718    good
Name: Risk, dtype: object

## Process categorical data

In [6]:
X_train_cat = X_train[categorical_features]
dict_feature_to_hist = {feature: X_train_cat[feature].value_counts().sort_index().values / X_train_cat.shape[0] 
                        for feature in categorical_features}
dict_feature_to_hist

{'Sex': array([0.31222222, 0.68777778]),
 'Job': array([0.15222222, 0.62222222, 0.02444444, 0.20111111]),
 'Housing': array([0.10666667, 0.71333333, 0.18      ]),
 'Saving accounts': array([0.18666667, 0.59888889, 0.10555556, 0.06444444, 0.04444444]),
 'Checking account': array([0.38777778, 0.27444444, 0.27666667, 0.06111111]),
 'Purpose': array([0.09777778, 0.33777778, 0.01333333, 0.05777778, 0.18      ,
        0.27888889, 0.02111111, 0.01333333])}

In [7]:
list_buf = []
for feature in categorical_features:
    list_buf.append(np.random.choice(a=len(dict_feature_to_hist[feature]), size=(1, 1000), p=dict_feature_to_hist[feature]))

X_synthetic_orig_cat = X_synthetic_disc_cat = np.concatenate(list_buf).T
X_synthetic_orig_cat.shape

(1000, 6)

In [19]:
def discretize(X, qs=[25, 50, 75], all_bins=None):
    if all_bins is None:
        all_bins = np.percentile(X, qs, axis=0).T
    return (np.array([np.digitize(a, bins) for (a, bins) in zip(X.T, all_bins)]).T, all_bins)

def kernel_fn(distances, kernel_width):
    return np.sqrt(np.exp(-(distances ** 2) / kernel_width ** 2))

## Explainer

In [29]:
def explain_instance(training_data, data_instance, clf, feature_names, numerical_features, categorical_features, dict_feature_to_hist,
                     label, qs=[25, 50, 75], num_samples=5000, num_features=10):
    data_instance = data_instance.reshape((1, -1))
    
    # book-keeping of indices
    dict_feature_to_idx = {feature: idx for (idx, feature) in enumerate(feature_names)}
    # For indexing
    numerical_feature_idxes = [dict_feature_to_idx[feature] for feature in numerical_features]
    cat_feature_idxes = [dict_feature_to_idx[feature] for feature in categorical_features]
    # For re-ordering
    list_reorder = [dict_feature_to_idx[feature] for feature in numerical_features + categorical_features]
    
    # Split the data into numerical and categorical data
    training_data_num = training_data[:,numerical_feature_idxes]
    training_data_cat = training_data[:,cat_feature_idxes]
    data_num = data_instance[:,numerical_feature_idxes]
    data_cat = data_instance[:,cat_feature_idxes]
    
    ## Process numerical data
    all_bins_num = np.percentile(training_data_num, qs, axis=0).T
    sc = StandardScaler(with_mean=False)
    sc.fit(training_data_num)
    data_scaled = sc.transform(data_num)
    X_synthetic = np.tile(data_scaled, (num_samples, 1))
    X_synthetic = X_synthetic + np.random.normal(size=(num_samples, data_num.shape[1]))
    X_synthetic[0] = data_scaled.ravel()
    X_synthetic_orig_num = sc.inverse_transform(X_synthetic)
    X_synthetic_disc_num, all_bins_num = discretize(X_synthetic_orig_num, qs, all_bins_num)    
    
    ## Process categorical data
    list_buf = []
    for feature in categorical_features:
        list_buf.append(np.random.choice(a=len(dict_feature_to_hist[feature]), size=(1, num_samples), 
                                         p=dict_feature_to_hist[feature]))

    X_synthetic_orig_cat = X_synthetic_disc_cat = np.concatenate(list_buf).T   
    
    ###########################################
    # Concatenate the data
    X_synthetic_orig = np.concatenate([X_synthetic_orig_num, X_synthetic_orig_cat], axis=1)
    X_synthetic_orig = X_synthetic_orig[:,list_reorder]
    X_synthetic_disc = np.concatenate([X_synthetic_disc_num, X_synthetic_disc_cat], axis=1)
    X_synthetic_disc = X_synthetic_disc[:,list_reorder]
    
    # Get model predictions (i.e. groundtruth)
    model_pred = clf.predict_proba(X_synthetic_orig)

    # Get distances
    distances = scipy.spatial.distance.cdist(X_synthetic[:1], X_synthetic)
    distances = distances.reshape(-1, 1)
    weights = kernel_fn(distances, kernel_width=training_data.shape[1]).ravel()

    # Solve
    oe = OneHotEncoder()
    X_synthetic_onehot = oe.fit_transform(X_synthetic_disc)    
    solver = Ridge(alpha=1, fit_intercept=True)
    solver.fit(X_synthetic_onehot, model_pred[:,label], sample_weight=weights)
    
    # Explain
    importances = solver.coef_[X_synthetic_onehot[0].toarray().ravel() == 1]
    explanations = sorted(list(zip(feature_names, importances)), 
                          key=lambda x: x[1], reverse=True)[:num_features]
    return explanations

In [78]:
def get_explanations():
    return explain_instance(
        training_data=X_train.to_numpy(),
        data_instance=X_test.to_numpy()[0],
        clf=clf,
        feature_names=feature_names,
        numerical_features=numerical_features,
        categorical_features=categorical_features,
        dict_feature_to_hist=dict_feature_to_hist,
        label=1,
        qs=[25, 50, 75],
        num_samples=10000,
        num_features=10
    )

In [79]:
%timeit get_explanations()

91.2 ms ± 11.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [80]:
exp = get_explanations()
exp

[('Age', 0.04984175708928291),
 ('Purpose', 0.0390298188083696),
 ('Job', 0.033878452030482016),
 ('Duration', 0.03119003368235152),
 ('Credit amount', 0.024531250348135455),
 ('Housing', 0.023519455607880328),
 ('Saving accounts', 0.022430220873815382),
 ('Sex', 0.021305008518442817),
 ('Checking account', -0.015774595533411616)]

In [81]:
explainer = LimeTabularExplainer(training_data=X_train.to_numpy(), 
                                 feature_names=feature_names,
                                 categorical_features=[idx for (idx, col) in enumerate(df.columns) if col in categorical_features],
                                 categorical_names=categorical_features
                                )
explainer

<lime.lime_tabular.LimeTabularExplainer at 0x7fbf93230b90>

In [82]:
%timeit explainer.explain_instance(data_row=X_test.to_numpy()[0], predict_fn=clf.predict_proba, labels=(0,), num_samples=10000)

exp = explainer.explain_instance(
    data_row=X_test.to_numpy()[0],
    predict_fn=clf.predict_proba,
    labels=(0,1)
)

334 ms ± 38.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [67]:
sorted(exp.as_list(1), key=lambda x: x[1], reverse=True)

[('Checking account=0', 0.22626244691004274),
 ('33.00 < Age <= 42.00', 0.03779095775967931),
 ('Saving accounts=0', 0.03632257458570031),
 ('Sex=1', 0.012973461565769399),
 ('Job=3', -0.0023834451843519943),
 ('Purpose=3', -0.01444845258576066),
 ('Housing=0', -0.0328900251283455),
 ('Credit amount > 3973.75', -0.0707249293865586),
 ('Duration > 24.00', -0.10014282387099928)]

In [68]:
sorted(exp.as_list(0), key=lambda x: x[1], reverse=True)

[('Duration > 24.00', 0.10014282387099933),
 ('Credit amount > 3973.75', 0.07072492938655862),
 ('Housing=0', 0.032890025128345546),
 ('Purpose=3', 0.014448452585760649),
 ('Job=3', 0.002383445184352031),
 ('Sex=1', -0.012973461565769406),
 ('Saving accounts=0', -0.03632257458570024),
 ('33.00 < Age <= 42.00', -0.03779095775967935),
 ('Checking account=0', -0.2262624469100428)]

In [75]:
dict_le['Checking account'].classes_

array(['None', 'little', 'moderate', 'rich'], dtype=object)

In [76]:
dict_le['Job'].classes_

array(['highly_skilled', 'skilled', 'unskilled_and_non-resident',
       'unskilled_and_resident'], dtype=object)

In [77]:
dict_le['Saving accounts'].classes_

array(['None', 'little', 'moderate', 'quite rich', 'rich'], dtype=object)

In [111]:
arr = np.random.randint(low=0, high=4, size=(4,5))
arr

array([[1, 3, 2, 2, 3],
       [0, 2, 2, 2, 1],
       [1, 1, 1, 3, 0],
       [0, 0, 0, 1, 3]])

In [112]:
dict_feature = {}
for feature in range(arr.shape[1]):
    dict_feature[feature] = np.bincount(arr[:,feature]) / arr.shape[0]
dict_feature

{0: array([0.5, 0.5]),
 1: array([0.25, 0.25, 0.25, 0.25]),
 2: array([0.25, 0.25, 0.5 ]),
 3: array([0.  , 0.25, 0.5 , 0.25]),
 4: array([0.25, 0.25, 0.  , 0.5 ])}

In [114]:
np.concatenate([arr], axis=-1)

array([[1, 3, 2, 2, 3],
       [0, 2, 2, 2, 1],
       [1, 1, 1, 3, 0],
       [0, 0, 0, 1, 3]])