# Tabular data: post-hoc explanations with Adult credit dataset
### Author: Francesca Naretto

In [None]:
!pip install bitarray deap lime
!git clone https://github.com/rinziv/XAI_lib_HAI-net_Tutorial.git

In [None]:
cd XAI_lib_HAI-net_Tutorial

In [16]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from xailib.data_loaders.dataframe_loader import prepare_dataframe

from xailib.explainers.lime_explainer import LimeXAITabularExplainer
from xailib.explainers.lore_explainer import LoreTabularExplainer
from xailib.explainers.shap_explainer_tab import ShapXAITabularExplainer

from xailib.models.sklearn_classifier_wrapper import sklearn_classifier_wrapper

# Learning and explaining Adult Dataset

## Loading and preparation of data

In this notebook we are going to use the Adult dataset for the training of a machine learning model to explain. 

The Adult dataset classifies people as high (>50k) or low (=<50k) income.

We start by reading from a CSV file the dataset to analyze. The table is loaded by means of the ```DataFrame``` class from the ```pandas``` library.

Among all the attributes of the table, we select the ```class``` column that contains the observed class for the corresponding row.

In [17]:
source_file = 'datasets/adult_clean.csv'
class_field = 'class'
# Load and transform dataset 
df = pd.read_csv(source_file, skipinitialspace=True, na_values='?', keep_default_na=True)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30162 entries, 0 to 30161
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             30162 non-null  int64
 1   workclass       30162 non-null  int64
 2   fnlwgt          30162 non-null  int64
 3   education-num   30162 non-null  int64
 4   marital-status  30162 non-null  int64
 5   occupation      30162 non-null  int64
 6   relationship    30162 non-null  int64
 7   race            30162 non-null  int64
 8   sex             30162 non-null  int64
 9   capital-gain    30162 non-null  int64
 10  capital-loss    30162 non-null  int64
 11  hours-per-week  30162 non-null  int64
 12  native-country  30162 non-null  int64
 13  class           30162 non-null  int64
dtypes: int64(14)
memory usage: 3.2 MB


After the data is loaded in memory, we need to extract metadata information to automatically handle the content withint the table.

The method ```prepare_dataframe``` scans the table and extract the following information:
 * ```df```: is a trasformed version of the original dataframe, where discrete attributes are transformed into numerical attributes by using one hot encoding strategy;
 * ```feature_names```: is a list containint the names of the features after the transformation;
 * ```class_values```: the list of all the possible values for the ```class_field``` column;
 * ```numeric_columns```: a list of the original features that contain numeric (i.e. continuous) values;
 * ```rdf```: the original dataframe, before the transformation;
 * ```real_feature_names```: the list of the features of the dataframe before the transformation;
 * ```features_map```: it is a dictionary pointing each feature to the original one before the transformation.

In [19]:
df, feature_names, class_values, numeric_columns, rdf, real_feature_names, features_map = prepare_dataframe(df, class_field)

### Learning a Random Forest classfier

We train a RF classifier by using the ```sklearn``` library. We start by splitting the dataset into a train and test subsets. 

In [20]:
test_size = 0.3
random_state = 42
X_train, X_test, Y_train, Y_test = train_test_split(df[feature_names], df[class_field],
                                                        test_size=test_size,
                                                        random_state=random_state,
                                                        stratify=df[class_field])




In [21]:
bb = RandomForestClassifier(n_estimators=20, random_state=random_state)
bb.fit(X_train.values, Y_train.values)
bbox = sklearn_classifier_wrapper(bb)   

In [22]:
Y_pred = bb.predict(X_train)
print(classification_report(Y_train, Y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     15857
           1       1.00      0.98      0.99      5256

    accuracy                           1.00     21113
   macro avg       1.00      0.99      0.99     21113
weighted avg       1.00      1.00      1.00     21113



In [23]:
Y_pred = bb.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      6797
           1       0.75      0.61      0.67      2252

    accuracy                           0.85      9049
   macro avg       0.81      0.77      0.79      9049
weighted avg       0.85      0.85      0.85      9049



In [24]:
inst = X_train.iloc[147].values
print('Instance ',inst)
print('True class ',Y_train.iloc[8])
print('Predicted class ',bb.predict(inst.reshape(1, -1)))

Instance  [    58      2 111169      9      2     14      5      1      1      0
      0     40     41]
True class  1
Predicted class  [0]


In [25]:
explainer = ShapXAITabularExplainer(bbox, feature_names)
config = {'explainer' : 'kernel', 'X_train' : X_train.iloc[0:100].values}
explainer.fit(config)

In [26]:
exp = explainer.explain(inst)
# print(exp.exp)

In [27]:
exp.plot_features_importance()

In [28]:
explainer = LoreTabularExplainer(bbox)
config = {'neigh_type':'rndgen', 'size':1000, 'ocr':0.1, 'ngen':10}
explainer.fit(df, class_field, config)
exp = explainer.explain(inst)
print(exp)

<xailib.explainers.lore_explainer.LoreTabularExplanation object at 0x7f95f0531b50>


In [29]:
exp.plotRules()

In [30]:
exp.plotCounterfactualRules()

In [32]:
limeExplainer = LimeXAITabularExplainer(bbox)
config = {'feature_selection': 'forward_selection'}
limeExplainer.fit(df, class_field, config)
lime_exp = limeExplainer.explain(inst)
print(lime_exp.exp.as_list())

[('capital-gain', 0.20202781323631933), ('marital-status', -0.05393839333174429), ('relationship', -0.04623399356174276), ('education-num', 0.04331929786281839), ('age', 0.04189865018147442), ('hours-per-week', 0.03327784956240396), ('capital-loss', -0.014159256232603161), ('workclass', 0.013766392999885604), ('occupation', -0.013656069666905439), ('race', 0.00843993849390575)]


In [33]:
lime_exp.plot_features_importance()