# Visual Auditor Package Demo (German Credit Dataset)
Author: David Munechika (david.munechika@gatech.edu)

In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from visual_auditor import SliceFinder

# Import visual auditor module
import visual_auditor

In [2]:
# Load German Credit dataset.
# Column names are supplied explicitly via `names=`; the literal string "NA"
# is parsed as a missing value.
german_credit_data = pd.read_csv(
    "data/german_credit_data.csv",
    names=[
        "Age", "Sex", "Job", "Housing", "Savings Account", "Checking Account",
        "Credit Amount", "Duration", "Purpose", "Risk"],
    engine='python',
    na_values="NA")

# Drop NA values
german_credit_data = german_credit_data.dropna()


def bin_numerical_feature(values):
    """Discretise a numerical column into integer-labelled bins.

    Bin edges come from ``np.histogram_bin_edges(values, bins='auto')``; the
    resulting intervals are right-closed, with the lowest edge included, and
    are labelled ``0 .. n_bins - 1``.

    Parameters
    ----------
    values : pandas.Series
        Numerical column to discretise.

    Returns
    -------
    tuple
        ``(binned, edges)`` where ``binned`` is the output of ``pd.cut`` and
        ``edges`` is the array of bin edges that was used.
    """
    edges = np.histogram_bin_edges(values, bins='auto')
    binned = pd.cut(
        values,
        edges,
        labels=list(range(len(edges) - 1)),
        right=True,
        include_lowest=True)
    return binned, edges


# Bin numerical features (an earlier revision used three equal-width bins per
# column; the bin count is now chosen automatically by NumPy).
# The *_bins arrays are kept as notebook-level names so the edges can be inspected.
german_credit_data["Age"], age_bins = bin_numerical_feature(german_credit_data["Age"])
german_credit_data["Credit Amount"], credit_amount_bins = bin_numerical_feature(german_credit_data["Credit Amount"])
german_credit_data["Duration"], duration_bins = bin_numerical_feature(german_credit_data["Duration"])

# Encode categorical features.
# `encoders` maps column name -> fitted LabelEncoder so that encoded values can
# be translated back to their original labels later.
encoders = {}
for column in german_credit_data.columns.difference(["Age", "Credit Amount", "Duration"]):
    column_values = german_credit_data[column]
    # `np.object` was removed in NumPy 1.24; use pandas' dtype predicates
    # instead. The string-dtype check also covers pandas' dedicated string dtype.
    if pd.api.types.is_object_dtype(column_values) or pd.api.types.is_string_dtype(column_values):
        le = LabelEncoder()
        german_credit_data[column] = le.fit_transform(column_values)
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

# Separate Target values
X, y = german_credit_data[german_credit_data.columns.difference(["Risk"])], german_credit_data["Risk"]

# Train a classifier
# NOTE(review): no random_state is passed, so the fitted forest (and every
# slice derived from it below) can differ between runs.
rfc = RandomForestClassifier(max_depth=5, n_estimators=10)
rfc.fit(X, y)

Checking Account ['little' 'moderate' 'rich'] [0 1 2]
Housing ['free' 'own' 'rent'] [0 1 2]
Purpose ['business' 'car' 'domestic appliances' 'education' 'furniture/equipment'
 'radio/TV' 'repairs' 'vacation/others'] [0 1 2 3 4 5 6 7]
Risk ['bad' 'good'] [0 1]
Savings Account ['little' 'moderate' 'quite rich' 'rich'] [0 1 2 3]
Sex ['female' 'male'] [0 1]


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [3]:
# Build the visual auditor for the trained classifier and the (X, y) dataset.
# NOTE(review): keyword meanings are defined by the visual_auditor package;
# prefix='gc_' presumably tags generated artefacts for this dataset — confirm.
visual_auditor.find_slices_and_visualize(
    rfc,
    (X, y),
    k=100,
    epsilon=0.01,
    degree=2,
    max_workers=4,
    precompute=False,
    prefix='gc_',
)

In [4]:
# Compare to the textual outputs of running SliceFinder in isolation
sf = SliceFinder(rfc, (X, y))
recommendations = sf.find_slice(k=20, epsilon=0.2, degree=2, max_workers=4)


def describe_filter_values(column, intervals):
    """Render the values of one slice filter as a single display string.

    Parameters
    ----------
    column : str
        Name of the filtered column.
    intervals : iterable
        Indexable entries describing the filter. For label-encoded columns
        (those present in ``encoders``) each entry is passed to the encoder's
        ``inverse_transform`` and the first decoded label is shown. For all
        other columns the entries are sorted by their first element; entries
        with more than one element are shown as ``"first ~ second"``,
        single-element entries as ``"first "``.

    Returns
    -------
    str
        Concatenation of the rendered entries.
    """
    if column in encoders:
        decoder = encoders[column]
        return ''.join(
            '%s ' % (decoder.inverse_transform(code)[0]) for code in intervals)

    pieces = []
    for bounds in sorted(intervals, key=lambda b: b[0]):
        if len(bounds) > 1:
            pieces.append('%s ~ %s' % (bounds[0], bounds[1]))
        else:
            pieces.append('%s ' % (bounds[0]))
    return ''.join(pieces)


# Print one text report per recommended slice: its filters, then its statistics.
for found_slice in recommendations:
    print('\n=====================\nSlice description:')
    for column_name, intervals in found_slice.filters.items():
        print('%s:%s' % (column_name, describe_filter_values(column_name, intervals)))
    print('---------------------\neffect_size: %s' % (found_slice.effect_size))
    print('---------------------\nmetric: %s' % (found_slice.metric))
    print('size: %s' % (found_slice.size))

degree 1
crossing
effect size filtering
degree 2
crossing
effect size filtering
sorting

Slice description:
Credit Amount:0 
Checking Account:little 
---------------------
effect_size: 0.20604875663239955
---------------------
metric: 0.6500569971886706
size: 29

Slice description:
Age:2 
Purpose:car 
---------------------
effect_size: 0.2203047135367654
---------------------
metric: 0.6634938413376075
size: 27

Slice description:
Credit Amount:0 
Duration:1 
---------------------
effect_size: 0.34299508186813044
---------------------
metric: 0.714606256551388
size: 27

Slice description:
Duration:3 
Purpose:furniture/equipment 
---------------------
effect_size: 0.2284819170149972
---------------------
metric: 0.6672316704632526
size: 21

Slice description:
Age:0 
Checking Account:little 
---------------------
effect_size: 0.21779150081920526
---------------------
metric: 0.6571599201660882
size: 20

Slice description:
Credit Amount:6 
Housing:own 
---------------------
effect_size: 0