# Visual Auditor Package Demo (German Credit Dataset)
Author: David Munechika (david.munechika@gatech.edu)

### Import Packages and Libraries

In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from visual_auditor import SliceFinder
import json

# Import visual auditor module
import visual_auditor

# Expand Jupyter notebook width
# Injects a <style> rule into the page that forces elements with the CSS
# class `container` to 100% width.
# NOTE(review): presumably this targets the classic Notebook layout — confirm
# it has an effect in the front end this demo is run in.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Data Preprocessing and Model Training

In [2]:
# Helper function for binning numerical features
def bin_feature(feature, data=None):
    """Discretize a numerical column in place and describe the resulting bins.

    Bin edges come from ``np.histogram_bin_edges(..., bins='auto')``. The
    column is then overwritten with the output of ``pd.cut``, labelled with
    the integers ``0 .. n_bins - 1``.

    Parameters
    ----------
    feature : str
        Name of the column to bin.
    data : pandas.DataFrame, optional
        Frame whose column is replaced. When omitted, the notebook-level
        ``german_credit_data`` frame is used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    list of str
        One ``' <low> - <high>'`` label per bin, in bin order. Edges are
        truncated with ``int()``, so fractional edges lose their decimals.
        The leading space is part of the label.
    """
    # Fall back to the notebook global only when no frame is passed explicitly.
    if data is None:
        data = german_credit_data

    bins = np.histogram_bin_edges(data[feature], bins='auto')
    n_bins = len(bins) - 1

    # right=True with include_lowest=True closes the first interval on the
    # left so the column minimum lands in bin 0 instead of becoming NaN.
    data[feature] = pd.cut(
        data[feature],
        bins,
        labels=list(range(n_bins)),
        right=True,
        include_lowest=True,
        duplicates='drop',
    )

    return [f' {int(bins[i])} - {int(bins[i + 1])}' for i in range(n_bins)]

# Helper function for mapping encodings to original values
def map_encodings(filename, features, encodings):
    """Rewrite a JSON file in place, replacing encoded values with labels.

    Every occurrence of ``"<feature>: <index>"`` inside any string of the
    JSON document (object keys and string values, at any nesting depth) is
    replaced by ``"<feature>: <encodings[feature][index]>"``.

    All occurrences are substituted in a single pass over each string, so:

    * an index is matched as a whole number (``Age: 1`` never matches the
      front of ``Age: 10``);
    * text produced by one substitution is never substituted again within
      the same call;
    * labels containing quotes or backslashes cannot corrupt the JSON,
      because substitution happens on decoded strings, not serialized text.

    An index with no entry in ``encodings[feature]`` is left untouched.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read and then overwrite.
    features : iterable of str
        Feature names to look for.
    encodings : mapping
        Maps each feature name to a sequence of labels, indexed by the
        encoded integer value.

    Raises
    ------
    KeyError
        If a name in ``features`` is missing from ``encodings``. The file is
        not modified in that case.

    Notes
    -----
    Calling this twice on the same file is not guaranteed to be a no-op: a
    label that itself starts with digits (for example ``'1 <= ... < 4'``)
    looks like an encoded value on the second run.
    """
    import re  # local import: only this helper needs it

    # Context manager guarantees the handle is closed even if parsing fails.
    with open(filename, "r") as slices_file:
        payload = json.load(slices_file)

    # Resolve every lookup table up front so a missing feature fails before
    # anything is written back.
    tables = {feature: encodings[feature] for feature in features}

    if tables:
        # One alternation over all feature names, followed by the full index.
        pattern = re.compile(
            '(' + '|'.join(re.escape(name) for name in tables) + '): ([0-9]+)'
        )

        def decode_match(match):
            name, index = match.group(1), int(match.group(2))
            labels = tables[name]
            if index >= len(labels):
                # Unknown code: keep the original text rather than guessing.
                return match.group(0)
            return f'{name}: {labels[index]}'

        def decode(node):
            # Walk the decoded document; only strings can contain encodings.
            if isinstance(node, str):
                return pattern.sub(decode_match, node)
            if isinstance(node, list):
                return [decode(item) for item in node]
            if isinstance(node, dict):
                return {decode(key): decode(value) for key, value in node.items()}
            return node

        payload = decode(payload)

    with open(filename, 'w') as outfile:
        json.dump(payload, outfile)

In [3]:
# Load German Credit dataset
# The CSV is read with explicit column names; the literal string "NA" is
# treated as a missing value.
german_credit_data = pd.read_csv(
    "data/german.csv",
    names=[
        "Checking Account", "Duration", "Credit History", "Purpose", "Credit Amount", "Savings Account",
        "Employment", "Installment Rate", "Relationship/Sex", "Debtors/Guarantors", "Residence Since",
        "Property", "Age", "Installment Plans", "Housing", "Existing Credits", "Job", "Maintenance",
        "Telephone", "Foreign", "Risk",
    ],
    engine='python',
    na_values="NA",
)

# Drop NA values
german_credit_data = german_credit_data.dropna()

# Bin numerical features
# `encoders` keeps the fitted LabelEncoder per column; `encodings` keeps, per
# column, the human-readable label for each encoded integer value.
encoders = {}
encodings = {}
numerical_features = ["Age", "Credit Amount", "Duration"]
for feature in numerical_features:
    encodings[feature] = bin_feature(feature)

# Encode categorical features
for column in german_credit_data.columns.difference(numerical_features):
    column_dtype = german_credit_data.dtypes[column]
    # `np.object` was removed in NumPy 1.24; compare against the builtin
    # `object` instead, and also accept pandas string dtypes so text columns
    # are still encoded when pandas does not store them as `object`.
    if column_dtype == object or pd.api.types.is_string_dtype(column_dtype):
        le = LabelEncoder()
        german_credit_data[column] = le.fit_transform(german_credit_data[column])
        encoders[column] = le
        encodings[column] = le.classes_
        print(column, le.classes_, le.transform(le.classes_))

# Hand-written display labels that override the encoder classes for these two
# columns. Their order must line up with the encoded integers printed above.
encodings["Checking Account"] = [' 0 - 200', '< 0', '> 200', 'No checking account']
encodings["Savings Account"] = [' 100 - 500', ' 500 - 1000', '< 100', '> 1000', 'Unknown']

# Separate Target values
# `columns.difference` returns the remaining column names in sorted order.
X, y = german_credit_data[german_credit_data.columns.difference(["Risk"])], german_credit_data["Risk"]

# Train a classifier
# NOTE(review): no random_state is passed, so the fitted forest differs
# between runs — consider fixing a seed if reproducible slices are needed.
rfc = RandomForestClassifier(max_depth=5, n_estimators=10)
rfc.fit(X, y)

Checking Account ['0 <= ... < 200 DM' '< 0 DM' '>= 200 DM' 'No checking account'] [0 1 2 3]
Credit History ['All credits paid back' 'Critical account' 'Delay in paying'
 'Existing credits paid back' 'No credits taken'] [0 1 2 3 4]
Debtors/Guarantors ['Co-applicant' 'Guarantor' 'None'] [0 1 2]
Employment ['1 <= ... < 4 years' '4 <= ... < 7 years' '< 1 year' '>= 7 years'
 'Unemployed'] [0 1 2 3 4]
Foreign ['No' 'Yes'] [0 1]
Housing ['For free' 'Own' 'Rent'] [0 1 2]
Installment Plans ['Bank' 'None' 'Stores'] [0 1 2]
Job ['Management' 'Skilled' 'Unemployed' 'Unskilled'] [0 1 2 3]
Property ['Car or other' 'Life insurance' 'Real estate' 'Unknown or none'] [0 1 2 3]
Purpose ['Appliances' 'Business' 'Car (new)' 'Car (used)' 'Education' 'Furniture'
 'Others' 'Radio/TV' 'Repairs' 'Retraining'] [0 1 2 3 4 5 6 7 8 9]
Relationship/Sex ['Female (separated/married)' 'Male (married)' 'Male (separated)'
 'Male (single)'] [0 1 2 3]
Savings Account ['100 <= ... < 500 DM' '500 <= ... < 1000 DM' '< 100 DM'

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Precompute Data for Demo (Internal Use)

In [4]:
# Map encodings
# Translate encoded feature values back to readable labels in each
# precomputed JSON file, processing the files in the order listed.
features = [
    "Age", "Credit Amount", "Duration", "Checking Account", "Credit History",
    "Debtors/Guarantors", "Employment", "Foreign", "Housing", "Installment Plans",
    "Job", "Property", "Purpose", "Relationship/Sex", "Savings Account", "Telephone",
]
precomputed_files = (
    'gc_slices.json',
    'gc_common_samples.json',
    'gc_overlapping_samples.json',
    'gc_reverse_slices.json',
    'gc_reverse_common_samples.json',
    'gc_reverse_overlapping_samples.json',
)
for precomputed_file in precomputed_files:
    map_encodings(precomputed_file, features, encodings)

### Interact with Visual Auditor

In [5]:
# Generate the visual auditor for this model and dataset
# The trained classifier and the (features, target) pair are passed
# positionally; every tuning option is passed by keyword.
visual_auditor.find_slices_and_visualize(
    rfc,
    (X, y),
    k=100,
    epsilon=0.01,
    degree=2,
    max_workers=4,
    precompute=True,
    prefix='gc_',
)