# Visual Auditor Package Demo
Author: David Munechika (david.munechika@gatech.edu)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import visual_auditor
from visual_auditor import SliceFinder

In [2]:
# Load Adult dataset.
# Column names are supplied explicitly through `names`. Fields are split on a
# comma with optional surrounding whitespace (a regex separator, parsed with the
# python engine), and "?" entries are read as missing values.
adult_data = pd.read_csv(
    "data/adult.data",
    names=[
        "Age", "Workclass", "Final Weight", "Education", "Education-Num", "Marital Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"],
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")

# Drop every row that contains at least one missing value
adult_data = adult_data.dropna()

# Encode categorical features as integer codes. The fitted encoder of each
# column is kept in `encoders` so the codes can be mapped back to labels later.
encoders = {}
for column in adult_data.columns:
    column_values = adult_data[column]
    # `np.object` was removed from NumPy (deprecated in 1.20, gone in 1.24), so
    # the dtype is tested with the pandas predicates instead. The string check
    # also covers pandas' dedicated string dtype, which is not `object`.
    if (pd.api.types.is_object_dtype(column_values)
            or pd.api.types.is_string_dtype(column_values)):
        le = LabelEncoder()
        adult_data[column] = le.fit_transform(column_values)
        encoders[column] = le
        # Show the label -> integer code mapping for this column
        print(column, le.classes_, le.transform(le.classes_))

# Separate Target values: X holds every column except "Target", y holds "Target"
X, y = adult_data[adult_data.columns.difference(["Target"])], adult_data["Target"]

# Train a classifier
# NOTE(review): no random_state is passed, so the fitted forest (and every
# number derived from it) can differ from run to run.
rfc = RandomForestClassifier(max_depth=5, n_estimators=10)
rfc.fit(X, y)

Workclass ['Federal-gov' 'Local-gov' 'Private' 'Self-emp-inc' 'Self-emp-not-inc'
 'State-gov' 'Without-pay'] [0 1 2 3 4 5 6]
Education ['10th' '11th' '12th' '1st-4th' '5th-6th' '7th-8th' '9th' 'Assoc-acdm'
 'Assoc-voc' 'Bachelors' 'Doctorate' 'HS-grad' 'Masters' 'Preschool'
 'Prof-school' 'Some-college'] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
Marital Status ['Divorced' 'Married-AF-spouse' 'Married-civ-spouse'
 'Married-spouse-absent' 'Never-married' 'Separated' 'Widowed'] [0 1 2 3 4 5 6]
Occupation ['Adm-clerical' 'Armed-Forces' 'Craft-repair' 'Exec-managerial'
 'Farming-fishing' 'Handlers-cleaners' 'Machine-op-inspct' 'Other-service'
 'Priv-house-serv' 'Prof-specialty' 'Protective-serv' 'Sales'
 'Tech-support' 'Transport-moving'] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
Relationship ['Husband' 'Not-in-family' 'Other-relative' 'Own-child' 'Unmarried' 'Wife'] [0 1 2 3 4 5]
Race ['Amer-Indian-Eskimo' 'Asian-Pac-Islander' 'Black' 'Other' 'White'] [0 1 2 3 4]
Sex ['Female' 'M

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [3]:
# Generate slices: run SliceFinder on the trained classifier and the (X, y) data
sf = SliceFinder(rfc, (X, y))
recommendations = sf.find_slice(k=20, epsilon=0.2, degree=1, max_workers=4)

# Print a text report for every recommended slice: its filters, then its stats
for found in recommendations:
    print('\n=====================\nSlice description:')
    for feature, groups in found.filters.items():
        if feature in encoders:
            # Label-encoded column: map each code back to its original label
            decoder = encoders[feature]
            pieces = ['%s ' % (decoder.inverse_transform(group)[0]) for group in groups]
        else:
            # Other columns: entries with two elements are printed as "a ~ b",
            # single-element entries as the bare value; ordered by first element
            ordered = sorted(groups, key=lambda group: group[0])
            pieces = [
                '%s ~ %s' % (group[0], group[1]) if len(group) > 1 else '%s ' % (group[0])
                for group in ordered
            ]
        print('%s:%s' % (feature, ''.join(pieces)))
    print('---------------------\neffect_size: %s' % (found.effect_size))
    print('---------------------\nmetric: %s' % (found.metric))
    print('size: %s' % (found.size))

degree 1
crossing
effect size filtering
sorting
Sex: 1
Marital Status: 2
Relationship: 0
Occupation: 3
Hours per week: 50
Education: 12
Education-Num: 14
Relationship: 5
Hours per week: 60
Workclass: 3
Age: 47
Hours per week: 55
Age: 50
Age: 48
Age: 54
Education: 10
Education-Num: 16
Age: 59
Hours per week: 70
Hours per week: 65

Slice description:
Sex:Male 
---------------------
effect_size: 0.29061607542070694
---------------------
metric: 0.4115335972299737
size: 20380

Slice description:
Marital Status:Married-civ-spouse 
---------------------
effect_size: 0.5952707829757081
---------------------
metric: 0.553161165584165
size: 14065

Slice description:
Relationship:Husband 
---------------------
effect_size: 0.5293867527594431
---------------------
metric: 0.5444814147736076
size: 12463

Slice description:
Occupation:Exec-managerial 
---------------------
effect_size: 0.24362876834220473
---------------------
metric: 0.4905952965984625
size: 3992

Slice description:
Hours per week

In [3]:
# Generate Visual Auditor
# Calls the package-level `visualize()` with no arguments.
# NOTE(review): presumably this displays the slices produced by the SliceFinder
# run in the previous cell; confirm how `visualize()` obtains its data.
visual_auditor.visualize()