# SliceFinder

In [2]:
import pandas as pd
import numpy as np
import pickle
import json
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from slice_finder import SliceFinder

from ipywidgets import interact, interactive
from IPython.display import display

from bokeh.layouts import widgetbox, row
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models.widgets import DataTable, TableColumn  
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
# Enable Bokeh's notebook output mode (output_notebook is imported from bokeh.io above).
output_notebook()

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import matplotlib
# Use a larger default font size for all matplotlib figures.
matplotlib.rcParams.update({'font.size': 17})

## UCI Census Data Set
For this demo, we use the UCI Census (Adult) data set; the classification task is to predict whether a person earns more than $50K a year.

In [3]:
# Load the Adult dataset. The raw file has no header row, so column names are
# supplied here. Fields are separated by a comma with optional surrounding
# whitespace, and "?" marks a missing value.
adult_data = pd.read_csv(
    "data/adult.data",
    names=[
        "Age", "Workclass", "Final Weight", "Education", "Education-Num", "Marital Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"],
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")

# Drop rows that contain any missing value.
adult_data = adult_data.dropna()

# Label-encode every non-numeric column, keeping one fitted encoder per column
# so that the integer codes can be mapped back to the original labels later.
# The dtype test avoids `np.object`, which was removed in NumPy 1.24.
encoders = {}
for column in adult_data.columns:
    if not pd.api.types.is_numeric_dtype(adult_data[column]):
        le = LabelEncoder()
        adult_data[column] = le.fit_transform(adult_data[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

# Separate the features from the target column.
X, y = adult_data[adult_data.columns.difference(["Target"])], adult_data["Target"]

# Save the encoders to file; the context manager guarantees the handle is closed.
with open("encoders.pkl", "wb") as encoder_file:
    pickle.dump(encoders, encoder_file, protocol=2)

# Train a classifier. A fixed seed makes the model, and therefore the metrics
# and recommended slices computed from it, reproducible across runs.
# (The variable keeps its historical name `lr` although it is a random forest.)
RANDOM_SEED = 0
lr = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=RANDOM_SEED)
lr.fit(X, y)

# Evaluate the model with SliceFinder on the full data set.
sf = SliceFinder(lr, (X, y))
metrics_all = sf.evaluate_model((X, y))

# Store the reference metrics: mean, standard deviation and number of values.
reference = (np.mean(metrics_all), np.std(metrics_all), len(metrics_all))
print("Metrics: ", reference)

Workclass ['Federal-gov' 'Local-gov' 'Private' 'Self-emp-inc' 'Self-emp-not-inc'
 'State-gov' 'Without-pay'] [0 1 2 3 4 5 6]
Education ['10th' '11th' '12th' '1st-4th' '5th-6th' '7th-8th' '9th' 'Assoc-acdm'
 'Assoc-voc' 'Bachelors' 'Doctorate' 'HS-grad' 'Masters' 'Preschool'
 'Prof-school' 'Some-college'] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
Marital Status ['Divorced' 'Married-AF-spouse' 'Married-civ-spouse'
 'Married-spouse-absent' 'Never-married' 'Separated' 'Widowed'] [0 1 2 3 4 5 6]
Occupation ['Adm-clerical' 'Armed-Forces' 'Craft-repair' 'Exec-managerial'
 'Farming-fishing' 'Handlers-cleaners' 'Machine-op-inspct' 'Other-service'
 'Priv-house-serv' 'Prof-specialty' 'Protective-serv' 'Sales'
 'Tech-support' 'Transport-moving'] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
Relationship ['Husband' 'Not-in-family' 'Other-relative' 'Own-child' 'Unmarried' 'Wife'] [0 1 2 3 4 5]
Race ['Amer-Indian-Eskimo' 'Asian-Pac-Islander' 'Black' 'Other' 'White'] [0 1 2 3 4]
Sex ['Female' 'M

## SliceFinder Recommendations

In [4]:
# Ask SliceFinder for recommended slices, then print a readable report per slice.
recommendations = sf.find_slice(k=20, epsilon=0.2, degree=2, max_workers=4)

for rec in recommendations:
    print('\n=====================\nSlice description:')
    for feature, literals in list(rec.filters.items()):
        if feature in encoders:
            # Encoded feature: map each stored code back to its original label.
            encoder = encoders[feature]
            parts = ['%s ' % (encoder.inverse_transform(lit)[0]) for lit in literals]
        else:
            # Non-encoded feature: a literal with two entries is printed as
            # "first ~ second", a single-entry literal as a plain value.
            parts = [
                '%s ~ %s' % (lit[0], lit[1]) if len(lit) > 1 else '%s ' % (lit[0])
                for lit in sorted(literals, key=lambda lit: lit[0])
            ]
        print('%s:%s' % (feature, ''.join(parts)))
    print('---------------------\neffect_size: %s' % (rec.effect_size))
    print('---------------------\nmetric: %s' % (rec.metric))
    print('size: %s' % (rec.size))

degree 1
crossing
effect size filtering


KeyboardInterrupt: 

In [7]:
# Save slice recommendations to file.
# Each slice becomes a one-entry dict keyed by its description, which is built
# as "feature: value" pairs using the first value of every filter.
slices = []
for s in recommendations:
    # Fetch the filter once per slice instead of on every loop iteration.
    slice_filter = s.get_filter()
    description = ''
    for i, (feature, values) in enumerate(zip(slice_filter.keys(), slice_filter.values())):
        if i > 0:
            description += ', '
        description += str(feature) + ': ' + str(values[0][0])
        print(description)
    # Named `slice_record` rather than `slice` so the builtin `slice` is not
    # shadowed for the rest of the kernel session.
    slice_record = {
        description: {
            "slice": description,
            "effect_size": s.effect_size,
            "metric": s.metric,
            "size": s.size,
            "degree": len(slice_filter.keys())
        }
    }
    slices.append(slice_record)
data = {}
data["data"] = slices
with open('logloss.json', 'w') as f:
    json.dump(data, f)

Capital Loss: 0
Capital Gain: 0
Education: 11
Education-Num: 9
Sex: 0
Marital Status: 4
Relationship: 1
Education: 15
Education-Num: 10
Relationship: 3
Marital Status: 0
Occupation: 7
Relationship: 4
Occupation: 6
Occupation: 5
Education: 1
Education-Num: 7
Marital Status: 5
Relationship: 2
Marital Status: 6
Age: 23
Education: 0
Education-Num: 6
Age: 25
Age: 27
Age: 24
Age: 26
Age: 22
Age: 20
Age: 21
Country: 25
Age: 19
Hours per week: 25
Education: 5
Education-Num: 4
Education: 6
Education-Num: 5
Age: 18
Education: 2
Education-Num: 8
Marital Status: 3
Hours per week: 15
Age: 17
Education: 4
Education-Num: 3
Hours per week: 24
Hours per week: 16
Education: 3
Education-Num: 2
Occupation: 8
Capital Gain: 5013
Country: 5
Country: 12
Country: 3
Capital Gain: 3325
Capital Gain: 2174
Education: 13
Education-Num: 1
Capital Loss: 1740
Capital Gain: 4064
Capital Gain: 4650
Hours per week: 6
Capital Loss: 1876
Hours per week: 22
Capital Loss: 1590
Capital Gain: 3137
Capital Loss: 1602
Capital Lo

In [11]:
# Calculate overlapping samples.
# For every recommended slice, collect the index labels of the rows in
# `adult_data` that belong to it, keyed by the slice description
# ("feature: value" pairs, using the first value of every filter).
sampleDict = {}
for s in recommendations:
    slice_filter = s.get_filter()
    keyList = list(slice_filter.keys())
    valueList = list(slice_filter.values())
    description = ', '.join(
        str(key) + ': ' + str(value[0][0]) for key, value in zip(keyList, valueList))

    if keyList:
        # Pair each feature with its OWN filter value and require every pair to
        # match. Comparing every feature against every filter's value would pull
        # in unrelated rows for multi-feature slices.
        # NOTE(review): only the first value of each filter is compared, matching
        # the description above; multi-valued filters are not expanded.
        mask = pd.Series(True, index=adult_data.index)
        for key, value in zip(keyList, valueList):
            mask &= adult_data[key] == value[0][0]
        # Vectorised selection replaces a per-row Python loop; tolist() yields
        # plain Python scalars, which stay JSON-serialisable.
        sliceSet = set(adult_data.index[mask].tolist())
    else:
        # A slice without filters selects no rows.
        sliceSet = set()
    sampleDict[description] = sliceSet

['Sex']
[[[1]]]
['Marital Status']
[[[2]]]
['Relationship']
[[[0]]]
['Occupation']
[[[3]]]
['Hours per week']
[[[50]]]


KeyboardInterrupt: 

In [6]:
# Save overlapping samples to file.
# Each collection of indices is replaced in place by a list so that json can
# serialise it.
sampleDict.update({name: list(members) for name, members in sampleDict.items()})
with open("loglosssamples.json", "w") as outfile:
    json.dump(sampleDict, outfile)