# Slice Finding on UCI Adult

In [14]:
!pip install pandas
!pip install xgboost
!pip install scikit-learn



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost
import divisi

In [3]:
# Read the adult dataset and create a simple XGBoost model. The task is to predict
# whether someone makes over 50K in income.

# Fixed seed so that the train/test split, the fitted model, and the error rate
# printed at the end of this cell are reproducible across kernel restarts.
RANDOM_STATE = 42

df = pd.read_csv("adult.csv")

# These two columns are excluded from the model's features.
df_prepped = df.drop(columns=['fnlwgt', 'educational-num'])

# Features are every remaining column except the label; the label is a boolean
# Series that is True where the recorded income is '>50K'.
X = df_prepped.drop(columns=['income'])
y = df_prepped['income'] == '>50K'

# Numeric columns are passed to the model as-is.
X_continous  = X[['age', 'capital-gain', 'capital-loss', 'hours-per-week']]

# Categorical columns are one-hot encoded before being joined back to the numeric ones.
X_categorical = X[['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
                   'gender', 'native-country']]

X_encoded = pd.get_dummies(X_categorical)
X = pd.concat([X_continous, X_encoded], axis=1)

# Stratify on the label so both partitions keep the same positive rate; seed the
# shuffle so the partition is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE
)

model = xgboost.XGBClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Our outcomes will be y (true values), model_preds (predicted values),
# model_probs (model probabilities of positive label), and is_error (if each prediction is incorrect).
# Predictions are made over the full dataset (train and test rows together), so
# the rate printed below mixes in-sample and held-out errors.
model_preds = model.predict(X)
model_probs = model.predict_proba(X)
is_error = model_preds != y
print(f"Train + test error rate: {is_error.mean():.2%}")

Train + test error rate: 11.57%


In [4]:
# Convert every column of interest into a discrete one so it can be sliced on.
# The second argument maps a column name to that column's discretization spec:
#   * "bin" specs carry an explicit list of numeric thresholds,
#   * "unique" specs carry no further options,
#   * native-country uses a callable that returns a pair: a boolean mask that is
#     True where the value is not 'United-States', and a {code: label} mapping.
# NOTE(review): the precise meaning of each method (and of the callable's two
# arguments) is defined by divisi — confirm against its documentation.
discrete_df = divisi.discretization.discretize_data(
    df,
    {
        "age": {"method": "bin", "bins": [25, 45, 65]},
        # The purely categorical columns all use the same "unique" spec; they are
        # expanded here in their original order so the mapping's key order is unchanged.
        **{
            column: {"method": "unique"}
            for column in (
                "workclass",
                "education",
                "marital-status",
                "occupation",
                "relationship",
                "race",
                "gender",
            )
        },
        "capital-gain": {"method": "bin", "bins": [1]},
        "capital-loss": {"method": "bin", "bins": [1]},
        "hours-per-week": {"method": "bin", "bins": [40]},
        "native-country": {
            "method": lambda x, c: (x != "United-States", {0: "US", 1: "Non-US"}),
        },
    },
)

In [5]:
# Build the interactive slice finder on top of the discretized data.
# Each named metric is an array with one outcome per row: the true label, the
# model's probability for the positive class, and whether the prediction was wrong.
slice_metrics = {
    "> 50K": y.values,
    "Model Prob.": model_probs[:, 1],
    "Error": is_error.values,
}
w = divisi.SliceFinderWidget(discrete_df, metrics=slice_metrics)

# Leave the widget as the cell's last expression so the notebook renders it.
w

SliceFinderWidget(base_slice={'scoreValues': {'Large Slice': 0.0, 'Simple Rule': 1.0, '> 50K High': 1.0, '> 50…

In [6]:
# Display the widget's slice-intersection counts.
# NOTE(review): this presumably reflects whichever slices are currently selected
# in the widget above, so the value depends on interactive state and may differ
# between runs — confirm.
w.slice_intersection_counts 
# Alternative attribute to inspect, left disabled:
# w.selected_intersection_index

[{'slices': [], 'count': 24149, '> 50K': 5748, 'Error': 2696}]

In [7]:
# Display the widget's slice-intersection labels.
# NOTE(review): presumably tied to the current widget selection — confirm.
w.slice_intersection_labels

[]