# Visual Auditor Package Demo (Customer Churn Prediction Dataset)
Author: David Munechika (david.munechika@gatech.edu)

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import visual_auditor
from visual_auditor import SliceFinder

In [2]:
# Load Customer Churn Prediction dataset.
# Column names are supplied explicitly through `names`, and the literal
# string "NA" is parsed as a missing value.
customer_churn = pd.read_csv(
    "data/customer_churn_prediction.csv",
    names=[
        "State", "Account Length", "Area Code", "International", "Voicemail Plan", "Voicemail Messages",
        "Day Minutes", "Day Calls", "Day Charge", "Eve Minutes", "Eve Calls", "Eve Charge", "Night Minutes",
        "Night Calls", "Night Charge", "Intl Minutes", "Intl Calls", "Intl Charge", "CustServ Calls", "Churn"],
    engine='python',
    na_values="NA")

# Drop NA values
customer_churn = customer_churn.dropna()

# Bin numerical features: every numeric column is cut into 3 equal-width
# bins labelled "0", "1", "2" so that slices can be described over a small
# number of discrete values.
binned_columns = [
    "Account Length", "Voicemail Messages",
    "Day Minutes", "Eve Minutes", "Night Minutes", "Intl Minutes",
    "Day Charge", "Eve Charge", "Night Charge", "Intl Charge",
    "Day Calls", "Eve Calls", "Night Calls", "Intl Calls",
    "CustServ Calls",
]
for column in binned_columns:
    customer_churn[column] = pd.cut(customer_churn[column], 3, labels=["0", "1", "2"])

# Encode categorical features.
# Only columns still holding Python objects (the raw string columns) are
# label-encoded; the binned columns above are pandas categoricals and are
# skipped. The fitted encoders are kept so the integer codes can be mapped
# back to the original labels later.
# NOTE: compare against the builtin `object`; the `np.object` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
encoders = {}
for column in customer_churn.columns:
    if customer_churn[column].dtype == object:
        le = LabelEncoder()
        customer_churn[column] = le.fit_transform(customer_churn[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

# Separate Target values.
# `Index.difference` returns the remaining column names in sorted order, so
# the feature columns of X are ordered alphabetically.
X, y = customer_churn[customer_churn.columns.difference(["Churn"])], customer_churn["Churn"]

# Train a classifier
rfc = RandomForestClassifier(max_depth=5, n_estimators=10)
rfc.fit(X, y)

State ['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'HI' 'IA' 'ID'
 'IL' 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC'
 'ND' 'NE' 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD'
 'TN' 'TX' 'UT' 'VA' 'VT' 'WA' 'WI' 'WV' 'WY'] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50]
Area Code ['area_code_408' 'area_code_415' 'area_code_510'] [0 1 2]
International ['no' 'yes'] [0 1]
Voicemail Plan ['no' 'yes'] [0 1]
Churn ['no' 'yes'] [0 1]


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [4]:
# Run slice finding on the trained classifier over the full (X, y) data and
# hand the result to the Visual Auditor.
# NOTE(review): the meaning of k / epsilon / degree / precompute / prefix is
# defined by the visual_auditor package and is not visible here; confirm
# against its documentation before changing these values.
visual_auditor.find_slices_and_visualize(
    rfc,
    (X, y),
    k=100,
    epsilon=0.2,
    degree=2,
    max_workers=4,
    precompute=False,
    prefix='cp_',
)

In [None]:
# Compare to the textual outputs of running SliceFinder in isolation
sf = SliceFinder(rfc, (X, y))
recommendations = sf.find_slice(k=20, epsilon=0.2, degree=2, max_workers=4)

# Print a human-readable report for every recommended slice: the filters that
# define the slice, followed by its effect size, metric and size.
for s in recommendations:
    print('\n=====================\nSlice description:')
    for feature, entries in s.filters.items():
        if feature in encoders:
            # Label-encoded feature: map the integer codes back to the
            # original string labels using the encoder fitted earlier.
            # NOTE(review): assumes each entry is a sequence of encoded
            # values, of which only the first is shown; confirm against
            # SliceFinder's filter format.
            le = encoders[feature]
            parts = [str(le.inverse_transform(entry)[0]) for entry in entries]
        else:
            # Other features: an entry with more than one element is shown
            # as a "low ~ high" range, otherwise as a single value.
            parts = [
                '%s ~ %s' % (entry[0], entry[1]) if len(entry) > 1 else str(entry[0])
                for entry in sorted(entries, key=lambda x: x[0])
            ]
        # Join with a space so consecutive entries (including ranges) never
        # run together in the printed description.
        print('%s:%s' % (feature, ' '.join(parts)))
    print('---------------------\neffect_size: %s' % (s.effect_size))
    print('---------------------\nmetric: %s' % (s.metric))
    print('size: %s' % (s.size))