# Visual Auditor Package Demo (Customer Churn Prediction Dataset)
Author: David Munechika (david.munechika@gatech.edu)

### Import Packages and Libraries

In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from visual_auditor import SliceFinder
import json

# Import visual auditor module
import visual_auditor

# Expand Jupyter notebook width by injecting a CSS override into the page.
# NOTE(review): the `.container` selector presumably targets the classic
# Notebook layout -- confirm it has any effect in the front end being used.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Data Preprocessing and Model Training

In [12]:
# Helper function for binning numerical features
def bin_feature(feature, data=None):
    """Bin a numerical column into 10 equal-width bins, in place.

    The column ``feature`` of ``data`` is replaced by its bin index
    (0..9, as a pandas categorical), and human-readable labels for the
    bins are returned so the indices can later be mapped back to ranges.

    Parameters
    ----------
    feature : str
        Name of the column to bin.
    data : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``customer_churn``
        frame is used, which is how the original helper behaved.

    Returns
    -------
    list of str
        One label per bin, formatted as ``' <low> - <high>'``. The leading
        space is intentional: ``map_encodings`` relies on it so that an
        already-substituted label is never matched again.
    """
    if data is None:
        # Backward-compatible default: operate on the global frame.
        data = customer_churn

    bins = np.histogram_bin_edges(data[feature], bins=10)
    data[feature] = pd.cut(data[feature], bins, labels=list(range(len(bins) - 1)), right=True, include_lowest=True, duplicates='drop')

    # Integer-truncated edges are the preferred label format, but for narrow
    # ranges (e.g. 0..9 split into 10 bins) truncation collapses neighbouring
    # edges and yields misleading labels such as ' 0 - 0'. Only in that case
    # fall back to decimals, using the fewest places that keep edges distinct.
    truncated = [int(edge) for edge in bins]
    if len(set(truncated)) == len(truncated):
        edge_text = [str(edge) for edge in truncated]
    else:
        for decimals in range(1, 7):
            edge_text = [f'{edge:.{decimals}f}' for edge in bins]
            if len(set(edge_text)) == len(edge_text):
                break

    return [f' {low} - {high}' for low, high in zip(edge_text, edge_text[1:])]

# Helper function for mapping encodings to original values
def map_encodings(filename, features, encodings):
    """Rewrite a JSON file, replacing encoded feature values with labels.

    The file is loaded, re-serialized to a single string, and every
    occurrence of ``'<feature>: <index>'`` in that string is replaced by
    ``'<feature>: <encodings[feature][index]>'``. The result is parsed again
    and written back to the same path.

    Parameters
    ----------
    filename : str
        Path of the JSON file to rewrite in place.
    features : iterable of str
        Feature names to look for; each must be a key of ``encodings``.
    encodings : mapping
        Maps a feature name to a sequence whose position ``i`` holds the
        label for encoded value ``i``.

    Raises
    ------
    KeyError
        If a name in ``features`` is missing from ``encodings``.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(filename, "r") as slices_file:
        data = json.dumps(json.load(slices_file))

    for feature in features:
        labels = encodings[feature]
        # The text being edited is serialized JSON, so both the search key and
        # the replacement must be JSON-escaped (quotes stripped) to keep the
        # document valid. For plain ASCII text this is a no-op.
        key = json.dumps(str(feature))[1:-1]
        # Highest index first, so 'State: 1' cannot clobber the prefix of
        # 'State: 10' before the longer index has been substituted.
        for index in reversed(range(len(labels))):
            label = json.dumps(str(labels[index]))[1:-1]
            data = data.replace(f'{key}: {index}', f'{key}: {label}')

    with open(filename, 'w') as outfile:
        json.dump(json.loads(data), outfile)

In [13]:
# Load Customer Churn Prediction dataset. Column names are supplied explicitly
# and the literal string "NA" is treated as a missing value.
customer_churn = pd.read_csv(
    "data/customer_churn_prediction.csv",
    names=[
        "State", "Account Length", "Area Code", "International", "Voicemail Plan", "Voicemail Messages",
        "Day Minutes", "Day Calls", "Day Charge", "Eve Minutes", "Eve Calls", "Eve Charge", "Night Minutes",
        "Night Calls", "Night Charge", "Intl Minutes", "Intl Calls", "Intl Charge", "CustServ Calls", "Churn"],
    engine='python',
    na_values="NA")

# Drop NA values
customer_churn = customer_churn.dropna()

# Bin numerical features. `encodings` maps each feature name to the labels of
# its encoded values; it is consumed later by map_encodings.
encoders = {}
encodings = {}
numerical_features = ["Account Length", "Voicemail Messages", "Day Minutes", "Eve Minutes", "Night Minutes", "Intl Minutes", "Day Charge",
                     "Eve Charge", "Night Charge", "Intl Charge", "Day Calls", "Eve Calls", "Night Calls", "Intl Calls", "CustServ Calls"]
for feature in numerical_features:
    encodings[feature] = bin_feature(feature)

# Encode categorical features (every remaining column with object dtype,
# which includes the "Churn" target when it is stored as text).
for column in customer_churn.columns.difference(numerical_features):
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # equivalent dtype comparison and works on every NumPy version.
    if customer_churn.dtypes[column] == object:
        le = LabelEncoder()
        customer_churn[column] = le.fit_transform(customer_churn[column])
        encoders[column] = le
        encodings[column] = le.classes_
        print(column, le.classes_, le.transform(le.classes_))

# Separate Target values
X, y = customer_churn[customer_churn.columns.difference(["Churn"])], customer_churn["Churn"]

# Train a classifier. A fixed random_state makes the forest (and therefore the
# slices found downstream) reproducible across Restart & Run All.
rfc = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)
rfc.fit(X, y)

Area Code ['area_code_408' 'area_code_415' 'area_code_510'] [0 1 2]
Churn ['no' 'yes'] [0 1]
International ['no' 'yes'] [0 1]
State ['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'HI' 'IA' 'ID'
 'IL' 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC'
 'ND' 'NE' 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD'
 'TN' 'TX' 'UT' 'VA' 'VT' 'WA' 'WI' 'WV' 'WY'] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50]
Voicemail Plan ['no' 'yes'] [0 1]
{'Account Length': [' 1 - 25', ' 25 - 49', ' 49 - 73', ' 73 - 97', ' 97 - 122', ' 122 - 146', ' 146 - 170', ' 170 - 194', ' 194 - 218', ' 218 - 243'], 'Voicemail Messages': [' 0 - 5', ' 5 - 10', ' 10 - 15', ' 15 - 20', ' 20 - 26', ' 26 - 31', ' 31 - 36', ' 36 - 41', ' 41 - 46', ' 46 - 52'], 'Day Minutes': [' 0 - 35', ' 35 - 70', ' 70 - 105', ' 105 - 140', ' 140 - 175', ' 175 - 210', ' 210 - 246', ' 246 - 281', ' 2

### Interact with Visual Auditor

In [19]:
# Run the slice search on the trained classifier and open the visualization.
# NOTE(review): with precompute=True and prefix='cp_' this presumably writes
# the 'cp_*.json' files that the next section post-processes -- confirm.
visual_auditor.find_slices_and_visualize(
    rfc,
    (X, y),
    k=100,
    epsilon=0.2,
    degree=2,
    max_workers=4,
    precompute=True,
    prefix='cp_',
)

### Precompute Data for Demo (Internal Use)

In [18]:
# Map encodings: rewrite each precomputed JSON file so that encoded feature
# values are shown as their original labels.
features = [
    "State", "Account Length", "Area Code", "International", "Voicemail Plan", "Voicemail Messages",
    "Day Minutes", "Day Calls", "Day Charge", "Eve Minutes", "Eve Calls", "Eve Charge", "Night Minutes",
    "Night Calls", "Night Charge", "Intl Minutes", "Intl Calls", "Intl Charge", "CustServ Calls",
]
precomputed_files = (
    'cp_slices.json',
    'cp_common_samples.json',
    'cp_overlapping_samples.json',
    'cp_reverse_slices.json',
    'cp_reverse_common_samples.json',
    'cp_reverse_overlapping_samples.json',
)
for precomputed_file in precomputed_files:
    map_encodings(precomputed_file, features, encodings)