### Setup a classification experiment¶
* [Interpretable Classification Methods](https://nbviewer.jupyter.org/github/interpretml/interpret/blob/master/examples/python/notebooks/Interpretable%20Classification%20Methods.ipynb)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None)
df.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]
# df = df.sample(frac=0.1, random_state=1)
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]
y = df[label].apply(lambda x: 0 if x == " <=50K" else 1) #Turning response into 0 and 1

seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

In [2]:
df.shape

(32561, 15)

In [3]:
df.describe()

Unnamed: 0,Age,fnlwgt,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
df.info

<bound method DataFrame.info of        Age          WorkClass  fnlwgt    Education  EducationNum  \
0       39          State-gov   77516    Bachelors            13   
1       50   Self-emp-not-inc   83311    Bachelors            13   
2       38            Private  215646      HS-grad             9   
3       53            Private  234721         11th             7   
4       28            Private  338409    Bachelors            13   
...    ...                ...     ...          ...           ...   
32556   27            Private  257302   Assoc-acdm            12   
32557   40            Private  154374      HS-grad             9   
32558   58            Private  151910      HS-grad             9   
32559   22            Private  201490      HS-grad             9   
32560   52       Self-emp-inc  287927      HS-grad             9   

             MaritalStatus          Occupation    Relationship    Race  \
0            Never-married        Adm-clerical   Not-in-family   White   
1  

In [7]:
from interpret import show
from interpret.data import ClassHistogram
hist = ClassHistogram().explain_data(X_train, y_train, name = 'Train Data')
show(hist)

### Train the Explainable Boosting Machine (EBM)

In [8]:
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier

ebm = ExplainableBoostingClassifier(random_state=seed, n_jobs=-1)
ebm.fit(X_train, y_train)   #Works on dataframes and numpy arrays

ExplainableBoostingClassifier(feature_names=['Age', 'WorkClass', 'fnlwgt',
                                             'Education', 'EducationNum',
                                             'MaritalStatus', 'Occupation',
                                             'Relationship', 'Race', 'Gender',
                                             'CapitalGain', 'CapitalLoss',
                                             'HoursPerWeek', 'NativeCountry',
                                             'Relationship x HoursPerWeek',
                                             'Age x Relationship',
                                             'MaritalStatus x HoursPerWeek',
                                             'EducationNum x Occupation',
                                             'fnlwgt x HoursPerWeek',
                                             'Occupat...
                              feature_types=['continuous', 'categorical',
                                             'co

### Global Explanations: What the model learned overall

In [9]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

### Local Explanations: How an individual prediction was made

In [10]:
ebm_local = ebm.explain_local(X_test[:5], y_test[:5], name='EBM')
show(ebm_local)

### Evaluate EBM performance

In [11]:
from interpret.perf import ROC

ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)

### Let's test out a few other Explainable Models

In [12]:
from interpret.glassbox import LogisticRegression, ClassificationTree

# We have to transform categorical variables to use Logistic Regression and Decision Tree
X_enc = pd.get_dummies(X, prefix_sep='.')
feature_names = list(X_enc.columns)
X_train_enc, X_test_enc, y_train, y_test = train_test_split(X_enc, y, test_size=0.20, random_state=seed)

lr = LogisticRegression(random_state=seed, feature_names=feature_names, penalty='l1', solver='liblinear')
lr.fit(X_train_enc, y_train)

tree = ClassificationTree()
tree.fit(X_train_enc, y_train)

<interpret.glassbox.decisiontree.ClassificationTree at 0x7fe00120adc0>

### Compare performance using the Dashboard

In [13]:
lr_perf = ROC(lr.predict_proba).explain_perf(X_test_enc, y_test, name='Logistic Regression')
tree_perf = ROC(tree.predict_proba).explain_perf(X_test_enc, y_test, name='Classification Tree')

show(lr_perf)
show(tree_perf)
show(ebm_perf)

### Glassbox: All of our models have global and local explanations

In [14]:
lr_global = lr.explain_global(name='Logistic Regression')
tree_global = tree.explain_global(name='Classification Tree')

show(lr_global)
show(tree_global)
show(ebm_global)

### Dashboard: look at everything at once

In [15]:

# Do everything in one shot with the InterpretML Dashboard by passing a list into show

show([hist, lr_global, lr_perf, tree_global, tree_perf, ebm_global, ebm_perf], share_tables=True)