# Rulefit and AutoML with H2O
This notebook walks through using the Adult dataset with Rulefit and H2O AutoML.

In [1]:
#% load_ext autoreload
#% autoreload 2
#import os

#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

from imodels import OneRClassifier #pip install imodels

import interpret #pip install interpret
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder

## Download dataset and create splits

In [7]:
# The raw file can be fetched once with:
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country', 'income',
]
target_col = 'income_label'

# The file has no header row; '?' and ' ?' entries are parsed as missing values.
df = pd.read_csv('adult.data', header=None, names=names, na_values=['?', ' ?'])

# Binary target: 1 when the raw income string contains ">50K", otherwise 0.
df[target_col] = df['income'].str.contains('>50K', regex=False).astype(int)
df = df.drop(columns='income')

target = df[target_col].values

# NOTE: the whole frame is split, so X_train / X_test still contain the
# `income_label` column. Anything fitted on them must select its feature
# columns explicitly to avoid leaking the target.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.25, random_state=1
)

In [7]:
# Numeric columns: missing values are replaced by the column median; no scaling.
numeric_features = ['age', 'hours_per_week', 'capital_gain', 'capital_loss', 'education_num', 'fnlwgt']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: missing values become the literal category 'missing',
# then every category is mapped to an integer code.
categorical_features = ['education', 'relationship', 'workclass', 'occupation', 'marital_status', 'race']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder()),
])

# The numeric block is listed first, so transformed column i corresponds to
# numeric_features[i] for the first six columns (e.g. column 2 is capital_gain).
# Columns not named in either list are not passed through (ColumnTransformer's
# default remainder), and sparse_threshold=0.0 forces a dense output array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0.0,
)

## OneR Model
Build a OneR model. The top feature is feature 2 (`capital_gain`). With only a single feature, the performance of this model is not great (0.59 AUC).

In [8]:
# Chain the column preprocessing with a OneR classifier so both are fitted
# (and later applied) as a single estimator.
oner = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneRClassifier()),
])

# Last expression of the cell: fits on the training split and displays the pipeline.
oner.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(sparse_threshold=0.0,
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['age', 'hours_per_week',
                                                   'capital_gain',
                                                   'capital_loss',
                                                   'education_num', 'fnlwgt']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                       

In [9]:
# Inspect the rule list learned by the fitted 'classifier' step of the pipeline.
oner.named_steps['classifier'].rules_

[{'col': 'feat 2',
  'index_col': 2,
  'cutoff': 5178.0,
  'val': 0.24582309582309583,
  'flip': False,
  'val_right': 0.9537953795379538,
  'num_pts': 24420,
  'num_pts_right': 1212},
 {'col': 'feat 0',
  'index_col': 2,
  'cutoff': 4386.0,
  'val': 0.2088503964150293,
  'flip': False,
  'val_right': 0.3804878048780488,
  'num_pts': 23208,
  'num_pts_right': 205},
 {'col': 'feat 0',
  'index_col': 2,
  'cutoff': 3137.0,
  'val': 0.20732078424553319,
  'flip': True,
  'val_right': 0.20944224857268334,
  'num_pts': 23003,
  'num_pts_right': 22770},
 {'val': 0, 'num_pts': 233}]

In [10]:
# Score the OneR pipeline on the held-out split.
# NOTE(review): the ROC curve is built from the output of `predict`, which is
# presumably hard class labels rather than scores; an AUC computed from class
# probabilities would measure ranking quality instead - confirm which is intended.
preds = oner.predict(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds)
test_auc = metrics.auc(fpr, tpr)
print(f'test AUC: {test_auc:0.2f}')

test AUC: 0.59


## GA2M
A special type of GAM that includes interactions is the GA2M.  It is available in the interpretml package, where it is implemented as the `ExplainableBoostingClassifier`.

In [8]:
# Re-read the raw file for the EBM. No `na_values` are passed here, so only
# pandas' default missing-value markers are parsed as NaN.
# This cell rebinds df, target, X_train, X_test, y_train and y_test.
df = pd.read_csv('adult.data', header=None, names=names)

# Binary target: 1 when the raw income string contains ">50K", otherwise 0.
df['income_label'] = df['income'].str.contains('>50K', regex=False).astype(int)
df = df.drop(columns='income')
target = df['income_label'].values

# Every column except the last one is a feature; `income_label` was appended
# above, so it is the final column and is excluded here.
train_cols = df.columns[:-1]

X_train, X_test, y_train, y_test = train_test_split(
    df[train_cols], target, test_size=0.25, random_state=1
)

In [10]:
# Fit the Explainable Boosting Machine on the training split with a fixed seed.
# `fit` returns the fitted estimator, so construction and fitting are chained.
ebm = ExplainableBoostingClassifier(random_state=42).fit(X_train, y_train)
# Last expression of the cell: display the fitted model.
ebm

ExplainableBoostingClassifier(feature_names=['age', 'workclass', 'fnlwgt',
                                             'education', 'education_num',
                                             'marital_status', 'occupation',
                                             'relationship', 'race', 'sex',
                                             'capital_gain', 'capital_loss',
                                             'hours_per_week', 'native_country',
                                             'relationship x hours_per_week',
                                             'age x relationship',
                                             'marital_status x hours_per_week',
                                             'education_num x occupation',
                                             'age x capital_loss',
                                             '...
                                             'education_num x marital_status'],
                              feature_type

In [11]:
# Evaluate the EBM on the held-out split using predicted probabilities.
preds = ebm.predict_proba(X_test)
# Column 1 is used as the score for the positive class (income_label == 1).
pos_proba = preds[:, 1]
# The value reported here is the log loss (cross-entropy, lower is better),
# not an R^2 score, so it is labelled accordingly.
print(f'test log loss: {metrics.log_loss(y_test, pos_proba):0.2f}')
fpr, tpr, thresholds = metrics.roc_curve(y_test, pos_proba)
print(f'test AUC: {metrics.auc(fpr,tpr):0.2f}')

test r2: 0.28
test AUC: 0.93


In [14]:
# Build the global explanation of the fitted EBM and display it with
# interpret's `show`.
ebm_global = ebm.explain_global()
show(ebm_global)

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt


In [15]:
# Local explanation for the X_test row selected by index 1.
# NOTE(review): the earlier comment said "0th datapoint", but the index passed
# to `.data(...)` is 1 - presumably the second row; use `.data(0)` if the first
# row was intended.
local_coefs = ebm.explain_local(X_test).data(1)

# Print each term's name next to its score for this one row.
term_names = local_coefs['names']
term_scores = local_coefs['scores']
for feat_name, coef in zip(term_names, term_scores):
    print(f"{feat_name}: {coef}")

age: -1.7184664917712695
workclass: 0.05813984463003678
fnlwgt: 0.1306403653992196
education: -0.879301602747939
education_num: -0.3127480114354104
marital_status: -0.7128669177094049
occupation: -0.8536759910015794
relationship: -0.5714548266035924
race: 0.030982823570150864
sex: 0.2702053132277903
capital_gain: -0.24022631319118096
capital_loss: -0.047991786627382196
hours_per_week: -0.8893505582985164
native_country: 0.03957421387315183
relationship x hours_per_week: -0.10969191332215959
age x relationship: -0.013637908291421637
marital_status x hours_per_week: -0.04382030697349104
education_num x occupation: -0.025409571607263988
age x capital_loss: -0.13477522664260733
occupation x hours_per_week: -0.000589903201990215
relationship x capital_loss: 0.003866242275338318
occupation x relationship: 0.012376666049370589
workclass x race: 0.004478780967675712
education_num x marital_status: -0.03602698641040848
