# Rulefit and AutoML with H2O
This notebook walks through using the Adult dataset with RuleFit and H2O AutoML.

In [11]:
#% load_ext autoreload
#% autoreload 2
#import os

#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

import h2o
from h2o.automl import H2OAutoML

## Download dataset and create splits

In [2]:
# Data source (uncomment to download the file into the working directory):
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country', 'income',
]

target_col = 'income_label'

# The file has no header row; '?' (with or without a leading space) marks a missing value.
raw_df = pd.read_csv('adult.data', header=None, names=names, na_values=['?', ' ?'])

# Binary target: 1 when the income text contains ">50K", else 0.
# The label column is appended first and the raw 'income' column dropped afterwards.
df = (
    raw_df
    .assign(income_label=raw_df["income"].map(lambda income_text: ">50K" in income_text).astype(int))
    .drop(columns='income')
)

target = df[target_col].values

# 75/25 split with a fixed seed. Note that `df` (and therefore X_train / X_test)
# still contains the label column alongside the features.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.25,
    random_state=1,
)

## Baseline AutoML Model

In [7]:
# Attach this session to H2O; the recorded output below shows it connecting
# to an instance at http://localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,7 mins 34 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_rajiv_shah_cdk9tv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.869 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [4]:
y = "income_label"


def _to_h2o_frame(features, labels, label_col, column_names):
    """Upload one pandas split to H2O with a categorical label column.

    Args:
        features: pandas DataFrame holding the split's rows.
        labels: array of 0/1 labels in the same row order as ``features``.
        label_col: name of the label column in the resulting frame.
        column_names: names applied to the frame's columns, in order.

    Returns:
        The uploaded H2OFrame, with ``label_col`` converted to a factor.
    """
    frame = h2o.H2OFrame(features)
    frame[label_col] = h2o.H2OFrame(labels)
    frame.set_names(column_names)
    # The response is converted to a factor for every split, so the train and
    # test frames agree on the label's type (previously only train was converted).
    frame[label_col] = frame[label_col].asfactor()
    return frame


# Build both frames through the same helper instead of two hand-copied stanzas.
train_h2o_df = _to_h2o_frame(X_train, y_train, y, list(df.columns))
test_h2o_df = _to_h2o_frame(X_test, y_test, y, list(df.columns))

# Predictors: every column except the label.
x = [col for col in train_h2o_df.columns if col != y]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [9]:
## AutoML baseline

# Cap the search at 20 base models and fix the seed
aml = H2OAutoML(seed=1, max_models=20)
aml.train(
    training_frame=train_h2o_df,
    x=x,
    y=y,
)

# Show the AutoML leaderboard, requesting every row rather than the default 10
lb = aml.leaderboard
lb.head(rows=lb.nrows)


AutoML progress: |
20:33:56.139: AutoML: XGBoost is not available; skipping it.
20:33:56.140: Step 'best_of_family_xgboost' not defined in provider 'StackedEnsemble': skipping it.
20:33:56.140: Step 'all_xgboost' not defined in provider 'StackedEnsemble': skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_4_AutoML_2_20211023_203356,0.928061,0.281552,0.832478,0.168102,0.298665,0.0892008
StackedEnsemble_AllModels_3_AutoML_2_20211023_203356,0.928003,0.281703,0.832187,0.170368,0.298785,0.0892728
StackedEnsemble_AllModels_6_AutoML_2_20211023_203356,0.92799,0.281716,0.832355,0.16946,0.298739,0.0892451
StackedEnsemble_AllModels_2_AutoML_2_20211023_203356,0.927971,0.281966,0.832503,0.167536,0.298776,0.0892668
StackedEnsemble_AllModels_1_AutoML_2_20211023_203356,0.927573,0.282694,0.831669,0.169705,0.299155,0.089494
GBM_5_AutoML_2_20211023_203356,0.927409,0.283566,0.83089,0.16457,0.299373,0.0896241
GBM_2_AutoML_2_20211023_203356,0.92739,0.283288,0.831196,0.164179,0.29933,0.0895986
StackedEnsemble_AllModels_5_AutoML_2_20211023_203356,0.927329,0.283004,0.83022,0.162547,0.299267,0.0895609
StackedEnsemble_BestOfFamily_2_AutoML_2_20211023_203356,0.927233,0.28322,0.830825,0.164628,0.299382,0.0896298
StackedEnsemble_BestOfFamily_3_AutoML_2_20211023_203356,0.927233,0.283268,0.830764,0.166766,0.299394,0.089637




In [12]:
# Score the held-out frame with the AutoML leader and report ROC AUC
preds = aml.leader.predict(test_h2o_df)
# Column index 2 of the prediction frame -- presumably the positive-class
# probability; confirm against the prediction frame's column names.
positive_scores = h2o.as_list(preds[:, 2])
fpr, tpr, thresholds = metrics.roc_curve(y_test, positive_scores)
test_auc = metrics.auc(fpr, tpr)
print(f'test AUC: {test_auc:0.2f}')

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
test AUC: 0.93


## Baseline Logistic Regression Model


In [15]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Numeric columns only; kept for building a numeric-only version of the model
numeric_features = [
    'age', 'hours_per_week', 'capital_gain',
    'capital_loss', 'education_num', 'fnlwgt',
]

# Binomial GLM with lambda_ = 0, p-values requested and collinear columns removed
glm_params = {
    'family': "binomial",
    'lambda_': 0,
    'compute_p_values': True,
    'remove_collinear_columns': True,
}
glm_model = H2OGeneralizedLinearEstimator(**glm_params)

# Left as the cell's last expression so the trained model's report is displayed
glm_model.train(x=x, y=y, training_frame=train_h2o_df)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_model_python_1635038711053_5431


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,,99,97,9,py_2_sid_b3d9




ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.10146390687737612
RMSE: 0.31853399642326424
LogLoss: 0.31678900376396657
Null degrees of freedom: 24419
Residual degrees of freedom: 24322
Null deviance: 27238.050801038953
Residual deviance: 15471.974943832125
AIC: 15667.974943832125
AUC: 0.9099635445892192
AUCPR: 0.777754198909492
Gini: 0.8199270891784385

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.312024119603062: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,15632.0,2785.0,0.1512,(2785.0/18417.0)
1,1,1263.0,4740.0,0.2104,(1263.0/6003.0)
2,Total,16895.0,7525.0,0.1658,(4048.0/24420.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.312024,0.700769,221.0
1,max f2,0.152533,0.796744,290.0
2,max f0point5,0.572685,0.718979,129.0
3,max accuracy,0.508836,0.853235,149.0
4,max precision,0.998317,0.996599,1.0
5,max recall,0.001929,1.0,396.0
6,max specificity,0.999774,0.999946,0.0
7,max absolute_mcc,0.386542,0.596542,193.0
8,max min_per_class_accuracy,0.278972,0.824564,235.0
9,max mean_per_class_accuracy,0.222238,0.828191,259.0



Gains/Lift Table: Avg response rate: 24.58 %, avg score: 39.37 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010033,1.830601,0.033208,0.033208,0.008163,19.282144,0.008163,19.282144,0.000333,0.000333,-96.679211,-96.679211,-0.012861
1,2,0.020025,0.981862,1.983967,1.006593,0.487705,1.177587,0.247444,10.248377,0.019823,0.020157,98.396703,0.659282,0.000175
2,3,0.030016,0.671325,0.66688,0.89351,0.163934,0.815748,0.219645,7.108457,0.006663,0.02682,-33.312033,-10.649041,-0.004238
3,4,0.040008,0.573809,0.66688,0.83691,0.163934,0.615478,0.205732,5.486874,0.006663,0.033483,-33.312033,-16.30899,-0.008652
4,5,0.05,0.481516,0.616864,0.792937,0.151639,0.522973,0.194922,4.494907,0.006164,0.039647,-38.31363,-20.706314,-0.013728
5,6,0.1,0.337223,0.809595,0.801266,0.199017,0.392282,0.19697,2.443594,0.04048,0.080127,-19.04048,-19.873397,-0.026351
6,7,0.15,0.288063,0.476428,0.692987,0.117117,0.311146,0.170352,1.732778,0.023821,0.103948,-52.357155,-30.701316,-0.061063
7,8,0.2,0.257418,0.489755,0.642179,0.120393,0.273168,0.157862,1.367876,0.024488,0.128436,-51.024488,-35.782109,-0.09489
8,9,0.3,0.21459,0.456438,0.580265,0.112203,0.230723,0.142643,0.988825,0.045644,0.17408,-54.356155,-41.973458,-0.166964
9,10,0.4,0.191279,0.654673,0.598867,0.160934,0.202728,0.147215,0.792301,0.065467,0.239547,-34.532734,-40.113277,-0.212753




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iterations,negative_log_likelihood,objective,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2021-10-23 21:10:06,0.000 sec,0,13619.025401,0.5577,,,,,,,
1,,2021-10-23 21:10:06,0.045 sec,1,8948.305262,0.366433,,,,,,,
2,,2021-10-23 21:10:06,0.052 sec,2,8061.787052,0.330131,,,,,,,
3,,2021-10-23 21:10:06,0.057 sec,3,7784.445725,0.318773,,,,,,,
4,,2021-10-23 21:10:06,0.062 sec,4,7740.364848,0.316968,,,,,,,
5,,2021-10-23 21:10:06,0.065 sec,5,7736.288294,0.316801,,,,,,,
6,,2021-10-23 21:10:06,0.071 sec,6,7736.071372,0.316792,,,,,,,
7,,2021-10-23 21:10:06,0.076 sec,7,7736.015219,0.31679,,,,,,,
8,,2021-10-23 21:10:06,0.080 sec,8,7735.994916,0.316789,,,,,,,
9,,2021-10-23 21:10:06,0.084 sec,9,7735.987472,0.316789,0.318534,0.316789,0.452712,0.909964,0.777754,0.033208,0.165766



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,education. Preschool,15.588765,1.0,0.092166
1,native_country. Columbia,10.619346,0.681218,0.062785
2,native_country. Outlying-US(Guam-USVI-etc),9.974444,0.639848,0.058972
3,native_country. Peru,9.588564,0.615095,0.056691
4,workclass. Without-pay,9.223418,0.591671,0.054532
5,workclass. Never-worked,6.659704,0.427212,0.039374
6,occupation. Priv-house-serv,4.143973,0.265831,0.0245
7,native_country. Dominican-Republic,3.28136,0.210495,0.0194
8,native_country. Laos,2.845364,0.182527,0.016823
9,education. Doctorate,2.842674,0.182354,0.016807



See the whole table with table.as_data_frame()




In [16]:
# Score the held-out frame with the GLM and report ROC AUC
preds = glm_model.predict(test_h2o_df)
# Column index 2 of the prediction frame -- presumably the positive-class
# probability; confirm against the prediction frame's column names.
positive_scores = h2o.as_list(preds[:, 2])
fpr, tpr, thresholds = metrics.roc_curve(y_test, positive_scores)
test_auc = metrics.auc(fpr, tpr)
print(f'test AUC: {test_auc:0.2f}')

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
test AUC: 0.90




In [17]:
# Pull the coefficients table out of the model's JSON output and show it as a
# pandas DataFrame (columns include std_error, z_value and p_value).
coefficients_table = glm_model._model_json['output']['coefficients_table']
coefficients_table.as_data_frame()

Unnamed: 0,names,coefficients,std_error,z_value,p_value,standardized_coefficients
0,Intercept,-5.897661e+00,8.548516e-01,-6.899047,5.235257e-12,-3.131953
1,native_country. Canada,-1.077198e+00,7.690538e-01,-1.400680,1.613097e-01,-1.077198
2,native_country. China,-2.139845e+00,7.811511e-01,-2.739349,6.156101e-03,-2.139845
3,native_country. Columbia,-1.061935e+01,2.550652e+01,-0.416338,6.771624e-01,-10.619346
4,native_country. Cuba,-8.090436e-01,7.944151e-01,-1.018414,3.084811e-01,-0.809044
...,...,...,...,...,...,...
95,fnlwgt,8.415078e-07,1.984745e-07,4.239880,2.236398e-05,0.089251
96,education_num,0.000000e+00,,,,0.000000
97,capital_gain,3.274884e-04,1.212336e-05,27.012998,0.000000e+00,2.375432
98,capital_loss,6.451210e-04,4.269218e-05,15.110987,0.000000e+00,0.261712


## Rulefit model

In [19]:
from h2o.estimators import H2ORuleFitEstimator

# Configure RuleFit: at most 29 rules, rules plus linear terms, fixed seed
rulefit_params = {
    'max_num_rules': 29,
    'model_type': 'rules_and_linear',
    'seed': 1,
}
rfit = H2ORuleFitEstimator(**rulefit_params)
rfit.train(x=x, y=y, training_frame=train_h2o_df)

# Print the rule-importance table stored in the model's JSON output
rule_importance = rfit._model_json['output']['rule_importance']
print(rule_importance)

# Score the held-out frame and report ROC AUC
preds = rfit.predict(test_h2o_df)
# Column index 2 of the prediction frame -- presumably the positive-class
# probability; confirm against the prediction frame's column names.
positive_scores = h2o.as_list(preds[:, 2])
fpr, tpr, thresholds = metrics.roc_curve(y_test, positive_scores)
test_auc = metrics.auc(fpr, tpr)
print(f'test AUC: {test_auc:0.2f}')

# Notes recorded from earlier runs of this cell
#
# 0.9 AUC with max length 10 and 100 rules
#
# AUC by number of rules:
#    2 rules -> 0.87
#    5 rules -> 0.88
#   10 rules -> 0.88
#   20 rules -> 0.89
#   30 rules -> 0.90
#   45 rules -> 0.91
#
# AUC at 5 rules by model type (presumably R&L = rules_and_linear, R = rules, L = linear):
#   R&L -> 0.88
#   R   -> 0.80
#   L   -> 0.89

rulefit Model Build progress: |██████████████████████████████████████████████████| (done) 100%

Rule Importance: 


Unnamed: 0,Unnamed: 1,variable,coefficient,rule
0,,M0T15N18,-2.381426,"(relationship in { Not-in-family, Other-relative, Own-child, Un..."
1,,M0T22N15,-1.684331,"(relationship in { Husband, Wife}) & (capital_loss < 1783.0 or ca..."
2,,M0T35N15,1.633017,"(relationship in { Husband, Wife}) & (occupation in { Adm-clerica..."
3,,M0T28N18,1.574632,(capital_gain < 5127.0 or capital_gain is NA) & (relationship in {...
4,,linear.marital_status. Married-civ-spouse,-0.913548,
5,,M0T17N15,-0.833574,(capital_gain < 5127.0 or capital_gain is NA) & (marital_status in...
6,,M0T18N21,-0.577827,"(marital_status in { Divorced, Married-spouse-absent, Never-marr..."
7,,M0T27N19,0.551968,"(age >= 29.5 or age is NA) & (relationship in { Husband, Wife} or..."
8,,M0T31N21,-0.495248,"(relationship in { Not-in-family, Other-relative, Own-child, Un..."
9,,M0T45N18,-0.4015,"(marital_status in { Divorced, Married-spouse-absent, Never-marr..."



See the whole table with table.as_data_frame()

rulefit prediction progress: |███████████████████████████████████████████████████| (done) 100%
test AUC: 0.90


In [21]:
# Display the fitted RuleFit model: its summary table and the metrics reported on the training data
rfit

Model Details
H2ORuleFitEstimator :  RuleFit
Model Key:  RuleFit_model_python_1635038711053_5433


Rulefit Model Summary: 


Unnamed: 0,Unnamed: 1,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,rule_ensemble_size,number_of_trees,number_of_internal_trees,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,binomial,logit,Lasso (lambda = 1.448E-4 ),506,25,5,399.0,50.0,50.0,0.0,3.0,3.0,0.0,8.0,7.98




ModelMetricsBinomialGLM: rulefit
** Reported on train data. **

MSE: 0.10343585037287835
RMSE: 0.3216144436633379
LogLoss: 0.32373149459036427
Null degrees of freedom: 24419
Residual degrees of freedom: 24394
Null deviance: 27238.050801040306
Residual deviance: 15811.046195793388
AIC: 15863.046195793388
AUC: 0.9045071227395117
AUCPR: 0.7731947512477808
Gini: 0.8090142454790235

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3632027886963357: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,16127.0,2290.0,0.1243,(2290.0/18417.0)
1,1,1640.0,4363.0,0.2732,(1640.0/6003.0)
2,Total,17767.0,6653.0,0.1609,(3930.0/24420.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.363203,0.689475,213.0
1,max f2,0.174344,0.791023,298.0
2,max f0point5,0.583974,0.719654,131.0
3,max accuracy,0.580128,0.84955,132.0
4,max precision,0.949241,0.98913,18.0
5,max recall,0.002905,1.0,398.0
6,max specificity,0.999855,0.999891,0.0
7,max absolute_mcc,0.394624,0.583792,202.0
8,max min_per_class_accuracy,0.266129,0.814759,255.0
9,max mean_per_class_accuracy,0.217969,0.820518,278.0



Gains/Lift Table: Avg response rate: 24.58 %, avg score: 24.58 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010033,0.993776,4.00155,4.00155,0.983673,0.998441,0.983673,0.998441,0.040147,0.040147,300.155025,300.155025,0.039929
1,2,0.020025,0.977111,4.034622,4.018052,0.991803,0.987527,0.98773,0.992995,0.040313,0.08046,303.462203,301.805232,0.080134
2,3,0.030016,0.949299,4.034622,4.023568,0.991803,0.963043,0.989086,0.983025,0.040313,0.120773,303.462203,302.356802,0.120339
3,4,0.04009,0.922075,3.886065,3.989017,0.955285,0.936037,0.980592,0.971218,0.039147,0.15992,288.60651,298.901673,0.158888
4,5,0.05,0.879045,3.597292,3.911378,0.884298,0.901644,0.961507,0.957428,0.035649,0.195569,259.729226,291.137764,0.193017
5,6,0.1,0.727711,3.208396,3.559887,0.788698,0.793623,0.875102,0.875526,0.16042,0.355989,220.83958,255.988672,0.339428
6,7,0.150082,0.616926,2.770741,3.296551,0.681112,0.674444,0.810368,0.808425,0.138764,0.494753,177.074055,229.655091,0.457016
7,8,0.2,0.492738,2.089046,2.995169,0.513536,0.548844,0.736282,0.743636,0.104281,0.599034,108.904572,199.516908,0.529098
8,9,0.300041,0.313247,1.653496,2.547823,0.406467,0.40253,0.626314,0.629903,0.165417,0.764451,65.349581,154.782258,0.615784
9,10,0.4,0.212399,1.184893,2.20723,0.291274,0.257844,0.542588,0.536926,0.118441,0.882892,18.489301,120.722972,0.64029





