# H2O AutoML: Porto Seguro (Classification) - Final

Based on: https://github.com/h2oai/h2o-tutorials/blob/master/h2o-world-2017/automl/Python/automl_binary_classification_product_backorders.ipynb

Version used for final presentation.

In [1]:
# Wall-clock budget for the AutoML run, in seconds: 1 hour max (confirm with timer).
time_limit = 60 * 60

# Evaluation metric and early-stopping metric handed to AutoML.
metric = "AUCPR"
stopping = "AUCPR"

# Name of the target/label column (it just happens to be called "target" in this dataset).
target = "target"

## Import

In [2]:
# Import packages

from h2o.automl import H2OAutoML

import h2o
import numpy as np
import pandas as pd

In [3]:
# Load the train and test CSV files from the working directory into pandas.
train_path, test_path = "porto_train.csv", "porto_test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [4]:
# Sanity check: dimensions of the training data as (rows, columns).
train_df.shape

(476170, 60)

In [5]:
# Sanity check: dimensions of the test data as (rows, columns).
test_df.shape

(119042, 59)

In [6]:
# initialize h2o: attach this session to an H2O cluster, which the later cells
# use for the H2O frames and the AutoML run
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,2 hours 10 mins
H2O_cluster_timezone:,Etc/GMT
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.1
H2O_cluster_version_age:,11 months and 2 days !!!
H2O_cluster_name:,H2O_from_python_stever7_49cmhp
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,23.24 Gb
H2O_cluster_total_cores:,64
H2O_cluster_allowed_cores:,64


## Tidy

In [7]:
# Treat the value -1 as "missing": swap it for NaN everywhere in both frames.
MISSING_SENTINEL = -1
train_df = train_df.replace(MISSING_SENTINEL, np.nan)
test_df = test_df.replace(MISSING_SENTINEL, np.nan)

## Transform

In [8]:
# Collect the names of the categorical columns (those whose name contains "cat").
# Only the names are gathered here; the conversion to a categorical type is done
# after the data has been turned into H2O frames, where it is easier.
cat_vars = [name for name in train_df.columns if "cat" in name]
cat_vars

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [9]:
# Convert both pandas DataFrames into H2OFrames so H2O can work with them.
# NOTE(review): the same names are rebound from pandas objects to H2O objects,
# so this cell is meant to run once, after the pandas-side cleaning is finished.
train_df = h2o.H2OFrame(train_df)
test_df = h2o.H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [10]:
# Categorical predictors: everything in cat_vars except the response. Filtering
# the response out keeps this cell idempotent -- re-running it after cat_vars
# has been extended below does not change which test columns are converted.
feature_cat_vars = [col for col in cat_vars if col != target]

# Encode the categorical predictors as factors (H2O's categorical type) in
# both frames so train and test agree on column types.
for col in feature_cat_vars:
    test_df[col] = test_df[col].asfactor()
    train_df[col] = train_df[col].asfactor()

# The response is converted to a factor in the training frame so the task is
# modelled as classification. The column name comes from the `target` setting
# rather than a hard-coded string.
train_df[target] = train_df[target].asfactor()

# Keep cat_vars as "categorical predictors + response", exactly once, so any
# later use sees the same list as before regardless of how often this cell runs.
cat_vars = feature_cat_vars + [target]

In [11]:
# Start from every training column, then drop the ones that must not be used
# as model inputs:
#   - the response itself,
#   - "id", a row identifier (should not be a predictor),
#   - "fold", a fold-assignment column (not a predictor).
# list.remove raises ValueError if a name is absent, which doubles as a check
# that the expected columns are present.
predictors = list(train_df.columns)
for excluded in (target, "id", "fold"):
    predictors.remove(excluded)

## Visualize

(skipped)

## Model

In [12]:
%%time

# Configure the AutoML run (nothing is trained until aml.train is called).
# NOTE: per the H2O AutoML documentation, a run bounded by max_runtime_secs is
# not guaranteed to be reproducible, even with a fixed seed.
aml = H2OAutoML(
    seed=1,
    max_runtime_secs=time_limit,  # overall time budget, in seconds
    stopping_metric=stopping,     # metric used for early stopping
    sort_metric=metric,           # metric used to rank the leaderboard
)

CPU times: user 3.38 ms, sys: 138 µs, total: 3.52 ms
Wall time: 5.15 ms


In [13]:
%%time

# Run AutoML on the training frame using the selected predictors and response.
# As the last expression in the cell, the call's return value is displayed.
# NOTE(review): the "fold" column is dropped from the predictors but is not
# passed as fold_column here -- confirm whether the predefined folds were
# meant to drive cross-validation.
aml.train(x=predictors, y=target, training_frame=train_df)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%

19:19:29.35: GBM_lr_annealing_selection_AutoML_2_20230821_181949 [GBM lr_annealing] failed: water.exceptions.H2OIllegalArgumentException: Can only convert jobs producing a single Model or ModelContainer.

CPU times: user 24.5 s, sys: 2.73 s, total: 27.2 s
Wall time: 1h 4s


Unnamed: 0,0,1,Error,Rate
0,8720.0,756.0,0.0798,(756.0/9476.0)
1,213.0,164.0,0.565,(213.0/377.0)
Total,8933.0,920.0,0.0983,(969.0/9853.0)

metric,threshold,value,idx
max f1,0.0614494,0.2528913,148.0
max f2,0.0582411,0.3491078,157.0
max f0point5,0.1161955,0.2864939,48.0
max accuracy,0.2241477,0.962245,5.0
max precision,0.4301536,1.0,0.0
max recall,0.0149878,1.0,380.0
max specificity,0.4301536,1.0,0.0
max absolute_mcc,0.0582411,0.2364676,157.0
max min_per_class_accuracy,0.0405891,0.7214854,229.0
max mean_per_class_accuracy,0.0401648,0.7235754,231.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100477,0.1121494,11.3516866,11.3516866,0.4343434,0.1495132,0.4343434,0.1495132,0.1140584,0.1140584,1035.1686628,1035.1686628,0.1081487
2,0.0200954,0.0947308,3.4319053,7.3917959,0.1313131,0.1019766,0.2828283,0.1257449,0.0344828,0.1485411,243.1905259,639.1795944,0.1335559
3,0.0300416,0.0871376,5.6004168,6.7987042,0.2142857,0.0910251,0.2601351,0.1142499,0.0557029,0.204244,460.0416825,579.8704208,0.181133
4,0.0400893,0.0794072,2.1119417,5.6240473,0.0808081,0.0831302,0.2151899,0.1064502,0.0212202,0.2254642,111.1941698,462.4047275,0.19275
5,0.0500355,0.0749848,4.5336708,5.407299,0.1734694,0.0772495,0.2068966,0.1006456,0.0450928,0.270557,353.3670763,440.7299003,0.2292949
6,0.100071,0.0594801,3.6578787,4.5325889,0.1399594,0.0665439,0.173428,0.0835948,0.1830239,0.4535809,265.7878737,353.258887,0.3675741
7,0.1500051,0.05193,1.752976,3.607305,0.0670732,0.0553403,0.1380244,0.0741894,0.0875332,0.5411141,75.2975998,260.7305018,0.4066691
8,0.2000406,0.046752,1.3783311,3.0497788,0.0527383,0.0491978,0.116692,0.0679383,0.0689655,0.6100796,37.8331118,204.9778822,0.4263523
9,0.3000101,0.0400014,1.1939975,2.4313943,0.0456853,0.043215,0.0930311,0.0597,0.1193634,0.729443,19.3997496,143.1394314,0.4465177
10,0.3999797,0.035355,0.6633319,1.9894909,0.0253807,0.0375289,0.0761228,0.0541586,0.066313,0.795756,-33.6668058,98.9490879,0.4115221

Unnamed: 0,0,1,Error,Rate
0,409116.0,49695.0,0.1083,(49695.0/458811.0)
1,13142.0,4217.0,0.7571,(13142.0/17359.0)
Total,422258.0,53912.0,0.132,(62837.0/476170.0)

metric,threshold,value,idx
max f1,0.0578096,0.1183371,191.0
max f2,0.0420328,0.1999309,246.0
max f0point5,0.0789704,0.0982191,141.0
max accuracy,0.4559583,0.9635508,7.0
max precision,0.6050734,1.0,0.0
max recall,0.0094975,1.0,398.0
max specificity,0.6050734,1.0,0.0
max absolute_mcc,0.0444154,0.0817414,237.0
max min_per_class_accuracy,0.0352824,0.6000346,277.0
max mean_per_class_accuracy,0.0357398,0.6004504,275.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100006,0.1117048,3.5714093,3.5714093,0.1301974,0.1417147,0.1301974,0.1417147,0.0357163,0.0357163,257.1409302,257.1409302,0.0266887
2,0.0200013,0.0945659,2.5575899,3.0644996,0.0932381,0.1019778,0.1117178,0.1218463,0.0255775,0.0612939,155.7589887,206.4499595,0.0428549
3,0.0300019,0.0851073,2.2638127,2.797604,0.0825283,0.089515,0.101988,0.1110692,0.0226396,0.0839334,126.3812671,179.7603953,0.055972
4,0.0400004,0.0787715,2.2009111,2.6484542,0.0802352,0.0817974,0.0965506,0.1037524,0.0220059,0.1059393,120.0911139,164.8454245,0.0684336
5,0.0500011,0.0739402,2.0276388,2.5242859,0.0739185,0.0762556,0.092024,0.0982528,0.0202777,0.1262169,102.763883,152.4285947,0.0790995
6,0.1,0.0599566,1.8757235,2.2000115,0.0683804,0.0660428,0.0802024,0.0821481,0.0937842,0.2200012,87.5723475,120.0011521,0.1245414
7,0.1500011,0.0524136,1.5334663,1.9778267,0.0559032,0.0559179,0.0721026,0.0734046,0.0766749,0.2966761,53.3466269,97.782666,0.1522244
8,0.2,0.0473023,1.374532,1.8270062,0.0501092,0.0497201,0.0666044,0.0674836,0.0687252,0.3654012,37.4532006,82.7006164,0.1716591
9,0.3,0.0403395,1.2097471,1.6212531,0.0441019,0.0435637,0.0591035,0.0595103,0.1209747,0.4863759,20.9747105,62.1253144,0.1934274
10,0.4,0.0354595,1.0657296,1.4823723,0.0388517,0.037786,0.0540406,0.0540792,0.106573,0.5929489,6.5729593,48.2372256,0.2002491

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8652531,0.0106636,0.8790284,0.866631,0.8552238,0.8713363,0.8540459
auc,0.6412331,0.0030388,0.6401285,0.6397766,0.6434467,0.6451908,0.637623
err,0.1347469,0.0106636,0.1209716,0.1333690,0.1447762,0.1286637,0.1459541
err_count,12833.2,1027.4187,11524.0,12707.0,13795.0,12217.0,13923.0
f0point5,0.0904033,0.0028577,0.0927363,0.0920621,0.0855834,0.0914871,0.0901476
f1,0.1185893,0.0020982,0.1188255,0.1205620,0.1150811,0.1187333,0.1197446
f2,0.1725866,0.0052523,0.1653403,0.1746191,0.1756069,0.1690911,0.1782756
lift_top_group,3.5843952,0.2705577,3.9719255,3.4878626,3.2554417,3.4929717,3.7137737
logloss,0.1518456,0.0032298,0.1517068,0.1528318,0.1466299,0.1526171,0.1554423
max_per_class_error,0.7513156,0.0194400,0.7762741,0.7509294,0.7295749,0.7642509,0.7355487


In [14]:
%%time

# Keep a handle on the AutoML leaderboard for display in the next cell.
lb = aml.leaderboard

CPU times: user 9 µs, sys: 2 µs, total: 11 µs
Wall time: 16.5 µs


In [15]:
# Request every leaderboard row (rows=lb.nrows) rather than a default-sized head.
lb.head(rows=lb.nrows)

model_id,aucpr,auc,logloss,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_3_AutoML_2_20230821_181949,0.0672009,0.641036,0.151848,0.432692,0.186384,0.0347391
StackedEnsemble_AllModels_4_AutoML_2_20230821_181949,0.0671449,0.641159,0.151842,0.428852,0.186384,0.0347392
StackedEnsemble_BestOfFamily_4_AutoML_2_20230821_181949,0.0664944,0.638724,0.15198,0.433377,0.186416,0.0347509
XGBoost_grid_1_AutoML_2_20230821_181949_model_29,0.0659153,0.636857,0.152117,0.439107,0.186448,0.034763
XGBoost_grid_1_AutoML_2_20230821_181949_model_16,0.0657561,0.63725,0.152101,0.437428,0.186452,0.0347642
XGBoost_grid_1_AutoML_2_20230821_181949_model_23,0.0654968,0.636616,0.152144,0.43116,0.186465,0.0347692
StackedEnsemble_AllModels_2_AutoML_2_20230821_181949,0.0653811,0.637593,0.152095,0.43895,0.186459,0.034767
StackedEnsemble_BestOfFamily_3_AutoML_2_20230821_181949,0.0651974,0.636964,0.152133,0.429679,0.186468,0.0347704
GBM_grid_1_AutoML_2_20230821_181949_model_7,0.0649473,0.634313,0.152286,0.431045,0.186488,0.0347777
GBM_grid_1_AutoML_2_20230821_181949_model_20,0.0648433,0.632125,0.152534,0.435165,0.186529,0.0347929


In [16]:
# Display the leader model selected by AutoML, including its metrics summary.
aml.leader

Unnamed: 0,0,1,Error,Rate
0,8720.0,756.0,0.0798,(756.0/9476.0)
1,213.0,164.0,0.565,(213.0/377.0)
Total,8933.0,920.0,0.0983,(969.0/9853.0)

metric,threshold,value,idx
max f1,0.0614494,0.2528913,148.0
max f2,0.0582411,0.3491078,157.0
max f0point5,0.1161955,0.2864939,48.0
max accuracy,0.2241477,0.962245,5.0
max precision,0.4301536,1.0,0.0
max recall,0.0149878,1.0,380.0
max specificity,0.4301536,1.0,0.0
max absolute_mcc,0.0582411,0.2364676,157.0
max min_per_class_accuracy,0.0405891,0.7214854,229.0
max mean_per_class_accuracy,0.0401648,0.7235754,231.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100477,0.1121494,11.3516866,11.3516866,0.4343434,0.1495132,0.4343434,0.1495132,0.1140584,0.1140584,1035.1686628,1035.1686628,0.1081487
2,0.0200954,0.0947308,3.4319053,7.3917959,0.1313131,0.1019766,0.2828283,0.1257449,0.0344828,0.1485411,243.1905259,639.1795944,0.1335559
3,0.0300416,0.0871376,5.6004168,6.7987042,0.2142857,0.0910251,0.2601351,0.1142499,0.0557029,0.204244,460.0416825,579.8704208,0.181133
4,0.0400893,0.0794072,2.1119417,5.6240473,0.0808081,0.0831302,0.2151899,0.1064502,0.0212202,0.2254642,111.1941698,462.4047275,0.19275
5,0.0500355,0.0749848,4.5336708,5.407299,0.1734694,0.0772495,0.2068966,0.1006456,0.0450928,0.270557,353.3670763,440.7299003,0.2292949
6,0.100071,0.0594801,3.6578787,4.5325889,0.1399594,0.0665439,0.173428,0.0835948,0.1830239,0.4535809,265.7878737,353.258887,0.3675741
7,0.1500051,0.05193,1.752976,3.607305,0.0670732,0.0553403,0.1380244,0.0741894,0.0875332,0.5411141,75.2975998,260.7305018,0.4066691
8,0.2000406,0.046752,1.3783311,3.0497788,0.0527383,0.0491978,0.116692,0.0679383,0.0689655,0.6100796,37.8331118,204.9778822,0.4263523
9,0.3000101,0.0400014,1.1939975,2.4313943,0.0456853,0.043215,0.0930311,0.0597,0.1193634,0.729443,19.3997496,143.1394314,0.4465177
10,0.3999797,0.035355,0.6633319,1.9894909,0.0253807,0.0375289,0.0761228,0.0541586,0.066313,0.795756,-33.6668058,98.9490879,0.4115221

Unnamed: 0,0,1,Error,Rate
0,409116.0,49695.0,0.1083,(49695.0/458811.0)
1,13142.0,4217.0,0.7571,(13142.0/17359.0)
Total,422258.0,53912.0,0.132,(62837.0/476170.0)

metric,threshold,value,idx
max f1,0.0578096,0.1183371,191.0
max f2,0.0420328,0.1999309,246.0
max f0point5,0.0789704,0.0982191,141.0
max accuracy,0.4559583,0.9635508,7.0
max precision,0.6050734,1.0,0.0
max recall,0.0094975,1.0,398.0
max specificity,0.6050734,1.0,0.0
max absolute_mcc,0.0444154,0.0817414,237.0
max min_per_class_accuracy,0.0352824,0.6000346,277.0
max mean_per_class_accuracy,0.0357398,0.6004504,275.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100006,0.1117048,3.5714093,3.5714093,0.1301974,0.1417147,0.1301974,0.1417147,0.0357163,0.0357163,257.1409302,257.1409302,0.0266887
2,0.0200013,0.0945659,2.5575899,3.0644996,0.0932381,0.1019778,0.1117178,0.1218463,0.0255775,0.0612939,155.7589887,206.4499595,0.0428549
3,0.0300019,0.0851073,2.2638127,2.797604,0.0825283,0.089515,0.101988,0.1110692,0.0226396,0.0839334,126.3812671,179.7603953,0.055972
4,0.0400004,0.0787715,2.2009111,2.6484542,0.0802352,0.0817974,0.0965506,0.1037524,0.0220059,0.1059393,120.0911139,164.8454245,0.0684336
5,0.0500011,0.0739402,2.0276388,2.5242859,0.0739185,0.0762556,0.092024,0.0982528,0.0202777,0.1262169,102.763883,152.4285947,0.0790995
6,0.1,0.0599566,1.8757235,2.2000115,0.0683804,0.0660428,0.0802024,0.0821481,0.0937842,0.2200012,87.5723475,120.0011521,0.1245414
7,0.1500011,0.0524136,1.5334663,1.9778267,0.0559032,0.0559179,0.0721026,0.0734046,0.0766749,0.2966761,53.3466269,97.782666,0.1522244
8,0.2,0.0473023,1.374532,1.8270062,0.0501092,0.0497201,0.0666044,0.0674836,0.0687252,0.3654012,37.4532006,82.7006164,0.1716591
9,0.3,0.0403395,1.2097471,1.6212531,0.0441019,0.0435637,0.0591035,0.0595103,0.1209747,0.4863759,20.9747105,62.1253144,0.1934274
10,0.4,0.0354595,1.0657296,1.4823723,0.0388517,0.037786,0.0540406,0.0540792,0.106573,0.5929489,6.5729593,48.2372256,0.2002491

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8652531,0.0106636,0.8790284,0.866631,0.8552238,0.8713363,0.8540459
auc,0.6412331,0.0030388,0.6401285,0.6397766,0.6434467,0.6451908,0.637623
err,0.1347469,0.0106636,0.1209716,0.1333690,0.1447762,0.1286637,0.1459541
err_count,12833.2,1027.4187,11524.0,12707.0,13795.0,12217.0,13923.0
f0point5,0.0904033,0.0028577,0.0927363,0.0920621,0.0855834,0.0914871,0.0901476
f1,0.1185893,0.0020982,0.1188255,0.1205620,0.1150811,0.1187333,0.1197446
f2,0.1725866,0.0052523,0.1653403,0.1746191,0.1756069,0.1690911,0.1782756
lift_top_group,3.5843952,0.2705577,3.9719255,3.4878626,3.2554417,3.4929717,3.7137737
logloss,0.1518456,0.0032298,0.1517068,0.1528318,0.1466299,0.1526171,0.1554423
max_per_class_error,0.7513156,0.0194400,0.7762741,0.7509294,0.7295749,0.7642509,0.7355487


In [17]:
# Evaluate the leader model on the test frame and display the resulting metrics.
perf = aml.leader.model_performance(test_df)
perf

Unnamed: 0,0,1,Error,Rate
0,99017.0,15690.0,0.1368,(15690.0/114707.0)
1,3082.0,1253.0,0.711,(3082.0/4335.0)
Total,102099.0,16943.0,0.1577,(18772.0/119042.0)

metric,threshold,value,idx
max f1,0.0537185,0.1177742,196.0
max f2,0.0406554,0.1982208,250.0
max f0point5,0.0918231,0.1007284,104.0
max accuracy,0.3834208,0.9636179,3.0
max precision,0.4589905,1.0,0.0
max recall,0.0105656,1.0,397.0
max specificity,0.4589905,1.0,0.0
max absolute_mcc,0.0521117,0.0821172,202.0
max min_per_class_accuracy,0.0352646,0.5930796,276.0
max mean_per_class_accuracy,0.0394068,0.5969095,256.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100049,0.1117355,3.5046362,3.5046362,0.1276238,0.1391871,0.1276238,0.1391871,0.0350634,0.0350634,250.4636175,250.4636175,0.0260056
2,0.0200013,0.0946044,2.9306764,3.2177768,0.1067227,0.1024165,0.1171777,0.1208095,0.0292964,0.0643599,193.0676437,221.7776835,0.0460349
3,0.0300062,0.0856353,2.0289999,2.8214069,0.0738875,0.0897856,0.1027436,0.1104653,0.0202999,0.0846597,102.8999891,182.1406919,0.056719
4,0.0400027,0.0794676,2.0999335,2.6411143,0.0764706,0.0823552,0.0961781,0.1034407,0.0209919,0.1056517,109.993351,164.111432,0.06813
5,0.0500076,0.0745248,2.0751135,2.5278761,0.0755668,0.0768644,0.0920544,0.0981237,0.0207612,0.1264129,107.5113524,152.7876129,0.0792929
6,0.1000067,0.0603665,1.8039519,2.1659444,0.0656922,0.0665078,0.0788744,0.0823171,0.0901961,0.216609,80.3951876,116.5944407,0.1210089
7,0.1500059,0.0526106,1.6655412,1.9991527,0.0606519,0.0561737,0.0728006,0.0736031,0.0832757,0.2998847,66.5541247,99.9152694,0.155543
8,0.200005,0.0475091,1.2733778,1.8177166,0.046371,0.0498968,0.0661935,0.0676768,0.0636678,0.3635525,27.3377795,81.771659,0.1697282
9,0.3000034,0.0405077,1.2133999,1.6162833,0.0441868,0.0437474,0.0588581,0.0597005,0.1213379,0.4848904,21.3399856,61.6283319,0.1918743
10,0.4000017,0.0356316,0.9619539,1.4527044,0.0350302,0.0379618,0.0529013,0.054266,0.0961938,0.5810842,-3.8046122,45.2704394,0.187926


## Communicate

(skipped)