In [23]:
import h2o
import numpy as np

In [24]:
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,3 days 13 hours 6 mins
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.2
H2O cluster version age:,16 days
H2O cluster name:,H2O_started_from_R_raj_cjn618
H2O cluster total nodes:,1
H2O cluster free memory:,2.457 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [26]:
url =  "http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip"
data=h2o.import_file(url)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [27]:
train,valid,test=data.split_frame([0.8,0.1],seed=69)

In [28]:
print("%d/%d/%d" %(train.nrows,valid.nrows,test.nrows))

35255/4272/4451


In [29]:
y='IsArrDelayed' # binomial Classification
ignoreFields=['ArrDelay',
 'DepDelay','CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay','IsArrDelayed',
 'IsDepDelayed','ActualElapsedTime','ArrTime','TailNum']
x=[i for i in train.names if i not in ignoreFields]

In [30]:
nfolds=5
train2=train.rbind(valid)

In [31]:
train2.nrows

39527

In [32]:
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

In [33]:
m_GLM= H2OGeneralizedLinearEstimator(
family='binomial',
model_id='glm_def',
nfolds=nfolds,
fold_assignment="Modulo",
keep_cross_validation_predictions=True)
m_GLM.train(x,y,train2)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [34]:
m_GBM= H2OGradientBoostingEstimator(
model_id='gbm_def',
nfolds=nfolds,
fold_assignment="Modulo",
keep_cross_validation_predictions=True)
m_GBM.train(x,y,train2)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [35]:
m_RF= H2OGradientBoostingEstimator(
model_id='rf_def',
nfolds=nfolds,
fold_assignment="Modulo",
keep_cross_validation_predictions=True)
m_RF.train(x,y,train2)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [36]:
models=[m_GLM.model_id,m_GBM.model_id,m_RF.model_id]

In [37]:
m_SE= H2OStackedEnsembleEstimator(model_id='SE_glm_gbm_rf',base_models=models)
m_SE.train(x,y,train2)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [38]:
import pandas as pd

### Analyzing the Performance

In [39]:
all_models=[m_GLM,m_GBM,m_RF,m_SE]

In [40]:
names=['GLM','GBM','RF','SE']

In [41]:
pd.Series(map(lambda x : x.logloss(),all_models),names) # lower is better

GLM    0.573282
GBM    0.508120
RF     0.508120
SE     0.484990
dtype: float64

In [42]:
pd.Series(map(lambda x : x.auc(),all_models),names) # data it was trained on

GLM    0.768183
GBM    0.850473
RF     0.850473
SE     0.846251
dtype: float64

In [43]:
pd.Series(map(lambda x : x.auc(xval=True),all_models),names) # cross validation results

GLM    0.760952
GBM    0.805887
RF     0.805887
SE          NaN
dtype: float64

In [None]:
# no cross validation set, stacked ensemble was built on all data

In [45]:
test_perf=list(map(lambda x : x.model_performance(test),all_models))

In [47]:
pd.Series(map(lambda x : x.logloss(),test_perf),names)

GLM    0.580694
GBM    0.544807
RF     0.544807
SE     0.533573
dtype: float64

In [46]:
pd.Series(map(lambda x : x.auc(),test_perf),names)

GLM    0.755183
GBM    0.801738
RF     0.801738
SE     0.802337
dtype: float64

In [48]:
# Saving mojo/pojo/binary

In [49]:
m_GBM.save_mojo("/tmp/models")

'/private/tmp/models/gbm_def.zip'

In [53]:
#m_GBM.download_mojo("/tmp/models")
m_GBM.download_pojo("/tmp/models")

'/tmp/models/gbm_def.java'

In [51]:
m_GBM.save_model_details("/tmp/models") # creates a json object [same as flow interface]

'/private/tmp/models/gbm_def.json'

In [52]:
# m_GBM.save_model -> no
h2o.save_model(m_GBM,"/tmp/models") #for comparison , you can save binary object

'/private/tmp/models/gbm_def'

In [54]:
h2o.load_model('/tmp/models/gbm_def')

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_def


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.16735639801328567
RMSE: 0.4090921632264369
LogLoss: 0.5081200815906951
Mean Per-Class Error: 0.23428084622047485
AUC: 0.8504733739633966
Gini: 0.7009467479267932
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4601094284051148: 


0,1,2,3,4
,NO,YES,Error,Rate
NO,10716.0,6871.0,0.3907,(6871.0/17587.0)
YES,2790.0,19150.0,0.1272,(2790.0/21940.0)
Total,13506.0,26021.0,0.2444,(9661.0/39527.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4601094,0.7985655,248.0
max f2,0.3134609,0.8789982,320.0
max f0point5,0.5948227,0.8069861,175.0
max accuracy,0.5208668,0.7660839,216.0
max precision,0.9847206,1.0,0.0
max recall,0.1400292,1.0,390.0
max specificity,0.9847206,1.0,0.0
max absolute_mcc,0.5551244,0.5282309,198.0
max min_per_class_accuracy,0.5357913,0.7646308,208.0


Gains/Lift Table: Avg response rate: 55.51 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100185,0.9629966,1.8015953,1.8015953,1.0,1.0,0.0180492,0.0180492,80.1595260,80.1595260
,2,0.0200116,0.9467954,1.8015953,1.8015953,1.0,1.0,0.0180036,0.0360529,80.1595260,80.1595260
,3,0.0300048,0.9321607,1.8015953,1.8015953,1.0,1.0,0.0180036,0.0540565,80.1595260,80.1595260
,4,0.0400233,0.9174588,1.8015953,1.8015953,1.0,1.0,0.0180492,0.0721057,80.1595260,80.1595260
,5,0.0500164,0.9039982,1.8015953,1.8015953,1.0,1.0,0.0180036,0.0901094,80.1595260,80.1595260
,6,0.1000076,0.8453925,1.7487144,1.7751615,0.9706478,0.9853276,0.0874202,0.1775296,74.8714427,77.5161532
,7,0.1499987,0.7918188,1.6885397,1.7462925,0.9372470,0.9693034,0.0844120,0.2619417,68.8539687,74.6292454
,8,0.2000152,0.7427635,1.6156947,1.7136348,0.8968134,0.9511763,0.0808113,0.3427530,61.5694687,71.3634753
,9,0.2999975,0.6704044,1.5244268,1.6505761,0.8461538,0.9161747,0.1524157,0.4951686,52.4426758,65.0576059




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.18235852020014426
RMSE: 0.4270345655800526
LogLoss: 0.541772237998002
Mean Per-Class Error: 0.27064939535650834
AUC: 0.8058874117105745
Gini: 0.6117748234211491
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446206535575455: 


0,1,2,3,4
,NO,YES,Error,Rate
NO,9535.0,8052.0,0.4578,(8052.0/17587.0)
YES,3163.0,18777.0,0.1442,(3163.0/21940.0)
Total,12698.0,26829.0,0.2837,(11215.0/39527.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4462065,0.7700383,253.0
max f2,0.2563970,0.8677530,343.0
max f0point5,0.5975501,0.7695182,172.0
max accuracy,0.5336832,0.7290966,206.0
max precision,0.9875308,1.0,0.0
max recall,0.1129000,1.0,396.0
max specificity,0.9875308,1.0,0.0
max absolute_mcc,0.5386940,0.4559240,203.0
max min_per_class_accuracy,0.5386940,0.7271650,203.0


Gains/Lift Table: Avg response rate: 55.51 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100185,0.9632241,1.8015953,1.8015953,1.0,1.0,0.0180492,0.0180492,80.1595260,80.1595260
,2,0.0200116,0.9458224,1.8015953,1.8015953,1.0,1.0,0.0180036,0.0360529,80.1595260,80.1595260
,3,0.0300048,0.9296230,1.8015953,1.8015953,1.0,1.0,0.0180036,0.0540565,80.1595260,80.1595260
,4,0.0400233,0.9160172,1.7697489,1.7936236,0.9823232,0.9955752,0.0177302,0.0717867,76.9748879,79.3623599
,5,0.0500164,0.9003288,1.7514243,1.7851923,0.9721519,0.9908953,0.0175023,0.0892890,75.1424253,78.5192268
,6,0.1000076,0.8440970,1.6876279,1.7364224,0.9367409,0.9638249,0.0843665,0.1736554,68.7627948,73.6422449
,7,0.1499987,0.7902493,1.6128654,1.6952437,0.8952429,0.9409681,0.0806290,0.2542844,61.2865392,69.5243710
,8,0.2000152,0.7424064,1.5427925,1.6571213,0.8563480,0.9198077,0.0771650,0.3314494,54.2792501,65.7121266
,9,0.2999975,0.6687679,1.4113712,1.5752184,0.7834008,0.8743464,0.1411121,0.4725615,41.1371185,57.5218389



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.7133099,0.0065868,0.715406,0.7090817,0.6975332,0.7211891,0.7233397
auc,0.8060378,0.0038847,0.8073686,0.8060697,0.7970652,0.8142967,0.8053887
err,0.2866901,0.0065868,0.2845940,0.2909183,0.3024668,0.2788109,0.2766603
err_count,2266.4,52.07802,2250.0,2300.0,2391.0,2204.0,2187.0
f0point5,0.7218404,0.0051447,0.7234936,0.7199222,0.709607,0.7241845,0.7319950
f1,0.7715131,0.0025384,0.7758964,0.7733097,0.7653812,0.7729707,0.7700074
f2,0.8286785,0.0061656,0.8364831,0.8352495,0.8306710,0.8288050,0.8121839
lift_top_group,1.8018559,0.0153400,1.7911192,1.7782276,1.7852304,1.8345324,1.8201704
logloss,0.5417724,0.0020884,0.5393959,0.5398793,0.5463629,0.5390373,0.5441863


Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2018-07-02 04:27:05,6.607 sec,0.0,0.4969587,0.6870709,0.5,1.0,0.4449364
,2018-07-02 04:27:05,6.632 sec,1.0,0.4898214,0.6727576,0.7134887,1.8015953,0.3881145
,2018-07-02 04:27:05,6.652 sec,2.0,0.4841579,0.6614461,0.7316107,1.8015953,0.3565917
,2018-07-02 04:27:05,6.672 sec,3.0,0.4786659,0.6504677,0.7403761,1.8015953,0.3571989
,2018-07-02 04:27:06,6.710 sec,4.0,0.4743149,0.6417239,0.7446867,1.8015953,0.3604372
---,---,---,---,---,---,---,---,---
,2018-07-02 04:27:07,7.996 sec,46.0,0.4120044,0.5140059,0.8463681,1.8015953,0.2507400
,2018-07-02 04:27:07,8.027 sec,47.0,0.4113368,0.5126009,0.8471042,1.8015953,0.2474511
,2018-07-02 04:27:07,8.065 sec,48.0,0.4107162,0.5113633,0.8479587,1.8015953,0.2492221



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Origin,3876.3937988,1.0,0.2336145
TaxiOut,2874.3134766,0.7414916,0.1732232
Dest,2840.7470703,0.7328324,0.1712003
DepTime,2190.9628906,0.5652065,0.1320404
AirTime,1183.2290039,0.3052396,0.0713084
CRSDepTime,752.0204468,0.1940000,0.0453212
Year,663.8605957,0.1712573,0.0400082
DayofMonth,571.0549316,0.1473160,0.0344152
CRSElapsedTime,493.3397217,0.1272677,0.0297316




In [55]:
m_SE.save_mojo("/tmp/models/")

'/private/tmp/models/SE_glm_gbm_rf.zip'

In [56]:
h2o.save_model(m_SE,'/tmp/models/')

'/private/tmp/models/SE_glm_gbm_rf'