In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Start a local H2O cluster, or attach to one already running on localhost.
h2o.init()

# One seed shared by the split and the model so a fresh re-run reproduces the
# same partition and the same trees.
SEED = 1234

# Import the cars dataset. Alongside the binary `economy_20mpg` label it holds
# each car's cylinder count, displacement, power, weight, acceleration, and
# model year.
cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")

# NOTE: this cell does NOT classify economy. The response is the integer
# cylinder count, modelled as a count target with a Poisson distribution, so
# H2O reports regression metrics (MSE / RMSE / deviance) for it.
predictors = ["displacement", "power", "weight", "acceleration", "year"]
response = "cylinders"

# Split into train (roughly 80% of rows) and validation (the remainder).
train, valid = cars.split_frame(ratios=[0.8], seed=SEED)

# Train a GBM with a Poisson distribution on the cylinder count.
cars_gbm = H2OGradientBoostingEstimator(distribution="poisson", seed=SEED)
cars_gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# Score the model on the validation split; the bare last expression renders
# the metrics object as the cell output.
perf = cars_gbm.model_performance(valid)
perf

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.8" 2020-07-14; OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.8+10); OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.8+10, mixed mode)
  Starting server from /home/pzhao28/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp_ke5w2ql
  JVM stdout: /tmp/tmp_ke5w2ql/h2o_pzhao28_started_from_python.out
  JVM stderr: /tmp/tmp_ke5w2ql/h2o_pzhao28_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Phoenix
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_pzhao28_1whpm1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.836 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%

ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 0.03769791966551617
RMSE: 0.19415952118172358
MAE: 0.07947861719967757
RMSLE: 0.03359130162278705
Mean Residual Deviance: -9.918300985300526




In [2]:
# Inspect the validation split produced by `cars.split_frame` in the first cell.
valid

name,economy,cylinders,displacement,power,weight,acceleration,year,economy_20mpg
AMC Concord DL 6,20.2,6,232,90,3265,18.2,79,1.0
AMC Gremlin,19.0,6,232,100,2634,13.0,71,0.0
AMC Hornet,18.0,6,232,100,2945,16.0,73,0.0
AMC Matador (Wagon),14.0,8,304,150,4257,15.5,74,0.0
AMC Matador (Wagon),15.0,8,304,150,3892,12.5,72,0.0
AMC Matador,14.0,8,304,150,3672,11.5,73,0.0
AMC Matador,15.0,6,258,110,3730,19.0,75,0.0
AMC Matador,15.5,8,304,120,3962,13.9,76,0.0
AMC Rebel SST (Wagon),,8,360,175,3850,11.0,70,
Audi 100 LS,20.0,4,114,91,2582,14.0,73,0.0




In [4]:
# Preview the training split, requesting up to the first 100 rows.
train.head(100)

name,economy,cylinders,displacement,power,weight,acceleration,year,economy_20mpg
AMC Ambassador Brougham,13.0,8,360,175.0,3821,11.0,73,0.0
AMC Ambassador DPL,15.0,8,390,190.0,3850,8.5,70,0.0
AMC Ambassador SST,17.0,8,304,150.0,3672,11.5,72,0.0
AMC Concord DL,18.1,6,258,120.0,3410,15.1,78,0.0
AMC Concord DL,23.0,4,151,,3035,20.5,82,1.0
AMC Concord,19.4,6,232,90.0,3210,17.2,78,0.0
AMC Concord,24.3,4,151,90.0,3003,20.1,80,1.0
AMC Gremlin,18.0,6,232,100.0,2789,15.0,73,0.0
AMC Gremlin,20.0,6,232,100.0,2914,16.0,75,0.0
AMC Gremlin,21.0,6,199,90.0,2648,15.0,70,1.0




In [8]:
# NOTE(review): `pred` is not assigned in any cell visible here (execution
# counts jump from 4 to 8), so this cell fails on a fresh kernel. The columns
# in the output (predict, p0, p1, cal_p0, cal_p1) suggest it is the prediction
# frame of a binomial model with calibrated probabilities — confirm, and
# restore the cell that creates it.
pred

predict,p0,p1,cal_p0,cal_p1
1,0.542275,0.457725,0.760534,0.239466
0,0.566532,0.433468,0.788617,0.211383
0,0.653978,0.346022,0.869554,0.130446
0,0.965854,0.0341463,0.981422,0.0185779
0,0.99,0.01,0.98413,0.0158705
0,0.99,0.01,0.98413,0.0158705
0,0.891425,0.108575,0.96991,0.0300896
0,0.88,0.12,0.967617,0.0323833
0,1.0,0.0,0.985134,0.0148665
1,0.0948147,0.905185,0.140117,0.859883




In [9]:
# Inspect the full cars frame as imported in the first cell (before splitting).
cars

name,economy,cylinders,displacement,power,weight,acceleration,year,economy_20mpg
AMC Ambassador Brougham,13.0,8,360,175.0,3821,11.0,73,0
AMC Ambassador DPL,15.0,8,390,190.0,3850,8.5,70,0
AMC Ambassador SST,17.0,8,304,150.0,3672,11.5,72,0
AMC Concord DL 6,20.2,6,232,90.0,3265,18.2,79,1
AMC Concord DL,18.1,6,258,120.0,3410,15.1,78,0
AMC Concord DL,23.0,4,151,,3035,20.5,82,1
AMC Concord,19.4,6,232,90.0,3210,17.2,78,0
AMC Concord,24.3,4,151,90.0,3003,20.1,80,1
AMC Gremlin,18.0,6,232,100.0,2789,15.0,73,0
AMC Gremlin,19.0,6,232,100.0,2634,13.0,71,0




In [10]:
# NOTE(review): `cars_drf` is not defined in any cell visible here; the output
# shows an H2ORandomForestEstimator with binomial metrics, so the cell that
# trained it appears to have been removed or run out of order — restore it so
# the notebook survives Restart & Run All.
cars_drf

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1603473007745_1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,10.0,20.0,3198.0,3.0,5.0,4.1,6.0,9.0,8.15




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.07167425726941093
RMSE: 0.2677204834700007
LogLoss: 0.6277394507856711
Mean Per-Class Error: 0.0833475588938204
AUC: 0.9521807784226699
AUCPR: 0.9610198834906681
Gini: 0.9043615568453398

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6030516722550698: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,107.0,9.0,0.0776,(9.0/116.0)
1,1,18.0,184.0,0.0891,(18.0/202.0)
2,Total,125.0,193.0,0.0849,(27.0/318.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.603052,0.931646,63.0
1,max f2,0.234491,0.963391,99.0
2,max f0point5,0.603052,0.944559,63.0
3,max accuracy,0.603052,0.915094,63.0
4,max precision,0.982456,0.975806,3.0
5,max recall,0.0,1.0,132.0
6,max specificity,1.0,0.974138,0.0
7,max absolute_mcc,0.603052,0.821249,63.0
8,max min_per_class_accuracy,0.603052,0.910891,63.0
9,max mean_per_class_accuracy,0.603052,0.916652,63.0



Gains/Lift Table: Avg response rate: 62.77 %, avg score: 62.10 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.36,1.0,1.552288,1.552288,0.974359,1.0,0.974359,1.0,0.558824,0.558824,55.228758,55.228758,0.53403
1,2,0.406154,0.957125,1.486928,1.54486,0.933333,0.97694,0.969697,0.99738,0.068627,0.627451,48.69281,54.486037,0.594393
2,3,0.507692,0.854396,1.448307,1.52555,0.909091,0.901629,0.957576,0.97823,0.147059,0.77451,44.83066,52.554961,0.716659
3,4,0.609231,0.53267,1.303476,1.488537,0.818182,0.726742,0.934343,0.936315,0.132353,0.906863,30.347594,48.853733,0.799425
4,5,0.710769,0.223336,0.724153,1.37934,0.454545,0.373025,0.865801,0.855845,0.073529,0.980392,-27.58467,37.933961,0.724194
5,6,0.812308,0.041469,0.048277,1.212957,0.030303,0.116022,0.761364,0.763367,0.004902,0.985294,-95.172311,21.295677,0.464633
6,7,1.0,0.0,0.078351,1.0,0.04918,0.004765,0.627692,0.620983,0.014706,1.0,-92.164899,0.0,0.0




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.0782562709016502
RMSE: 0.2797432231558974
LogLoss: 0.2382614928127087
Mean Per-Class Error: 0.07880844645550522
AUC: 0.9751131221719458
AUCPR: 0.9730927260518765
Gini: 0.9502262443438916

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.45772475691013736: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,34.0,5.0,0.1282,(5.0/39.0)
1,1,1.0,33.0,0.0294,(1.0/34.0)
2,Total,35.0,38.0,0.0822,(6.0/73.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.457725,0.916667,23.0
1,max f2,0.457725,0.948276,23.0
2,max f0point5,0.940605,0.942029,11.0
3,max accuracy,0.744772,0.917808,21.0
4,max precision,1.0,1.0,0.0
5,max recall,0.122951,1.0,29.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.457725,0.841115,23.0
8,max min_per_class_accuracy,0.754049,0.897436,20.0
9,max mean_per_class_accuracy,0.457725,0.921192,23.0



Gains/Lift Table: Avg response rate: 46.58 %, avg score: 52.02 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.178082,1.0,2.147059,2.147059,1.0,1.0,1.0,1.0,0.382353,0.382353,114.705882,114.705882,0.382353
1,2,0.232877,0.984737,2.147059,2.147059,1.0,0.987368,1.0,0.997028,0.117647,0.5,114.705882,114.705882,0.5
2,3,0.315068,0.965494,2.147059,2.147059,1.0,0.975155,1.0,0.991322,0.176471,0.676471,114.705882,114.705882,0.676471
3,4,0.424658,0.892045,1.610294,2.008539,0.75,0.919187,0.935484,0.972706,0.176471,0.852941,61.029412,100.85389,0.801659
4,5,0.520548,0.445596,1.226891,1.864551,0.571429,0.741243,0.868421,0.930068,0.117647,0.970588,22.689076,86.455108,0.842383
5,6,0.630137,0.109083,0.268382,1.586957,0.125,0.283031,0.73913,0.81754,0.029412,1.0,-73.161765,58.695652,0.692308
6,7,0.753425,0.02,0.0,1.327273,0.0,0.038443,0.618182,0.690052,0.0,1.0,-100.0,32.727273,0.461538
7,8,1.0,0.0,0.0,1.0,0.0,0.001111,0.465753,0.520176,0.0,1.0,-100.0,0.0,0.0




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2020-10-23 10:13:51,0.025 sec,0.0,,,,,,,,,,,,
1,,2020-10-23 10:13:52,0.207 sec,1.0,0.288584,0.783658,0.941891,0.956334,1.542561,0.112069,0.294711,1.545161,0.940045,0.911556,2.003922,0.123288
2,,2020-10-23 10:13:52,0.253 sec,2.0,0.296112,0.926112,0.939702,0.955762,1.544368,0.111111,0.292196,1.127464,0.949849,0.931028,2.044818,0.09589
3,,2020-10-23 10:13:52,0.278 sec,3.0,0.304032,0.954671,0.933073,0.949096,1.535727,0.120172,0.287145,1.109562,0.950226,0.93119,2.039706,0.082192
4,,2020-10-23 10:13:52,0.307 sec,4.0,0.285987,0.831506,0.941532,0.954073,1.541746,0.106618,0.277595,0.678367,0.957014,0.958812,2.147059,0.082192
5,,2020-10-23 10:13:52,0.328 sec,5.0,0.283599,0.789191,0.943774,0.955049,1.543738,0.099315,0.28481,0.678507,0.957391,0.961853,2.147059,0.082192
6,,2020-10-23 10:13:52,0.351 sec,6.0,0.282526,0.775605,0.942424,0.955033,1.544118,0.1,0.277325,0.232574,0.975867,0.976328,2.147059,0.068493
7,,2020-10-23 10:13:52,0.373 sec,7.0,0.279308,0.749403,0.945751,0.955457,1.543738,0.096154,0.275761,0.231754,0.975867,0.97651,2.147059,0.068493
8,,2020-10-23 10:13:52,0.395 sec,8.0,0.275068,0.635871,0.950463,0.960344,1.553309,0.094637,0.278643,0.234734,0.971342,0.969663,2.147059,0.082192
9,,2020-10-23 10:13:52,0.413 sec,9.0,0.268175,0.626433,0.95265,0.961388,1.552634,0.084906,0.278654,0.2357,0.973605,0.972097,2.147059,0.082192



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,weight,440.754974,1.0,0.421418
1,displacement,383.773346,0.870718,0.366936
2,power,155.198761,0.35212,0.14839
3,acceleration,39.571045,0.08978,0.037835
4,year,26.587412,0.060322,0.025421




In [14]:
# Validation RMSE of the DRF model; the value equals the RMSE listed under
# "Reported on validation data" in the model summary printed by the cell above.
# NOTE(review): relies on `cars_drf`, which no visible cell defines.
cars_drf.rmse(valid=True)

0.2797432231558974