## Bayesian optimization with Python, H2O

In [6]:
# install and import packages (make sure you have installed Java)
# !pip install bayesian-optimization
# !pip install h2o
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from bayes_opt import BayesianOptimization

In [8]:
# start h2o
h2o.init()
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 13.0.1+9, mixed mode, sharing)
  Starting server from X:\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\quany\AppData\Local\Temp\tmpx6umusj0
  JVM stdout: C:\Users\quany\AppData\Local\Temp\tmpx6umusj0\h2o_quany_started_from_python.out
  JVM stderr: C:\Users\quany\AppData\Local\Temp\tmpx6umusj0\h2o_quany_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,25 days
H2O cluster name:,H2O_from_python_quany_qvstr7
H2O cluster total nodes:,1
H2O cluster free memory:,3.959 Gb
H2O cluster total cores:,16
H2O cluster allowed cores:,16


In [9]:
# load dataset
data = h2o.upload_file("winequality-red.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
# train test split
train, test = data.split_frame(ratios = [0.7], destination_frames = ['train', 'test'])

Suppose we want to tune the following parameters:  
max_depth, ntrees, min_rows, learn_rate, sample_rate, col_sample_rate  

BayesianOptimization packages need two parts for achieving that  
(1) The Target function  
(2) The boundary of parameters  

In [63]:
# The Target function
def GBDT_h2o(max_depth, ntrees, min_rows, learn_rate, sample_rate, \
             data = train, xcols = train.columns[:-1], ycol = 'quality'):
    params = {'max_depth': int(max_depth),
              'ntrees': int(ntrees),
              'min_rows': int(min_rows),
              'learn_rate': learn_rate,
              'sample_rate': sample_rate}
    # not specify nfolds = 5
    model = H2OGradientBoostingEstimator(**params)
    model.train(x = xcols, y = ycol, training_frame = data, validation_frame = test)
#     train_rmse = - model.rmse()
    test_rmse = - model.rmse(valid=True)
    return test_rmse

# Optimization boundaries
bounds = {'max_depth':(3, 8),
          'ntrees': (300, 800),
          'min_rows': (5, 10),
          'learn_rate': (0.01, 0.05),
          'sample_rate': (0.8, 1)}

In [64]:
# run Bayesian Optimization
optimizer = BayesianOptimization(
    f = GBDT_h2o,
    pbounds = bounds,
    random_state = 2020)
optimizer.maximize(init_points = 5, n_iter = 20)

|   iter    |  target   | learn_... | max_depth | min_rows  |  ntrees   | sample... |
-------------------------------------------------------------------------------------
gbm Model Build progress: |███████████████████████████████████████████████| 100%
|  1        | -0.5718   |  0.04945  |  7.367    |  7.549    |  435.9    |  0.8674   |
gbm Model Build progress: |███████████████████████████████████████████████| 100%
|  2        | -0.5827   |  0.01868  |  4.382    |  6.717    |  731.1    |  0.8313   |
gbm Model Build progress: |███████████████████████████████████████████████| 100%
|  3        | -0.5708   |  0.01564  |  6.785    |  8.682    |  477.8    |  0.8682   |
gbm Model Build progress: |███████████████████████████████████████████████| 100%
|  4        | -0.5796   |  0.03667  |  4.086    |  7.807    |  362.1    |  0.8639   |
gbm Model Build progress: |███████████████████████████████████████████████| 100%
|  5        | -0.5954   |  0.04813  |  3.687    |  7.847    |  787.8    |  0.90

In [65]:
optimizer.max

{'target': -0.5632478247168863,
 'params': {'learn_rate': 0.01,
  'max_depth': 8.0,
  'min_rows': 5.000000007839401,
  'ntrees': 800.0,
  'sample_rate': 0.8000000148665619}}

In [70]:
# train model with the best parameters
best_param = {'learn_rate': 0.01,
             'max_depth': 8,
             'min_rows': 5,
             'ntrees': 800,
             'sample_rate': 0.8}
# nfolds = 5
model = H2OGradientBoostingEstimator(**best_param)
model.train(x = train.columns[:-1], y = 'quality', \
            training_frame = train, validation_frame = test)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [72]:
print("Train RMSE = {0}".format(model.rmse()))
print("Test RMSE = {0}".format(model.rmse(valid=True)))

Train RMSE = 0.16036605054753
Test RMSE = 0.5652282702693677


In [73]:
# predict the test value
model.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict
4.97194
5.12012
4.90172
5.42199
5.02189
5.24796
5.24851
5.23156
5.72173
5.31643


