## Hyperparameter search

In [1]:
#Import H2O and other libraries that will be used in this tutorial 
import h2o
import matplotlib as plt

#Import the Estimators
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

#Import h2o grid search 
import h2o.grid 
from h2o.grid.grid_search import H2OGridSearch

In [5]:
# Start (or connect to) a local H2O cluster. `h2o` is already imported in the
# imports cell at the top of the notebook, so it is not re-imported here.
# An integer max_mem_size is interpreted as gigabytes (16 -> 16 GB for the JVM).
h2o.init(max_mem_size=16)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.6" 2020-01-14; OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)
  Starting server from /home/ubuntu/Anaconda/envs/keras_env/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpokd8z0rj
  JVM stdout: /tmp/tmpokd8z0rj/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpokd8z0rj/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54327
Connecting to H2O server at http://127.0.0.1:54327 ... successful.


0,1
H2O cluster uptime:,10 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,1 month and 12 days
H2O cluster name:,H2O_from_python_unknownUser_xzk5mx
H2O cluster total nodes:,1
H2O cluster free memory:,16 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [18]:
# Import the loan-level CSV from its public S3 URL into the H2O cluster.
loan_level = h2o.import_file(
    path="https://s3.amazonaws.com/data.h2o.ai/DAI-Tutorials/loan_level_500k.csv"
)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [8]:
# Split the data into train / validation / test frames. Two ratios are given
# (70% and 15%); the third frame receives the remaining rows. The seed makes
# the split repeatable.
train, valid, test = loan_level.split_frame(ratios=[0.7, 0.15], seed=42)
print("train:%d valid:%d test:%d" % (train.nrows, valid.nrows, test.nrows))

# Response column.
y = "DELINQUENT"

# Columns that must not be used as predictors: the response itself plus the
# columns deliberately excluded from modelling.
ignore = ["DELINQUENT", "PREPAID", "PREPAYMENT_PENALTY_MORTGAGE_FLAG", "PRODUCT_TYPE"]

# Filter in the frame's own column order rather than via a set difference:
# iteration order of a set of strings can differ between Python sessions,
# which would make the predictor order (and potentially the fitted models)
# vary from run to run despite the fixed seeds.
x = [name for name in train.names if name not in ignore]

train:350268 valid:74971 test:74898


## Grid Search (Cartesian search is the default when no strategy is specified)

In [None]:

glm_grid = h2o.grid.H2OGridSearch (
    H2OGeneralizedLinearEstimator( 
        family = "binomial",
        lambda_search = True),
    
    hyper_params = {
        "alpha": [x*0.01 for x in range(0, 4)],
        "lambda": [x*1e-6 for x in range(0, 4)],
        },
    
    grid_id = "glm_grid_2",
    
)
%time glm_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)

## Random Search

In [9]:

glm_grid = h2o.grid.H2OGridSearch (
    H2OGeneralizedLinearEstimator( 
        family = "binomial",
        lambda_search = True),
    
    hyper_params = {
        "alpha": [x*0.01 for x in range(0, 100)],
        "lambda": [x*1e-6 for x in range(0, 1000)],
        },
    
    grid_id = "glm_grid",
    
    search_criteria = {
        "strategy":"RandomDiscrete", 
        "max_models":100,
        "max_runtime_secs":300,
        "seed":42
        }
)
%time glm_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)

glm Grid Build progress: |████████████████████████████████████████████████| 100%
CPU times: user 5.02 s, sys: 543 ms, total: 5.56 s
Wall time: 5min 14s


In [10]:
# List the keys of the objects currently stored in the H2O cluster
# (the grid and the models it built appear here).
h2o.ls()

Unnamed: 0,key
0,glm_grid
1,glm_grid_model_1
2,glm_grid_model_10
3,glm_grid_model_11
4,glm_grid_model_12
5,glm_grid_model_13
6,glm_grid_model_14
7,glm_grid_model_15
8,glm_grid_model_16
9,glm_grid_model_17


In [31]:
# Show the built-in documentation for the grid-search class, including the
# keys accepted by search_criteria.
help(h2o.grid.H2OGridSearch)

Help on class H2OGridSearch in module h2o.grid.grid_search:

class H2OGridSearch(H2OGridSearch)
 |  Grid Search of a Hyper-Parameter Space for a Model
 |  
 |  :param model: The type of model to be explored initialized with optional parameters that will be
 |      unchanged across explored models.
 |  :param hyper_params: A dictionary of string parameters (keys) and a list of values to be explored by grid
 |      search (values).
 |  :param str grid_id: The unique id assigned to the resulting grid object. If none is given, an id will
 |      automatically be generated.
 |  :param search_criteria:  The optional dictionary of directives which control the search of the hyperparameter space.
 |      The dictionary can include values for: ``strategy``, ``max_models``, ``max_runtime_secs``, ``stopping_metric``, 
 |      ``stopping_tolerance``, ``stopping_rounds`` and ``seed``. The default strategy, "Cartesian", covers the entire space of 
 |      hyperparameter combinations. If you want to u

In [11]:
# Rank the grid's models by AUC, best first. The result is kept in
# `sorted_glm_grid` because the following cells index into it
# (sorted_glm_grid[0] is the top model); without this assignment those cells
# raise NameError on a fresh kernel.
sorted_glm_grid = glm_grid.get_grid(sort_by='auc', decreasing=True)
sorted_glm_grid

                       alpha                   lambda          model_ids  \
0                     [0.87]  [4.9999999999999996E-6]  glm_grid_model_61   
1                      [0.4]                 [1.8E-5]  glm_grid_model_46   
2                     [0.07]                 [3.7E-5]  glm_grid_model_48   
3                     [0.07]                 [5.6E-5]  glm_grid_model_72   
4                     [0.88]                 [3.1E-5]  glm_grid_model_37   
5                     [0.18]   [8.099999999999999E-5]  glm_grid_model_86   
6                     [0.15]  [1.3099999999999999E-4]  glm_grid_model_30   
7                      [0.1]                [1.59E-4]  glm_grid_model_38   
8                     [0.06]                [2.12E-4]  glm_grid_model_78   
9                     [0.53]   [9.499999999999999E-5]  glm_grid_model_28   
10     [0.41000000000000003]                [1.37E-4]  glm_grid_model_67   
11                    [0.65]                [1.07E-4]  glm_grid_model_34   
12          



In [None]:
# Peek at the first model held by the grid (a one-element slice of its model list).
glm_grid.models[:1]

In [None]:
# Display the grid's summary.
glm_grid.summary()

In [None]:
# Parameter values actually used by the first model of the sorted grid.
# `sorted_glm_grid` is expected to be the grid ordered best-first, e.g. the
# result of glm_grid.get_grid(sort_by='auc', decreasing=True).
sorted_glm_grid[0].actual_params

In [None]:
# Compare F1 for the first two models of the sorted grid: the first is
# printed, the second is shown as the cell's output.
print(sorted_glm_grid[0].F1())
sorted_glm_grid[1].F1()

In [None]:
# Score the first model of the sorted grid on the held-out test frame.
# Expected: AUC of 0.8524, compared to 0.8523 for the untuned version.
sorted_glm_grid[0].model_performance(test_data=test)

## Random Forest

In [14]:
# Grid Search/ Cartesian Search by default or not specified
rf_grid = h2o.grid.H2OGridSearch (
    H2ORandomForestEstimator(nfolds=10),
    
    hyper_params = {
        "ntrees": [50,100],
        "max_depth": [10,20],
        },
    
     search_criteria = {
        "strategy":"RandomDiscrete", # Random Search 
        "max_models":100,
        "max_runtime_secs":300,
        "seed":42
        },
    
    grid_id = "rf_grid_2",
    
)
%time rf_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)

drf Grid Build progress: |████████████████████████████████████████████████| 100%
CPU times: user 1.97 s, sys: 378 ms, total: 2.34 s
Wall time: 6min 48s


In [15]:
rf_grid.get_grid(sort_by='auc', decreasing=True)

    max_depth ntrees          model_ids                 auc
0          20     14  rf_grid_2_model_1  0.7931050429424835




### Get the best model and continue training from it (checkpoint)

In [16]:
best_model = rf_grid.get_grid(sort_by="auc", decreasing=True)[0]

rf = H2ORandomForestEstimator (seed=42, model_id='default_random_forest', checkpoint=best_model.model_id)
%time rf.train(x=x, y=y, training_frame=train, validation_frame=valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 511 ms, sys: 114 ms, total: 625 ms
Wall time: 1min 16s


In [17]:
# Display the model summary of the continued forest (tree count, depth and
# leaf statistics).
rf.summary()


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,7639119.0,20.0,20.0,14.4,9881.0,11724.0,7801.92


