# Grid Search for Improved Model Hyperparameters

In [1]:
import json
import os

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch

In [2]:
## Start h2o instance
# Connects this kernel to an H2O cluster; the captured output of this cell
# shows it attaching to an instance already running at http://localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,2 hours 19 mins
H2O_cluster_timezone:,America/Denver
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.1
H2O_cluster_version_age:,1 month and 23 days
H2O_cluster_name:,H2O_from_python_rdangol_ltdftd
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.953 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


## Import Data

In [3]:
# Read the column-type specification that is later handed to the CSV parser
# through its `col_types` argument.
with open("grid_search_data_type.json") as types_file:
    data_types = json.load(types_file)

In [4]:
# Parse the CSV into an H2O frame, applying the column types loaded from JSON.
data = h2o.import_file(path="grid_search.csv", col_types=data_types)

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Prepare Data

In [5]:
# Keep every column except the first one.
# NOTE: this rebinds `data` to a slice of itself, so the cell is not
# idempotent — each re-run without re-importing drops one more leading column.
# NOTE(review): presumably the first column is a row index / ID — confirm.
data = data[data.col_names[1:]]

In [6]:
# Identify predictors and response: every column except the target is a predictor.
y = "target"
x = data.columns
x.remove(y)

# For binary classification, the response should be a factor (categorical).
data[y] = data[y].asfactor()

# Split data into train (roughly 80%) and validation (the remainder).
# A fixed seed keeps the split reproducible across kernel restarts, so the
# validation metrics used to rank grid models do not change from run to run.
train, valid = data.split_frame(ratios=[0.8], seed=1)

## GBM Grid Search

In [4]:
# GBM hyperparameters: every combination of these values is trained.
gbm_params1 = {
    'ntrees': [100, 500, 1000],
    'learn_rate': [0.01, 0.4, 0.1],
    'max_depth': [3, 5, 9],
    'sample_rate': [0.1, 0.5, 1.0],
    'col_sample_rate': [0.2, 0.5, 1.0],
}

# Train and validate a cartesian grid of GBMs.
# `ntrees` is supplied only through the hyperparameter grid above; passing a
# fixed `ntrees` to train() as well would set the same parameter in two places,
# which is ambiguous for the grid search.
gbm_grid1 = H2OGridSearch(model=H2OGradientBoostingEstimator,
                          grid_id='gbm_grid1',
                          hyper_params=gbm_params1)
gbm_grid1.train(x=x,
                y=y,
                training_frame=train,
                validation_frame=valid,
                seed=1)

# Get the grid results, sorted by validation AUC (best first)
gbm_gridperf1 = gbm_grid1.get_grid(sort_by='auc', decreasing=True)

# Grab the top GBM model, chosen by validation AUC
best_gbm1 = gbm_gridperf1.models[0]

In [28]:
# F1 of the selected GBM.
# NOTE(review): called with no arguments; per the H2O docs this reports the
# training-data metric (pass valid=True for the validation frame) — confirm
# which one is intended here.
best_gbm1.F1()

[[0.3075587616663262, 0.6679031530402527]]

In [5]:
# Score the selected GBM on the validation frame.
# Note: `valid` is not a separate hold-out test set, so treat the result
# as a validation metric rather than an independent estimate.
best_gbm_perf1 = best_gbm1.model_performance(test_data=valid)

## Save the GBM Hyperparameters

In [6]:
# Save the selected GBM's hyperparameters as a JSON file for later use.
params = best_gbm1.actual_params

# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when the file is opened for writing.
os.makedirs("hyperparams", exist_ok=True)
with open("hyperparams/gbm_grid_param.json", 'w') as json_file:
    json.dump(params, json_file, indent=4)

## XGBoost Grid Search

In [7]:
# XGBoost hyperparameters: every combination of these values is trained.
xgb_params1 = {
    'ntrees': [100, 120, 150, 200],
    'max_depth': [3, 5, 10],
    'min_rows': [1, 5, 10],
    'sample_rate': [0.5, 0.7, 0.9],
    'col_sample_rate_per_tree': [0.5, 0.7, 0.9],
}

# Train and validate a cartesian grid of XGBoost models
xgb_grid1 = H2OGridSearch(
    model=H2OXGBoostEstimator,
    grid_id='xgb_grid1',
    hyper_params=xgb_params1,
)
xgb_grid1.train(
    x=x,
    y=y,
    training_frame=train,
    validation_frame=valid,
    seed=123,
)

xgboost Grid Build progress: |████████████████████████████████████████████| 100%


In [8]:
# Get the grid results, sorted by F1 (highest first)
xgb_gridperf1 = xgb_grid1.get_grid(
    sort_by='F1',
    decreasing=True,
)

# Grab the top-ranked XGBoost model, chosen by F1
best_xgb1 = xgb_gridperf1.models[0]

## Save the XGBoost Hyperparameters

In [9]:
# Save the selected XGBoost model's hyperparameters as a JSON file for later use.
params = best_xgb1.actual_params

# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when the file is opened for writing.
os.makedirs("hyperparams", exist_ok=True)
with open("hyperparams/xgb_grid_param.json", 'w') as json_file:
    json.dump(params, json_file, indent=4)

## How to Use It

In [30]:
# Load the saved XGBoost parameters back from disk.
with open("hyperparams/xgb_grid_param.json", "r") as f_handle:
    hyper_params = json.load(f_handle)

# Entries to strip before reuse (frames, ids, seed) — presumably because they
# are tied to the specific training run that produced the file.
remove_params = [
    'response_column',
    'ignored_columns',
    'model_id',
    'training_frame',
    'validation_frame',
    'calibration_frame',
    'seed',
]

# Keep only the remaining parameters.
params = {
    name: value
    for name, value in hyper_params.items()
    if name not in remove_params
}