In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

# connect to the H2O cluster and display its status
h2o.init()
h2o.cluster().show_status()

# Load the covtype data, used to classify the correct forest cover type.
# Original dataset: https://archive.ics.uci.edu/ml/datasets/Covertype
covtype = h2o.import_file(
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data"
)

# response column name, and the first 54 columns as predictors
response = 'C55'
predictors = covtype.columns[:54]

# convert the response column (index 54, i.e. 'C55') to a factor
covtype[response] = covtype[response].asfactor()

# seeded split into train and validation frames
train, valid = covtype.split_frame(ratios=[.8], seed=1234)

# --- single model: balance_classes switched on ---
cov_gbm = H2OGradientBoostingEstimator(balance_classes=True, seed=1234)
cov_gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# validation logloss of the single model
print('logloss', cov_gbm.logloss(valid=True))

# --- grid over `balance_classes` (boolean parameter) ---
# values of `balance_classes` to try
hyper_params = {'balance_classes': [True, False]}

# Cartesian search builds every combination; that is fine here because the
# search space is small and we want to see all models. For a larger search
# space use random grid search instead: {'strategy': "RandomDiscrete"}
search_criteria = {'strategy': "Cartesian"}

# base estimator; the grid supplies `balance_classes`
cov_gbm_2 = H2OGradientBoostingEstimator(seed=1234)

# assemble and train the grid on the same frames as the single model
grid = H2OGridSearch(
    model=cov_gbm_2,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
)
grid.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# list the grid models, lowest logloss first
sorted_grid = grid.get_grid(sort_by='logloss', decreasing=False)
print(sorted_grid)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,5 hours 2 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.990 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


0,1
H2O_cluster_uptime:,5 hours 2 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.990 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
logloss 0.4117192242079613
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
    balance_classes  \
0              true   
1             false   

                                                         model_ids  \
0  Grid_GBM_py_3_sid_bd51_model_python_1602894814824_10283_model_1   
1  Grid_GBM_py_3_sid_bd51_model_python_1602894814824_10283_model_2   

              logloss  
0  0.4117192242079613  
1  0.4138197096305129  



In [2]:
# class_sampling_factors
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

# connect to the H2O cluster and display its status
h2o.init()
h2o.cluster().show_status()

# Load the covtype data, used to classify the correct forest cover type.
# Original dataset: https://archive.ics.uci.edu/ml/datasets/Covertype
covtype = h2o.import_file(
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data"
)

# response column name, and the first 54 columns as predictors
response = 'C55'
predictors = covtype.columns[:54]

# convert the response column (index 54, i.e. 'C55') to a factor
covtype[response] = covtype[response].asfactor()

# seeded split into train and validation frames
train, valid = covtype.split_frame(ratios=[.8], seed=1234)

# class frequencies of the response, to motivate the sampling factors below
print(covtype[response].table())

# --- single model: class_sampling_factors ---
# All classes except Class 2 have similar counts, so undersample Class 2
# (factor 0.5) and leave the other classes' sampling rates unchanged.
# Note: class_sampling_factors must be a list of floats.
sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
cov_gbm = H2OGradientBoostingEstimator(
    balance_classes=True,
    class_sampling_factors=sample_factors,
    seed=1234,
)
cov_gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# validation logloss of the single model
print('logloss', cov_gbm.logloss(valid=True))

# --- grid over `class_sampling_factors` ---
hyper_params = {
    'class_sampling_factors': [
        # same factors as the single model above
        [1., 0.5, 1., 1., 1., 1., 1.],
        # doubles the number of samples for all but Class 2
        [2., 1., 2., 2., 2., 2., 2.],
        # an arbitrary mix, to demonstrate a random option
        [4., 0.5, 1., 1., 2., 2., 1.],
    ]
}

# Cartesian search builds every combination; that is fine here because the
# search space is small and we want to see all models. For a larger search
# space use random grid search instead: {'strategy': "RandomDiscrete"}
search_criteria = {'strategy': "Cartesian"}

# base estimator; the grid supplies `class_sampling_factors`
cov_gbm_2 = H2OGradientBoostingEstimator(balance_classes=True, seed=1234)

# assemble and train the grid on the same frames as the single model
grid = H2OGridSearch(
    model=cov_gbm_2,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
)
grid.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# list the grid models, lowest logloss first
sorted_grid = grid.get_grid(sort_by='logloss', decreasing=False)
print(sorted_grid)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,5 hours 4 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.986 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


0,1
H2O_cluster_uptime:,5 hours 4 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.986 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%


C55,Count
1,2942
2,6022
3,2160
4,2160
5,2396
6,2160
7,2160



gbm Model Build progress: |███████████████████████████████████████████████| 100%
logloss 0.42013444233774033
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
                  class_sampling_factors  \
0    [2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0]   
1    [4.0, 0.5, 1.0, 1.0, 2.0, 2.0, 1.0]   
2    [1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0]   

                                                         model_ids  \
0  Grid_GBM_py_7_sid_a770_model_python_1602894814824_10285_model_2   
1  Grid_GBM_py_7_sid_a770_model_python_1602894814824_10285_model_3   
2  Grid_GBM_py_7_sid_a770_model_python_1602894814824_10285_model_1   

               logloss  
0  0.40944214293931014  
1  0.41952588932366436  
2  0.42013444233774033  



In [3]:
# max_after_balance_size
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

# connect to the H2O cluster and display its status
h2o.init()
h2o.cluster().show_status()

# import the covtype dataset:
# this dataset is used to classify the correct forest cover type
# original dataset can be found at https://archive.ics.uci.edu/ml/datasets/Covertype
covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")

# convert response column to a factor
covtype[54] = covtype[54].asfactor()

# set the predictor names and the response column name
predictors = covtype.columns[0:54]
response = 'C55'

# split into train and validation sets
train, valid = covtype.split_frame(ratios = [.8], seed = 1234)

# look at the frequencies of each class
print(covtype[54].table())

# try using the max_after_balance_size parameter:
# (named `max_size` rather than `max` so the builtin max() is not shadowed
# for every cell that runs afterwards in this kernel)
max_size = .85
cov_gbm = H2OGradientBoostingEstimator(balance_classes = True,
                                       max_after_balance_size = max_size,
                                       seed = 1234)

cov_gbm.train(x = predictors, y = response, training_frame = train, validation_frame = valid)

# print the logloss for your model
print('logloss', cov_gbm.logloss(valid = True))

# grid over `max_after_balance_size`
# select the values for `max_after_balance_size` to grid over:
# the first and last values (.85 and .5) are below 1, i.e. smaller than the
# original dataset; the second (1.7) is 1.7 times the original size
hyper_params = {'max_after_balance_size': [.85, 1.7, .5]}

# this example uses cartesian grid search because the search space is small
# and we want to see the performance of all models. For a larger search space use
# random grid search instead: {'strategy': "RandomDiscrete"}
# initialize the GBM estimator
cov_gbm_2 = H2OGradientBoostingEstimator(balance_classes = True, seed = 1234)

# build grid search with previously made GBM and hyperparameters
grid = H2OGridSearch(model = cov_gbm_2, hyper_params = hyper_params,
                     search_criteria = {'strategy': "Cartesian"})

# train using the grid
grid.train(x = predictors, y = response, training_frame = train, validation_frame = valid)

# sort the grid models by logloss (ascending, so the best model is first)
sorted_grid = grid.get_grid(sort_by='logloss', decreasing=False)
print(sorted_grid)


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,5 hours 6 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.981 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


0,1
H2O_cluster_uptime:,5 hours 6 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.981 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%


C55,Count
1,2942
2,6022
3,2160
4,2160
5,2396
6,2160
7,2160



gbm Model Build progress: |███████████████████████████████████████████████| 100%
logloss 0.42244897088060607
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
    max_after_balance_size  \
0                      1.7   
1                     0.85   
2                      0.5   

                                                          model_ids  \
0  Grid_GBM_py_12_sid_b0a5_model_python_1602894814824_10287_model_2   
1  Grid_GBM_py_12_sid_b0a5_model_python_1602894814824_10287_model_1   
2  Grid_GBM_py_12_sid_b0a5_model_python_1602894814824_10287_model_3   

               logloss  
0  0.40729219226804936  
1  0.42244897088060607  
2   0.4481831538444112  



In [4]:
# weights column
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# connect to the H2O cluster and display its status
h2o.init()
h2o.cluster().show_status()

# Load the cars data, used to classify whether or not a car is economical
# based on the car's displacement, power, weight, and acceleration, and the
# year it was made.
cars = h2o.import_file(
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv"
)

# response column name and predictor names; "weight" is not a predictor here,
# it is passed as the weights column when training below
response = "economy_20mpg"
predictors = ["displacement", "power", "acceleration", "year"]

# convert the response column to a factor
cars[response] = cars[response].asfactor()

# Observation weights: either create a new column that specifies the weights
# or use a column that already exists in the dataframe (do not use the
# fold_column). Here the existing "weight" column is used.
# NOTE(review): the original note says this column contains the integers
# 1 or 2 in each row - confirm against the data.

# seeded split into train and validation frames
train, valid = cars.split_frame(ratios=[.8], seed=1234)

# initialize the estimator first ...
cars_gbm = H2OGradientBoostingEstimator(seed=1234)

# ... then name the weights column at training time
cars_gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
    weights_column="weight",
)

# AUC on the validation data (displayed as the cell's result)
cars_gbm.auc(valid=True)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,5 hours 8 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.977 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


0,1
H2O_cluster_uptime:,5 hours 8 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,5 months and 3 days !!!
H2O_cluster_name:,H2O_from_python_WIN_djxt8w
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.977 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%


0.964552201340255