<a href="https://colab.research.google.com/github/pysr1/h2o/blob/master/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install h2o:

Note: most of this code is copied from the H2O documentation, with minor changes.

In [1]:
%%bash
# H2O is started from a Java jar, so a Java runtime must exist before h2o.init().
# -y answers the confirmation prompt: this cell has no interactive stdin, so
# without it apt-get aborts whenever the package actually has to be installed.
apt-get install -y default-jre
# Install the latest stable h2o Python package from the H2O release index.
pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

Reading package lists...
Building dependency tree...
Reading state information...
default-jre is already the newest version (2:1.11-68ubuntu1~18.04.1).
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.
Looking in links: http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html


# Data Import and AutoML

In [2]:
import h2o
from h2o.automl import H2OAutoML

# Attach to a running local H2O instance, or start one if none is found.
h2o.init()

# Sample binary-outcome (Higgs) train/test sets, hosted as CSV files on S3.
HIGGS_TRAIN_URL = "https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv"
HIGGS_TEST_URL = "https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv"
train = h2o.import_file(HIGGS_TRAIN_URL)
test = h2o.import_file(HIGGS_TEST_URL)

# Response column name; every other column of the training frame is a predictor.
y = "response"
x = train.columns
x.remove(y)

# Binary classification requires the response to be a factor in both frames.
for frame in (train, test):
    frame[y] = frame[y].asfactor()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.4" 2019-07-16; OpenJDK Runtime Environment (build 11.0.4+11-post-Ubuntu-1ubuntu218.04.3); OpenJDK 64-Bit Server VM (build 11.0.4+11-post-Ubuntu-1ubuntu218.04.3, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.6/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpnvvuqtp6
  JVM stdout: /tmp/tmpnvvuqtp6/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpnvvuqtp6/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.6
H2O cluster version age:,7 days and 37 minutes
H2O cluster name:,H2O_from_python_unknownUser_rv4k5w
H2O cluster total nodes:,1
H2O cluster free memory:,3 Gb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
# AutoML run capped at 20 base models, with a fixed seed.
# (Per the original note, runtime is limited to 1 hour max by default.)
automl_settings = dict(max_models=20, seed=1, keep_cross_validation_models=True)
aml = H2OAutoML(**automl_settings)
aml.train(training_frame=train, y=y, x=x)

# Leaderboard of all trained models; request every row instead of the
# default 10 so nothing is cut off.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

AutoML progress: |████████████████████████████████████████████████████████| 100%


model_id,auc,logloss,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_AutoML_20191009_005033,0.789683,0.551519,0.318681,0.432219,0.186813
StackedEnsemble_BestOfFamily_AutoML_20191009_005033,0.787846,0.55313,0.315858,0.433038,0.187522
XGBoost_grid_1_AutoML_20191009_005033_model_3,0.78538,0.55838,0.333409,0.434884,0.189124
XGBoost_3_AutoML_20191009_005033,0.784162,0.558118,0.340129,0.435162,0.189366
XGBoost_grid_1_AutoML_20191009_005033_model_4,0.783842,0.557377,0.312392,0.435045,0.189264
XGBoost_1_AutoML_20191009_005033,0.782782,0.557031,0.331908,0.43506,0.189277
XGBoost_2_AutoML_20191009_005033,0.782426,0.557296,0.326804,0.435181,0.189383
XGBoost_grid_1_AutoML_20191009_005033_model_1,0.781559,0.561901,0.330242,0.436658,0.190671
GBM_5_AutoML_20191009_005033,0.780344,0.558723,0.334826,0.436138,0.190216
GBM_1_AutoML_20191009_005033,0.780286,0.560701,0.316822,0.43642,0.190462




## Random Grid / Tuning w/ GBM

In [4]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch


# Use same data as above

# GBM hyperparameter search space; the random search below draws
# combinations from these candidate values.
gbm_params2 = {'learn_rate': [i * 0.01 for i in range(1, 11)],
               'max_depth': list(range(2, 11)),
               'sample_rate': [i * 0.1 for i in range(5, 11)],
               'col_sample_rate': [i * 0.1 for i in range(1, 11)],
               'ntrees': [100, 300, 500, 700]}

# Search criteria: random discrete search, stopping after 36 models.
search_criteria = {'strategy': 'RandomDiscrete', 'max_models': 36, 'seed': 1}

# Train a random grid of GBMs with 5-fold cross-validation.
gbm_grid2 = H2OGridSearch(model=H2OGradientBoostingEstimator,
                          grid_id='gbm_grid2',
                          hyper_params=gbm_params2,
                          search_criteria=search_criteria)
gbm_grid2.train(x=x, y=y,
                training_frame=train,
                seed=1, nfolds=5)

# Grid results ranked by AUC, best first. No validation frame is passed
# (only nfolds=5), so the ranking metric is expected to be the
# cross-validated AUC rather than a hold-out validation AUC.
gbm_gridperf2 = gbm_grid2.get_grid(sort_by='auc', decreasing=True)

# Top-ranked GBM from the grid.
best_gbm2 = gbm_gridperf2.models[0]

# Evaluate the chosen model on the held-out test set so we get an
# honest estimate of its performance.
best_gbm_perf2 = best_gbm2.model_performance(test)

# Test-set AUC; as the last expression it is shown as the cell output.
best_gbm_perf2.auc()

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


0.7899973856840057

In [5]:
# Display the GBM grid summary: one row per model with its hyperparameters
# and AUC, ordered by AUC from best to worst.
gbm_gridperf2

          col_sample_rate  ...           model_ids                 auc
0      0.6000000000000001  ...  gbm_grid2_model_26  0.7853128741147936
1                     1.0  ...  gbm_grid2_model_15  0.7834489859199875
2                     0.8  ...   gbm_grid2_model_6  0.7827057788160587
3     0.30000000000000004  ...   gbm_grid2_model_4  0.7820598101993038
4      0.6000000000000001  ...  gbm_grid2_model_27  0.7807801155823421
5      0.6000000000000001  ...  gbm_grid2_model_17  0.7802352990760838
6                     0.9  ...   gbm_grid2_model_8  0.7802126000608116
7                     0.4  ...  gbm_grid2_model_22   0.779543551101384
8                     0.5  ...  gbm_grid2_model_16  0.7793075495800883
9                     0.4  ...  gbm_grid2_model_35  0.7772351756464252
10     0.7000000000000001  ...   gbm_grid2_model_5  0.7769261599628307
11                    0.9  ...  gbm_grid2_model_10  0.7766655728591225
12     0.7000000000000001  ...  gbm_grid2_model_11  0.7766009679695018
13    



In [6]:
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch


# Random forest hyperparameter search space.
rf_params = {
    'max_depth': list(range(2, 11)),
    'sample_rate': [i * 0.1 for i in range(5, 11)],
    'ntrees': [100, 300, 500, 700]
}

# Random discrete search, stopping after 5 models.
# NOTE: this rebinds the `search_criteria` name used by the GBM grid cell.
search_criteria = {'strategy': 'RandomDiscrete', 'max_models': 5, 'seed': 1}

# Train a random grid of random forests with 5-fold cross-validation.
rf_grid2 = H2OGridSearch(model=H2ORandomForestEstimator,
                         grid_id='rf_grid',
                         hyper_params=rf_params,
                         search_criteria=search_criteria)
rf_grid2.train(x=x, y=y,
               training_frame=train,
               seed=1, nfolds=5)

# Grid results ranked by AUC, best first.
rf_gridperf2 = rf_grid2.get_grid(sort_by='auc', decreasing=True)

# Keep the top-ranked model under its own name instead of overwriting it
# with its metrics, so the model stays available after this cell runs.
best_rf_model = rf_gridperf2.models[0]

# `best_rf` holds the test-set performance object of the best model
# (same final binding as before, so later cells using it are unaffected).
best_rf = best_rf_model.model_performance(test)

# Test-set AUC; as the last expression it is shown as the cell output.
best_rf.auc()

drf Grid Build progress: |████████████████████████████████████████████████| 100%


NameError: ignored

In [0]:
# Test-set AUC of the best random forest: `best_rf` is the performance
# object produced by `model_performance(test)` in the previous cell.
best_rf.auc()