## All Imports for the script

In [1]:
import json

import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import pandas as pd

In [2]:
# CONSTANT DECLARATIONS
AIRLINE_DATA_PATH = "C://repository/h2o_project/h2o_data/allyears2k.csv"

## Initiating h2o instance

In [3]:
h2o.init(max_mem_size="1G" )

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.201-b09, mixed mode, sharing)
  Starting server from c:\programdata\miniconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\retim\AppData\Local\Temp\tmpt781x5xs
  JVM stdout: C:\Users\retim\AppData\Local\Temp\tmpt781x5xs\h2o_retim_started_from_python.out
  JVM stderr: C:\Users\retim\AppData\Local\Temp\tmpt781x5xs\h2o_retim_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/Denver
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.1
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_retim_ubtal2
H2O cluster total nodes:,1
H2O cluster free memory:,989 Mb
H2O cluster total cores:,6
H2O cluster allowed cores:,6


## 1. Import the Airlines dataset

In [4]:
airline_df = h2o.import_file(AIRLINE_DATA_PATH)

Parse progress: |█████████████████████████████████████████████████████████| 100%


## 2. Split the data set into a train (80%) and test set (20%)

In [5]:
train, valid = airline_df.split_frame(ratios = [.80,], seed = 1234)

## Variable description

In [6]:
remove =['IsArrDelayed','IsDepDelayed', 'WeatherDelay','DepTime', 'ArrTime','ArrDelay','DepDelay',
          'IsArrDelayed','IsDepDelayed','ActualElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay',
         'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
         'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

predictors = [x for x in airline_df.columns if x not in remove]

target = 'IsArrDelayed'

airline_df[target] = airline_df[target].asfactor()

# GLM Search Space

In [6]:
hyper_parameters = {'alpha': [0.01,1], 
                    'lambda': [0, 1e-7, 1e-5, 1e-3, 1e-1]}
search_criteria = {"strategy": "RandomDiscrete",
                   "max_models": 5,
                   "seed": 1234}

glm_grid1 = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), 
                          grid_id = 'glm_grid1',
                          hyper_params = hyper_parameters,
                          search_criteria = search_criteria
                         )
glm_grid1.train(x = predictors, 
                y = target,
                training_frame = train,
                validation_frame = valid,
                seed = 1234,
                nfolds = 5,
                fold_assignment = "modulo",
                keep_cross_validation_predictions = True)

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [7]:
glm_grid1.get_grid(sort_by='auc',
                   decreasing=True)
glm_gridperf1 = glm_grid1.get_grid(sort_by='auc',decreasing=True)


# Grab the top GBM model, chosen by validation AUC
best_glm1 = glm_gridperf1.models[0]
print(best_glm1.auc(valid=True))

# save the hyper parameters as json file for later use

params = best_glm1.actual_params
with open("c://repository/h2o_project/grid_hyperparams_glm.json", 'w') as json_file:
    json_file.write(json.dumps(params, indent=4))

0.6578248648028588


# GBM Search Space

In [8]:
gbm_params1 = {'learn_rate': [0.01, 0.03],
                'max_depth': [3,4, 5,6, 9],
                'sample_rate': [0.7, 0.8,0.9, 1.0],
                'col_sample_rate': [0.2, 0.3,0.4, 0.5,0.7, 0.8]}

gbm_search_criteria = {"strategy": "RandomDiscrete",
                       "max_models": 5,
                       "seed": 1234}

gbm_grid1 = H2OGridSearch(model = H2OGradientBoostingEstimator,
                              grid_id = 'gbm_grid1',
                              hyper_params = gbm_params1,
                              search_criteria = gbm_search_criteria)

gbm_grid1.train(x = predictors, 
                y = target,
                training_frame = train,
                validation_frame = valid,
                ntrees = 100,
                seed = 1234,
                nfolds = 5,
                fold_assignment = "modulo",
                keep_cross_validation_predictions = True)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [9]:
# Get the grid results, sorted by validation AUC
gbm_gridperf1 = gbm_grid1.get_grid(sort_by='auc',
                                       decreasing=True)
# Grab the top GBM model, chosen by validation AUC
best_gbm1 = gbm_gridperf1.models[0]
# Now let's evaluate the model performance on a test set
best_gbm1.auc(valid = True)

0.7270300017365988

In [10]:
# save the hyper parameters as json file for later use
params = best_gbm1.actual_params
with open("c://repository/h2o_project/grid_hyperparams_GBM.json", 'w') as json_file:
    json_file.write(json.dumps(params, indent=4))

# Deep Learning Search Space

In [11]:
# creating function as input for hidden layer for one input at time 
# to iterate through hidden layer list [10,10]),[20, 15],[50,50,50]
def deep_learning_grid(hidden_lst):
    dl_params = {"activation" : ["rectifier","rectifier_with_dropout"],
                 "hidden" : hidden_lst,
                "l1" : [0, 1e-3, 1e-5],
                "l2": [0, 1e-3, 1e-5]}
    search_criteria ={ "strategy": "RandomDiscrete",
                        "max_models" :5, 
                        "seed" : 123}
    dl_grid1 = H2OGridSearch(model = H2ODeepLearningEstimator,
                                  grid_id = 'dl_grid1',
                                  hyper_params = dl_params,
                                  search_criteria = search_criteria)
    dl_grid1.train(x = predictors,
                 y= target,
                 training_frame = train,
                 validation_frame = valid,
                 seed = 123,
                 nfolds = 5,
                 fold_assignment = "modulo",
                 keep_cross_validation_predictions = True)
    return dl_grid1

In [12]:
# creating grid will all the given hidden paramters list
dl_grid_perf_10_10 = deep_learning_grid([10,10])
dl_grid_perf_20_15 = deep_learning_grid([20, 15])
dl_grid_perf_50_50_50 = deep_learning_grid([50,50,50])

deeplearning Grid Build progress: |███████████████████████████████████████| 100%
deeplearning Grid Build progress: |███████████████████████████████████████| 100%
deeplearning Grid Build progress: |███████████████████████████████████████| 100%


In [13]:
# Get the grid results, sorted by validation AUC
dl_grid_10 = dl_grid_perf_10_10.get_grid(sort_by = 'auc', decreasing = True)
dl_grid_20 = dl_grid_perf_20_15.get_grid(sort_by = 'auc', decreasing = True)
dl_grid_50 = dl_grid_perf_50_50_50.get_grid(sort_by = 'auc', decreasing = True)


# Grab the top GBM model, chosen by validation AUC for each hidden layer top most

best_dl_grid_10 = dl_grid_10.models[0]
best_dl_grid_20 = dl_grid_20.models[0]
best_dl_grid_50 = dl_grid_50.models[0]

In [14]:
# Retreive test set AUC
# Now let's evaluate the model performance on a test set

print (best_dl_grid_10.auc(valid = True))
print (best_dl_grid_20.auc(valid = True))
print (best_dl_grid_50.auc(valid = True))

0.6791219926170651
0.6791219926170651
0.6791219926170651


Since the auc in different hidden layers have same auc and all other params being same, let save with less hidden layer.

In [15]:
# save the hyper parameters as json file for later use
params = best_dl_grid_10.actual_params
with open("c://repository/h2o_project/grid_hyperparams_dl.json", 'w') as json_file:
    json_file.write(json.dumps(params, indent=4))

In [16]:
# save the model
model_path_glm = h2o.save_model(model=best_glm1, path="c://repository/h2o_project/best_glm", force=True)
model_path_gbm = h2o.save_model(model=best_gbm1, path="c://repository/h2o_project/best_gbm", force=True)
model_path_dl = h2o.save_model(model=best_dl_grid_10, path="c://repository/h2o_project/best_dl", force=True)

In [17]:
# load the model
#saved_model = h2o.load_model(model_path)

## 4. Ensemble all the models

In [18]:
ensemble = H2OStackedEnsembleEstimator(model_id="ensemble_1",
                                      base_models=[best_gbm1, 
                                                   best_glm1, 
                                                   best_dl_grid_10])
ensemble.train(x = predictors, 
               y=target,
               training_frame = train,
               validation_frame = valid)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


## 5. Evaluate the ensemble performance on the test set

In [19]:
ensemble.auc(valid = True)

0.7270862154943917

## 6. Find and compare the best base learner test AUC and the ensemble test AUC.

In [20]:
print("glm best base model auc: ", best_glm1.auc(valid=True),"\n"
      "gbm best base model auc: ", best_gbm1.auc(valid=True),"\n"
      "dl best base model auc: ", best_dl_grid_10.auc(valid=True),"\n"
      "ensemble test auc: ", ensemble.auc(valid=True))

glm best base model auc:  0.6578248648028588 
gbm best base model auc:  0.7270300017365988 
dl best base model auc:  0.6791219926170651 
ensemble test auc:  0.7270862154943917
