In [1]:
import pandas as pd
import h2o
from h2o.estimators import H2OXGBoostEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
import optuna

seed = 1

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.21" 2023-10-17; OpenJDK Runtime Environment (build 11.0.21+9-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.21+9-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp_n7h4cyd
  JVM stdout: /tmp/tmp_n7h4cyd/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp_n7h4cyd/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 month and 8 days
H2O_cluster_name:,H2O_from_python_unknownUser_7htgnk
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.250 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [4]:
#get data
train_clean = pd.read_csv('../prediction-task/train-clean.csv')
test_clean = pd.read_csv('../prediction-task/test-clean.csv')

In [6]:
#convert pandas df to h2o df
train_clean_h2o = h2o.H2OFrame(train_clean)



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [8]:
#ensure categorical columns are understood as categorical by h2o models
train_clean_h2o['MODE'] = train_clean_h2o['MODE'].asfactor()
train_clean_h2o['POWER'] = train_clean_h2o['POWER'].asfactor()

In [10]:
#split data into train and val sets
splits = train_clean_h2o.split_frame(ratios=[0.9], seed=1) 
train = splits[0]
val = splits[1]

In [12]:
#get features and target for modelling
y = 'OUTPUT'
x = list(train.columns)
x.remove(y)

## tune multiple deep learning models with different hyperparameters

In [16]:
dl_models = []

def objective(trial):
    #params to tune
    num_layers = trial.suggest_int('num_layers', 1, 5)
    hidden_layer_size = trial.suggest_int('hidden_layer_size', 100, 300, step=50)

    params = {
        'hidden': [hidden_layer_size]*num_layers,
        'activation': trial.suggest_categorical('activation', ['rectifier', 'rectifierwithdropout', 'tanh', 'tanh_with_dropout', 'maxout', 'maxout_with_dropout']),
        'epochs': trial.suggest_int('epochs', 5, 50, step=5),
    }

    #train model
    model = H2ODeepLearningEstimator(**params,
                                    standardize=True,
                                    categorical_encoding='auto',
                                    nfolds=5,
                                    keep_cross_validation_predictions=True,  #need for stacked ensembling later
                                    seed=seed)
    model.train(x=x, y=y, training_frame=train)
    
    #store model
    dl_models.append(model)

    #get cv rmse
    cv_metrics_df = model.cross_validation_metrics_summary().as_data_frame()
    cv_rmse_index = cv_metrics_df[cv_metrics_df[''] == 'rmse'].index
    cv_rmse = cv_metrics_df['mean'].iloc[cv_rmse_index]
    return cv_rmse


In [17]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[I 2023-12-17 05:00:24,791] A new study created in memory with name: no-name-ecca3309-e496-456a-b47e-3b4b87ce8d56


deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:00:30,906] Trial 0 finished with value: 1.6633766 and parameters: {'num_layers': 1, 'hidden_layer_size': 300, 'activation': 'tanh_with_dropout', 'epochs': 30}. Best is trial 0 with value: 1.6633766.


█| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████

[I 2023-12-17 05:05:20,643] Trial 1 finished with value: 0.4778851 and parameters: {'num_layers': 3, 'hidden_layer_size': 300, 'activation': 'tanh', 'epochs': 40}. Best is trial 1 with value: 0.4778851.


| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:07:40,349] Trial 2 finished with value: 0.57637537 and parameters: {'num_layers': 5, 'hidden_layer_size': 300, 'activation': 'rectifier', 'epochs': 30}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:11:42,235] Trial 3 finished with value: 0.56930196 and parameters: {'num_layers': 2, 'hidden_layer_size': 300, 'activation': 'maxout', 'epochs': 50}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%


[I 2023-12-17 05:11:47,397] Trial 4 finished with value: 2.3188674 and parameters: {'num_layers': 3, 'hidden_layer_size': 150, 'activation': 'rectifierwithdropout', 'epochs': 5}. Best is trial 1 with value: 0.4778851.


deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:13:04,785] Trial 5 finished with value: 0.5139188 and parameters: {'num_layers': 3, 'hidden_layer_size': 250, 'activation': 'rectifier', 'epochs': 50}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |███████████████████████████████████████████

[I 2023-12-17 05:13:08,607] Trial 6 finished with value: 0.7636756 and parameters: {'num_layers': 1, 'hidden_layer_size': 200, 'activation': 'maxout', 'epochs': 20}. Best is trial 1 with value: 0.4778851.


██| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████

[I 2023-12-17 05:17:41,124] Trial 7 finished with value: 0.5001623 and parameters: {'num_layers': 4, 'hidden_layer_size': 250, 'activation': 'tanh', 'epochs': 40}. Best is trial 1 with value: 0.4778851.


| (done) 100%
deeplearning Model Build progress: |██████████████████████████████████████████

[I 2023-12-17 05:17:43,747] Trial 8 finished with value: 0.98886687 and parameters: {'num_layers': 1, 'hidden_layer_size': 250, 'activation': 'rectifierwithdropout', 'epochs': 35}. Best is trial 1 with value: 0.4778851.


███| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:18:34,219] Trial 9 finished with value: 1.3026475 and parameters: {'num_layers': 5, 'hidden_layer_size': 150, 'activation': 'maxout', 'epochs': 10}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:18:58,811] Trial 10 finished with value: 0.48489624 and parameters: {'num_layers': 4, 'hidden_layer_size': 100, 'activation': 'tanh', 'epochs': 20}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:19:22,816] Trial 11 finished with value: 0.48227125 and parameters: {'num_layers': 4, 'hidden_layer_size': 100, 'activation': 'tanh', 'epochs': 20}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:19:46,997] Trial 12 finished with value: 1.1669194 and parameters: {'num_layers': 4, 'hidden_layer_size': 100, 'activation': 'maxout_with_dropout', 'epochs': 20}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:20:23,815] Trial 13 finished with value: 0.50259006 and parameters: {'num_layers': 2, 'hidden_layer_size': 150, 'activation': 'tanh', 'epochs': 40}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:21:09,493] Trial 14 finished with value: 0.5095935 and parameters: {'num_layers': 3, 'hidden_layer_size': 200, 'activation': 'tanh', 'epochs': 15}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:21:57,800] Trial 15 finished with value: 0.50500834 and parameters: {'num_layers': 4, 'hidden_layer_size': 100, 'activation': 'tanh', 'epochs': 40}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:22:37,358] Trial 16 finished with value: 0.95323193 and parameters: {'num_layers': 2, 'hidden_layer_size': 200, 'activation': 'maxout_with_dropout', 'epochs': 25}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:24:02,067] Trial 17 finished with value: 1.7414631 and parameters: {'num_layers': 5, 'hidden_layer_size': 150, 'activation': 'tanh_with_dropout', 'epochs': 45}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:26:40,232] Trial 18 finished with value: 0.49151322 and parameters: {'num_layers': 3, 'hidden_layer_size': 250, 'activation': 'tanh', 'epochs': 35}. Best is trial 1 with value: 0.4778851.


█| (done) 100%
deeplearning Model Build progress: |████████████████████████████████████████████

[I 2023-12-17 05:28:30,272] Trial 19 finished with value: 0.5302075 and parameters: {'num_layers': 4, 'hidden_layer_size': 200, 'activation': 'tanh', 'epochs': 25}. Best is trial 1 with value: 0.4778851.


█| (done) 100%


In [18]:
len(dl_models)

20

## tune multiple xgboost models with different hyperparameters

In [13]:
xgboost_models = []

def objective(trial):
    #common params between xgboost and lightgbm
    params = {
        'ntrees': trial.suggest_int('ntrees', 50, 5000),
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'min_rows': trial.suggest_int('min_rows', 1, 5),
        'sample_rate': trial.suggest_float('sample_rate', 0.8, 1.0),
        'col_sample_rate': trial.suggest_float('col_sample_rate', 0.2, 1.0),
        'col_sample_rate_per_tree': trial.suggest_float('col_sample_rate_per_tree', 0.5, 1.0)
    }
    
    grow_policy = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    
    #add lightgbm-only params
    if grow_policy == 'lossguide':  #emulates lightgbm
        tree_method = 'hist'  #must be hist when using lossguide
        params['max_bins'] = trial.suggest_int('max_bins', 20, 256)
        params['max_leaves'] = trial.suggest_int('max_leaves', 31, 1024)
        
    #add xgboost-only params
    else:
        tree_method = 'auto'
        params['booster'] = trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart'])
        params['reg_alpha'] = trial.suggest_float('reg_alpha', 0.001, 1)
        params['reg_lambda'] = trial.suggest_float('reg_lambda', 0.001, 1)
        params['min_split_improvement'] = trial.suggest_float('min_split_improvement', 1e-10, 1e-3, log=True)
    
    #add grow_policy and tree_method into params
    params['grow_policy'] = grow_policy
    params['tree_method'] = tree_method

    #train model
    model = H2OXGBoostEstimator(**params,
                                learn_rate=0.1,
                                categorical_encoding='auto',
                                nfolds=5,
                                keep_cross_validation_predictions=True,  #need for stacked ensembling later
                                seed=seed) 
    model.train(x=x, y=y, training_frame=train)

    #store model
    xgboost_models.append(model)

    #get cv rmse
    cv_metrics_df = model.cross_validation_metrics_summary().as_data_frame()
    cv_rmse_index = cv_metrics_df[cv_metrics_df[''] == 'rmse'].index
    cv_rmse = cv_metrics_df['mean'].iloc[cv_rmse_index]
    return cv_rmse


In [14]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[I 2023-12-17 04:33:35,373] A new study created in memory with name: no-name-c695d077-f1f0-4b38-92c3-94fea2beaf6e


xgboost Model Build progress: |█████████████████████████████████████████████████

  float(v)
  if math.isnan(v):
  values = [float(value) for value in values]
[I 2023-12-17 04:34:11,053] Trial 0 finished with value: 0.91742015 and parameters: {'ntrees': 2450, 'max_depth': 4, 'min_rows': 1, 'sample_rate': 0.8188631129384818, 'col_sample_rate': 0.7707111398310464, 'col_sample_rate_per_tree': 0.9761972625178172, 'grow_policy': 'depthwise', 'booster': 'gbtree', 'reg_alpha': 0.4816789236927959, 'reg_lambda': 0.7663842939437949, 'min_split_improvement': 1.0431098985684226e-07}. Best is trial 0 with value: 0.91742015.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:34:33,960] Trial 1 finished with value: 0.8052408 and parameters: {'ntrees': 1513, 'max_depth': 9, 'min_rows': 1, 'sample_rate': 0.8987283320043595, 'col_sample_rate': 0.9393631519305861, 'col_sample_rate_per_tree': 0.9502997317497792, 'grow_policy': 'lossguide', 'max_bins': 158, 'max_leaves': 1012}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:35:02,873] Trial 2 finished with value: 0.9394562 and parameters: {'ntrees': 2432, 'max_depth': 5, 'min_rows': 3, 'sample_rate': 0.922478385432603, 'col_sample_rate': 0.271656185538547, 'col_sample_rate_per_tree': 0.7651088323971433, 'grow_policy': 'lossguide', 'max_bins': 199, 'max_leaves': 67}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:35:22,606] Trial 3 finished with value: 0.9098812 and parameters: {'ntrees': 499, 'max_depth': 6, 'min_rows': 2, 'sample_rate': 0.8400527873962631, 'col_sample_rate': 0.6164600853737945, 'col_sample_rate_per_tree': 0.7835211699367628, 'grow_policy': 'lossguide', 'max_bins': 126, 'max_leaves': 845}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:35:55,634] Trial 4 finished with value: 0.9749818 and parameters: {'ntrees': 4533, 'max_depth': 7, 'min_rows': 5, 'sample_rate': 0.9166674850645258, 'col_sample_rate': 0.46949209769806877, 'col_sample_rate_per_tree': 0.6771993790035081, 'grow_policy': 'lossguide', 'max_bins': 72, 'max_leaves': 924}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:36:26,238] Trial 5 finished with value: 0.9829188 and parameters: {'ntrees': 2057, 'max_depth': 8, 'min_rows': 4, 'sample_rate': 0.8979605536915783, 'col_sample_rate': 0.8361753770444416, 'col_sample_rate_per_tree': 0.5452232089860783, 'grow_policy': 'depthwise', 'booster': 'gbtree', 'reg_alpha': 0.318663349471974, 'reg_lambda': 0.5545637400000574, 'min_split_improvement': 8.240468628280513e-05}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:36:56,884] Trial 6 finished with value: 1.0254233 and parameters: {'ntrees': 4820, 'max_depth': 5, 'min_rows': 3, 'sample_rate': 0.9024725069250021, 'col_sample_rate': 0.4937768788352731, 'col_sample_rate_per_tree': 0.5882556011511106, 'grow_policy': 'lossguide', 'max_bins': 228, 'max_leaves': 272}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:37:13,159] Trial 7 finished with value: 0.92321724 and parameters: {'ntrees': 301, 'max_depth': 9, 'min_rows': 5, 'sample_rate': 0.8843945024679433, 'col_sample_rate': 0.4136925233917033, 'col_sample_rate_per_tree': 0.9137580944713453, 'grow_policy': 'lossguide', 'max_bins': 225, 'max_leaves': 141}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:37:41,966] Trial 8 finished with value: 2.4167137 and parameters: {'ntrees': 3469, 'max_depth': 1, 'min_rows': 4, 'sample_rate': 0.8689844184975527, 'col_sample_rate': 0.6784623691653913, 'col_sample_rate_per_tree': 0.7050197480032321, 'grow_policy': 'depthwise', 'booster': 'gblinear', 'reg_alpha': 0.5010819795906567, 'reg_lambda': 0.024698297527956912, 'min_split_improvement': 1.7162570950304488e-10}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:38:17,896] Trial 9 finished with value: 0.96729386 and parameters: {'ntrees': 2618, 'max_depth': 7, 'min_rows': 4, 'sample_rate': 0.8109371548456188, 'col_sample_rate': 0.3272342875224815, 'col_sample_rate_per_tree': 0.7210691012898812, 'grow_policy': 'depthwise', 'booster': 'gbtree', 'reg_alpha': 0.5507582884431805, 'reg_lambda': 0.6950933807779839, 'min_split_improvement': 3.8934433473410915e-10}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:38:33,927] Trial 10 finished with value: 1.4675977 and parameters: {'ntrees': 1308, 'max_depth': 2, 'min_rows': 1, 'sample_rate': 0.9660872327451535, 'col_sample_rate': 0.9647798067185598, 'col_sample_rate_per_tree': 0.891902546222447, 'grow_policy': 'lossguide', 'max_bins': 143, 'max_leaves': 662}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:38:46,618] Trial 11 finished with value: 0.85111165 and parameters: {'ntrees': 94, 'max_depth': 9, 'min_rows': 2, 'sample_rate': 0.8475953195883608, 'col_sample_rate': 0.9849291828516529, 'col_sample_rate_per_tree': 0.8190286339838478, 'grow_policy': 'lossguide', 'max_bins': 116, 'max_leaves': 1017}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:39:07,143] Trial 12 finished with value: 0.8319576 and parameters: {'ntrees': 1204, 'max_depth': 9, 'min_rows': 2, 'sample_rate': 0.8546806193472577, 'col_sample_rate': 0.9915542859317801, 'col_sample_rate_per_tree': 0.8478690030103999, 'grow_policy': 'lossguide', 'max_bins': 134, 'max_leaves': 995}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:39:27,333] Trial 13 finished with value: 0.82285994 and parameters: {'ntrees': 1245, 'max_depth': 9, 'min_rows': 2, 'sample_rate': 0.953933803824648, 'col_sample_rate': 0.8641313281644876, 'col_sample_rate_per_tree': 0.9857337303628266, 'grow_policy': 'lossguide', 'max_bins': 164, 'max_leaves': 675}. Best is trial 1 with value: 0.8052408.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:39:47,660] Trial 14 finished with value: 0.8017554 and parameters: {'ntrees': 1270, 'max_depth': 7, 'min_rows': 1, 'sample_rate': 0.9562865227261494, 'col_sample_rate': 0.8620540269772812, 'col_sample_rate_per_tree': 0.9985787518709145, 'grow_policy': 'lossguide', 'max_bins': 177, 'max_leaves': 545}. Best is trial 14 with value: 0.8017554.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:40:08,050] Trial 15 finished with value: 0.79517704 and parameters: {'ntrees': 1754, 'max_depth': 7, 'min_rows': 1, 'sample_rate': 0.998366804370096, 'col_sample_rate': 0.7369458627040277, 'col_sample_rate_per_tree': 0.926345791753407, 'grow_policy': 'lossguide', 'max_bins': 66, 'max_leaves': 414}. Best is trial 15 with value: 0.79517704.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:40:30,913] Trial 16 finished with value: 0.8034699 and parameters: {'ntrees': 3499, 'max_depth': 7, 'min_rows': 1, 'sample_rate': 0.9961339581958474, 'col_sample_rate': 0.7343365130864157, 'col_sample_rate_per_tree': 0.890735402858023, 'grow_policy': 'lossguide', 'max_bins': 20, 'max_leaves': 402}. Best is trial 15 with value: 0.79517704.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:40:49,765] Trial 17 finished with value: 1.074685 and parameters: {'ntrees': 1924, 'max_depth': 3, 'min_rows': 1, 'sample_rate': 0.9946859306656045, 'col_sample_rate': 0.5613428969235961, 'col_sample_rate_per_tree': 0.9953948334527745, 'grow_policy': 'lossguide', 'max_bins': 77, 'max_leaves': 489}. Best is trial 15 with value: 0.79517704.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 04:51:14,266] Trial 18 finished with value: 0.7871454 and parameters: {'ntrees': 741, 'max_depth': 6, 'min_rows': 2, 'sample_rate': 0.9618181597901027, 'col_sample_rate': 0.8380692178167941, 'col_sample_rate_per_tree': 0.8483749309839395, 'grow_policy': 'depthwise', 'booster': 'dart', 'reg_alpha': 0.964316366336372, 'reg_lambda': 0.09919986407233439, 'min_split_improvement': 0.00012696661324644446}. Best is trial 18 with value: 0.7871454.


█| (done) 100%
xgboost Model Build progress: |█████████████████████████████████████████████████

[I 2023-12-17 05:00:24,751] Trial 19 finished with value: 0.863278 and parameters: {'ntrees': 704, 'max_depth': 6, 'min_rows': 2, 'sample_rate': 0.976209037093354, 'col_sample_rate': 0.6871850363941621, 'col_sample_rate_per_tree': 0.8339593470448108, 'grow_policy': 'depthwise', 'booster': 'dart', 'reg_alpha': 0.9651393268304185, 'reg_lambda': 0.15124209937660504, 'min_split_improvement': 0.0005083777188725635}. Best is trial 18 with value: 0.7871454.


█| (done) 100%


In [15]:
len(xgboost_models)

20

## build stacked ensembler using deep learning and xgboost models
with GLM chosen as meta-model

In [19]:
base_models = dl_models + xgboost_models

In [20]:
len(base_models)

40

In [21]:
def objective(trial):   
    meta_model_params = {
        'alpha': trial.suggest_float('alpha', 0, 1),  #regularization distribution between L1 and L2
        'family': trial.suggest_categorical('family', ['gaussian', 'tweedie']),  
        'lambda': trial.suggest_float('lambda', 1e-6, 10.0, log=True),
        'standardize': trial.suggest_categorical('standardize', [True, False]),
        'non_negative': True  #predictions of each base model cannot be subtracted from one another
    }

    ensemble = H2OStackedEnsembleEstimator(metalearner_algorithm='glm',
                                             metalearner_params=meta_model_params,
                                             metalearner_nfolds=5,
                                             base_models=base_models,  
                                             seed=seed)

    ensemble.train(x=x, y=y, training_frame=train)
    
    #get cv rmse
    cv_metrics_df = ensemble.cross_validation_metrics_summary().as_data_frame()
    cv_rmse_index = cv_metrics_df[cv_metrics_df[''] == 'rmse'].index
    cv_rmse = cv_metrics_df['mean'].iloc[cv_rmse_index]
    return cv_rmse
    

In [22]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[I 2023-12-17 05:28:30,330] A new study created in memory with name: no-name-0de7a2c8-3133-42ce-a22a-a55c81bb429c


stackedensemble Model Build progress: |

[I 2023-12-17 05:28:31,683] Trial 0 finished with value: 0.4435823 and parameters: {'alpha': 0.2883819606738718, 'family': 'tweedie', 'lambda': 1.0078116253602135e-05, 'standardize': True}. Best is trial 0 with value: 0.4435823.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:32,784] Trial 1 finished with value: 0.44359657 and parameters: {'alpha': 0.07344939994794075, 'family': 'tweedie', 'lambda': 2.7033032441268117e-05, 'standardize': False}. Best is trial 0 with value: 0.4435823.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:33,888] Trial 2 finished with value: 0.4450764 and parameters: {'alpha': 0.945032152563395, 'family': 'gaussian', 'lambda': 5.007848462647795e-06, 'standardize': False}. Best is trial 0 with value: 0.4435823.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:34,770] Trial 3 finished with value: 0.44358265 and parameters: {'alpha': 0.38213870802343275, 'family': 'tweedie', 'lambda': 0.00010926564314327826, 'standardize': False}. Best is trial 0 with value: 0.4435823.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:35,643] Trial 4 finished with value: 0.442963 and parameters: {'alpha': 0.32442383931510577, 'family': 'tweedie', 'lambda': 0.0008543778469206235, 'standardize': True}. Best is trial 4 with value: 0.442963.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:36,518] Trial 5 finished with value: 0.43876544 and parameters: {'alpha': 0.16165874818526615, 'family': 'gaussian', 'lambda': 0.0835053698118708, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:37,389] Trial 6 finished with value: 0.44423836 and parameters: {'alpha': 0.68966459979134, 'family': 'gaussian', 'lambda': 0.003911073472606895, 'standardize': True}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:38,045] Trial 7 finished with value: 0.44307494 and parameters: {'alpha': 0.6697557225167311, 'family': 'gaussian', 'lambda': 0.07193421416870015, 'standardize': True}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:38,903] Trial 8 finished with value: 1.2271538 and parameters: {'alpha': 0.5979754959628343, 'family': 'gaussian', 'lambda': 3.630991033827338, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:39,761] Trial 9 finished with value: 0.4423639 and parameters: {'alpha': 0.0901844900934371, 'family': 'tweedie', 'lambda': 0.00833992022037345, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:41,303] Trial 10 finished with value: 0.5096543 and parameters: {'alpha': 0.023202460104279538, 'family': 'gaussian', 'lambda': 3.452701654131003, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:41,964] Trial 11 finished with value: 0.43945518 and parameters: {'alpha': 0.19852617689552204, 'family': 'tweedie', 'lambda': 0.07209661138716489, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:42,631] Trial 12 finished with value: 0.4391283 and parameters: {'alpha': 0.2123263429629933, 'family': 'gaussian', 'lambda': 0.1572145487742103, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:43,511] Trial 13 finished with value: 0.44367382 and parameters: {'alpha': 0.4789861518549842, 'family': 'gaussian', 'lambda': 0.24311630834764603, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:44,584] Trial 14 finished with value: 0.44510612 and parameters: {'alpha': 0.1968600678318372, 'family': 'gaussian', 'lambda': 0.49902341487411933, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:45,451] Trial 15 finished with value: 0.4431844 and parameters: {'alpha': 0.18879997383730318, 'family': 'gaussian', 'lambda': 0.021640364722283287, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:46,119] Trial 16 finished with value: 0.44496745 and parameters: {'alpha': 0.45963378293539986, 'family': 'gaussian', 'lambda': 0.0009589363284711321, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:46,985] Trial 17 finished with value: 0.50249505 and parameters: {'alpha': 0.8500581611896025, 'family': 'gaussian', 'lambda': 0.5305167081533387, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:47,852] Trial 18 finished with value: 0.43943 and parameters: {'alpha': 0.2701394113241932, 'family': 'gaussian', 'lambda': 0.03029229444431384, 'standardize': True}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |

[I 2023-12-17 05:28:48,928] Trial 19 finished with value: 0.45703736 and parameters: {'alpha': 0.12911295576519033, 'family': 'gaussian', 'lambda': 1.020830529008062, 'standardize': False}. Best is trial 5 with value: 0.43876544.


██████████████████████████████████████████| (done) 100%


In [None]:
#build best ensembler

In [23]:
best_meta_model_params = study.best_params
best_meta_model_params

{'alpha': 0.16165874818526615,
 'family': 'gaussian',
 'lambda': 0.0835053698118708,
 'standardize': False}

In [24]:
best_ensemble = H2OStackedEnsembleEstimator(metalearner_algorithm='glm',
                                            metalearner_params=best_meta_model_params,
                                            metalearner_nfolds=5,
                                            base_models=base_models,
                                            seed=seed)

best_ensemble.train(x=x, y=y, training_frame=train)

stackedensemble Model Build progress: |██████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),16/40
# XGBoost base models (used / total),3/20
# DeepLearning base models (used / total),13/20
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,
Custom metalearner hyperparameters,"{""alpha"": [0.16165874818526615], ""family"": [""gaussian""], ""lambda"": [0.0835053698118708], ""standardize"": [false]}"

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.2880177,0.0300084,0.2765387,0.2946345,0.2951343,0.3279593,0.2458218
mean_residual_deviance,0.1967238,0.0838715,0.1254751,0.1620242,0.1737838,0.3418678,0.1804682
mse,0.1967238,0.0838715,0.1254751,0.1620242,0.1737838,0.3418678,0.1804682
null_deviance,989.26465,54.242702,1020.849,914.5442,1058.8077,977.3748,974.74744
r2,0.9680559,0.0145715,0.9796059,0.971458,0.9748279,0.9426593,0.9717284
residual_deviance,30.82815,12.854041,20.70339,26.085888,26.588924,53.331375,27.431168
rmse,0.4366262,0.087188,0.3542246,0.4025222,0.4168739,0.5846946,0.4248155
rmsle,,0.0,,,,,


In [55]:
#see best ensembler performance on val set

In [25]:
ensemble_val_rmse = best_ensemble.model_performance(val).rmse()
ensemble_val_rmse

0.31475634111745304

In [None]:
#get predictions for test set

In [28]:
test_clean_h2o = h2o.H2OFrame(test_clean)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [30]:
test_clean_h2o['MODE'] = test_clean_h2o['MODE'].asfactor()
test_clean_h2o['POWER'] = test_clean_h2o['POWER'].asfactor()

In [31]:
predictions_h2o = best_ensemble.predict(test_clean_h2o)
predictions = predictions_h2o.as_data_frame()

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [32]:
predictions.to_csv('test_preds.csv', header=None, index=False)

In [33]:
predictions

Unnamed: 0,predict
0,2.511439
1,2.704789
2,2.432484
3,2.336090
4,2.177502
...,...
2495,2.640867
2496,2.618526
2497,2.167043
2498,2.208762
