In [1]:
# Standard library: garbage-collector and warning control.
import gc
gc.enable()  # turn on automatic garbage collection

import warnings
warnings.filterwarnings('ignore')  # suppress all warnings raised through the `warnings` module

# Third-party: numerics, dataframes, and the H2O client with its AutoML driver.
import numpy as np
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

# Random seed; passed as `seed=` to H2OAutoML.
SEED = 2024

In [2]:
# Connect to an H2O instance, starting a local server when none is running,
# and request a 20 GB memory ceiling for it.
h2o.init(max_mem_size="20G")
# Switch off H2O's user-tip messages so they do not clutter cell output.
h2o.display.toggle_user_tips('off')

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.22" 2024-01-16; OpenJDK Runtime Environment (build 11.0.22+7-post-Ubuntu-0ubuntu220.04.1); OpenJDK 64-Bit Server VM (build 11.0.22+7-post-Ubuntu-0ubuntu220.04.1, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpxya6ntzv
  JVM stdout: /tmp/tmpxya6ntzv/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpxya6ntzv/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 11 days
H2O_cluster_name:,H2O_from_python_unknownUser_03mjfc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,20 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [3]:
# Folder holding the competition CSVs (train, test, sample submission).
DATA_DIR = '/kaggle/input/playground-series-s4e4'

# Load both splits into pandas DataFrames in a single assignment.
train, test = (
    pd.read_csv(f'{DATA_DIR}/train.csv'),
    pd.read_csv(f'{DATA_DIR}/test.csv'),
)

In [4]:
# Remove the row identifier from both splits so it never ends up in the
# feature list, which is derived from the test columns.
# `errors='ignore'` keeps the cell re-runnable: once `id` has been dropped,
# running the cell again is a no-op instead of raising KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [5]:
# Name of the column to predict.
TARGET = 'Rings'
# Every column of the test frame is used as a model input.
features = list(test.columns)

In [6]:
# Train on log1p(target); predictions are mapped back with expm1 when the
# submission is built.
# NOTE: the column is overwritten in place, so running this cell a second time
# applies log1p twice. Reload `train` before re-running.
train[TARGET] = np.log1p(train[TARGET])

In [7]:
# Wall-clock budget for the AutoML search: 11 hours, in seconds.
time_limit = 11 * 60 * 60

# All AutoML settings in one place. The target was log1p-transformed earlier,
# so RMSE here corresponds to RMSLE on the original scale.
automl_params = dict(
    nfolds=5,                      # 5-fold cross-validation
    max_runtime_secs=time_limit,
    stopping_metric='RMSE',
    stopping_rounds=100,
    stopping_tolerance=2e-4,       # below H2O's recommended default; H2O logs a warning about this
    sort_metric='RMSE',            # leaderboard ordering
    exploitation_ratio=0.1,
    seed=SEED,
)
aml = H2OAutoML(**automl_params)

# Convert the pandas frame to an H2OFrame and launch the search.
# Kept as the final expression so the cell still displays the returned model.
aml.train(x=features, y=TARGET, training_frame=h2o.H2OFrame(train))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
02:01:55.165: Stopping tolerance set by the user is < 70% of the recommended default of 0.0033220024821015216, so models may take a long time to converge or may not converge at all.

███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),78/78
# GBM base models (used / total),59/59
# XGBoost base models (used / total),7/7
# DRF base models (used / total),2/2
# DeepLearning base models (used / total),9/9
# GLM base models (used / total),1/1
Metalearner algorithm,GBM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,0.107328,0.0008119,0.1082297,0.1078453,0.1076201,0.1065317,0.1064133
mean_residual_deviance,0.0217681,0.0003343,0.0220043,0.0216952,0.0221902,0.0213408,0.0216098
mse,0.0217681,0.0003343,0.0220043,0.0216952,0.0221902,0.0213408,0.0216098
r2,0.7329546,0.0044164,0.732181,0.7378417,0.7260346,0.7334328,0.7352828
residual_deviance,0.0217681,0.0003343,0.0220043,0.0216952,0.0221902,0.0213408,0.0216098
rmse,0.1475366,0.0011326,0.1483385,0.147293,0.1489637,0.1460848,0.1470029
rmsle,0.0444696,0.00065,0.0448978,0.0439412,0.0452701,0.0437061,0.0445327


In [8]:
# Display the AutoML leaderboard (the run was configured with sort_metric='RMSE').
aml.leaderboard

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_5_AutoML_1_20240425_20155,0.147544,0.0217693,0.10733,0.0444752,0.0217693
StackedEnsemble_AllModels_6_AutoML_1_20240425_20155,0.147842,0.0218574,0.107618,0.0445533,0.0218574
StackedEnsemble_BestOfFamily_7_AutoML_1_20240425_20155,0.148571,0.0220735,0.108211,0.0447371,0.0220735
StackedEnsemble_BestOfFamily_6_AutoML_1_20240425_20155,0.14864,0.0220939,0.108088,0.044764,0.0220939
StackedEnsemble_AllModels_2_AutoML_1_20240425_20155,0.148802,0.0221422,0.108359,0.0448128,0.0221422
StackedEnsemble_BestOfFamily_3_AutoML_1_20240425_20155,0.148802,0.0221422,0.108361,0.0448125,0.0221422
StackedEnsemble_AllModels_4_AutoML_1_20240425_20155,0.14881,0.0221444,0.108381,0.0448134,0.0221444
StackedEnsemble_AllModels_3_AutoML_1_20240425_20155,0.148811,0.0221446,0.108383,0.0448137,0.0221446
StackedEnsemble_Best1000_1_AutoML_1_20240425_20155,0.148815,0.0221458,0.108384,0.0448148,0.0221458
StackedEnsemble_BestOfFamily_4_AutoML_1_20240425_20155,0.148856,0.0221581,0.108404,0.0448268,0.0221581


In [9]:
# Score the test features with the leaderboard's top model. The target was
# log1p-transformed before training, so these predictions are on that scale.
test_preds = aml.leader.predict(h2o.H2OFrame(test[features]))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [10]:
# Start from the sample submission so the `id` column and row order match the
# expected file layout.
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Pull the prediction column out explicitly as a plain array. Assigning the
# whole converted DataFrame to a single column only works while it has exactly
# one column, and goes through index alignment, which can silently produce NaN.
pred_log = test_preds.as_data_frame()['predict'].to_numpy()
assert len(pred_log) == len(sub), (
    f'prediction count {len(pred_log)} does not match submission rows {len(sub)}'
)

# Invert the log1p applied to the target before training, then clamp to [1, 29]
# (presumably the range of Rings seen in training; confirm against the data).
sub[TARGET] = np.expm1(pred_log).clip(1, 29)


with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
    pandas_df = h2o_df.as_data_frame()



In [11]:
# Write the submission file; index=False keeps the pandas row index out of the CSV.
sub.to_csv('submission.csv', index=False)

In [12]:
# Sanity check: print the first lines of the file that was just written.
!head submission.csv

id,Rings
90615,9.822501202721387
90616,9.68517676070098
90617,9.793030850895946
90618,10.513371273182862
90619,7.563179955782431
90620,9.522906936581714
90621,11.179117191614795
90622,6.298647416541355
90623,7.99118927119504
