In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
from typing import Tuple

In [None]:
# main flow
# Capture the wall-clock start time; the final cell subtracts it from the
# finish time to report how long the whole notebook took.
start_time = dt.datetime.now()
print("Started at ", start_time)

In [None]:
# Starting H2O

import h2o
print(h2o.__version__)  # log the installed H2O version with the run output
from h2o.automl import H2OAutoML

# Connect to the H2O backend used by the rest of the notebook. Per the H2O
# docs, h2o.init attaches to a running local server or starts a new one, and
# max_mem_size caps the memory of a newly started server (16 GB here) —
# NOTE(review): confirm the host actually has that much RAM available.
h2o.init(max_mem_size='16G')

In [None]:
# Environment switch passed to get_data_file_path: True selects the Kaggle
# competition input paths, False selects the local data/ folder.
in_kaggle = True


def get_data_file_path(
    is_in_kaggle: bool,
    kaggle_dir: str = '../input/tabular-playground-series-feb-2021',
    local_dir: str = 'data',
) -> Tuple[str, str, str]:
    """Return the train, test and sample-submission CSV paths.

    Args:
        is_in_kaggle: truthy when running inside the Kaggle competition
            environment, falsy when running locally.
        kaggle_dir: directory that holds the competition files on Kaggle.
            Defaults to the Tabular Playground Series Feb 2021 input folder.
        local_dir: directory that holds the same three files locally.

    Returns:
        ``(train_path, test_path, sample_submission_path)`` as strings,
        always joined with forward slashes.
    """
    # Local import keeps the cell self-contained. posixpath always joins with
    # '/', so the returned strings are identical on every OS and a trailing
    # slash on the directory does not produce a doubled separator.
    import posixpath

    data_dir = kaggle_dir if is_in_kaggle else local_dir

    train_path = posixpath.join(data_dir, 'train.csv')
    test_path = posixpath.join(data_dir, 'test.csv')
    sample_submission_path = posixpath.join(data_dir, 'sample_submission.csv')

    return train_path, test_path, sample_submission_path

In [None]:
%%time
# get the training set and labels
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

df_train = pd.read_csv(train_set_path)
df_test = pd.read_csv(test_set_path)

subm = pd.read_csv(sample_subm_path)

# list of basic raw features
feature_list = [col for col in df_train.columns if col.startswith('cont')]

In [None]:
# Drop the 'id' column from the train and test sets so it is not handed to
# H2O as a predictor; errors='ignore' keeps this cell safe to re-run once the
# column is already gone.
df_train = df_train.drop(columns=['id'], errors='ignore')
df_test = df_test.drop(columns=['id'], errors='ignore')

In [None]:
%%time
# Convert the pandas DataFrames into H2OFrames; these are the objects passed
# to AutoML for training (`train`) and for prediction (`test`) below.
train = h2o.H2OFrame(df_train)
test = h2o.H2OFrame(df_test)

In [None]:
%%time
# Predictors: every column of the test frame ('id' was dropped earlier; the
# response is assumed to be present only in the training data — TODO confirm).
x = test.columns
# Name of the response column AutoML is trained to predict.
y = 'target'

In [None]:
# Run AutoML with two stopping limits: at most 200 models (max_models) and at
# most 14400 s = 4 h of runtime (max_runtime_secs); seed=47 fixes the random seed.
# (Per the H2O docs, when no limit is given the run is capped at 1 hour.)
# NOTE(review): this comment used to say "2000 models / 12 h", which did not
# match the arguments below — confirm the intended budget.
aml = H2OAutoML(max_models=200, seed=47, max_runtime_secs=14400)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# View the AutoML Leaderboard
# `lb` holds the leaderboard frame; the head() call is the last expression of
# the cell, so its result is what the notebook renders.
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print the entire leaderboard instead of default (top 10 rows)

In [None]:
# The leader model is stored here
# Left as a bare expression so the notebook displays the model's summary.
aml.leader

In [None]:
# If you need to generate predictions on a test set, you can make
# predictions directly on the `"H2OAutoML"` object, or on the leader
# model object directly

# Score the test frame through the AutoML object; the result is kept in
# `preds` and turned into the submission column further down.
preds = aml.predict(test)

In [None]:
# Preview the predictions: pull the H2O frame into pandas and flatten its
# values into a 1-D NumPy array (assumes `preds` has a single prediction
# column — TODO confirm).
preds.as_data_frame().values.flatten()

In [None]:
# Put the flattened predictions into the sample submission's 'target' column
# and save the file without the pandas index.
# NOTE(review): this relies on the sample submission rows being in the same
# order as the test rows — confirm.
subm['target'] = preds.as_data_frame().values.flatten()
subm.to_csv('h2o_automl_baseline_submission.csv', index=False)

In [None]:
# Final bookkeeping: report the finish time and the total time elapsed since
# `start_time` was recorded at the top of the notebook.
print('We are done. That is all, folks!')
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
# timedelta covering the whole run, from the first timing cell to here
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)