In [None]:
import numpy as np  
import pandas as pd  
import datetime as dt
from typing import Tuple

# H2O and its AutoML entry point.
import h2o
from h2o.automl import H2OAutoML

# Record which H2O version produced the results in this notebook.
print(h2o.__version__)

# Initialise the H2O backend for this session; max_mem_size is passed
# through as '16G'.
h2o.init(max_mem_size='16G')

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# Data

In [None]:

# Read the training CSV into pandas, report its (rows, columns) shape,
# and preview the first rows as the cell output.
train_csv = '/kaggle/input/tabular-playground-series-mar-2021/train.csv'
df_train = pd.read_csv(train_csv)
print(df_train.shape)
df_train.head()


In [None]:
# Read the test CSV into pandas, report its (rows, columns) shape,
# and preview the first rows as the cell output.
test_csv = '/kaggle/input/tabular-playground-series-mar-2021/test.csv'
df_test = pd.read_csv(test_csv)
print(df_test.shape)
df_test.head()

In [None]:
# Read the sample submission; it is reused later as the template that the
# predictions are written into.
sample_submission_csv = '/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv'
df_sub = pd.read_csv(sample_submission_csv)
print(df_sub.shape)
df_sub.head()

In [None]:
# Remove the 'id' column from both frames. errors='ignore' makes this a no-op
# when the column is already absent, so the cell can be re-run safely.
df_train = df_train.drop(columns=['id'], errors='ignore')
df_test = df_test.drop(columns=['id'], errors='ignore')

# Show the resulting shapes: train first, then test.
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# Cast the target to bool so that AutoML treats the task as classification;
# otherwise it would perform a regression.
df_train['target'] = df_train['target'].astype(bool)


# H2O AutoML

In [None]:
%%time

# Name of the response column AutoML is asked to predict.
y = 'target'

# Convert the pandas DataFrames into H2OFrames.
train = h2o.H2OFrame(df_train)
test = h2o.H2OFrame(df_test)

# Predictor columns are taken from the test frame
# (presumably it carries no 'target' column -- confirm against test.csv).
x = test.columns

In [None]:
# Configure AutoML with a cap of 50 models and a 3-hour time budget
# (3 * 3600 seconds), using a fixed seed, then train it on the training
# frame with predictors `x` and response `y`.
aml = H2OAutoML(
    max_models=50,
    max_runtime_secs=3 * 3600,
    seed=47,
)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# Fetch the AutoML leaderboard and display every row of it
# (head() would otherwise show only the default top 10 rows).
lb = aml.leaderboard
leaderboard_rows = lb.nrows
lb.head(rows=leaderboard_rows)

In [None]:
# Show the leader model of the AutoML run as this cell's output.
aml.leader

# Submit

In [None]:
# Score the test H2OFrame with the trained AutoML object; the result is kept
# in `preds` for building the submission.
preds = aml.predict(test)

In [None]:

# Pull the 'True' column of the predictions into pandas and flatten it to a
# 1-D array (presumably the predicted probability of the True class -- confirm
# against the columns of `preds`).
true_column_values = preds['True'].as_data_frame().values.flatten()
df_sub['target'] = true_column_values

# Write the submission file without the pandas index column.
df_sub.to_csv('h2o_automl_baseline_submission.csv', index=False)

df_sub.head()