In [None]:
# Inspired by https://www.kaggle.com/tunguz/jan-21-tps-h2o-automl

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
from typing import Tuple



In [None]:
# main flow
# Capture the wall-clock start time; the final cell subtracts it from the
# finish time to report how long the whole notebook run took.
start_time = dt.datetime.now()
print("Started at ", start_time)

In [None]:
# Starting H2O
# Import the H2O client and print its version so the run log records which
# release produced the results.

import h2o
print(h2o.__version__)
from h2o.automl import H2OAutoML

# Initialise H2O, requesting a 16G memory limit via `max_mem_size`.
# NOTE(review): lower this value if the host has less RAM available.
h2o.init(max_mem_size='16G')

In [None]:
# Data-location switch passed to get_data_file_path: True selects the Kaggle
# competition input paths, False selects the local `data/` directory.
in_kaggle = True


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    """Resolve the locations of the competition CSV files.

    Args:
        is_in_kaggle: When truthy, point at the Kaggle competition input
            directory; otherwise point at the local ``data`` directory.

    Returns:
        A ``(train_path, test_path, sample_submission_path)`` tuple of
        relative path strings.
    """
    # Only the containing directory differs between the two environments.
    base_dir = '../input/tabular-playground-series-jan-2021' if is_in_kaggle else 'data'
    file_names = ('train.csv', 'test.csv', 'sample_submission.csv')
    return tuple(f'{base_dir}/{file_name}' for file_name in file_names)

In [None]:
%%time
# get the training set and labels
# Resolve the three CSV locations for the current environment (Kaggle vs local).
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

# Load the raw train and test tables into pandas.
df_train = pd.read_csv(train_set_path)
df_test = pd.read_csv(test_set_path)

# Load the sample submission file from the resolved path.
subm = pd.read_csv(sample_subm_path)

# list of basic raw features: every training column whose name starts with 'cont'
feature_list = [col for col in df_train.columns if col.startswith('cont')]

In [None]:
%%time
# add additional features pre-selected in the previous featurewiz feature importance experiments
# Feature specifications, listed in the order in which the columns are created.
# Integers are the N of a `contN` column; pairs are (left, right) operands and
# produce a column named `cont<left>_<op>_cont<right>`.
_SQUARED_SPECS = (2, 3, 9, 12, 14, 7, 13, 8)
_PLUS_SPECS = (
    (11, 3), (13, 3), (1, 4), (5, 11), (3, 8),
    (3, 14), (4, 3), (4, 2), (14, 11), (14, 4),
)
_MINUS_SPECS = (
    (10, 12), (13, 2), (11, 10), (4, 11), (1, 6), (5, 4), (13, 10),
    (13, 6), (10, 6), (13, 8), (14, 13), (9, 13), (7, 13), (12, 8),
    (2, 3), (8, 4), (12, 6), (5, 2), (14, 8), (10, 9), (1, 9),
    (9, 12), (6, 11), (6, 9), (14, 4), (7, 11), (1, 10), (12, 11),
)
_PROD_SPECS = (
    (4, 3), (4, 2), (11, 4), (11, 3), (13, 3),
    (12, 5), (14, 11), (8, 3), (14, 3),
)
_QQ_SPECS = (
    (12, 10), (10, 6), (1, 6), (13, 10), (11, 13), (4, 11), (8, 10),
    (9, 13), (13, 2), (5, 9), (7, 2), (1, 12), (3, 11), (5, 4),
    (10, 9), (7, 9), (3, 2), (1, 9), (14, 8), (8, 2), (5, 14),
    (6, 12), (11, 14), (12, 11), (1, 10), (4, 8),
)
_DIV2_SPECS = (
    (10, 12), (7, 1), (10, 11), (10, 6), (12, 13), (13, 8), (9, 2),
    (7, 4), (9, 3), (13, 1), (10, 13), (11, 3), (7, 10), (10, 1),
    (8, 11), (11, 9), (9, 6), (4, 14), (4, 5), (9, 1), (8, 4),
    (14, 5), (9, 7), (8, 14), (6, 12), (9, 8), (6, 7), (10, 9),
    (7, 12), (11, 12), (11, 7), (12, 10), (11, 10), (6, 10), (8, 13),
    (4, 8), (13, 10), (5, 4), (10, 7), (3, 9), (9, 11), (1, 9),
    (3, 11), (3, 5), (14, 8), (9, 10), (12, 11),
)


def add_extra_features(
    train: pd.DataFrame,
    test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Add the engineered feature columns to the train and test frames.

    Both frames are modified in place (new columns are appended) and are also
    returned for convenience. Each frame must contain the raw columns
    ``cont1`` .. ``cont14``.

    Feature families, where ``a`` and ``b`` are raw ``contN`` columns:
        * ``contA_squared``      = a ** 2
        * ``contA_plus_contB``   = a + b
        * ``contA_minus_contB``  = a - b
        * ``contA_prod_contB``   = a * b
        * ``contA_qq_contB``     = (a + b) * (a - b)
        * ``contA_div2_contB``   = (a - b) / (a + b + small_val)

    Args:
        train: Training frame; mutated in place.
        test: Test frame; mutated in place.

    Returns:
        The same ``(train, test)`` objects that were passed in.

    Raises:
        KeyError: If a required ``contN`` column is missing from either frame.
    """
    # Small offset added to the `div2` denominator so an exact-zero sum of the
    # two operands does not divide by zero.
    small_val = 0.00001

    for df in (train, test):
        for i in _SQUARED_SPECS:
            df[f'cont{i}_squared'] = df[f'cont{i}'] ** 2
        for a, b in _PLUS_SPECS:
            df[f'cont{a}_plus_cont{b}'] = df[f'cont{a}'] + df[f'cont{b}']
        # Every column is derived from the operands its name states; this also
        # corrects `cont5_minus_cont2`, which previously subtracted cont12.
        for a, b in _MINUS_SPECS:
            df[f'cont{a}_minus_cont{b}'] = df[f'cont{a}'] - df[f'cont{b}']
        for a, b in _PROD_SPECS:
            df[f'cont{a}_prod_cont{b}'] = df[f'cont{a}'] * df[f'cont{b}']
        for a, b in _QQ_SPECS:
            left, right = df[f'cont{a}'], df[f'cont{b}']
            df[f'cont{a}_qq_cont{b}'] = (left + right) * (left - right)
        for a, b in _DIV2_SPECS:
            left, right = df[f'cont{a}'], df[f'cont{b}']
            df[f'cont{a}_div2_cont{b}'] = (left - right) / (left + right + small_val)

    return train, test

# add extra features
df_train, df_test = add_extra_features(df_train, df_test)

# subset the best features in train and test sets
relevant_features = [
    'cont12_qq_cont10', 'cont2_squared', 'cont3_squared', 'cont14_squared',
    'cont7_squared', 'cont4_plus_cont3', 'cont4_qq_cont11', 'cont10', 'cont1_minus_cont6',
    'cont1_plus_cont4', 'cont3_qq_cont11', 'cont13_minus_cont8', 'cont10_minus_cont6', 'cont7_div2_cont1',
    'cont4_prod_cont2', 'cont10_div2_cont7', 'cont12_squared',
    'cont11_div2_cont9', 'cont13_minus_cont10', 'cont14_minus_cont13', 'cont9_qq_cont13',
    'cont9_div2_cont3', 'cont5_div2_cont4', 'cont4_qq_cont8', 'cont12_minus_cont8',
    'cont14_plus_cont4', 'cont1_qq_cont9', 'cont10_qq_cont9', 'cont3_qq_cont2',
    'cont1', 'cont2', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont11',
    'cont12', 'cont13', 'cont14', 'cont11_minus_cont10', 'cont1_qq_cont10', 'cont10_div2_cont9',
    'cont9_div2_cont1', 'cont14_div2_cont5',
]

# drop irrelevant features from train and test sets
# The features and the target are selected in a single indexing step, and
# explicit copies are taken, so no column is assigned into a frame derived
# from another one (the pattern that triggers pandas' SettingWithCopyWarning).
# Column order is unchanged: the relevant features followed by 'target'.
# NOTE: this rebinds df_train/df_test to frames without some raw columns
# (e.g. 'cont3'), so re-run the data-loading cell before re-running this one.
target_values = df_train['target']
df_train = df_train[relevant_features + ['target']].copy()
df_test = df_test[relevant_features].copy()

In [None]:
%%time
# Wrap the pandas frames in H2OFrame objects, the input type used by the
# AutoML training and prediction cells below.
train = h2o.H2OFrame(df_train)
test = h2o.H2OFrame(df_test)

In [None]:
# Predictors: every column of the test frame, which holds only the selected
# features and no target column.
x = test.columns
# Response column to be predicted.
y = 'target'

In [None]:
# Run AutoML for up to 200 base models with a runtime budget of 14400 seconds
# (4 hours); per the original note, the runtime limit is 1 hour by default.
# The fixed seed is passed to H2OAutoML for reproducibility.
aml = H2OAutoML(max_models=200, seed=47, max_runtime_secs=14400)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# View the AutoML Leaderboard
lb = aml.leaderboard
# Passing the full row count shows every leaderboard entry.
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# The leader model is stored here; as the last expression of the cell it is
# displayed as the cell output.
aml.leader

In [None]:
# If you need to generate predictions on a test set, you can make
# predictions directly on the `"H2OAutoML"` object, or on the leader
# model object directly

# Here the predictions are made through the AutoML object itself.
preds = aml.predict(test)

In [None]:
# Preview the predictions: convert the H2O result to a pandas frame and
# flatten its values into a one-dimensional array.
preds.as_data_frame().values.flatten()

In [None]:
# Build the submission from the sample file. The path comes from
# `sample_subm_path` (resolved by get_data_file_path in the data-loading cell)
# rather than a hard-coded Kaggle location, so the cell also works when
# `in_kaggle` is False.
sample_submission = pd.read_csv(sample_subm_path)
# Overwrite the placeholder target with the flattened model predictions.
sample_submission['target'] = preds.as_data_frame().values.flatten()
sample_submission.to_csv('h2o_automl_submission.csv', index=False)

In [None]:
# Report completion and the total wall-clock duration of the run, measured
# from `start_time` captured in the first cell of the main flow.
print('We are done. That is all, folks!')
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)