In [None]:
!pip install h2o

# Data Preparation

In [None]:
import pandas as pd 
import numpy as np 
import h2o
from h2o.automl import H2OAutoML
import matplotlib.pyplot as plt

In [None]:
# Read the training set and drop 'id' so it is not picked up as a predictor
# when the feature list is later built from the frame's columns.
df_train = (
    pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv')
    .drop(columns='id')
)
df_train.head()

# H2O

In [None]:
# Connect to a local H2O server, launching one if needed.
#   nthreads=-1     : thread count used when a new H2O server is launched
#   max_mem_size=12 : memory cap for the server, in gigabytes
h2o.init(nthreads=-1, max_mem_size=12)

Convert the pandas DataFrame into an H2OFrame, the data format H2O can read.

In [None]:
# Convert the pandas training DataFrame into an H2OFrame for use by H2O.
train = h2o.H2OFrame(df_train)

In [None]:
# Response column name, and the predictor list: every column of the training
# frame except the response.
y = "target"
x = train.columns
x.remove(y)  # raises ValueError if the response column is missing

In [None]:
# Run AutoML with 5-fold cross-validation, limited by wall-clock time
# (3 hours) rather than by a model count.
# NOTE(review): per the H2O AutoML docs, `seed` only gives reproducible runs
# when the search is bounded by `max_models`; with a time budget the set of
# trained models can differ between runs -- confirm this is acceptable.
aml = H2OAutoML(
    nfolds=5,
    balance_classes=False,
    max_runtime_secs=3 * 60 * 60,
    max_models=None,
    seed=42,
)
aml.train(x=x, y=y, training_frame=train)

# Model Evaluation

In [None]:
# Display the AutoML leaderboard, asking head() for as many rows as the
# leaderboard contains so that every trained model is listed.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

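Check the feature importance of the leading model. Note that stacked ensembles do not report variable importances, so the plot is only meaningful for a single (non-ensemble) model.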

In [None]:
# The AutoML leader is frequently a stacked ensemble, and stacked ensembles do
# not provide variable importances. Plot the importances of the best-ranked
# non-ensemble model instead (this is the leader itself whenever the leader
# is not an ensemble).
ranked_ids = aml.leaderboard['model_id'].as_data_frame().iloc[:, 0]
varimp_model = next(
    (h2o.get_model(mid) for mid in ranked_ids if "StackedEnsemble" not in mid),
    aml.leader,  # fallback: leaderboard holds nothing but ensembles
)
varimp_model.varimp_plot(num_of_features=10)

Check the metalearner coefficient assigned to each base model in the stacked ensemble.

In [None]:
# Leaderboard model ids, best-ranked first.
model_ids = aml.leaderboard['model_id'].as_data_frame().iloc[:, 0].tolist()

# First "all models" stacked ensemble on the leaderboard, or None if absent.
se_id = next((mid for mid in model_ids if "StackedEnsemble_AllModels" in mid), None)

if se_id is None:
    # Guard instead of failing with an IndexError when the run produced no
    # such ensemble.
    print("No StackedEnsemble_AllModels model on the leaderboard; "
          "skipping the metalearner coefficient plot.")
else:
    se = h2o.get_model(se_id)
    # The metalearner is the model that combines the base models' predictions;
    # its standardized coefficients show how much each base model contributes.
    metalearner = h2o.get_model(se.metalearner()['name'])
    metalearner.std_coef_plot(num_of_features=20)

# Submission

Make predictions on the test set and write the submission file.

In [None]:
# Read the test set; df_test keeps the 'id' column for the submission file,
# while df_test_x holds only the feature columns.
df_test = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv')
df_test_x = df_test.drop(columns='id')

# Convert the features to an H2OFrame and score them with the AutoML leader.
test = h2o.H2OFrame(df_test_x)
preds = aml.leader.predict(test)

In [None]:
# Build the submission directly in pandas: pull the 'predict' column back from
# H2O and pair it with the test ids. This avoids column-binding the whole test
# frame to the predictions, and avoids exporting a CSV from the H2O server only
# to read it back and overwrite it.
predicted = preds['predict'].as_data_frame().iloc[:, 0]
assert len(predicted) == len(df_test), "prediction count does not match test rows"

submission = pd.DataFrame({
    'id': df_test['id'].to_numpy(),
    'target': predicted.to_numpy(),  # paired by position, not by index label
})
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame as a quick sanity check.
submission.head()