In [None]:
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

In [None]:
# Connect to a running H2O cluster (or start a local one). import_file needs
# an active connection, and nothing else in this notebook establishes one.
h2o.init()

# Load the training and test sets as H2OFrames.
df_train = h2o.import_file('train.csv')
df_test = h2o.import_file('test.csv')

# Response column, and predictors = every other column of the training frame.
y = "score"
x = [col for col in df_train.columns if col != y]

# Convert the response to a categorical (factor) column.
df_train[y] = df_train[y].asfactor()

In [None]:
# Run AutoML for at most 15 models (max_models=15) with seed=1; runtime limits
# are left at the H2OAutoML defaults.
aml = H2OAutoML(max_models=15, seed=1)
aml.train(x=x, y=y, training_frame=df_train)

# Predict on the test frame with the top model of the AutoML leaderboard.
# NOTE: despite the "glm" in the variable name, the leader is whichever model
# ranks first and is not necessarily a GLM.
h2o_glm_preds = aml.leader.predict(df_test)

In [None]:
# Split the training frame into train & validation parts (split_frame's
# default ratio, fixed seed).
train, valid = df_train.split_frame(seed=1)

# GBM hyperparameters: the grid is the cartesian product of these value lists.
gbm_params1 = {
    'learn_rate': [0.01, 0.1],
    'max_depth': [3, 5, 9],
    'sample_rate': [0.8, 1.0],
    'col_sample_rate': [0.2, 0.5, 1.0],
}

# Train and validate a cartesian grid of GBMs; ntrees and seed are passed to
# train() and so apply to every model in the grid.
gbm_grid1 = H2OGridSearch(model=H2OGradientBoostingEstimator,
                          grid_id='gbm_grid1',
                          hyper_params=gbm_params1)
gbm_grid1.train(x=x, y=y,
                training_frame=train,
                validation_frame=valid,
                ntrees=100,
                seed=1)

# Get the grid results sorted by RMSE. RMSE is an error metric (lower is
# better), so sort ascending to put the best model first.
gbm_gridperf1 = gbm_grid1.get_grid(sort_by='rmse', decreasing=False)

# Grab the top GBM model, i.e. the one with the lowest RMSE.
best_gbm1 = gbm_gridperf1.models[0]

In [None]:
# Score the selected grid-search GBM against the test frame, then generate its
# predictions for that same frame.
# NOTE(review): computing metrics presumably requires df_test to contain the
# response column -- confirm that test.csv includes it.
best_gbm_perf1 = best_gbm1.model_performance(test_data=df_test)
h2o_grid_search_preds = best_gbm1.predict(test_data=df_test)

In [None]:
# Pull the AutoML leader's predictions out of H2O into a pandas DataFrame and
# write them to disk as a bare CSV (no header row, no index column).
glm_preds_dataframe = h2o_glm_preds.as_data_frame(use_pandas=True)
glm_preds_dataframe.to_csv(
    'glm_preds_submission.csv',
    index=False,
    header=False,
)

In [None]:
# Pull the grid-search GBM's predictions out of H2O into a pandas DataFrame and
# write them to disk as a bare CSV (no header row, no index column).
h2o_grid_search_preds_dataframe = h2o_grid_search_preds.as_data_frame(use_pandas=True)
h2o_grid_search_preds_dataframe.to_csv(
    'h2o_grid_search_preds_submission.csv',
    index=False,
    header=False,
)