In [1]:
import pandas as pd
import numpy as np
import h2o
from h2o.automl import H2OAutoML

In [2]:
from h2o.estimators import H2OXGBoostEstimator

In [3]:
# Start (or attach to) a local H2O cluster, capping its memory at 20 GB.
h2o.init(max_mem_size="20g")

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.9.1" 2020-11-04; OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.6/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpok0eedo5
  JVM stdout: /tmp/tmpok0eedo5/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpok0eedo5/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,"14 days, 9 hours and 48 minutes"
H2O_cluster_name:,H2O_from_python_unknownUser_juoyp9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,20 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [4]:
# XGBoost will be used: confirm the estimator is available in this H2O installation.
H2OXGBoostEstimator.available()

True

In [5]:
# Load the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [6]:
# Convert both pandas DataFrames into H2OFrames so that H2O can work with them.
htrain, htest = map(h2o.H2OFrame, (train, test))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
# Response variable (y) and the variables used for prediction (x).
y = 'y'
# Start from every column of the training frame, then drop the response.
# NOTE(review): any identifier-like column in the data would stay in x as a
# predictor — confirm this is intended.
x = htrain.columns
x.remove(y)

In [8]:
# Binary classification needs a categorical (factor) response, so convert
# the response column and store it back into the training frame.
response_factor = htrain[y].asfactor()
htrain[y] = response_factor

In [9]:
# AutoML: search bounded by a model-count cap and a wall-clock budget (seconds),
# with a fixed seed.
# NOTE(review): a run limited by max_runtime_secs may not reproduce exactly even
# with a seed, since the models trained depend on available compute — confirm
# against the H2O AutoML documentation.
automl_settings = {
    "max_models": 2000,
    "seed": 2000,
    "max_runtime_secs": 3600,
}
aml = H2OAutoML(**automl_settings)
aml.train(x=x, y=y, training_frame=htrain)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [10]:
# Show the results: request as many leaderboard rows as the leaderboard contains.
# NOTE(review): the recorded output lists only 10 rows; if the frame display
# truncates, converting with as_data_frame() may be needed to see every model —
# confirm.
lb = aml.leaderboard
leaderboard_size = lb.nrows
lb.head(rows=leaderboard_size)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_grid__1_AutoML_20201202_012646_model_10,0.850092,0.201409,0.441421,0.304152,0.23632,0.0558473
GBM_grid__1_AutoML_20201202_012646_model_3,0.848963,0.201573,0.443256,0.320249,0.236146,0.0557651
StackedEnsemble_BestOfFamily_AutoML_20201202_012646,0.847386,0.211175,0.444605,0.291822,0.238335,0.0568035
GBM_grid__1_AutoML_20201202_012646_model_36,0.846424,0.202864,0.437528,0.320152,0.236758,0.0560545
StackedEnsemble_AllModels_AutoML_20201202_012646,0.846188,0.210294,0.44536,0.305735,0.238091,0.0566874
XGBoost_3_AutoML_20201202_012646,0.845182,0.203035,0.4358,0.313304,0.237105,0.0562188
GBM_grid__1_AutoML_20201202_012646_model_47,0.845018,0.203885,0.436456,0.324014,0.237178,0.0562536
GBM_grid__1_AutoML_20201202_012646_model_16,0.844431,0.204824,0.425299,0.316122,0.238132,0.056707
GBM_grid__1_AutoML_20201202_012646_model_43,0.844177,0.204097,0.43703,0.313494,0.236879,0.0561118
GBM_grid__1_AutoML_20201202_012646_model_38,0.843738,0.204328,0.43244,0.308666,0.237495,0.0564037




In [11]:
# Score the test frame with the AutoML leader model and bring the result into
# pandas in one step, so `preds` is only ever bound to a pandas DataFrame
# (previously it was first an H2OFrame and then rebound to a DataFrame).
# Per the recorded output the frame has the columns predict, p0 and p1.
preds = aml.leader.predict(htest).as_data_frame()
# End the cell with the bare expression so the notebook renders the frame with
# its rich (HTML) display instead of plain-text print().
preds

gbm prediction progress: |████████████████████████████████████████████████| 100%




       predict        p0        p1
0            1  0.258587  0.741413
1            0  0.873946  0.126054
2            0  0.969856  0.030144
3            0  0.996018  0.003982
4            0  0.962293  0.037707
...        ...       ...       ...
18045        0  0.982271  0.017729
18046        0  0.991325  0.008675
18047        0  0.909579  0.090421
18048        0  0.995167  0.004833
18049        0  0.904585  0.095415

[18050 rows x 3 columns]


In [12]:
# Submission step: write the predictions to a headerless CSV, with the row
# index as the first column.
# NOTE(review): every column of `preds` is written (the recorded output shows
# predict, p0, p1) — confirm this matches the expected submission layout.
preds.to_csv(
    "h2o.csv",
    header=False,
    index=True,
)