In [14]:
import h2o
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from sklearn.cross_validation import train_test_split

In [9]:
df_train = pd.read_csv("./new/train_data.csv")
df_test = pd.read_csv("./new/test_data.csv")

In [10]:
df_train.drop("indi_nonsegment_ratio_city", axis = 1, inplace = True)
df_train.drop("indi_segment_ratio_city", axis = 1, inplace = True)
df_train.drop("sum_wt_ratio_city", axis = 1, inplace = True)

df_test.drop("indi_nonsegment_ratio_city", axis = 1, inplace = True)
df_test.drop("indi_segment_ratio_city", axis = 1, inplace = True)
df_test.drop("sum_wt_ratio_city", axis = 1, inplace = True)

In [5]:
def run_gbm(x_train, label_train, x_valid = None, label_valid = None):

    h2o.remove_all()                          
    h2o.init(max_mem_size = "2G")             
    
    gbm = H2OGradientBoostingEstimator(
            ntrees=1000,
            learn_rate=0.3,
            max_depth=10,
            sample_rate=0.7,
            col_sample_rate=0.7,
            stopping_rounds=2,
            stopping_tolerance=0.001, #10-fold increase in threshold as defined in rf_v1
            score_each_iteration=True,
            model_id="gbm_starter_1",
            seed=2000000
        )

    hf_train = h2o.H2OFrame(pd.concat([x_train, label_train], axis = 1))
    
    if x_valid is not None:
        hf_valid = h2o.H2OFrame(pd.concat([x_valid, label_valid], axis = 1))
        gbm.train(hf_train.col_names[:-1], hf_train.col_names[-1], training_frame = hf_train, validation_frame = hf_valid)
    else:
        gbm.train(hf_train.col_names[:-1], hf_train.col_names[-1], training_frame = hf_train)
        
    return gbm

In [11]:
#Clean useless columns
df_label = df_train.pop("segment")
df_train.drop("ID", axis = 1, inplace=True)

#Validation split
x_train, x_valid, label_train, label_valid = train_test_split(df_train, df_label, test_size=0.2, random_state=4242, stratify = df_label)

test_ids = df_test.pop("ID")

In [17]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (build 1.8.0_121-8u121-b13-0ubuntu1.16.04.2-b13); OpenJDK 64-Bit Server VM (build 25.121-b13, mixed mode)
  Starting server from /usr/local/lib/python2.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpiWt080
  JVM stdout: /tmp/tmpiWt080/h2o_belongtech_started_from_python.out
  JVM stderr: /tmp/tmpiWt080/h2o_belongtech_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,06 secs
H2O cluster version:,3.10.4.8
H2O cluster version age:,22 days
H2O cluster name:,H2O_from_python_belongtech_vnfu2a
H2O cluster total nodes:,1
H2O cluster free memory:,1.714 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [21]:
gbm_model = run_gbm(x_train, label_train, x_valid, label_valid)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 mins 18 secs
H2O cluster version:,3.10.4.8
H2O cluster version age:,22 days
H2O cluster name:,H2O_from_python_belongtech_vnfu2a
H2O cluster total nodes:,1
H2O cluster free memory:,1.567 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [29]:
gbm_preds = gbm_model.predict(h2o.H2OFrame(df_test))

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


In [34]:
type(gbm_preds)

h2o.frame.H2OFrame

In [35]:
sub = pd.DataFrame()
sub['ID'] = test_ids
sub['segment'] = gbm_preds.as_data_frame()
sub.to_csv("./subs/gbm_1.csv", index=False)   

TypeError: DataFrame constructor called with incompatible data and dtype: cannot copy sequence with size 100000 to array axis with dimension 1