In [1]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import h2o
from h2o.automl import H2OAutoML

In [2]:
#data cleaning and feature engineering 
def get_name_prefix(data):
    """
    Add a numeric 'Prefix' column to data, derived from the title in 'Name'.

    Every row starts at 1.0; rows whose Name contains 'Miss.', 'Mrs.' or
    'Mr.' (literal substring match, checked in that order) are set to 2, 3
    and 4 respectively. A later match overwrites an earlier one.

    @param data pandas DataFrame with a string 'Name' column; modified in place
    """
    title_codes = (('Miss.', 2), ('Mrs.', 3), ('Mr.', 4))

    # Default code for names carrying none of the titles above.
    data['Prefix'] = pd.Series(np.ones(data.shape[0]), index=data.index)

    for title, code in title_codes:
        has_title = data.Name.str.contains(title, regex=False)
        data.loc[has_title, 'Prefix'] = code
    
# https://stackoverflow.com/a/42523230
def one_hot(df, cols):
    """
    Replace each listed column with its one-hot (dummy) columns.

    For every name in cols the column is removed and the indicator columns
    produced by pd.get_dummies (named '<col>_<value>') are appended on the
    right, in the order the names are given. The input frame is not modified.

    @param df pandas DataFrame
    @param cols an iterable (list, tuple, ...) of column names to encode
    @return a DataFrame with one-hot encoding
    """
    encoded = df
    for col in cols:
        dummies = pd.get_dummies(encoded[col], prefix=col, drop_first=False)
        # drop() returns a new frame; `del df[col]` would remove the column
        # from the caller's DataFrame on the first iteration.
        encoded = pd.concat([encoded.drop(col, axis=1), dummies], axis=1)
    return encoded

def normalize(df, mean, std):
    """
    Standardize columns of df in place: (value - mean) / std, column by column.

    Only the columns named in mean's index are touched; each one is scaled
    with its own mean and standard deviation.

    @param df pandas DataFrame, modified in place
    @param mean pandas Series of column values mean, indexed by column name
    @param std pandas Series of column values standard deviation, indexed by
               the same column names as mean
    """
    for col in mean.index:
        # Look the statistics up by label so every column uses its own
        # values rather than those of the first column.
        df[col] = (df[col] - mean[col]) / std[col]

def process_data(data):
    """
    Turn a raw Titanic passenger frame into an all-float feature frame.

    Steps: derive 'Prefix' from 'Name', drop 'Ticket' and 'Name', encode
    'Sex' (male -> 1, anything else -> 0), encode 'Cabin' by its first letter
    (A..G, T -> 1..8, missing -> '0'), encode 'Embarked' (C/Q/S -> 1/2/3,
    missing -> 0), fill every remaining missing value with -1, one-hot encode
    the categorical columns and cast everything to float.

    The input frame is left untouched; all work happens on a copy.

    @param data pandas DataFrame with at least the columns Name, Ticket, Sex,
                Cabin, Embarked and Pclass
    @return a new all-float DataFrame of features
    """
    cabin_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
    embarked_codes = {'C': 1, 'Q': 2, 'S': 3}

    # Copy first: get_name_prefix() writes its column in place, and the
    # caller's raw frame should survive a re-run of this function.
    data = data.copy()

    # get prefix data
    get_name_prefix(data)
    # remove name and ticket
    data = data.drop(['Ticket', 'Name'], axis=1)

    # sex: 1 for 'male', 0 for everything else (including missing)
    data['Sex'] = (data['Sex'] == 'male').astype(int)

    # cabin: assign the result back instead of calling fillna(inplace=True)
    # on `data.Cabin`, which is a chained in-place update that pandas does
    # not guarantee to write through to `data`.
    cabin = data['Cabin'].fillna('0')
    # Known deck letters become their code; anything else (including the
    # '0' placeholder for missing cabins) is kept as-is.
    data['Cabin'] = [cabin_codes.get(str(value)[:1], value) for value in cabin]

    # embarked: missing -> 0, known ports -> code, anything else unchanged
    data['Embarked'] = [
        0 if pd.isna(port) else embarked_codes.get(port, port)
        for port in data['Embarked']
    ]

    # Remaining gaps (e.g. in numeric columns) are flagged with -1.
    data = data.fillna(-1)

    data = one_hot(data, ('Pclass', 'Sex', 'Cabin', 'Embarked', 'Prefix'))
    return data.astype(float)

# Load the raw train/test CSVs and push both through the same feature pipeline.
train_raw = pd.read_csv('../data/titanic/train.csv')
test_raw = pd.read_csv('../data/titanic/test.csv')

train, test = process_data(train_raw), process_data(test_raw)

# Scaling statistics come from the training set only and are reused for test.
scaled_cols = ['Age', 'Fare', 'SibSp', 'Parch']
train_numeric = train[scaled_cols]
data_mean = train_numeric.mean(axis=0)
data_std = train_numeric.std(axis=0)

for frame in (train, test):
    normalize(frame, data_mean, data_std)

# Give both frames the same set of columns; a column present in only one
# frame is created in the other and filled with 0.
test, train = test.align(train, axis=1, fill_value=0)

In [3]:
# Start H2O, train AutoML on the prepared training frame, predict on the test
# frame and write the submission CSV.
# NOTE: this cell rebinds `train` and `test` from pandas DataFrames to
# H2OFrames, so re-run the preprocessing cell before running it again.

# Alternative: connect to a specific, already running server instead.
# h2o.init(ip="localhost", port="8080")

# With no arguments the client looks for an instance on localhost:54321 and
# launches a local server when none is found (see the cell output).
h2o.init()

# load the pandas frames as H2O frames
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)

# keep the test PassengerId column for the submission file, then drop it from
# both frames so it is not part of the predictor list built below
passId = test['PassengerId']
train = train.drop('PassengerId',axis =1)
test = test.drop('PassengerId',axis =1)

# identify predictors and label: x is every remaining training column except y
x = train.columns
y = 'Survived'
x.remove(y)

# for binary classification, labels should be a factor
train[y] = train[y].asfactor()

# Run AutoML. The run is bounded by max_runtime_secs and max_models; seed is
# fixed; nfolds=10 presumably requests 10-fold cross-validation (see the
# AutoML documentation linked at the end of this cell).
aml_ti = H2OAutoML(max_runtime_secs= 120,max_models= 10, seed= 7,nfolds= 10)
aml_ti.train(x = x, y = y,
          training_frame = train)
          
# prediction with the leader model of the AutoML run
pred = aml_ti.leader.predict(test)

# save predict results to submission form
pred_df = pred.as_data_frame()
# `predict` here is the column of that name in the prediction frame
pred_res = pred_df.predict
passId_df = passId.as_data_frame()
# ignore_index=True discards the original column labels, so they are set
# explicitly on the next line
res_ti = pd.concat([passId_df,pred_res],axis=1,ignore_index = True)
res_ti.columns = ['PassengerId','Survived']
res_ti.to_csv('mypred.csv',index=False)

# AutoML documentation:
#http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.7.0_201"; OpenJDK Runtime Environment (IcedTea 2.6.16) (Alpine 7.201.2.6.16-r0); OpenJDK 64-Bit Server VM (build 24.201-b00, mixed mode)
  Starting server from /usr/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpn5gemxj7
  JVM stdout: /tmp/tmpn5gemxj7/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpn5gemxj7/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Etc/GMT
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.6
H2O cluster version age:,"14 days, 13 hours and 9 minutes"
H2O cluster name:,H2O_from_python_unknownUser_kgbogi
H2O cluster total nodes:,1
H2O cluster free memory:,592 Mb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


In [4]:
# Check the AutoML leaderboard: one row per trained model with its metrics
# (auc, logloss, mean_per_class_error, rmse, mse in the output below).
# The bare name on the last line makes the notebook display the frame.
lb_ti = aml_ti.leaderboard
lb_ti

model_id,auc,logloss,mean_per_class_error,rmse,mse
GBM_1_AutoML_20190328_093750,0.872032,0.413346,0.179968,0.356766,0.127282
GBM_4_AutoML_20190328_093750,0.867606,0.418072,0.178027,0.356861,0.12735
GBM_2_AutoML_20190328_093750,0.867108,0.41623,0.180376,0.35647,0.127071
StackedEnsemble_AllModels_AutoML_20190328_093750,0.866927,0.415035,0.18282,0.358294,0.128375
StackedEnsemble_BestOfFamily_AutoML_20190328_093750,0.866498,0.414782,0.181167,0.35833,0.1284
GBM_3_AutoML_20190328_093750,0.865223,0.420427,0.177644,0.357952,0.128129
GLM_grid_1_AutoML_20190328_093750_model_1,0.856344,0.442253,0.202449,0.373309,0.13936
GBM_5_AutoML_20190328_093750,0.852709,0.445151,0.213306,0.374165,0.139999
DRF_1_AutoML_20190328_093750,0.851625,1.13749,0.193869,0.378304,0.143114
DeepLearning_1_AutoML_20190328_093750,0.845312,0.460344,0.202689,0.379533,0.144045


