### Environment

In [1]:
import time
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import h2o
from h2o.estimators import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch

In [2]:
# Run timestamp formatted as YYYYMMDD_HHMMSS; strftime falls back to the current
# local time when no time tuple is passed.
time_suffix = time.strftime("%Y%m%d_%H%M%S")

In [3]:
# Random seed passed to the H2O estimators in this notebook so runs are repeatable.
seed = 123

In [4]:
# Input data and saved models live side by side under one project directory.
# The trailing slashes matter: file names are appended by plain string concatenation.
project_root = '/home/sfang/windows/gitlab/stanleysfang/code_reference/h2o/'
data_path = project_root + 'data/'
model_path = project_root + 'models/'

In [5]:
# Connect to an H2O cluster; as the log below shows, a local server is started
# when no instance is already running at the default address.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_171"; OpenJDK Runtime Environment (build 1.8.0_171-8u171-b11-0ubuntu0.16.04.1-b11); OpenJDK 64-Bit Server VM (build 25.171-b11, mixed mode)
  Starting server from /home/sfang/anaconda3/envs/h2o/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp7h5324ln
  JVM stdout: /tmp/tmp7h5324ln/h2o_sfang_started_from_python.out
  JVM stderr: /tmp/tmp7h5324ln/h2o_sfang_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,25 days
H2O_cluster_name:,H2O_from_python_sfang_i3k5te
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.535 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


### Data Preparation

#### Train

In [6]:
# Column names applied to the training file; header=0 discards the file's own
# header row in favour of these names.
train_columns = [
    'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age',
    'SibSp', 'ParCh', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]
train = pd.read_csv(data_path + "train.csv", header=0, names=train_columns)

In [7]:
# The capture group is the honorific: the text after a ", " and before the next ".".
title_re = re.compile('^.*, (.*?)\\..*$')
# One title per passenger, in row order.
train['Title'] = [title_re.search(full_name)[1] for full_name in train['Name']]

In [8]:
# Honorifics treated as adult titles; Sex decides whether they collapse to 'Mr' or 'Mrs'.
adult_title = ['Mr', 'Mrs', 'Dr', 'Rev', 'Col', 'Major', 'Sir', 'Don', 'Dona', 'Mme', 'Jonkheer', 'Lady', 'Capt', 'the Countess']

has_adult_title = train['Title'].isin(adult_title)
is_male = train['Sex'] == 'male'
is_female = train['Sex'] == 'female'
has_miss_title = train['Title'].isin(['Miss', 'Ms', 'Mlle'])
has_master_title = train['Title'] == 'Master'

# Apply the rules in order; rows matching none of them keep a missing Title_cleaned.
title_rules = [
    (has_adult_title & is_male, 'Mr'),
    (has_adult_title & is_female, 'Mrs'),
    (has_miss_title, 'Miss'),
    (has_master_title, 'Master'),
]
for rule_mask, cleaned_title in title_rules:
    train.loc[rule_mask, 'Title_cleaned'] = cleaned_title

In [9]:
def _mean_age_for(title):
    """Return the mean of the non-missing Age values in train for one cleaned title."""
    return train.loc[train['Title_cleaned'] == title, 'Age'].mean()


# Per-title mean ages from the training data, used to impute missing ages.
mr_mean_age = _mean_age_for('Mr')
mrs_mean_age = _mean_age_for('Mrs')
miss_mean_age = _mean_age_for('Miss')
master_mean_age = _mean_age_for('Master')

In [10]:
# Fill each missing Age with the training mean age of the passenger's cleaned title.
train_age_missing = train['Age'].isna()
for cleaned_title, title_mean_age in (
    ('Mr', mr_mean_age),
    ('Mrs', mrs_mean_age),
    ('Miss', miss_mean_age),
    ('Master', master_mean_age),
):
    train.loc[(train['Title_cleaned'] == cleaned_title) & train_age_missing, 'Age'] = title_mean_age

In [11]:
# Missing cabins become their own explicit category.
train['Cabin'] = train['Cabin'].fillna('No Cabin')

In [12]:
# Missing embarkation ports are filled with 'S'
# (presumably the most frequent port in the training data -- TODO confirm).
train['Embarked'] = train['Embarked'].fillna('S')

#### Test

In [13]:
# Same layout as the training file minus the 'Survived' column; header=0 discards
# the file's own header row in favour of these names.
test_columns = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age',
    'SibSp', 'ParCh', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]
test = pd.read_csv(data_path + "test.csv", header=0, names=test_columns)

In [14]:
# The capture group is the honorific: the text after a ", " and before the next ".".
title_re = re.compile('^.*, (.*?)\\..*$')
# One title per passenger, in row order.
test['Title'] = [title_re.search(full_name)[1] for full_name in test['Name']]

In [15]:
# Honorifics treated as adult titles; Sex decides whether they collapse to 'Mr' or 'Mrs'.
adult_title = ['Mr', 'Mrs', 'Dr', 'Rev', 'Col', 'Major', 'Sir', 'Don', 'Dona', 'Mme', 'Jonkheer', 'Lady', 'Capt', 'the Countess']

has_adult_title = test['Title'].isin(adult_title)
is_male = test['Sex'] == 'male'
is_female = test['Sex'] == 'female'
has_miss_title = test['Title'].isin(['Miss', 'Ms', 'Mlle'])
has_master_title = test['Title'] == 'Master'

# Apply the rules in order; rows matching none of them keep a missing Title_cleaned.
title_rules = [
    (has_adult_title & is_male, 'Mr'),
    (has_adult_title & is_female, 'Mrs'),
    (has_miss_title, 'Miss'),
    (has_master_title, 'Master'),
]
for rule_mask, cleaned_title in title_rules:
    test.loc[rule_mask, 'Title_cleaned'] = cleaned_title

In [16]:
# Fill missing test ages with the per-title means learned from the training data,
# so no statistic is computed on the test set itself.
test_age_missing = test['Age'].isna()
for cleaned_title, title_mean_age in (
    ('Mr', mr_mean_age),
    ('Mrs', mrs_mean_age),
    ('Miss', miss_mean_age),
    ('Master', master_mean_age),
):
    test.loc[(test['Title_cleaned'] == cleaned_title) & test_age_missing, 'Age'] = title_mean_age

In [17]:
# Missing cabins become their own explicit category, matching the training data.
test['Cabin'] = test['Cabin'].fillna('No Cabin')

In [18]:
# Impute each missing fare with the mean training fare of that passenger's own
# Pclass, instead of assuming every passenger with a missing fare is in class 3.
# Statistics come from train only, so nothing is learned from the test set.
fare_mean_by_pclass = train.groupby('Pclass')['Fare'].mean()
test['Fare'] = test['Fare'].fillna(test['Pclass'].map(fare_mean_by_pclass))

#### Answers

In [19]:
# Ground-truth labels for the test passengers; header=0 discards the file's own
# header row in favour of these names.
answers_columns = ['PassengerId', 'Survived']
answers = pd.read_csv(data_path + "answers.csv", header=0, names=answers_columns)

#### Target/Features

In [20]:
# Response column the models predict.
target = 'Survived'

# Predictor columns: the raw passenger attributes plus the engineered Title_cleaned.
feature_cols = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'ParCh',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked',
    'Title_cleaned',
]

### pandas DataFrame to H2OFrame

In [21]:
# Columns H2O should parse as categorical (enum) in both frames.
enum_cols = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Title_cleaned']

# The training frame also carries the response, declared as enum as well.
train_column_types = {col: 'enum' for col in ['Survived'] + enum_cols}
test_column_types = {col: 'enum' for col in enum_cols}

train_h2o = h2o.H2OFrame(train, destination_frame='train', column_types=train_column_types)
test_h2o = h2o.H2OFrame(test, destination_frame='test', column_types=test_column_types)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


### Random Forest

In [22]:
# Identifier given to the random forest model; the saved model file below carries the same name.
rf_model_id = 'my_model_0'

In [23]:
# Baseline random forest with 5-fold cross-validation, seeded for repeatability.
rf_model = H2ORandomForestEstimator(model_id=rf_model_id, nfolds=5, seed=seed)

In [24]:
# Fit the forest on the training frame: x names the predictor columns, y the response column.
rf_model.train(x=feature_cols, y=target, training_frame=train_h2o)

drf Model Build progress: |███████████████████████████████████████████████| 100%


### Save/Load H2O Model

In [25]:
# Write the trained model into model_path; force=True lets it replace a previously saved copy.
# The call returns the path of the saved file (shown as the cell output).
h2o.save_model(model=rf_model, path=model_path, force=True)

'/mnt/c/Users/Stanley Fang/Documents/sfang/gitlab/stanleysfang/code_reference/h2o/models/my_model_0'

In [26]:
# Reload the saved model from disk so rf_model is an estimator object again;
# assigning only the concatenated path would leave rf_model as a plain string.
rf_model = h2o.load_model(model_path + rf_model_id)

### Random Forest Grid Search

In [27]:
# Grid id that embeds the run timestamp, so each run gets its own grid name.
rf_grid_id = 'rf_grid_' + time_suffix

In [28]:
# Base estimator shared by every grid point: a large tree budget with AUC-based
# early stopping (scored every 5 trees, stopping after 3 scoring rounds without
# an improvement beyond the 1e-4 tolerance).
rf_base_model = H2ORandomForestEstimator(
    ntrees=10000,
    nfolds=5,
    stopping_metric='AUC',
    stopping_tolerance=1e-4,
    stopping_rounds=3,
    score_tree_interval=5,
    seed=seed
)

# Full Cartesian grid: 3 x 2 x 2 = 12 candidate models.
rf_hyper_params = {
    'max_depth': [15, 20, 25],
    'col_sample_rate_per_tree': [0.5, 0.7],
    'mtries': [3, 5]
}

rf_grid = H2OGridSearch(rf_base_model, grid_id=rf_grid_id, hyper_params=rf_hyper_params)

In [29]:
# Train every hyper-parameter combination in the grid on the training frame.
rf_grid.train(x=feature_cols, y=target, training_frame=train_h2o)

drf Grid Build progress: |████████████████████████████████████████████████| 100%


In [30]:
# Rank the grid's models by AUC, best first, and print the summary table.
rf_grid_auc = rf_grid.get_grid(sort_by='auc', decreasing=True)
print(rf_grid_auc)

     col_sample_rate_per_tree max_depth mtries  \
0                         0.7        15      3   
1                         0.7        25      3   
2                         0.7        20      3   
3                         0.5        25      3   
4                         0.5        20      3   
5                         0.5        15      3   
6                         0.5        15      5   
7                         0.5        20      5   
8                         0.5        25      5   
9                         0.7        25      5   
10                        0.7        20      5   
11                        0.7        15      5   

                           model_ids                 auc  
0    rf_grid_20201023_214548_model_2  0.8795657175726201  
1    rf_grid_20201023_214548_model_6  0.8788280659146348  
2    rf_grid_20201023_214548_model_4  0.8782049233587916  
3    rf_grid_20201023_214548_model_5  0.8752649687363521  
4    rf_grid_20201023_214548_model_3  0.87498002748218

### Save/Load H2O Grid Search

In [31]:
# Export the grid into a directory named after its grid_id under model_path;
# the call returns the path of the saved grid (shown as the cell output).
h2o.save_grid(model_path + rf_grid_auc.grid_id, rf_grid_auc.grid_id)

'/home/sfang/windows/gitlab/stanleysfang/code_reference/h2o/models/rf_grid_20201023_214548/rf_grid_20201023_214548'

In [32]:
# Reload the grid of this run. The path is built from rf_grid_id
# (<model_path>/<grid_id>/<grid_id>, the layout save_grid produces) rather than a
# hard-coded timestamp from an earlier run, so the cell works after Restart & Run All.
rf_grid = h2o.load_grid(model_path + rf_grid_id + '/' + rf_grid_id)

### Predictions

In [33]:
# rf_grid_auc is sorted by AUC in descending order, so models[0] is the top-ranked model.
model = rf_grid_auc.models[0]
# Score the test frame with that model; the result is an H2OFrame of predictions.
predict = model.predict(test_h2o)

drf prediction progress: |████████████████████████████████████████████████| 100%




### H2OFrame to pandas DataFrame

In [34]:
# Pull the predicted labels back into pandas and compare them with the answers.
# NOTE(review): the comparison pairs rows by index, so it assumes answers.csv
# lists passengers in the same order as test.csv -- confirm.
predicted_labels = predict.as_data_frame()['predict']
is_correct = answers['Survived'] == predicted_labels
# Fraction of matching rows, i.e. the accuracy on the test set.
is_correct.mean()

0.8038277511961722

### Shutting Down H2O

In [35]:
# Shut down the H2O cluster this notebook is connected to.
h2o.cluster().shutdown()

H2O session _sid_9bbb closed.
