In [1]:
import h2o
import numpy as np
import pandas as pd
import random
random.seed(123)

In [2]:
h2o.init()
#h2o.cluster().shutdown()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 day 23 hours 11 mins
H2O cluster version:,3.13.0.3978
H2O cluster version age:,4 years and 9 days !!!
H2O cluster name:,H2O_started_from_R_Sandipan.Dey_kpl973
H2O cluster total nodes:,1
H2O cluster free memory:,2.510 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [3]:
from scipy.stats import truncnorm

def get_truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm(
        (low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

In [4]:
n = 1000
data = {'student_id': list(range(1, n+1)), 
        'score': get_truncated_normal(mean=70, sd=30, low=0, upp=100).rvs(1000),
        'age': get_truncated_normal(mean=19, sd=3, low=0, upp=100).rvs(1000),
        'gender': np.random.choice(a=['male', 'female'], size=1000, p=[0.5, 0.5])
       }
df = pd.DataFrame(data)
df.age = df.age.astype('int')
df.head(20)

Unnamed: 0,student_id,score,age,gender
0,1,54.170856,26,female
1,2,16.418582,23,female
2,3,71.927874,16,male
3,4,23.128042,16,male
4,5,44.068255,19,female
5,6,27.309425,18,male
6,7,72.158954,24,male
7,8,64.752088,17,female
8,9,66.077595,19,male
9,10,48.822375,19,male


In [5]:
df_h2o = h2o.H2OFrame(df, destination_frame='student')
df_h2o.as_data_frame()

AttributeError: 'DataFrame' object has no attribute 'as_matrix'

In [6]:
df_h2o.summary()

Unnamed: 0,student_id,score,age,gender
type,int,real,int,enum
mins,1.0,0.02420221858625382,7.0,
mean,500.5,61.021719570298124,18.49300000000001,
maxs,1000.0,99.70148283983325,29.0,
sigma,288.8194360957494,22.57027771128949,2.9991575393676704,
zeros,0,0,0,
missing,0,0,0,0
0,1.0,72.66493045787158,12.0,male
1,2.0,53.70835416225138,19.0,male
2,3.0,73.97561440143802,18.0,male


In [7]:
train, test = df_h2o.split_frame(
    ratios = [0.897],
    destination_frames = ['student_train', 'student_test'],
    seed=123
    )

In [8]:
print('%d/%d' % (train.nrows, test.nrows))

900/100


In [9]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [10]:
y = 'score'
ignoreFields = [y, 'student_id']
x = [i for i in train.names if i not in ignoreFields]

In [11]:
m = H2OGradientBoostingEstimator(model_id='m10folds', nfolds=10)
m.train(x,y,train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [12]:
print('mae_train:', m.mae(train=True))
print('mae_xval:', m.mae(xval=True))
perf = m.model_performance(test)
print('mae_test:',perf.mae())

mae_train: 18.557723622994704
mae_xval: 19.191105537299237
mae_test: 18.882965658079186


In [13]:
m2 = H2OGradientBoostingEstimator(model_id='m2_10folds', nfolds=10, max_depth=7, ntrees=100)
m2.train(x,y,train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [14]:
print('mae_train:', m2.mae(train=True))
print('mae_xval:', m2.mae(xval=True))
perf = m2.model_performance(test)
print('mae_test:',perf.mae())

mae_train: 18.560498950889126
mae_xval: 19.079226260562464
mae_test: 18.90909424846111


In [6]:
# Assignment 1

import numpy as np
import pandas as pd
import random
import h2o

# Step One: create data set of 1000 voters
# include gender, college, favorite food, and how they voted
N = 1000
# create sample of possible outcomes
combos = [['male', 0, 'dem'], ['male', 1, 'dem'],
          ['male', 0, 'rep'], ['male', 0, 'rep'],
          ['female', 0, 'dem'], ['female', 1, 'dem'],
          ['female', 0, 'dem'], ['female', 1, 'dem'],
          ['female', 0, 'rep'], ['female', 1, 'rep']]    
df = pd.DataFrame(np.repeat(combos,N/len(combos),axis=0), #generate 1000 rows
                  columns =['gender', 'college', 'vote'])
df['favorite_food'] = random.choices(
    ['italian','chinese','vegan','french','american'], k=N)

# Step Two: Start H2O and import data
h2o.init()
vote_data = h2o.H2OFrame(df)
vote_data['vote'] = vote_data['vote'].asfactor() #convert categ to factor
vote_data['gender'] = vote_data['gender'].asfactor()
vote_data['college'] = vote_data['college'].asfactor()

# Step Three: Split the data using a cross-validation approach
train, valid, test = vote_data.split_frame(
    ratios=[0.80,0.10],    #80% train, 10% validation, 10% test
    destination_frames=['vote_train','vote_valid','vote_test'],
    seed=906
)
y = 'vote'
x = [var for var in train.names if var not in [y]]

# Step Four: Create classification model using RF
from h2o.estimators.random_forest import H2ORandomForestEstimator
rf1 = H2ORandomForestEstimator(model_id='initial')
rf1.train(x, y, train, validation_frame=valid)

rf1
perf1 = rf1.model_performance(test)
perf1

# Step Five: Build alternative model by altering parameters

rf2 = H2ORandomForestEstimator(model_id='overfit', ntrees=1000, max_depth=10)
rf2.train(x, y, train, validation_frame=valid)

rf2
perf2 = rf1.model_performance(test)
perf2

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 day 23 hours 26 mins
H2O cluster version:,3.13.0.3978
H2O cluster version age:,4 years and 9 days !!!
H2O cluster name:,H2O_started_from_R_Sandipan.Dey_kpl973
H2O cluster total nodes:,1
H2O cluster free memory:,2.491 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


AttributeError: 'DataFrame' object has no attribute 'as_matrix'