# In[1]:
import h2o
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Connect to the H2O cluster that every later cell talks to. In the recorded run this
# attached to an instance that was already running at http://localhost:54321.
h2o.init()


# # create dataset - age, bloodtype, healthy eating, lifestyle  & import dataset as "people"

# In[2]:


# Seed NumPy's global generator so the synthetic data is identical on every run.
np.random.seed(1337)

# Number of synthetic people (rows).
N = 1000

# Blood-type pool; the repeated entries make A and O three times as frequent as B or AB.
bloodTypes = np.array(['A', 'A', 'A', 'B', 'AB', 'O', 'O', 'O'])


def _bounded_score(mean, std, size, low=0, high=9):
    """Draw `size` rounded normal(mean, std) scores and cap them to [low, high]."""
    raw = np.random.normal(mean, std, size).round()
    capped = np.where(raw > high, high, raw)
    return np.where(capped < low, low, capped)


# The draws below all consume the same seeded stream, so their order
# (age, healthy eating, active lifestyle, income noise) must not change.

# Age: uniform between 18 and 85, rounded to whole years.
age = np.random.uniform(18, 85, N).round()

# Healthy eating: 0-9 scale, mean 4, standard deviation 2.
healthy_eating = _bounded_score(4, 2, N)

# Active lifestyle: 0-9 scale, mean 6, standard deviation 2 ...
active_lifestyle = _bounded_score(6, 2, N)
# ... plus a +1 bonus for people under 35. The bonus is applied after the cap,
# so the final value can reach 10.
active_lifestyle = np.where(age < 35, active_lifestyle + 1, active_lifestyle)

# Income: base of $20,000 plus an age term, raised by healthy eating, lowered by an
# active lifestyle, with up to $5,000 of uniform noise on top.
income = 20000 + (age * 3) ** 2
income += healthy_eating * 500
income -= active_lifestyle * 300
income += np.random.uniform(0, 5000, N)

# Assemble the frame in one go; blood types are dealt out cyclically by row id and
# income is rounded to cents.
d = pd.DataFrame({
    'id': range(N),
    'bloodType': bloodTypes[np.arange(N) % len(bloodTypes)],
    'age': age,
    'healthyEating': healthy_eating,
    'activeLifestyle': active_lifestyle,
    'income': income.round(2),
})

#load the pandas frame `d` into an H2OFrame stored under the key "people"
people = h2o.H2OFrame(d,destination_frame='people')


# # split the data into test, train, and valid 

# In[3]:


#fetch the frame stored under the key "people" back from the H2O cluster.
#this is redundant when the dataset cell ran in the same session, since `people` was
#already bound there with destination_frame='people'.
#NOTE(review): presumably kept so this cell can run on its own against a cluster
#that still holds the frame - confirm before removing.
people = h2o.get_frame("people")


# In[5]:


#optional sanity check: a bare expression at the end of a cell displays the "people" frame
people


# In[6]:


#split into 3 sections: train = 0.7, valid = 0.2, test = the remainder (1 - 0.7 - 0.2 = 0.1)
#the recorded run produced 712/186/102 rows, so the ratios are only approximate
train, valid, test = people.split_frame(
    ratios = [0.7,  0.2],
    #optional if you aren't viewing in flow... but still good practice to name things
    destination_frames = ["people_train", "people_valid", "people_test"],
    #optional; fixing the seed makes the random split repeatable between runs
    seed = 1338
)


# In[7]:


#optional check: report the row count of each split as train/valid/test
split_sizes = (train.nrows, valid.nrows, test.nrows)
print("%d/%d/%d" % split_sizes)


# # choose gbm and make a model
# 

# In[8]:


#import the H2O GBM estimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator


# In[9]:


#target column: the field the model should predict
y = "income"
#columns that must not be used as predictors: the target itself and the row id
ignoreFields = [y, "id"]
#predictors: every remaining column of the training frame, in frame order
x = [column for column in train.names if column not in ignoreFields]


# In[10]:


#baseline GBM: only the model id ("defaults") is set, every other parameter keeps
#its library default
m1 = H2OGradientBoostingEstimator(model_id="defaults")
#fit on the training split using predictors x and target y, and score the
#validation split as well
m1.train(x=x, y=y, training_frame=train, validation_frame=valid)


# In[11]:


#show the mean absolute error (MAE) of the "defaults" model on the training data
m1.mae(train=True)


# In[12]:


#show the mean absolute error (MAE) of the "defaults" model on the validation data
m1.mae(valid=True)


# In[13]:


#score the "defaults" model on the held-out test split and show its MAE;
#`perf` is kept because the final comparison cell reads perf.mae() again
perf = m1.model_performance(test)
perf.mae()


# # try some alternative params and build new model

# In[14]:


#deliberately oversized model, stored under the id "overfit":
#  ntrees    = 5000 (100x what we did before)
#  max_depth = 20   (4x what we did before)
m2 = H2OGradientBoostingEstimator(
    model_id="overfit",
    ntrees=5000,
    max_depth=20,
)
#same predictors, target and splits as the first model, so the two are comparable
m2.train(x=x, y=y, training_frame=train, validation_frame=valid)


# In[15]:


#print the findings side by side: model 1 (m1, "defaults") | model 2 (m2, "overfit")
#%d truncates each MAE to a whole number
train_maes = (m1.mae(train=True), m2.mae(train=True))
valid_maes = (m1.mae(valid=True), m2.mae(valid=True))
#m1's test performance was already computed as `perf`; m2's is computed here
test_maes = (perf.mae(), m2.model_performance(test).mae())
print("Train Data: %d |  %d" % train_maes)
print("Valid Data: %d |  %d" % valid_maes)
print("Test Data: %d |  %d" % test_maes)


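# # As you can see, we overfitted by increasing the GBM's ntrees by 100x and the trees' max_depth by 4x: the "overfit" model's MAE on the training data is far lower than the "defaults" model's, yet its MAE on the validation and test data is higher. Looking at the graphs of the "defaults" vs "overfit" models in H2O Flow will validate these findings.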

# In[ ]:






Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,9 hours 47 mins
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.1
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_Aditya_Jain_lwamvf
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,76.7 Mb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%
712/186/102
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
Train Data: 1019 |  140
Valid Data: 1367 |  1481
Test Data: 1257 |  1423
