In [1]:
import pandas as pd
import numpy as np
import h2o

# Step 1 - Create an artifical data set

- N number of people
- Id -> sequential numbered value from 1 to N
- Age -> Random value from 18 to 62
- EatingHabit -> Random value from 1 to 10
- ActiveLifeStyle -> Randome value from 1 to 10
- Salary -> (Dependent variable) -> 18000 + Age*2 + EatingHabit * 5000 - ActiveLifeStyle * 4000 + Random (2000,5000)

In [2]:
import random

N = 1000
id = list(range(1,N))
age = [random.randint(18,62) for x in range(N)]
eatingHabit = [random.randint(1,10) for x in range(N)]
lifestyle = [random.randint(1,10) for x in range(N)]
salary = list()

for i in range(N):
    salary.append(age[i]*2 + eatingHabit[i]*5000 + random.randint(2000,5000) - lifestyle[i]*400 + 18000)

In [3]:
person_df = pd.DataFrame(data=list(zip(id, age, eatingHabit, lifestyle, salary)), columns=['id','age','eatinghabit','lifestyle','salary'])

# Step 2 - Initialize H2o and convert pandas dataframe to H2o frame

In [4]:
#initialize h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,37 mins 41 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.1
H2O_cluster_version_age:,20 days
H2O_cluster_name:,H2O_from_python_Aditya_Jain_qlgbo4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,70.6 Mb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [5]:
#convert pandas dataframe to h2o frame
person_h2o = h2o.H2OFrame(person_df, destination_frame='person')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
# load the frame from h2o & print summary
loaded_person_frame = h2o.get_frame('person')
loaded_person_frame.summary()

Unnamed: 0,id,age,eatinghabit,lifestyle,salary
type,int,int,int,int,int
mins,1.0,18.0,1.0,1.0,21237.0
mean,500.0,40.218218218218205,5.450450450450448,5.395395395395391,46674.68268268267
maxs,999.0,62.0,10.0,10.0,72308.0
sigma,288.5307609250702,12.75043432905671,2.853696631779517,2.8805231549340227,14282.151655553676
zeros,0,0,0,0,0
missing,0,0,0,0,0
0,1.0,55.0,9.0,8.0,64396.0
1,2.0,35.0,9.0,7.0,63996.0
2,3.0,34.0,3.0,7.0,32854.0


# Step 3 - Split the data (Will use CV)

In [7]:
train, test = loaded_person_frame.split_frame([0.8], destination_frames=['train_person','test_person'])

# Step 4 - Use GBM for Predicting Salary

In [8]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

estimator = H2OGradientBoostingEstimator(nfolds=10,
                                         model_id='gbm_person_default')
estimator.train(['id', 'age', 'eatinghabit', 'lifestyle'],'salary',train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [9]:
perf = estimator.model_performance(test)
print (perf)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 1006857.4155771006
RMSE: 1003.4228498380434
MAE: 849.4201739215891
RMSLE: 0.024655035941179293
Mean Residual Deviance: 1006857.4155771006



# Step 5 - Use Overfit GBM model for Predicting Salary

In [10]:
estimator = H2OGradientBoostingEstimator(nfolds=10,
                                        max_depth=50,                                        
                                        ntrees=500,
                                        model_id='gbm_person_overfit')
estimator.train(['id', 'age', 'eatinghabit', 'lifestyle'],'salary',train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [11]:
perf = estimator.model_performance(test)
print (perf)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 1255011.212006979
RMSE: 1120.2728292728423
MAE: 921.3793470591897
RMSLE: 0.027817013333067644
Mean Residual Deviance: 1255011.212006979



# Step 6 - Use EarlyStopping & Overfit GBM model for Predicting Salary

In [15]:
estimator = H2OGradientBoostingEstimator(nfolds=10,
                                        max_depth=50, 
                                        stopping_rounds=4,
                                        stopping_tolerance=0.01, #This ranges from 0 (0%) to 1 (100%)
                                        ntrees=500,
                                        model_id='gbm_person_overfit')
estimator.train(['id', 'age', 'eatinghabit', 'lifestyle'],'salary',train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [14]:
perf = estimator.model_performance(test)
print (perf)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 1064843.4890047882
RMSE: 1031.9125394163927
MAE: 864.1257358686198
RMSLE: 0.02511375833826869
Mean Residual Deviance: 1064843.4890047882



# Save Model

In [17]:
h2o.save_model(estimator, 'estimator_model')

'C:\\MachineLearning\\repos\\personel\\h2o\\estimator_model\\gbm_person_overfit'

In [19]:
m = h2o.load_model('estimator_model/gbm_person_overfit')