In [6]:
import h2o
import os

# Connect to the local H2O cluster (starting one if needed) with a 2 GB memory cap;
# all cores are used by default.
h2o.init(max_mem_size="2G")

# Start from a clean slate in case the cluster was already running.
h2o.remove_all()


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,18 mins 19 secs
H2O cluster version:,3.10.5.4
H2O cluster version age:,18 days
H2O cluster name:,H2O_from_python_sneha_rp971x
H2O cluster total nodes:,1
H2O cluster free memory:,1.734 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [7]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator


# Resolve the CSV path relative to the current working directory, then parse it
# into an H2OFrame on the cluster.
covtype_path = os.path.realpath("covtype.full.csv")
covtype_df = h2o.import_file(covtype_path)


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [8]:
# Three-way split: ratios 0.6 (train) and 0.2 (validation); whatever remains
# becomes the test frame. The seed keeps the split repeatable.
train, valid, test = covtype_df.split_frame(ratios=[0.6, 0.2], seed=1234)

# Predictors are every column except the last; the last column (Cover_Type)
# is the response we want to predict.
covtype_columns = covtype_df.col_names
covtype_X = covtype_columns[:-1]
covtype_y = covtype_columns[-1]


In [9]:
# Hyper-parameters for the first random-forest model, gathered in one place.
rf_v1_params = {
    "model_id": "rf_covType_v1",    # key under which the model is stored on the cluster
    "ntrees": 200,                  # upper bound; early stopping may end training sooner
    "stopping_rounds": 2,           # early-stopping window, in scoring rounds
    "score_each_iteration": True,   # score after every tree so early stopping can react
    "seed": 1000000,                # fixed seed for repeatable runs
}
rf_v1 = H2ORandomForestEstimator(**rf_v1_params)

In [10]:
# Fit on the training split; the validation split is scored alongside it.
rf_v1.train(
    x=covtype_X,
    y=covtype_y,
    training_frame=train,
    validation_frame=valid,
)

# Show the model summary (metrics, confusion matrices, scoring history,
# variable importances) as the cell output.
rf_v1


drf Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf_covType_v1


ModelMetricsMultinomial: drf
** Reported on train data. **

MSE: 0.056076839859494784
RMSE: 0.23680548950456107
LogLoss: 0.23843299358906858
Mean Per-Class Error: 0.11102189346483596
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8
class_1,class_2,class_3,class_4,class_5,class_6,class_7,Error,Rate
117176.0,9534.0,5.0,0.0,53.0,11.0,338.0,0.0782035,"9,941 / 127,117"
5414.0,164066.0,321.0,3.0,240.0,244.0,50.0,0.0368209,"6,272 / 170,338"
32.0,413.0,20370.0,93.0,22.0,512.0,0.0,0.0499953,"1,072 / 21,442"
0.0,32.0,178.0,1390.0,0.0,58.0,0.0,0.1616405,"268 / 1,658"
93.0,1386.0,63.0,0.0,4161.0,17.0,0.0,0.2725524,"1,559 / 5,720"
38.0,368.0,739.0,42.0,7.0,9239.0,0.0,0.1144446,"1,194 / 10,433"
709.0,70.0,0.0,0.0,2.0,0.0,11519.0,0.0634959,"781 / 12,300"
123462.0,175869.0,21676.0,1528.0,4485.0,10081.0,11907.0,0.0604198,"21,087 / 349,008"


Top-7 Hit Ratios: 


0,1
k,hit_ratio
1,0.9395802
2,0.9962408
3,0.9982007
4,0.9982522
5,0.9982522
6,0.9982522
7,1.0



ModelMetricsMultinomial: drf
** Reported on validation data. **

MSE: 0.0531414089432006
RMSE: 0.2305242046796835
LogLoss: 0.20030408050020737
Mean Per-Class Error: 0.10251306792333845
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8
class_1,class_2,class_3,class_4,class_5,class_6,class_7,Error,Rate
39403.0,2998.0,0.0,0.0,15.0,2.0,82.0,0.0728706,"3,097 / 42,500"
1589.0,54529.0,104.0,0.0,83.0,60.0,15.0,0.0328308,"1,851 / 56,380"
0.0,131.0,6844.0,30.0,3.0,135.0,0.0,0.0418592,"299 / 7,143"
1.0,1.0,61.0,479.0,0.0,20.0,0.0,0.1476868,83 / 562
29.0,432.0,24.0,0.0,1377.0,8.0,0.0,0.2636364,"493 / 1,870"
0.0,129.0,212.0,19.0,3.0,3101.0,0.0,0.1047921,"363 / 3,464"
204.0,16.0,0.0,0.0,1.0,0.0,3878.0,0.0539156,"221 / 4,099"
41226.0,58236.0,7245.0,528.0,1482.0,3326.0,3975.0,0.0552242,"6,407 / 116,018"


Top-7 Hit Ratios: 


0,1
k,hit_ratio
1,0.9447758
2,0.9978452
3,0.9996811
4,0.9997845
5,0.9997932
6,0.9997932
7,1.0


Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,validation_rmse,validation_logloss,validation_classification_error
,2017-08-04 12:41:12,0.011 sec,0.0,,,,,,
,2017-08-04 12:41:15,3.206 sec,1.0,0.3356442,2.4967403,0.1239456,0.3342398,2.4447915,0.1272906
,2017-08-04 12:41:18,6.557 sec,2.0,0.3192478,2.1652719,0.1135361,0.2663761,0.7915252,0.0850127
,2017-08-04 12:41:21,9.513 sec,3.0,0.3061849,1.8191096,0.1055723,0.2506499,0.4602991,0.0723336
,2017-08-04 12:41:25,12.995 sec,4.0,0.2953465,1.5013414,0.0992534,0.2449232,0.3418711,0.0674033
---,---,---,---,---,---,---,---,---,---
,2017-08-04 12:43:08,1 min 56.328 sec,20.0,0.2385726,0.2613526,0.0620141,0.2307102,0.2016582,0.0560861
,2017-08-04 12:43:15,2 min 3.034 sec,21.0,0.2380320,0.2537838,0.0615603,0.2306266,0.2008930,0.0556638
,2017-08-04 12:43:22,2 min 9.984 sec,22.0,0.2373729,0.2481413,0.0611638,0.2303363,0.2003093,0.0554483



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Soil_Type,784023.9375000,1.0,0.2524228
Elevation,738730.5625000,0.9422296,0.2378402
Horizontal_Distance_To_Roadways,327317.25,0.4174837,0.1053824
Horizontal_Distance_To_Fire_Points,317907.9062500,0.4054824,0.1023530
Wilderness_Area,178298.4218750,0.2274145,0.0574046
Horizontal_Distance_To_Hydrology,159165.5312500,0.2030111,0.0512446
Vertical_Distance_To_Hydrology,134396.1718750,0.1714185,0.0432699
Aspect,106050.0156250,0.1352637,0.0341437
Hillshade_Noon,99759.4062500,0.1272403,0.0321183




In [11]:
# Per-tree scoring history (training/validation RMSE, logloss, classification
# error). `scoring_history()` is the documented accessor; `score_history()` is
# only a deprecated alias for it.
rf_v1.scoring_history()

Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,validation_rmse,validation_logloss,validation_classification_error
0,,2017-08-04 12:41:12,0.011 sec,0.0,,,,,,
1,,2017-08-04 12:41:15,3.206 sec,1.0,0.335644,2.49674,0.123946,0.33424,2.444792,0.127291
2,,2017-08-04 12:41:18,6.557 sec,2.0,0.319248,2.165272,0.113536,0.266376,0.791525,0.085013
3,,2017-08-04 12:41:21,9.513 sec,3.0,0.306185,1.81911,0.105572,0.25065,0.460299,0.072334
4,,2017-08-04 12:41:25,12.995 sec,4.0,0.295347,1.501341,0.099253,0.244923,0.341871,0.067403
5,,2017-08-04 12:41:28,16.276 sec,5.0,0.285458,1.259114,0.093016,0.240228,0.284286,0.063619
6,,2017-08-04 12:41:32,19.898 sec,6.0,0.276749,1.059652,0.087339,0.237109,0.253836,0.061215
7,,2017-08-04 12:41:35,23.432 sec,7.0,0.271068,0.889017,0.084058,0.237045,0.242014,0.060637
8,,2017-08-04 12:41:39,27.570 sec,8.0,0.264125,0.748301,0.079793,0.234717,0.228079,0.059456
9,,2017-08-04 12:41:43,31.876 sec,9.0,0.259892,0.640781,0.07688,0.234854,0.223989,0.058715


In [12]:
# Top-k hit ratios computed on the validation frame (valid=True).
valid_hit_ratios = rf_v1.hit_ratio_table(valid=True)
valid_hit_ratios

Top-7 Hit Ratios: 


0,1
k,hit_ratio
1,0.9447758
2,0.9978452
3,0.9996811
4,0.9997845
5,0.9997932
6,0.9997932
7,1.0




In [13]:
# NOTE: a lone slice on an H2OFrame selects *columns*, not rows, so `test[:-1]`
# is the test split without its last column (the Cover_Type response).
test_predictors = test[:-1]
final_rf_predictions = rf_v1.predict(test_predictors)

drf prediction progress: |████████████████████████████████████████████████| 100%


In [14]:
# Preview the prediction frame: the predicted label (`predict`) followed by one
# probability column per class (class_1 ... class_7).
final_rf_predictions

predict,class_1,class_2,class_3,class_4,class_5,class_6,class_7
class_2,0.414264,0.585736,0,0,0.0,0,0.0
class_1,1.0,0.0,0,0,0.0,0,0.0
class_1,0.94961,0.0503899,0,0,0.0,0,0.0
class_1,0.756907,0.243093,0,0,0.0,0,0.0
class_2,0.127547,0.872453,0,0,0.0,0,0.0
class_2,0.0439125,0.483751,0,0,0.469526,0,0.0028104
class_1,0.955216,0.0447843,0,0,0.0,0,0.0
class_2,0.0785891,0.921411,0,0,0.0,0,0.0
class_2,0.0570821,0.942918,0,0,0.0,0,0.0
class_2,0.0833298,0.91667,0,0,0.0,0,0.0




In [15]:
# Row-wise comparison of predicted vs. actual labels on the test split ...
is_correct = final_rf_predictions['predict'] == test['Cover_Type']

# ... pulled into pandas, where the mean of the match column is the test accuracy.
is_correct_pd = is_correct.as_data_frame(use_pandas=True)
is_correct_pd.mean()

predict    0.944835
dtype: float64

In [16]:
# Shut the H2O cluster down without asking for confirmation.
# `h2o.shutdown()` is deprecated; the cluster handle exposes the supported call.
h2o.cluster().shutdown(prompt=False)

    >>> h2o.shutdown(prompt=False)
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.
H2O session _sid_95f6 closed.
