### H2O Initialization

In [48]:
# Load the H2O library and start up the H2O cluter locally on your machine
import h2o

# Number of threads, nthreads = -1, means use all cores on your machine
# max_mem_size is the maximum memory (in GB) to allocate to H2O
h2o.init(nthreads = -1, max_mem_size = 8)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.2+9, mixed mode)
  Starting server from D:\Anaconda\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\ERIC~1.YUA\AppData\Local\Temp\tmphhdo1ppq
  JVM stdout: C:\Users\ERIC~1.YUA\AppData\Local\Temp\tmphhdo1ppq\h2o_eric_yuan_started_from_python.out
  JVM stderr: C:\Users\ERIC~1.YUA\AppData\Local\Temp\tmphhdo1ppq\h2o_eric_yuan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,30 days
H2O cluster name:,H2O_from_python_eric_yuan_88xrhi
H2O cluster total nodes:,1
H2O cluster free memory:,8 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


### Data Preparation

In [49]:
data = h2o.import_file('loan.csv')
# data = data.as_data_frame()

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [50]:
data['bad_loan'] = data['bad_loan'].asfactor()
data['bad_loan'].levels()

[['0', '1']]

In [51]:
# Split a frame into distinct subsets of size determined by the given ratios. 
# The number of subsets is always 1 more than the number of ratios given
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)  
train = splits[0]
valid = splits[1]
test = splits[2]

In [52]:
y = 'bad_loan'
x = list(data.columns)
# remove the response
x.remove(y)
# remove the interest rate column because it's correlated with the outcome
x.remove('int_rate')
# List of predictor columns
x

['loan_amnt',
 'term',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'purpose',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'revol_util',
 'total_acc',
 'longest_credit_length',
 'verification_status']

### Training Machine Learning Model
* Generalized Linear Model (GLM)
* Random Forest (RF)
* Gradient Boosting Machine (RF)
* Deep Learning (DL)
* Naive Bayes (NB)

In [53]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

* GLM default

In [54]:
glm_fit1 = H2OGeneralizedLinearEstimator(family = 'binomial', model_id = 'glm_fit1')
glm_fit1.train(x = x, y = y, training_frame = train)

glm Model Build progress: |███████████████████████████████████████████████| 100%


* GLM training with lambda search   
Tunning parameter: lambda, in this case we should add validation_frame

In [55]:
glm_fit2 = H2OGeneralizedLinearEstimator(family='binomial', model_id='glm_fit2', lambda_search=True)
glm_fit2.train(x = x, y = y, training_frame = train, validation_frame = valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


* random forest

In [56]:
rf_fit = H2ORandomForestEstimator(model_id='rf_fit1', ntrees=500, seed=1)
rf_fit.train(x=x, y=y, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


* gbm model

In [57]:
# Now let's use early stopping to find optimal ntrees
gbm_fit = H2OGradientBoostingEstimator(model_id = 'gbm_fit1', 
                                        ntrees = 500, 
                                        score_tree_interval = 5,     #used for early stopping
                                        stopping_rounds = 3,         #used for early stopping
                                        stopping_metric = 'AUC',     #used for early stopping
                                        stopping_tolerance = 0.0005, #used for early stopping
                                        seed = 1)

# The use of a validation_frame is recommended with using early stopping
gbm_fit.train(x=x, y=y, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


* Deep Learning model

In [41]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [43]:
dl_fit = H2ODeepLearningEstimator(model_id = 'dl_fit',
                                  epochs = 20,
                                  hidden = [10, 10],
                                  score_interval = 1,
                                  stopping_rounds = 3,
                                  stopping_metric = 'AUC',
                                  stopping_tolerance = 0.0005,
                                  seed = 1)
dl_fit.train(x = x, y = y, training_frame = train, validation_frame = valid)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [44]:
dl_perf = dl_fit.model_performance(test)
print(dl_perf.auc())

0.6805062986404884


In [45]:
dl_fit.scoring_history()

Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,...,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_r2,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2020-01-16 12:37:59,0.000 sec,,0.0,0,0.0,,,,...,,,,,,,,,,
1,,2020-01-16 12:38:00,0.290 sec,510163 obs/sec,0.870192,1,99992.0,0.383383,0.474283,0.027499,...,0.303446,2.667063,0.360538,0.383359,0.473423,0.026911,0.661235,0.304054,2.619777,0.405911
2,,2020-01-16 12:38:01,1.291 sec,696743 obs/sec,6.960882,8,799861.0,0.381625,0.469441,0.0364,...,0.320904,2.830353,0.366303,0.381564,0.469992,0.036004,0.675611,0.317306,2.751867,0.357539
3,,2020-01-16 12:38:02,2.386 sec,771749 obs/sec,14.79588,17,1700165.0,0.378077,0.453021,0.054236,...,0.317068,3.102502,0.357807,0.378198,0.453579,0.052937,0.674628,0.315287,2.707837,0.36652
4,,2020-01-16 12:38:02,3.094 sec,803176 obs/sec,20.018589,23,2300296.0,0.377193,0.451511,0.058653,...,0.323646,2.830353,0.359324,0.378195,0.453847,0.052949,0.673783,0.315288,2.773882,0.357131
5,,2020-01-16 12:38:03,3.144 sec,803176 obs/sec,20.018589,23,2300296.0,0.381625,0.469441,0.0364,...,0.320904,2.830353,0.366303,0.381564,0.469992,0.036004,0.675611,0.317306,2.751867,0.357539


### Model Evaluation

In [58]:
glm_perf1 = glm_fit1.model_performance(test)
glm_perf2 = glm_fit2.model_performance(test)
rf_perf = rf_fit.model_performance(test)
gbm_perf = gbm_fit.model_performance(test)

In [59]:
rf_perf


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.14239900470731787
RMSE: 0.3773579265197935
LogLoss: 0.451479886673741
Mean Per-Class Error: 0.3726280311979171
AUC: 0.6753271924435713
AUCPR: 0.32217263587559447
Gini: 0.3506543848871426

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.1843048375126013: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,12366.0,7625.0,0.3814,(7625.0/19991.0)
1,1,1670.0,2920.0,0.3638,(1670.0/4590.0)
2,Total,14036.0,10545.0,0.3781,(9295.0/24581.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.184305,0.385861,233.0
1,max f2,0.122561,0.555791,301.0
2,max f0point5,0.272161,0.354589,158.0
3,max accuracy,0.48502,0.813962,38.0
4,max precision,0.634311,0.727273,6.0
5,max recall,0.022444,1.0,393.0
6,max specificity,0.710832,0.99995,0.0
7,max absolute_mcc,0.272161,0.206967,158.0
8,max min_per_class_accuracy,0.186525,0.625732,231.0
9,max mean_per_class_accuracy,0.184305,0.627372,233.0



Gains/Lift Table: Avg response rate: 18.67 %, avg score: 18.66 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010008,0.493585,2.764748,2.764748,0.51626,0.544518,0.51626,0.544518,0.027669,0.027669,176.474751,176.474751
1,,2,0.020015,0.452502,2.481742,2.623245,0.463415,0.470619,0.489837,0.507569,0.024837,0.052505,148.174186,162.324468
2,,3,0.030023,0.423903,2.242276,2.496255,0.418699,0.437348,0.466125,0.484162,0.02244,0.074946,124.227554,149.625497
3,,4,0.040031,0.402387,2.176967,2.416433,0.406504,0.413225,0.45122,0.466428,0.021786,0.096732,117.696654,141.643286
4,,5,0.050039,0.386807,2.307585,2.394663,0.430894,0.394245,0.447154,0.451991,0.023094,0.119826,130.758453,139.466319
5,,6,0.100037,0.329429,1.817067,2.105983,0.3393,0.355626,0.393249,0.403828,0.09085,0.210675,81.706739,110.598274
6,,7,0.150035,0.294343,1.660198,1.957428,0.310008,0.31114,0.36551,0.372941,0.083007,0.293682,66.019826,95.74282
7,,8,0.200033,0.266073,1.564334,1.859175,0.292107,0.279633,0.347163,0.349618,0.078214,0.371895,56.433379,85.917459
8,,9,0.300028,0.223996,1.252774,1.657069,0.23393,0.243691,0.309424,0.314314,0.125272,0.497168,25.277428,65.706856
9,,10,0.400024,0.191954,1.080654,1.51298,0.20179,0.207342,0.282518,0.287574,0.108061,0.605229,8.065398,51.297957







In [60]:
# Retreive test set AUC
print(glm_perf1.auc())
print(glm_perf2.auc())
print(rf_perf.auc())
print(gbm_perf.auc())

0.6774630119501488
0.6769720666238805
0.6753271924435713
0.6844308097685352


* GBM history

In [40]:
gbm_fit.scoring_history().tail()

Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
15,,2020-01-16 12:33:02,3.141 sec,75.0,0.365027,0.425718,0.723959,0.389552,3.851542,0.269033,0.375363,0.446726,0.684226,0.327466,2.883957,0.338722
16,,2020-01-16 12:33:02,3.310 sec,80.0,0.3646,0.424773,0.725947,0.392813,3.952017,0.284758,0.375336,0.446639,0.684535,0.327367,2.861942,0.342763
17,,2020-01-16 12:33:02,3.474 sec,85.0,0.364264,0.424071,0.727128,0.395495,3.995078,0.269503,0.375368,0.446706,0.684319,0.327257,2.839927,0.342559
18,,2020-01-16 12:33:03,3.614 sec,90.0,0.363937,0.42338,0.728522,0.39819,4.028569,0.270364,0.375382,0.446713,0.684341,0.327179,2.950001,0.342885
19,,2020-01-16 12:33:03,3.759 sec,95.0,0.363618,0.422716,0.729838,0.40069,4.023785,0.265726,0.375401,0.446726,0.684362,0.32678,2.927986,0.350967


### Grid Search

In [62]:
# Import H2O Grid Search:
from h2o.grid.grid_search import H2OGridSearch

In [64]:
# GBM hyperparameters
gbm_params = {'learn_rate': [0.01, 0.1], 
              'max_depth': [3, 5, 9],
              'sample_rate': [0.8, 1.0],
              'col_sample_rate': [0.2, 0.5, 1.0]}

In [66]:
gbm_grid = H2OGridSearch(model = H2OGradientBoostingEstimator,
                         grid_id = 'gbm_grid',
                         hyper_params = gbm_params)
gbm_grid.train(x = x, y = y, 
               training_frame = train, 
               validation_frame = valid, 
               ntrees = 100,
               seed = 1)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [67]:
gbm_gridperf = gbm_grid.get_grid(sort_by = 'auc', decreasing = True)

In [68]:
print(gbm_gridperf)

     col_sample_rate learn_rate max_depth sample_rate          model_ids  \
0                0.2        0.1         5         0.8  gbm_grid_model_10   
1                0.2        0.1         5         1.0  gbm_grid_model_28   
2                1.0        0.1         5         0.8  gbm_grid_model_12   
3                0.5        0.1         5         0.8  gbm_grid_model_11   
4                1.0        0.1         3         0.8   gbm_grid_model_6   
5                0.5        0.1         5         1.0  gbm_grid_model_29   
6                1.0        0.1         3         1.0  gbm_grid_model_24   
7                0.5        0.1         3         0.8   gbm_grid_model_5   
8                1.0        0.1         5         1.0  gbm_grid_model_30   
9                0.5        0.1         3         1.0  gbm_grid_model_23   
10               0.2        0.1         3         1.0  gbm_grid_model_22   
11               0.2        0.1         9         1.0  gbm_grid_model_34   
12          

### Random Grid Search

In [69]:
# GBM hyperparameters
gbm_params2 = {'learn_rate': [i * 0.01 for i in range(1, 11)], 
                'max_depth': list(range(2, 11)),
                'sample_rate': [i * 0.1 for i in range(5, 11)],
                'col_sample_rate': [i * 0.1 for i in range(1, 11)]}

# Search criteria
search_criteria2 = {'strategy': 'RandomDiscrete', 'max_models': 36}

In [70]:
gbm_grid2 = H2OGridSearch(model = H2OGradientBoostingEstimator,
                          grid_id = 'gbm_grid2',
                          hyper_params = gbm_params2,
                          search_criteria = search_criteria2)
gbm_grid2.train(x = x, y = y, 
                training_frame = train, 
                validation_frame = valid, 
                ntrees = 100,
                seed = 1)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [71]:
gbm_gridperf2 = gbm_grid2.get_grid(sort_by = 'auc', decreasing = True)

In [72]:
print(gbm_gridperf2)

          col_sample_rate learn_rate max_depth         sample_rate  \
0                     0.5       0.08         5  0.6000000000000001   
1      0.6000000000000001       0.08         5  0.6000000000000001   
2                     0.4       0.06         5                 0.8   
3      0.7000000000000001       0.07         4  0.6000000000000001   
4                     1.0       0.07         4                 0.5   
5                     0.9       0.04         5  0.6000000000000001   
6                     0.2       0.02         9                 0.9   
7                     0.9       0.04         7                 0.9   
8                     0.8       0.04         4                 0.8   
9                     0.5       0.04         4                 1.0   
10                    0.2       0.02         8                 0.9   
11    0.30000000000000004       0.05         4                 0.8   
12                    0.9       0.09         2                 1.0   
13     0.60000000000

* Add models to existing grid  

It looks like learn_rate=0.1 does well here, which was the biggest learn_rate in our previous search, so maybe we want to add some models to our grid search with a higher learn_rate. We will create a new hyper_params and search_criteria objects.

We can add models to the same grid, by re-using the same model_id. Let's add as many new models as we can train in 60 seconds by setting max_runtime_secs=60 in search_criteria.

In [73]:
# GBM hyperparameters
gbm_params = {'learn_rate': [i * 0.01 for i in range(1, 31)],  #updated
                'max_depth': list(range(2, 11)),
                'sample_rate': [0.9, 0.95, 1.0],  #updated
                'col_sample_rate': [i * 0.1 for i in range(1, 11)]}

# Search criteria, 
search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': 60}  #updated

In [74]:
gbm_grid = H2OGridSearch(model = H2OGradientBoostingEstimator,
                         grid_id = 'gbm_grid2',
                         hyper_params = gbm_params,
                         search_criteria = search_criteria)
gbm_grid.train(x = x, y = y, 
               training_frame = train, 
               validation_frame = valid, 
               ntrees = 100,
               seed = 1)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [75]:
gbm_gridperf = gbm_grid.get_grid(sort_by='auc', decreasing=True)

In [76]:
print(gbm_gridperf)

          col_sample_rate learn_rate max_depth         sample_rate  \
0                     0.5       0.08         5  0.6000000000000001   
1      0.6000000000000001       0.08         5  0.6000000000000001   
2                     0.4       0.06         5                 0.8   
3                     0.5       0.26         2                 1.0   
4                     0.4        0.1         4                 0.9   
5                     0.2       0.19         3                0.95   
6                     0.5       0.12         3                 0.9   
7      0.7000000000000001       0.07         4  0.6000000000000001   
8                     1.0       0.07         4                 0.5   
9                     0.9       0.04         5  0.6000000000000001   
10                    0.2       0.02         9                 0.9   
11                    0.9       0.04         7                 0.9   
12                    0.8       0.04         4                 0.8   
13                  

* extract the top model, as determined by validation AUC, from the grid.

In [77]:
# Grab the model_id for the top GBM model, chosen by validation AUC
best_gbm_model = gbm_gridperf.models[0]

In [78]:
# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
gbm_perf = best_gbm_model.model_performance(test)
print(gbm_perf.auc())

0.6864278140849657


### Cross Validation
When specifying nfolds, the algorithm will build nfolds +1 models.  
For example, if you specify nfolds=5, then 6 models are built.  
The first 5 models (cross-validation models) are built on 80% of the training data, and a different 20% is held out for each of the 5 models.  
Then the main model is built on 100% of the training data.

In [61]:
rf_fit2 = H2ORandomForestEstimator(model_id = 'rf_fit2', seed = 1, nfolds = 5)
rf_fit2.train(x = x, y = y, training_frame = data)
# rf_fit2 contains the evaluation metrics for both main and validation models
# rf_fit2

drf Model Build progress: |███████████████████████████████████████████████| 100%


### Close H2O

In [46]:
h2o.shutdown(prompt=False)

  """Entry point for launching an IPython kernel.


H2O session _sid_bd97 closed.


### Summary

H2O package trains and evaluates models quicker and simpler than others, it simplified the whole process and built high-level APIs for machine learning problem although reduced some flexibilities. We should definitely consider to incorporate H2O into the current machine learning business lines.