In [1]:
import pandas as pd
import h2o

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4"

In [3]:
import multiprocessing
print("CPU: ",multiprocessing.cpu_count())

import psutil
print("Memory: ",psutil.virtual_memory())

CPU:  72
Memory:  svmem(total=1622764929024, available=1422414950400, percent=12.3, used=194895474688, free=556070727680, active=910295629824, inactive=111481761792, buffers=0, cached=871798726656, shared=2571857920, slab=15486603264)


In [4]:
#h2o.init()
h2o.init(ip="127.0.0.1", max_mem_size_GB = 100, nthreads = 5)

Checking whether there is an H2O instance running at http://127.0.0.1:54321 . connected.


0,1
H2O cluster uptime:,12 mins 05 secs
H2O cluster timezone:,Europe/Istanbul
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.3
H2O cluster version age:,25 days
H2O cluster name:,H2O_from_python_96273_ugsf64
H2O cluster total nodes:,1
H2O cluster free memory:,88.3 Gb
H2O cluster total cores:,72
H2O cluster allowed cores:,5


In [5]:
hf_positive = h2o.import_file('dataset/train_true_positive_features.csv')
hf_negative = h2o.import_file('dataset/train_true_negative_features.csv')
hf = hf_positive.rbind(hf_negative)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
print("positive instances: ",hf_positive.shape)
print("negative instances: ",hf_negative.shape)
print("total instances: ",hf.shape)

positive instances:  (165179, 34)
negative instances:  (283367, 34)
total instances:  (448546, 34)


In [7]:
hf = hf[['vgg_cosine', 'vgg_euclidean_l2'
         , 'facenet_cosine', 'facenet_euclidean_l2'
         , 'openface_cosine', 'openface_euclidean_l2'
         , 'is_related']]

In [8]:
hf.head()

vgg_cosine,vgg_euclidean_l2,facenet_cosine,facenet_euclidean_l2,openface_cosine,openface_euclidean_l2,is_related
0.618396,1.11211,1.25131,1.58197,1.12544,1.50029,1
0.601191,1.09653,1.14205,1.51133,1.08315,1.47183,1
0.543063,1.04217,1.10449,1.48627,1.14981,1.51645,1
0.618544,1.11224,1.24833,1.58008,1.09367,1.47897,1
0.60665,1.1015,1.15115,1.51733,1.11618,1.49411,1
0.630702,1.12312,1.22153,1.56303,1.20384,1.55167,1
0.742856,1.2189,1.09549,1.48019,1.07032,1.4631,1
0.558462,1.05685,1.11984,1.49655,1.24634,1.57882,1
0.648851,1.13917,1.10773,1.48844,0.612283,1.1066,1
0.677681,1.1642,1.03272,1.43716,0.571711,1.06931,1




In [9]:
#convert target label to factor because this is a binary classification
#otherwise, there would be a regression problem
hf['is_related'] = hf['is_related'].asfactor()

In [10]:
#70% train, 15% test, 15% validation
train, test, validation = hf.split_frame(ratios=[0.70, 0.15], seed=17)

In [11]:
print("train set size: ",train.shape)
print("test set size: ", test.shape)
print("validation set size: ", validation.shape)

train set size:  (314122, 7)
test set size:  (67160, 7)
validation set size:  (67264, 7)


# Training

In [12]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [13]:
model = H2OGradientBoostingEstimator(
    ntrees = 1000
    , learn_rate = 0.01
    , stopping_rounds = 50
    , stopping_metric = "AUC"
)

In [14]:
model.train(x = hf.names[0:-1], y = hf.names[-1]
    , training_frame = train
    , validation_frame = test
    #, verbose = True
    , model_id = "GBM_Kinship"
)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [15]:
model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_Kinship


Model Summary: 

Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,1000.0,1000.0,268761.0,0.0,5.0,2.695,1.0,32.0,16.503




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.19740943929831983
RMSE: 0.44430782043344663
LogLoss: 0.5789539838842188
Mean Per-Class Error: 0.3333046491681597
AUC: 0.7276208150873205
pr_auc: 0.6123781904410922
Gini: 0.4552416301746409

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3082383667488498: 

Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,110415.0,87952.0,0.4434,(87952.0/198367.0)
1,1,27576.0,88179.0,0.2382,(27576.0/115755.0)
2,Total,137991.0,176131.0,0.3678,(115528.0/314122.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.308238,0.604202,254.0
1,max f2,0.152925,0.753347,352.0
2,max f0point5,0.459942,0.578745,170.0
3,max accuracy,0.501562,0.697356,148.0
4,max precision,0.928695,1.0,0.0
5,max recall,0.055597,1.0,398.0
6,max specificity,0.928695,1.0,0.0
7,max absolute_mcc,0.391396,0.325785,208.0
8,max min_per_class_accuracy,0.365508,0.663764,222.0
9,max mean_per_class_accuracy,0.369733,0.666695,220.0



Gains/Lift Table: Avg response rate: 36.85 %, avg score: 36.85 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010002,0.828712,2.402755,2.402755,0.885423,0.864628,0.885423,0.864628,0.024034,0.024034,140.275528,140.275528
1,,2,0.020002,0.786386,2.216042,2.309414,0.816619,0.805583,0.851027,0.83511,0.022159,0.046192,121.604221,130.94136
2,,3,0.030007,0.763294,2.10757,2.242111,0.776647,0.773547,0.826225,0.814583,0.021088,0.06728,110.756993,124.211096
3,,4,0.04,0.742632,2.086914,2.20334,0.769035,0.753464,0.811938,0.799314,0.020854,0.088134,108.691396,120.33395
4,,5,0.050003,0.719768,2.00719,2.164102,0.739656,0.731279,0.797479,0.785704,0.020077,0.108211,100.719025,116.410216
5,,6,0.100012,0.640659,1.847527,2.005805,0.68082,0.675509,0.739146,0.730603,0.092393,0.200605,84.752721,100.580461
6,,7,0.150002,0.578208,1.665916,1.892532,0.613895,0.608137,0.697404,0.68979,0.083279,0.283884,66.591561,89.253232
7,,8,0.200002,0.527343,1.504394,1.795499,0.554374,0.554116,0.661647,0.655872,0.075219,0.359103,50.439384,79.549924
8,,9,0.300024,0.455729,1.332786,1.64124,0.491136,0.489654,0.604802,0.600458,0.133307,0.492411,33.278565,64.12401
9,,10,0.400001,0.39681,1.156587,1.520105,0.426206,0.426881,0.560164,0.557074,0.115632,0.608043,15.658664,52.01047




ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.19726966691087394
RMSE: 0.444150500293396
LogLoss: 0.5786944223627535
Mean Per-Class Error: 0.33191619190721755
AUC: 0.727593424555288
pr_auc: 0.610124995970803
Gini: 0.45518684911057594

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3059424318662228: 

Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,23540.0,18949.0,0.446,(18949.0/42489.0)
1,1,5745.0,18926.0,0.2329,(5745.0/24671.0)
2,Total,29285.0,37875.0,0.3677,(24694.0/67160.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.305942,0.605187,253.0
1,max f2,0.152247,0.75321,353.0
2,max f0point5,0.460136,0.578742,166.0
3,max accuracy,0.50508,0.698392,142.0
4,max precision,0.908542,0.984127,3.0
5,max recall,0.06096,1.0,397.0
6,max specificity,0.928602,0.999976,0.0
7,max absolute_mcc,0.375693,0.326135,215.0
8,max min_per_class_accuracy,0.362964,0.666996,222.0
9,max mean_per_class_accuracy,0.36487,0.668084,221.0



Gains/Lift Table: Avg response rate: 36.73 %, avg score: 36.78 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010006,0.828643,2.369794,2.369794,0.870536,0.863653,0.870536,0.863653,0.023712,0.023712,136.979363,136.979363
1,,2,0.020116,0.785004,2.241124,2.305126,0.82327,0.804792,0.84678,0.83407,0.022658,0.04637,124.112442,130.512568
2,,3,0.030003,0.76251,2.086765,2.23317,0.766566,0.772705,0.820347,0.813848,0.020632,0.067002,108.676545,123.316975
3,,4,0.040009,0.741507,2.057872,2.189329,0.755952,0.752465,0.804243,0.798497,0.020591,0.087593,105.787207,118.932902
4,,5,0.05,0.717152,1.983857,2.148271,0.728763,0.729365,0.78916,0.784683,0.019821,0.107414,98.385658,114.827125
5,,6,0.100045,0.637059,1.882312,2.015232,0.691461,0.673227,0.740289,0.72893,0.0942,0.201613,88.231172,101.523211
6,,7,0.15003,0.576325,1.650202,1.893616,0.606196,0.605305,0.695613,0.687742,0.082486,0.284099,65.020161,89.361565
7,,8,0.2,0.526255,1.50712,1.797049,0.553635,0.552535,0.66014,0.65396,0.075311,0.35941,50.711951,79.704917
8,,9,0.300045,0.454845,1.333359,1.64244,0.489805,0.488514,0.603345,0.598795,0.133395,0.492805,33.335924,64.243984
9,,10,0.4,0.395776,1.150853,1.519598,0.422762,0.426238,0.558219,0.555675,0.115034,0.607839,15.085253,51.959791




Scoring History: 

Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2019-09-18 12:19:12,0.012 sec,0.0,0.482399,0.658154,0.5,0.0,1.0,0.631497,0.482083,0.657531,0.5,0.0,1.0,0.632653
1,,2019-09-18 12:19:13,1.333 sec,1.0,0.481705,0.656718,0.721281,0.569402,2.289071,0.359561,0.481386,0.656089,0.723266,0.57094,2.291923,0.35737
2,,2019-09-18 12:19:13,1.639 sec,2.0,0.481025,0.655313,0.721487,0.569547,2.289071,0.359561,0.480702,0.654678,0.723424,0.571011,2.291923,0.35737
3,,2019-09-18 12:19:14,1.841 sec,3.0,0.480356,0.653936,0.721588,0.589096,2.289071,0.359561,0.48003,0.653294,0.723559,0.589766,2.291923,0.35737
4,,2019-09-18 12:19:14,2.027 sec,4.0,0.479701,0.65259,0.722411,0.590812,2.288996,0.376733,0.479373,0.651944,0.724206,0.591456,2.293831,0.356358
5,,2019-09-18 12:19:14,2.209 sec,5.0,0.479057,0.65127,0.722372,0.590803,2.288996,0.371531,0.478725,0.650617,0.724238,0.591521,2.293831,0.356626
6,,2019-09-18 12:19:14,2.410 sec,6.0,0.478426,0.649978,0.722452,0.590815,2.288996,0.371149,0.478091,0.649321,0.724261,0.59145,2.293831,0.356358
7,,2019-09-18 12:19:14,2.603 sec,7.0,0.477805,0.648711,0.722552,0.590854,2.292176,0.376284,0.477467,0.648047,0.724298,0.591508,2.304677,0.356313
8,,2019-09-18 12:19:15,2.801 sec,8.0,0.477197,0.647471,0.722574,0.590964,2.29266,0.375908,0.476856,0.646803,0.724324,0.591713,2.304677,0.356313
9,,2019-09-18 12:19:15,2.995 sec,9.0,0.476599,0.646254,0.72271,0.591004,2.29266,0.37637,0.476255,0.645579,0.724557,0.591808,2.304677,0.356313



See the whole table with table.as_data_frame()

Variable Importances: 

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,facenet_euclidean_l2,261705.15625,1.0,0.468295
1,facenet_cosine,162436.65625,0.620686,0.290664
2,vgg_euclidean_l2,73451.710938,0.280666,0.131434
3,vgg_cosine,34810.042969,0.133012,0.062289
4,openface_cosine,15869.265625,0.060638,0.028396
5,openface_euclidean_l2,10574.509766,0.040406,0.018922




# Model Performance

In [16]:
val_perf = model.model_performance(validation)

In [17]:
val_perf.auc()

0.7225285769297011

In [18]:
val_perf.accuracy()[0][1]

0.6940562559467174

In [19]:
test_perf = model.model_performance(test)

In [20]:
test_perf.auc()

0.727593424555288

In [21]:
test_perf.accuracy()[0][1]

0.6983918999404407

# Predictions

In [22]:
predictions = model.predict(test_data = validation)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [23]:
predictions.tail()

predict,p0,p1
1,0.462883,0.537117
0,0.815415,0.184585
1,0.48551,0.51449
1,0.322137,0.677863
1,0.4286,0.5714
0,0.85153,0.14847
0,0.909476,0.0905237
1,0.336492,0.663508
1,0.631113,0.368887
0,0.857651,0.142349




In [24]:
predictions_pd = predictions.as_data_frame()

In [25]:
predictions_pd.iloc[0]

predict    0.000000
p0         0.886282
p1         0.113718
Name: 0, dtype: float64

# Store model

In [26]:
saved_model = h2o.save_model(model, path = "", force=True)

In [27]:
saved_model

'/outputs/sefik/kinship/GBM_Kinship'

# Load the best model

In [28]:
#you can directly load the best model by running the load_model command
restored_model = h2o.load_model(saved_model)