# H2O Reduced Dset (Python)

In [28]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,1 month and 1 day
H2O_cluster_name:,H2O_from_python_lukeswaby_petts_4tc1tc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [29]:
train = h2o.import_file("../Data/Reduced/train_reduced_dset.csv")
test = h2o.import_file("../Data/Reduced/test_reduced_dset.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [30]:
# Rename cols
cols = train.columns[:-1] + ['Dive']
train.set_names(cols) 
test.set_names(cols)

# Set factors
train['Dive'] = train['Dive'].asfactor()
test['Dive'] = test['Dive'].asfactor()

In [32]:
# Build and train the model:
dl = H2ODeepLearningEstimator(distribution = "bernoulli",
                              hidden = [200, 200],
                              #nfolds = 9,
                              #keep_cross_validation_predictions = True,
                              epochs = 50,
                              train_samples_per_iteration = -1,
                              activation = "RectifierWithDropout",
                              input_dropout_ratio = 0.2,
                              hidden_dropout_ratios = [0.2, 0.2],
                              single_node_mode = False,
                              balance_classes = False,
                              force_load_balance = False,
                              seed = 23123,
                              score_training_samples = 0,
                              score_validation_samples = 0,
                              training_frame = train,
                              stopping_rounds = 0)

dl.train(y="Dive",
         training_frame=train)


# Eval performance:
perf = dl.model_performance(test)

# Generate predictions on a test set (if necessary):
pred = dl.predict(test)

deeplearning Model Build progress: |██████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%


In [46]:
print('Accuracy:')
print(perf.accuracy())
print(perf.confusion_matrix())

Accuracy:
[[0.9082240503898414, 0.9857304643261608]]

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.8931861878663852: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,2327.0,44.0,0.0186,(44.0/2371.0)
1,1,19.0,2025.0,0.0093,(19.0/2044.0)
2,Total,2346.0,2069.0,0.0143,(63.0/4415.0)





# Cross-validation

## 9-fold

**Imports**

In [None]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator
h2o.init()

**Load data**

In [64]:
data = h2o.import_file('../Data/Reduced/reduced_dset_wnames.csv', header=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%


**Set factors:**

In [66]:
data['Dive'] = data['Dive'].asfactor()
data['BirdID'] = data['BirdID'].asfactor()

**Train model**

In [70]:
# Build and train the model:
dl_cross = H2ODeepLearningEstimator(distribution = "bernoulli",
                                  hidden = [200, 200],
                                  nfolds = 9,
                                    keep_cross_validation_models = True,
                                    keep_cross_validation_fold_assignment = True,
                                  keep_cross_validation_predictions = True,
                                  score_each_iteration = True,
                                  epochs = 50,
                                  train_samples_per_iteration = -1,
                                  activation = "RectifierWithDropout",
                                  input_dropout_ratio = 0.2,
                                  hidden_dropout_ratios = [0.2, 0.2],
                                  single_node_mode = False,
                                  balance_classes = False,
                                  force_load_balance = False,
                                  seed = 23123,
                                  score_training_samples = 0,
                                  score_validation_samples = 0,
                                  stopping_rounds = 0)

dl_cross.train(x = data.columns[:-2],
               y="Dive",
               training_frame=data)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


**Performance Summary**

In [90]:
dl_cross.cross_validation_metrics_summary()


Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid
0,accuracy,0.98221755,0.0036953602,0.9772821,0.9855257,0.9843816,0.98315334,0.97858673,0.98202056,0.9774756,0.98383087,0.9877014
1,auc,0.9962182,0.0013032363,0.9937787,0.99716914,0.9975272,0.99621654,0.99481547,0.995843,0.99624276,0.99646497,0.99790597
2,aucpr,0.99447197,0.0021278437,0.9900367,0.99525356,0.9970305,0.9954048,0.9923913,0.9948403,0.99376196,0.99543667,0.9960917
3,err,0.017782452,0.0036953602,0.022717886,0.014474245,0.015618404,0.016846653,0.021413276,0.017979452,0.022524437,0.016169155,0.012298558
4,err_count,42.0,8.87412,55.0,34.0,37.0,39.0,50.0,42.0,53.0,39.0,29.0
5,f0point5,0.9755011,0.00606308,0.9671472,0.9808871,0.97805905,0.9801012,0.9695088,0.975161,0.9669052,0.980961,0.9807796
6,f1,0.9812667,0.0039076456,0.9767245,0.9846154,0.98428875,0.98293215,0.9770009,0.98057353,0.97607225,0.9822969,0.9868956
7,f2,0.98711544,0.0030408674,0.9864934,0.9883721,0.99059826,0.9857795,0.98460966,0.9860465,0.98541474,0.9836364,0.9930884
8,lift_top_group,2.1285,0.06192698,2.0834768,2.1393442,2.0334764,2.0360599,2.1761417,2.1811392,2.1587155,2.1947224,2.1534247
9,logloss,0.08311191,0.014411614,0.105651684,0.07073687,0.08370941,0.07751018,0.09645968,0.08591821,0.084689535,0.0877606,0.055571027



See the whole table with table.as_data_frame()




**Save Model**

In [93]:
model_path = h2o.save_model(model=dl_cross, path="../Data/Reduced/h2o/", force=True)

**Load model**

In [87]:
saved_model = h2o.load_model(model_path)

## Leave-one-out

In [91]:
# Build and train the model:
dl_cross = H2ODeepLearningEstimator(distribution = "bernoulli",
                                  hidden = [200, 200],
                                  fold_column = 'BirdID',
                                    keep_cross_validation_models = True,
                                    keep_cross_validation_fold_assignment = True,
                                  keep_cross_validation_predictions = True,
                                  score_each_iteration = True,
                                  epochs = 50,
                                  train_samples_per_iteration = -1,
                                  activation = "RectifierWithDropout",
                                  input_dropout_ratio = 0.2,
                                  hidden_dropout_ratios = [0.2, 0.2],
                                  single_node_mode = False,
                                  balance_classes = False,
                                  force_load_balance = False,
                                  seed = 23123,
                                  score_training_samples = 0,
                                  score_validation_samples = 0,
                                  stopping_rounds = 0)

dl_cross.train(x = data.columns[:-2],
               y="Dive",
               training_frame=data)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [92]:
dl_cross.cross_validation_metrics_summary()


Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid
0,accuracy,0.98003316,0.0075694234,0.977167,0.98534954,0.9799427,0.96488297,0.987931,0.9798573,0.989048,0.9735382,0.9825815
1,auc,0.9954797,0.0025993083,0.99428177,0.9980365,0.99615353,0.9914211,0.9970044,0.99660623,0.9979576,0.9912259,0.9966304
2,aucpr,0.99405015,0.0031189604,0.9939919,0.9977276,0.99383724,0.9867858,0.9948635,0.99601406,0.99579054,0.9923573,0.9950833
3,err,0.01996686,0.0075694234,0.02283298,0.014650482,0.020057306,0.035117056,0.0120689655,0.020142678,0.01095198,0.0264618,0.01741849
4,err_count,47.22222,18.226202,54.0,35.0,49.0,84.0,28.0,48.0,26.0,62.0,39.0
5,f0point5,0.975084,0.0063400855,0.970696,0.9786689,0.9727867,0.96146786,0.9820949,0.9784425,0.98090786,0.9768126,0.9738785
6,f1,0.97891504,0.008130406,0.9763158,0.9849721,0.9782319,0.96146786,0.98739874,0.97991633,0.9872549,0.97327584,0.981402
7,f2,0.9828012,0.010945012,0.98200107,0.99135697,0.98373836,0.96146786,0.9927602,0.9813946,0.99368465,0.96976465,0.9890427
8,lift_top_group,2.1315827,0.10932507,2.0947742,2.0737848,2.1910315,2.1944954,2.1071754,1.9991611,2.3528247,2.0077121,2.163285
9,logloss,0.09231399,0.03225877,0.112317584,0.052325927,0.10662608,0.13830924,0.07061862,0.07900394,0.06314087,0.1375733,0.07091036



See the whole table with table.as_data_frame()




**Save Model**

In [None]:
model_path = h2o.save_model(model=dl_cross, path="../Data/Reduced/h2o/", force=True)

**Load Model**

In [4]:
import h2o
h2o.init()
saved_model = h2o.load_model('../Data/Reduced/h2o_bird_factor_cross_val_model')

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,3 hours 10 mins
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,1 month and 11 days
H2O_cluster_name:,H2O_from_python_lukeswaby_petts_shf7lv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [5]:
saved_model.confusion_matrix()


Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.8811137866967023: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,11258.0,0.0,0.0,(0.0/11258.0)
1,1,0.0,9990.0,0.0,(0.0/9990.0)
2,Total,11258.0,9990.0,0.0,(0.0/21248.0)




**Shutdown model**

In [94]:
h2o.shutdown()

  """Entry point for launching an IPython kernel.


H2O session _sid_ae60 closed.


# Larger Dset

In [1]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "15.0.2" 2021-01-19; Java(TM) SE Runtime Environment (build 15.0.2+7-27); Java HotSpot(TM) 64-Bit Server VM (build 15.0.2+7-27, mixed mode, sharing)
  Starting server from /Users/lukeswaby-petts/Desktop/CMEE/PROJECT/Code/dl_venv/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/td/7hq_f4m975gb353c15zk2f800000gn/T/tmpsg6g5csv
  JVM stdout: /var/folders/td/7hq_f4m975gb353c15zk2f800000gn/T/tmpsg6g5csv/h2o_lukeswaby_petts_started_from_python.out
  JVM stderr: /var/folders/td/7hq_f4m975gb353c15zk2f800000gn/T/tmpsg6g5csv/h2o_lukeswaby_petts_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,1 month and 23 days
H2O_cluster_name:,H2O_from_python_lukeswaby_petts_wxa0r5
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [3]:
data = h2o.import_file('../Data/Reduced/reduced_all_dives.csv', header=1)

Parse progress: |███████████████████████████████████████ (failed)


OSError: Job with key $03017f00000132d4ffffffff$_a396ce9e9d7f9d55c0299c7284074b51 failed with an exception: DistributedException from /127.0.0.1:54321: 'Java heap space', caused by java.lang.OutOfMemoryError: Java heap space
stacktrace: 
DistributedException from /127.0.0.1:54321: 'Java heap space', caused by java.lang.OutOfMemoryError: Java heap space
	at water.MRTask.getResult(MRTask.java:654)
	at water.MRTask.getResult(MRTask.java:664)
	at water.MRTask.doAll(MRTask.java:549)
	at water.parser.ParseDataset.parseAllKeys(ParseDataset.java:254)
	at water.parser.ParseDataset.access$000(ParseDataset.java:26)
	at water.parser.ParseDataset$ParserFJTask.compute2(ParseDataset.java:203)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1610)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:974)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1477)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)
Caused by: java.lang.OutOfMemoryError: Java heap space


In [None]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator
h2o.init()

data = h2o.import_file('../Data/Reduced/reduced_all_dives.csv', header=1)

data['Dive'] = data['Dive'].asfactor()
data['BirdID'] = data['BirdID'].asfactor()

# Build and train the model:
dl_cross = H2ODeepLearningEstimator(distribution = "bernoulli",
                                    fold_column = 'BirdID',
                                    keep_cross_validation_models = True,
                                    keep_cross_validation_fold_assignment = True,
                                    keep_cross_validation_predictions = True,
                                    score_each_iteration = True,
                                    epochs = 50,
                                    train_samples_per_iteration = -1,
                                    activation = "RectifierWithDropout",
                                    input_dropout_ratio = 0.2,
                                    hidden_dropout_ratios = [0.2, 0.2],
                                    single_node_mode = False,
                                    balance_classes = False,
                                    force_load_balance = False,
                                    seed = 23123,
                                    score_training_samples = 0,
                                    score_validation_samples = 0,
                                    stopping_rounds = 0)

dl_cross.train(x = data.columns[:-2],
               y="Dive",
               training_frame=data)

In [None]:
dl_cross.cross_validation_metrics_summary()

In [None]:
model_path = h2o.save_model(model=dl_cross, path="../Data/Reduced/h2o/", force=True)