In [1]:
%load_ext autoreload

In [2]:
import h2o
import os
import pandas as pd
from h2o.estimators import *
from h2o.grid import *
from h2o.automl import H2OAutoML

In [3]:
## Read data ##
def _read_location(location):
    """Load the four parquet tables stored under data/<location>/parquet/.

    Returns a tuple in the order
    (train_targets, X_test_estimated, X_train_estimated, X_train_observed).
    """
    folder = f'data/{location}/parquet'
    table_names = (
        'train_targets',
        'X_test_estimated',
        'X_train_estimated',
        'X_train_observed',
    )
    return tuple(pd.read_parquet(f'{folder}/{table}.parquet') for table in table_names)


# One call per location; the unpacking order matches the tuple documented above.
A_train_target, A_test, A_train_estimated, A_train_observed = _read_location('A')
B_train_target, B_test, B_train_estimated, B_train_observed = _read_location('B')
C_train_target, C_test, C_train_estimated, C_train_observed = _read_location('C')
###############

## Concatenate train data ##
# For each location: stack the observed rows on top of the estimated rows,
# use 'date_forecast' as the index, and discard the 'date_calc' column.
A_train_concat = (
    pd.concat([A_train_observed, A_train_estimated], axis=0)
    .set_index('date_forecast')
    .drop(columns=['date_calc'])
)
B_train_concat = (
    pd.concat([B_train_observed, B_train_estimated], axis=0)
    .set_index('date_forecast')
    .drop(columns=['date_calc'])
)
C_train_concat = (
    pd.concat([C_train_observed, C_train_estimated], axis=0)
    .set_index('date_forecast')
    .drop(columns=['date_calc'])
)
############################

## Format test data to same as train data ##
# Same two steps as for the train frames: index by 'date_forecast', then
# remove the 'date_calc' column.
A_test = A_test.set_index('date_forecast').drop(columns=['date_calc'])
B_test = B_test.set_index('date_forecast').drop(columns=['date_calc'])
C_test = C_test.set_index('date_forecast').drop(columns=['date_calc'])
############################################

## Index train_target by its 'time' column ##
# The 'time' column replaces the default index so the targets can later be
# matched against the feature frames by index.
# NOTE(review): presumably 'time' holds timestamps comparable to
# 'date_forecast' -- confirm.
A_train_target = A_train_target.set_index('time')
B_train_target = B_train_target.set_index('time')
C_train_target = C_train_target.set_index('time')
#####################################################

## Add 'pv_measurement' column from train_target to train data ##
# The target series is aligned on the index when it is attached, so feature
# rows without a matching target entry get NaN; those rows are then dropped.
A_train_concat = (
    A_train_concat
    .assign(pv_measurement=A_train_target['pv_measurement'])
    .dropna(subset=['pv_measurement'])
)
B_train_concat = (
    B_train_concat
    .assign(pv_measurement=B_train_target['pv_measurement'])
    .dropna(subset=['pv_measurement'])
)
C_train_concat = (
    C_train_concat
    .assign(pv_measurement=C_train_target['pv_measurement'])
    .dropna(subset=['pv_measurement'])
)
###################################################################

## Check that every index in train_concat is in train_target ##
# Trimming may only be skipped when EVERY location already has as many target
# rows as feature rows. Joining the three comparisons with `or` would let a
# single matching location skip the trimming for the other two.
if all(
    len(features.index) == len(target.index)
    for features, target in (
        (A_train_concat, A_train_target),
        (B_train_concat, B_train_target),
        (C_train_concat, C_train_target),
    )
):
    print('Same length')
else:
    # Remove indices in train_target that are not in train_concat. For a
    # location whose frames already match row-for-row this changes nothing.
    A_train_target = A_train_target[A_train_target.index.isin(A_train_concat.index)]
    B_train_target = B_train_target[B_train_target.index.isin(B_train_concat.index)]
    C_train_target = C_train_target[C_train_target.index.isin(C_train_concat.index)]

    # Report if any location is still misaligned after trimming.
    if (
        len(A_train_concat.index) != len(A_train_target.index)
        or len(B_train_concat.index) != len(B_train_target.index)
        or len(C_train_concat.index) != len(C_train_target.index)
    ):
        print('Not same length')
################################################################

In [4]:
# Attach this session to an H2O cluster before any H2OFrame is created.
# In the recorded run an instance was already running at
# http://localhost:54321, so the call connected to it.
# NOTE(review): presumably a local instance is started when none is found -- confirm.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,3 hours 4 mins
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.4
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_stinky_ixpoqx
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.806 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [5]:
# Convert location A's pandas tables to H2OFrames.
# A_train_concat still carries the 'pv_measurement' column at this point, so
# X_train contains the target alongside the features.
# NOTE(review): presumably the pandas index (the timestamps) is not carried
# into the H2OFrame -- confirm.
X_train = h2o.H2OFrame(A_train_concat)
y_train = h2o.H2OFrame(A_train_target)
X_test = h2o.H2OFrame(A_test)



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
# AutoML configuration: build at most 10 models, use a fixed seed, and use MAE
# as the stopping metric.
automl = H2OAutoML(
    max_models=10,
    seed=1,
    stopping_metric='MAE',
)