In [7]:
import os
import numpy as np
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
import xgboost

# Raise both pandas display limits to 200 so wide/long frames print in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

# Start (or connect to) the local H2O cluster with a 12 GB memory cap.
h2o.init(max_mem_size='12G')

# Data loading
# Parquet inputs per dataset: the lower-case dataset name maps to the target
# file and the two feature files stored under the upper-case directory.
data_files = {
    directory.lower(): {
        'train': f'{directory}/train_targets.parquet',
        'estimated': f'{directory}/X_train_estimated.parquet',
        'observed': f'{directory}/X_train_observed.parquet',
    }
    for directory in ('A', 'B', 'C')
}


def _prepare_weather(frame):
    """Return a copy of a weather frame, time-sorted and forward-filled."""
    # assign() builds a new frame, so the caller's dataframe is never modified.
    prepared = frame.assign(date_forecast=pd.to_datetime(frame['date_forecast']))
    # One ffill over every column replaces the separate boolean/non-boolean
    # passes: together they covered all columns with the same fill method.
    return prepared.sort_values(by='date_forecast').ffill()


def _resample_hourly(frame, boolean_features, estimated_flag):
    """Average a weather frame into 1-hour bins and tag it with `estimated`."""
    hourly = (
        frame.set_index('date_forecast')
        .resample('1h')
        .mean()
        .dropna(how='all')  # hours with no data at all carry no information
        .reset_index()
    )
    # Averaging turns 0/1 flags into fractions; snap them back to 0 or 1.
    hourly[boolean_features] = hourly[boolean_features].round(0)
    hourly['estimated'] = estimated_flag
    return hourly


def _add_time_features(frame):
    """Add hour/month columns and their cyclic sin/cos encodings in place."""
    timestamps = frame['date_forecast']
    frame['hour'] = timestamps.dt.hour
    frame['sin_hour'] = np.sin(2 * np.pi * frame['hour'] / 24)
    frame['cos_hour'] = np.cos(2 * np.pi * frame['hour'] / 24)
    frame['month'] = timestamps.dt.month
    frame['sin_month'] = np.sin(2 * np.pi * frame['month'] / 12)
    frame['cos_month'] = np.cos(2 * np.pi * frame['month'] / 12)
    return frame


def preprocess_data(targets, observed, estimated, test=None):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.

    The input dataframes are not modified; all work happens on copies.

    Parameters:
    - targets: Target dataframe with 'time' and 'pv_measurement' columns.
    - observed: Dataframe with observed features and a 'date_forecast' column.
    - estimated: Dataframe with estimated features and a 'date_forecast' column.
    - test: Optional dataframe with test features and a 'date_forecast' column.
      When omitted, the function runs in training-only mode (see Returns).

    All weather frames must contain the 'snow_density:kgm3' and
    'ceiling_height_agl:m' columns, and every 0/1-valued column found in
    `observed` must also exist in the other weather frames.

    Returns:
    - A tuple `(merged_data, test_resampled)`.
      * With `test` given: `merged_data` holds the hourly training features
        (target column dropped) and `test_resampled` the hourly test features.
      * With `test` omitted: `merged_data` additionally keeps the
        'pv_measurement' target column so the frame can be used directly for
        supervised fitting, and `test_resampled` is None.
    """
    targets = targets.assign(time=pd.to_datetime(targets['time'])).sort_values(by='time')
    observed = _prepare_weather(observed)
    estimated = _prepare_weather(estimated)

    # Columns whose non-missing values are all 0 or 1 are treated as flags.
    # The timestamp column is skipped explicitly; it can never be a flag.
    boolean_features = [
        col for col in observed.columns
        if col != 'date_forecast' and observed[col].dropna().isin([0.0, 1.0]).all()
    ]

    # Stack observed (estimated=0) and estimated (estimated=1) hourly weather.
    weather_data = pd.concat([
        _resample_hourly(observed, boolean_features, 0),
        _resample_hourly(estimated, boolean_features, 1),
    ])

    # Keep only the hours that have both a target row and weather data.
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')
    merged_data = _add_time_features(merged_data)

    training_only = test is None
    dropped = ['time', 'date_forecast', 'snow_density:kgm3']
    if not training_only:
        # NOTE(review): in this mode the target is discarded and not returned
        # separately (original behaviour, kept for existing callers).
        dropped.append('pv_measurement')
    merged_data = merged_data.drop(columns=dropped)

    # Assign the filled column back: in-place fillna on a column selection
    # does not reliably update the parent frame under pandas copy-on-write.
    merged_data['ceiling_height_agl:m'] = merged_data['ceiling_height_agl:m'].fillna(0)

    if training_only:
        return merged_data, None

    # Test weather is forecast data, hence flagged estimated=1.
    test_resampled = _resample_hourly(_prepare_weather(test), boolean_features, 1)
    test_resampled = _add_time_features(test_resampled)
    test_resampled = test_resampled.drop(columns=['date_forecast', 'snow_density:kgm3'])
    test_resampled['ceiling_height_agl:m'] = test_resampled['ceiling_height_agl:m'].fillna(0)

    return merged_data, test_resampled


# Validation MAE of each dataset's AutoML leader model, keyed by dataset name.
results = {}

try:
    for dataset_name, file_paths in data_files.items():
        train_data = pd.read_parquet(file_paths['train'])
        estimated_data = pd.read_parquet(file_paths['estimated'])
        observed_data = pd.read_parquet(file_paths['observed'])

        # NOTE(review): this call passes no test frame and the code below reads
        # the "pv_measurement" column from the returned frame, so
        # preprocess_data must support both - confirm against its definition.
        X_train, _ = preprocess_data(train_data, observed_data, estimated_data)

        # Convert pandas dataframe to H2OFrame
        X_train_h2o = h2o.H2OFrame(X_train)

        # Specify target and predictor columns
        y = "pv_measurement"
        X = [col for col in X_train_h2o.columns if col != y]

        # Split data into training and validation sets (80% train, 20% validation)
        train, val = X_train_h2o.split_frame(ratios=[.8], seed=42)

        # Run H2O AutoML on the training split only; `val` is not passed to
        # AutoML and is used solely for the evaluation below.
        aml = H2OAutoML(max_runtime_secs=300, max_models=20, seed=42, stopping_metric='MAE', sort_metric='MAE')
        aml.train(x=X, y=y, training_frame=train)

        # Validate the leader model on the held-out split
        performance = aml.leader.model_performance(val)
        mae = performance.mae()
        results[dataset_name] = mae

        print(f"Dataset {dataset_name} - Mean Absolute Error on validation set: {mae:.2f}")
finally:
    # Shutdown H2O cluster when finished, even if a dataset failed part-way,
    # so the cluster's memory is always released.
    h2o.cluster().shutdown()


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,6 mins 00 secs
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.3
H2O_cluster_version_age:,1 month and 10 days
H2O_cluster_name:,H2O_from_python_sigurdskatvedt_bck8fk
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,11.03 Gb
H2O_cluster_total_cores:,10
H2O_cluster_allowed_cores:,10


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().reset_index()


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
13:55:49.341: AutoML: XGBoost is not available; skipping it.
13:55:49.343: _train param, Dropping bad and constant columns: [snow_drift:idx, elevation:m]

█████████████
13:56:45.318: XRT_1_AutoML_3_20231003_135549 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

██
13:56:50.294: _train param, Dropping bad and constant columns: [snow_drift:idx, elevation:m]

████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Dataset a - Mean Absolute Error on validation set: 168.61


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().reset_index()


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
14:00:53.76: AutoML: XGBoost is not available; skipping it.
14:00:53.78: _train param, Dropping bad and constant columns: [elevation:m]

████████████
14:01:46.544: XRT_1_AutoML_4_20231003_140053 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

████
14:01:55.954: _train param, Dropping bad and constant columns: [elevation:m]

███████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Dataset b - Mean Absolute Error on validation set: 25.66


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().reset_index()


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
14:05:56.393: AutoML: XGBoost is not available; skipping it.
14:05:56.394: _train param, Dropping bad and constant columns: [elevation:m]

████████████
14:06:37.973: XRT_1_AutoML_5_20231003_140556 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

██
14:06:42.232: _train param, Dropping bad and constant columns: [elevation:m]

█████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Dataset c - Mean Absolute Error on validation set: 15.92
H2O session _sid_9e13 closed.
