## Environment Setup and Imports
**Purpose**: Initialize the runtime by importing all required libraries and configuring basic settings (e.g., warnings, device selection).

In [4]:
import gc
import json
import os
import time
import pickle
import warnings
import itertools
import shap
import cudf
import numpy as np
from cuml.ensemble import RandomForestRegressor as cuRF
from cuml.linear_model import Ridge, ElasticNet
import cupy as cp
from lightgbm import LGBMRegressor
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import xgboost as xgb
import itertools
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    median_absolute_error,
    explained_variance_score
)

warnings.filterwarnings("ignore", category=DeprecationWarning)

## Data Loading and Preprocessing

**Purpose**: Read training and test datasets from Parquet files, apply scaling, and add a constant column for OLS modeling.

In [None]:
train_path = '/data/workspace_files/train_nov/fourth_week_df.parquet'
test_path1 = '/data/workspace_files/test_dec/first_week_df.parquet'
test_path2 = '/data/workspace_files/test_dec/second_week_df.parquet'

scaler = StandardScaler()

X_train = pd.read_parquet(train_path, columns=predictor_cols)
y_train = pd.read_parquet(train_path, columns=['px_imp_5'])

X_test1 = pd.read_parquet(test_path1, columns=predictor_cols)
y_test1 = pd.read_parquet(test_path1, columns=['px_imp_5'])

X_test2 = pd.read_parquet(test_path2, columns=predictor_cols)
y_test2 = pd.read_parquet(test_path2, columns=['px_imp_5'])

X_scaled = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_scaled, columns=X_train.columns, index=X_train.index)
X_train = sm.add_constant(X_train)

X_test1_scaled = scaler.transform(X_test1)
X_test1 = pd.DataFrame(X_test1_scaled, columns=X_test1.columns, index=X_test1.index)
X_test1 = sm.add_constant(X_test1)

X_test2_scaled = scaler.transform(X_test2)
X_test2 = pd.DataFrame(X_test2_scaled, columns=X_test2.columns, index=X_test2.index)
X_test2 = sm.add_constant(X_test2)

## OLS Model Training and Evaluation
**Purpose**: Fit an Ordinary Least Squares (OLS) regression model using the preprocessed data, evaluate its performance, and save the computed metrics.

In [None]:
ols_final = sm.OLS(y_train, X_train).fit(cov_type='HC3')
ols_final.summary()

In [None]:
metrics_save_dir = '/data/workspace_files/Metrics/OLS'

# Evaluate the model (test data is loaded on demand within evaluate_gbt_model).
y_pred1 = ols_final.predict(X_test1)
metrics1 = compute_regression_metrics(y_test1.values, y_pred1)

# Build a filename for saving metrics that includes all hyperparameters.
metrics_filename = os.path.join(
    metrics_save_dir, 
    f"metrics_final_model_ols_dec1.json"
)
with open(metrics_filename, 'w') as f:
    json.dump(metrics1, f, indent=4)

# Evaluate the model (test data is loaded on demand within evaluate_gbt_model).
y_pred2 = ols_final.predict(X_test2)
metrics2 = compute_regression_metrics(y_test2.values, y_pred2)

# Build a filename for saving metrics that includes all hyperparameters.
metrics_filename = os.path.join(
    metrics_save_dir, 
    f"metrics_final_model_ols_dec2.json"
)
with open(metrics_filename, 'w') as f:
    json.dump(metrics2, f, indent=4)

## Model Function Definitions

**Purpose**: Purpose: Define reusable functions for training and evaluating various models (Ridge, ElasticNet, RandomForest, Gradient Boosted Trees, and XGBoost) that load data on demand, perform computations on GPU (if available), and free memory after use.

In [25]:

def compute_regression_metrics(y_true, y_pred):
    """
    Computeregression metrics and return as a dictionary.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    evs = explained_variance_score(y_true, y_pred)
    
    metrics = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'Median Absolute Error': medae,
        'Explained Variance': evs
    }
    
    return metrics

In [16]:
# Ridge

def train_ridge_model(train_path, predictor_cols, target_col, alpha):
    """
    Load training data from disk into GPU memory on demand, train a GPU-accelerated
    Ridge model, and free the GPU training data.
    """
    # Load training data directly into GPU memory.
    X_train_gpu = cudf.read_parquet(train_path, columns=predictor_cols)
    y_train_gpu = cudf.read_parquet(train_path, columns=[target_col])
    
    # Training phase on GPU.
    model = Ridge(alpha=alpha)
    model.fit(X_train_gpu, y_train_gpu)
    
    # Free GPU training data.
    del X_train_gpu, y_train_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    return model


def evaluate_ridge_model(model, test_path, predictor_cols, target_col):
    """
    Load test data from disk into GPU memory for features, load target to CPU,
    run prediction on GPU, transfer predictions to CPU, compute regression metrics,
    and free GPU memory.
    """
    # Load test features directly into GPU and target into CPU.
    X_test_gpu = cudf.read_parquet(test_path, columns=predictor_cols)
    y_test_cpu = pd.read_parquet(test_path, columns=[target_col])
    
    # Prediction phase on GPU.
    y_pred_gpu = model.predict(X_test_gpu)
    
    # Free test features from GPU.
    del X_test_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    # Transfer predictions from GPU to CPU.
    y_pred_cpu = cp.asnumpy(y_pred_gpu)
    del y_pred_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    # Compute regression metrics on CPU.
    metrics = compute_regression_metrics(y_test_cpu.values, y_pred_cpu)
    
    # Clean up CPU objects.
    del y_test_cpu, y_pred_cpu
    gc.collect()
    
    return metrics

In [17]:
# Elastic Net

def train_elastic_net_model(train_path, predictor_cols, target_col, alpha, l1_ratio):
    """
    Load training data from disk into GPU memory on demand, train a GPU-accelerated
    ElasticNet model, and free the GPU training data.
    """
    # Load training data directly into GPU memory.
    X_train_gpu = cudf.read_parquet(train_path, columns=predictor_cols)
    y_train_gpu = cudf.read_parquet(train_path, columns=[target_col])
    
    # Training phase on GPU.
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X_train_gpu, y_train_gpu)
    
    # Free GPU training data.
    del X_train_gpu, y_train_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    return model


def evaluate_elastic_net_model(model, test_path, predictor_cols, target_col):
    """
    Load test data from disk into GPU memory for features, load target into CPU,
    run prediction on GPU, transfer predictions to CPU, compute regression metrics,
    and free GPU memory.
    """
    # Load test features into GPU and target into CPU.
    X_test_gpu = cudf.read_parquet(test_path, columns=predictor_cols)
    y_test_cpu = pd.read_parquet(test_path, columns=[target_col])
    
    # Prediction phase on GPU.
    y_pred_gpu = model.predict(X_test_gpu)
    
    # Free test features from GPU.
    del X_test_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    # Transfer predictions from GPU to CPU.
    y_pred_cpu = cp.asnumpy(y_pred_gpu)
    del y_pred_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    # Compute regression metrics on CPU.
    metrics = compute_regression_metrics(y_test_cpu.values, y_pred_cpu)
    
    # Clean up CPU objects.
    del y_test_cpu, y_pred_cpu
    gc.collect()
    
    return metrics

In [18]:
# RandomForest 

def train_rf_model(train_path, predictor_cols, target_col, n_estimators, max_depth, max_features, max_leaves):
    """
    Load training data from disk into GPU memory on demand, train a GPU-accelerated 
    RandomForest model, and then free the GPU training data.
    """
    # --- Load training data directly into GPU memory ---
    # with open(train_path, 'rb') as f:
    #     X_train_gpu = cudf.read_parquet(f, columns=predictor_cols, engine='pyarrow').astype('float32')
    #     y_train_gpu = cudf.read_parquet(f, columns=[target_col], engine='pyarrow').astype('float32')
    
    X_train_gpu = cudf.read_parquet(train_path, columns=predictor_cols).astype('float32')
    y_train_gpu = cudf.read_parquet(train_path, columns=[target_col]).astype('float32')
    
    # --- Training Phase on GPU ---
    model = cuRF(n_estimators=n_estimators, max_depth=max_depth, random_state=42, n_streams=1)
    model.fit(X_train_gpu, y_train_gpu)
    
    # Free training data from GPU immediately.
    del X_train_gpu, y_train_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    return model


def evaluate_rf_model(model, test_path, predictor_cols, target_col):
    """
    Load test data from disk directly into GPU memory, run prediction with the model,
    then transfer predictions (and ground truth) to the CPU to compute regression metrics.
    """
    # --- Load test data directly into GPU and CPU memory ---
    X_test_gpu = cudf.read_parquet(test_path, columns=predictor_cols).astype('float32')
    y_test_cpu = pd.read_parquet(test_path, columns=[target_col])
    
    # --- Prediction Phase on GPU ---
    y_pred_gpu = model.predict(X_test_gpu)
    
    # Free test features from GPU immediately.
    del X_test_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    # --- Transfer predictions and ground truth to CPU ---
    y_pred_cpu = y_pred_gpu.to_pandas()
    del y_pred_gpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    # --- Compute regression metrics on CPU ---
    metrics = compute_regression_metrics(y_test_cpu.values, y_pred_cpu.values)
    
    # Clean up CPU objects.
    del y_test_cpu, y_pred_cpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    return metrics

In [19]:
# Gradient Boosted Trees

def train_gbt_model(train_path, predictor_cols, target_col, 
                    boosting_type, n_estimators, max_depth, learning_rate, num_leaves):
    """
    Load training data from disk into GPU memory on demand, train a LightGBM gradient boosted trees model
    with GPU support, and free the GPU training data.
    """
    # Load training data into GPU.
    # X_train_gpu = cudf.read_parquet(train_path, columns=predictor_cols)
    # y_train_gpu = cudf.read_parquet(train_path, columns=[target_col])
    X_train_cpu = pd.read_parquet(train_path, columns=predictor_cols)
    y_train_cpu = pd.read_parquet(train_path, columns=[target_col])
    
    # Initialize and train the model.
    model = LGBMRegressor(boosting_type=boosting_type, n_estimators=n_estimators, max_depth=max_depth,
                          learning_rate=learning_rate, num_leaves=num_leaves, device='gpu', random_state=42)
    model.fit(X_train_cpu, y_train_cpu)
    
    # Free training data.
    del X_train_cpu, y_train_cpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    return model


def evaluate_gbt_model(model, test_path, predictor_cols, target_col):
    """
    Load test data from disk into GPU memory for features (and target into CPU), 
    convert the features to a pandas DataFrame, run prediction,
    compute regression metrics, and free GPU memory.
    """
    # Load test features into GPU and target into CPU.
    # X_test_gpu = cudf.read_parquet(test_path, columns=predictor_cols)
    X_test_cpu = pd.read_parquet(test_path, columns=predictor_cols)
    y_test_cpu = pd.read_parquet(test_path, columns=[target_col])
    
    # Run prediction on CPU.
    y_pred = model.predict(X_test_cpu)
    
    # Free GPU memory and the intermediate pandas object.
    del X_test_cpu
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()
    gc.collect()
    
    # Compute regression metrics on CPU.
    metrics = compute_regression_metrics(y_test_cpu.values, y_pred)
    
    # Clean up CPU objects.
    del y_test_cpu, y_pred
    gc.collect()
    
    return metrics

In [19]:
# XG Boost

def train_xgb_model(train_path, predictor_cols, target_col, params, num_boost_round=150):
    """
    Load training data from disk into GPU memory on demand, train an XGBoost model using GPU,
    and free the GPU training data.
    """

    # Load training data directly using pandas.
    X_train = pd.read_parquet(train_path, columns=predictor_cols).values
    y_train = pd.read_parquet(train_path, columns=[target_col]).values
    
    # Create DMatrix from training data.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    
    # Update parameters for CPU training.
    params.update({
        'tree_method': 'hist',
        'n_jobs': -1,
        'booster': 'gbtree',
        'device': 'cpu',
        'objective': 'reg:squarederror',
        'seed': 42
    })
    
    # Train the model.
    booster = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    
    # Clean up.
    del X_train, y_train, dtrain
    gc.collect()
    
    return booster

def evaluate_xgb_model(booster, test_path, predictor_cols, target_col):
    """
    Load test data from disk into GPU memory for features (and target into CPU),
    predict using the XGBoost model, compute regression metrics on CPU, and free GPU memory.
    """

    # Load test data directly using pandas.
    X_test = pd.read_parquet(test_path, columns=predictor_cols).values
    y_test = pd.read_parquet(test_path, columns=[target_col])
    
    # Create DMatrix for testing.
    dtest = xgb.DMatrix(X_test, label=y_test.values)
    
    # Predict using the booster; predictions are on CPU.
    y_pred = booster.predict(dtest)
    
    # Clean up.
    del X_test, dtest
    gc.collect()
    
    # Compute regression metrics on CPU.
    metrics = compute_regression_metrics(y_test.values, y_pred)
    
    # Clean up.
    del y_test, y_pred
    gc.collect()
    
    return metrics

## Model Training and Evaluation

**Purpose**: This section is dedicated to training various regression models using the prepared datasets and evaluating their performance. It encompasses model fitting, prediction generation, and metric computation, enabling a systematic comparison across different algorithms and hyperparameter configurations.

In [6]:
#####################################
# 1. SETUP SECTION
#####################################

# Define predictor columns and target columns.
predictor_cols = [
    'bid', 'spread_minus', 'bbo_moving_trade', 'non_bbo_moving_trade', 'bbo_improving_limit',
    'bbo_worsening_cancel', 'bbo_depth_add_limit', 'bbo_depth_remove_cancel',
    'non_bbo_depth_add_limit', 'non_bbo_depth_remove_cancel', 'non_bbo_deep_depth_add_limit',
    'non_bbo_deep_depth_remove_cancel', 'bbo_depth_imbalance_minus', 
    'non_bbo_depth_imbalance_minus', 'non_bbo_deep_depth_imbalance_minus', 
    'bbo_queue_length_immbalance_minus', 'block_2', 'block_3', 'block_4', 'block_5', 'block_6'
]

# Additional observable features and their replicates.
ob_features = [
    'bbo_moving_trade', 'non_bbo_moving_trade', 'bbo_improving_limit',
    'bbo_worsening_cancel', 'bbo_depth_add_limit', 'bbo_depth_remove_cancel',
    'non_bbo_depth_add_limit', 'non_bbo_depth_remove_cancel', 
    'non_bbo_deep_depth_add_limit', 'non_bbo_deep_depth_remove_cancel'
]
for feat in ob_features:
    for r in range(1, 4):
        predictor_cols.append(f'{feat}_{r}')

# Define target columns
# target_cols = ['px_imp_0', 'px_imp_1', 'px_imp_5', 'px_imp_10', 'px_imp_20']
target_cols = ['px_imp_5']

# Define file paths.
eval_pairs = [('/data/workspace_files/train_oct/oct_train1_df.parquet', '/data/workspace_files/val_oct/oct_val1_df.parquet'),
              ('/data/workspace_files/train_oct/oct_train2_df.parquet', '/data/workspace_files/val_oct/oct_val2_df.parquet'),
              ('/data/workspace_files/train_oct/oct_train3_df.parquet', '/data/workspace_files/val_oct/oct_val3_df.parquet'),
              ('/data/workspace_files/train_oct/oct_train4_df.parquet', '/data/workspace_files/val_oct/oct_val4_df.parquet'),
              ('/data/workspace_files/train_nov/nov_train5_df.parquet', '/data/workspace_files/val_nov/nov_val5_df.parquet'),
              ('/data/workspace_files/train_nov/nov_train6_df.parquet', '/data/workspace_files/val_nov/nov_val6_df.parquet'),
              ('/data/workspace_files/train_nov/nov_train7_df.parquet', '/data/workspace_files/val_nov/nov_val7_df.parquet')]

# Global trackers for overall best result.
overall_best_mse = float('inf')
overall_best_info = None

# Dictionary to track best result per model group (each will be keyed by target)
best_models = {
    'Ridge': {},
    'ElasticNet': {},
    'RandomForest': {},
    'GBM': {},
    'XGBoost': {}
}

# Results list to store all grid search run information.
results = []
model_files_info = []

### Random Forest

In [12]:

model_save_dir = '/data/workspace_files/Models/RF' # '/data/workspace_files/Models/RF'
metrics_save_dir = '/data/workspace_files/Metrics/RF' # '/data/mfe_aaron_afp_eu/Metrics'
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(metrics_save_dir, exist_ok=True)

#####################################
# 2. TRAINING & EVALUATION LOOP FOR RANDOMFOREST
#####################################

# Hyperparameter grids for RandomForest.
rf_n_estimators = [200] #[100, 200]
rf_max_depths = [20] #[10, 20]
rf_max_features = [1.0] #['sqrt', 1.0]
rf_max_leaves = [-1, 50] #[-1, 50]

print("\nStarting grid search for RandomForest...")

# Create the full grid using itertools.product.
hyperparameter_grid = list(itertools.product(
    rf_n_estimators, rf_max_depths, rf_max_features, rf_max_leaves
))

# Loop over evaluation pairs with an index.
for i, (train_path, test_path) in enumerate(eval_pairs, start=2):
    for target in target_cols:
        print(f"\nProcessing target '{target}' for Eval Pair {i}")

        # Iterate over each combination in the grid.
        for n_estimators, max_depth, max_features, max_leaves in hyperparameter_grid:
            # Print the current set of hyperparameters.
            print(f"  Training RF with n_estimators={n_estimators}, "
                  f"max_depth={max_depth}, max_features={max_features}, "
                  f"max_leaves={max_leaves}...")

            # Train the model.
            model = train_rf_model(
                train_path, predictor_cols, target, 
                n_estimators=n_estimators, max_depth=max_depth,
                max_features=max_features, max_leaves=max_leaves
            )

            # Build filenames that include the target, evaluation pair index, and hyperparameters.
            model_filename = os.path.join(
                model_save_dir,
                f"model_rf_{target}_pair_{i}_nest_{n_estimators}_maxd_{max_depth}_feat_{max_features}_leaves_{max_leaves}.pkl"
            )

            # Save the trained model.
            with open(model_filename, 'wb') as f:
                pickle.dump(model, f)
                print(f"    Model dumped to {model_filename}")

            # Record info for later evaluation.
            info = {
                'eval_pair': i,
                'target': target,
                'train_path': train_path,
                'test_path': test_path,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'max_features': max_features,
                'max_leaves': max_leaves,
                'model_file': model_filename
            }
            model_files_info.append(info)

            # Clean up GPU memory after each hyperparameter set.
            cp.get_default_memory_pool().free_all_blocks()
            cp.get_default_pinned_memory_pool().free_all_blocks()
            cp.cuda.Stream.null.synchronize()
            gc.collect()

        # Clean up after each target.
        cp.get_default_memory_pool().free_all_blocks()
        cp.get_default_pinned_memory_pool().free_all_blocks()
        cp.cuda.Stream.null.synchronize()
        gc.collect()

print("\nGrid search for RandomForest completed.")


Starting grid search for RandomForest...

Processing target 'px_imp_5' for Eval Pair 2
  Training RF with n_estimators=200, max_depth=20, max_features=1.0, max_leaves=-1...
    Model dumped to /data/workspace_files/Models/RF/model_rf_px_imp_5_pair_2_nest_200_maxd_20_feat_1.0_leaves_-1.pkl
  Training RF with n_estimators=200, max_depth=20, max_features=1.0, max_leaves=50...
    Model dumped to /data/workspace_files/Models/RF/model_rf_px_imp_5_pair_2_nest_200_maxd_20_feat_1.0_leaves_50.pkl

Grid search for RandomForest completed.


Rerun from pair 2 because it crashed

  Training RF with n_estimators=200, max_depth=20, max_features=sqrt, max_leaves=50...

Rerun from pair 3 because it crashed

  Training RF with n_estimators=200, max_depth=20, max_features=1.0, max_leaves=50...

In [25]:

def populate_model_files_info_from_eval_pairs(model_save_dir, eval_pairs):
    """
    Scans model_save_dir for model files following the naming convention:
    model_rf_{target}_pair_{eval_pair}_nest_{n_estimators}_maxd_{max_depth}_feat_{max_features}_leaves_{max_leaves}.pkl
    """
    model_files_info = []
    # Regex pattern to capture the expected fields.
    pattern = re.compile(
        r"model_rf_(?P<target>.+?)_pair_(?P<eval_pair>\d+)_nest_(?P<n_estimators>\d+)_maxd_(?P<max_depth>\d+)_feat_(?P<max_features>.+?)_leaves_(?P<max_leaves>-?\d+)\.pkl$"
    )
    
    for filename in os.listdir(model_save_dir):
        if filename.endswith('.pkl'):
            match = pattern.match(filename)
            if match:
                info = match.groupdict()
                try:
                    eval_pair = int(info['eval_pair'])
                    # Check if the eval_pair is valid for eval_pairs list.
                    if eval_pair < 1 or eval_pair > len(eval_pairs):
                        print(f"Warning: eval_pair {eval_pair} for file {filename} is out of range. Skipping.")
                        continue
                    
                    n_estimators = int(info['n_estimators'])
                    max_depth = int(info['max_depth'])
                    max_leaves = int(info['max_leaves'])
                    target = info['target']
                    try:
                        max_features = float(info['max_features'])
                    except ValueError:
                        max_features = info['max_features']
                    
                    model_file = os.path.join(model_save_dir, filename)
                    
                    # Use eval_pair to get train and test paths.
                    train_path, test_path = eval_pairs[eval_pair - 1]
                    
                    model_files_info.append({
                        'eval_pair': eval_pair,
                        'target': target,
                        'train_path': train_path,
                        'test_path': test_path,
                        'n_estimators': n_estimators,
                        'max_depth': max_depth,
                        'max_features': max_features,
                        'max_leaves': max_leaves,
                        'model_file': model_file
                    })
                except Exception as e:
                    print(f"Error parsing file {filename}: {e}")
            else:
                print(f"Filename does not match pattern, skipping: {filename}")
    return model_files_info

In [26]:
eval_pairs = [
    ('/data/workspace_files/train_oct/oct_train1_df.parquet', '/data/workspace_files/val_oct/oct_val1_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train2_df.parquet', '/data/workspace_files/val_oct/oct_val2_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train3_df.parquet', '/data/workspace_files/val_oct/oct_val3_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train4_df.parquet', '/data/workspace_files/val_oct/oct_val4_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train5_df.parquet', '/data/workspace_files/val_nov/nov_val5_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train6_df.parquet', '/data/workspace_files/val_nov/nov_val6_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train7_df.parquet', '/data/workspace_files/val_nov/nov_val7_df.parquet')
]

model_save_dir = '/data/mfe_aaron_afp_eu/Models/RF'
model_files_info = populate_model_files_info_from_eval_pairs(model_save_dir, eval_pairs)

len(model_files_info)

112

In [27]:
evaluation_results = []
overall_best_mse = float('inf')
overall_best_info = None


print("\nStarting evaluation loop for RandomForest models...")

total_models = len(model_files_info)
for idx, info in enumerate(model_files_info, start=1):
    print(f"\nProcessing model {idx} of {total_models}...")
    eval_pair    = info['eval_pair']
    target       = info['target']
    test_path    = info['test_path']
    n_estimators = info['n_estimators']
    max_depth    = info['max_depth']
    max_features = info['max_features']
    max_leaves   = info['max_leaves']
    model_filename = info['model_file']
    
    # --- Time the model load ---
    load_start = time.time()
    with open(model_filename, 'rb') as f:
        model = pickle.load(f)
    load_time = time.time() - load_start
    print(f"Model load time: {load_time:.4f} seconds")
    
    # --- Time the prediction step (inside evaluate_rf_model) ---
    pred_start = time.time()
    metrics = evaluate_rf_model(model, test_path, predictor_cols, target)
    pred_time = time.time() - pred_start
    print(f"Prediction time: {pred_time:.4f} seconds")
    
    # Build a filename for saving metrics that includes all hyperparameters.
    metrics_filename = os.path.join(
        metrics_save_dir, 
        f"metrics_rf_{target}_pair_{eval_pair}_nest_{n_estimators}_maxd_{max_depth}_feat_{max_features}_leaves_{max_leaves}.json"
    )
    # Ensure the subdirectory exists.
    os.makedirs(os.path.dirname(metrics_filename), exist_ok=True)
    
    with open(metrics_filename, 'w') as f:
        json.dump(metrics, f, indent=4)
    
    run_info = {
        'eval_pair': eval_pair,
        'target': target,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'max_features': max_features,
        'max_leaves': max_leaves,
        'metrics': metrics,
        'model_file': model_filename,
        'metrics_file': metrics_filename,
        'load_time': load_time,
        'prediction_time': pred_time
    }
    evaluation_results.append(run_info)
    
    # Update overall best if current MSE is lower.
    if metrics['MSE'] < overall_best_mse:
        overall_best_mse = metrics['MSE']
        overall_best_info = run_info
    
    # Clean up.
    gc.collect()
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()

print("\nEvaluation loop for RandomForest models completed.")


Starting evaluation loop for RandomForest models...

Processing model 1 of 112...
Model load time: 0.2565 seconds
Prediction time: 7.5543 seconds

Processing model 2 of 112...
Model load time: 0.2401 seconds
Prediction time: 3.4987 seconds

Processing model 3 of 112...
Model load time: 0.3704 seconds
Prediction time: 4.1325 seconds

Processing model 4 of 112...
Model load time: 0.2694 seconds
Prediction time: 2.4657 seconds

Processing model 5 of 112...
Model load time: 5.1241 seconds
Prediction time: 5.3784 seconds

Processing model 6 of 112...
Model load time: 5.0868 seconds
Prediction time: 4.9759 seconds

Processing model 7 of 112...
Model load time: 5.6129 seconds
Prediction time: 3.7442 seconds

Processing model 8 of 112...
Model load time: 5.7671 seconds
Prediction time: 5.6871 seconds

Processing model 9 of 112...
Model load time: 0.4907 seconds
Prediction time: 2.4546 seconds

Processing model 10 of 112...
Model load time: 0.6078 seconds
Prediction time: 3.8046 seconds

Proce

In [None]:
print("\nEvaluation loop completed.")

print("\nOverall Best RandomForest Model:")
print(f"  Eval Pair: {overall_best_info['eval_pair']}")
print(f"  Target: {overall_best_info['target']}")
print(f"  n_estimators: {overall_best_info['n_estimators']}")
print(f"  max_depth: {overall_best_info['max_depth']}")
print(f"  max_features: {overall_best_info['max_features']}")
print(f"  max_leaves: {overall_best_info['max_leaves']}")
print("  Metrics:")
for key, value in overall_best_info['metrics'].items():
    print(f"    {key}: {value}")
print(f"  Model file: {overall_best_info['model_file']}")
print(f"  Metrics file: {overall_best_info['metrics_file']}")

# Optionally, save all evaluation results for future reference.
all_results_filename = os.path.join(metrics_save_dir, "all_evaluation_results.json")
with open(all_results_filename, 'w') as f:
    json.dump(evaluation_results, f, indent=4)

gc.collect()
cp.get_default_memory_pool().free_all_blocks()
cp.get_default_pinned_memory_pool().free_all_blocks()
cp.cuda.Stream.null.synchronize()

In [None]:
evaluation_results

In [None]:
cp.get_default_memory_pool().free_all_blocks()
cp.get_default_pinned_memory_pool().free_all_blocks()
cp.cuda.Stream.null.synchronize()
gc.collect()

## Gradient Boosted Trees

In [15]:
# Directories for saving models and metrics.
model_save_dir = '/data/workspace_files/Models/GBT'
metrics_save_dir = '/data/workspace_files/Metrics/GBT'

os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(metrics_save_dir, exist_ok=True)

#####################################
# GRADIENT BOOSTED TREES TRAINING & EVALUATION LOOP
#####################################

# Hyperparameter grids for LightGBM gradient boosted trees.
gbt_boosting_types  = ['gbdt', 'dart']
gbt_n_estimators    = [100, 200, 1000]
gbt_max_depths      = [10, 20]
gbt_learning_rates  = [0.05, 0.1]
gbt_num_leaves      = [31] # [50]

print("\nStarting grid search for Gradient Boosted Trees...")

# Create the full grid using itertools.product.
hyperparameter_grid = list(itertools.product(
    gbt_boosting_types, gbt_n_estimators, gbt_max_depths, gbt_learning_rates, gbt_num_leaves
))
print(f"Total grid points: {len(hyperparameter_grid)}")

# Loop over evaluation pairs with an index.
for i, (train_path, test_path) in enumerate(eval_pairs, start=1):
    for target in target_cols:
        print(f"\nProcessing target '{target}' for Eval Pair {i}")
        for boosting, n_estimators, max_depth, learning_rate, num_leaves in hyperparameter_grid:
            # Print the current set of hyperparameters.
            print(f"  Training GBT with boosting_type={boosting}, n_estimators={n_estimators}, "
                  f"max_depth={max_depth}, learning_rate={learning_rate}, num_leaves={num_leaves}...")
            
            # Train the model.
            model = train_gbt_model(
                train_path, predictor_cols, target,
                boosting_type=boosting,
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                num_leaves=num_leaves
            )
            
            # Build filename that includes target, evaluation pair, and hyperparameters.
            model_filename = os.path.join(
                model_save_dir,
                f"model_gbt_{target}_pair_{i}_boost_{boosting}_nest_{n_estimators}_maxd_{max_depth}_lr_{learning_rate}_leaves_{num_leaves}.pkl"
            )
            
            # Save the trained model.
            with open(model_filename, 'wb') as f:
                pickle.dump(model, f)
                print(f"    Model dumped to {model_filename}")
            
            # Record info for later evaluation.
            info = {
                'eval_pair': i,
                'target': target,
                'train_path': train_path,
                'test_path': test_path,
                'boosting_type': boosting,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'learning_rate': learning_rate,
                'num_leaves': num_leaves,
                'model_file': model_filename
            }
            model_files_info.append(info)
            
            # Clean up GPU memory after each hyperparameter set.
            cp.get_default_memory_pool().free_all_blocks()
            cp.get_default_pinned_memory_pool().free_all_blocks()
            cp.cuda.Stream.null.synchronize()
            gc.collect()
        
        # Clean up after each target.
        cp.get_default_memory_pool().free_all_blocks()
        cp.get_default_pinned_memory_pool().free_all_blocks()
        cp.cuda.Stream.null.synchronize()
        gc.collect()

print("\nGrid search for Gradient Boosted Trees completed.")


Starting grid search for Gradient Boosted Trees...
Total grid points: 1

Processing target 'px_imp_5' for Eval Pair 5
  Training GBT with boosting_type=dart, n_estimators=1000, max_depth=20, learning_rate=0.1, num_leaves=31...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1217
[LightGBM] [Info] Number of data points in the train set: 13043266, number of used features: 51
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (149.27 MB) transferred to GPU in 0.178255 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -0.000000
    Model dumped to /data/workspace_files/Models/GBT/model_gbt_px_imp_5_pair_5_boost_dart_nest_1000_maxd_20_lr_0.1_leaves_31.pkl

Grid search for Gradient Boosted Trees completed.


In [17]:
import os
import re
import pprint

def populate_model_files_info_from_eval_pairs_gbt(model_save_dir, eval_pairs):
    """
    Scans model_save_dir for GBT model files following the naming convention:
    model_gbt_{target}_pair_{eval_pair}_boost_{boosting_type}_nest_{n_estimators}_maxd_{max_depth}_lr_{learning_rate}_leaves_{num_leaves}.pkl
    """
    model_files_info = []
    # Regex pattern for GBT model files.
    pattern = re.compile(
        r"model_gbt_(?P<target>.+?)(?=_pair_)_pair_(?P<eval_pair>\d+)_boost_(?P<boosting_type>.+?)(?=_nest_)_nest_(?P<n_estimators>\d+)_maxd_(?P<max_depth>\d+)_lr_(?P<learning_rate>.+?)(?=_leaves_)_leaves_(?P<num_leaves>\d+)\.pkl$"
    )
    
    for filename in os.listdir(model_save_dir):
        if filename.endswith('.pkl'):
            match = pattern.match(filename)
            if match:
                info = match.groupdict()
                try:
                    eval_pair = int(info['eval_pair'])
                    # Check if the eval_pair is valid.
                    if eval_pair < 1 or eval_pair > len(eval_pairs):
                        print(f"Warning: eval_pair {eval_pair} for file {filename} is out of range. Skipping.")
                        continue
                    
                    n_estimators = int(info['n_estimators'])
                    max_depth = int(info['max_depth'])
                    num_leaves = int(info['num_leaves'])
                    target = info['target']
                    boosting_type = info['boosting_type']
                    try:
                        learning_rate = float(info['learning_rate'])
                    except ValueError:
                        learning_rate = info['learning_rate']
                    
                    model_file = os.path.join(model_save_dir, filename)
                    
                    # Use eval_pair to get train and test paths.
                    train_path, test_path = eval_pairs[eval_pair - 1]
                    
                    model_files_info.append({
                        'eval_pair': eval_pair,
                        'target': target,
                        'boosting_type': boosting_type,
                        'train_path': train_path,
                        'test_path': test_path,
                        'n_estimators': n_estimators,
                        'max_depth': max_depth,
                        'learning_rate': learning_rate,
                        'num_leaves': num_leaves,
                        'model_file': model_file
                    })
                except Exception as e:
                    print(f"Error parsing file {filename}: {e}")
            else:
                print(f"Filename does not match pattern, skipping: {filename}")
    return model_files_info

# Example usage:
eval_pairs = [
    ('/data/workspace_files/train_oct/oct_train1_df.parquet', '/data/workspace_files/val_oct/oct_val1_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train2_df.parquet', '/data/workspace_files/val_oct/oct_val2_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train3_df.parquet', '/data/workspace_files/val_oct/oct_val3_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train4_df.parquet', '/data/workspace_files/val_oct/oct_val4_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train5_df.parquet', '/data/workspace_files/val_nov/nov_val5_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train6_df.parquet', '/data/workspace_files/val_nov/nov_val6_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train7_df.parquet', '/data/workspace_files/val_nov/nov_val7_df.parquet')
]

model_save_dir = '/data/workspace_files/Models/GBT'
model_files_info = populate_model_files_info_from_eval_pairs_gbt(model_save_dir, eval_pairs)
pprint.pprint(model_files_info)

[{'boosting_type': 'dart',
  'eval_pair': 5,
  'learning_rate': 0.1,
  'max_depth': 20,
  'model_file': '/data/workspace_files/Models/GBT/model_gbt_px_imp_5_pair_5_boost_dart_nest_1000_maxd_20_lr_0.1_leaves_31.pkl',
  'n_estimators': 1000,
  'num_leaves': 31,
  'target': 'px_imp_5',
  'test_path': '/data/workspace_files/val_nov/nov_val5_df.parquet',
  'train_path': '/data/workspace_files/train_nov/nov_train5_df.parquet'},
 {'boosting_type': 'dart',
  'eval_pair': 7,
  'learning_rate': 0.1,
  'max_depth': 20,
  'model_file': '/data/workspace_files/Models/GBT/model_gbt_px_imp_5_pair_7_boost_dart_nest_1000_maxd_20_lr_0.1_leaves_31.pkl',
  'n_estimators': 1000,
  'num_leaves': 31,
  'target': 'px_imp_5',
  'test_path': '/data/workspace_files/val_nov/nov_val7_df.parquet',
  'train_path': '/data/workspace_files/train_nov/nov_train7_df.parquet'},
 {'boosting_type': 'dart',
  'eval_pair': 7,
  'learning_rate': 0.05,
  'max_depth': 20,
  'model_file': '/data/workspace_files/Models/GBT/model_gbt

In [19]:
evaluation_results = []
overall_best_mse = float('inf')
overall_best_info = None

print("\nStarting evaluation loop for Gradient Boosted Trees models...")

total_models = len(model_files_info)
for idx, info in enumerate(model_files_info, start=1):
    print(f"\nProcessing model {idx} of {total_models}...")
    eval_pair     = info['eval_pair']
    target        = info['target']
    test_path     = info['test_path']
    boosting_type = info['boosting_type']
    n_estimators  = info['n_estimators']
    max_depth     = info['max_depth']
    learning_rate = info['learning_rate']
    num_leaves    = info['num_leaves']
    model_filename = info['model_file']

    # Build metrics filename.
    metrics_filename = os.path.join(
        metrics_save_dir, 
        f"metrics_gbt_{target}_pair_{eval_pair}_boost_{boosting_type}_nest_{n_estimators}_maxd_{max_depth}_lr_{learning_rate}_leaves_{num_leaves}.json"
    )
    # If the metrics file exists, skip this model.
    if os.path.exists(metrics_filename):
        print(f"Metrics file {metrics_filename} already exists. Skipping model {idx}.")
        continue

    # --- Time the model load ---
    load_start = time.time()
    with open(model_filename, 'rb') as f:
        model = pickle.load(f)
    load_time = time.time() - load_start
    print(f"Model load time: {load_time:.4f} seconds")
    
    # --- Time the prediction step (inside evaluate_gbt_model) ---
    pred_start = time.time()
    metrics = evaluate_gbt_model(model, test_path, predictor_cols, target)
    pred_time = time.time() - pred_start
    print(f"Prediction time: {pred_time:.4f} seconds")
    
    # Ensure the directory for metrics file exists.
    os.makedirs(os.path.dirname(metrics_filename), exist_ok=True)
    
    with open(metrics_filename, 'w') as f:
        json.dump(metrics, f, indent=4)
    
    run_info = {
        'eval_pair': eval_pair,
        'target': target,
        'boosting_type': boosting_type,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'num_leaves': num_leaves,
        'metrics': metrics,
        'model_file': model_filename,
        'metrics_file': metrics_filename
    }
    evaluation_results.append(run_info)
    
    # Update overall best if current MSE is lower.
    if metrics['MSE'] < overall_best_mse:
        overall_best_mse = metrics['MSE']
        overall_best_info = run_info

    # Clean up.
    gc.collect()
    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()
    cp.cuda.Stream.null.synchronize()

print("\nEvaluation loop for Gradient Boosted Trees models completed.")


Starting evaluation loop for Gradient Boosted Trees models...

Processing model 1 of 168...

Processing model 2 of 168...

Processing model 3 of 168...

Processing model 4 of 168...

Processing model 5 of 168...

Processing model 6 of 168...

Processing model 7 of 168...

Processing model 8 of 168...

Processing model 9 of 168...

Processing model 10 of 168...

Processing model 11 of 168...

Processing model 12 of 168...

Processing model 13 of 168...

Processing model 14 of 168...

Processing model 15 of 168...

Processing model 16 of 168...

Processing model 17 of 168...

Processing model 18 of 168...
Metrics file /data/workspace_files/Metrics/GBT/metrics_gbt_px_imp_5_pair_5_boost_gbdt_nest_200_maxd_20_lr_0.05_leaves_31.json already exists. Skipping model 18.

Processing model 19 of 168...

Processing model 20 of 168...
Metrics file /data/workspace_files/Metrics/GBT/metrics_gbt_px_imp_5_pair_5_boost_dart_nest_100_maxd_10_lr_0.05_leaves_31.json already exists. Skipping model 20.

Pro

In [20]:
print("\nEvaluation loop completed.")

print("\nOverall Best Gradient Boosted Trees Model:")
print(f"  Eval Pair: {overall_best_info['eval_pair']}")
print(f"  Target: {overall_best_info['target']}")
print(f"  Boosting Type: {overall_best_info['boosting_type']}")
print(f"  n_estimators: {overall_best_info['n_estimators']}")
print(f"  max_depth: {overall_best_info['max_depth']}")
print(f"  learning_rate: {overall_best_info['learning_rate']}")
print(f"  num_leaves: {overall_best_info['num_leaves']}")
print("  Metrics:")
for key, value in overall_best_info['metrics'].items():
    print(f"    {key}: {value}")
print(f"  Model file: {overall_best_info['model_file']}")
print(f"  Metrics file: {overall_best_info['metrics_file']}")

# Optionally, save all evaluation results for future reference.
all_results_filename = os.path.join(metrics_save_dir, "all_evaluation_results.json")
with open(all_results_filename, 'w') as f:
    json.dump(evaluation_results, f, indent=4)

gc.collect()
cp.get_default_memory_pool().free_all_blocks()
cp.get_default_pinned_memory_pool().free_all_blocks()
cp.cuda.Stream.null.synchronize()


Evaluation loop completed.

Overall Best Gradient Boosted Trees Model:
  Eval Pair: 3
  Target: px_imp_5
  Boosting Type: dart
  n_estimators: 1000
  max_depth: 10
  learning_rate: 0.05
  num_leaves: 31
  Metrics:
    MSE: 9.663882695403778e-09
    RMSE: 9.830504918570449e-05
    MAE: 2.7687421159721706e-05
    R2: 0.07723891184694753
    Median Absolute Error: 5.753558629481704e-06
    Explained Variance: 0.07725600471389205
  Model file: /data/workspace_files/Models/GBT/model_gbt_px_imp_5_pair_3_boost_dart_nest_1000_maxd_10_lr_0.05_leaves_31.pkl
  Metrics file: /data/workspace_files/Metrics/GBT/metrics_gbt_px_imp_5_pair_3_boost_dart_nest_1000_maxd_10_lr_0.05_leaves_31.json


In [17]:
evaluation_results

[{'eval_pair': 1,
  'target': 'px_imp_5',
  'boosting_type': 'gbdt',
  'n_estimators': 100,
  'max_depth': 10,
  'learning_rate': 0.05,
  'num_leaves': 31,
  'metrics': {'MSE': 1.7800554737406772e-08,
   'RMSE': np.float64(0.00013341871959139307),
   'MAE': 4.0724083058962924e-05,
   'R2': 0.10800277321394769,
   'Median Absolute Error': np.float64(8.627593963205301e-06),
   'Explained Variance': 0.10802573466565424},
  'model_file': '/data/workspace_files/Models/GBT/model_gbt_px_imp_5_pair_1_boost_gbdt_nest_100_maxd_10_lr_0.05_leaves_31.pkl',
  'metrics_file': '/data/workspace_files/Metrics/GBT/metrics_gbt_px_imp_5_pair_1_boost_gbdt_nest_100_maxd_10_lr_0.05_leaves_31.json'},
 {'eval_pair': 1,
  'target': 'px_imp_5',
  'boosting_type': 'gbdt',
  'n_estimators': 100,
  'max_depth': 10,
  'learning_rate': 0.1,
  'num_leaves': 31,
  'metrics': {'MSE': 1.7833444765171383e-08,
   'RMSE': np.float64(0.00013354192137741386),
   'MAE': 4.080611939190233e-05,
   'R2': 0.10635463280553126,
   'M

In [None]:
gbt_boosting_types  = ['gbdt', 'dart']
gbt_n_estimators    = [100, 200, 1000]
gbt_max_depths      = [10, 20]
gbt_learning_rates  = [0.05, 0.1]
gbt_num_leaves      = [31]
result = {}

for method in gbt_boosting_types:
    for nest in gbt_n_estimators:
        for depth in gbt_max_depths:
            for rate in gbt_learning_rates:
                for leaf in gbt_num_leaves:
                    result[f'{method}-{nest}-{depth}-{rate}-{leaf}'] = 0
                    for eval_pair in range(1, 8):
                        filename = f"/data/workspace_files/Metrics/GBT/metrics_gbt_px_imp_5_pair_{eval_pair}_boost_{method}_nest_{nest}_maxd_{depth}_lr_{rate}_leaves_{leaf}.json"
                        with open(filename, 'r') as file:
                            data = json.load(file)
                        result[f'{method}-{nest}-{depth}-{rate}-{leaf}'] += data['R2'] / 7

### XGBoost Model

In [None]:
os.environ["OMP_NUM_THREADS"] = str(os.cpu_count())

In [17]:
# Directories for saving models and metrics.
model_save_dir = '/data/workspace_files/Models/XGB'
metrics_save_dir = '/data/workspace_files/Metrics/XGB'
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(metrics_save_dir, exist_ok=True)

# Hyperparameter grids for XGBoost.
xgb_max_depths = [5, 10, 20]               # 3 values
xgb_learning_rates = [0.05, 0.1]            # 2 values
xgb_min_child_weights = [1]               # 1 value (fixed)
xgb_subsamples = [0.8, 1.0]                # 2 values
xgb_num_boost_rounds = [100, 200]           # 2 values

# Create the full grid using itertools.product.
hyperparameter_grid = list(itertools.product(
    xgb_max_depths, xgb_learning_rates, xgb_min_child_weights, xgb_subsamples, xgb_num_boost_rounds
))
print(f"Total grid points: {len(hyperparameter_grid)}")  # Expected: 24 grid points.

model_files_info = []

# Loop over evaluation pairs.
for i, (train_path, test_path) in enumerate(eval_pairs, start=1):
    for target in target_cols:
        print(f"\nProcessing target '{target}' for Eval Pair {i}")
        for max_depth, learning_rate, min_child_weight, subsample, num_boost_round in hyperparameter_grid:
            # Print the current hyperparameter set.
            print(f"  Training XGB with max_depth={max_depth}, learning_rate={learning_rate}, "
                  f"min_child_weight={min_child_weight}, subsample={subsample}, num_boost_round={num_boost_round}...")
            
            # --- Time the training step ---
            train_start = time.time()
            booster = train_xgb_model(
                train_path, predictor_cols, target,
                params={
                    'max_depth': max_depth,
                    'learning_rate': learning_rate,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample
                },
                num_boost_round=num_boost_round
            )
            train_time = time.time() - train_start
            print(f"    Training time: {train_time:.4f} seconds")
            
            # Build filename that includes target, evaluation pair, and hyperparameters.
            model_filename = os.path.join(
                model_save_dir,
                f"model_xgb_{target}_pair_{i}_maxd_{max_depth}_lr_{learning_rate}_sub_{subsample}_nbr_{num_boost_round}.pkl"
            )
            
            # Save the trained model.
            with open(model_filename, 'wb') as f:
                pickle.dump(booster, f)
            print(f"    Model dumped to {model_filename}")
            
            # Record info for later evaluation.
            info = {
                'eval_pair': i,
                'target': target,
                'train_path': train_path,
                'test_path': test_path,
                'max_depth': max_depth,
                'learning_rate': learning_rate,
                'min_child_weight': min_child_weight,
                'subsample': subsample,
                'num_boost_round': num_boost_round,
                'model_file': model_filename
            }
            model_files_info.append(info)
            
            # Clean up GPU memory.
            gc.collect()
            cp.get_default_memory_pool().free_all_blocks()
            cp.get_default_pinned_memory_pool().free_all_blocks()
            cp.cuda.Stream.null.synchronize()

print("\nGrid search for XGBoost models completed.")

Total grid points: 24

Processing target 'px_imp_5' for Eval Pair 1
  Training XGB with max_depth=5, learning_rate=0.05, min_child_weight=1, subsample=0.8, num_boost_round=100...
    Training time: 54.1709 seconds
    Model dumped to /data/workspace_files/Models/XGB/model_xgb_px_imp_5_pair_1_maxd_5_lr_0.05_sub_0.8_nbr_100.pkl
  Training XGB with max_depth=5, learning_rate=0.05, min_child_weight=1, subsample=0.8, num_boost_round=200...
    Training time: 85.5217 seconds
    Model dumped to /data/workspace_files/Models/XGB/model_xgb_px_imp_5_pair_1_maxd_5_lr_0.05_sub_0.8_nbr_200.pkl
  Training XGB with max_depth=5, learning_rate=0.05, min_child_weight=1, subsample=1.0, num_boost_round=100...
    Training time: 52.8316 seconds
    Model dumped to /data/workspace_files/Models/XGB/model_xgb_px_imp_5_pair_1_maxd_5_lr_0.05_sub_1.0_nbr_100.pkl
  Training XGB with max_depth=5, learning_rate=0.05, min_child_weight=1, subsample=1.0, num_boost_round=200...
    Training time: 68.3405 seconds
    Mo

In [23]:
gc.collect()
cp.get_default_memory_pool().free_all_blocks()
cp.get_default_pinned_memory_pool().free_all_blocks()
cp.cuda.Stream.null.synchronize()

In [12]:
import itertools
def populate_model_files_info_from_eval_pairs_xgb(model_save_dir, eval_pairs):
    # Hyperparameter grids for XGBoost.
    xgb_max_depths = [5, 10, 20]               # 3 values
    xgb_learning_rates = [0.05, 0.1]            # 2 values
    xgb_min_child_weights = [1]               # 1 value (fixed)
    xgb_subsamples = [0.8, 1.0]                # 2 values
    xgb_num_boost_rounds = [100, 200]           # 2 values

    model_files_info = []
    target_cols = ['px_imp_5']

    # Create the full grid using itertools.product.
    hyperparameter_grid = list(itertools.product(
        xgb_max_depths, xgb_learning_rates, xgb_min_child_weights, xgb_subsamples, xgb_num_boost_rounds
    ))
    print(f"Total grid points: {len(hyperparameter_grid)}")  # Expected: 24 grid points.

    # Loop over evaluation pairs.
    for i, (train_path, test_path) in enumerate(eval_pairs, start=1):
        for target in target_cols:
            print(f"\nProcessing target '{target}' for Eval Pair {i}")
            for max_depth, learning_rate, min_child_weight, subsample, num_boost_round in hyperparameter_grid:

                # Build filename that includes target, evaluation pair, and hyperparameters.
                model_filename = os.path.join(
                    model_save_dir,
                    f"model_xgb_{target}_pair_{i}_maxd_{max_depth}_lr_{learning_rate}_sub_{subsample}_nbr_{num_boost_round}.pkl"
                )
            
                # Record info for later evaluation.
                info = {
                    'eval_pair': i,
                    'target': target,
                    'train_path': train_path,
                    'test_path': test_path,
                    'max_depth': max_depth,
                    'learning_rate': learning_rate,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'num_boost_round': num_boost_round,
                    'model_file': model_filename
                }
                model_files_info.append(info)

    return model_files_info

eval_pairs = [
    ('/data/workspace_files/train_oct/oct_train1_df.parquet', '/data/workspace_files/val_oct/oct_val1_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train2_df.parquet', '/data/workspace_files/val_oct/oct_val2_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train3_df.parquet', '/data/workspace_files/val_oct/oct_val3_df.parquet'),
    ('/data/workspace_files/train_oct/oct_train4_df.parquet', '/data/workspace_files/val_oct/oct_val4_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train5_df.parquet', '/data/workspace_files/val_nov/nov_val5_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train6_df.parquet', '/data/workspace_files/val_nov/nov_val6_df.parquet'),
    ('/data/workspace_files/train_nov/nov_train7_df.parquet', '/data/workspace_files/val_nov/nov_val7_df.parquet')
]

model_save_dir = '/data/workspace_files/Models/XGB'
metrics_save_dir = '/data/workspace_files/Metrics/XGB'
model_files_info = populate_model_files_info_from_eval_pairs_xgb(model_save_dir, eval_pairs)

len(model_files_info)


evaluation_results = []
overall_best_mse = float('inf')
overall_best_info = None

print("\nStarting evaluation loop for XGBoost models...")

total_models = len(model_files_info)
for idx, info in enumerate(model_files_info, start=1):
    print(f"\nProcessing model {idx} of {total_models}...")
    eval_pair      = info['eval_pair']
    target         = info['target']
    test_path      = info['test_path']
    max_depth      = info['max_depth']
    learning_rate  = info['learning_rate']
    min_child_weight = info.get('min_child_weight', None)
    subsample      = info.get('subsample', None)
    num_boost_round = info.get('num_boost_round', None)
    model_filename = info['model_file']
    
    # Build metrics filename.
    metrics_filename = os.path.join(
        metrics_save_dir, 
        f"metrics_xgb_{target}_pair_{eval_pair}_maxd_{max_depth}_lr_{learning_rate}_sub_{subsample}_nbr_{num_boost_round}.json"
    )
    # If the metrics file exists, skip this model.
    # if os.path.exists(metrics_filename):
    #     print(f"Metrics file {metrics_filename} already exists. Skipping model {idx}.")
    #     continue

    # --- Time the model load ---
    load_start = time.time()
    with open(model_filename, 'rb') as f:
        booster = pickle.load(f)
    load_time = time.time() - load_start
    print(f"Model load time: {load_time:.4f} seconds")
    
    # --- Time the prediction step (inside evaluate_xgb_model) ---
    pred_start = time.time()
    metrics = evaluate_xgb_model(booster, test_path, predictor_cols, target)
    pred_time = time.time() - pred_start
    print(f"Prediction time: {pred_time:.4f} seconds")
    
    # Ensure the directory for the metrics file exists.
    os.makedirs(os.path.dirname(metrics_filename), exist_ok=True)
    with open(metrics_filename, 'w') as f:
        json.dump(metrics, f, indent=4)
    
    run_info = {
        'eval_pair': eval_pair,
        'target': target,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'num_boost_round': num_boost_round,
        'metrics': metrics,
        'model_file': model_filename,
        'metrics_file': metrics_filename,
        'load_time': load_time,
        'prediction_time': pred_time
    }
    evaluation_results.append(run_info)
    
    # Update overall best if current MSE is lower.
    if metrics['MSE'] < overall_best_mse:
        overall_best_mse = metrics['MSE']
        overall_best_info = run_info
    
    # Clean up.
    gc.collect()

print("\nEvaluation loop for XGBoost models completed.")

Total grid points: 24

Processing target 'px_imp_5' for Eval Pair 1

Processing target 'px_imp_5' for Eval Pair 2

Processing target 'px_imp_5' for Eval Pair 3

Processing target 'px_imp_5' for Eval Pair 4

Processing target 'px_imp_5' for Eval Pair 5

Processing target 'px_imp_5' for Eval Pair 6

Processing target 'px_imp_5' for Eval Pair 7


168

In [27]:
overall_best_info

{'eval_pair': 3,
 'target': 'px_imp_5',
 'max_depth': 5,
 'learning_rate': 0.05,
 'min_child_weight': 1,
 'subsample': 1.0,
 'num_boost_round': 200,
 'metrics': {'MSE': 9.77808097461826e-09,
  'RMSE': np.float64(9.888417959723517e-05),
  'MAE': 2.797008637131759e-05,
  'R2': 0.06633462712883109,
  'Median Absolute Error': np.float64(6.003017460898263e-06),
  'Explained Variance': 0.06634413949745366},
 'model_file': '/data/workspace_files/Models/XGB/model_xgb_px_imp_5_pair_3_maxd_5_lr_0.05_sub_1.0_nbr_200.pkl',
 'metrics_file': '/data/workspace_files/Metrics/XGB/metrics_xgb_px_imp_5_pair_3_maxd_5_lr_0.05_sub_1.0_nbr_200.json',
 'load_time': 0.03071451187133789,
 'prediction_time': 10.899120330810547}

### LSTM

In [6]:
# Ensure the checkpoint directory exists.
checkpoint_dir = '/data/workspace_files/Models/LSTM'
os.makedirs(checkpoint_dir, exist_ok=True)
model_save_path = os.path.join(checkpoint_dir, 'lstm_checkpoint.pt')

In [18]:
predictor_cols = [
    'bid', 'spread_minus', 'bbo_moving_trade', 'non_bbo_moving_trade', 'bbo_improving_limit',
    'bbo_worsening_cancel', 'bbo_depth_add_limit', 'bbo_depth_remove_cancel',
    'non_bbo_depth_add_limit', 'non_bbo_depth_remove_cancel', 'non_bbo_deep_depth_add_limit',
    'non_bbo_deep_depth_remove_cancel', 'bbo_depth_imbalance_minus', 
    'non_bbo_depth_imbalance_minus', 'non_bbo_deep_depth_imbalance_minus', 
    'bbo_queue_length_immbalance_minus', 'block_2', 'block_3', 'block_4', 'block_5', 'block_6'
]

# Additional observable features and their replicates.
ob_features = [
    'bbo_moving_trade', 'non_bbo_moving_trade', 'bbo_improving_limit',
    'bbo_worsening_cancel', 'bbo_depth_add_limit', 'bbo_depth_remove_cancel',
    'non_bbo_depth_add_limit', 'non_bbo_depth_remove_cancel', 
    'non_bbo_deep_depth_add_limit', 'non_bbo_deep_depth_remove_cancel'
]
for feat in ob_features:
    for r in range(1, 4):
        predictor_cols.append(f'{feat}_{r}')

target_col = 'px_imp_5'

# Define file paths.
eval_pairs = [('/data/workspace_files/train_oct/oct_train1_df.parquet', '/data/workspace_files/val_oct/oct_val1_df.parquet'),
              ('/data/workspace_files/train_oct/oct_train2_df.parquet', '/data/workspace_files/val_oct/oct_val2_df.parquet'),
              ('/data/workspace_files/train_oct/oct_train3_df.parquet', '/data/workspace_files/val_oct/oct_val3_df.parquet'),
              ('/data/workspace_files/train_oct/oct_train4_df.parquet', '/data/workspace_files/val_oct/oct_val4_df.parquet'),
              ('/data/workspace_files/train_nov/nov_train5_df.parquet', '/data/workspace_files/val_nov/nov_val5_df.parquet'),
              ('/data/workspace_files/train_nov/nov_train6_df.parquet', '/data/workspace_files/val_nov/nov_val6_df.parquet'),
              ('/data/workspace_files/train_nov/nov_train7_df.parquet', '/data/workspace_files/val_nov/nov_val7_df.parquet')]

In [9]:


# Define the LSTM model for regression.
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_layers=2, output_dim=1, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        last_out = lstm_out[:, -1, :]
        out = self.fc(last_out)
        return out

def train_lstm_model(eval_pairs, predictor_cols, target_col, 
                     num_epochs=10, batch_size=64, learning_rate=0.001, 
                     num_eval_epochs=1,  # Set to 0 to skip eval fine-tuning.
                     model_save_path=model_save_path):
    """
    Train an LSTM model on each eval pair with incremental checkpointing.
    
    For each eval pair:
      - Train on the training set for 'num_epochs' (or resume if partially completed).
      - Evaluate on the validation set.
      - Optionally perform additional training on the validation set for 'num_eval_epochs'.
    
    The checkpoint saved contains:
      - current_pair_index: which eval pair we're on.
      - current_epoch: if < num_epochs, this is in the training phase;
          if >= num_epochs, then (current_epoch - num_epochs) eval epochs have been completed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = len(predictor_cols)
    
    # Initialize the model.
    model = LSTMModel(input_dim=input_dim).to(device)
    # Freeze LSTM layer parameters.
    for param in model.lstm.parameters():
        param.requires_grad = False
    # Only parameters in the fully connected layer are trainable.
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
    criterion = nn.MSELoss()

    # Check if a checkpoint exists.
    start_pair = 0
    current_epoch = 0  # Represents the last completed epoch (training + eval fine-tuning) for current eval pair.
    if os.path.exists(model_save_path):
        checkpoint = torch.load(model_save_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_pair = checkpoint.get('current_pair_index', 0)
        current_epoch = checkpoint.get('current_epoch', 0)
        print(f"Checkpoint found. Resuming from eval pair index {start_pair}, current_epoch {current_epoch}")
    else:
        print("No checkpoint found. Starting fresh training.")

    # Process each eval pair.
    for idx, (train_path, val_path) in enumerate(eval_pairs[start_pair:], start=start_pair):
        print(f"\n--- Starting eval pair index {idx} ---")
        # ----------------- Training Phase on Training Set -----------------
        # Load training data.
        train_df = pd.read_parquet(train_path, columns=predictor_cols + [target_col])
        X_train = torch.tensor(train_df[predictor_cols].values, dtype=torch.float32)
        y_train = torch.tensor(train_df[target_col].values, dtype=torch.float32)
        train_dataset = TensorDataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
        
        if current_epoch < num_epochs:
            for epoch in range(current_epoch, num_epochs):
                model.train()
                epoch_losses = []
                for X_batch, y_batch in train_loader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    optimizer.zero_grad()
                    predictions = model(X_batch)
                    loss = criterion(predictions.squeeze(), y_batch.squeeze())
                    loss.backward()
                    optimizer.step()
                    epoch_losses.append(loss.item())
                avg_loss = sum(epoch_losses) / len(epoch_losses)
                print(f"Eval Pair {idx} - Training Epoch {epoch+1}/{num_epochs}: Loss = {avg_loss:.4f}")
                
                # Update checkpoint after each training epoch.
                current_epoch = epoch + 1
                checkpoint = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'current_pair_index': idx,
                    'current_epoch': current_epoch
                }
                torch.save(checkpoint, model_save_path)
                print(f"Checkpoint saved after training epoch {current_epoch} for eval pair {idx}.")
        else:
            print(f"Training phase already complete for eval pair {idx}.")


        # ----------------- Cleanup Before Evaluation -----------------
        del train_df, X_train, y_train, train_dataset, train_loader
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(1)  # Short delay to ensure GPU memory is freed
        
        # ----------------- Evaluation on Validation Set -----------------
        # Load validation data into CPU memory first.
        val_df = pd.read_parquet(val_path, columns=predictor_cols + [target_col])
        X_val_cpu = torch.tensor(val_df[predictor_cols].values, dtype=torch.float32)
        y_val_cpu = torch.tensor(val_df[target_col].values, dtype=torch.float32)
        val_dataset = TensorDataset(X_val_cpu, y_val_cpu)
        # Use a DataLoader so we only load one batch at a time onto the GPU.
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        
        model.eval()
        total_loss = 0.0
        count = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                predictions = model(X_batch)
                loss = criterion(predictions.squeeze(), y_batch.squeeze())
                total_loss += loss.item() * X_batch.size(0)
                count += X_batch.size(0)
        avg_val_loss = total_loss / count
        print(f"Eval Pair {idx}: Validation Loss = {avg_val_loss:.4f}")
        
        # ----------------- Additional Training on Validation Set (Fine-Tuning) -----------------
        if num_eval_epochs > 0:
            # Reuse the same val_loader for fine-tuning.
            # Determine eval epochs already done:
            eval_epochs_done = max(0, current_epoch - num_epochs)
            for eval_epoch in range(eval_epochs_done, num_eval_epochs):
                model.train()
                eval_epoch_losses = []
                for X_batch, y_batch in val_loader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    optimizer.zero_grad()
                    predictions = model(X_batch)
                    loss = criterion(predictions.squeeze(), y_batch.squeeze())
                    loss.backward()
                    optimizer.step()
                    eval_epoch_losses.append(loss.item())
                avg_eval_loss = sum(eval_epoch_losses) / len(eval_epoch_losses)
                print(f"Eval Pair {idx} - Fine-Tuning Epoch {eval_epoch+1}/{num_eval_epochs}: Loss = {avg_eval_loss:.4f}")
                
                current_epoch = num_epochs + eval_epoch + 1
                checkpoint = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'current_pair_index': idx,
                    'current_epoch': current_epoch
                }
                torch.save(checkpoint, model_save_path)
                print(f"Checkpoint saved after fine-tuning epoch {eval_epoch+1} for eval pair {idx}.")
            del val_loader
        else:
            print("Skipping fine-tuning on eval set (num_eval_epochs is 0).")
        
        # Cleanup validation data.
        del val_df, X_val_cpu, y_val_cpu, val_dataset
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(1)
        
        # Mark completion of this eval pair and reset current_epoch.
        current_epoch = 0
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'current_pair_index': idx + 1,
            'current_epoch': current_epoch
        }
        torch.save(checkpoint, model_save_path)
        print(f"Completed eval pair {idx}. Checkpoint updated for next pair.")
        
        torch.cuda.empty_cache()
        time.sleep(1)

    print(f"\nFinal model saved to {model_save_path}")
    return model

In [10]:
model = train_lstm_model(
    eval_pairs,
    predictor_cols,
    target_col,
    num_epochs=10,
    batch_size=256,
    learning_rate=0.001,
    num_eval_epochs=10,
    model_save_path=model_save_path
)

Checkpoint found. Resuming from eval pair index 3, current_epoch 8

--- Starting eval pair index 3 ---
Eval Pair 3 - Training Epoch 9/10: Loss = 0.0000
Checkpoint saved after training epoch 9 for eval pair 3.
Eval Pair 3 - Training Epoch 10/10: Loss = 0.0000
Checkpoint saved after training epoch 10 for eval pair 3.
Eval Pair 3: Validation Loss = 0.0000
Eval Pair 3 - Fine-Tuning Epoch 1/10: Loss = 0.0000
Checkpoint saved after fine-tuning epoch 1 for eval pair 3.
Eval Pair 3 - Fine-Tuning Epoch 2/10: Loss = 0.0000
Checkpoint saved after fine-tuning epoch 2 for eval pair 3.
Eval Pair 3 - Fine-Tuning Epoch 3/10: Loss = 0.0000
Checkpoint saved after fine-tuning epoch 3 for eval pair 3.
Eval Pair 3 - Fine-Tuning Epoch 4/10: Loss = 0.0000
Checkpoint saved after fine-tuning epoch 4 for eval pair 3.
Eval Pair 3 - Fine-Tuning Epoch 5/10: Loss = 0.0000
Checkpoint saved after fine-tuning epoch 5 for eval pair 3.
Eval Pair 3 - Fine-Tuning Epoch 6/10: Loss = 0.0000
Checkpoint saved after fine-tunin

In [11]:
# Set up device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model.
input_dim = len(predictor_cols)
model = LSTMModel(input_dim=input_dim).to(device)

# Load the checkpoint.
checkpoint = torch.load(model_save_path)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Function to predict and compute metrics for a given file.
def predict_and_compute_metrics(file_path, batch_size=256):
    # Load validation data from CPU.
    df = pd.read_parquet(file_path, columns=predictor_cols + [target_col])
    X_cpu = torch.tensor(df[predictor_cols].values, dtype=torch.float32)
    y_true = df[target_col].values  # Ground truth as NumPy array.
    
    dataset = TensorDataset(X_cpu)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    predictions_list = []
    with torch.no_grad():
        for batch in loader:
            batch_x = batch[0].to(device)
            preds = model(batch_x)
            predictions_list.append(preds.cpu())
    y_pred = torch.cat(predictions_list, dim=0).squeeze().numpy()
    
    metrics = compute_regression_metrics(y_true, y_pred)
    return y_true, y_pred, metrics

# Test files.
test_files = [
    '/data/workspace_files/test_dec/first_week_df.parquet',
    '/data/workspace_files/test_dec/second_week_df.parquet'
]

# To store metrics for each week.
week_metrics = {}
week_predictions = {}
all_y_true = []
all_y_pred = []

for file_path in test_files:
    base_name = os.path.basename(file_path)
    print(f"Processing file: {file_path}")
    y_true, y_pred, metrics = predict_and_compute_metrics(file_path, batch_size=256)
    week_metrics[base_name] = metrics
    week_predictions[base_name] = y_pred.tolist()  # Convert NumPy array to list.
    all_y_true.append(y_true)
    all_y_pred.append(y_pred)
    print(f"Metrics for {base_name}: {metrics}")
    torch.cuda.empty_cache()
    gc.collect()
    time.sleep(1)

# Aggregate results across all test files.
agg_y_true = np.concatenate(all_y_true)
agg_y_pred = np.concatenate(all_y_pred)
aggregated_metrics = compute_regression_metrics(agg_y_true, agg_y_pred)

print("\nAggregated Metrics for Entire Test Set:")
print(aggregated_metrics)

# Prepare results to save.
results = {
    "week_metrics": week_metrics,
    "aggregated_metrics": aggregated_metrics,
    "week_predictions": week_predictions
}

# Save results to a JSON file in the LSTM folder.
results_json_path = os.path.join(checkpoint_dir, "test_results.json")
with open(results_json_path, "w") as f:
    json.dump(results, f, indent=4)

print(f"Metrics and predictions saved to {results_json_path}")

Processing file: /data/workspace_files/test_dec/first_week_df.parquet
Metrics for first_week_df.parquet: {'MSE': 1.5185416776022949e-06, 'RMSE': np.float64(0.0012322912308388367), 'MAE': 0.0009004806530252714, 'R2': -11.754886405082777, 'Median Absolute Error': np.float64(0.000488348538056016), 'Explained Variance': -6.570038105125205}
Processing file: /data/workspace_files/test_dec/second_week_df.parquet
Metrics for second_week_df.parquet: {'MSE': 1.5932895917378045e-06, 'RMSE': np.float64(0.0012622557552801272), 'MAE': 0.0009440433832736227, 'R2': -25.83704652117, 'Median Absolute Error': np.float64(0.0005762574170449224), 'Explained Variance': -14.036397888269919}

Aggregated Metrics for Entire Test Set:
{'MSE': 1.5565697801603443e-06, 'RMSE': np.float64(0.0012476256570623835), 'MAE': 0.0009226432510422458, 'R2': -16.550610950501998, 'Median Absolute Error': np.float64(0.0005157595562707701), 'Explained Variance': -9.120155005330588}
Metrics and predictions saved to /data/workspace_