##### Import

In [14]:
import warnings
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from tqdm import tqdm
import shap
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.pipeline import Pipeline
import os
import gc
import sys

# Filter out warning messages
warnings.filterwarnings('ignore')

# Set pandas display options
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

# Set seaborn style
sns.set_style('whitegrid')

# Add the parent directory to sys.path
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# Index and deciles for data slicing
idx = pd.IndexSlice



from pathlib import Path

# Paths to the downloaded datasets, model, and hyperparameters
data_dir = Path('data/')
model_dir = Path('models/')
best_hyperparams_dir = Path('best_hyperparams/')
study_dir = Path('study/')

# Create directories if they do not exist
data_dir.mkdir(parents=True, exist_ok=True)
model_dir.mkdir(parents=True, exist_ok=True)
best_hyperparams_dir.mkdir(parents=True, exist_ok=True)
study_dir.mkdir(parents=True, exist_ok=True)

In [15]:
# from pathlib import Path
# import pandas as pd
# from utils import rank_stocks_and_quantile
# # UNSEEN_KEY = '/data/YEAR_20220803_20230803'
# top = 250  # parameters -> papermill
# DATA_STORE = Path(f'data/{top}_dataset.h5')
# with pd.HDFStore(DATA_STORE) as store:
#     # unseen = store[UNSEEN_KEY]
#     print(store.keys())

In [16]:
"""
Process Large Financial Datasets from HDF5 Format.

This script reads, processes, and normalizes financial datasets stored in an HDF5 format.
The primary processing steps involve converting data types, handling infinite values, and
scaling the dataset. The MinMaxScaler, computed from the entire dataset, is employed for normalization.
Once data processing is complete, stocks are ranked, and quantiles are determined in post-processing.

Attributes:
    - top (int): Number of top stocks to consider.
    - DATA_STORE (Path): Path to the HDF5 file containing the datasets.
    - dataset_keys (list of str): Keys identifying which datasets to process in the HDF5 store.
    - target_string (str): Target column identifier for post-processing.
    - CHUNK_SIZE (int): Size of chunks in which data is read and processed.

Functions:
    - convert_dtype(chunk, feature_columns, dtype='float32'): Converts dtype of specified columns in a chunk.
    - handle_infinite_values(chunk, feature_columns): Handles infinite values in a chunk.
    - process_chunk(chunk, feature_columns, scaler=None): Process a single chunk with optional normalization.

Workflow:
    1. Set parameters and paths.
    2. Define utility functions.
    3. Identify features and target columns from the first chunk.
    4. Determine the MinMaxScaler using all chunks in the dataset.
    5. Process and concatenate chunks to form the dataset.
    6. Rank stocks and compute quantiles in post-processing.
"""

import gc
import numpy as np
import pandas as pd
from pathlib import Path
from utils import rank_stocks_and_quantile
from sklearn.preprocessing import MinMaxScaler

# Parameters and data paths
TOP = top = 500
DATA_STORE = Path(f'data/{top}_dataset.h5')
dataset_keys = [
    '/data/YEAR_20200930_20220802',
    '/data/YEAR_20181024_20200929',
    '/data/YEAR_20161116_20181023',
    '/data/YEAR_20141210_20161115'
]
target_string = 'TARGET_ret_fwd'
CHUNK_SIZE = 50000

def convert_dtype(chunk, feature_columns, dtype='float32'):
    """Converts the datatype of the specified columns."""
    chunk[feature_columns] = chunk[feature_columns].astype(dtype)
    return chunk

def handle_infinite_values(chunk, feature_columns):
    """Handle infinite values by replacing them with the maximum finite value."""
    max_val = np.finfo('float32').max
    chunk[feature_columns] = chunk[feature_columns].replace([np.inf, -np.inf], max_val)
    return chunk

def process_chunk(chunk, feature_columns, scaler=None):
    """Process a single chunk of data."""
    chunk = convert_dtype(chunk, feature_columns)
    chunk = handle_infinite_values(chunk, feature_columns)
    
    # Normalize with scaler if provided
    if scaler:
        chunk[feature_columns] = scaler.transform(chunk[feature_columns])
    
    return chunk

# Identify features and targets based on the first chunk
with pd.HDFStore(DATA_STORE) as store:
    first_chunk = store.select(dataset_keys[0], stop=CHUNK_SIZE)
    features = [col for col in first_chunk.columns if col.startswith('FEATURE_')]
    target = [col for col in first_chunk.columns if col.startswith('TARGET_')]

# Determine the scaler using the entire dataset for the identified features
scaler = MinMaxScaler()

for key in dataset_keys:
    with pd.HDFStore(DATA_STORE) as store:
        for chunk in store.select(key, chunksize=CHUNK_SIZE):
            # Convert dtype and handle infinite values
            chunk = convert_dtype(chunk, features)
            chunk = handle_infinite_values(chunk, features)
            scaler.partial_fit(chunk[features])

# Process and concatenate chunks
dataset = pd.DataFrame()
for key in dataset_keys:
    with pd.HDFStore(DATA_STORE) as store:
        for chunk in store.select(key, chunksize=CHUNK_SIZE):
            processed_chunk = process_chunk(chunk, features, scaler)
            dataset = pd.concat([dataset, processed_chunk], ignore_index=False)
            del processed_chunk
            gc.collect()

# Post-processing steps
dataset = rank_stocks_and_quantile(dataset, target_substring=target_string)
dataset.index.set_levels(dataset.index.levels[0].tz_localize(None), \
    level=0, inplace=True)

In [17]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from joblib import Parallel, delayed

PADDING_VALUE = -1
MAX_LEN = None  # If you have a predefined value, set it here; otherwise, it gets calculated automatically.

def pad_sequence(inputs, padding_value=-1, max_len=None):
    if max_len is None:
        max_len = max([input.shape[0] for input in inputs])
    padded_inputs = []
    masks = []
    for input in inputs:
        pad_len = max_len - input.shape[0]
        padded_input = F.pad(input, (0, 0, 0, pad_len), value=padding_value)
        mask = torch.ones((input.shape[0], 1), dtype=torch.float)
        masks.append(
            torch.cat((mask, torch.zeros((pad_len, 1), dtype=torch.float)), dim=0)
        )
        padded_inputs.append(padded_input)
    return torch.stack(padded_inputs), torch.stack(masks)

def convert_to_torch(timestamp, data):
    feature_names = [col for col in data.columns if col.startswith('FEATURE_')]
    target_names = [col for col in data.columns if col.startswith('TARGET_')]
    
    inputs = torch.from_numpy(
                data[feature_names].values.astype(np.float32))
    labels = torch.from_numpy(
                data[target_names].values.astype(np.float32))

    padded_inputs, masks_inputs = pad_sequence(
            [inputs], padding_value=PADDING_VALUE, max_len=MAX_LEN)
    padded_labels, masks_labels = pad_sequence(
            [labels], padding_value=PADDING_VALUE, max_len=MAX_LEN)

    return {
        timestamp: (
            padded_inputs,
            padded_labels,
            masks_inputs,
            target_names
        )
    }

def get_era2data(df):
    # Group by the Timestamp index (level=0)
    res = Parallel(n_jobs=-1, prefer="threads")(
        delayed(convert_to_torch)(timestamp, data)
        for timestamp, data in tqdm(df.groupby(level=0)))
    
    era2data = {}
    for r in tqdm(res):
        era2data.update(r)
    return era2data

# Assuming DataFrame is named "dataset": testing the function
timestamp2data_dataset = get_era2data(dataset)

  0%|          | 0/1924 [00:00<?, ?it/s]

100%|██████████| 1924/1924 [00:04<00:00, 401.12it/s]
100%|██████████| 1924/1924 [00:00<00:00, 2049223.18it/s]


In [18]:
import torch
import torch.nn as nn

def pearsonr(x, y):
    xm, ym = x - x.mean(), y - y.mean()
    r_num = torch.sum(xm * ym)
    r_den = torch.sqrt(torch.sum(xm ** 2) + 1e-10) * torch.sqrt(torch.sum(ym ** 2) + 1e-10)
    correlation = r_num / r_den
    return correlation.requires_grad_()  # Ensure that the returned tensor requires gradients

def spearmanr(x, y):
    rank_x = x.argsort().argsort().float()
    rank_y = y.argsort().argsort().float()
    return pearsonr(rank_x, rank_y)

def pairwise_ranking_loss(outputs, target_labels, masks_inputs):
    sorted_indices = torch.argsort(target_labels, dim=-1, descending=True)
    sorted_outputs = torch.gather(outputs, -1, sorted_indices)
    
    diff_vector = sorted_outputs[:, 1:] - sorted_outputs[:, :-1]
    sigmoid_diff = 1.0 / (1.0 + torch.exp(-diff_vector))
    loss = -torch.log(torch.clamp(sigmoid_diff, min=1e-10, max=1-1e-10))
    
    min_dim = min(masks_inputs.shape[1], loss.shape[1])
    masked_loss = loss[:, :min_dim] * masks_inputs[:, :min_dim]
    
    return torch.sum(masked_loss)

def calculate_loss(outputs, criterion, target_labels, masks_inputs, alpha_mse=0.5, alpha_corr=1.0, alpha_rank=1.0):
    
    # 1. Print basic info
    # print(f"Outputs Range: {outputs.min().item()}, {outputs.max().item()}")
    # print(f"Target Labels Range: {target_labels.min().item()}, {target_labels.max().item()}")
    # print(f"Masks Range: {masks_inputs.min().item()}, {masks_inputs.max().item()}")
    
    # Calculating the MSE loss
    mse_main = criterion(outputs * masks_inputs, target_labels * masks_inputs)
    # print(f"MSE Loss: {mse_main.item()}")
    
    # Calculating the Spearman Correlation
    non_zero_mask = masks_inputs.view(-1).nonzero().squeeze()
    spearman_corr = spearmanr(outputs[0][:, 0][non_zero_mask], target_labels[0][:, 0][non_zero_mask])
    # print(f"Spearman Correlation: {spearman_corr.item()}")

    # Calculating the Ranking Loss
    ranking_loss = pairwise_ranking_loss(outputs, target_labels, masks_inputs)
    # print(f"Ranking Loss: {ranking_loss.item()}")

    # Using alpha values to weight the losses and normalize them
    losses = [mse_main, -spearman_corr, ranking_loss]
    alphas = [alpha_mse, alpha_corr, alpha_rank]
    weights = [alpha / (loss + 1e-10) for alpha, loss in zip(alphas, losses)]
    normalized_weights = [weight / sum(weights) for weight in weights]
    
    combined_loss = sum(w * l for w, l in zip(normalized_weights, losses))
    
    return combined_loss.requires_grad_(), mse_main, spearman_corr


In [19]:
# Training loop
def train_on_batch(model, criterion, optimizer, batch, lookahead):
    inputs, labels, masks_inputs, target_names = batch

    # Get index for specific label dynamically
    specific_label_name = f'TARGET_ret_fwd_{lookahead:02d}d_rank_quantiled'
    specific_label_index = target_names.index(specific_label_name)

    # Use that index to fetch the specific column
    labels = labels[:, :, specific_label_index].unsqueeze(2)

    # print('labels shape: ', labels.shape)

    # Zero the parameter gradients
    optimizer.zero_grad()

    outputs = model(inputs / 4.0, masks_inputs)

    # print("Output shape: ", outputs.shape)

    assert labels.shape == outputs.shape, \
        f"Shape mismatch: labels {labels.shape}, outputs {outputs.shape}"

    loss, _mse, _corr = calculate_loss(outputs, criterion, labels, masks_inputs)
    
    loss.backward()
    optimizer.step()
    
    return loss.item(), _mse.item(), _corr.item()

def evaluate_on_batch(model, criterion, batch, lookahead):
    inputs, labels, masks_inputs, target_names = batch

    # Get index for specific label dynamically
    specific_label_name = f'TARGET_ret_fwd_{lookahead:02d}d_rank_quantiled'
    specific_label_index = target_names.index(specific_label_name)

    # Use that index to fetch the specific column
    labels = labels[:, :, specific_label_index].unsqueeze(2)

    model.eval()
    with torch.no_grad():
        outputs = model(inputs / 4, masks_inputs)

        # print('output form eval: ', outputs.shape)

        assert labels.shape == outputs.shape, \
            f"Shape mismatch: labels {labels.shape}, outputs {outputs.shape}"

        loss, mse, corr = calculate_loss(outputs, criterion, labels, masks_inputs)

        # Assuming masks_inputs is of shape (1, 253, 1)
        non_zero_indices = masks_inputs.squeeze().nonzero().squeeze()

        # Gather values from outputs tensor using the non-zero indices
        preds = torch.gather(outputs.squeeze(), 0, non_zero_indices).cpu().numpy()

    return loss.item(), mse.item(), corr.item(), preds


def compute_fold_metrics(era_scores, weights=None):
    era_scores = pd.Series(era_scores)
    
    # Calculate metrics
    mean_correlation = np.mean(era_scores)
    std_deviation = np.std(era_scores)
    sharpe_ratio = mean_correlation / std_deviation
    max_dd = (era_scores.cummax() - era_scores).max()

    # Smart Sharpe
    smart_sharpe = mean_correlation \
        / (std_deviation + np.std(era_scores.diff()))
    
    # Autocorrelation
    autocorrelation = era_scores.autocorr()

    metrics = pd.Series({
        'mean_correlation': mean_correlation,
        'std_deviation': std_deviation,
        'sharpe_ratio': sharpe_ratio,
        'smart_sharpe': smart_sharpe,
        'autocorrelation': autocorrelation,
        'max_dd': max_dd,
        'min_correlation': era_scores.min(),
        'max_correlation': era_scores.max(),
    })

    if weights:
        normalized_metrics = (metrics - metrics.min()) / (metrics.max() - metrics.min())
        weighted_values = normalized_metrics.multiply(pd.Series(weights))
        metrics["weighted_score"] = weighted_values.sum()

    _ = gc.collect()

    return metrics

In [20]:
from tqdm import tqdm

def train_model(model, criterion, optimizer, scheduler, \
                num_epochs, patience, train_loader, lookahead, \
                device, val_loader=None, is_lr_scheduler=True):
    best_score = float('-inf')  # Initialize with negative infinity since we want to maximize Sharpe ratio
    best_corr = None
    best_model_wts = None  # Changing from 'best_model' to avoid confusion with the model object
    all_val_scores = []
    all_val_outputs = {}
    no_improve_epoch = 0

    model = model.to(device)

    epoch_progress = tqdm(range(num_epochs), desc="Epochs", leave=False)

    for epoch in epoch_progress:
        total_loss = []
        total_corr = []

        # Training
        for era_num in tqdm(train_loader, desc="Training", leave=False):
            data = train_loader[era_num]
            batch = (data[0].to(device), data[1].to(device), data[2].to(device), data[3])
            
            loss, _mse, _corr = train_on_batch(model, criterion, optimizer, batch, lookahead)
            # print(loss)
            total_loss.append(loss)
            total_corr.append(_corr)

        # Adjust learning rate if is_lr_scheduler is True
        if is_lr_scheduler:
            scheduler.step()

        # Validation - Only if val_loader is provided
        if val_loader:
            val_total_loss = []
            val_total_corr = []
            val_total_outputs = {}

            with torch.no_grad():
                for era_num in tqdm(val_loader, desc="Validation", leave=False):
                    data = val_loader[era_num]
                    batch = (data[0].to(device), data[1].to(device), data[2].to(device), data[3])
                    
                    loss, _mse, _corr, outputs = evaluate_on_batch(model, criterion, batch, lookahead)
                    val_total_loss.append(loss)
                    val_total_corr.append(_corr)
                    val_total_outputs[era_num] = outputs

            all_val_scores.append(val_total_corr) 
            all_val_outputs.update(val_total_outputs)

            # Early stopping check based on Sharpe score
            current_score = np.mean(val_total_corr) / np.std(val_total_corr)  # Assuming Sharpe ratio here
            if current_score > best_score:
                best_score = current_score
                best_corr = val_total_corr.copy()
                best_model_wts = model.state_dict().copy()
                no_improve_epoch = 0
            else:
                no_improve_epoch += 1
                if no_improve_epoch >= patience:
                    epoch_progress.set_description(f'Early stopping at epoch {epoch+1}')
                    epoch_progress.refresh()
                    break

        torch.cuda.empty_cache()
        _ = gc.collect()

    if val_loader:  # If validation data was provided
        return best_model_wts, best_corr, all_val_scores
    else:  # If only training data was used without validation
        return model.state_dict(), None, None

In [21]:
import optuna
import mlflow
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from utils import CustomBackwardMultipleTimeSeriesCV
from model import Transformer
from model import RankPredictorNN
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


# Constants and hyperparameters
NUM_EPOCHS = 15
PATIENCE = 5
FEATURE_DIM = len(features)  # Assuming 'features' is defined elsewhere in your code
OUTPUT_DIM = 1
NUM_TRAIL = 25
device = "cuda" if torch.cuda.is_available() else "cpu"

# # Choose model
# model = Transformer(
#     input_dim=FEATURE_DIM,
#     d_model=hidden_dim,
#     output_dim=OUTPUT_DIM,
#     num_heads=num_heads,
#     num_layers=num_layers,
# ).to(device)

weights = {
    'mean_correlation': 0.0,
    'std_deviation': -0.025, # Mild penalty for higher volatility
    'sharpe_ratio': 0.95,    # Primary objective, so highest weight
    'smart_sharpe': 0.075,   # Supplementary to Sharpe Ratio but considering autocorrelation
    'autocorrelation': -0.1, # Penalize strategies showing signs of overfitting
    'max_dd': -0.1,          # Major risk metric, negative to penalize higher drawdowns
    'min_correlation': 0.0,
    'max_correlation': 0.0,
}

def objective(trial, dataset, device):  # Placeholder for dataset
    print(f"\n--- Starting Trial: {trial.number + 1} ---")

    # Suggest hyperparameters
    train_length_multiplier = trial.suggest_int('train_length_multiplier', 10, 15)
    val_period_length = trial.suggest_categorical('val_period_length', [21, 42, 63])
    lookahead = trial.suggest_categorical('lookahead', [1, 5, 21])
    num_heads = trial.suggest_int("num_heads", 1, 5)
    hidden_dim = trial.suggest_int("hidden_dim", 64, 256, step=2)
    num_layers = trial.suggest_int("num_layers", 1, 5)
    lr = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)

    print(f"Hyperparameters for this trial: {trial.params}")


    # Initialize CV and other variables
    cv = CustomBackwardMultipleTimeSeriesCV(dataset,
                                    train_period_length=int(21 * train_length_multiplier),
                                    test_period_length=val_period_length,
                                    lookahead=lookahead, date_idx='date')

    cv.update_lookahead(lookahead)
    fold_weighted_scores = []
    for train_idx, test_idx in cv:
        # Choose model
        model = Transformer(
            input_dim=FEATURE_DIM,
            d_model=hidden_dim,
            output_dim=OUTPUT_DIM,
            num_heads=num_heads,
            num_layers=num_layers).to(device)

        # # Initialize model, loss, optimizer
        # model = RankPredictorNN(input_dim=FEATURE_DIM, output_dim=OUTPUT_DIM).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)
        scheduler = StepLR(optimizer, step_size=100, gamma=0.1)

        # Prepare data batches
        train_data = dataset.iloc[train_idx]
        test_data = dataset.iloc[test_idx]
        train_batches = get_era2data(train_data)
        validation_batches = get_era2data(test_data)

        # Train and validate model
        _, val_corr_on_fold, _ = train_model(
            model, criterion, optimizer, scheduler, NUM_EPOCHS, PATIENCE,
            train_batches, lookahead, device, validation_batches, is_lr_scheduler=True
        )

        # print(val_corr_on_fold)

        # Compute metrics
        scores_on_fold = compute_fold_metrics(val_corr_on_fold)

        # Normalize and weight scores
        normalized_scores = (scores_on_fold - scores_on_fold.min()) \
            / (scores_on_fold.max() - scores_on_fold.min())
        weighted_scores_on_fold = normalized_scores.multiply(pd.Series(weights))

        # Append to list
        fold_weighted_scores.append(weighted_scores_on_fold.sum())

    # Calculate overall score
    overall_score = np.mean(fold_weighted_scores)
    # print('==================')
    # print('Overall score: ', overall_score)
    # print('==================')

    # Log metrics
    with mlflow.start_run():
        mlflow.log_params(trial.params)
        mlflow.log_metric("avg_score_across_folds", overall_score)

    return -overall_score if not np.isnan(overall_score) else 1e-9

def callback(study, trial):
    print(f"\n--- Trial {trial.number + 1} finished ---")
    print(f"Value: {trial.value} and parameters: {trial.params}")
    
    completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    
    if completed_trials:
        best_trial_number = study.best_trial.number + 1  # Adding 1 to align with your display
        print(f"Best is trial {best_trial_number} with value: {study.best_trial.value}\n")
    else:
        print("No successful trials yet.\n")

study_dir = "/home/sayem/Desktop/Project/study"
# study = optuna.create_study(study_name='Maximizing the Sharpe', direction='minimize',
#                             storage=f'sqlite:///{study_dir}/study.db', load_if_exists=True)
study = optuna.create_study(study_name='Maximizing the Sharpe', \
    direction='minimize', load_if_exists=True)
# study.optimize(objective, n_trials=NUM_TRAIL, callbacks=[callback])
study.optimize(lambda trial: objective(trial, dataset, device), n_trials=NUM_TRAIL, callbacks=[callback])

[I 2023-10-11 00:11:58,104] A new study created in memory with name: Maximizing the Sharpe



--- Starting Trial: 1 ---
Hyperparameters for this trial: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 158, 'num_layers': 4, 'learning_rate': 0.0003473467864674847}


100%|██████████| 210/210 [00:00<00:00, 495.10it/s]
100%|██████████| 210/210 [00:00<00:00, 1846548.93it/s]
100%|██████████| 21/21 [00:00<00:00, 3973.13it/s]
100%|██████████| 21/21 [00:00<00:00, 579476.21it/s]
Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 210/210 [00:00<00:00, 408.40it/s]     
100%|██████████| 210/210 [00:00<00:00, 1744166.02it/s]
100%|██████████| 21/21 [00:00<00:00, 4079.12it/s]
100%|██████████| 21/21 [00:00<00:00, 587202.56it/s]
100%|██████████| 210/210 [00:00<00:00, 411.04it/s]                       
100%|██████████| 210/210 [00:00<00:00, 1700393.51it/s]
100%|██████████| 21/21 [00:00<00:00, 4025.43it/s]
100%|██████████| 21/21 [00:00<00:00, 611669.33it/s]
100%|██████████| 210/210 [00:00<00:00, 441.13it/s]                       
100%|██████████| 210/210 [00:00<00:00, 1558944.85it/s]
100%|██████████| 21/21 [00:00<00:00, 4007.84it/s]
100%|██████████| 21/21 [00:00<00:00, 624683.57it/s]
100%|██████████| 210/210 [00:00<00:00, 396.71it/s]     
100%|██████████| 210/210 [00:00<00:00, 1747626.67it/s]
100%|██████████| 21/21 [00:00<00:00, 4069.88it/s]
100%|██████████| 21/21 [00:00<00:00, 225269.52it/s]
100%|██████████| 210/210 [00:00<00:00, 352.64it/s]                        
100%|██████████| 210/210 [00:00<00:00


--- Trial 1 finished ---
Value: -0.9848623647133862 and parameters: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 158, 'num_layers': 4, 'learning_rate': 0.0003473467864674847}
Best is trial 1 with value: -0.9848623647133862


--- Starting Trial: 2 ---
Hyperparameters for this trial: {'train_length_multiplier': 13, 'val_period_length': 63, 'lookahead': 1, 'num_heads': 5, 'hidden_dim': 218, 'num_layers': 3, 'learning_rate': 0.0017896948297652798}


100%|██████████| 273/273 [00:00<00:00, 415.96it/s]
100%|██████████| 273/273 [00:00<00:00, 1714139.21it/s]
100%|██████████| 63/63 [00:00<00:00, 172.35it/s]
100%|██████████| 63/63 [00:00<00:00, 1179648.00it/s]
100%|██████████| 273/273 [00:00<00:00, 386.30it/s]                       
100%|██████████| 273/273 [00:00<00:00, 2081899.99it/s]
100%|██████████| 63/63 [00:00<00:00, 285.68it/s]
100%|██████████| 63/63 [00:00<00:00, 1223338.67it/s]
100%|██████████| 273/273 [00:00<00:00, 416.36it/s]                        
100%|██████████| 273/273 [00:00<00:00, 2081899.99it/s]
100%|██████████| 63/63 [00:00<00:00, 169.89it/s]
100%|██████████| 63/63 [00:00<00:00, 1153891.49it/s]
100%|██████████| 273/273 [00:00<00:00, 322.74it/s]                         
100%|██████████| 273/273 [00:00<00:00, 1849830.36it/s]
100%|██████████| 63/63 [00:00<00:00, 303.31it/s]
100%|██████████| 63/63 [00:00<00:00, 1052753.59it/s]
100%|██████████| 273/273 [00:00<00:00, 433.11it/s]                       
100%|██████████| 273/2


--- Trial 2 finished ---
Value: -0.8428163189409913 and parameters: {'train_length_multiplier': 13, 'val_period_length': 63, 'lookahead': 1, 'num_heads': 5, 'hidden_dim': 218, 'num_layers': 3, 'learning_rate': 0.0017896948297652798}
Best is trial 1 with value: -0.9848623647133862


--- Starting Trial: 3 ---
Hyperparameters for this trial: {'train_length_multiplier': 12, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 3, 'hidden_dim': 150, 'num_layers': 4, 'learning_rate': 0.026266664491742715}


100%|██████████| 252/252 [00:00<00:00, 403.09it/s]
100%|██████████| 252/252 [00:00<00:00, 1785413.19it/s]
100%|██████████| 21/21 [00:00<00:00, 3301.36it/s]
100%|██████████| 21/21 [00:00<00:00, 478697.74it/s]
100%|██████████| 252/252 [00:00<00:00, 589.31it/s]                       
100%|██████████| 252/252 [00:00<00:00, 2060359.86it/s]
100%|██████████| 21/21 [00:00<00:00, 4052.09it/s]
100%|██████████| 21/21 [00:00<00:00, 494833.62it/s]
100%|██████████| 252/252 [00:00<00:00, 355.26it/s]                       
100%|██████████| 252/252 [00:00<00:00, 2005625.44it/s]
100%|██████████| 21/21 [00:00<00:00, 4173.83it/s]
100%|██████████| 21/21 [00:00<00:00, 587202.56it/s]
100%|██████████| 252/252 [00:00<00:00, 489.78it/s]                         
100%|██████████| 252/252 [00:00<00:00, 1914790.96it/s]
100%|██████████| 21/21 [00:00<00:00, 3938.49it/s]
100%|██████████| 21/21 [00:00<00:00, 421437.24it/s]
100%|██████████| 252/252 [00:00<00:00, 359.31it/s]                        
100%|██████████| 252/2


--- Trial 3 finished ---
Value: -0.8648876148490614 and parameters: {'train_length_multiplier': 12, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 3, 'hidden_dim': 150, 'num_layers': 4, 'learning_rate': 0.026266664491742715}
Best is trial 1 with value: -0.9848623647133862


--- Starting Trial: 4 ---
Hyperparameters for this trial: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 5, 'num_heads': 5, 'hidden_dim': 238, 'num_layers': 1, 'learning_rate': 0.05527322921866347}


100%|██████████| 210/210 [00:00<00:00, 434.97it/s]
100%|██████████| 210/210 [00:00<00:00, 1902384.10it/s]
100%|██████████| 21/21 [00:00<00:00, 3382.37it/s]
100%|██████████| 21/21 [00:00<00:00, 603290.30it/s]
100%|██████████| 210/210 [00:00<00:00, 507.28it/s]                       
100%|██████████| 210/210 [00:00<00:00, 1819842.64it/s]
100%|██████████| 21/21 [00:00<00:00, 4018.45it/s]
100%|██████████| 21/21 [00:00<00:00, 716100.68it/s]
100%|██████████| 210/210 [00:00<00:00, 319.88it/s]                       
100%|██████████| 210/210 [00:00<00:00, 1754589.32it/s]
100%|██████████| 21/21 [00:00<00:00, 3356.08it/s]
100%|██████████| 21/21 [00:00<00:00, 521185.70it/s]
100%|██████████| 210/210 [00:00<00:00, 407.60it/s]                       
100%|██████████| 210/210 [00:00<00:00, 2029501.94it/s]
100%|██████████| 21/21 [00:00<00:00, 3802.30it/s]
100%|██████████| 21/21 [00:00<00:00, 506209.10it/s]
100%|██████████| 210/210 [00:00<00:00, 344.91it/s]                       
100%|██████████| 210/210 


--- Trial 4 finished ---
Value: -0.9692084802908885 and parameters: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 5, 'num_heads': 5, 'hidden_dim': 238, 'num_layers': 1, 'learning_rate': 0.05527322921866347}
Best is trial 1 with value: -0.9848623647133862


--- Starting Trial: 5 ---
Hyperparameters for this trial: {'train_length_multiplier': 12, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 5, 'hidden_dim': 108, 'num_layers': 3, 'learning_rate': 0.0585119467810109}


100%|██████████| 252/252 [00:00<00:00, 413.87it/s]
100%|██████████| 252/252 [00:00<00:00, 1918266.08it/s]
100%|██████████| 63/63 [00:00<00:00, 181.39it/s]
100%|██████████| 63/63 [00:00<00:00, 1264311.73it/s]
100%|██████████| 252/252 [00:00<00:00, 414.78it/s]                         
100%|██████████| 252/252 [00:00<00:00, 2020964.83it/s]
100%|██████████| 63/63 [00:00<00:00, 191.06it/s]
100%|██████████| 63/63 [00:00<00:00, 1169208.64it/s]
100%|██████████| 252/252 [00:00<00:00, 371.41it/s]                       
100%|██████████| 252/252 [00:00<00:00, 1998042.74it/s]
100%|██████████| 63/63 [00:00<00:00, 271.54it/s]
100%|██████████| 63/63 [00:00<00:00, 1390742.91it/s]
100%|██████████| 252/252 [00:00<00:00, 445.09it/s]                       
100%|██████████| 252/252 [00:00<00:00, 1797558.86it/s]
100%|██████████| 63/63 [00:00<00:00, 330.07it/s]
100%|██████████| 63/63 [00:00<00:00, 670662.82it/s]
100%|██████████| 252/252 [00:00<00:00, 346.00it/s]                       
100%|██████████| 252/252


--- Trial 5 finished ---
Value: -0.9220208310596503 and parameters: {'train_length_multiplier': 12, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 5, 'hidden_dim': 108, 'num_layers': 3, 'learning_rate': 0.0585119467810109}
Best is trial 1 with value: -0.9848623647133862


--- Starting Trial: 6 ---
Hyperparameters for this trial: {'train_length_multiplier': 14, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 82, 'num_layers': 3, 'learning_rate': 0.00033780444725404}


100%|██████████| 294/294 [00:00<00:00, 412.05it/s]
100%|██████████| 294/294 [00:00<00:00, 1759094.69it/s]
100%|██████████| 21/21 [00:00<00:00, 4141.84it/s]
100%|██████████| 21/21 [00:00<00:00, 325019.87it/s]
100%|██████████| 294/294 [00:00<00:00, 501.18it/s]     
100%|██████████| 294/294 [00:00<00:00, 1932798.39it/s]
100%|██████████| 21/21 [00:00<00:00, 3468.69it/s]
100%|██████████| 21/21 [00:00<00:00, 547083.13it/s]
100%|██████████| 294/294 [00:00<00:00, 384.39it/s]     
100%|██████████| 294/294 [00:00<00:00, 1556976.48it/s]
100%|██████████| 21/21 [00:00<00:00, 3563.41it/s]
100%|██████████| 21/21 [00:00<00:00, 425509.10it/s]
100%|██████████| 294/294 [00:00<00:00, 417.58it/s]                       
100%|██████████| 294/294 [00:00<00:00, 1944992.71it/s]
100%|██████████| 21/21 [00:00<00:00, 4165.34it/s]
100%|██████████| 21/21 [00:00<00:00, 652447.29it/s]
100%|██████████| 294/294 [00:00<00:00, 454.69it/s]     
100%|██████████| 294/294 [00:00<00:00, 1818768.99it/s]
100%|██████████| 21/21 [


--- Trial 6 finished ---
Value: -0.9845414729575974 and parameters: {'train_length_multiplier': 14, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 82, 'num_layers': 3, 'learning_rate': 0.00033780444725404}
Best is trial 1 with value: -0.9848623647133862


--- Starting Trial: 7 ---
Hyperparameters for this trial: {'train_length_multiplier': 13, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 1, 'hidden_dim': 84, 'num_layers': 3, 'learning_rate': 0.0006220901331040518}


100%|██████████| 273/273 [00:00<00:00, 344.71it/s]
100%|██████████| 273/273 [00:00<00:00, 1930935.91it/s]
100%|██████████| 63/63 [00:00<00:00, 184.35it/s]
100%|██████████| 63/63 [00:00<00:00, 1276527.30it/s]
100%|██████████| 273/273 [00:00<00:00, 489.18it/s]     
100%|██████████| 273/273 [00:00<00:00, 2023047.69it/s]
100%|██████████| 63/63 [00:00<00:00, 278.85it/s]
100%|██████████| 63/63 [00:00<00:00, 1201096.15it/s]
100%|██████████| 273/273 [00:00<00:00, 405.10it/s]     
100%|██████████| 273/273 [00:00<00:00, 1789132.80it/s]
100%|██████████| 63/63 [00:00<00:00, 170.53it/s]
100%|██████████| 63/63 [00:00<00:00, 1056964.61it/s]
100%|██████████| 273/273 [00:00<00:00, 454.05it/s]                         
100%|██████████| 273/273 [00:00<00:00, 2030221.62it/s]
100%|██████████| 63/63 [00:00<00:00, 173.98it/s]
100%|██████████| 63/63 [00:00<00:00, 1040319.50it/s]
100%|██████████| 273/273 [00:00<00:00, 444.50it/s]                         
100%|██████████| 273/273 [00:00<00:00, 2093318.08it/s]
10


--- Trial 7 finished ---
Value: -0.9871639090080142 and parameters: {'train_length_multiplier': 13, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 1, 'hidden_dim': 84, 'num_layers': 3, 'learning_rate': 0.0006220901331040518}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 8 ---
Hyperparameters for this trial: {'train_length_multiplier': 15, 'val_period_length': 63, 'lookahead': 5, 'num_heads': 4, 'hidden_dim': 158, 'num_layers': 5, 'learning_rate': 0.00029069120879957825}


100%|██████████| 315/315 [00:00<00:00, 414.41it/s]
100%|██████████| 315/315 [00:00<00:00, 2117316.92it/s]
100%|██████████| 63/63 [00:00<00:00, 273.55it/s]
100%|██████████| 63/63 [00:00<00:00, 1398101.33it/s]
100%|██████████| 315/315 [00:00<00:00, 480.37it/s]                       
100%|██████████| 315/315 [00:00<00:00, 1780600.75it/s]
100%|██████████| 63/63 [00:00<00:00, 205.09it/s]
100%|██████████| 63/63 [00:00<00:00, 818084.06it/s]
100%|██████████| 315/315 [00:00<00:00, 444.82it/s]                       
100%|██████████| 315/315 [00:00<00:00, 1942949.65it/s]
100%|██████████| 63/63 [00:00<00:00, 294.14it/s]
100%|██████████| 63/63 [00:00<00:00, 943718.40it/s]
100%|██████████| 315/315 [00:00<00:00, 432.98it/s]                         
100%|██████████| 315/315 [00:00<00:00, 1995779.09it/s]
100%|██████████| 63/63 [00:00<00:00, 244.57it/s]
100%|██████████| 63/63 [00:00<00:00, 1101004.80it/s]
[I 2023-10-11 01:05:50,920] Trial 7 finished with value: -0.9691435098194071 and parameters: {'trai


--- Trial 8 finished ---
Value: -0.9691435098194071 and parameters: {'train_length_multiplier': 15, 'val_period_length': 63, 'lookahead': 5, 'num_heads': 4, 'hidden_dim': 158, 'num_layers': 5, 'learning_rate': 0.00029069120879957825}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 9 ---
Hyperparameters for this trial: {'train_length_multiplier': 14, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 3, 'hidden_dim': 138, 'num_layers': 3, 'learning_rate': 0.001363938705095717}


100%|██████████| 294/294 [00:00<00:00, 528.23it/s]
100%|██████████| 294/294 [00:00<00:00, 1779401.70it/s]
100%|██████████| 63/63 [00:00<00:00, 310.77it/s]
100%|██████████| 63/63 [00:00<00:00, 1327844.98it/s]
100%|██████████| 294/294 [00:00<00:00, 437.53it/s]                         
100%|██████████| 294/294 [00:00<00:00, 2075968.65it/s]
100%|██████████| 63/63 [00:00<00:00, 291.23it/s]
100%|██████████| 63/63 [00:00<00:00, 1301680.55it/s]
100%|██████████| 294/294 [00:00<00:00, 428.87it/s]                        
100%|██████████| 294/294 [00:00<00:00, 1894201.81it/s]
100%|██████████| 63/63 [00:00<00:00, 272.82it/s]
100%|██████████| 63/63 [00:00<00:00, 1148874.57it/s]
100%|██████████| 294/294 [00:00<00:00, 373.29it/s]                         
100%|██████████| 294/294 [00:00<00:00, 1668640.56it/s]
100%|██████████| 63/63 [00:00<00:00, 292.89it/s]
100%|██████████| 63/63 [00:00<00:00, 818084.06it/s]
[I 2023-10-11 01:11:49,797] Trial 8 finished with value: -0.954890393299648 and parameters: {'t


--- Trial 9 finished ---
Value: -0.954890393299648 and parameters: {'train_length_multiplier': 14, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 3, 'hidden_dim': 138, 'num_layers': 3, 'learning_rate': 0.001363938705095717}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 10 ---
Hyperparameters for this trial: {'train_length_multiplier': 12, 'val_period_length': 42, 'lookahead': 5, 'num_heads': 2, 'hidden_dim': 240, 'num_layers': 3, 'learning_rate': 0.07099983988470826}


100%|██████████| 252/252 [00:00<00:00, 340.82it/s]
100%|██████████| 252/252 [00:00<00:00, 1501370.18it/s]
100%|██████████| 42/42 [00:00<00:00, 214.28it/s]
100%|██████████| 42/42 [00:00<00:00, 1024190.51it/s]
100%|██████████| 252/252 [00:00<00:00, 401.15it/s]                       
100%|██████████| 252/252 [00:00<00:00, 2092999.22it/s]
100%|██████████| 42/42 [00:00<00:00, 202.98it/s]
100%|██████████| 42/42 [00:00<00:00, 1061209.45it/s]
100%|██████████| 252/252 [00:00<00:00, 408.44it/s]                       
100%|██████████| 252/252 [00:00<00:00, 1563557.11it/s]
100%|██████████| 42/42 [00:00<00:00, 204.64it/s]
100%|██████████| 42/42 [00:00<00:00, 863533.18it/s]
100%|██████████| 252/252 [00:00<00:00, 421.02it/s]                       
100%|██████████| 252/252 [00:00<00:00, 2072479.62it/s]
100%|██████████| 42/42 [00:00<00:00, 229.76it/s]
100%|██████████| 42/42 [00:00<00:00, 894217.10it/s]
100%|██████████| 252/252 [00:00<00:00, 439.16it/s]                       
100%|██████████| 252/252 [0


--- Trial 10 finished ---
Value: -0.9760775347148258 and parameters: {'train_length_multiplier': 12, 'val_period_length': 42, 'lookahead': 5, 'num_heads': 2, 'hidden_dim': 240, 'num_layers': 3, 'learning_rate': 0.07099983988470826}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 11 ---
Hyperparameters for this trial: {'train_length_multiplier': 11, 'val_period_length': 42, 'lookahead': 1, 'num_heads': 1, 'hidden_dim': 72, 'num_layers': 1, 'learning_rate': 1.0093512175007751e-05}


100%|██████████| 231/231 [00:00<00:00, 319.68it/s]
100%|██████████| 231/231 [00:00<00:00, 1435384.04it/s]
100%|██████████| 42/42 [00:00<00:00, 232.96it/s]
100%|██████████| 42/42 [00:00<00:00, 898779.43it/s]
100%|██████████| 231/231 [00:00<00:00, 354.87it/s]                       
100%|██████████| 231/231 [00:00<00:00, 1742597.53it/s]
100%|██████████| 42/42 [00:00<00:00, 232.40it/s]
100%|██████████| 42/42 [00:00<00:00, 984138.37it/s]
100%|██████████| 231/231 [00:00<00:00, 552.23it/s]                        
100%|██████████| 231/231 [00:00<00:00, 1957341.87it/s]
100%|██████████| 42/42 [00:00<00:00, 213.91it/s]
100%|██████████| 42/42 [00:00<00:00, 898779.43it/s]
100%|██████████| 231/231 [00:00<00:00, 600.79it/s]                       
100%|██████████| 231/231 [00:00<00:00, 1989495.33it/s]
100%|██████████| 42/42 [00:00<00:00, 214.78it/s]
100%|██████████| 42/42 [00:00<00:00, 1012418.21it/s]
100%|██████████| 231/231 [00:00<00:00, 363.70it/s]                       
100%|██████████| 231/231 [0


--- Trial 11 finished ---
Value: -0.8486487591183685 and parameters: {'train_length_multiplier': 11, 'val_period_length': 42, 'lookahead': 1, 'num_heads': 1, 'hidden_dim': 72, 'num_layers': 1, 'learning_rate': 1.0093512175007751e-05}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 12 ---
Hyperparameters for this trial: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 1, 'hidden_dim': 192, 'num_layers': 5, 'learning_rate': 0.00012695339635823068}


100%|██████████| 210/210 [00:00<00:00, 400.74it/s]
100%|██████████| 210/210 [00:00<00:00, 1495422.48it/s]
100%|██████████| 21/21 [00:00<00:00, 4015.52it/s]
100%|██████████| 21/21 [00:00<00:00, 587202.56it/s]
100%|██████████| 210/210 [00:00<00:00, 386.64it/s]     
100%|██████████| 210/210 [00:00<00:00, 1858235.95it/s]
100%|██████████| 21/21 [00:00<00:00, 4235.45it/s]
100%|██████████| 21/21 [00:00<00:00, 533820.51it/s]
100%|██████████| 210/210 [00:00<00:00, 412.77it/s]                         
100%|██████████| 210/210 [00:00<00:00, 1953001.86it/s]
100%|██████████| 21/21 [00:00<00:00, 4444.91it/s]
100%|██████████| 21/21 [00:00<00:00, 620284.39it/s]
100%|██████████| 210/210 [00:00<00:00, 397.94it/s]     
100%|██████████| 210/210 [00:00<00:00, 1458284.50it/s]
100%|██████████| 21/21 [00:00<00:00, 3643.60it/s]
100%|██████████| 21/21 [00:00<00:00, 314572.80it/s]
100%|██████████| 210/210 [00:00<00:00, 386.08it/s]     
100%|██████████| 210/210 [00:00<00:00, 1948681.06it/s]
100%|██████████| 21/21


--- Trial 12 finished ---
Value: -0.981493693109277 and parameters: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 1, 'hidden_dim': 192, 'num_layers': 5, 'learning_rate': 0.00012695339635823068}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 13 ---
Hyperparameters for this trial: {'train_length_multiplier': 13, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 2, 'hidden_dim': 114, 'num_layers': 4, 'learning_rate': 0.004710422168153909}


100%|██████████| 273/273 [00:00<00:00, 383.02it/s]
100%|██████████| 273/273 [00:00<00:00, 1506638.15it/s]
100%|██████████| 63/63 [00:00<00:00, 312.94it/s]
100%|██████████| 63/63 [00:00<00:00, 1334551.27it/s]
100%|██████████| 273/273 [00:00<00:00, 401.83it/s]                       
100%|██████████| 273/273 [00:00<00:00, 1895769.85it/s]
100%|██████████| 63/63 [00:00<00:00, 293.86it/s]
100%|██████████| 63/63 [00:00<00:00, 1158952.42it/s]
100%|██████████| 273/273 [00:00<00:00, 389.91it/s]                        
100%|██████████| 273/273 [00:00<00:00, 1861861.78it/s]
100%|██████████| 63/63 [00:00<00:00, 297.23it/s]
100%|██████████| 63/63 [00:00<00:00, 1264311.73it/s]
100%|██████████| 273/273 [00:00<00:00, 395.57it/s]                       
100%|██████████| 273/273 [00:00<00:00, 1886400.32it/s]
100%|██████████| 63/63 [00:00<00:00, 262.84it/s]
100%|██████████| 63/63 [00:00<00:00, 1206580.60it/s]
100%|██████████| 273/273 [00:00<00:00, 455.32it/s]                       
100%|██████████| 273/273


--- Trial 13 finished ---
Value: -0.8083503468462727 and parameters: {'train_length_multiplier': 13, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 2, 'hidden_dim': 114, 'num_layers': 4, 'learning_rate': 0.004710422168153909}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 14 ---
Hyperparameters for this trial: {'train_length_multiplier': 11, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 186, 'num_layers': 2, 'learning_rate': 7.879766861905456e-05}


100%|██████████| 231/231 [00:00<00:00, 392.02it/s]
100%|██████████| 231/231 [00:00<00:00, 1509165.46it/s]
100%|██████████| 21/21 [00:00<00:00, 4306.90it/s]
100%|██████████| 21/21 [00:00<00:00, 537075.51it/s]
100%|██████████| 231/231 [00:00<00:00, 404.46it/s]                       
100%|██████████| 231/231 [00:00<00:00, 1673375.17it/s]
100%|██████████| 21/21 [00:00<00:00, 3759.46it/s]
100%|██████████| 21/21 [00:00<00:00, 672369.34it/s]
100%|██████████| 231/231 [00:00<00:00, 439.98it/s]                         
100%|██████████| 231/231 [00:00<00:00, 1985418.49it/s]
100%|██████████| 21/21 [00:00<00:00, 4419.71it/s]
100%|██████████| 21/21 [00:00<00:00, 568260.54it/s]
100%|██████████| 231/231 [00:00<00:00, 439.01it/s]     
100%|██████████| 231/231 [00:00<00:00, 1866828.95it/s]
100%|██████████| 21/21 [00:00<00:00, 4065.37it/s]
100%|██████████| 21/21 [00:00<00:00, 247416.81it/s]
100%|██████████| 231/231 [00:00<00:00, 388.28it/s]                       
100%|██████████| 231/231 [00:00<00:00, 18


--- Trial 14 finished ---
Value: -0.9827448160829343 and parameters: {'train_length_multiplier': 11, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 186, 'num_layers': 2, 'learning_rate': 7.879766861905456e-05}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 15 ---
Hyperparameters for this trial: {'train_length_multiplier': 11, 'val_period_length': 42, 'lookahead': 21, 'num_heads': 2, 'hidden_dim': 116, 'num_layers': 4, 'learning_rate': 0.006669750671065963}


100%|██████████| 231/231 [00:00<00:00, 488.05it/s]
100%|██████████| 231/231 [00:00<00:00, 1800900.04it/s]
100%|██████████| 42/42 [00:00<00:00, 225.09it/s]
100%|██████████| 42/42 [00:00<00:00, 730957.54it/s]
100%|██████████| 231/231 [00:00<00:00, 440.74it/s]                       
100%|██████████| 231/231 [00:00<00:00, 1896055.23it/s]
100%|██████████| 42/42 [00:00<00:00, 227.09it/s]
100%|██████████| 42/42 [00:00<00:00, 978670.93it/s]
100%|██████████| 231/231 [00:00<00:00, 378.57it/s]                       
100%|██████████| 231/231 [00:00<00:00, 1644964.73it/s]
100%|██████████| 42/42 [00:00<00:00, 228.18it/s]
100%|██████████| 42/42 [00:00<00:00, 1024190.51it/s]
100%|██████████| 231/231 [00:00<00:00, 288.29it/s]                         
100%|██████████| 231/231 [00:00<00:00, 1497502.66it/s]
100%|██████████| 42/42 [00:00<00:00, 233.93it/s]
100%|██████████| 42/42 [00:00<00:00, 903388.55it/s]
100%|██████████| 231/231 [00:00<00:00, 369.29it/s]                         
100%|██████████| 231/231


--- Trial 15 finished ---
Value: -0.878169183024307 and parameters: {'train_length_multiplier': 11, 'val_period_length': 42, 'lookahead': 21, 'num_heads': 2, 'hidden_dim': 116, 'num_layers': 4, 'learning_rate': 0.006669750671065963}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 16 ---
Hyperparameters for this trial: {'train_length_multiplier': 14, 'val_period_length': 63, 'lookahead': 1, 'num_heads': 3, 'hidden_dim': 184, 'num_layers': 2, 'learning_rate': 0.0006536171393274721}


100%|██████████| 294/294 [00:00<00:00, 426.64it/s]
100%|██████████| 294/294 [00:00<00:00, 1865545.20it/s]
100%|██████████| 63/63 [00:00<00:00, 278.15it/s]
100%|██████████| 63/63 [00:00<00:00, 1217701.16it/s]
100%|██████████| 294/294 [00:00<00:00, 356.24it/s]                       
100%|██████████| 294/294 [00:00<00:00, 1976162.46it/s]
100%|██████████| 63/63 [00:00<00:00, 266.84it/s]
100%|██████████| 63/63 [00:00<00:00, 1405538.04it/s]
100%|██████████| 294/294 [00:00<00:00, 394.79it/s]                       
100%|██████████| 294/294 [00:00<00:00, 1597312.66it/s]
100%|██████████| 63/63 [00:00<00:00, 357.36it/s]
100%|██████████| 63/63 [00:00<00:00, 1459895.87it/s]
100%|██████████| 294/294 [00:00<00:00, 439.15it/s]                       
100%|██████████| 294/294 [00:00<00:00, 1963575.44it/s]
100%|██████████| 63/63 [00:00<00:00, 234.12it/s]
100%|██████████| 63/63 [00:00<00:00, 1056964.61it/s]
100%|██████████| 294/294 [00:00<00:00, 471.57it/s]                       
100%|██████████| 294/294 


--- Trial 16 finished ---
Value: -0.4503906773701508 and parameters: {'train_length_multiplier': 14, 'val_period_length': 63, 'lookahead': 1, 'num_heads': 3, 'hidden_dim': 184, 'num_layers': 2, 'learning_rate': 0.0006536171393274721}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 17 ---
Hyperparameters for this trial: {'train_length_multiplier': 13, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 1, 'hidden_dim': 90, 'num_layers': 4, 'learning_rate': 6.469672473694674e-05}


100%|██████████| 273/273 [00:00<00:00, 451.43it/s]
100%|██████████| 273/273 [00:00<00:00, 1849830.36it/s]
100%|██████████| 21/21 [00:00<00:00, 3064.41it/s]
100%|██████████| 21/21 [00:00<00:00, 564617.85it/s]
100%|██████████| 273/273 [00:00<00:00, 419.74it/s]     
100%|██████████| 273/273 [00:00<00:00, 1608209.26it/s]
100%|██████████| 21/21 [00:00<00:00, 3587.65it/s]
100%|██████████| 21/21 [00:00<00:00, 259824.14it/s]
100%|██████████| 273/273 [00:00<00:00, 409.84it/s]                        
100%|██████████| 273/273 [00:00<00:00, 1626484.36it/s]
100%|██████████| 21/21 [00:00<00:00, 4129.03it/s]
100%|██████████| 21/21 [00:00<00:00, 746443.93it/s]
100%|██████████| 273/273 [00:00<00:00, 466.99it/s]     
100%|██████████| 273/273 [00:00<00:00, 1654689.29it/s]
100%|██████████| 21/21 [00:00<00:00, 4083.47it/s]
100%|██████████| 21/21 [00:00<00:00, 503316.48it/s]
100%|██████████| 273/273 [00:00<00:00, 458.42it/s]                        
100%|██████████| 273/273 [00:00<00:00, 1902068.09it/s]
100%


--- Trial 17 finished ---
Value: -0.9809269288003684 and parameters: {'train_length_multiplier': 13, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 1, 'hidden_dim': 90, 'num_layers': 4, 'learning_rate': 6.469672473694674e-05}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 18 ---
Hyperparameters for this trial: {'train_length_multiplier': 15, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 66, 'num_layers': 2, 'learning_rate': 0.0026028784566409905}


100%|██████████| 315/315 [00:00<00:00, 531.48it/s]
100%|██████████| 315/315 [00:00<00:00, 2297749.15it/s]
100%|██████████| 21/21 [00:00<00:00, 4354.81it/s]
100%|██████████| 21/21 [00:00<00:00, 682793.67it/s]
100%|██████████| 315/315 [00:00<00:00, 465.85it/s]     
100%|██████████| 315/315 [00:00<00:00, 1623102.90it/s]
100%|██████████| 21/21 [00:00<00:00, 3262.96it/s]
100%|██████████| 21/21 [00:00<00:00, 591143.52it/s]
100%|██████████| 315/315 [00:00<00:00, 449.99it/s]                         
100%|██████████| 315/315 [00:00<00:00, 1738428.63it/s]
100%|██████████| 21/21 [00:00<00:00, 4185.73it/s]
100%|██████████| 21/21 [00:00<00:00, 587202.56it/s]
100%|██████████| 315/315 [00:00<00:00, 364.97it/s]     
100%|██████████| 315/315 [00:00<00:00, 1914790.96it/s]
100%|██████████| 21/21 [00:00<00:00, 4176.21it/s]
100%|██████████| 21/21 [00:00<00:00, 503316.48it/s]
100%|██████████| 315/315 [00:00<00:00, 447.56it/s]     
100%|██████████| 315/315 [00:00<00:00, 1709192.45it/s]
100%|██████████| 21/21


--- Trial 18 finished ---
Value: -0.9851872703517757 and parameters: {'train_length_multiplier': 15, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 66, 'num_layers': 2, 'learning_rate': 0.0026028784566409905}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 19 ---
Hyperparameters for this trial: {'train_length_multiplier': 15, 'val_period_length': 63, 'lookahead': 5, 'num_heads': 3, 'hidden_dim': 64, 'num_layers': 2, 'learning_rate': 0.00459981089755304}


100%|██████████| 315/315 [00:00<00:00, 429.05it/s]
100%|██████████| 315/315 [00:00<00:00, 1548893.04it/s]
100%|██████████| 63/63 [00:00<00:00, 196.33it/s]
100%|██████████| 63/63 [00:00<00:00, 1069802.23it/s]
100%|██████████| 315/315 [00:00<00:00, 426.16it/s]                        
100%|██████████| 315/315 [00:00<00:00, 1809870.90it/s]
100%|██████████| 63/63 [00:00<00:00, 276.73it/s]
100%|██████████| 63/63 [00:00<00:00, 1078535.31it/s]
100%|██████████| 315/315 [00:00<00:00, 430.45it/s]                       
100%|██████████| 315/315 [00:00<00:00, 1766317.86it/s]
100%|██████████| 63/63 [00:00<00:00, 357.25it/s]
100%|██████████| 63/63 [00:00<00:00, 1398101.33it/s]
100%|██████████| 315/315 [00:00<00:00, 402.13it/s]                       
100%|██████████| 315/315 [00:00<00:00, 2074106.37it/s]
100%|██████████| 63/63 [00:00<00:00, 217.76it/s]
100%|██████████| 63/63 [00:00<00:00, 1119665.90it/s]
[I 2023-10-11 01:56:06,434] Trial 18 finished with value: -0.5142185522137714 and parameters: {'tr


--- Trial 19 finished ---
Value: -0.5142185522137714 and parameters: {'train_length_multiplier': 15, 'val_period_length': 63, 'lookahead': 5, 'num_heads': 3, 'hidden_dim': 64, 'num_layers': 2, 'learning_rate': 0.00459981089755304}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 20 ---
Hyperparameters for this trial: {'train_length_multiplier': 15, 'val_period_length': 42, 'lookahead': 1, 'num_heads': 2, 'hidden_dim': 96, 'num_layers': 2, 'learning_rate': 0.012199953136611777}


100%|██████████| 315/315 [00:00<00:00, 435.55it/s]
100%|██████████| 315/315 [00:00<00:00, 1807395.02it/s]
100%|██████████| 42/42 [00:00<00:00, 240.61it/s]
100%|██████████| 42/42 [00:00<00:00, 1061209.45it/s]
100%|██████████| 315/315 [00:00<00:00, 404.80it/s]                         
100%|██████████| 315/315 [00:00<00:00, 2067614.65it/s]
100%|██████████| 42/42 [00:00<00:00, 231.77it/s]
100%|██████████| 42/42 [00:00<00:00, 851018.20it/s]
100%|██████████| 315/315 [00:00<00:00, 354.44it/s]                         
100%|██████████| 315/315 [00:00<00:00, 1966080.00it/s]
100%|██████████| 42/42 [00:00<00:00, 228.00it/s]
100%|██████████| 42/42 [00:00<00:00, 1048576.00it/s]
100%|██████████| 315/315 [00:00<00:00, 437.90it/s]                         
100%|██████████| 315/315 [00:00<00:00, 1963158.63it/s]
100%|██████████| 42/42 [00:00<00:00, 222.07it/s]
100%|██████████| 42/42 [00:00<00:00, 772634.95it/s]
100%|██████████| 315/315 [00:00<00:00, 434.53it/s]                       
100%|██████████| 315/


--- Trial 20 finished ---
Value: -0.8550305872815118 and parameters: {'train_length_multiplier': 15, 'val_period_length': 42, 'lookahead': 1, 'num_heads': 2, 'hidden_dim': 96, 'num_layers': 2, 'learning_rate': 0.012199953136611777}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 21 ---
Hyperparameters for this trial: {'train_length_multiplier': 14, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 132, 'num_layers': 1, 'learning_rate': 0.0026170959187590587}


100%|██████████| 294/294 [00:00<00:00, 419.09it/s]
100%|██████████| 294/294 [00:00<00:00, 1957341.87it/s]
100%|██████████| 63/63 [00:00<00:00, 314.16it/s]
100%|██████████| 63/63 [00:00<00:00, 1169208.64it/s]
100%|██████████| 294/294 [00:00<00:00, 450.55it/s]     
100%|██████████| 294/294 [00:00<00:00, 1471510.00it/s]
100%|██████████| 63/63 [00:00<00:00, 207.00it/s]
100%|██████████| 63/63 [00:00<00:00, 1229028.61it/s]
100%|██████████| 294/294 [00:00<00:00, 347.67it/s]                       
100%|██████████| 294/294 [00:00<00:00, 1680007.32it/s]
100%|██████████| 63/63 [00:00<00:00, 320.17it/s]
100%|██████████| 63/63 [00:00<00:00, 898779.43it/s]
100%|██████████| 294/294 [00:00<00:00, 359.65it/s]                       
100%|██████████| 294/294 [00:00<00:00, 1854323.87it/s]
100%|██████████| 63/63 [00:00<00:00, 127.86it/s]
100%|██████████| 63/63 [00:00<00:00, 1282724.04it/s]
[I 2023-10-11 02:02:35,671] Trial 20 finished with value: -0.9652622636442683 and parameters: {'train_length_multiplie


--- Trial 21 finished ---
Value: -0.9652622636442683 and parameters: {'train_length_multiplier': 14, 'val_period_length': 63, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 132, 'num_layers': 1, 'learning_rate': 0.0026170959187590587}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 22 ---
Hyperparameters for this trial: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 82, 'num_layers': 4, 'learning_rate': 0.0007368173612612001}


100%|██████████| 210/210 [00:00<00:00, 371.78it/s]
100%|██████████| 210/210 [00:00<00:00, 1827393.86it/s]
100%|██████████| 21/21 [00:00<00:00, 4029.29it/s]
100%|██████████| 21/21 [00:00<00:00, 463580.97it/s]
100%|██████████| 210/210 [00:00<00:00, 336.87it/s]                       
100%|██████████| 210/210 [00:00<00:00, 1783003.72it/s]
100%|██████████| 21/21 [00:00<00:00, 4123.42it/s]
100%|██████████| 21/21 [00:00<00:00, 527427.45it/s]
100%|██████████| 210/210 [00:00<00:00, 397.57it/s]     
100%|██████████| 210/210 [00:00<00:00, 1862164.57it/s]
100%|██████████| 21/21 [00:00<00:00, 4119.18it/s]
100%|██████████| 21/21 [00:00<00:00, 497629.29it/s]
100%|██████████| 210/210 [00:00<00:00, 308.17it/s]                       
100%|██████████| 210/210 [00:00<00:00, 1779401.70it/s]
100%|██████████| 21/21 [00:00<00:00, 4066.12it/s]
100%|██████████| 21/21 [00:00<00:00, 595137.73it/s]
100%|██████████| 210/210 [00:00<00:00, 469.82it/s]                       
100%|██████████| 210/210 [00:00<00:00, 1625


--- Trial 22 finished ---
Value: -0.9768638037175729 and parameters: {'train_length_multiplier': 10, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 82, 'num_layers': 4, 'learning_rate': 0.0007368173612612001}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 23 ---
Hyperparameters for this trial: {'train_length_multiplier': 13, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 66, 'num_layers': 2, 'learning_rate': 0.0010230449330602869}


100%|██████████| 273/273 [00:00<00:00, 372.92it/s]
100%|██████████| 273/273 [00:00<00:00, 1820421.29it/s]
100%|██████████| 21/21 [00:00<00:00, 4048.00it/s]
100%|██████████| 21/21 [00:00<00:00, 647649.88it/s]
100%|██████████| 273/273 [00:00<00:00, 378.23it/s]     
100%|██████████| 273/273 [00:00<00:00, 1737549.31it/s]
100%|██████████| 21/21 [00:00<00:00, 3667.72it/s]
100%|██████████| 21/21 [00:00<00:00, 530604.72it/s]
100%|██████████| 273/273 [00:00<00:00, 395.15it/s]                         
100%|██████████| 273/273 [00:00<00:00, 1797558.86it/s]
100%|██████████| 21/21 [00:00<00:00, 3956.18it/s]
100%|██████████| 21/21 [00:00<00:00, 688128.00it/s]
100%|██████████| 273/273 [00:00<00:00, 401.47it/s]                         
100%|██████████| 273/273 [00:00<00:00, 1927685.17it/s]
100%|██████████| 21/21 [00:00<00:00, 3849.33it/s]
100%|██████████| 21/21 [00:00<00:00, 436041.50it/s]
100%|██████████| 273/273 [00:00<00:00, 358.81it/s]     
100%|██████████| 273/273 [00:00<00:00, 1745495.41it/s]
10


--- Trial 23 finished ---
Value: -0.9852045439296993 and parameters: {'train_length_multiplier': 13, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 4, 'hidden_dim': 66, 'num_layers': 2, 'learning_rate': 0.0010230449330602869}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 24 ---
Hyperparameters for this trial: {'train_length_multiplier': 13, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 5, 'hidden_dim': 66, 'num_layers': 2, 'learning_rate': 0.0014531514529634428}


100%|██████████| 273/273 [00:00<00:00, 424.91it/s]
100%|██████████| 273/273 [00:00<00:00, 1944049.22it/s]
100%|██████████| 21/21 [00:00<00:00, 3140.01it/s]
100%|██████████| 21/21 [00:00<00:00, 662258.53it/s]
100%|██████████| 273/273 [00:00<00:00, 420.58it/s]     
100%|██████████| 273/273 [00:00<00:00, 2085692.15it/s]
100%|██████████| 21/21 [00:00<00:00, 4276.99it/s]
100%|██████████| 21/21 [00:00<00:00, 642922.51it/s]
100%|██████████| 273/273 [00:00<00:00, 368.39it/s]                       
100%|██████████| 273/273 [00:00<00:00, 1575027.50it/s]
100%|██████████| 21/21 [00:00<00:00, 4040.20it/s]
100%|██████████| 21/21 [00:00<00:00, 489335.47it/s]
100%|██████████| 273/273 [00:00<00:00, 375.29it/s]                       
100%|██████████| 273/273 [00:00<00:00, 1753514.54it/s]
100%|██████████| 21/21 [00:00<00:00, 3064.73it/s]
100%|██████████| 21/21 [00:00<00:00, 521185.70it/s]
100%|██████████| 273/273 [00:00<00:00, 424.77it/s]                       
100%|██████████| 273/273 [00:00<00:00, 1987


--- Trial 24 finished ---
Value: -0.9808941600899868 and parameters: {'train_length_multiplier': 13, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 5, 'hidden_dim': 66, 'num_layers': 2, 'learning_rate': 0.0014531514529634428}
Best is trial 7 with value: -0.9871639090080142


--- Starting Trial: 25 ---
Hyperparameters for this trial: {'train_length_multiplier': 14, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 3, 'hidden_dim': 100, 'num_layers': 2, 'learning_rate': 0.0024457388375685573}


100%|██████████| 294/294 [00:00<00:00, 491.61it/s]
100%|██████████| 294/294 [00:00<00:00, 2011623.78it/s]
100%|██████████| 21/21 [00:00<00:00, 4053.77it/s]
100%|██████████| 21/21 [00:00<00:00, 587202.56it/s]
100%|██████████| 294/294 [00:00<00:00, 405.66it/s]     
100%|██████████| 294/294 [00:00<00:00, 2115137.87it/s]
100%|██████████| 21/21 [00:00<00:00, 3969.02it/s]
100%|██████████| 21/21 [00:00<00:00, 638263.65it/s]
100%|██████████| 294/294 [00:00<00:00, 485.47it/s]                       
100%|██████████| 294/294 [00:00<00:00, 2100724.66it/s]
100%|██████████| 21/21 [00:00<00:00, 3555.64it/s]
100%|██████████| 21/21 [00:00<00:00, 699050.67it/s]
100%|██████████| 294/294 [00:00<00:00, 457.77it/s]                       
100%|██████████| 294/294 [00:00<00:00, 1957341.87it/s]
100%|██████████| 21/21 [00:00<00:00, 4086.12it/s]
100%|██████████| 21/21 [00:00<00:00, 530604.72it/s]
100%|██████████| 294/294 [00:00<00:00, 542.17it/s]                         
100%|██████████| 294/294 [00:00<00:00, 19


--- Trial 25 finished ---
Value: -0.9783406131475749 and parameters: {'train_length_multiplier': 14, 'val_period_length': 21, 'lookahead': 21, 'num_heads': 3, 'hidden_dim': 100, 'num_layers': 2, 'learning_rate': 0.0024457388375685573}
Best is trial 7 with value: -0.9871639090080142



In [22]:
STOP

NameError: name 'STOP' is not defined

In [None]:
# After all trials have finished, retrieve the best trial's parameters
best_params = study.best_trial.params

# # Create the best model using the Transformer
# best_model = Transformer(
#     input_dim=FEATURE_DIM,
#     d_model=best_params["hidden_dim"],
#     output_dim=OUTPUT_DIM,
#     num_heads=best_params["num_heads"],
#     num_layers=best_params["num_layers"]
# ).to(device)

# Below is the SimpleNN code, commented out:
best_model = SimpleNN(input_dim=FEATURE_DIM, output_dim=OUTPUT_DIM).to(device)

# Train the best model on the entire dataset
criterion = nn.MSELoss()
lr = best_params['learning_rate']
optimizer = optim.Adam(best_model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=100, gamma=0.1)

# Assuming get_era2data() can handle the entire dataset
all_batches = get_era2data(dataset)  

# You might need to adjust/train_model to handle no validation set or adjust accordingly.
_, _, _ = train_model(
    best_model, criterion, optimizer, scheduler, NUM_EPOCHS, PATIENCE, 
    all_batches, None, is_lr_scheduler=True  # Assuming train_model can handle None for validation_batches
)

# Saving the model
model_name = best_model.__class__.__name__
lookahead = best_params.get("lookahead", "NA")
filename = f"{top}_{model_name}_{target_string}_{lookahead:02d}d_rank_quantiled.pkl"
file_path = os.path.join(model_dir, filename)

save_data = {
    'model_type': 'Transformer',
    'model_state_dict': best_model.state_dict(),
    'trial_params': best_params
}
torch.save(save_data, file_path)

In [None]:
# Loading the saved data
loaded_data = torch.load(file_path)

# Create the correct model based on the saved type
if loaded_data['model_type'] == 'Transformer':
    model = Transformer(
        input_dim=FEATURE_DIM,
        d_model=loaded_data['trial_params']["hidden_dim"],
        output_dim=OUTPUT_DIM,
        num_heads=loaded_data['trial_params']["num_heads"],
        num_layers=loaded_data['trial_params']["num_layers"]
    ).to(device)
else:
    model = SimpleNN(input_dim=FEATURE_DIM, output_dim=OUTPUT_DIM).to(device)

# Load the saved parameters into the model
model.load_state_dict(loaded_data['model_state_dict'])

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.optim.lr_scheduler import StepLR
# from model import RankPredictorNN  # Assuming this is where your SimpleNN class is defined
# import os
# from model import Transformer
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# # Constants and hyperparameters
# NUM_EPOCHS = 150
# PATIENCE = 5
# FEATURE_DIM = len(features)  # Assuming 'features' is defined elsewhere in your code
# OUTPUT_DIM = 1
# NUM_TRAIL = 25
# device = "cuda" if torch.cuda.is_available() else "cpu"

# # Best parameters from Optuna study
# best_params = {
#     'train_length_multiplier': 14,
#     'val_period_length': 63,
#     'lookahead': 5,
#     'num_heads': 5,  # Updated to match with the Transformer definition below
#     'hidden_dim': 208,
#     'num_layers': 10,
#     'learning_rate': 0.002
# }

# # Choose model
# best_model = Transformer(
#     input_dim=FEATURE_DIM,
#     d_model=best_params['hidden_dim'],
#     output_dim=OUTPUT_DIM,
#     num_heads=best_params['num_heads'],
#     num_layers=best_params['num_layers'],
# ).to(device)


# # # Initialize the best model using SimpleNN
# # best_model = RankPredictorNN(input_dim=FEATURE_DIM, \
# #     output_dim=OUTPUT_DIM).to(device)

# # Initialize loss function, optimizer, and learning rate scheduler
# criterion = nn.MSELoss()
# optimizer = optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])
# scheduler = StepLR(optimizer, step_size=100, gamma=0.1)


# # label = f'TARGET_ret_fwd_{params["lookahead"]:02d}d_rank_quantiled'
# # Assuming get_era2data() can handle the entire dataset and returns a DataLoader
# all_batches = get_era2data(dataset)  # Replace this with your actual data loading function

# # Training Loop
# # Training Loop
# for epoch in range(NUM_EPOCHS):
#     best_model.train()
    
#     total_loss = 0.0
#     total_mse = 0.0
#     total_corr = 0.0
    
#     # Define the specific label using lookahead
#     for timestamp, (inputs, labels, masks_inputs, target_names) in all_batches.items():
        
#         # Move tensors to the desired device
#         inputs = inputs.to(device)
#         labels = labels.to(device)
#         masks_inputs = masks_inputs.to(device)

#         # Get index for specific label dynamically
#         specific_label_name = f'TARGET_ret_fwd_{best_params["lookahead"]:02d}d_rank_quantiled'
#         specific_label_index = target_names.index(specific_label_name)

#         # Use that index to fetch the specific column
#         labels = labels[:, :, specific_label_index].unsqueeze(2)
#         # print(f"Target Labels Range from training loop: {labels.min().item()}, {labels.max().item()}")

#         # Zero the parameter gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = best_model(inputs / 4, masks_inputs)
#         # print(outputs)
        
#         # Asserting that shapes of labels and outputs match
#         assert labels.shape == outputs.shape, \
#             f"Shape mismatch: labels {labels.shape}, outputs {outputs.shape}"

#         # Compute loss using the custom loss function
#         loss, mse, corr = calculate_loss(outputs, criterion, \
#                     labels, masks_inputs)

#         # print(f"Current batch loss: {loss.item()}, Current batch MSE: {mse.item()}, Current batch Correlation: {corr.item()}")
#         total_loss += loss.item()
#         total_mse += mse.item()
#         total_corr += corr.item()

#         # print(f"Accumulated Total Loss after this batch: {total_loss}, Accumulated Total MSE after this batch: {total_mse}, Accumulated Total Correlation after this batch: {total_corr}")

            
#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()
#         # break
        
#     # Step the learning rate scheduler
#     scheduler.step()

#     # # At the end of the training loop:
#     # print(f"Total loss: {total_loss}, Total MSE: {total_mse}, Total Correlation: {total_corr}, Number of batches: {len(all_batches)}")

#     avg_loss = total_loss / len(all_batches)
#     avg_mse = total_mse / len(all_batches)
#     avg_corr = total_corr / len(all_batches)
        
#     print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {avg_loss:.4f}, MSE: {avg_mse:.4f}, Correlation: {avg_corr:.4f}")

#     # break


# print("Training complete.")