##### Import

In [6]:
import warnings
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from tqdm import tqdm
import shap
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.pipeline import Pipeline
import os
import gc
import sys

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/torch.
warnings.filterwarnings('ignore')

# Raise the pandas display limits so wide/long frames are not truncated
pd.options.display.max_columns = 10000
pd.options.display.max_rows = 10000

# Use seaborn's white-grid theme for plots
sns.set_style('whitegrid')

# Make the parent directory importable: its path goes in at position 1 of sys.path
sys.path[1:1] = [os.path.join(sys.path[0], '..')]

# Shorthand for MultiIndex slicing
idx = pd.IndexSlice

In [7]:
# from pathlib import Path
# import pandas as pd
# from utils import rank_stocks_and_quantile
# # UNSEEN_KEY = '/data/YEAR_20220803_20230803'
# top = 250  # parameters -> papermill
# DATA_STORE = Path(f'data/{top}_dataset.h5')
# with pd.HDFStore(DATA_STORE) as store:
#     # unseen = store[UNSEEN_KEY]
#     print(store.keys())

In [8]:
import gc
import pandas as pd
from utils import rank_stocks_and_quantile

top = 250  # parameters -> papermill

DATA_STORE = Path(f'data/{top}_dataset.h5')
dataset_keys = [
    '/data/YEAR_20200930_20220802',
    # '/data/YEAR_20181024_20200929',
    # '/data/YEAR_20161116_20181023',
    # '/data/YEAR_20141210_20161115'
]
target_string = 'TARGET_ret_fwd'  # no longer a parameter

# Read every requested key, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all previously loaded rows on each pass.
# The store is opened read-only so a wrong path raises instead of silently
# creating an empty HDF5 file (the default mode is append).
with pd.HDFStore(DATA_STORE, mode='r') as store:
    frames = [store[key] for key in dataset_keys]

# An empty key list still yields an empty frame, as before.
dataset = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()
del frames
gc.collect()  # Release the per-key frames before the ranking step

# Rank stocks and quantile
dataset = rank_stocks_and_quantile(dataset, target_substring=target_string)

# Drop the timezone from the first index level. The index is reassigned
# because the `inplace` argument of MultiIndex.set_levels no longer exists in
# pandas 2.x.
dataset.index = dataset.index.set_levels(
    dataset.index.levels[0].tz_localize(None), level=0
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 117250 entries, (Timestamp('2020-09-30 00:00:00'), 'AA') to (Timestamp('2022-08-02 00:00:00'), 'ZTS')
Columns: 622 entries, FEATURE_open to TARGET_ret_fwd_252d_rank_quantiled
dtypes: float32(360), float64(43), int32(198), int64(12), int8(9)
memory usage: 300.3+ MB


In [9]:
import pandas as pd

# Earliest and latest timestamps found in level 0 of the MultiIndex
start_date, end_date = (
    dataset.index.get_level_values(0).min(),
    dataset.index.get_level_values(0).max(),
)

# Business-day calendar spanning that range
business_dates = pd.bdate_range(start=start_date, end=end_date)

# How many business days the range covers
num_business_days = len(business_dates)

print(f"Number of business days between {start_date} and {end_date}: {num_business_days}")

Number of business days between 2020-09-30 00:00:00 and 2022-08-02 00:00:00: 480


In [None]:
# Get unique dates and sort them
unique_dates = dataset.index.get_level_values('date').unique().sort_values()

# Number of most recent dates held out for backtesting
test_period_length = 21

# Adjust for the look-ahead gap: this many dates are dropped between the
# end of the training range and the start of the test range
look_ahead = 1

# Fail early with a clear message instead of silently producing an empty
# training set when the dataset covers too few dates.
if len(unique_dates) <= test_period_length + look_ahead:
    raise ValueError(
        f"Need more than {test_period_length + look_ahead} unique dates to split, "
        f"got {len(unique_dates)}"
    )

# Split dates for training and testing with a gap
train_dates = unique_dates[:-(test_period_length + look_ahead)]
test_dates = unique_dates[-test_period_length:]

# Split the dataset
train_data = dataset.loc[train_dates]  # train + val
test_data = dataset.loc[test_dates]  # backtesting

In [None]:
from utils import CustomBackwardMultipleTimeSeriesCV


# Cross-validation splitter used by the Optuna objective on `train_data`.
# NOTE(review): test_period_length=21 and lookahead=1 match the values used
# for the hold-out split; the meaning of each argument is defined in utils.
cv = CustomBackwardMultipleTimeSeriesCV(
    train_period_length=142,
    test_period_length=21,
    lookahead=1,
    date_idx='date',
)

In [None]:
import optuna
import os
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import json
from models import SimpleNN
from utils import get_era2data, train_model, metrics_on_batch

# Train on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model dimensions passed to SimpleNN (input width and output width)
FEATURE_DIM = 225
OUTPUT_DIM = 1
# NOTE(review): MAX_LEN is not referenced in the cells shown here — confirm it is still needed
MAX_LEN = 500

# Define paths for saving models and hyperparameters
model_dir = "./saved_models"
best_hyperparams_dir = "./best_hyperparams"
study_dir = "./optuna_studies"

# Ensure directories exist. `exist_ok=True` replaces the check-then-create
# pair, and the loop variable no longer shadows the builtin `dir` for the rest
# of the kernel session.
for directory in (model_dir, best_hyperparams_dir, study_dir):
    os.makedirs(directory, exist_ok=True)

def save_best_model(study, trial):
    """Optuna callback that keeps only the best trial's weights in ``model_dir``.

    When the trial that just finished is the study's current best, its weights
    are loaded from the path stored in ``trial.user_attrs["model_path"]`` and
    saved again as ``best_model_trial_<number>.pt`` in ``model_dir``. Every
    other ``best_model_trial_*.pt`` file in that directory is then deleted.
    In all cases the per-trial file ``model_<number>.pt`` in ``study_dir`` is
    removed afterwards.

    Args:
        study: The running Optuna study; ``study.best_trial`` is read.
        trial: The trial that just finished; ``number``, ``params`` and
            ``user_attrs`` are read.
    """
    if study.best_trial.number == trial.number:
        # Rebuild the architecture this trial used so its weights can be loaded
        model = SimpleNN(FEATURE_DIM, trial.params["HIDDEN_DIM"], OUTPUT_DIM)

        # The model above lives on the CPU, so load the weights there as well;
        # this also works when they were saved from a GPU run.
        saved_model_path = trial.user_attrs["model_path"]
        model.load_state_dict(torch.load(saved_model_path, map_location="cpu"))

        # Save the model's state_dict as the best model
        best_model_name = f"best_model_trial_{trial.number}.pt"
        torch.save(model.state_dict(), os.path.join(model_dir, best_model_name))

        # Drop every older best-model file. The previous best is not
        # necessarily trial `number - 1`, so match on the file-name pattern
        # instead of guessing the trial number.
        for filename in os.listdir(model_dir):
            is_stale_best = (
                filename.startswith("best_model_trial_")
                and filename.endswith(".pt")
                and filename != best_model_name
            )
            if is_stale_best:
                os.remove(os.path.join(model_dir, filename))

    # The per-trial file is no longer needed either way: a best trial has
    # already been copied into model_dir above.
    current_trial_model_path = os.path.join(study_dir, f"model_{trial.number}.pt")
    if os.path.exists(current_trial_model_path):
        os.remove(current_trial_model_path)

# Optuna objective for the SimpleNN hyperparameter search
def objective(trial):
    """Train a SimpleNN over the CV folds and return the value Optuna minimizes.

    Samples the hyperparameters from ``trial``, trains one model across every
    split produced by ``cv.split(train_data)``, saves the resulting weights to
    ``study_dir`` and records that path on the trial as ``"model_path"``.

    NOTE(review): the same model, optimizer and scheduler are reused for every
    fold (they are created once, before the loop) — confirm that carrying the
    weights from one fold into the next is intended.
    NOTE(review): ``features`` and ``target`` are read from the notebook
    namespace and are not assigned in the cells shown here.

    Args:
        trial: The Optuna trial to sample hyperparameters from.

    Returns:
        The negated mean, over folds, of element 2 of ``metrics_on_batch``'s
        result.
    """
    # Search space; the order of the suggest_* calls is kept as-is
    hidden_dim = trial.suggest_int("HIDDEN_DIM", 32, 512, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    lr_step_size = trial.suggest_int("step_size", 50, 200)
    lr_gamma = trial.suggest_float("gamma", 0.01, 0.5, log=True)
    early_stop_patience = trial.suggest_int("patience", 3, 10)
    epochs = trial.suggest_int("num_epochs", 1, 10)

    # Model, loss, optimizer and learning-rate schedule for this trial
    model = SimpleNN(FEATURE_DIM, hidden_dim, OUTPUT_DIM)
    model.to(device=device)
    criterion = torch.nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=lr_step_size, gamma=lr_gamma)

    val_scores = []
    for train_idx, val_idx in cv.split(train_data):
        fold_train = train_data.loc[train_idx, :]
        fold_val = train_data.loc[val_idx, :]

        # Wrap each fold's frames in the loaders train_model consumes
        train_loader = get_era2data(fold_train, features, target)
        val_loader = get_era2data(fold_val, features, target)

        _, batch_metrics, _ = train_model(
            model, criterion, optimizer, scheduler,
            epochs, early_stop_patience, train_loader, val_loader,
        )

        fold_metrics = metrics_on_batch(batch_metrics)
        val_scores.append(fold_metrics[2])

    # Persist the weights so the save_best_model callback can pick them up
    model_path = os.path.join(study_dir, f"model_{trial.number}.pt")
    torch.save(model.state_dict(), model_path)
    trial.set_user_attr("model_path", model_path)

    return -np.mean(val_scores)

# Progress reporter invoked by Optuna after every trial
def callback(study, trial):
    """Print the finished trial's value and parameters, then the current best trial.

    Args:
        study: Object exposing ``best_trial`` with ``number`` and ``value``.
        trial: Object exposing ``number``, ``value`` and ``params``.
    """
    banner = f"\n--- Trial {trial.number} finished ---"
    print(banner)

    outcome = f"Value: {trial.value} and parameters: {trial.params}"
    print(outcome)

    best_so_far = f"Best is trial {study.best_trial.number} with value: {study.best_trial.value}\n"
    print(best_so_far)

# Create the study, or resume it if one with this name already exists in the
# SQLite file under study_dir. The objective returns a negated score, so the
# direction is 'minimize'.
study = optuna.create_study(
    study_name='Maximizing the Sharpe',
    storage=f'sqlite:///{study_dir}/study.db',
    direction='minimize',
    load_if_exists=True,
)

# Run one trial; after it finishes, save_best_model runs first, then callback
study.optimize(objective, n_trials=1, callbacks=[save_best_model, callback])

# # Print best trial's parameters and value
# print(f"Best trial: {study.best_trial.params}")
# print(f"Best value: {study.best_trial.value}")

In [None]:
def train_best_model(train_data, features, target):
    """Retrain a SimpleNN on ``train_data`` with the study's best hyperparameters.

    Reads ``study.best_params`` from the notebook namespace, trains a fresh
    model through ``train_model`` and saves its weights to
    ``final_best_model.pt`` inside ``model_dir``.

    NOTE(review): ``train_model`` is called with ``patience=None`` and an
    empty list as the validation loader — confirm it supports both.

    Args:
        train_data: Frame handed to ``get_era2data`` to build the training loader.
        features: Passed through to ``get_era2data``.
        target: Passed through to ``get_era2data``.

    Returns:
        The trained model.
    """
    # Pull the tuned values out of the study in a fixed order
    tuned = study.best_params
    hidden_dim, learning_rate, lr_step_size, lr_gamma, epochs = (
        tuned[name] for name in ("HIDDEN_DIM", "lr", "step_size", "gamma", "num_epochs")
    )

    # Fresh model, loss, optimizer and schedule built from those values
    model = SimpleNN(FEATURE_DIM, hidden_dim, OUTPUT_DIM)
    model.to(device=device)
    criterion = torch.nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=lr_step_size, gamma=lr_gamma)

    # All of train_data is used for fitting, so there is no validation split;
    # an empty list stands in for the validation loader.
    train_loader = get_era2data(train_data, features, target)
    no_validation = []

    train_model(
        model, criterion, optimizer, scheduler, epochs,
        patience=None,
        train_loader=train_loader,
        val_loader=no_validation,
    )

    # Persist the final weights next to the best-trial checkpoints
    torch.save(model.state_dict(), os.path.join(model_dir, "final_best_model.pt"))

    return model

# Retrain on all of train_data with the study's best hyperparameters.
# NOTE(review): `features` and `target` are not assigned in any cell shown
# here — confirm they are defined before this cell runs on a fresh kernel.
final_model = train_best_model(train_data, features, target)
