##### Import

In [None]:
import warnings
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from tqdm import tqdm
import shap
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.pipeline import Pipeline
import os
import gc
import sys

# Silence every warning category for the rest of the session
# (this also hides deprecation and numerical warnings).
warnings.filterwarnings('ignore')

# Let pandas print up to 10000 columns/rows before truncating its output
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

# Use the seaborn 'whitegrid' plot style
sns.set_style('whitegrid')

# Make modules that live one directory above sys.path[0] importable
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# Shorthand for pandas MultiIndex slicing, e.g. df.loc[idx[:, 'AAPL'], :]
idx = pd.IndexSlice



from pathlib import Path

# Working directories for datasets, trained models, tuned hyperparameters
# and Optuna studies (all relative to the current working directory).
data_dir = Path('data/')
model_dir = Path('model/')
best_hyperparams_dir = Path('best_hyperparams/')
study_dir = Path('study/')

# Make sure each directory exists; existing directories are left untouched.
for _directory in (data_dir, model_dir, best_hyperparams_dir, study_dir):
    _directory.mkdir(parents=True, exist_ok=True)

In [None]:
# from pathlib import Path
# import pandas as pd
# from utils import rank_stocks_and_quantile
# # UNSEEN_KEY = '/data/YEAR_20220803_20230803'
# top = 250  # parameters -> papermill
# DATA_STORE = Path(f'data/{top}_dataset.h5')
# with pd.HDFStore(DATA_STORE) as store:
#     # unseen = store[UNSEEN_KEY]
#     print(store.keys())

In [None]:
"""
Process Large Financial Datasets from HDF5 Format.

This script reads, processes, and normalizes financial datasets stored in an HDF5 format.
The primary processing steps involve converting data types, handling infinite values, and
scaling the dataset. The MinMaxScaler, computed from the entire dataset, is employed for normalization.
Once data processing is complete, stocks are ranked, and quantiles are determined in post-processing.

Attributes:
    - top (int): Number of top stocks to consider.
    - DATA_STORE (Path): Path to the HDF5 file containing the datasets.
    - dataset_keys (list of str): Keys identifying which datasets to process in the HDF5 store.
    - target_string (str): Target column identifier for post-processing.
    - CHUNK_SIZE (int): Size of chunks in which data is read and processed.

Functions:
    - convert_dtype(chunk, feature_columns, dtype='float32'): Converts dtype of specified columns in a chunk.
    - handle_infinite_values(chunk, feature_columns): Handles infinite values in a chunk.
    - process_chunk(chunk, feature_columns, scaler=None): Process a single chunk with optional normalization.

Workflow:
    1. Set parameters and paths.
    2. Define utility functions.
    3. Identify features and target columns from the first chunk.
    4. Determine the MinMaxScaler using all chunks in the dataset.
    5. Process and concatenate chunks to form the dataset.
    6. Rank stocks and compute quantiles in post-processing.
"""

import gc
import numpy as np
import pandas as pd
from pathlib import Path
from utils import rank_stocks_and_quantile
from sklearn.preprocessing import MinMaxScaler

# Parameters and data paths
# `top` selects which HDF5 file is read (data/<top>_dataset.h5).
top = 250
DATA_STORE = Path(f'data/{top}_dataset.h5')
# HDF5 keys to load; only the first one is active, the others are disabled.
dataset_keys = [
    '/data/YEAR_20200930_20220802',
    # '/data/YEAR_20181024_20200929',
    # '/data/YEAR_20161116_20181023',
    # '/data/YEAR_20141210_20161115'
]
# Passed to rank_stocks_and_quantile as `target_substring` during post-processing.
target_string = 'TARGET_ret_fwd'
# Number of rows read from the store per chunk.
CHUNK_SIZE = 50000

def convert_dtype(chunk, feature_columns, dtype='float32'):
    """Cast the given columns of ``chunk`` to ``dtype``.

    The cast columns are written back into ``chunk`` (the frame is modified
    in place) and the same frame is returned for convenience.

    Args:
        chunk: DataFrame holding the columns to cast.
        feature_columns: Column labels to cast.
        dtype: Target dtype, ``'float32'`` by default.

    Returns:
        The input ``chunk`` with the selected columns cast.
    """
    recast = chunk[feature_columns].astype(dtype)
    chunk[feature_columns] = recast
    return chunk

def handle_infinite_values(chunk, feature_columns):
    """Clamp infinite values in the given columns to the float32 range.

    ``+inf`` becomes the largest finite float32 value and ``-inf`` becomes the
    most negative finite float32 value, so the sign of the original extreme is
    preserved (previously both were mapped to the positive maximum).

    Args:
        chunk: DataFrame holding the columns to clean; modified in place.
        feature_columns: Column labels to clean.

    Returns:
        The input ``chunk`` with infinities replaced.
    """
    float32_max = np.finfo('float32').max
    chunk[feature_columns] = chunk[feature_columns].replace(
        [np.inf, -np.inf], [float32_max, -float32_max]
    )
    return chunk

def process_chunk(chunk, feature_columns, scaler=None):
    """Clean one chunk of data and optionally scale its feature columns.

    The feature columns are cast via ``convert_dtype`` and cleaned via
    ``handle_infinite_values``; when a scaler is supplied its ``transform``
    output replaces the feature columns.

    Args:
        chunk: DataFrame chunk to process; modified in place.
        feature_columns: Column labels of the features.
        scaler: Optional fitted scaler exposing ``transform``. ``None`` skips
            scaling.

    Returns:
        The processed chunk.
    """
    chunk = convert_dtype(chunk, feature_columns)
    chunk = handle_infinite_values(chunk, feature_columns)

    # Compare against None explicitly: truthiness of an arbitrary estimator
    # object is not a reliable "was a scaler passed" signal.
    if scaler is not None:
        chunk[feature_columns] = scaler.transform(chunk[feature_columns])

    return chunk

# Identify feature and target columns from the first chunk of the first key.
# The store is only read here, so open it read-only: a missing file then
# fails immediately instead of being created empty by the default append mode.
with pd.HDFStore(DATA_STORE, mode='r') as store:
    first_chunk = store.select(dataset_keys[0], stop=CHUNK_SIZE)
    features = [col for col in first_chunk.columns if col.startswith('FEATURE_')]
    target = [col for col in first_chunk.columns if col.startswith('TARGET_')]

# Pass 1: fit the scaler incrementally over every chunk of every key, after
# applying the same dtype conversion and infinity handling used in pass 2.
scaler = MinMaxScaler()
with pd.HDFStore(DATA_STORE, mode='r') as store:
    for key in dataset_keys:
        for chunk in store.select(key, chunksize=CHUNK_SIZE):
            chunk = convert_dtype(chunk, features)
            chunk = handle_infinite_values(chunk, features)
            scaler.partial_fit(chunk[features])

# Pass 2: transform every chunk and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration (quadratic in the number of chunks).
processed_chunks = []
with pd.HDFStore(DATA_STORE, mode='r') as store:
    for key in dataset_keys:
        for chunk in store.select(key, chunksize=CHUNK_SIZE):
            processed_chunks.append(process_chunk(chunk, features, scaler))

if processed_chunks:
    dataset = pd.concat(processed_chunks, ignore_index=False)
else:
    # Same result the original loop produced when no chunk was read.
    dataset = pd.DataFrame()
del processed_chunks
gc.collect()

# Post-processing steps
dataset = rank_stocks_and_quantile(dataset, target_substring=target_string)
# Drop the timezone from the first index level. MultiIndex.set_levels no
# longer accepts `inplace` (removed in pandas 2.0), so reassign the index.
dataset.index = dataset.index.set_levels(
    dataset.index.levels[0].tz_localize(None), level=0
)

In [None]:
# dataset = dataset.head(10**4)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from joblib import Parallel, delayed

# Fill value written into padded positions by convert_to_torch.
PADDING_VALUE = -1
# Target sequence length for padding; None makes pad_sequence use the longest
# sequence it is given. This global is reassigned in later cells and is read
# by convert_to_torch at call time.
MAX_LEN = None  # If you have a predefined value, set it here; otherwise, it gets calculated automatically.

def pad_sequence(inputs, padding_value=-1, max_len=None):
    """Pad 2-D tensors along their first dimension and build validity masks.

    Args:
        inputs: Sequence of tensors shaped ``(length_i, dim)``.
        padding_value: Value written into the padded rows.
        max_len: Target length. ``None`` uses the longest input length.

    Returns:
        A tuple ``(padded, masks)`` where ``padded`` has shape
        ``(len(inputs), max_len, dim)`` and ``masks`` has shape
        ``(len(inputs), max_len, 1)`` with 1.0 on real rows and 0.0 on padding.
        Masks are float tensors created on the same device as their input.

    Raises:
        RuntimeError: If an input is longer than ``max_len``.
    """
    if max_len is None:
        max_len = max(seq.shape[0] for seq in inputs)

    padded_inputs = []
    masks = []
    for seq in inputs:
        seq_len = seq.shape[0]
        pad_len = max_len - seq_len
        if pad_len < 0:
            # A negative pad would silently crop the data in F.pad and then
            # fail with an obscure "negative dimension" error; fail clearly.
            raise RuntimeError(
                f"sequence of length {seq_len} exceeds max_len={max_len}"
            )
        # (0, 0) leaves the feature dimension alone; (0, pad_len) appends rows.
        padded_inputs.append(F.pad(seq, (0, 0, 0, pad_len), value=padding_value))
        mask = torch.zeros((max_len, 1), dtype=torch.float, device=seq.device)
        mask[:seq_len] = 1.0
        masks.append(mask)
    return torch.stack(padded_inputs), torch.stack(masks)

def convert_to_torch(timestamp, data):
    """Turn one timestamp's rows into padded float32 tensors.

    Columns whose names start with ``FEATURE_`` become the inputs and columns
    starting with ``TARGET_`` become the labels. Both are padded with
    ``pad_sequence`` using the module-level ``PADDING_VALUE`` and ``MAX_LEN``
    as they are at call time.

    Args:
        timestamp: Key under which the tensors are returned.
        data: DataFrame with the rows belonging to ``timestamp``.

    Returns:
        ``{timestamp: (padded_inputs, padded_labels, masks_inputs)}``.
    """
    def _columns_to_tensor(prefix):
        # Select columns by name prefix and convert them to a float32 tensor.
        selected = [name for name in data.columns if name.startswith(prefix)]
        return torch.from_numpy(data[selected].values.astype(np.float32))

    feature_tensor = _columns_to_tensor('FEATURE_')
    label_tensor = _columns_to_tensor('TARGET_')

    padded_inputs, masks_inputs = pad_sequence(
        [feature_tensor], padding_value=PADDING_VALUE, max_len=MAX_LEN)
    # The label mask equals the input mask (same row count), so it is dropped.
    padded_labels, _ = pad_sequence(
        [label_tensor], padding_value=PADDING_VALUE, max_len=MAX_LEN)

    return {timestamp: (padded_inputs, padded_labels, masks_inputs)}

def get_era2data(df):
    """Build a ``{timestamp: tensors}`` mapping for every group of ``df``.

    ``df`` is grouped on index level 0 and each group is converted with
    ``convert_to_torch`` on a joblib thread pool; the per-group dictionaries
    are then merged into one.

    Args:
        df: DataFrame whose first index level identifies the era/timestamp.

    Returns:
        Dictionary mapping each level-0 value to the tuple produced by
        ``convert_to_torch``.
    """
    # One delayed conversion per level-0 group of the index
    jobs = (
        delayed(convert_to_torch)(timestamp, data)
        for timestamp, data in tqdm(df.groupby(level=0))
    )
    per_era = Parallel(n_jobs=-1, prefer="threads")(jobs)

    era2data = {}
    for single_era in tqdm(per_era):
        for key, tensors in single_era.items():
            era2data[key] = tensors
    return era2data

# Convert the whole `dataset` frame into per-timestamp tensors.
# NOTE(review): MAX_LEN is None in this cell, so each timestamp is padded only
# to its own length; the result does not appear to be used by later cells
# shown here — confirm before relying on it.
timestamp2data_dataset = get_era2data(dataset)

In [None]:
# import torch 

# PADDING_VALUE = -1
# MAX_LEN = 500
# FEATURE_DIM = len(features)
# HIDDEN_DIM = 128
# OUTPUT_DIM = len(target)
# NUM_HEADS = 2
# NUM_LAYERS = 2

# device = "cuda" if torch.cuda.is_available() else "cpu"


# from model import Transformer

# def test_model():

#     inputs = [
#         torch.randint(0, 4, (5, FEATURE_DIM)).float(),
#         torch.randint(0, 4, (3, FEATURE_DIM)).float(),
#     ]
#     labels = [
#         torch.randint(0, 2, (5, OUTPUT_DIM)).float(),
#         torch.randint(0, 2, (3, OUTPUT_DIM)).float(),
#     ]

#     padded_inputs, masks_inputs = pad_sequence(inputs, \
#                                                padding_value=0, max_len=MAX_LEN)
#     padded_labels, masks_labels = pad_sequence(labels, \
#                                                padding_value=0, max_len=MAX_LEN)

#     transformer = Transformer(
#         input_dim=FEATURE_DIM,
#         d_model=HIDDEN_DIM,
#         output_dim=OUTPUT_DIM,
#         num_heads=NUM_HEADS,
#         num_layers=NUM_LAYERS,
#         max_len=MAX_LEN,
#     )

#     with torch.no_grad():
#         outputs = transformer(padded_inputs, masks_inputs)

#     assert torch.isnan(outputs).sum() == 0
#     assert outputs.shape[:2] == padded_inputs.shape[:2]
#     assert outputs.shape[-1] == len(target)

#     print("Input Shape (padded):", padded_inputs.shape)
#     print("Output Shape (padded):", outputs.shape)

#     del transformer
#     del inputs, labels
#     del padded_inputs, masks_inputs, padded_labels, masks_labels
#     del outputs

#     gc.collect()

# test_model()

In [None]:
target[0]

In [None]:
import torch.nn as nn

import torch 

# Padding configuration read by convert_to_torch at call time.
PADDING_VALUE = -1
MAX_LEN = 500
# Number of FEATURE_ columns detected when the dataset was loaded.
FEATURE_DIM = len(features)
# NOTE(review): HIDDEN_DIM, NUM_HEADS and NUM_LAYERS are only referenced by the
# commented-out Transformer test; SimpleNN does not read them.
HIDDEN_DIM = 128
# The model predicts a single output column.
OUTPUT_DIM = 1 # len(target[0])
NUM_HEADS = 2
NUM_LAYERS = 2

class SimpleNN(nn.Module):
    """Three-layer fully connected network whose output is masked.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
        hidden_dim_1: Width of the first hidden layer (default 256).
        hidden_dim_2: Width of the second hidden layer (default 128).
    """

    def __init__(self, input_dim, output_dim, hidden_dim_1=256, hidden_dim_2=128):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim_1)     # First fully connected layer
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)  # Second fully connected layer
        self.fc3 = nn.Linear(hidden_dim_2, output_dim)    # Output layer

    def forward(self, x, mask):
        """Apply the network to ``x`` and multiply the result by ``mask``.

        Args:
            x: Input tensor whose last dimension is ``input_dim``.
            mask: Tensor broadcastable to the output; 0 marks padded positions.

        Returns:
            The network output with padded positions forced to zero.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x * mask  # This ensures that the outputs for padded positions are zero

def test_simple_model():
    """Smoke-test SimpleNN on two random sequences of different length.

    Two integer-valued float tensors with 5 and 3 rows are padded to a common
    length, pushed through a freshly constructed ``SimpleNN`` and the padded
    input and output shapes are printed.
    """
    sequence_lengths = (5, 3)
    samples = [
        torch.randint(0, 4, (length, FEATURE_DIM)).float()
        for length in sequence_lengths
    ]

    # Pad to a common length so both sequences form a single batch
    batch, batch_mask = pad_sequence(samples)

    net = SimpleNN(FEATURE_DIM, OUTPUT_DIM)
    predictions = net(batch, batch_mask)

    print("Input Shape:", batch.shape)
    print("Output Shape:", predictions.shape)

test_simple_model()

In [None]:
OUTPUT_DIM

In [None]:
# Differentiable Pearson correlation implemented with torch operations
def pearsonr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Both tensors are centred on their global mean; the result is the sum of
    the products of the centred values divided by the square root of the
    product of their sums of squares. There is no guard against a zero
    denominator, so a constant input yields a non-finite result.

    Args:
        x: Tensor of observations.
        y: Tensor of observations, broadcast-compatible with ``x``.

    Returns:
        Scalar tensor holding the correlation coefficient.
    """
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    covariance = (x_centered * y_centered).sum()
    scale = torch.sqrt((x_centered ** 2).sum() * (y_centered ** 2).sum())
    return covariance / scale

In [None]:
def calculate_loss(outputs, criterion, padded_labels, masks_inputs, \
                padded_inputs=None, target_weight_softmax=None):
    """Combine masked MSE terms with a correlation term into one loss.

    The loss is ``0.1 * MSE(all targets) + MSE(primary target) - corr`` where
    the primary target is column 0 of the last dimension and ``corr`` is the
    Pearson correlation on the unpadded positions of the first batch element.

    Assumes ``outputs``/``padded_labels`` are ``(batch, seq, n_targets)`` and
    ``masks_inputs`` is ``(batch, seq, 1)``, which is what ``pad_sequence`` and
    ``SimpleNN`` produce in this file.

    Args:
        outputs: Model predictions.
        criterion: Loss callable taking ``(prediction, target)``.
        padded_labels: Padded ground-truth targets.
        masks_inputs: 1.0 for real positions, 0.0 for padding.
        padded_inputs: Unused; kept for interface compatibility.
        target_weight_softmax: Optional per-target weights applied to both
            predictions and labels in the all-targets MSE term.

    Returns:
        Tuple ``(loss, mse, corr)`` of scalar tensors.
    """
    masked_outputs = outputs * masks_inputs
    masked_labels = padded_labels * masks_inputs
    if target_weight_softmax is not None:
        masked_outputs = masked_outputs * target_weight_softmax
        masked_labels = masked_labels * target_weight_softmax

    # MSE on all targets, down-weighted
    _mse = criterion(masked_outputs, masked_labels) * 0.1

    # Extra MSE on the primary target. Slice the LAST dimension: the previous
    # `[:, 0]` picked sequence position 0 instead of target column 0. `:1`
    # keeps the trailing dimension so the mask broadcasts element-wise.
    _mse = _mse + criterion(
        outputs[..., :1] * masks_inputs,
        padded_labels[..., :1] * masks_inputs,
    )

    # Correlation with only the primary target, on unpadded positions of the
    # first batch element.
    valid_positions = masks_inputs.view(-1).nonzero()
    corr = pearsonr(
        outputs[0][:, 0][valid_positions].view(-1, 1),
        padded_labels[0][:, 0][valid_positions].view(-1, 1),
    )

    loss = _mse - corr #+ some_complex_constraints
    return loss, _mse, corr

# Training loop
def train_on_batch(model, criterion, optimizer, batch):
    """Run one optimisation step on a single batch.

    Args:
        model: Network called as ``model(inputs, mask)``.
        criterion: Loss callable forwarded to ``calculate_loss``.
        optimizer: Optimizer updating ``model``'s parameters.
        batch: Indexable ``(padded_inputs, padded_labels, masks_inputs)``.

    Returns:
        Tuple ``(loss, mse, corr)`` as Python floats.
    """
    # The evaluation helpers switch the model to eval mode; switch back so
    # layers such as dropout or batch norm behave correctly while training.
    model.train()

    padded_inputs = batch[0].to(device=device)
    padded_labels = batch[1].to(device=device)
    masks_inputs = batch[2].to(device=device)

    optimizer.zero_grad()

    # NOTE(review): inputs are divided by 4.0 before the forward pass; the
    # origin of this constant is not visible here — confirm it still applies.
    outputs = model(padded_inputs / 4.0, masks_inputs)

    target_weight_softmax = None
    #random_weights = torch.rand(padded_labels.shape[-1], device=device)
    #target_weight_softmax = F.softmax(random_weights)

    loss, _mse, _corr = calculate_loss(outputs, criterion, padded_labels, masks_inputs, \
                                       target_weight_softmax=target_weight_softmax)
    loss.backward()
    optimizer.step()
    return loss.item(), _mse.item(), _corr.item()


def evaluate_on_batch(transformer, criterion, batch):
    """Evaluate the model on one batch without tracking gradients.

    Args:
        transformer: Model called as ``transformer(inputs, mask)``; it is put
            into eval mode.
        criterion: Loss callable forwarded to ``calculate_loss``.
        batch: Indexable ``(padded_inputs, padded_labels, masks_inputs)``.

    Returns:
        Tuple ``(loss, mse, corr, preds)`` where the first three are Python
        floats and ``preds`` is a NumPy array with the predictions of the
        first batch element at its unpadded positions.
    """
    inputs, labels, mask = (batch[i].to(device=device) for i in range(3))

    transformer.eval()
    with torch.no_grad():
        # Inputs get the same 1/4 scaling as in train_on_batch
        outputs = transformer(inputs / 4.0, mask)
        loss, _mse, _corr = calculate_loss(outputs, criterion, labels, mask)

        # Keep only the rows of the first batch element that are not padding
        valid_rows = mask.view(-1).nonzero()
        preds = outputs[0][valid_rows].squeeze(1).cpu().numpy()

    return loss.item(), _mse.item(), _corr.item(), preds


def metrics_on_batch(era_scores):
    """Summarise a sequence of per-era scores.

    Args:
        era_scores: Iterable of per-era scores (converted to a Series).

    Returns:
        ``pd.Series`` with, in order: ``mean_correlation``, ``std_deviation``
        (population standard deviation), ``sharpe_ratio`` (mean / std),
        ``smart_sharpe`` (mean / (std + std of first differences)),
        ``autocorrelation`` (lag-1), ``max_dd`` (largest gap between the
        running maximum and the score), ``min_correlation`` and
        ``max_correlation``. There is no guard against a zero standard
        deviation.
    """
    scores = pd.Series(era_scores)

    average = scores.mean()
    spread = scores.std(ddof=0)
    # Instability of the scores from one era to the next
    step_spread = scores.diff().std(ddof=0)

    summary = {
        'mean_correlation': average,
        'std_deviation': spread,
        'sharpe_ratio': average / spread,
        'smart_sharpe': average / (spread + step_spread),
        'autocorrelation': scores.autocorr(),
        'max_dd': (scores.cummax() - scores).max(),
        'min_correlation': scores.min(),
        'max_correlation': scores.max(),
    }

    # Cleanup
    gc.collect()

    return pd.Series(summary)

In [None]:
from tqdm.auto import tqdm

def train_model(model, criterion, optimizer, scheduler, \
                num_epochs, patience, train_loader, val_loader, is_lr_scheduler=True):
    """Train with early stopping on the mean validation loss.

    Each epoch trains on every entry of ``train_loader``, optionally steps the
    scheduler, then evaluates every entry of ``val_loader``. The weights of the
    epoch with the lowest mean validation loss are snapshotted and restored
    into ``model`` before returning, so the returned model matches
    ``best_corr`` and ``best_outputs``.

    Args:
        model: Network to train.
        criterion: Loss callable forwarded to the batch helpers.
        optimizer: Optimizer for ``model``.
        scheduler: LR scheduler; stepped once per epoch when enabled.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs before stopping.
        train_loader: Mapping ``era -> batch`` used for training.
        val_loader: Mapping ``era -> batch`` used for validation.
        is_lr_scheduler: Whether to call ``scheduler.step()`` each epoch.

    Returns:
        Tuple ``(model, best_corr, best_outputs)``: the model, the list of
        per-era validation correlations of the best epoch and a dict
        ``era -> predictions`` of the best epoch. The last two are ``None``
        when no epoch ever improved on the initial infinite loss.
    """
    best_loss = float('inf')
    best_corr = None
    best_state = None
    best_outputs = None
    no_improve_epoch = 0

    epoch_progress = tqdm(range(num_epochs), desc="Epochs", position=0, leave=False)

    for epoch in epoch_progress:
        # Training. Validation leaves the model in eval mode, so switch back
        # at the start of every epoch.
        model.train()
        for era_num in tqdm(train_loader, desc="Training", leave=False, position=1):
            train_on_batch(model, criterion, optimizer, train_loader[era_num])

        # Adjust learning rate if is_lr_scheduler is True
        if is_lr_scheduler:
            scheduler.step()

        # Validation
        model.eval()
        val_total_loss = []
        val_total_corr = []
        val_total_outputs = {}
        with torch.no_grad():
            for era_num in tqdm(val_loader, desc="Validation", leave=False, position=2):
                loss, _mse, _corr, outputs = evaluate_on_batch(
                    model, criterion, val_loader[era_num])
                val_total_loss.append(loss)
                val_total_corr.append(_corr)
                val_total_outputs[era_num] = outputs

        # Early stopping check
        val_loss = np.mean(val_total_loss)
        if val_loss < best_loss:
            best_loss = val_loss
            best_corr = val_total_corr.copy()
            # Clone every tensor: state_dict().copy() is shallow and would keep
            # pointing at the live parameters as training continues.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            best_outputs = val_total_outputs.copy()
            no_improve_epoch = 0
        else:
            no_improve_epoch += 1
            if no_improve_epoch >= patience:
                epoch_progress.set_description(f'Early stopping at epoch {epoch+1}')
                epoch_progress.refresh()
                break

        torch.cuda.empty_cache()
        _ = gc.collect()

    # Restore the best epoch's weights so the returned model is the one that
    # produced best_corr / best_outputs.
    if best_state is not None:
        model.load_state_dict(best_state)

    # # Save the best model state
    # torch.save(best_state, data_dir / "model_best.pth")

    return model, best_corr, best_outputs # best_outputs for future use


In [None]:
import optuna
from torch.optim.lr_scheduler import StepLR
from utils import CustomBackwardMultipleTimeSeriesCV
import torch.nn as nn
import torch.optim as optim
import os
import json
# Manual single-configuration run of the cross-validated training loop, with
# fixed values in place of the Optuna suggestions used by `objective`.
# Use the GPU when available; `device` is read by the batch helpers.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Padding configuration read by convert_to_torch at call time.
PADDING_VALUE = -1
MAX_LEN = 500
FEATURE_DIM = len(features)
print(FEATURE_DIM)
OUTPUT_DIM = 1 # len(target[0])


# Fixed values standing in for the Optuna suggestions (kept as trailing comments)
train_length_multiplier = 10 # trial.suggest_int('train_length_multiplier', 8, 12)
val_period_length = 21 # trial.suggest_categorical('val_period_length', [5, 10, 21])
lookahead = 1 # trial.suggest_categorical('lookahead', [1, 5, 21])

# Instantiate CV object with the values above; the training window is
# 21 * train_length_multiplier periods long.
cv = CustomBackwardMultipleTimeSeriesCV(dataset, train_period_length=int(21 * train_length_multiplier),
                                        test_period_length=val_period_length, lookahead=lookahead,
                                        date_idx='date')

# NOTE(review): hidden_dim_1 / hidden_dim_2 are assigned but not passed to
# SimpleNN below, so they have no effect in this cell.
hidden_dim_1 = 128 # trial.suggest_int("hidden_dim_1", 128, 512)
hidden_dim_2 = 64 # trial.suggest_int("hidden_dim_2", 64, 256)
lr = 1e-5 # trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

# A single model, optimizer and scheduler are created once and reused by every fold.
model = SimpleNN(input_dim=FEATURE_DIM, output_dim=OUTPUT_DIM).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=100, gamma=0.1)

sharpe_ratios = []
NUM_EPOCHS = 1
PATIENCE = 5
for train_idx, val_idx in cv:
    # print(train_idx)
    # Positional row selection for this fold
    train_data = dataset.iloc[train_idx]
    val_data = dataset.iloc[val_idx]

    # Per-timestamp tensors for training and validation
    era2data_train = get_era2data(train_data)
    era2data_validation = get_era2data(val_data)

    model, best_corr, outputs = train_model(model, criterion, optimizer, scheduler,
                                            NUM_EPOCHS, PATIENCE, era2data_train, 
                                            era2data_validation, is_lr_scheduler=True)

    # Metric aggregation is disabled here, so sharpe_ratios stays empty.
    # metrics = metrics_on_batch(best_corr)
    # sharpe_ratios.append(metrics['sharpe_ratio'])

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the code shown, so executing
# this cell raises NameError — presumably a deliberate way to halt
# "Run All" before the cells below; confirm.
STOP

In [None]:
import optuna
from torch.optim.lr_scheduler import StepLR
from utils import CustomBackwardMultipleTimeSeriesCV
import torch.nn as nn
import torch.optim as optim
import os
import json

# Constants and hyperparameters
NUM_EPOCHS = 1       # maximum epochs per CV fold passed to train_model
PATIENCE = 5         # early-stopping patience passed to train_model
FEATURE_DIM = len(features)
OUTPUT_DIM = 1
# NOTE(review): absolute, user-specific path; the notebook also defines a
# relative `model_dir` earlier. `callback` writes into this directory.
MODEL_DIR = "/home/sayem/Desktop/Project/models"  # Model directory
# Use the GPU when available; `device` is read by the batch helpers.
device = "cuda" if torch.cuda.is_available() else "cpu"

def objective(trial, dataset=dataset):
    """Optuna objective: negative mean Sharpe ratio across CV folds.

    The trial suggests the cross-validation geometry, two hidden sizes and a
    learning rate. One ``SimpleNN`` is trained fold after fold with
    ``train_model`` and the Sharpe ratio of each fold's best validation
    correlations is collected.

    Args:
        trial: Optuna trial used to suggest hyperparameters.
        dataset: DataFrame to split; defaults to the module-level ``dataset``
            bound when this function is defined.

    Returns:
        The negated mean of the per-fold Sharpe ratios (to be minimised).
    """
    print(f"\n--- Starting Trial: {trial.number + 1} ---")

    # Cross-validation geometry (suggestion order is kept unchanged)
    train_periods = int(21 * trial.suggest_int('train_length_multiplier', 8, 12))
    test_periods = trial.suggest_categorical('val_period_length', [5, 10, 21])
    horizon = trial.suggest_categorical('lookahead', [1, 5, 21])
    splitter = CustomBackwardMultipleTimeSeriesCV(
        dataset,
        train_period_length=train_periods,
        test_period_length=test_periods,
        lookahead=horizon,
        date_idx='date',
    )

    # The hidden sizes are recorded in trial.params (the callback reads
    # "hidden_dim_1"), but they are not passed to SimpleNN below.
    _hidden_dim_1 = trial.suggest_int("hidden_dim_1", 128, 512)
    _hidden_dim_2 = trial.suggest_int("hidden_dim_2", 64, 256)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    model = SimpleNN(input_dim=FEATURE_DIM, output_dim=OUTPUT_DIM).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=100, gamma=0.1)

    fold_sharpes = []
    for train_idx, test_idx in splitter:
        era2data_train = get_era2data(dataset.iloc[train_idx])
        era2data_validation = get_era2data(dataset.iloc[test_idx])

        model, best_corr, _ = train_model(
            model, criterion, optimizer, scheduler,
            NUM_EPOCHS, PATIENCE, era2data_train,
            era2data_validation, is_lr_scheduler=True,
        )

        fold_sharpes.append(metrics_on_batch(best_corr)['sharpe_ratio'])

    return -np.mean(fold_sharpes)

def callback(study, trial):
    """Report a finished trial and persist artefacts when it is the new best.

    Prints the trial's value and parameters and the current best trial. When
    the finished trial is the best one, its parameters are written as JSON
    into ``MODEL_DIR`` and, if a model is attached under the ``"model"`` user
    attribute, its ``state_dict`` is saved there as well.

    Args:
        study: The Optuna study being optimised.
        trial: The frozen trial that just finished.
    """
    print(f"\n--- Trial {trial.number + 1} finished ---")
    print(f"Value: {trial.value} and parameters: {trial.params}")

    # study.best_trial raises ValueError while no trial has completed (e.g.
    # the first trials were pruned), so guard the access once and reuse it.
    try:
        best_trial = study.best_trial
    except ValueError:
        best_trial = None

    if best_trial is None:
        print("No successful trials yet.\n")
        return
    print(f"Best is trial {best_trial.number} with value: {best_trial.value}\n")

    if best_trial.number != trial.number:
        return

    # NOTE(review): nothing in the code shown stores a "model" user attribute,
    # so best_model is expected to be None and only the parameters are saved.
    best_model = trial.user_attrs.get("model", None)

    # Constructing filename dynamically
    top = trial.params["hidden_dim_1"]
    model_name = "SimpleNN"  # assuming you're using SimpleNN for now
    lookahead = trial.params["lookahead"]
    filename = f"{top}_{model_name}_TARGET_ret_fwd_{lookahead}d_rank_quantiled.pkl"
    file_path = os.path.join(MODEL_DIR, filename)

    # Create the output directory so the writes below cannot fail on a
    # missing folder.
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Saving model state dict
    if best_model is not None:
        torch.save(best_model.state_dict(), file_path)
    # Save parameters if needed
    with open(os.path.join(MODEL_DIR, f"{filename}_params.json"), 'w') as f:
        json.dump(trial.params, f)

# Create (or reload) the study persisted in SQLite under `study_dir`.
# `objective` returns the negated mean Sharpe ratio, so minimising the
# objective value maximises the Sharpe ratio.
study = optuna.create_study(study_name='Maximizing the Sharpe', direction='minimize',
                            storage=f'sqlite:///{study_dir}/study.db', load_if_exists=True)
# Run 25 trials; `callback` is invoked after each finished trial.
study.optimize(objective, n_trials=25, callbacks=[callback])

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the code shown, so executing
# this cell raises NameError — presumably a deliberate way to halt
# "Run All" at this point; confirm.
STOP