# Exploring yield spread models conditioned on the calc-date

## Import all required modules

In [1]:
import os
from ficc.utils.auxiliary_variables import PREDICTORS, IDENTIFIERS, CATEGORICAL_FEATURES, NON_CAT_FEATURES, BINARY
from ficc.models import get_model_instance

import pandas as pd
import numpy as np
from sklearn import preprocessing

import torch
from torch.utils.data import TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

import shutil

import wandb

torch.multiprocessing.set_sharing_strategy('file_system')

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Seed the random-number generators

In [2]:
# Setting the random seed, for consistency
SEED = 10

pl.utilities.seed.seed_everything(SEED)

Global seed set to 10


10

In [3]:
SEQUENCE_LENGTH = 5
NUM_FEATURES = 5
NUM_EPOCHS = 1500
BATCH_SIZE = 10000

## Data preparation

In [4]:
with open('processed_data.pkl', 'rb') as f:
    df = pd.read_pickle(f)

df = df[~df.purpose_sub_class.isin([6, 20, 21, 22, 44, 57, 90, 106])]
df = df[~df.called_redemption_type.isin([18, 19])]

# Add additional features
processed_data = df[IDENTIFIERS + PREDICTORS + ['trade_datetime']]

# A few features such as the initial issue amount cannot be filled with their logical counterparts as their values are not known and hence are dropped.
processed_data = processed_data.dropna()
unprocessed_data = processed_data.copy()

In [5]:
# Splitting the date into train and test set
test_dataframe = processed_data[processed_data.trade_datetime >= '2022-05-21']
train_dataframe = processed_data[processed_data.trade_datetime < '2022-05-21']
unprocessed_test_df = unprocessed_data[unprocessed_data.trade_datetime >= '2022-05-21']

# Split the training data in to train and validation set
val_dataframe = train_dataframe[train_dataframe.trade_datetime >= '2022-05-07']
train_dataframe = train_dataframe[train_dataframe.trade_datetime < '2022-05-07']

In [6]:
from torch.utils.data import TensorDataset, DataLoader

# Uniform normalization
normalizers = []
for f in NON_CAT_FEATURES + BINARY:
    normalizers.append(preprocessing.StandardScaler())
    processed_data.loc[:, f] = normalizers[-1].fit_transform(processed_data[f].to_numpy().astype('float32').reshape(-1, 1))

# Fitting encoders to the categorical features. These encoders are then used to encode the categorical features of the train and test set
encoders = {}
fmax = {}
for f in CATEGORICAL_FEATURES + ['calc_day_cat']:
    fprep = preprocessing.LabelEncoder().fit(
        processed_data[f].drop_duplicates())
    fmax[f] = np.max(fprep.transform(fprep.classes_))
    encoders[f] = fprep

def create_input(df, with_cdc=False):
    sdf = df[IDENTIFIERS + PREDICTORS]

    global encoders
    datalist = []
    datalist.append(torch.tensor(np.stack(sdf['trade_history'])).float())

    noncat_and_binary = []
    for f in NON_CAT_FEATURES + BINARY:
        noncat_and_binary.append(np.expand_dims(
            sdf[f].to_numpy().astype('float32'), axis=1))
    datalist.append(torch.tensor(np.concatenate(noncat_and_binary, axis=-1)))

    cat_features = CATEGORICAL_FEATURES if not with_cdc else CATEGORICAL_FEATURES + ['calc_day_cat']
    for f in cat_features:
        encoded = encoders[f].transform(sdf[f])
        datalist.append(torch.tensor(encoded).long())

    return datalist

def create_label(df):
    sdf = df[IDENTIFIERS + PREDICTORS]
    return torch.tensor(sdf.yield_spread.to_numpy()).float()

# Create all the datasets and dataloaders, appending _with_cdc to the name to indicate if the calc-day categorical feature is included
train_ds = TensorDataset(*create_input(train_dataframe), create_label(train_dataframe))
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, num_workers=os.cpu_count())
train_ds_with_cdc = TensorDataset(*create_input(train_dataframe, with_cdc=True), create_label(train_dataframe))
train_loader_with_cdc = DataLoader(train_ds_with_cdc, batch_size=BATCH_SIZE, num_workers=os.cpu_count(), persistent_workers=True)

val_ds = TensorDataset(*create_input(val_dataframe), create_label(val_dataframe))
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, num_workers=os.cpu_count())
val_ds_with_cdc = TensorDataset(*create_input(val_dataframe, with_cdc=True), create_label(val_dataframe))
val_loader_with_cdc = DataLoader(val_ds_with_cdc, batch_size=BATCH_SIZE, num_workers=os.cpu_count(), persistent_workers=True)

test_ds = TensorDataset(*create_input(test_dataframe), create_label(test_dataframe))
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, num_workers=os.cpu_count())
test_ds_with_cdc = TensorDataset(*create_input(test_dataframe, with_cdc=True), create_label(test_dataframe))
test_loader_with_cdc = DataLoader(test_ds_with_cdc, batch_size=BATCH_SIZE, num_workers=os.cpu_count(), persistent_workers=True)

In [None]:
model_params = {
    'num_trade_history_features': NUM_FEATURES,
    'non_categorical_size': NON_CAT_FEATURES + BINARY,
    'category_sizes': fmax,
    'lstm_sizes': [50, 100],
    'embed_sizes': 15,
    'tabular_sizes': [400, 200, 100],
    'tabular_resblocks': 1,
    'final_sizes': [300, 100],
    'final_resblocks': 0,
    'dropout': 0.4,
    'learning_schedule': 'constant', 
    'learning_rate': 3e-04, 
    'weight_decay': 0.0004
}

model = get_model_instance(
    "lstm_yield_spread_model_pytorch",
    **model_params)

wandb = WandbLogger(project="pytorch-models", entity="ficc-ai", name=f"Conditioned Model with dropout {model_params['dropout']}")
wandb.watch(model, log="all")

checkpoint = ModelCheckpoint(monitor="val_mae")
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=NUM_EPOCHS,
    callbacks=[
        checkpoint
    ],
    logger=wandb,
)

trainer.fit(model, train_loader_with_cdc, val_loader_with_cdc)

shutil.copyfile(checkpoint.best_model_path, f"best_ys_model-{model_params['dropout']}.ckpt")

wandb.experiment.finish()

## Load the calc-date model and use it to calculate probabilities

In [9]:
fmax_without_cd = fmax.copy()
fmax_without_cd.pop('calc_day_cat')

# Drop the calc-date categorical feature from fmax
model_params = {
    'num_trade_history_features': NUM_FEATURES,
    'non_categorical_size': NON_CAT_FEATURES + BINARY,
    'category_sizes': fmax_without_cd,
    'lstm_sizes': [50, 100],
    'embed_sizes': 15,
    'tabular_sizes': [400, 200, 100],
    'tabular_resblocks': 1,
    'final_sizes': [300, 100],
    'final_resblocks': 0,
    'dropout': 0.4,
    'learning_schedule': 'constant', 
    'learning_rate': 3e-04, 
    'weight_decay': 0.0004
}

model = get_model_instance(
    "lstm_calc_date_model_pytorch",
    **model_params)

# Reload the checkpoint of the best model, to this point
model = model.load_from_checkpoint(
    checkpoint_path="best_cd_model.ckpt",
    **model_params
)
model.eval()
model = model.cuda()

inputs = [x.cuda() for x in create_input(test_dataframe)]
with torch.no_grad():
    logits = model(*inputs)
    probs = torch.softmax(logits, dim=1)
del inputs

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## Run the yield-spread model with calc-day-cat set to 0, 1, 2, 3. Then weight all the results by the calculated calc-date probabilities

In [10]:
# Reset the category sizes to include the calc-date categorical feature, for the yield-spread model conditioned upon calc-date
model_params['category_sizes'] = fmax

model = get_model_instance(
    "lstm_yield_spread_model_pytorch",
    **model_params)
model = model.load_from_checkpoint(
    checkpoint_path=f"best_ys_model-{model_params['dropout']}.ckpt",
    **model_params
)
model.eval()
model = model.cuda()

yield_spread = None
inputs_with_cdc = [x.cuda() for x in create_input(test_dataframe, with_cdc=True)]
for calc_date in range(4):
    print (f"Calc-date {calc_date}")
    test_dataframe.loc[:, 'calc_day_cat'] = calc_date

    with torch.no_grad():
        current_yield_spread = model.cuda()(*inputs_with_cdc)
        print("\tPredictions complete")
        if yield_spread is None:
            yield_spread = current_yield_spread * probs[:, calc_date]
        else:
            yield_spread = yield_spread + current_yield_spread * probs[:, calc_date]

print(f"Conditioned model attains a yield-spread MAE of {(create_label(test_dataframe) - yield_spread).abs().mean().cpu().numpy()}")

Calc-date 0
	Predictions complete


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


RuntimeError: CUDA out of memory. Tried to allocate 332.90 GiB (GPU 0; 23.70 GiB total capacity; 311.14 MiB already allocated; 18.94 GiB free; 1.87 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF