In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle
# NOTE: the explicit imports below intentionally stay AFTER the star imports so that
# `pd`, `np`, `torch`, ... always refer to the modules imported here.
from ficc.utils.auxiliary_variables import *
from ficc.data.queries import *
from ficc.models import get_model_instance
import ficc.utils.attr as attr
from ficc.utils.eval import eval_model
from ficc.utils.gcp_storage_functions import *
from ficc.utils.plotting import *

import pandas as pd
import numpy as np
from google.cloud import bigquery
import gcsfs
from sklearn import preprocessing


import torch
from torch.utils.data import TensorDataset
import pytorch_lightning as pl

# Share tensors between processes through the file system.
# NOTE(review): presumably chosen to avoid "too many open files" errors from
# data-loader worker processes — confirm it is still needed.
torch.multiprocessing.set_sharing_strategy('file_system')

In [None]:
# Fix the seed of the random number generators so that anything stochastic in the
# notebook (layer initialisation, dropout, ...) is reproducible across experiments.
SEED = 10

# `pl.seed_everything` is the public entry point; the `pl.utilities.seed` module path
# used previously is deprecated and no longer exists in newer Lightning releases.
pl.seed_everything(SEED)

In [None]:
# Setting up the credentials for GCP.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already present in
# the environment and only falls back to the key file expected next to the notebook,
# so an externally configured credential is no longer silently overwritten.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "eng-reactor-287421-112eb767e1b3.json")
# NOTE(review): TF_GPU_THREAD_MODE is a TensorFlow setting; this notebook only uses
# PyTorch in the visible cells — confirm it is still required.
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

# Initializing the big query client
bq_client = bigquery.Client()

In [None]:
# The batch size and learning rate affect how smoothly the model converges:
# the larger the batch size, the smoother the convergence. A larger batch size
# needs a higher learning rate, and vice versa.
TRAIN_TEST_SPLIT = 0.85  # the split cells use the first (1 - TRAIN_TEST_SPLIT) of the rows as the test set
# NOTE(review): LEARNING_RATE, BATCH_SIZE and NUM_EPOCHS are not referenced by the cells
# visible in this notebook; `model_params` carries its own learning rate.
LEARNING_RATE = 0.0001
BATCH_SIZE = 10000
NUM_EPOCHS = 250

# NOTE(review): DROPOUT and SEQUENCE_LENGTH are likewise unused in the visible cells;
# `model_params` sets a different dropout value.
DROPOUT = 0.37909650481643176
SEQUENCE_LENGTH = 5
NUM_FEATURES = 5  # passed to the models as 'num_trade_history_features'

In [None]:
# Load the pickled, pre-processed trade data frame from Google Cloud Storage.
# NOTE(review): read_pickle can execute code embedded in the file, so the bucket
# contents must be trusted.
fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
with fs.open('ficc_training_data_latest/processed_data.pkl') as f:
    df = pd.read_pickle(f)

In [None]:
# Trades without a purpose class are assigned class 1.
# The result is assigned back instead of calling fillna(inplace=True) on the
# attribute-accessed column: that chained in-place form is not guaranteed to write
# through to `df` (it does not under pandas copy-on-write).
df = df.assign(purpose_class=df['purpose_class'].fillna(1))

# Exclude the listed purpose sub-classes and called-redemption types.
df = df[~df.purpose_sub_class.isin([6, 20, 21, 22, 44, 57, 90, 106])]
df = df[~df.called_redemption_type.isin([18, 19])]

# Keep the identifier and predictor columns plus the trade/call/maturity/refund date
# columns (the calc-date report further down displays the latter).
processed_data = df[IDENTIFIERS + PREDICTORS + ['trade_datetime', 'next_call_date', 'par_call_date', 'maturity_date', 'refund_date']]

# A few features such as the initial issue amount cannot be filled with their logical counterparts as their values are not known and hence are dropped.
processed_data = processed_data.dropna()

# Standardise every non-categorical and binary feature.
# NOTE(review): the statistics are computed on the full data set, i.e. before the
# train/test split below — confirm this matches how the checkpointed models were trained.
for feature in NON_CAT_FEATURES + BINARY:
    processed_data[feature] = preprocessing.scale(processed_data[feature].to_numpy().astype('float32'))

# Load the already-fitted encoders of the categorical features. They are used to encode
# the categorical features of the train and test set; `fmax` records, per feature, the
# largest code the encoder can produce.
fmax = {}
with fs.open('ficc_training_data_latest/encoders.pkl') as handle:
    # NOTE(review): pickle.load can execute code embedded in the file; only load
    # encoders from a trusted bucket.
    encoders = pickle.load(handle)
for feature in CATEGORICAL_FEATURES:
    encoder = encoders[feature]
    fmax[feature] = np.max(encoder.transform(encoder.classes_))

In [None]:
def create_input(df):
    """Build the list of model input tensors for the trades in ``df``.

    The returned list holds, in this order:

    1. the stacked ``trade_history`` arrays as one float tensor,
    2. one float32 tensor with a column per feature in ``NON_CAT_FEATURES + BINARY``,
    3. one long tensor per feature in ``CATEGORICAL_FEATURES``, encoded with the
       matching entry of the notebook-level ``encoders`` mapping.

    Args:
        df: data frame containing at least the ``IDENTIFIERS + PREDICTORS`` columns.

    Returns:
        list of ``torch.Tensor`` (not a NumPy array).
    """
    # Selecting the columns up front fails fast if any expected column is missing.
    sdf = df[IDENTIFIERS + PREDICTORS]

    trade_history = torch.tensor(np.stack(sdf['trade_history'])).float()

    # One (n_rows, 1) float32 column per dense feature, glued together side by side.
    dense_columns = [
        sdf[name].to_numpy().astype('float32')[:, np.newaxis]
        for name in NON_CAT_FEATURES + BINARY
    ]
    dense = torch.tensor(np.concatenate(dense_columns, axis=-1))

    categorical = [
        torch.tensor(encoders[name].transform(sdf[name])).long()
        for name in CATEGORICAL_FEATURES
    ]

    return [trade_history, dense, *categorical]

def create_label(df):
    """Return the ``yield_spread`` column of ``df`` as a float32 tensor.

    Args:
        df: data frame containing at least the ``IDENTIFIERS + PREDICTORS`` columns.

    Returns:
        1-D ``torch.Tensor`` of dtype float32, one entry per row of ``df``.
    """
    model_columns = df[IDENTIFIERS + PREDICTORS]
    spreads = model_columns.yield_spread.to_numpy()
    return torch.tensor(spreads).float()

# Splitting the data into train and test set.
# Despite its name, `train_index` is the row at which the training data STARTS:
# the first (1 - TRAIN_TEST_SPLIT) of the rows form the test set.
train_index = int(len(processed_data) * (1.0 - TRAIN_TEST_SPLIT))
# The test frame gets prediction columns added further down, so take an explicit copy
# instead of a slice of `processed_data`.
test_dataframe = processed_data[:train_index].copy()
train_dataframe = processed_data[train_index:]

# Split the training data into train and validation set:
# the first 10% of the remaining rows are held out for validation.
split_point = int(len(train_dataframe) * (1.0 - 0.9))
val_dataframe = train_dataframe[:split_point]
train_dataframe = train_dataframe[split_point:]

x_train = create_input(train_dataframe)
y_train = create_label(train_dataframe)
train_dataset = TensorDataset(*x_train, y_train)

x_val = create_input(val_dataframe)
y_val = create_label(val_dataframe)
val_dataset = TensorDataset(*x_val, y_val)

In [None]:
# Architecture and optimiser settings handed to the model constructor and to
# load_from_checkpoint.
model_params = {
    'num_trade_history_features': NUM_FEATURES,
    # NOTE(review): the key is named *_size but receives the list of feature names,
    # not its length — confirm this is what the model expects.
    'non_categorical_size': NON_CAT_FEATURES + BINARY,
    'category_sizes': fmax,
    'lstm_sizes': [50, 100],
    'embed_sizes': 15,
    'tabular_sizes': [400, 200, 100],
    'tabular_resblocks': 1,
    'final_sizes': [300, 100],
    'final_resblocks': 0,
    'dropout': 0.3758110031582248,
    'learning_schedule': 'constant',
    'learning_rate': 0.00017360566254027907,
    'weight_decay': 0.00039784787398219684
}

model = get_model_instance(
    "lstm_yield_spread_model_pytorch",
    **model_params)

# Reload the checkpoint of the best model, to this point
model = model.load_from_checkpoint(
    checkpoint_path="best_ys_model.ckpt",
    **model_params
)

# A freshly loaded module is in training mode, which keeps dropout active and makes
# every prediction (and attribution) random. Switch to evaluation mode first.
model.eval()

eval_model(model, test_dataframe, create_input, create_label, wandb = None)

x_test = create_input(test_dataframe)
y_test = create_label(test_dataframe)
# No gradients are needed for plain predictions.
with torch.no_grad():
    predictions = model(*x_test)

# Work on an explicit copy so the new columns are not written to a slice of
# `processed_data` (avoids SettingWithCopyWarning and keeps the cell re-runnable).
test_dataframe = test_dataframe.copy()
test_dataframe.loc[:, 'ficc_spread'] = predictions.numpy()
test_dataframe.loc[:, 'abs_delta_yield_spread'] = (test_dataframe.yield_spread - test_dataframe.ficc_spread).abs()

# Worst predictions first: the analysis generator below relies on this order.
test_dataframe = test_dataframe.sort_values(by='abs_delta_yield_spread', ascending=False)
print(test_dataframe.loc[:, 'abs_delta_yield_spread'].mean())

In [None]:
def yield_yield_spread_analysis(test_dataframe):
    """Lazily render one yield-spread error report per CUSIP.

    Every ``next()`` on the returned generator displays a Markdown header plus the
    attribution figures and feature tables for the next CUSIP that has more than one
    trade in ``test_dataframe``, then yields ``None``.

    CUSIPs are visited in order of first appearance, and the first / last trade of each
    CUSIP is labelled "Worst" / "Best". The frame is therefore expected to be sorted by
    descending ``abs_delta_yield_spread`` (the calling cell does this).

    The notebook globals ``model``, ``create_input`` and ``create_label`` are read when
    a report is rendered, not when the generator is created.

    Args:
        test_dataframe: frame with ``cusip``, ``rtrs_control_number`` and
            ``abs_delta_yield_spread`` columns plus every column selected by
            ``create_input`` / ``create_label``.

    Yields:
        None, once per rendered CUSIP.
    """
    from IPython.display import display, Markdown

    def shade_alternate_rows(table, n_data_rows):
        # Grey out every odd table row. Row 0 is the column-header row, so a table with
        # n_data_rows data rows only has rows 0..n_data_rows; asking for more raises KeyError.
        for i in range(1, n_data_rows + 1, 2):
            for j in range(2):
                table[(i, j)].set_facecolor("#dddddd")

    numeric_labels = NON_CAT_FEATURES + BINARY

    # unique() keeps first-appearance order, so every CUSIP is handled exactly once and
    # the frame is no longer re-filtered for rows of an already reported CUSIP.
    for cusip in test_dataframe['cusip'].unique():
        cusip_dataframe = test_dataframe[test_dataframe.cusip == cusip]

        # A best-vs-worst comparison needs at least two trades.
        if len(cusip_dataframe) <= 1:
            continue

        display(
            Markdown(
                f"## CUSIP: {cusip} - RTRS Control Numbers: {cusip_dataframe.iloc[0]['rtrs_control_number']} and {cusip_dataframe.iloc[-1]['rtrs_control_number']}\n" +
                f"### Worst Error: {cusip_dataframe.iloc[0, :].abs_delta_yield_spread}\n" +
                f"### Best Error: {cusip_dataframe.iloc[-1, :].abs_delta_yield_spread}"
            )
        )

        x_test = create_input(cusip_dataframe)
        y_test = create_label(cusip_dataframe)

        plot_cusip(cusip_dataframe, 'yield_spread')

        # Both attribution results are indexed like the create_input() output:
        # [0] trade history, [1] numerical + binary features, [2:] categorical features.
        attributes = attr.compute_integrated_gradient_attributions(model, x_test)
        error_attributes = attr.compute_integrated_gradient_ysm_error_attributions(model, x_test, y_test)

        fig, ax = plt.subplots(1, 2, figsize=(30, 10))
        fig.suptitle('Yield-Spread Trade History Attributions')
        attr.visualize_trade_history_attribution(attributes[0][0], subtitle="Worst", fig_ax=(fig, ax[0]))
        attr.visualize_trade_history_attribution(attributes[0][-1], subtitle="Best", fig_ax=(fig, ax[1]))

        fig, ax = plt.subplots(1, 2, figsize=(30, 10))
        fig.suptitle('Yield-Spread Squared Trade History ERROR Attributions')
        attr.visualize_trade_history_attribution(error_attributes[0][0], subtitle="Worst", fig_ax=(fig, ax[0]))
        attr.visualize_trade_history_attribution(error_attributes[0][-1], subtitle="Best", fig_ax=(fig, ax[1]))

        fig, ax = plt.subplots(1, 2, figsize=(30, 5))
        fig.suptitle('Yield-Spread Numerical Feature Attributions')
        attr.visualize_trade_numerical_and_binary_attribution(attributes[1][0], NON_CAT_FEATURES, BINARY, subtitle="Worst", fig_ax=(fig, ax[0]))
        attr.visualize_trade_numerical_and_binary_attribution(attributes[1][-1], NON_CAT_FEATURES, BINARY, subtitle="Best", fig_ax=(fig, ax[1]))

        fig, ax = plt.subplots(1, 2, figsize=(30, 5))
        fig.suptitle('Yield-Spread Squared Numerical Feature ERROR Attributions')
        attr.visualize_trade_numerical_and_binary_attribution(error_attributes[1][0], NON_CAT_FEATURES, BINARY, subtitle="Worst", fig_ax=(fig, ax[0]))
        attr.visualize_trade_numerical_and_binary_attribution(error_attributes[1][-1], NON_CAT_FEATURES, BINARY, subtitle="Best", fig_ax=(fig, ax[1]))

        # Table of the (scaled) numerical / binary inputs of the worst and best trade.
        fig, ax = plt.subplots(1, 1, figsize=(20, 3))
        ax.set_axis_off()
        values = x_test[1][(0, -1), :].cpu().numpy()
        tbl = ax.table(cellText=values.T, rowLabels=numeric_labels, colLabels=["Worst", "Best"], loc='center')
        shade_alternate_rows(tbl, len(numeric_labels))

        # Encoded categorical inputs of the worst / best trade, shared by the next two figures.
        worst_categories = [x[0].cpu().numpy() for x in x_test[2:]]
        best_categories = [x[-1].cpu().numpy() for x in x_test[2:]]

        fig, ax = plt.subplots(1, 2, figsize=(30, 3))
        fig.suptitle('Yield-Spread Categorical Feature Attributions')
        categorical_attributes = torch.cat([a.unsqueeze(1) for a in attributes[2:]], dim=-1)
        attr.visualize_categorical_attribution(categorical_attributes[0], CATEGORICAL_FEATURES, subtitle="Worst", fig_ax=(fig, ax[0]), values=worst_categories)
        attr.visualize_categorical_attribution(categorical_attributes[-1], CATEGORICAL_FEATURES, subtitle="Best", fig_ax=(fig, ax[1]), values=best_categories)

        fig, ax = plt.subplots(1, 2, figsize=(30, 3))
        fig.suptitle('Yield-Spread Squared Categorical Feature ERROR Attributions')
        categorical_error_attributes = torch.cat([a.unsqueeze(1) for a in error_attributes[2:]], dim=-1)
        attr.visualize_categorical_attribution(categorical_error_attributes[0], CATEGORICAL_FEATURES, subtitle="Worst", fig_ax=(fig, ax[0]), values=worst_categories)
        attr.visualize_categorical_attribution(categorical_error_attributes[-1], CATEGORICAL_FEATURES, subtitle="Best", fig_ax=(fig, ax[1]), values=best_categories)

        # Table of the encoded categorical inputs of the worst and best trade.
        fig, ax = plt.subplots(1, 1, figsize=(20, 3))
        ax.set_axis_off()
        values = [[x[i].cpu().numpy() for i in (0, -1)] for x in x_test[2:]]
        tbl = ax.table(cellText=values, rowLabels=CATEGORICAL_FEATURES, colLabels=["Worst", "Best"], loc='center')
        # This table has one row per categorical feature (it was previously striped with
        # the numerical feature count, which addresses cells that do not exist).
        shade_alternate_rows(tbl, len(CATEGORICAL_FEATURES))

        yield

# Create the lazy report generator; nothing is rendered until next() is called on it.
yield_spread_analysis_generator = yield_yield_spread_analysis(test_dataframe)

# The ten cells below each render one CUSIP, starting with the CUSIP that has the
# worst yield-spread prediction.
# Every report contrasts that CUSIP's worst and best prediction.

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(yield_spread_analysis_generator, None)

In [None]:
# NOTE: this re-defines create_input with the same behaviour as the definition in the
# yield-spread section above, so the calc-date section can be run on its own.
def create_input(df):
    """Build the list of model input tensors for the trades in ``df``.

    The returned list holds, in this order:

    1. the stacked ``trade_history`` arrays as one float tensor,
    2. one float32 tensor with a column per feature in ``NON_CAT_FEATURES + BINARY``,
    3. one long tensor per feature in ``CATEGORICAL_FEATURES``, encoded with the
       matching entry of the notebook-level ``encoders`` mapping.

    Args:
        df: data frame containing at least the ``IDENTIFIERS + PREDICTORS`` columns.

    Returns:
        list of ``torch.Tensor`` (not a NumPy array).
    """
    # Selecting the columns up front fails fast if any expected column is missing.
    sdf = df[IDENTIFIERS + PREDICTORS]

    trade_history = torch.tensor(np.stack(sdf['trade_history'])).float()

    # One (n_rows, 1) float32 column per dense feature, glued together side by side.
    dense_columns = [
        sdf[name].to_numpy().astype('float32')[:, np.newaxis]
        for name in NON_CAT_FEATURES + BINARY
    ]
    dense = torch.tensor(np.concatenate(dense_columns, axis=-1))

    categorical = [
        torch.tensor(encoders[name].transform(sdf[name])).long()
        for name in CATEGORICAL_FEATURES
    ]

    return [trade_history, dense, *categorical]

def create_ys_label(df):
    """Return the ``yield_spread`` column of ``df`` as a float32 tensor.

    Args:
        df: data frame containing at least the ``IDENTIFIERS + PREDICTORS`` columns.

    Returns:
        1-D ``torch.Tensor`` of dtype float32, one entry per row of ``df``.
    """
    model_columns = df[IDENTIFIERS + PREDICTORS]
    spreads = model_columns.yield_spread.to_numpy()
    return torch.tensor(spreads).float()

def create_cdc_label(df):
    """Return the ``calc_day_cat`` column of ``df`` as an int32 tensor.

    Args:
        df: data frame containing at least the ``IDENTIFIERS + PREDICTORS`` columns.

    Returns:
        1-D ``torch.Tensor`` of dtype int32, one category code per row of ``df``.
    """
    # NOTE(review): the labels are int32; confirm the consumers do not need int64 (long).
    model_columns = df[IDENTIFIERS + PREDICTORS]
    categories = model_columns.calc_day_cat.to_numpy()
    return torch.tensor(categories).int()

def create_ys_cdc_label(df):
    """Return both labels for ``df`` as a ``(yield_spread, calc_day_cat)`` tuple.

    Args:
        df: data frame containing at least the ``IDENTIFIERS + PREDICTORS`` columns.

    Returns:
        tuple of the ``create_ys_label`` result and the ``create_cdc_label`` result.
    """
    model_columns = df[IDENTIFIERS + PREDICTORS]
    yield_spread_label = create_ys_label(model_columns)
    calc_day_label = create_cdc_label(df)
    return yield_spread_label, calc_day_label

# Splitting the data into train and test set.
# Despite its name, `train_index` is the row at which the training data STARTS:
# the first (1 - TRAIN_TEST_SPLIT) of the rows form the test set.
train_index = int(len(processed_data) * (1.0 - TRAIN_TEST_SPLIT))
# The test frame gets prediction columns added further down, so take an explicit copy
# instead of a slice of `processed_data`.
test_dataframe = processed_data[:train_index].copy()
train_dataframe = processed_data[train_index:]

# Split the training data into train and validation set:
# the first 10% of the remaining rows are held out for validation.
split_point = int(len(train_dataframe) * (1.0 - 0.9))
val_dataframe = train_dataframe[:split_point]
train_dataframe = train_dataframe[split_point:]

# The calc-date model is trained on the calculation-date category labels.
x_train = create_input(train_dataframe)
y_train = create_cdc_label(train_dataframe)
train_dataset = TensorDataset(*x_train, y_train)

x_val = create_input(val_dataframe)
y_val = create_cdc_label(val_dataframe)
val_dataset = TensorDataset(*x_val, y_val)

In [None]:
# Architecture and optimiser settings handed to the model constructor and to
# load_from_checkpoint.
model_params = {
    'num_trade_history_features': NUM_FEATURES,
    # NOTE(review): the key is named *_size but receives the list of feature names,
    # not its length — confirm this is what the model expects.
    'non_categorical_size': NON_CAT_FEATURES + BINARY,
    'category_sizes': fmax,
    'lstm_sizes': [50, 100],
    'embed_sizes': 15,
    'tabular_sizes': [400, 200, 100],
    'tabular_resblocks': 1,
    'final_sizes': [300, 100],
    'final_resblocks': 0,
    'dropout': 0.3758110031582248,
    'learning_schedule': 'constant',
    'learning_rate': 0.00017360566254027907,
    'weight_decay': 0.00039784787398219684
}

# NOTE: this rebinds the notebook-global `model` from the yield-spread model to the
# calc-date model; the yield-spread report generator must not be advanced after this cell.
model = get_model_instance(
    "lstm_calc_date_model_pytorch",
    **model_params)

# Reload the checkpoint of the best model, to this point
model = model.load_from_checkpoint(
    checkpoint_path="best_cd_model.ckpt",
    **model_params
)

# A freshly loaded module is in training mode, which keeps dropout active and makes
# every prediction (and attribution) random. Switch to evaluation mode first.
model.eval()

x_test = create_input(test_dataframe)
y_test = create_cdc_label(test_dataframe)
# No gradients are needed for plain predictions.
with torch.no_grad():
    predictions = model(*x_test)

# Work on an explicit copy so the new columns are not written to a slice of
# `processed_data` (avoids SettingWithCopyWarning and keeps the cell re-runnable).
test_dataframe = test_dataframe.copy()
# The predicted category is the index of the largest model output per trade.
test_dataframe.loc[:, 'calc_date_prediction'] = predictions.numpy().argmax(axis=1)
# Despite its name, 'calc_date_match' is the absolute category difference: 0 means a match.
test_dataframe.loc[:, 'calc_date_match'] = (test_dataframe.calc_day_cat - test_dataframe.calc_date_prediction).abs()
# Largest mismatch first: the analysis generator below relies on this order.
test_dataframe = test_dataframe.sort_values(by='calc_date_match', ascending=False)

In [None]:
def yield_calc_date_analysis(test_dataframe):
    """Lazily render one calc-date error report per CUSIP.

    Every ``next()`` on the returned generator displays a Markdown header plus the
    error-attribution figures and feature tables for the next CUSIP that has at least
    one correct AND at least one incorrect calc-date prediction, then yields ``None``.

    CUSIPs are visited in order of first appearance. The first trade of a CUSIP is
    reported as the failed ("Worst") prediction and the last one as the succeeded
    ("Best") prediction, so the frame is expected to be sorted by descending
    ``calc_date_match`` (the calling cell does this).

    The notebook globals ``model``, ``create_input`` and ``create_cdc_label`` are read
    when a report is rendered, not when the generator is created.

    Args:
        test_dataframe: frame with ``cusip``, ``rtrs_control_number``, ``calc_day_cat``,
            ``calc_date_prediction`` and the four date columns named in
            ``calc_date_cat_dict``, plus every column selected by ``create_input`` /
            ``create_cdc_label``.

    Yields:
        None, once per rendered CUSIP.
    """
    from IPython.display import display, Markdown

    # Calculation-date category -> column holding the corresponding date.
    calc_date_cat_dict = {0:'next_call_date',
    1:'par_call_date',
    2:'maturity_date',
    3:'refund_date'}

    def shade_alternate_rows(table, n_data_rows):
        # Grey out every odd table row. Row 0 is the column-header row, so a table with
        # n_data_rows data rows only has rows 0..n_data_rows; asking for more raises KeyError.
        for i in range(1, n_data_rows + 1, 2):
            for j in range(2):
                table[(i, j)].set_facecolor("#dddddd")

    numeric_labels = NON_CAT_FEATURES + BINARY

    # unique() keeps first-appearance order, so every CUSIP is handled exactly once and
    # the frame is no longer re-filtered for rows of an already reported CUSIP.
    for cusip in test_dataframe['cusip'].unique():
        cusip_dataframe = test_dataframe[test_dataframe.cusip == cusip]

        # Only CUSIPs with both a hit and a miss are interesting (this implies >= 2 trades).
        hits = cusip_dataframe.calc_date_prediction == cusip_dataframe.calc_day_cat
        if not (hits.any() and (~hits).any()):
            continue

        failed = cusip_dataframe.iloc[0]
        succeeded = cusip_dataframe.iloc[-1]

        # Each row's date is looked up with that row's OWN category. The failed lines
        # previously used the succeeded row's category and therefore showed the wrong date.
        display(Markdown(
            f"## CUSIP: {cusip} - RTRS Control Numbers: {failed['rtrs_control_number']} and {succeeded['rtrs_control_number']}\n" +
            f"### Actual Calculation Date (failed): {failed['calc_day_cat']} - {failed[calc_date_cat_dict[failed['calc_day_cat']]]}\n" +
            f"### Predicted Calculation Date (failed): {failed['calc_date_prediction']} - {failed[calc_date_cat_dict[failed['calc_date_prediction']]]}\n" +
            f"### Actual Calculation Date (succeeded): {succeeded['calc_day_cat']} - {succeeded[calc_date_cat_dict[succeeded['calc_day_cat']]]}\n" +
            f"### Predicted Calculation Date (succeeded): {succeeded['calc_date_prediction']} - {succeeded[calc_date_cat_dict[succeeded['calc_date_prediction']]]}"
        ))

        x_test = create_input(cusip_dataframe)
        y_test = create_cdc_label(cusip_dataframe)

        # The per-CUSIP plot is disabled for the calc-date report:
        # plot_cusip(cusip_dataframe, 'calc_date')

        # Indexed like the create_input() output:
        # [0] trade history, [1] numerical + binary features, [2:] categorical features.
        error_attributes = attr.compute_integrated_gradient_calc_date_error_attributions(model, x_test, y_test)

        fig, ax = plt.subplots(1, 2, figsize=(30, 10))
        fig.suptitle('Calc-Date Trade History ERROR Attributions')
        attr.visualize_trade_history_attribution(error_attributes[0][0], subtitle="Worst", fig_ax=(fig, ax[0]))
        attr.visualize_trade_history_attribution(error_attributes[0][-1], subtitle="Best", fig_ax=(fig, ax[1]))

        fig, ax = plt.subplots(1, 2, figsize=(30, 5))
        fig.suptitle('Calc-Date Numerical Feature ERROR Attributions')
        attr.visualize_trade_numerical_and_binary_attribution(error_attributes[1][0], NON_CAT_FEATURES, BINARY, subtitle="Worst", fig_ax=(fig, ax[0]))
        attr.visualize_trade_numerical_and_binary_attribution(error_attributes[1][-1], NON_CAT_FEATURES, BINARY, subtitle="Best", fig_ax=(fig, ax[1]))

        # Table of the (scaled) numerical / binary inputs of the worst and best trade.
        fig, ax = plt.subplots(1, 1, figsize=(20, 3))
        ax.set_axis_off()
        values = x_test[1][(0, -1), :].cpu().numpy()
        tbl = ax.table(cellText=values.T, rowLabels=numeric_labels, colLabels=["Worst", "Best"], loc='center')
        shade_alternate_rows(tbl, len(numeric_labels))

        fig, ax = plt.subplots(1, 2, figsize=(30, 3))
        fig.suptitle('Calc-Date Categorical Feature ERROR Attributions')
        categorical_error_attributes = torch.cat([a.unsqueeze(1) for a in error_attributes[2:]], dim=-1)
        attr.visualize_categorical_attribution(categorical_error_attributes[0], CATEGORICAL_FEATURES, subtitle="Worst", fig_ax=(fig, ax[0]))
        attr.visualize_categorical_attribution(categorical_error_attributes[-1], CATEGORICAL_FEATURES, subtitle="Best", fig_ax=(fig, ax[1]))

        # Table of the encoded categorical inputs of the worst and best trade.
        fig, ax = plt.subplots(1, 1, figsize=(20, 3))
        ax.set_axis_off()
        values = [[x[i].cpu().numpy() for i in (0, -1)] for x in x_test[2:]]
        tbl = ax.table(cellText=values, rowLabels=CATEGORICAL_FEATURES, colLabels=["Worst", "Best"], loc='center')
        # This table has one row per categorical feature (it was previously striped with
        # the numerical feature count, which addresses cells that do not exist).
        shade_alternate_rows(tbl, len(CATEGORICAL_FEATURES))

        yield

# Create the lazy report generator; nothing is rendered until next() is called on it.
calc_date_analysis_generator = yield_calc_date_analysis(test_dataframe)

# The ten cells below each render one CUSIP that has both correct and incorrect
# calc-date predictions.
# Every report contrasts that CUSIP's worst and best prediction.

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)

In [None]:
# Render the next CUSIP report; the None default keeps an exhausted generator from raising StopIteration and aborting Run All.
next(calc_date_analysis_generator, None)