### Comparing dollar price and yield spread model

This notebook measures the accuracy of the dollar price model on a specific subset of the data.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import time

import numpy as np
from google.cloud import bigquery
from google.cloud import storage
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import seaborn as sns

from datetime import datetime
import matplotlib.pyplot as plt
import pickle
from lightgbm import LGBMRegressor
import lightgbm

from IPython.display import display, HTML
import os

from ficc.pricing.price import compute_price
from ficc.data.process_data import process_data
from ficc.utils.auxiliary_variables import PREDICTORS, NON_CAT_FEATURES, BINARY, CATEGORICAL_FEATURES, IDENTIFIERS, NON_CAT_FEATURES_DOLLAR_PRICE, BINARY_DOLLAR_PRICE, CATEGORICAL_FEATURES_DOLLAR_PRICE
from ficc.utils.gcp_storage_functions import upload_data, download_data
from ficc.utils.auxiliary_variables import COUPON_FREQUENCY_TYPE

Initializing pandarallel with 16.0 cores
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Turn on memory growth for every GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

2023-11-14 17:50:00.434180: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-14 17:50:00.537686: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-14 17:50:00.538389: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [3]:
# Point the Google client libraries at a service-account key file, but only when
# the environment has not already supplied one, so credentials configured outside
# the notebook are not overwritten by this user-specific relative path.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "../ahmad_creds.json")
# NOTE(review): TensorFlow is imported in the first cell, before these variables are
# set, and INFO-level TensorFlow logs still appear in later cell outputs. They may be
# read too late to take effect - consider setting them before `import tensorflow`.
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Disable pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

Creating the BigQuery and Google Cloud Storage clients

In [4]:
# Clients are built with default arguments, so they rely on ambient credentials
# (GOOGLE_APPLICATION_CREDENTIALS is set in the previous cell).
bq_client = bigquery.Client()
storage_client = storage.Client()

In [5]:
# Extend the imported feature lists in place. The membership checks keep this cell
# idempotent, so re-running it does not append duplicates.
if 'ficc_treasury_spread' not in PREDICTORS:
    PREDICTORS.append('ficc_treasury_spread')
    NON_CAT_FEATURES.append('ficc_treasury_spread')
# 'target_attention_features' is added to PREDICTORS only, not to NON_CAT_FEATURES.
if 'target_attention_features' not in PREDICTORS:
    PREDICTORS.append('target_attention_features')

Declaring hyper-parameters

In [6]:
# Batch size passed to dollar_price_model.predict below.
BATCH_SIZE = 1000
# NOTE(review): SEQUENCE_LENGTH and NUM_FEATURES are not referenced anywhere in the
# visible part of this notebook - confirm whether they are still needed.
SEQUENCE_LENGTH = 5
# Number of leading trade_history / target_attention_features entries kept per row.
DOLLAR_SEQUENCE_LENGTH = 2
NUM_FEATURES = 6

Loading data from the GCP bucket. This file only contains trades with negative yields.

In [11]:
%%time
# NOTE(review): this import lives outside the notebook's main import cell.
import gcsfs
fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
# Read the pickled dataset straight from the bucket. Unpickling can execute arbitrary
# code, so this must only ever be pointed at a trusted file.
with fs.open('ficc_training_data_latest/processed_data_coupon_3.pkl') as f:
    data = pd.read_pickle(f)

CPU times: user 38.9 ms, sys: 6.6 ms, total: 45.5 ms
Wall time: 111 ms


In [12]:
len(data)

5041

In [9]:
# Horizon filters. The thresholds are written as np.log10(<days>), so these columns
# are presumably stored on a log10 scale - TODO confirm against the data pipeline.
# A value of exactly 0 is always kept; any other value must exceed log10(400).
# NOTE(review): the saved execution count of this cell (9) is lower than that of the
# load cell (11), and len(data) is 5041 both before and after it, so these filters may
# not have been applied to the data currently in memory. Re-run the notebook in order.
data = data[(data.days_to_call == 0) | (data.days_to_call > np.log10(400))]
data = data[(data.days_to_refund == 0) | (data.days_to_refund > np.log10(400))]
data = data[(data.days_to_maturity == 0) | (data.days_to_maturity > np.log10(400))]
# Upper bound: keep only rows with days_to_maturity below log10(30000).
data = data[data.days_to_maturity < np.log10(30000)]

In [13]:
len(data)

5041

In [14]:
# Keep only the first DOLLAR_SEQUENCE_LENGTH entries of each row's trade history and
# of the matching target attention features.
print(f'Restricting history to {DOLLAR_SEQUENCE_LENGTH} trades')
data.trade_history = data.trade_history.apply(lambda x: x[:DOLLAR_SEQUENCE_LENGTH])
data.target_attention_features = data.target_attention_features.apply(lambda x:x[:DOLLAR_SEQUENCE_LENGTH])

Restricting history to 2 trades


In [15]:
print(data.trade_history.iloc[0].shape)

(2, 5)


For a few CUSIPs, the last trade took place prior to October 2020. As a result, we lack features or data pertaining to these trades since our available data begins from October 2020 onwards.

In [16]:
# Replace missing "last trade" features with 0. The result is assigned back
# explicitly: calling fillna(..., inplace=True) on data[col] is a chained in-place
# update that acts on a temporary object under pandas Copy-on-Write and would then
# leave `data` unchanged.
last_trade_columns = ['last_yield_spread', 'last_seconds_ago', 'last_dollar_price']
data[last_trade_columns] = data[last_trade_columns].fillna(0)
# data.dropna(subset=['new_ficc_ycl'], inplace=True)

In [17]:
len(data)

5041

Loading the encoders for the dollar price model

In [18]:
# Fetch the encoders via the project helper. The object is later indexed by
# categorical feature name and each entry's .transform() is called on that column.
encoders_dollar_price = download_data(storage_client,'automated_training','encoders_dollar_price.pkl')

File encoders_dollar_price.pkl downloaded to automated_training.


Dollar price trade history features. This is an adaptation of the code from Charles's notebook.

In [19]:
# Sort oldest-first. The derived-feature function below keeps per-CUSIP caches that
# are updated row by row, so its output depends on the order rows are visited in.
data.sort_values('trade_datetime', inplace=True)

In [20]:
# Maps the pair (trade[2], trade[3]) of a historical trade to a one-letter trade-type code.
ttype_dict = { (0,0):'D', (0,1):'S', (1,0):'P' }

# Column-name prefixes: which reference trade a group of features describes.
dp_variants = ["max_dp", "min_dp", "max_qty", "min_ago", "D_min_ago", "P_min_ago", "S_min_ago"]
# Column-name suffixes: the four values extracted from each reference trade
# (price, trade-type pair, time since the trade, quantity difference).
dp_feats = ["_dp", "_ttypes", "_ago", "_qdiff"]
# Per-CUSIP caches of the latest D / P / S trade selected so far. They are read and
# written by trade_history_derived_features_dollar_price.
D_prev = dict()
P_prev = dict()
S_prev = dict()

def get_trade_history_columns_dollar_price():
    '''
    Build the column names for the derived trade-history features.

    Every prefix in dp_variants is combined with every suffix in dp_feats; the
    suffixes vary fastest, so all columns of one variant are adjacent.
    '''
    return [variant + feature for variant in dp_variants for feature in dp_feats]

def extract_feature_from_trade_dollar_price(row, name, trade):
    '''
    Turn one historical trade into the four features stored for it.

    row   : the current trade; its trade_type and quantity attributes are read.
    name  : variant label of the historical trade (not used in the computation).
    trade : indexable historical trade; positions 0-4 are read.

    Returns [dollar price, trade-type pair, seconds-ago value, quantity difference].
    The trade-type pair is the historical trade's code from ttype_dict followed by
    the current row's trade_type. The quantity difference is
    log10(1 + |10**trade[1] - 10**row.quantity|).
    '''
    side_pair = ttype_dict[(trade[2], trade[3])] + row.trade_type
    past_size = 10**trade[1]
    current_size = 10**row.quantity
    size_gap = np.log10(1 + np.abs(past_size - current_size))
    return [trade[0], side_pair, trade[4], size_gap]

def trade_history_derived_features_dollar_price(row):
    '''
    Derive summary features from one row's trade history.

    Seven reference trades are selected from row.trade_history, in the order of
    dp_variants: highest price, lowest price, largest quantity, most recent, and
    the most recent trade of each type D, P and S. Each reference trade is passed
    to extract_feature_from_trade_dollar_price, and the four values it returns
    are concatenated, giving a flat list of 7 * 4 = 28 entries.

    Each historical trade is read positionally: [0] price, [1] quantity,
    [2] and [3] the pair looked up in ttype_dict, [4] time since the trade.

    Side effects: D_prev / P_prev / S_prev are updated for row.cusip, so the
    result depends on the order in which rows are processed.

    Raises IndexError when row.trade_history is empty, and KeyError when a
    trade's (trade[2], trade[3]) pair is not a key of ttype_dict.
    '''
    trade_history = row.trade_history
    trade = trade_history[0]
    
    # For each trade type, fall back to the trade cached for this CUSIP (or to the
    # first trade in the history) when no trade of that type is found below.
    # The 9 is the initial "time ago" threshold a trade must beat to be selected.
    # NOTE(review): presumably a log-scale upper bound on trade[4] - confirm.
    D_min_ago_t = D_prev.get(row.cusip,trade)
    D_min_ago = 9        

    P_min_ago_t = P_prev.get(row.cusip,trade)
    P_min_ago = 9
    
    S_min_ago_t = S_prev.get(row.cusip,trade)
    S_min_ago = 9
    
    # Seed every extreme with the first trade, even if that trade is skipped by
    # the trade[4] <= 0 check inside the loop.
    max_dp_t = trade; max_dp = trade[0]
    min_dp_t = trade; min_dp = trade[0]
    max_qty_t = trade; max_qty = trade[1]
    min_ago_t = trade; min_ago = trade[4]
    
    # The loop variable reuses the name `trade`, so after the loop it refers to the
    # last element visited rather than to trade_history[0].
    for trade in trade_history[0:]:
        #Checking if the first trade in the history is from the same block
        # (any trade with a non-positive "time ago" value is ignored)
        if trade[4] <= 0: 
            continue
 
        if trade[0] > max_dp: 
            max_dp_t = trade
            max_dp = trade[0]
        elif trade[0] < min_dp: 
            min_dp_t = trade; 
            min_dp = trade[0]

        if trade[1] > max_qty: 
            max_qty_t = trade 
            max_qty = trade[1]
        if trade[4] < min_ago: 
            min_ago_t = trade; 
            min_ago = trade[4]
            
        # An unknown (trade[2], trade[3]) pair raises KeyError here.
        side = ttype_dict[(trade[2],trade[3])]
        if side == "D":
            if trade[4] < D_min_ago: 
                D_min_ago_t = trade
                D_min_ago = trade[4]
                # Remember this trade for later rows of the same CUSIP.
                D_prev[row.cusip] = trade
        elif side == "P":
            if trade[4] < P_min_ago: 
                P_min_ago_t = trade
                P_min_ago = trade[4]
                P_prev[row.cusip] = trade
        elif side == "S":
            if trade[4] < S_min_ago: 
                S_min_ago_t = trade
                S_min_ago = trade[4]
                S_prev[row.cusip] = trade
        else: 
            # NOTE(review): with ttype_dict holding only 'D', 'S' and 'P' values, and
            # missing keys raising KeyError above, this branch cannot be reached.
            print("invalid side", trade)
    
    trade_history_dict = {"max_dp":max_dp_t,
                          "min_dp":min_dp_t,
                          "max_qty":max_qty_t,
                          "min_ago":min_ago_t,
                          "D_min_ago":D_min_ago_t,
                          "P_min_ago":P_min_ago_t,
                          "S_min_ago":S_min_ago_t}

    # Flatten the four features of each reference trade, in dp_variants order.
    return_list = []
    for variant in dp_variants:
        feature_list = extract_feature_from_trade_dollar_price(row,variant,trade_history_dict[variant])
        return_list += feature_list
    
    return return_list

In [21]:
# NOTE(review): the applied function reads and writes the module-level D_prev / P_prev /
# S_prev caches. If parallel_apply runs chunks of rows in separate worker processes (the
# Pandarallel log output above suggests it does), each worker would update its own copy
# of those caches - confirm the result matches a sequential `.apply(..., axis=1)`.
temp = data[['cusip','trade_history','quantity','trade_type']].parallel_apply(trade_history_derived_features_dollar_price, axis=1)

In [22]:
# Column names for the derived features, one per (variant, feature) combination.
YS_COLS_DOLLAR_PRICE = get_trade_history_columns_dollar_price()

In [23]:
# Expand the per-row feature lists into one column per name, aligned on data's index.
data[YS_COLS_DOLLAR_PRICE] = pd.DataFrame(temp.tolist(), index=data.index)

In [24]:
# Re-sort newest-first now that the order-dependent features have been computed.
data.sort_values('trade_datetime',ascending=False,inplace=True)

In [25]:
len(data)

5041

In [26]:
def create_input_dollar_price(df, encoders):
    '''
    Assemble the list of input arrays for the dollar price model.

    The returned list contains, in order:
      1. the stacked 'trade_history' arrays,
      2. the stacked 'target_attention_features' arrays,
      3. one float32 matrix with a column per feature in
         NON_CAT_FEATURES_DOLLAR_PRICE + BINARY_DOLLAR_PRICE,
      4. one float32 array per feature in CATEGORICAL_FEATURES_DOLLAR_PRICE,
         produced by encoders[feature].transform on that column.
    '''
    model_inputs = [
        np.stack(df[sequence_column].to_numpy())
        for sequence_column in ('trade_history', 'target_attention_features')
    ]

    numeric_columns = [
        df[feature].to_numpy().astype('float32')[:, np.newaxis]
        for feature in NON_CAT_FEATURES_DOLLAR_PRICE + BINARY_DOLLAR_PRICE
    ]
    model_inputs.append(np.concatenate(numeric_columns, axis=-1))

    model_inputs.extend(
        encoders[feature].transform(df[feature]).astype('float32')
        for feature in CATEGORICAL_FEATURES_DOLLAR_PRICE
    )
    return model_inputs

In [27]:
# Build the model inputs from the full frame, in its current (newest-first) row order.
x_test_dollar_price = create_input_dollar_price(data, encoders_dollar_price)

#### Loading models

In [28]:
# The path is relative, so it is resolved against the notebook's working directory.
dollar_price_model = keras.models.load_model('saved_models/dollar-model-08-31')

2023-11-14 17:50:19.782846: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-14 17:50:19.786501: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-14 17:50:19.787317: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-14 17:50:19.787904: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [29]:
# Run inference over every row, BATCH_SIZE rows at a time.
dollar_price_predictions = dollar_price_model.predict(x_test_dollar_price, batch_size=BATCH_SIZE)

2023-11-14 17:50:34.525925: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:689] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "Tesla T4" frequency: 1590 num_cores: 40 environment { key: "architecture" value: "7.5" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 65536 memory_size: 14488961024 bandwidth: 320064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-11-14 17:50:38.584026: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200


Converting predicted dollar price to yield to worst

In [30]:
# Predictions are attached positionally, so `data` must still be in the row order
# used when x_test_dollar_price was built.
data['predicted_dollar_price'] = dollar_price_predictions

In [31]:
from ficc.pricing.yield_rate import compute_yield

In [32]:
def get_predicted_yield(row):
    '''
    Convert a row's predicted dollar price into a yield.

    Calls compute_yield(row, price=row.predicted_dollar_price) and returns the
    resulting (ytw, calc_date) pair.

    If the conversion raises an ordinary exception, the row is reported as
    (None, None) so that a bulk apply over many rows keeps going. Only
    Exception subclasses are caught: KeyboardInterrupt and SystemExit propagate,
    so a long-running apply can still be interrupted.
    '''
    try:
        ytw, calc_date = compute_yield(row, price = row.predicted_dollar_price)
    except Exception:
        # Best effort: a row whose yield cannot be computed gets empty values.
        ytw, calc_date = None, None
    return ytw, calc_date

In [33]:
%%time
# Convert each predicted price to a yield. Every row produces a (ytw, calc_date)
# pair, which is then split into two columns aligned on data's index.
# The function is passed directly; wrapping it in a lambda added nothing.
temp = data.parallel_apply(get_predicted_yield, axis=1)
data[['converted_yield','dp_calc_date']] = pd.DataFrame(temp.tolist(), index=data.index)

CPU times: user 126 ms, sys: 418 ms, total: 545 ms
Wall time: 3.58 s


In [34]:
# Scale the converted yield by 100 in place. Not idempotent: re-running this cell
# multiplies the column by 100 again.
data.converted_yield *= 100

In [36]:
# Keep only rows where callable_at_cav is False.
data = data[data.callable_at_cav == False]

In [37]:
len(data)

5041

In [39]:
# Divide the observed yield by 100 in place. Not idempotent: re-running this cell
# divides the column again.
# NOTE(review): presumably this puts 'yield' in the same units as converted_yield - confirm.
data['yield'] /= 100

In [40]:
# Signed errors: observed value minus the model's value.
data['price_delta'] = data.dollar_price - data.predicted_dollar_price
data['yield_delta'] = data['yield'] - data.converted_yield

In [41]:
# Mean absolute error of the yield and price deltas.
# NOTE(review): price_error is computed but never displayed in the visible cells.
yield_error = np.mean(np.abs(data['yield_delta']))
price_error = np.mean(np.abs(data['price_delta']))

In [42]:
yield_error

20.38299595141701

In [56]:
data[data.price_delta > 50][['cusip','rtrs_control_number','trade_datetime','dollar_price','predicted_dollar_price','trade_history']]

Unnamed: 0,cusip,rtrs_control_number,trade_datetime,dollar_price,predicted_dollar_price,trade_history
31,499652AM7,2023092911141800,2023-09-29 15:17:46,98.095,30.965151,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
32,89438UBM9,2023092911116900,2023-09-29 15:16:52,96.845,29.922115,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
53,097067NL2,2023092910623500,2023-09-29 14:56:24,96.695,29.286558,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
179,68608ENB4,2023092904843900,2023-09-29 11:37:11,96.357,32.548153,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
180,68608ENB4,2023092904844000,2023-09-29 11:37:11,96.457,33.548046,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
...,...,...,...,...,...,...
5011,647207BB4,2023090101741900,2023-09-01 10:02:56,100.999,30.602377,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
5026,880443JF4,2023090101140200,2023-09-01 09:34:02,101.861,32.106697,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
5027,84136FBB2,2023090101140100,2023-09-01 09:34:00,101.953,32.072739,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"
5028,880443JF4,2023090101129600,2023-09-01 09:34:00,101.761,30.890341,"[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"


In [44]:
data['price_delta'].describe()

count    5003.000000
mean       11.485913
std        25.640822
min       -10.415552
25%        -0.381756
50%         0.173987
75%         1.219654
max        83.299492
Name: price_delta, dtype: float64

In [39]:
# Write the evaluated frame, without its index, to the working directory.
# NOTE(review): this cell's saved execution count (39) repeats an earlier one, so it
# may have been run before the later cells - re-run in order before relying on the file.
data.to_csv('coupon_type_3_prediction.csv',index=False)