In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import time

import numpy as np
from google.cloud import bigquery
from google.cloud import storage
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import seaborn as sns
from pandas.tseries.offsets import BDay

from datetime import datetime
import matplotlib.pyplot as plt
import pickle5 as pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
import lightgbm

from IPython.display import display, HTML
import os


from ficc.data.process_data import process_data
from ficc.utils.auxiliary_variables import PREDICTORS, NON_CAT_FEATURES, BINARY, CATEGORICAL_FEATURES, IDENTIFIERS, PURPOSE_CLASS_DICT
from ficc.utils.gcp_storage_functions import upload_data, download_data
from ficc.utils.auxiliary_variables import RELATED_TRADE_BINARY_FEATURES, RELATED_TRADE_NON_CAT_FEATURES, RELATED_TRADE_CATEGORICAL_FEATURES

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Extension horovod.torch has not been built: /opt/conda/lib/python3.7/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-37m-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.


In [2]:
# Process-wide configuration, set before any Google client or TensorFlow model
# is created. The credentials path is relative, so the file must be present in
# the notebook's working directory.
os.environ.update({
    "GOOGLE_APPLICATION_CREDENTIALS": "ahmad_creds.json",
    'TF_GPU_THREAD_MODE': 'gpu_private',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})
# Disable pandas' chained-assignment warning for the whole notebook; later
# cells add columns to frames that were produced by slicing other frames.
pd.options.mode.chained_assignment = None

Creating the BigQuery and Google Cloud Storage clients

In [3]:
# BigQuery client. NOTE(review): not referenced again in the visible part of
# this notebook — confirm it is still needed.
bq_client = bigquery.Client()
# Cloud Storage client; passed to download_data() when fetching the encoders.
storage_client = storage.Client()

Declaring hyper-parameters

In [4]:
# Batch size passed to model.predict() when scoring the test inputs.
BATCH_SIZE = 1000
# NOTE(review): SEQUENCE_LENGTH and NUM_FEATURES are not referenced in the
# visible part of this notebook; presumably they describe the trade_history
# input (steps per sequence x features per step) — confirm or remove.
SEQUENCE_LENGTH = 5
NUM_FEATURES = 6

### Creating test set

In [5]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell so all dependencies are visible in one place.
import gcsfs
# Read the pickled test set directly from Google Cloud Storage.
fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
with fs.open('ficc_training_data_latest/trade_history_heuristics_test.pkl') as f:
    # SECURITY: unpickling can execute arbitrary code; only load this file
    # from a bucket whose contents are trusted.
    data = pd.read_pickle(f)

In [6]:
# Latest trade date present in the loaded data (sanity check of date coverage).
data.trade_date.max()

Timestamp('2023-01-31 00:00:00')

Here is a list of exclusions that we will be experimenting with. The model is trained with these exclusions.
<ul>
    <li>Callable less than a year in the future</li>
    <li>Maturity less than a year in the future or more than 30 years in the future</li>
</ul>

In [7]:
# Apply all exclusions in one pass. A row is kept when each of days_to_call,
# days_to_refund and days_to_maturity is either exactly 0 or above log10(400),
# and days_to_maturity is below log10(30000).
# NOTE(review): the thresholds are written as log10(...), so these columns are
# presumably log10-scaled day counts — confirm. Also, 30000 days is roughly
# 82 years, while the accompanying text says 30 years; do not change the bound
# without checking what the model was trained with.
data = data[
    ((data['days_to_call'] == 0) | (data['days_to_call'] > np.log10(400)))
    & ((data['days_to_refund'] == 0) | (data['days_to_refund'] > np.log10(400)))
    & ((data['days_to_maturity'] == 0) | (data['days_to_maturity'] > np.log10(400)))
    & (data['days_to_maturity'] < np.log10(30000))
]

In [8]:
# Extend the imported PREDICTORS list in place. The membership check keeps
# the cell idempotent when it is re-run in the same kernel.
if 'target_attention_features' not in PREDICTORS:
    PREDICTORS.append('target_attention_features')

In [9]:
# Register ficc_treasury_spread both as a predictor and as a non-categorical
# feature (NON_CAT_FEATURES drives the numeric input built by create_input).
# NOTE(review): only PREDICTORS is checked; if the name were already in
# NON_CAT_FEATURES but not in PREDICTORS it would be appended there twice.
if 'ficc_treasury_spread' not in PREDICTORS:
    PREDICTORS.append('ficc_treasury_spread')
    NON_CAT_FEATURES.append('ficc_treasury_spread')

In [10]:
# Extra columns kept alongside IDENTIFIERS and PREDICTORS when selecting
# columns from `data`. 'yield' and 'new_ficc_ycl' are used later to derive the
# yield-spread target; 'trade_history_sum' is currently disabled.
auxiliary_features = ['dollar_price',
                     'calc_date', 
                     'trade_date',
                     'trade_datetime', 
                     'purpose_sub_class', 
                     'called_redemption_type', 
                     'calc_day_cat',
                     'yield',
                     'ficc_ycl',
#                      'trade_history_sum',
                     'new_ficc_ycl',
                     'days_to_refund',
                     'is_called']

In [11]:
# Keep only the identifier, predictor and auxiliary columns.
# .copy() makes this an independent frame: later cells modify it (column
# assignment, in-place dropna), which on a bare slice of `data` is only silent
# because chained-assignment warnings are disabled globally.
processed_data = data[IDENTIFIERS + PREDICTORS + auxiliary_features].copy()

In [12]:
# Row count before rows with missing predictor values are dropped.
len(processed_data)

666125

Removing NaN values.

The `trade_history_sum` feature (its computation is commented out in the next cell) is used to check whether there are any NaN values in the trade history. **It is not used to train the model.**

In [13]:
# processed_data['trade_history_sum'] = processed_data.trade_history.parallel_apply(lambda x: np.sum(x))
# processed_data.dropna(inplace=True, subset=PREDICTORS+['trade_history_sum'])

In [14]:
# Turn +/-inf in issue_amount into NaN so those values count as missing.
processed_data['issue_amount'] = processed_data['issue_amount'].replace([np.inf, -np.inf], np.nan)

In [15]:
# Drop, in place, every row that is missing a value in any PREDICTORS column.
# Infinite issue_amount values were converted to NaN beforehand, so they are
# removed here too if issue_amount is one of PREDICTORS.
processed_data.dropna(inplace=True, subset=PREDICTORS)

In [16]:
# Row count after the NaN drop; compare with the count taken before the drop.
len(processed_data)

666125

Creating yield spreads from new ficc ycl

In [17]:
# Yield-spread target: the traded yield minus new_ficc_ycl.
processed_data['new_ys'] = processed_data['yield'] - processed_data['new_ficc_ycl']

Isolating data to test models

In [18]:
# Evaluation window: trades dated 2023-01-01 through 2023-01-31, inclusive.
# ISO date literals avoid any month/day ambiguity. .copy() makes the result an
# independent frame, because prediction columns are added to it later and a
# bare boolean-mask slice would make those writes chained assignments.
test_dataframe = processed_data[
    (processed_data.trade_date >= '2023-01-01')
    & (processed_data.trade_date <= '2023-01-31')
].copy()

Converting data into format suitable for the model

In [19]:
# Fetch the fitted encoders; create_input() indexes the result by categorical
# feature name and calls .transform() on each entry.
# NOTE(review): argument meaning (client, bucket name, file name) is inferred
# from the call and its log output — confirm against download_data.
encoders = download_data(storage_client,'ahmad_data','encoders.pkl')

File encoders.pkl downloaded to ahmad_data.


In [20]:
def create_input(df, encoders):
    """Assemble the list of arrays that is fed to the model.

    Args:
        df: DataFrame holding 'trade_history', 'target_attention_features'
            and every column named in NON_CAT_FEATURES, BINARY and
            CATEGORICAL_FEATURES.
        encoders: mapping from categorical feature name to an object with a
            ``transform`` method.

    Returns:
        A list, in this order: the stacked 'trade_history' values, the stacked
        'target_attention_features' values, one float32 matrix with a column
        per NON_CAT_FEATURES + BINARY feature, and then one float32 array per
        CATEGORICAL_FEATURES entry.
    """
    # Per-row arrays stored in object columns are stacked along a new axis 0.
    sequence_inputs = [
        np.stack(df[column].to_numpy())
        for column in ('trade_history', 'target_attention_features')
    ]

    # Each numeric/binary feature becomes an (n_rows, 1) float32 column.
    numeric_columns = [
        df[name].to_numpy().astype('float32')[:, np.newaxis]
        for name in NON_CAT_FEATURES + BINARY
    ]
    numeric_block = np.concatenate(numeric_columns, axis=-1)

    # Categorical features go through their fitted encoder.
    categorical_inputs = [
        encoders[name].transform(df[name]).astype('float32')
        for name in CATEGORICAL_FEATURES
    ]

    return sequence_inputs + [numeric_block] + categorical_inputs

In [21]:
%%time
x_test = create_input(processed_data, encoders)

CPU times: user 2.37 s, sys: 66.4 ms, total: 2.44 s
Wall time: 2.44 s


### Loading New ys model

In [22]:
# Load the saved Keras model from a path relative to the working directory.
# NOTE(review): hard-coded local path; a fresh run fails unless this saved
# model is present next to the notebook.
new_ys_model = keras.models.load_model('saved_model_test_2023-03-13-22-43')

In [23]:
# Score the prepared inputs in batches of BATCH_SIZE.
new_ys_predictions = new_ys_model.predict(x_test, batch_size=BATCH_SIZE)

In [24]:
# Attach the predictions positionally: new_ys_predictions must hold one value
# per row of test_dataframe, in the same row order as the frame that x_test
# was built from.
test_dataframe['predicted_new_ys'] = new_ys_predictions

In [25]:
# Predicted yield = new_ficc_ycl + predicted spread, i.e. the inverse of how
# the new_ys target was derived from 'yield'.
test_dataframe['predicted_new_ytw'] = test_dataframe['new_ficc_ycl'] + test_dataframe['predicted_new_ys']

## Measuring accuracy of model on test data with filtering on trade history 

In [26]:
def calculate_mae(df, column_1, column_2):
    """Return the mean absolute difference between two columns of ``df``.

    Also prints the number of rows in ``df`` so each reported error can be
    read together with its sample size.

    Args:
        df: frame containing both columns.
        column_1: name of the first column.
        column_2: name of the second column.

    Returns:
        Mean of ``|df[column_1] - df[column_2]|``.
    """
    sample_count = len(df)
    print(f"Number of samples: {sample_count}")
    errors = df[column_1] - df[column_2]
    return np.mean(np.abs(errors))

Yield spread MAE

In [27]:
# MAE between the actual and the predicted yield spread.
calculate_mae(test_dataframe,'new_ys', 'predicted_new_ys')

Number of samples: 666125


11.732624409651304

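Yield to worst MAE (equal to the yield spread MAE by construction, up to floating-point rounding: the actual yield and the predicted yield are both offset from their spreads by the same `new_ficc_ycl`)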

In [28]:
# MAE between the actual yield and the predicted yield.
calculate_mae(test_dataframe,'yield','predicted_new_ytw')

Number of samples: 666125


11.732624409651304

Large dealer-to-dealer trades MAE

In [29]:
# Restrict to trade_type 'D' with quantity at or above log10(500000).
# NOTE(review): the threshold is written as log10(...), so `quantity` is
# presumably stored log10-scaled — confirm.
large_dealer_dealer_trades = test_dataframe[(test_dataframe.trade_type == 'D') & (test_dataframe.quantity >= np.log10(500000))]
calculate_mae(large_dealer_dealer_trades, 'new_ys', 'predicted_new_ys')

Number of samples: 13887


7.806897540219416

In [30]:
# Yield-spread MAE for trades whose last_seconds_ago exceeds a growing number
# of days, expressed in seconds.
SECONDS_PER_DAY = 24 * 60 * 60
for days in (30, 60, 100, 200, 300, 360):
    min_age_seconds = days * SECONDS_PER_DAY
    stale_trades = test_dataframe[test_dataframe['last_seconds_ago'] > min_age_seconds]
    mae = calculate_mae(stale_trades, 'new_ys', 'predicted_new_ys')
    print(f"Days before which last trade occurred :{days} MAE: {mae}")

Number of samples: 88893
Days before which last trade occurred :30 MAE: 19.558742629548135
Number of samples: 45037
Days before which last trade occurred :60 MAE: 20.662442372056937
Number of samples: 27103
Days before which last trade occurred :100 MAE: 21.499756051836275
Number of samples: 14589
Days before which last trade occurred :200 MAE: 22.942774714689175
Number of samples: 8063
Days before which last trade occurred :300 MAE: 24.312782739390794
Number of samples: 5909
Days before which last trade occurred :360 MAE: 25.182038661774552
