### This module independently:
let a model predict new_ys, which is the estimated YS now assuming last_duration

add new_ficc_ycl and get predicted YTW

predict calc_date

In [1]:
# Author: Gil
# Date: 2022-01-03
# Last Modified by: Gil
# Last Modified time: 2023-01-18

# imports:

import gcsfs
import warnings
import pandas as pd
import pickle
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np  
import torch
import sklearn
import ficc.utils.globals as globals
import seaborn as sns
import math
import tensorflow as tf
from datetime import timedelta ,datetime

from google.cloud import storage
from google.cloud import bigquery
from tensorflow import keras
from IPython.display import display, HTML
from matplotlib.colors import to_rgb, to_rgba

from ficc.utils.yield_curve import get_ficc_ycl,yield_curve_level
from ficc.utils.gcp_storage_functions import download_data
from ficc.utils.auxiliary_variables import PREDICTORS, IDENTIFIERS, CATEGORICAL_FEATURES, NON_CAT_FEATURES, BINARY
from ficc.models import get_model_instance
from ficc.data.process_data import process_data
from ficc.pricing.auxiliary_functions import transform_reference_data
from ficc.pricing.price import compute_price
from calendars import get_day_before

from ficc.utils.diff_in_days import diff_in_days_two_dates
from ficc.utils.auxiliary_variables import NUM_OF_DAYS_IN_YEAR
from tensorflow.keras.models import Model

warnings.filterwarnings('ignore')
sns.set(rc={'figure.figsize':(12,9)})

# Globals:

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/gil/git/ficc/creds.json"

calc_date_cat_dict = {0:'next_call_date',
    1:'par_call_date',
    2:'maturity_date',
    3:'refund_date'}

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Testing the latest attention model:

In [2]:
# Model Trained Until end of 2022:
yield_spread_model = keras.models.load_model('/Users/gil/git/ficc/notebooks/measures_accuracy/model_jan_18_new_ys')

2023-01-24 13:13:38.941681: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Functions:

def create_input(df):
    global encoders
    datalist = []
    datalist.append(np.stack(df['trade_history'].to_numpy()))
    datalist.append(np.stack(df['target_attention_features'].to_numpy()))

    noncat_and_binary = []
    for f in NON_CAT_FEATURES + BINARY:
        noncat_and_binary.append(np.expand_dims(df[f].to_numpy().astype('float32'), axis=1))
    datalist.append(np.concatenate(noncat_and_binary, axis=-1))
    
    for f in CATEGORICAL_FEATURES:
        print(f)
        encoded = encoders[f].transform(df[f])
        datalist.append(encoded.astype('float32'))
    
    return datalist

def get_spread(df):

    '''
    This function takes a dataframe, encodes the features, and returns yield spread estimates.
    NB: This is only used by get_prediction_from_individual_pricing.  In order to be DRY, get_BB11_table and get_ytw_curve
    should also use this function. 
    '''
    
    inputs = create_input(df)
    global yield_spread_model
    preds = yield_spread_model.predict(inputs)
    preds = preds.reshape(len(preds))
    return preds

# plot points with transparency depending on size of trade
def drawpoints(preds, target, test_dataframe):
    # >> Only draw Dealer-Dealer trades <<
    r, g, b = to_rgb('purple')
    keep = (10**test_dataframe.quantity > threshold)  # save time by not plotting points that would be invisible
    w = np.minimum( np.array(10**test_dataframe.quantity[keep]), threshold)
    opacity = w/threshold
    color = [(r, g, b, alpha) for alpha in opacity]
    plt.scatter(preds[keep], target[keep], s=5, c=color)

def category_to_calc_date(row):
    if row.ficc_calc_date_cat == 0:
        if not row.is_callable and not row.is_called:
            return row['maturity_date']
        elif not row.is_callable and row.is_called:  
            return row.refund_date
        else:  
            return row['next_call_date']
    elif row.ficc_calc_date_cat == 1:
        return row['par_call_date']
    elif row.ficc_calc_date_cat == 2:
        return row['maturity_date']
    elif row.ficc_calc_date_cat == 3:
        if not row.is_called:
            return row.maturity_date
        else:
            return row.refund_date
    else:
        print(row.ficc_calc_date_cat)
        raise ValueError("Calculation date not found")

def get_trade_price(trade):
    # compute price does not need to return the calc_date, if we are using the calc_date model: 
    final, _ = compute_price(trade, trade.ficc_ytw)
    return final

def target_trade_processing_for_attention(row):
    trade_mapping = {'D':[0,0], 'S':[0,1], 'P':[1,0]}
    target_trade_features = []
    target_trade_features.append(row['quantity'])
    target_trade_features = target_trade_features + trade_mapping[row['trade_type']]
    return np.tile(target_trade_features, (5,1))

The test data below should start after the date for which the model is trained.

In [4]:
from google.cloud import bigquery
from ficc.data.process_data import process_data

SEQUENCE_LENGTH = 5
NUM_FEATURES = 6

DATA_QUERY = '''
  SELECT
    * except(most_recent_event)
  FROM
    `eng-reactor-287421.auxiliary_views.materialized_trade_history`
  WHERE
    trade_date >= '2022-11-01'
    AND trade_date <= '2022-11-05'
    AND msrb_valid_to_date > current_date -- condition to remove cancelled trades
  ORDER BY
    trade_datetime desc
LIMIT 1000
'''

bq_client = bigquery.Client()

# trade_data = process_data(DATA_QUERY, 
#                 bq_client,
#                 SEQUENCE_LENGTH,
#                 NUM_FEATURES,
#                 'data.pkl',
#                 'FICC_NEW',
#                 estimate_calc_date=False,
#                 remove_short_maturity=False,
#                 remove_non_transaction_based=False,
#                 remove_trade_type = [],
#                 trade_history_delay = 0,
#                 min_trades_in_history = 0,
#                 process_ratings=False,
#                 treasury_spread = True,
#                 add_previous_treasury_rate=True,
#                 add_previous_treasury_difference=True,
#                 use_last_duration=False,
#                 add_flags=False)

# trade_data.to_pickle("processed_data.pkl")

trade_data = pd.read_pickle("ahmad_jan_data.pkl")

In [5]:
#trade_data['yield'] = trade_data['yield']/100

In [6]:
trade_data = trade_data[trade_data.trade_date >= "2023-01-01"]

In [7]:
# trade_data['ttypes'] = (trade_data.last_trade_type.astype(str) + trade_data.trade_type.astype(str)).astype('category')
# if 'ttypes' not in CATEGORICAL_FEATURES:
#     CATEGORICAL_FEATURES.append('ttypes')
#     PREDICTORS.append('ttypes')

In [8]:
#for the attention model
trade_data['target_attention_features'] = trade_data.apply(target_trade_processing_for_attention, axis = 1)

if 'target_attention_features' not in PREDICTORS:
    PREDICTORS.append('target_attention_features')

In [9]:
with open('/Users/gil/git/ficc/notebooks/measures_accuracy/encoders.pkl','rb') as f:
    encoders = pickle.load(f)
    
print(encoders)

{'rating': LabelEncoder(), 'incorporated_state_code': LabelEncoder(), 'trade_type': LabelEncoder(), 'purpose_class': LabelEncoder()}


In [10]:
data = trade_data

We currently cover fixed coupon municipals with the following exceptions:

Variable coupon
Derivatives 
Zeros*
Term bonds 
Territories (VI, GU, PR)
Called bonds*
Crossover refunding and partially pre-refunded
Maturity less than a year in the future and more than 30 years in the future
Callable less than a year in the future 
Restructured debt
Defaulted securities 
Private placement/subject to SEC regulation 144A
Purpose Types:
    "Assisted_living"
    "Continuing Care Retirement Center"
    "Convention centers"
    "Correctional facilities"
    “Harbor/chanel"
    "Mall"
    "Single family housing"
    “Real Estate”


*Verify that these are necessary 

In [11]:
# data = data[(data.days_to_call == 0) | (data.days_to_call > np.log10(400))]
# data = data[(data.days_to_refund == 0) | (data.days_to_refund > np.log10(400))]
# data = data[data.days_to_maturity < np.log10(30000)]

# data = data[data.incorporated_state_code != 'PR']
# data = data[data.incorporated_state_code != 'VI']
# data = data[data.incorporated_state_code != 'GU']

# data = data[~data.purpose_sub_class.isin([6, 20, 22, 44, 57, 90])]
# data = data[~data.called_redemption_type.isin([18, 19])]
# data = data[~data.purpose_class.isin([44,35])]

# data = data.loc[:,~data.columns.duplicated()].copy()

df = data

df = df[df.rating != 'CCC']

if 'ficc_treasury_spread' not in PREDICTORS:
    PREDICTORS.append('ficc_treasury_spread')
    NON_CAT_FEATURES.append('ficc_treasury_spread')

predicted_spreads = get_spread(df[PREDICTORS])
df = df.copy()
df['ficc_spreads'] = predicted_spreads #new_ys NOT ficc_spreads

rating
incorporated_state_code
trade_type
purpose_class


In [12]:
# gbt_pred = model.predict(gbmprep(df[PREDICTORS]) )    
# df['ficc_ytw'] = np.round(gbt_pred[0]/100,2) #ytw
# df['yield'] = np.round(gbt_pred[0]/100,2) #ytw
# df['interest_payment_frequency'] = df.orig_interest_payment_frequency

In [13]:
df['error'] =  np.abs(df.yield_spread - df.ficc_spreads)

### Accuracy

In [14]:

#df = df[(df.trade_type == 'D') & (df.quantity >= np.log10(500000))]

daily_maes = []
dates = []
for d in [d for d in pd.date_range(start="01/01/2023",end="01/20/2023",freq='D')]:
    next_day = df[df.trade_date == d].copy()
    error = next_day.yield_spread - next_day.ficc_spreads
    MAE = np.mean(np.abs(error))
    daily_maes.append(MAE)
    dates.append(d)
    if not math.isnan(MAE):
        print(f"MAE for {d}: {MAE}")

rated = df[(df.rating != 'MR') & (df.rating != 'NR')].copy()
error = rated.yield_spread - rated.ficc_spreads
MAE = np.mean(np.abs(error))
print(f"\nAll Rated YS MAE: {round(MAE,3)}")

not_rated = df[(df.rating == 'MR')].copy()
error = not_rated.yield_spread - not_rated.ficc_spreads
MAE = np.mean(np.abs(error))
print(f"\nMissing Rating only YS MAE: {round(MAE,3)}")

not_rated = df[(df.rating == 'MR') | (df.rating == 'NR')].copy()
error = not_rated.yield_spread - not_rated.ficc_spreads
MAE = np.mean(np.abs(error))
print(f"\nAll Not Rated, MR & NRs YS MAE: {round(MAE,3)}")

error = df.yield_spread - df.ficc_spreads
MAE = np.mean(np.abs(error))
print(f"\nAll Trades Yield Spread Preds MAE: {round(MAE,3)}")

dd_true_mid_df = df[(df.trade_type == 'D') & (df.quantity >= np.log10(500000))] #& (df.days_to_call >= np.log10(365))]

error = dd_true_mid_df.yield_spread - dd_true_mid_df.ficc_spreads
MAE = np.mean(np.abs(error))
print(f"\nTrue-Mid DD Yield Spread Preds MAE: {round(MAE,3)}")


MAE for 2023-01-03 00:00:00: 11.881063001285256
MAE for 2023-01-04 00:00:00: 12.42232913836055
MAE for 2023-01-05 00:00:00: 12.798877314662677
MAE for 2023-01-06 00:00:00: 12.219683121773924
MAE for 2023-01-09 00:00:00: 13.02212610147729
MAE for 2023-01-10 00:00:00: 13.322046271377035
MAE for 2023-01-11 00:00:00: 13.200449730294975
MAE for 2023-01-12 00:00:00: 13.30796034974636
MAE for 2023-01-13 00:00:00: 14.104495795494389
MAE for 2023-01-17 00:00:00: 15.089556052859555
MAE for 2023-01-18 00:00:00: 14.381677841671948
MAE for 2023-01-19 00:00:00: 13.828820007578914
MAE for 2023-01-20 00:00:00: 13.98624007385416

All Rated YS MAE: 13.114

Missing Rating only YS MAE: 13.93

All Not Rated, MR & NRs YS MAE: 14.145

All Trades Yield Spread Preds MAE: 13.3

True-Mid DD Yield Spread Preds MAE: 8.582


calc_date Vs fast_calc_date

In [15]:
def get_calc_date(row):
    if row.is_called:
        return row.refund_date
    elif row.is_called == False and row.is_callable is False:
        return row.maturity_date
    else:
        cat = row.last_calc_day_cat
        if cat == 0:
            return row.next_call_date
        elif cat == 1:
            return row.par_call_date
        elif cat == 2:
            return row.maturity_date
        else:
            return row.refund_date
        
def assign(calc_date, which, date):
    calc_date[which] = date[which]

def fast_calc_date(df):
    calc_date = df.par_call_date
    assign(calc_date, df.last_calc_day_cat == 0,                 df.next_call_date)
    assign(calc_date, df.ytw_pred > 100 * df.coupon,                  df.maturity_date)
    assign(calc_date, df.deferred & (df.last_calc_day_cat == 0), df.next_call_date)
    assign(calc_date, ~df.is_callable,                           df.maturity_date)
    assign(calc_date, df.is_called,                              df.refund_date)
    return calc_date

def alt_calc_date(row):
    if row.is_called:
        return row.refund_date
    elif not row.is_callable:
        return row.maturity_date   
    elif (row.coupon == 0) and (row.last_calc_day_cat == 0):
        return row.next_call_date
    elif (row.ytw > 100 * row.coupon):
        return row.maturity_date 
    elif row.last_calc_day_cat == 0:
        return row.next_call_date
    else:                
        return row.par_call_date

def get_yield_for_last_duration(row):
    if row['last_calc_date'] is None or row['last_trade_date'] is None:
        return None
    # temp_date = get_day_before(row['trade_date'])
    duration =  diff_in_days_two_dates(row['last_calc_date'],row['last_trade_date'])/NUM_OF_DAYS_IN_YEAR
    ycl = yield_curve_level(duration, row['trade_date'], nelson_params, scalar_params, shape_parameter)/100
    return ycl

def sqltodf(sql, bq_client):
    bqr = bq_client.query(sql).result()
    return bqr.to_dataframe()

Beginning of calc date model investigation:
(Remove) The below is a debug cell to better understand when the labels are wrong

In [16]:
#ground truth: 
df["calc_date_label"] = df.calc_date

In [17]:
nelson_params = sqltodf("select * from `eng-reactor-287421.yield_curves_v2.nelson_siegel_coef_daily` order by date desc", bq_client)
nelson_params.set_index("date", drop=True, inplace=True)
nelson_params = nelson_params[~nelson_params.index.duplicated(keep='first')]

scalar_params = sqltodf("select * from`eng-reactor-287421.yield_curves_v2.standardscaler_parameters_daily` order by date desc", bq_client)
scalar_params.set_index("date", drop=True, inplace=True)
scalar_params = scalar_params[~scalar_params.index.duplicated(keep='first')]

shape_parameter  = sqltodf("SELECT *  FROM `eng-reactor-287421.yield_curves_v2.shape_parameters` order by Date desc", bq_client)
shape_parameter.set_index("Date", drop=True, inplace=True)
shape_parameter = shape_parameter[~shape_parameter.index.duplicated(keep='first')]

In [18]:
df['last_trade_date'] = df['last_trade_datetime'].dt.date
df['new_ficc_ycl'] = df[['last_calc_date','last_settlement_date','trade_date','last_trade_date']].parallel_apply(get_yield_for_last_duration, axis=1)
df['new_ficc_ycl'] = df['new_ficc_ycl'] * 100

In [19]:
df['ytw_pred'] = df['yield']
df['calc_date'] = fast_calc_date(df)
print(f"accuracy of fast_calc_date function using MSRB yield: {len(df[df.calc_date == df.calc_date_label])/len(df)}")

accuracy of fast_calc_date function using MSRB yield: 0.9990261258817725


In [20]:
# df['ytw_pred'] = df.last_ytw
# df['calc_date'] = fast_calc_date(df)
# print(f"accuracy of fast_calc_date function using MSRB yield: {len(df[df.calc_date == df.calc_date_label])/len(df)}")

In [21]:
df['ytw_pred'] = df.new_ficc_ycl + df.ficc_spreads #new_ys
df['calc_date'] = fast_calc_date(df)
print(f"accuracy of fast_calc_date function using ytw_pred: {len(df[df.calc_date == df.calc_date_label])/len(df)}")

accuracy of fast_calc_date function using ytw_pred: 0.9566113452063396


In [22]:
# this is the old calc_date func: 
df['calc_date'] = df.apply(get_calc_date,axis=1) 
print(f"accuracy of calc_date function: {len(df[df.calc_date == df.calc_date_label])/len(df)}")
print(len(df))

accuracy of calc_date function: 0.9476115523681586
468233


In [23]:
df['delta'] = df.ytw_pred - df['yield']

In [24]:
np.mean(np.abs(df['delta']))

17.921498606208303