In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.cloud import storage
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import seaborn as sns


from tensorflow.keras.layers import Embedding
from tensorflow.keras import activations
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from sklearn import preprocessing
from datetime import datetime
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
import lightgbm

from IPython.display import display, HTML
import os

import wandb
from wandb.keras import WandbCallback


from ficc.data.process_data import process_data
from ficc.utils.auxiliary_variables import PREDICTORS, NON_CAT_FEATURES, BINARY, CATEGORICAL_FEATURES, IDENTIFIERS
from ficc.utils.gcp_storage_functions import upload_data, download_data
from ficc.utils.nelson_siegel_model import yield_curve_level
from ficc.utils.diff_in_days import diff_in_days_two_dates
from ficc.utils.auxiliary_variables import NUM_OF_DAYS_IN_YEAR

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Environment variables

In [2]:
# Point the Google client libraries at a service-account key file.
# `setdefault` keeps an already-exported GOOGLE_APPLICATION_CREDENTIALS intact,
# so the notebook also runs where this machine-specific key path does not exist;
# when the variable is unset the behaviour is the same as before.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/jupyter/ficc/ml_models/sequence_predictors/ahmad_creds.json",
)
# Clients used further down: Cloud Storage for uploads, BigQuery for the curve tables.
storage_client = storage.Client()
bq_client = bigquery.Client()

In [4]:
# Disabled alternative: read the same pickle from the
# `ficc_training_data_latest` bucket through gcsfs instead of a local copy.
# import gcsfs
# fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
# with fs.open('ficc_training_data_latest/processed_data_2022-10-21-17:41-flags.pkl') as f:
#     data = pd.read_pickle(f)
# Load the processed-data pickle from the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
data = pd.read_pickle('processed_data_2022-10-21-17:41-flags.pkl')

In [5]:
# Date-only view of `last_trade_datetime`; get_yield_for_last_date compares this
# column against `datetime.date` values.
data['last_trade_date'] = data['last_trade_datetime'].dt.date

### Calculating yield for each date

In [10]:
def sqltodf(sql, bq_client):
    """Run `sql` through `bq_client` and return the query result as a DataFrame.

    Args:
        sql: Query text handed to `bq_client.query`.
        bq_client: Client object exposing `query(sql)`; the returned job's
            `result()` must provide `to_dataframe()`.

    Returns:
        Whatever `to_dataframe()` produces for the finished query.
    """
    return bq_client.query(sql).result().to_dataframe()

In [11]:
# Rows of the `nelson_siegel_coef_daily` table reshaped into a nested dict
# {date: {column: value}}. The query sorts by date descending; when a date
# repeats, only its first row in that result order is kept.
nelson_params = (
    sqltodf("select * from `eng-reactor-287421.yield_curves_v2.nelson_siegel_coef_daily` order by date desc", bq_client)
    .set_index("date", drop=True)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .T
    .to_dict()
)

In [12]:
# Rows of the `standardscaler_parameters_daily` table reshaped into a nested
# dict {date: {column: value}}. Duplicate dates keep only their first row in
# the query's date-descending order.
scalar_params = (
    sqltodf("select * from`eng-reactor-287421.yield_curves_v2.standardscaler_parameters_daily` order by date desc", bq_client)
    .set_index("date", drop=True)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .T
    .to_dict()
)

In [13]:
# Rows of the `shape_parameters` table reshaped into a nested dict
# {Date: {column: value}}. Note the capitalised `Date` column here, unlike the
# two tables loaded above. Duplicate dates keep only their first row in the
# query's date-descending order.
shape_parameter = (
    sqltodf("SELECT *  FROM `eng-reactor-287421.yield_curves_v2.shape_parameters` order by Date desc", bq_client)
    .set_index("Date", drop=True)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .T
    .to_dict()
)

In [9]:
def get_yield_for_date(row):
    """Yield-curve level for each of a trade's four horizon dates.

    For `maturity_date`, `next_call_date`, `par_call_date` and `refund_date`
    (in that order) the horizon is converted to a duration measured from
    `settlement_date` (day difference divided by NUM_OF_DAYS_IN_YEAR) and
    passed to `yield_curve_level` together with the date part of `trade_date`
    and the three module-level parameter dicts. The result is divided by 100.
    NOTE(review): presumably that converts percent to a fraction — confirm.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the four
            horizon columns plus `settlement_date` and `trade_date`.

    Returns:
        A 4-tuple in the column order above; a null horizon yields `np.nan`
        in its slot.
    """
    def _level_for(column):
        horizon = row[column]
        if pd.isnull(horizon):
            return np.nan
        years = diff_in_days_two_dates(horizon, row['settlement_date']) / NUM_OF_DAYS_IN_YEAR
        curve_date = row['trade_date'].date()
        return yield_curve_level(years, curve_date, nelson_params, scalar_params, shape_parameter) / 100

    to_maturity, to_next_call, to_par_call, to_refund = (
        _level_for(column)
        for column in ('maturity_date', 'next_call_date', 'par_call_date', 'refund_date')
    )
    return to_maturity, to_next_call, to_par_call, to_refund
        

In [10]:
%%time
# Row-wise (axis=1) parallel evaluation; the result is a Series holding one
# 4-tuple per row of `data`.
# NOTE(review): `parallel_apply` is not a stock pandas method — presumably
# pandarallel is initialised by one of the ficc imports (see the import-cell log); confirm.
temp_df = data.parallel_apply(get_yield_for_date, axis=1)

CPU times: user 1min 20s, sys: 18.5 s, total: 1min 39s
Wall time: 2min 39s


In [11]:
# Preview the per-row tuples returned by get_yield_for_date.
temp_df

0          (4.8887820691943595, 3.7649620424936807, 3.764...
1          (4.8887820691943595, 3.7649620424936807, 3.764...
2          (4.006203951441065, 3.349188290928213, 3.34918...
4          (4.361368630601499, 3.451798554893155, 3.45179...
5          (4.521464125419967, 3.321726866146555, 3.32172...
                                 ...                        
4026130    (3.7183106045407612, 2.784881376930498, 2.7848...
4026131    (3.327084534886003, 2.6296626649174777, 2.6296...
4026132                  (2.6222564075109154, nan, nan, nan)
4026133    (3.458488505831795, 2.6806318005826846, 2.6806...
4026134    (3.540590630980674, 2.618293456189657, 2.61829...
Length: 3745365, dtype: object

In [12]:
# Unpack the Series of 4-tuples into four columns, aligned on `data`'s index.
data[[
    'ficc_ycl_to_maturity',
    'ficc_ycl_to_next_call',
    'ficc_ycl_to_par_call',
    'ficc_ycl_to_refund',
]] = pd.DataFrame(temp_df.tolist(), index=data.index)

datetime.date(2021, 8, 2)

In [20]:
def get_yield_for_last_date(row):
    """Yield-curve level for the four horizon dates of the previous trade.

    For `last_maturity_date`, `last_next_call_date`, `last_par_call_date` and
    `last_refund_date` (in that order) the date part of the horizon is turned
    into a duration from `last_settlement_date` (day difference divided by
    NUM_OF_DAYS_IN_YEAR) and passed to `yield_curve_level`; the result is
    divided by 100.

    The curve is looked up at `last_trade_date`, except that a previous trade
    dated before 2021-08-02 uses the curve of 2021-08-03 instead.
    NOTE(review): presumably no curve parameters exist before that date, and
    the one-day gap between cutoff and fallback is deliberate — confirm.

    Args:
        row: Mapping-like record holding the four `last_*` horizon columns plus
            `last_settlement_date` and `last_trade_date` (a `datetime.date`).

    Returns:
        A 4-tuple in the column order above; a null horizon yields `np.nan`
        in its slot.
    """
    cutoff = datetime(2021, 8, 2).date()
    fallback_curve_date = datetime(2021, 8, 3).date()

    def _level_for(column):
        horizon = row[column]
        if pd.isnull(horizon):
            return np.nan
        years = diff_in_days_two_dates(horizon.date(), row['last_settlement_date']) / NUM_OF_DAYS_IN_YEAR
        last_traded = row['last_trade_date']
        curve_date = fallback_curve_date if last_traded < cutoff else last_traded
        return yield_curve_level(years, curve_date, nelson_params, scalar_params, shape_parameter) / 100

    to_maturity, to_next_call, to_par_call, to_refund = (
        _level_for(column)
        for column in ('last_maturity_date', 'last_next_call_date', 'last_par_call_date', 'last_refund_date')
    )
    return to_maturity, to_next_call, to_par_call, to_refund

In [21]:
# Same row-wise parallel pass as before, now for the previous trade's horizon dates.
# This rebinds `temp_df`, replacing the Series produced by get_yield_for_date.
temp_df = data.parallel_apply(get_yield_for_last_date, axis=1)

  return L*(1-np.exp(-t/L))/t
  return (L*(1-np.exp(-t/L))/t) -np.exp(-t/L)
  return L*(1-np.exp(-t/L))/t
  return (L*(1-np.exp(-t/L))/t) -np.exp(-t/L)


In [22]:
# Preview the per-row tuples returned by get_yield_for_last_date.
temp_df

0          (3.1497264368818367, 3.125085410077091, 3.1250...
1                        (3.9337326864296505, nan, nan, nan)
2          (3.169598516089873, 3.1416089348004648, 3.1416...
3          (4.093027692470485, 3.5358624164385035, 3.5358...
4          (4.093027692470485, 3.5358624164385035, 3.5358...
                                 ...                        
5296713    (2.877032654226967, 1.7106782739120547, 1.7106...
5296714                  (1.8611689460582002, nan, nan, nan)
5296715                  (1.8611689460582002, nan, nan, nan)
5296716    (1.6967060793609647, 1.6541873725449876, 1.654...
5296717    (1.6967060793609647, 1.6541873725449876, 1.654...
Length: 5271064, dtype: object

In [23]:
# Unpack the Series of 4-tuples into the four `last_*` columns, aligned on `data`'s index.
data[[
    'last_ficc_ycl_to_maturity',
    'last_ficc_ycl_to_next_call',
    'last_ficc_ycl_to_par_call',
    'last_ficc_ycl_to_refund',
]] = pd.DataFrame(temp_df.tolist(), index=data.index)

In [24]:
# Rows where the previous-trade and current-trade maturity levels differ.
# NaN never compares equal, so rows with NaN on either side are listed too.
data.loc[data['last_ficc_ycl_to_maturity'] != data['ficc_ycl_to_maturity']]

Unnamed: 0,MSRB_maturity_date,MSRB_coupon_rate,msrb_cusip,yield_spread,calc_price,price_to_next_call,price_to_par_call,price_to_maturity,calc_date,price_delta,...,ficc_ycl_to_refund,last_trade_date,last_ficc_ycl_to_maturity,last_ficc_ycl_to_next_call,last_ficc_ycl_to_par_call,last_ficc_ycl_to_refund,is_replica,is_bookkeeping,is_same_day,ntbc_precursor
1,2036-10-01,0E-9,592643BE9,77.396313,52.288,0.000,0.000,0.000,2036-10-01,0.004,...,,2022-10-13,3.933733,,,,False,False,False,False
3,2038-12-01,4.000000000,91739RDN4,40.526398,94.307,96.289,96.289,94.307,2038-12-01,0.000,...,,2022-10-13,4.093028,3.535862,3.535862,,False,False,False,True
4,2038-12-01,4.000000000,91739RDN4,40.526398,94.307,96.289,96.289,94.307,2038-12-01,0.000,...,,2022-10-13,4.093028,3.535862,3.535862,,False,False,False,False
9,2050-11-01,5.000000000,57582RK47,62.328380,106.325,106.325,106.325,115.482,2030-11-01,0.001,...,,2022-10-12,4.649179,3.408521,3.408521,,False,False,False,False
23,2050-10-15,5.000000000,594615BV7,138.366267,101.349,101.349,101.349,107.697,2025-10-15,0.000,...,,2022-10-07,4.622430,3.142356,3.142356,,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5296713,2042-12-01,5.000000000,944514NS3,40.140010,101.874,101.874,101.874,148.373,2022-12-01,0.001,...,,2022-03-30,2.877033,1.710678,1.710678,,False,False,False,False
5296714,2026-08-01,4.000000000,254498FH6,6.970756,107.972,0.000,0.000,0.000,2026-08-01,0.001,...,,2022-03-24,1.861169,,,,False,False,False,False
5296715,2026-08-01,4.000000000,254498FH6,32.670756,106.872,0.000,0.000,0.000,2026-08-01,0.001,...,,2022-03-24,1.861169,,,,False,False,False,False
5296716,2026-12-01,5.000000000,13067WCQ3,3.900978,111.822,111.822,111.822,113.179,2026-06-01,0.001,...,,2022-03-15,1.696706,1.654187,1.654187,,False,False,False,False


In [26]:
# Persist the frame, now including the yield-curve-level columns.
# NOTE(review): this is the same filename `data` was loaded from at the top of
# the notebook, so the input pickle is overwritten in place.
data.to_pickle('processed_data_2022-10-21-17:41-flags.pkl')

In [27]:
# Push the refreshed pickle to the `ficc_training_data_latest` bucket.
upload_data(storage_client, 'ficc_training_data_latest','processed_data_2022-10-21-17:41-flags.pkl')

File processed_data_2022-10-21-17:41-flags.pkl uploaded to ficc_training_data_latest.


In [None]:
# Spot check: first ten trades of one CUSIP on or after 2022-05-12, with the
# current and previous trade timestamps and the trade history.
data.loc[
    (data['cusip'] == '835631NS7') & (data['trade_date'] >= '2022-05-12'),
    ['cusip', 'trade_datetime', 'last_trade_datetime', 'trade_history'],
].head(10)

### Modified Yield

In [5]:
# Rebind `data` to a different pickle than the one saved earlier in this notebook.
# NOTE(review): nothing visible here produces this file — presumably it comes
# from another notebook; confirm its provenance.
data = pd.read_pickle('processed_data_2022-10-21-17:41-real-time-ycl.pkl')

In [6]:
# import gcsfs
# fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
# with fs.open('ficc_training_data_latest/processed_data_2022-10-21-17:41-flags-corporate-spreads.pkl') as f:
#     data = pd.read_pickle(f)

In [7]:
# Per-row duration between `last_settlement_date` and `last_calc_date`:
# day difference divided by NUM_OF_DAYS_IN_YEAR.
data['last_duration'] = (
    data[['last_calc_date', 'last_settlement_date']]
    .parallel_apply(
        lambda row: diff_in_days_two_dates(row['last_calc_date'], row['last_settlement_date']) / NUM_OF_DAYS_IN_YEAR,
        axis=1,
    )
)

In [8]:
def get_yield_for_last_duration(row):
    """Yield-curve level on the current trade date at the previous trade's duration.

    The duration is the day difference between `last_calc_date` and
    `last_settlement_date` divided by NUM_OF_DAYS_IN_YEAR. It is passed to
    `yield_curve_level` with the date part of `trade_date` and the three
    module-level parameter dicts, and the result is divided by 100.

    Args:
        row: Mapping-like record with `last_calc_date`, `last_settlement_date`
            and `trade_date` (which must expose `.date()`).

    Returns:
        The value of `yield_curve_level(...)` divided by 100.
    """
    years_to_last_calc = (
        diff_in_days_two_dates(row['last_calc_date'], row['last_settlement_date'])
        / NUM_OF_DAYS_IN_YEAR
    )
    curve_date = row['trade_date'].date()
    level = yield_curve_level(years_to_last_calc, curve_date, nelson_params, scalar_params, shape_parameter)
    return level / 100

In [14]:
# Only the three columns the helper reads are sent to the parallel workers.
data['new_ficc_ycl'] = (
    data[['last_calc_date', 'last_settlement_date', 'trade_date']]
    .parallel_apply(get_yield_for_last_duration, axis=1)
)

In [15]:
# Keep trades dated 2022-06-01 (inclusive) up to 2022-10-01 (exclusive).
# This rebinds `data`; rows outside the window are no longer reachable.
data = data.loc[
    (data['trade_date'] >= '2022-06-01') & (data['trade_date'] < '2022-10-01')
]

In [17]:
# Save the filtered frame locally; this is the file uploaded at the end of the notebook.
data.to_pickle('processed_data_2022-10-31-15:41-real-time-ycl-4-months.pkl')

In [30]:
# Second local copy of `data` under a different filename.
# NOTE(review): this cell's execution count (30) is out of sequence with its
# neighbours (17, 18), so it presumably ran against another state of `data` — confirm.
data.to_pickle('processed_data_2022-10-21-17:41-flags-last-duration-4-months.pkl')

In [18]:
# Push the 4-month real-time-ycl pickle to the `ficc_training_data_latest` bucket.
upload_data(storage_client, 'ficc_training_data_latest','processed_data_2022-10-31-15:41-real-time-ycl-4-months.pkl')

File processed_data_2022-10-31-15:41-real-time-ycl-4-months.pkl uploaded to ficc_training_data_latest.
