## Data Preparation

This notebook is used to process data for training

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.cloud import storage
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import pickle
import redis
from IPython.display import display, HTML

from ficc.data.process_data import process_data
from ficc.utils.auxiliary_variables import PREDICTORS, NON_CAT_FEATURES, BINARY, CATEGORICAL_FEATURES, IDENTIFIERS
from ficc.utils.gcp_storage_functions import upload_data, download_data
from ficc.utils.nelson_siegel_model import yield_curve_level
from ficc.utils.diff_in_days import diff_in_days_two_dates
from ficc.utils.auxiliary_variables import NUM_OF_DAYS_IN_YEAR

Initializing pandarallel with 20.0 cores
INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Setting the environment variables

In [2]:
# Point the Google client libraries at a service-account key file.
# setdefault keeps any GOOGLE_APPLICATION_CREDENTIALS that is already exported, so the
# notebook is not tied to one user's home directory; the path below is only a fallback.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/home/jupyter/ahmad_creds.json")
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

Declaring parameters

In [29]:
# Passed to process_data as the trade-history length; the run log reports
# "Restricting the trade history to the 5 most recent trades".
SEQUENCE_LENGTH = 5
# Passed to process_data; presumably the number of features per trade in the history — confirm.
# NOTE(review): the original note said to change this to 7 when using charles_data_prep,
# while the commented-out "Charles data file" call passes NUM_FEATURES + 2 — confirm which is intended.
NUM_FEATURES = 6

Initializing BigQuery and storage client

In [4]:
# BigQuery client: handed to process_data together with DATA_QUERY and used for the
# yield-curve parameter queries further down.
bq_client = bigquery.Client()
# Cloud Storage client: passed to upload_data when the processed pickle is uploaded.
storage_client = storage.Client()

#### Query to fetch data

In [5]:
# Explicit-column trade query for 2023-05-01 through 2023-07-31 with yield, par,
# coupon-type, capital-type, default and settlement filters.
# NOTE: DATA_QUERY is reassigned in the next cell, so when the cells run in order
# this query is never the one passed to process_data.
DATA_QUERY = '''SELECT
rtrs_control_number, 
cusip, 
yield, 
is_callable, 
refund_date,
refund_price,
accrual_date,
dated_date, 
next_sink_date,
coupon, 
delivery_date, 
trade_date, 
trade_datetime,
par_call_date, 
interest_payment_frequency,
is_called,
is_non_transaction_based_compensation,
is_general_obligation, 
callable_at_cav, 
extraordinary_make_whole_call,
make_whole_call, 
has_unexpired_lines_of_credit,
escrow_exists, 
incorporated_state_code,
trade_type, 
par_traded, 
maturity_date, 
settlement_date, 
next_call_date, 
issue_amount, 
maturity_amount, 
issue_price, 
orig_principal_amount,
publish_datetime,
max_amount_outstanding, 
recent,
dollar_price,
calc_date,
purpose_sub_class,
called_redemption_type,
calc_day_cat, 
previous_coupon_payment_date,
instrument_primary_name, 
purpose_class,
call_timing,
call_timing_in_part,
sink_frequency,
sink_amount_type,
issue_text,
state_tax_status, 
series_name,
transaction_type,
next_call_price, 
par_call_price, 
when_issued,
min_amount_outstanding,
original_yield, 
par_price,
default_indicator,
sp_stand_alone,
sp_long, 
moodys_long, 
coupon_type,  
federal_tax_status,
use_of_proceeds, 
muni_security_type,
muni_issue_type,
capital_type, 
other_enhancement_type,  
next_coupon_payment_date,
first_coupon_date, 
last_period_accrues_from_date,
maturity_description_code,
de_minimis_threshold
FROM
`eng-reactor-287421.auxiliary_views.materialized_trade_history`
WHERE
  yield IS NOT NULL
  AND yield > 0
  AND par_traded >= 10000
  AND trade_date >= '2023-05-01'
  AND trade_date <= '2023-07-31'
  AND coupon_type in (8, 4, 10, 17)
  AND capital_type <> 10
  AND default_exists <> TRUE
  AND most_recent_default_event IS NULL
  AND default_indicator IS FALSE
  AND msrb_valid_to_date > current_date -- condition to remove cancelled trades
  AND settlement_date is not null
  ORDER BY trade_datetime desc
'''


In [6]:


# Replaces the DATA_QUERY defined in the previous cell: all columns for trades dated
# 2023-08-01 through 2023-08-31 whose first `recent` entry has a trade_datetime before 2021-08-01.
# Presumably `recent` lists prior trades most-recent first, so this selects bonds that had
# not traded for about two years (the output file is named "illiquid") — TODO confirm.
DATA_QUERY = '''
                SELECT
                  *
                FROM
                  `eng-reactor-287421.auxiliary_views.materialized_trade_history`
                WHERE
                  trade_date >= '2023-08-01'
                  AND trade_date <= '2023-08-31'
                  AND msrb_valid_to_date > current_date -- condition to remove cancelled trades
                  and recent[SAFE_OFFSET(0)].trade_datetime <'2021-08-01'
            '''

In [7]:
# DATA_QUERY = '''SELECT
#     * except(most_recent_event, assumed_settlement_date)
#   FROM
#      --`eng-reactor-287421.auxiliary_views.materialized_trade_history` 
#        `eng-reactor-287421.jesse_test_charles_pipeline.materialized_trade_history`
#   WHERE
#   trade_date >= '2023-05-01'
#   AND trade_date < '2023-07-31'
#   --AND coupon_type in (8, 4, 10, 17)
#   AND capital_type <> 10
#   AND default_exists <> TRUE
#   AND most_recent_default_event IS NULL
#   AND default_indicator IS FALSE
#   AND msrb_valid_to_date > current_date -- condition to remove cancelled trades
#   AND settlement_date is not null
#   ORDER BY trade_datetime desc 
#   '''

#### Data Preparation
We grab the data from BigQuery and convert it into a format suitable for input to the model. We save the processed data as a pickle file. If the file already exists, we read it from the file.

In [19]:
# Minute-resolution timestamp; used below in the raw-data pickle name passed to process_data.
file_timestamp = datetime.now().strftime('%Y-%m-%d-%H:%M')
# processed_file = f"processed_data_{file_timestamp}.pkl"
# processed_file = f"processed_data_dollar_price_{file_timestamp}.pkl"
# Fixed output name for this run; the timestamped alternatives above are disabled.
processed_file = 'test_illiquid_august.pkl'

In [20]:
 processed_file

'test_illiquid_august.pkl'

In [21]:
%%time
# with exclusions
# Runs DATA_QUERY through process_data to build the frame used in the rest of the notebook.
# Per the run log, the raw query result is saved to raw_data_<file_timestamp>.pkl.
# NOTE(review): the saved output reports "remove_short_maturity:False" although True is
# passed here — the stored output looks stale; re-run to confirm.
# NOTE(review): this call spells the keyword `process_rating`, while the commented-out call
# below uses `process_ratings` — confirm which name process_data accepts.
data = process_data(DATA_QUERY, 
                    bq_client,
                    SEQUENCE_LENGTH,
                    NUM_FEATURES,
                    f"raw_data_{file_timestamp}.pkl",
                    'FICC_NEW',
                    remove_short_maturity=True,
                    trade_history_delay = 0.2,
                    min_trades_in_history = 0,
                    treasury_spread = True,
                    add_flags=False,
                    process_rating=False,
                    add_related_trades_bool=False,
                    add_rtrs_in_history=False,
                    only_dollar_price_history = False)

# #Charles data file
# #parameters as per his requests
# data = process_data(DATA_QUERY, 
#                     bq_client,
#                     SEQUENCE_LENGTH,
#                     NUM_FEATURES + 2,
#                     f"raw_data_{file_timestamp}.pkl",
#                     'FICC_NEW',
#                     remove_short_maturity=False,
#                     trade_history_delay = 0,
#                     min_trades_in_history = 0,
#                     process_ratings=False,
#                     treasury_spread = True,
#                     add_previous_treasury_rate=True,
#                     add_previous_treasury_difference=True,
#                     use_last_duration=False,
#                     add_flags=False,
#                     add_related_trades_bool=False,
#                     production_set=False,
#                     add_rtrs_in_history=True)


Running with
 remove_short_maturity:False
 trade_history_delay:0.2
 min_trades_in_hist:0
 add_flags:False
Grabbing yield curve params


  globals.treasury_rate = globals.treasury_rate.transpose().to_dict()


Grabbing data from BigQuery
Saving query and data to raw_data_2023-08-31-21:06.pkl
Raw data contains 4174 samples
Creating trade history
Removing trades less than 0.2 minutes in the history
Trade history created
Getting last trade features
Restricting the trade history to the 5 most recent trades
Padding history
Minimum number of trades required in the history 0
Padding completed
Processed trade history contain 4174 samples
Calculating yield spread using ficc yield curve
Yield spread calculated
Processing features
Removing trades which are settled more than a month from trade date


  result = getattr(ufunc, method)(*inputs, **kwargs)


Numbers of samples 4174
CPU times: user 5.26 s, sys: 5.63 s, total: 10.9 s
Wall time: 29.6 s


Shape of the data

## Adding target trade features

In [22]:
# Two-flag encoding of the trade type code ('D', 'S', 'P').
trade_mapping = {'D': [0, 0], 'S': [0, 1], 'P': [1, 0]}


def target_trade_processing_for_attention(row):
    """Build the target-trade feature array for one row.

    Reads ``row['quantity']`` and ``row['trade_type']`` and returns a
    ``(1, 3)`` numpy array: the quantity followed by the two trade-type flags.
    Raises ``KeyError`` when the trade type is not in ``trade_mapping``.
    """
    features = [row['quantity'], *trade_mapping[row['trade_type']]]
    return np.tile(features, (1, 1))

In [23]:
%%time
# Build the per-trade target feature array row by row, using pandarallel's parallel_apply.
data['target_attention_features'] = data.parallel_apply(target_trade_processing_for_attention, axis = 1)

CPU times: user 104 ms, sys: 1.01 s, total: 1.11 s
Wall time: 1.26 s


## Replacing the ratings with the stand alone ratings. This is done to exclude enhancements. 

In [24]:
# Rows with no stand-alone S&P rating are labelled 'NR'.
missing_stand_alone = data['sp_stand_alone'].isna()
data.loc[missing_stand_alone, 'sp_stand_alone'] = 'NR'

# Both rating columns are compared and copied as plain strings.
data['rating'] = data['rating'].astype('str')
data['sp_stand_alone'] = data['sp_stand_alone'].astype('str')

# Wherever a stand-alone rating exists it replaces `rating`; 'NR' rows keep their original rating.
has_stand_alone = data['sp_stand_alone'] != 'NR'
data.loc[has_stand_alone, 'rating'] = data.loc[has_stand_alone, 'sp_stand_alone']

### Converting yield scale

In [25]:
data['yield'].head()

0    5.250
1    4.088
2    4.533
3    4.973
4    4.712
Name: yield, dtype: float64

In [26]:
# Scale yield by 100 (e.g. 5.250 -> 525.0); presumably percent to basis points — confirm.
# Not idempotent: re-running this cell multiplies the column by 100 again.
data['yield'] = data['yield'] * 100

In [27]:
data['yield'].head()

0    525.0
1    408.8
2    453.3
3    497.3
4    471.2
Name: yield, dtype: float64

In [28]:
# # We don't need the yield curve coefficients when training dollar price model
# data.to_pickle(processed_file)
# upload_data(storage_client, 'ahmad_data',processed_file)

File test_illiquid_august.pkl uploaded to ahmad_data.


# Adding yield for every possible candidate calc date

In [None]:
def sqltodf(sql, bq_client):
    """Run ``sql`` on ``bq_client``, wait for the result and return it as a DataFrame."""
    return bq_client.query(sql).result().to_dataframe()

In [None]:
# Nelson-Siegel coefficients as {date: {column: value}}; when a date appears more than
# once, the first row in the query's date-descending order is kept.
nelson_params = (
    sqltodf("select * from `eng-reactor-287421.yield_curves_v2.nelson_siegel_coef_daily` order by date desc", bq_client)
    .set_index("date", drop=True)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .transpose()
    .to_dict()
)

In [None]:
# Standard-scaler parameters as {date: {column: value}}; when a date appears more than
# once, the first row in the query's date-descending order is kept.
scalar_params = (
    sqltodf("select * from`eng-reactor-287421.yield_curves_v2.standardscaler_parameters_daily` order by date desc", bq_client)
    .set_index("date", drop=True)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .transpose()
    .to_dict()
)

In [None]:
# Shape parameters as {Date: {column: value}}; when a Date appears more than once,
# the first row in the query's Date-descending order is kept.
shape_parameter = (
    sqltodf("SELECT *  FROM `eng-reactor-287421.yield_curves_v2.shape_parameters` order by Date desc", bq_client)
    .set_index("Date", drop=True)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .transpose()
    .to_dict()
)

In [None]:
def get_yield_for_date(row):
    """Yield curve level at each candidate calc date of one trade.

    For maturity_date, next_call_date, par_call_date and refund_date (in that
    order), the time from the trade date to the candidate date is converted to
    years and passed to ``yield_curve_level`` with the trade date's curve
    parameters. A missing candidate date gives ``np.nan`` in its slot.

    Returns a 4-tuple in the column order listed above.
    """
    candidate_columns = ('maturity_date', 'next_call_date', 'par_call_date', 'refund_date')
    levels = []
    for column in candidate_columns:
        candidate = row[column]
        if not pd.isnull(candidate):
            years = diff_in_days_two_dates(candidate, row['trade_date']) / NUM_OF_DAYS_IN_YEAR
            level = yield_curve_level(years, row['trade_date'].date(), nelson_params, scalar_params, shape_parameter)
        else:
            level = np.nan
        levels.append(level)

    return tuple(levels)
        

In [None]:
%%time
# One 4-tuple of curve levels per row (maturity, next call, par call, refund).
temp_df = data.parallel_apply(get_yield_for_date, axis=1)

In [None]:
# Expand the per-row 4-tuples into four columns, reusing data's index so rows stay aligned.
data[['ficc_ycl_to_maturity','ficc_ycl_to_next_call','ficc_ycl_to_par_call', 'ficc_ycl_to_refund']] = pd.DataFrame(temp_df.to_list(), index=data.index)

# Adding yield curve level for previous calc date candidates

In [None]:
# Date-only part of the previous trade's timestamp; used below for last-trade durations and curve lookups.
data['last_trade_date'] = data['last_trade_datetime'].dt.date

In [None]:
def get_yield_for_last_date(row):
    """Yield curve level at each candidate calc date of the previous trade.

    For last_maturity_date, last_next_call_date, last_par_call_date and
    last_refund_date (in that order), the time from ``last_trade_date`` to the
    candidate date is converted to years and passed to ``yield_curve_level``.
    A missing candidate date gives ``np.nan`` in its slot.

    When the previous trade is dated before 2021-08-02, the curve for
    2021-08-03 is used instead of the trade's own date (presumably because no
    earlier curve parameters exist — TODO confirm).

    Returns a 4-tuple in the column order listed above.
    """
    candidate_columns = ('last_maturity_date', 'last_next_call_date', 'last_par_call_date', 'last_refund_date')
    levels = []
    for column in candidate_columns:
        candidate = row[column]
        if pd.isnull(candidate):
            levels.append(np.nan)
        else:
            years = diff_in_days_two_dates(candidate.date(), row['last_trade_date']) / NUM_OF_DAYS_IN_YEAR
            curve_date = (
                datetime(2021, 8, 3).date()
                if row['last_trade_date'] < datetime(2021, 8, 2).date()
                else row['last_trade_date']
            )
            levels.append(yield_curve_level(years, curve_date, nelson_params, scalar_params, shape_parameter))

    return tuple(levels)

In [None]:
# One 4-tuple of previous-trade curve levels per row; overwrites the earlier temp_df.
temp_df = data.parallel_apply(get_yield_for_last_date, axis=1)

In [None]:
# Expand the per-row 4-tuples into four columns, reusing data's index so rows stay aligned.
data[['last_ficc_ycl_to_maturity','last_ficc_ycl_to_next_call','last_ficc_ycl_to_par_call', 'last_ficc_ycl_to_refund']] = pd.DataFrame(temp_df.to_list(), index=data.index)

### Grabbing new ficc ycl
New ficc ycl is the yield curve level for the current trade using the duration of the last trade

In [None]:
def get_yield_for_last_duration(row):
    """Yield curve level on the current trade date at the previous trade's duration.

    The duration is the time from ``last_trade_date`` to ``last_calc_date`` in
    years. The curve is evaluated for ``trade_date`` at that duration, and the
    result is divided by 100.

    Returns ``None`` when either ``last_calc_date`` or ``last_trade_date`` is missing.
    """
    # pd.isnull catches None as well as NaT/NaN, which is how pandas stores missing
    # dates; an `is None` check alone lets those through to the duration calculation.
    if pd.isnull(row['last_calc_date']) or pd.isnull(row['last_trade_date']):
        return None
    duration = diff_in_days_two_dates(row['last_calc_date'], row['last_trade_date']) / NUM_OF_DAYS_IN_YEAR
    return yield_curve_level(duration, row['trade_date'].date(), nelson_params, scalar_params, shape_parameter) / 100

In [None]:
%%time
# Only the columns the row function needs are shipped to the workers.
# Note: last_settlement_date is selected here but get_yield_for_last_duration does not read it.
data['new_ficc_ycl'] = data[['last_calc_date','last_settlement_date','trade_date','last_trade_date']].parallel_apply(get_yield_for_last_duration, axis=1)

In [None]:
# Multiplies back the /100 applied inside get_yield_for_last_duration; presumably so the
# level is on the same scale as the scaled `yield` column — confirm.
# Not idempotent: re-running this cell multiplies the column by 100 again.
data.new_ficc_ycl = data.new_ficc_ycl * 100

In [None]:
data.new_ficc_ycl.head()

In [None]:
# Spread of the trade's (scaled) yield over the curve level at the previous trade's duration.
data['new_ys'] =  data['yield'] - data.new_ficc_ycl

#### Adding the last duration

In [None]:
def get_last_duration(row):
    """Time in years from the previous trade's date to its calc date.

    Unlike get_yield_for_last_duration, missing dates are not guarded against
    here; they are passed straight to ``diff_in_days_two_dates``.
    """
    return diff_in_days_two_dates(row['last_calc_date'], row['last_trade_date']) / NUM_OF_DAYS_IN_YEAR

In [None]:
# Previous trade's duration in years, computed row by row with pandarallel.
data['last_duration'] = data.parallel_apply(get_last_duration, axis=1)

#### Fixing data types

In [None]:
# par_traded is stored as a plain integer.
data['par_traded'] = data['par_traded'].astype(int)
# last_trade_date was derived with .dt.date (Python date objects); convert it back to pandas datetimes.
data['last_trade_date'] = pd.to_datetime(data['last_trade_date'])

In [48]:
len(data)

2309312

## Saving and uploading data

In [49]:
processed_file

'processed_data_2023-08-28-22:37.pkl'

In [None]:
# Write the processed frame to a local pickle, then upload that file via upload_data
# to 'ahmad_data' (presumably a Cloud Storage bucket — confirm).
# NOTE(review): the saved output of the `processed_file` cell above shows a different
# (timestamped) name than the one assigned earlier — re-run in order before trusting it.
data.to_pickle(processed_file)
upload_data(storage_client, 'ahmad_data',processed_file)
# upload_data(storage_client, 'ficc_training_data_latest',processed_file)