## Data Preparationn For Models (Gil)
This notebook is used to process data for training

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.cloud import storage
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import pickle
import redis
from IPython.display import display, HTML

from ficc.data.process_data import process_data
from ficc.utils.auxiliary_variables import PREDICTORS, NON_CAT_FEATURES, BINARY, CATEGORICAL_FEATURES, IDENTIFIERS
from ficc.utils.gcp_storage_functions import upload_data, download_data
from ficc.utils.nelson_siegel_model import yield_curve_level
from ficc.utils.diff_in_days import diff_in_days_two_dates
from ficc.utils.auxiliary_variables import NUM_OF_DAYS_IN_YEAR

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.



Setting the environment variables

In [2]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/gil/git/ficc/creds.json"
pd.options.mode.chained_assignment = None

Declaring parameters

In [3]:
SEQUENCE_LENGTH = 5
NUM_FEATURES = 6

Initializing BigQuery and storage client

In [4]:
bq_client = bigquery.Client()
storage_client = storage.Client()

#### Query to fetch data

In [5]:
DATA_QUERY = '''  SELECT
    * except(most_recent_event)
  FROM
    `eng-reactor-287421.auxiliary_views.materialized_trade_history`
  WHERE
    yield IS NOT NULL
    AND yield > 0
    AND par_traded >= 10000
    AND trade_date >= '2023-01-01'
    AND trade_date <= '2023-01-16'
    AND maturity_description_code = 2
    AND coupon_type in (8, 4, 10, 17)
    AND capital_type <> 10
    AND default_exists <> TRUE
    AND most_recent_default_event IS NULL
    AND default_indicator IS FALSE
    AND msrb_valid_to_date > current_date -- condition to remove cancelled trades
    AND settlement_date is not null
  ORDER BY
    trade_datetime desc
    LIMIT 5000
    '''


#### Data Preparation
We grab the data from BigQuery and converts it into a format suitable for input to the model. We save the processed data as a pickle file. If the file already exists we read it from the file. 

In [6]:
file_timestamp = datetime.now().strftime('%Y-%m-%d-%H:%M')
processed_file = f"processed_data_{file_timestamp}.pkl"
# processed_file = f"processed_data_dec.pkl"

In [7]:
processed_file

'processed_data_2023-01-18-20:12.pkl'

In [8]:
%%time
# with exclusions
data = process_data(DATA_QUERY, 
                    bq_client,
                    SEQUENCE_LENGTH,
                    NUM_FEATURES,
                    f"raw_data_{file_timestamp}.pkl",
                    'FICC_NEW',
                    estimate_calc_date=False,
                    remove_short_maturity=True,
                    remove_non_transaction_based=False,
                    remove_trade_type = [],
                    trade_history_delay = 1,
                    min_trades_in_history = 0,
                    process_ratings=False,
                    treasury_spread = True,
                    add_previous_treasury_rate=True,
                    add_previous_treasury_difference=True,
                    add_flags=False)

# # without any exclusions
# data = process_data(DATA_QUERY, 
#                     bq_client,
#                     SEQUENCE_LENGTH,
#                     NUM_FEATURES,
#                     f"raw_data_{file_timestamp}.pkl",
#                     'FICC_NEW',
#                     estimate_calc_date=False,
#                     remove_short_maturity=False,
#                     remove_non_transaction_based=False,
#                     remove_trade_type = [],
#                     trade_history_delay = 0,
#                     min_trades_in_history = 0,
#                     process_ratings=False,
#                     treasury_spread = True,
#                     add_previous_treasury_rate=True,
#                     add_previous_treasury_difference=True,
#                     use_last_duration=False,
#                     add_flags=False)
data.to_pickle(processed_file)

Running with
 remove_short_maturity:True
 trade_history_delay:1
 min_trades_in_hist:0
 process_ratings:False
 add_flags:False
Grabbing yield curve params


  globals.treasury_rate = globals.treasury_rate.transpose().to_dict()


Grabbing data from BigQuery
Saving query and data to raw_data_2023-01-18-20:12.pkl
Raw data contains 5000 samples
Creating trade history
Removing trades with shorter maturity
Removing trades less than 1 minutes in the history
Trade history created
Getting last trade features
Restricting the trade history to the 5 most recent trades
Padding history
Minimum number of trades required in the history 0
Padding completed
Processed trade history contain 5000 samples
Calculating yield spread using ficc yield curve
Yield spread calculated
Fetiching treasury rates
Difference in treasury rates calculated
Processing features
Removing trades which are settled more than a month from trade date


  result = getattr(ufunc, method)(*inputs, **kwargs)


Numbers of samples 4709
CPU times: user 9.88 s, sys: 2.61 s, total: 12.5 s
Wall time: 3min 12s


In [9]:
len(data)

4709

In [10]:
data = pd.read_pickle("processed_data_2023-01-14-22:09.pkl")

FileNotFoundError: [Errno 2] No such file or directory: 'processed_data_2023-01-14-22:09.pkl'

In [None]:
data.purpose_sub_class.fillna(0, inplace=True)

## Adding target trade features

In [None]:
trade_mapping = {'D':[0,0], 'S':[0,1], 'P':[1,0]}
def target_trade_processing_for_attention(row):
    target_trade_features = []
    target_trade_features.append(row['quantity'])
    target_trade_features = target_trade_features + trade_mapping[row['trade_type']]
    return np.tile(target_trade_features, (SEQUENCE_LENGTH,1))

In [None]:
%%time
data['target_attention_features'] = data.parallel_apply(target_trade_processing_for_attention, axis = 1)

## Replacing the ratings with the stand alone ratings. This is done to exclude enhancements. 

In [None]:
data.loc[data.sp_stand_alone.isna(), 'sp_stand_alone'] = 'NR'

data.rating = data.rating.astype('str')
data.sp_stand_alone = data.sp_stand_alone.astype('str')

data.loc[(data.sp_stand_alone != 'NR'),'rating'] = data[(data.sp_stand_alone != 'NR')]['sp_stand_alone'].loc[:]

In [None]:
data['yield'] = data['yield'] * 100

In [None]:
# data.to_pickle(processed_file)

In [None]:
data = data.drop(columns='recent')

In [None]:
# data.to_pickle('processed_data_2022-11-28-18:12_recent.pkl')

In [None]:
# upload_data(storage_client, 'ficc_training_data_latest',procssed_file)
# upload_data(storage_client, 'ahmad_data','processed_data_2022-11-28-18:12_recent.pkl')

# Adding yield curve for every possible candidate calc date

In [None]:
def sqltodf(sql, bq_client):
    bqr = bq_client.query(sql).result()
    return bqr.to_dataframe()

In [None]:
data.to_pickle(processed_file)

In [None]:
nelson_params = sqltodf("select * from `eng-reactor-287421.yield_curves_v2.nelson_siegel_coef_daily` order by date desc", bq_client)
nelson_params.set_index("date", drop=True, inplace=True)
nelson_params = nelson_params[~nelson_params.index.duplicated(keep='first')]

In [None]:
scalar_params = sqltodf("select * from`eng-reactor-287421.yield_curves_v2.standardscaler_parameters_daily` order by date desc", bq_client)
scalar_params.set_index("date", drop=True, inplace=True)
scalar_params = scalar_params[~scalar_params.index.duplicated(keep='first')]


In [None]:
shape_parameter  = sqltodf("SELECT *  FROM `eng-reactor-287421.yield_curves_v2.shape_parameters` order by Date desc", bq_client)
shape_parameter.set_index("Date", drop=True, inplace=True)
shape_parameter = shape_parameter[~shape_parameter.index.duplicated(keep='first')]


In [None]:
def get_yield_for_date(row):
    ficc_ycl_dates = []
    for i in ['maturity_date', 'next_call_date', 'par_call_date', 'refund_date']:
        if pd.isnull(row[i]):
            ficc_ycl_dates.append(np.nan)
            continue
        target_date = row[i]
        duration =  diff_in_days_two_dates(target_date,row['trade_date'])/NUM_OF_DAYS_IN_YEAR
        ficc_ycl_dates.append(yield_curve_level(duration, row['trade_date'].date(), nelson_params, scalar_params, shape_parameter))

    return ficc_ycl_dates[0], ficc_ycl_dates[1], ficc_ycl_dates[2], ficc_ycl_dates[3] 
        

In [None]:
%%time
temp_df = data.parallel_apply(get_yield_for_date, axis=1)

In [None]:
data[['ficc_ycl_to_maturity','ficc_ycl_to_next_call','ficc_ycl_to_par_call', 'ficc_ycl_to_refund']] = pd.DataFrame(temp_df.to_list(), index=data.index)

# Adding yield curve level for previous calc date candiates

In [None]:
data['last_trade_date'] = data['last_trade_datetime'].dt.date

In [None]:
def get_yield_for_last_date(row):
    ficc_ycl_dates = []
    for i in ['last_maturity_date', 'last_next_call_date', 'last_par_call_date', 'last_refund_date']:
        if pd.isnull(row[i]):
            ficc_ycl_dates.append(np.nan)
            continue
        target_date = row[i]
        duration =  diff_in_days_two_dates(target_date.date(),row['last_trade_date'])/NUM_OF_DAYS_IN_YEAR
        if row['last_trade_date'] < datetime(2021, 8, 2).date():
            ficc_ycl_dates.append(yield_curve_level(duration, datetime(2021, 8, 3).date(), nelson_params, scalar_params, shape_parameter))
        else:
            ficc_ycl_dates.append(yield_curve_level(duration, row['last_trade_date'], nelson_params, scalar_params, shape_parameter))

    return ficc_ycl_dates[0], ficc_ycl_dates[1], ficc_ycl_dates[2], ficc_ycl_dates[3]

In [None]:
temp_df = data.parallel_apply(get_yield_for_last_date, axis=1)

In [None]:
data[['last_ficc_ycl_to_maturity','last_ficc_ycl_to_next_call','last_ficc_ycl_to_par_call', 'last_ficc_ycl_to_refund']] = pd.DataFrame(temp_df.to_list(), index=data.index)

In [None]:
# data.to_pickle(processed_file)

In [None]:
# upload_data(storage_client, 'ahmad_data',processed_file)


In [None]:
# upload_data(storage_client, 'ficc_training_data_latest','raw_data_2022-12-01-19:45.pkl' )

## Adding last treasury spread

In [None]:
def sqltodf(sql, bq_client):
    bqr = bq_client.query(sql).result()
    return bqr.to_dataframe()

In [None]:
query = '''SELECT * FROM `eng-reactor-287421.treasury_yield.daily_yield_rate` order by Date desc;'''
treasury_rate = sqltodf(query, bq_client)
treasury_rate.set_index("Date", drop=True, inplace=True)
treasury_rate = treasury_rate.transpose().to_dict()

In [None]:
def previous_treasury_rate(trade):
    treasury_maturities = np.array([1,2,3,5,7,10,20,30])
    if trade['last_calc_date'] is None or trade['last_settlement_date'] is None or trade['last_trade_date'] is None:
        return None
    time_to_maturity = diff_in_days_two_dates(trade['last_calc_date'],trade['last_trade_date'])/NUM_OF_DAYS_IN_YEAR
    maturity = min(treasury_maturities, key=lambda x:abs(x-time_to_maturity))
    maturity = 'year_'+str(maturity)
    t_rate = treasury_rate[trade['last_trade_date']][maturity]
    return t_rate

In [None]:
%%time
# data['last_treasury_rate'] = data[['last_trade_date','last_calc_date','last_settlement_date']].parallel_apply(previous_treasury_rate, axis=1)

In [None]:
data[data.last_trade_date.isna()]['rtrs_control_number']

### Adding corporate yield

In [None]:
import quandl
quandl.ApiConfig.api_key = 'C6tWjxHm29zz7L5BLQxW'

In [None]:
corporate_yield = quandl.get("USTREASURY/HQMYC")
corporate_maturities = np.array(corporate_yield.columns).astype(float)

In [None]:
#Adding corporate yield with modified duration
def get_corporate_spread(row):
    if row['last_calc_date'] is None or row['last_trade_date'] is None:
        return None
#     duration =  diff_in_days_two_dates(row['last_calc_date'],row['last_trade_date'])/NUM_OF_DAYS_IN_YEAR
    duration =  diff_in_days_two_dates(row['calc_date'],row['trade_date'])/NUM_OF_DAYS_IN_YEAR
    maturity = min(corporate_maturities, key=lambda x:abs(x-duration))
    temp_corporate_yield = corporate_yield.iloc[corporate_yield.index.get_loc(row['trade_date'],method='pad')]
    c_yield = temp_corporate_yield[str(maturity)]
    return c_yield

In [None]:
%%time
data['new_corporate_yield'] = data[['trade_date','last_settlement_date','last_calc_date','last_trade_date', 'calc_date']].parallel_apply(get_corporate_spread,axis=1)

In [None]:
# data['corporate_spread'] = (data['ficc_ycl'] - data['corporate_yield']*100)

In [None]:
data['new_corporate_yield'] *= 100

In [None]:
data['new_corporate_yield']

### Adding last corporate yield


In [None]:
#Adding corporate yield with modified duration
def get_last_corporate_spread(row):
    if row['last_calc_date'] is None or row['last_trade_date'] is None or row['last_trade_datetime'] is None:
        return None
    duration =  diff_in_days_two_dates(row['last_calc_date'],row['last_trade_date'])/NUM_OF_DAYS_IN_YEAR
    maturity = min(corporate_maturities, key=lambda x:abs(x-duration))
    temp_corporate_yield = corporate_yield.iloc[corporate_yield.index.get_loc(row['last_trade_datetime'],method='pad')]
    c_yield = temp_corporate_yield[str(maturity)]
    return c_yield

In [None]:
%%time
# data['last_corporate_yield'] = data[['last_trade_datetime','last_trade_date','last_calc_date']].parallel_apply(get_last_corporate_spread,axis=1)

In [None]:
# data.last_corporate_yield *= 100

### Modified yield

In [None]:
def get_yield_for_last_duration(row):
    if row['last_calc_date'] is None or row['last_trade_date'] is None:
        return None
    duration =  diff_in_days_two_dates(row['last_calc_date'],row['last_trade_date'])/NUM_OF_DAYS_IN_YEAR
    ycl = yield_curve_level(duration, row['trade_date'].date(), nelson_params, scalar_params, shape_parameter)/100
    return ycl

In [None]:
%%time
data['new_ficc_ycl'] = data[['last_calc_date','last_settlement_date','trade_date','last_trade_date']].parallel_apply(get_yield_for_last_duration, axis=1)

In [None]:
data.new_ficc_ycl = data.new_ficc_ycl * 100

### new treasury rate

In [None]:
def modified_treasury_rate(trade):
    treasury_maturities = np.array([1,2,3,5,7,10,20,30])
    if trade['last_calc_date'] is None or trade['last_settlement_date'] is None or trade['last_trade_date'] is None:
        return None
    time_to_maturity = diff_in_days_two_dates(trade['last_calc_date'],trade['last_trade_date'])/NUM_OF_DAYS_IN_YEAR
    maturity = min(treasury_maturities, key=lambda x:abs(x-time_to_maturity))
    maturity = 'year_'+str(maturity)
    t_rate = treasury_rate[trade['trade_date'].date()][maturity]
    return t_rate

In [None]:
%%time
data['new_treasury_rate'] = data[['last_trade_date','last_calc_date','last_settlement_date','trade_date']].parallel_apply(modified_treasury_rate, axis=1)

In [None]:
data.new_treasury_rate = data.new_treasury_rate * 100

## Saving and uploading data

In [None]:
data.to_pickle(processed_file_0115)
#upload_data(storage_client, 'ficc_training_data_latest',processed_file)

In [None]:
processed_file

In [None]:
data.trade_date.min()

In [None]:
data.trade_date.max()

In [None]:
data.new_ficc_ycl

In [None]:
data.ficc_ycl_1_month