# Point in time pricing
Last updated by Developer on 2024-11-15.

**NOTE**: This notebook needs to be run on a VM so that the yield curve redis, which is necessary for `process_data(...)`, can be accessed. Otherwise, a `TimeoutError` will be raised.

This notebook allows one to see what prices we would have returned on a specified date for a list of CUSIPs. The user specifies the date and time in `DATETIME_OF_INTEREST` and the file with the list of CUSIPs, and optionally quantities and trade types, in `FILE_TO_BE_PRICED`. The sequence of events is as follows: 
1. create a trade history data file where the most recent trade is not after `DATETIME_OF_INTEREST`, 
2. create a reference data file where the data is the reference features for each CUSIP at the `DATETIME_OF_INTEREST`, and 
3. use the archived deployed models for the same day if the time is before 5pm PT, or for the business day after `DATETIME_OF_INTEREST` otherwise, since after business hours, we consider the model that was trained up until two business days before the day it is deployed and validated on the business day before it is deployed. 

The core idea is to use as much of the deployed code, i.e., that in `app_engine/demo/server/modules/finance.py`, as possible in order to stay consistent with what is deployed.

In [None]:
from datetime import datetime, time
from typing import List, Dict     # importing types for hinting

import numpy as np
import pandas as pd
from pandas.tseries.offsets import CustomBusinessDay
from pandas.tseries.holiday import USFederalHolidayCalendar    # used to create a business day defined on the US federal holiday calendar that can be added or subtracted to a datetime

from google.cloud import bigquery

from tensorflow import keras

In [None]:
import os
import sys


from modules.ficc.utils.auxiliary_variables import CATEGORICAL_FEATURES, \
                                                   NON_CAT_FEATURES, \
                                                   BINARY, \
                                                   NON_CAT_FEATURES_DOLLAR_PRICE, \
                                                   CATEGORICAL_FEATURES_DOLLAR_PRICE, \
                                                   BINARY_DOLLAR_PRICE


# in a Jupyter Notebook, the `__file__` variable is not automatically defined because notebooks do not run as standard Python scripts, so it is set by hand
__file__ = os.path.abspath('point_in_time_pricing.ipynb')
# get the directory containing the 'app_engine/demo/server' package, located two levels above the directory of this notebook
server_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'app_engine', 'demo', 'server'))
if server_dir not in sys.path: sys.path.append(server_dir)    # add the directory to sys.path; the guard keeps `sys.path` free of duplicates when this cell is re-run


from modules.get_creds import get_creds
get_creds()


from modules.auxiliary_variables import SEQUENCE_LENGTH, \
                                        SEQUENCE_LENGTH_DOLLAR_PRICE, \
                                        NUMERICAL_ERROR
from modules.auxiliary_functions import get_outstanding_amount
from modules.data_preparation_for_pricing import cusip_is_invalid, \
                                                 calculate_cusip_check_digit, \
                                                 convert_isin_to_cusip, \
                                                 fix_cusip_improperly_formatted_from_excel_automatic_scientific_notation, \
                                                 get_encoders
from modules.batch_pricing import prepare_batch_pricing_results_for_logging, \
                                  prepare_batch_pricing_results_to_output_to_user, \
                                  price_cusips_list

import modules.pricing_functions    # used to modify the original functions inside finance.py to use the newly created ones in this notebook
import modules.data_preparation_for_pricing    # used to modify the original functions inside finance.py to use the newly created ones in this notebook

In [None]:
# Taken from `cloud_functions/fast_trade_history_redis_update/main.py`.
# Maps each feature of a trade in the trade history to its type; the keys (in this order) are used to
# select the columns when the trade history, represented as a list of dictionaries, is converted to a pandas dataframe.
FEATURES_FOR_EACH_TRADE_IN_HISTORY = {
    'msrb_valid_from_date': 'DATETIME',
    'msrb_valid_to_date': 'DATETIME',
    'rtrs_control_number': 'INTEGER',
    'trade_datetime': 'DATETIME',
    'publish_datetime': 'DATETIME',
    'yield': 'FLOAT',
    'dollar_price': 'FLOAT',
    'par_traded': 'NUMERIC',
    'trade_type': 'STRING',
    'is_non_transaction_based_compensation': 'BOOLEAN',
    'is_lop_or_takedown': 'BOOLEAN',
    'brokers_broker': 'STRING',
    'is_alternative_trading_system': 'BOOLEAN',
    'is_weighted_average_price': 'BOOLEAN',
    'settlement_date': 'DATE',
    'calc_date': 'DATE',
    'calc_day_cat': 'INTEGER',
    'maturity_date': 'DATE',
    'next_call_date': 'DATE',
    'par_call_date': 'DATE',
    'refund_date': 'DATE',
    'transaction_type': 'STRING',
    'sequence_number': 'INTEGER',
}

In [None]:
# offset of one business day on the US federal holiday calendar; add or subtract (multiples of) it to move a datetime while skipping over holidays
BUSINESS_DAY = CustomBusinessDay(n=1, calendar=USFederalHolidayCalendar())

In [None]:
# point in time at which the CUSIPs are priced, written as a string that is later parsed with the format `%Y-%m-%dT%H:%M:%S`
# NOTE(review): the string carries no time zone; confirm which time zone is assumed by the queries and by the 5pm model cutoff
DATETIME_OF_INTEREST = '2024-02-08T22:00:00'    # modify to be the datetime at which the pricing occurs

In [None]:
# path to a `.csv`, `.xls` or `.xlsx` file without a header row: first column is the CUSIP, optionally followed by quantity and trade type
FILE_TO_BE_PRICED = 'cusips_to_be_priced_on_02_08_2024.csv'    # modify to be the file containing the list of CUSIPs to be priced

In [None]:
# checks if file is csv or xlsx
# The first column (the identifiers) is read as text: without this, a column consisting only of digits would be
# inferred as numeric, which drops leading zeros and passes numbers instead of strings to the CUSIP helper functions.
if FILE_TO_BE_PRICED.lower().endswith('.csv'):
    to_be_priced_df = pd.read_csv(FILE_TO_BE_PRICED, header=None, dtype={0: str})
elif FILE_TO_BE_PRICED.lower().endswith(('.xls', '.xlsx')):
    to_be_priced_df = pd.read_excel(FILE_TO_BE_PRICED, header=None, dtype={0: str})
else:
    raise ValueError('Unsupported file format')

to_be_priced_df.head(10)

In [None]:
to_be_priced_df = to_be_priced_df.rename(columns={to_be_priced_df.columns[0]: 'cusip'})    # rename the first column to 'cusip'
to_be_priced_df['cusip'] = to_be_priced_df['cusip'].apply(convert_isin_to_cusip)
# Append the check digit only to 8 character CUSIPs, which is the same rule that `get_data_from_redis(...)` applies further below;
# a CUSIP that already carries its check digit must not get a second one. `str(...)` guards against the check digit being returned as a number.
to_be_priced_df['cusip'] = to_be_priced_df['cusip'].apply(lambda cusip: cusip + str(calculate_cusip_check_digit(cusip)) if len(cusip) == 8 else cusip)

In [None]:
# add column names: the file has between one and three columns (CUSIP, optional quantity, optional trade type)
assert len(to_be_priced_df.columns) <= 3, 'Too many columns in `to_be_priced_df`'
column_names = ['cusip', 'quantity', 'trade_type']

if len(to_be_priced_df.columns) >= 2:    # quantities were provided in the file
    to_be_priced_df.iloc[:, 1] = to_be_priced_df.iloc[:, 1].astype(int)
else:
    to_be_priced_df['quantity'] = 500    # set default value

if len(to_be_priced_df.columns) < 3:    # trade types were not provided in the file
    to_be_priced_df['trade_type'] = 'S'    # set default value

to_be_priced_df.columns = column_names
to_be_priced_df['quantity'] *= 1000    # presumably the quantities in the file are given in thousands; verify against the input file

to_be_priced_df.head(10)

In [None]:
# TODO: fill in the pseudocode
# if DATETIME_OF_INTEREST is a date with no time:
#     add time to DATETIME_OF_INTEREST
# Convert the user-provided string to a `datetime`. The `isinstance` guard makes this cell safe to re-run: once
# `DATETIME_OF_INTEREST` has been converted, calling `strptime` on it a second time would raise a `TypeError`.
if isinstance(DATETIME_OF_INTEREST, str):
    DATETIME_OF_INTEREST = datetime.strptime(DATETIME_OF_INTEREST, '%Y-%m-%dT%H:%M:%S')

Create trade history dataset and reference data dataset for `DATETIME_OF_INTEREST`.

In [None]:
# `DATETIME_OF_INTEREST` written with underscores only, used as the suffix of the names of the views created below
DATETIME_OF_INTEREST_AS_TABLE_STRING = f'{DATETIME_OF_INTEREST:%Y_%m_%d_%H_%M_%S}'

In [None]:
PROJECT_ID = 'eng-reactor-287421'    # project in which the point in time views are created

BQ_CLIENT = bigquery.Client()    # client used for every BigQuery call in this notebook

In [None]:
def create_view(dataset, name, sql_query):
    '''Creates the BigQuery view `<PROJECT_ID>.<dataset>.<name>` defined by `sql_query` and returns 
    its fully qualified name. If something with that name already exists, it is deleted first, 
    so that calling this function again replaces the view.'''
    fully_qualified_name = f'{PROJECT_ID}.{dataset}.' + name
    BQ_CLIENT.delete_table(fully_qualified_name, not_found_ok=True)    # `not_found_ok` so that the first creation does not fail
    view_definition = bigquery.Table(fully_qualified_name)
    view_definition.view_query = sql_query
    BQ_CLIENT.create_table(view_definition)
    return fully_qualified_name


def sqltodf(sql_query, limit=''):
    '''Runs `sql_query` on BigQuery and returns the result as a pandas dataframe. If `limit` is 
    given, then the query is extended to return `limit` rows in random order.'''
    random_sample_clause = f' ORDER BY RAND() LIMIT {limit}' if limit != '' else ''
    query_job = BQ_CLIENT.query(sql_query + random_sample_clause)
    return query_job.result().to_dataframe()

The following view creates trade history for a given CUSIP. The goal is to use trades up to a point in time. For this to happen, we need to do the following: look at all the trade messages and add a valid_to and valid_from timestamp to them in order to get the most up-to-date trade for a given timestamp. This procedure is done in a [notebook](https://github.com/Ficc-ai/ficc/blob/dev/SQL_examples/Create_trade_history_with_reference_data.ipynb) that creates a table called `msrb_final`. The table `msrb_final` is always up-to-date, since it is a view that is created further upstream of the `trade_history_same_issue_5_yr_mat_bucket_1_materialized` table. This view is joined to a table containing calculation dates for each trade. The value par_traded is assumed to be $5MM when the field par_traded is null and the is_trade_with_a_par_amount_over_5MM flag is true. The exclusions are as follows:

1) Trades with a par_traded under $10k, which we have found to be not useful for prediction.
2) Trades with no dollar_price or yield.

Note that these are only restrictions for trade data; we would still handle these CUSIPs if they are present in the ICE data.

In [None]:
def create_trade_history_query(up_until_datetime):
    '''Returns the BigQuery SQL that builds, for each CUSIP, the array `recent` with up to 32 trades 
    from `auxiliary_views_v2.msrb_final`, most recent first, where each trade occurred strictly before 
    `up_until_datetime`. Each trade is joined to its calculation date in `calculation_date_and_price_v2`. 
    A null `par_traded` is replaced with 5000000 when `is_trade_with_a_par_amount_over_5MM` is true, and 
    a null `settlement_date` is replaced with `assumed_settlement_date`. Trades are excluded if 
    `dollar_price` is null, if `par_traded` is below 10000, or if `transaction_type` is "C". 
    `up_until_datetime` is inserted into the query using its string representation.
    NOTE(review): both `msrb_valid_to_date` conditions compare against `CURRENT_DATETIME(...)`, i.e., 
    the time at which the query runs, and not against `up_until_datetime`; confirm that this gives the 
    trade messages as they were known at the point in time of interest.
    NOTE(review): the condition on `b.msrb_valid_to_date` in the `WHERE` clause removes trades without 
    a match in `b`, so the `LEFT JOIN` keeps only matched trades; confirm that this is intended.'''
    return f'''
        SELECT 
            a.cusip,
            ARRAY_AGG( STRUCT(
                a.msrb_valid_from_date, 
                a.msrb_valid_to_date, 
                a.rtrs_control_number, 
                a.trade_datetime, 
                a.publish_datetime, 
                a.yield, 
                a.dollar_price,   
                CASE
                    WHEN a.par_traded IS NULL AND is_trade_with_a_par_amount_over_5MM IS TRUE THEN 5000000
                    ELSE
                    a.par_traded
                END
                    AS par_traded,
                trade_type, 
                is_non_transaction_based_compensation, 
                is_lop_or_takedown, 
                brokers_broker, 
                is_alternative_trading_system, 
                is_weighted_average_price, 
                CASE
                    WHEN a.settlement_date IS NULL AND a.assumed_settlement_date IS NOT NULL  THEN a.assumed_settlement_date
                ELSE
                    a.settlement_date
                END AS settlement_date,
                b.calc_date, 
                b.calc_date_selection AS calc_day_cat, 
                a.maturity_date, 
                next_call_date, 
                par_call_date, 
                refund_date,
                a.sequence_number,
                a.transaction_type)
                ORDER BY
                    a.trade_datetime DESC
                LIMIT 32) AS recent
        FROM `auxiliary_views_v2.msrb_final` a LEFT JOIN (select distinct * from eng-reactor-287421.auxiliary_views_v2.calculation_date_and_price_v2) b
        ON a.rtrs_control_number = b.rtrs_control_number
           AND a.trade_datetime = b.trade_datetime
           AND a.publish_datetime = b.publish_datetime
           -- AND a.msrb_valid_to_date = b.msrb_valid_to_date
        WHERE a.msrb_valid_to_date > CURRENT_DATETIME('America/New_York')
              AND b.msrb_valid_to_date > CURRENT_DATETIME('America/New_York')
              AND a.dollar_price IS NOT NULL
              AND (a.par_traded IS NULL OR a.par_traded >= 10000)
              AND (a.transaction_type <> "C" or a.transaction_type is null)
              AND a.trade_datetime < "{up_until_datetime}"
        GROUP BY a.cusip'''

In [None]:
%%time
# (re)create the view with the trade history up until `DATETIME_OF_INTEREST`; the datetime is part of the view name
trade_history_table_name = create_view(dataset='point_in_time',
                                       name=f'trade_history_groupby_{DATETIME_OF_INTEREST_AS_TABLE_STRING}',
                                       sql_query=create_trade_history_query(DATETIME_OF_INTEREST))
print(trade_history_table_name)

In [None]:
# Build a `WHERE` clause that restricts a query to the CUSIPs that we will price later; prefer to filter in the
# query so we are working with a lot less data and have lower memory usage.
cusip_list = to_be_priced_df['cusip'].drop_duplicates().tolist()
if len(cusip_list) > 1:
    cusip_list_as_tuple_string = str(tuple(cusip_list))    # use `tuple(...)` to have the string representation with parentheses instead of square brackets
else:
    cusip_list_as_tuple_string = f'("{cusip_list[0]}")'    # written by hand since the string representation of a tuple with one item has a trailing comma
cusip_in_cusip_list_clause = f'WHERE cusip IN {cusip_list_as_tuple_string}'

In [None]:
# download the trade history from the view created above, restricted to the CUSIPs in `cusip_in_cusip_list_clause`
trade_history_table_df_query = f'SELECT * FROM {trade_history_table_name} {cusip_in_cusip_list_clause}'
print('query:', trade_history_table_df_query)    # printed so that the exact query can be inspected or re-run by hand
trade_history_table_df = sqltodf(trade_history_table_df_query)
trade_history_table_df.head(10)
# trade_history_table_df = trade_history_table_df[trade_history_table_df['cusip'].isin(cusip_list)]    # consider only CUSIPs that we will price later; commented out since we perform the filtering in the query itself

The following view joins the reference data to the trade history.

In [None]:
def join_trade_history_to_reference_data_query(trade_history_table_name, up_until_datetime):
    '''Returns the BigQuery SQL that selects the reference features from 
    `reference_data_v1.reference_data_flat` for every non-null CUSIP whose reference data validity 
    interval (`ref_valid_from_date` to `ref_valid_to_date`) contains `up_until_datetime`, and attaches 
    all columns except `cusip` of `trade_history_table_name` with a `LEFT JOIN` on the CUSIP, so a 
    CUSIP without a row in the trade history is kept. `trade_history_table_name` and the string 
    representation of `up_until_datetime` are inserted into the query as is.'''
    return f'''
        SELECT
            ref_data_v1.current_coupon_rate AS coupon,
            ref_data_v1.issue_key as series_id,
            CONCAT(IFNULL(organization_primary_name, ''), ' ', IFNULL(instrument_primary_name, ''), ' ', IFNULL(conduit_obligor_name, '')) AS security_description,
            ref_data_v1.cusip,
            ref_valid_from_date,
            ref_valid_to_date,
            incorporated_state_code,
            organization_primary_name,
            instrument_primary_name,
            issue_key,
            issue_text,
            conduit_obligor_name,
            is_called,
            is_callable,
            is_escrowed_or_pre_refunded,
            first_call_date,
            call_date_notice,
            callable_at_cav,
            par_price,
            call_defeased,
            call_timing,
            call_timing_in_part,
            extraordinary_make_whole_call,
            extraordinary_redemption,
            make_whole_call,
            next_call_date,
            next_call_price,
            call_redemption_id,
            first_optional_redemption_code,
            second_optional_redemption_code,
            third_optional_redemption_code,
            first_mandatory_redemption_code,
            second_mandatory_redemption_code,
            third_mandatory_redemption_code,
            par_call_date,
            par_call_price,
            maximum_call_notice_period,
            called_redemption_type,
            muni_issue_type,
            refund_date,
            refund_price,
            redemption_cav_flag,
            max_notification_days,
            min_notification_days,
            next_put_date,
            put_end_date,
            put_feature_price,
            put_frequency,
            put_start_date,
            put_type,
            maturity_date,
            sp_long,
            sp_stand_alone,
            sp_icr_school,
            sp_prelim_long,
            sp_outlook_long,
            sp_watch_long,
            sp_Short_Rating,
            sp_Credit_Watch_Short_Rating,
            sp_Recovery_Long_Rating,
            moodys_long,
            moodys_short,
            moodys_Issue_Long_Rating,
            moodys_Issue_Short_Rating,
            moodys_Credit_Watch_Long_Rating,
            moodys_Credit_Watch_Short_Rating,
            moodys_Enhanced_Long_Rating,
            moodys_Enhanced_Short_Rating,
            moodys_Credit_Watch_Long_Outlook_Rating,
            has_sink_schedule,
            next_sink_date,
            sink_indicator,
            sink_amount_type_text,
            sink_amount_type_type,
            sink_frequency,
            sink_defeased,
            additional_next_sink_date,
            sink_amount_type,
            additional_sink_frequency,
            min_amount_outstanding,
            max_amount_outstanding,
            default_exists,
            has_unexpired_lines_of_credit,
            years_to_loc_expiration,
            escrow_exists,
            escrow_obligation_percent,
            escrow_obligation_agent,
            escrow_obligation_type,
            child_linkage_exists,
            put_exists,
            floating_rate_exists,
            bond_insurance_exists,
            is_general_obligation,
            has_zero_coupons,
            delivery_date,
            issue_price,
            primary_market_settlement_date,
            issue_date,
            outstanding_indicator,
            federal_tax_status,
            maturity_amount,
            available_denom,
            denom_increment_amount,
            min_denom_amount,
            accrual_date,
            bond_insurance,
            coupon_type,
            current_coupon_rate,
            daycount_basis_type,
            debt_type,
            default_indicator,
            first_coupon_date,
            interest_payment_frequency,
            issue_amount,
            last_period_accrues_from_date,
            next_coupon_payment_date,
            odd_first_coupon_date,
            orig_principal_amount,
            original_yield,
            outstanding_amount,
            previous_coupon_payment_date,
            sale_type,
            settlement_type,
            additional_project_txt,
            asset_claim_code,
            additional_state_code,
            backed_underlying_security_id,
            bank_qualified,
            capital_type,
            conditional_call_date,
            conditional_call_price,
            designated_termination_date,
            DTCC_status,
            first_execution_date,
            formal_award_date,
            maturity_description_code,
            muni_security_type,
            mtg_insurance,
            orig_cusip_status,
            orig_instrument_enhancement_type,
            other_enhancement_type,
            other_enhancement_company,
            pac_bond_indicator,
            project_name,
            purpose_class,
            purpose_sub_class,
            refunding_issue_key,
            refunding_dated_date,
            sale_date,
            sec_regulation,
            secured,
            series_name,
            sink_fund_redemption_method,
            state_tax_status,
            tax_credit_frequency,
            tax_credit_percent,
            use_of_proceeds,
            use_of_proceeds_supplementary,
            rating_downgrade,
            rating_upgrade,
            rating_downgrade_to_junk,
            min_sp_rating_this_year,
            max_sp_rating_this_year,
            min_moodys_rating_this_year, 
            max_moodys_rating_this_year,
                latest.* EXCEPT(cusip)
        FROM `reference_data_v1.reference_data_flat` ref_data_v1 LEFT JOIN {trade_history_table_name} latest
        ON latest.cusip = ref_data_v1.cusip   
        WHERE ref_data_v1.cusip IS NOT NULL
              AND timestamp ("{up_until_datetime}") BETWEEN ref_data_v1.ref_valid_from_date
              AND ref_data_v1.ref_valid_to_date'''

In [None]:
%%time
# (re)create the view with the reference data at `DATETIME_OF_INTEREST` joined to the trade history view created above
trade_history_joined_to_reference_data_table_name = create_view(dataset='point_in_time',
                                                                name=f'trade_history_latest_ref_data_minimal_exclusions_{DATETIME_OF_INTEREST_AS_TABLE_STRING}',
                                                                sql_query=join_trade_history_to_reference_data_query(trade_history_table_name, DATETIME_OF_INTEREST))
print(trade_history_joined_to_reference_data_table_name)

In [None]:
%%time
# download the reference data joined to the trade history, restricted to the CUSIPs in `cusip_in_cusip_list_clause`
trade_history_joined_to_reference_data_table_df_query = f'SELECT * FROM {trade_history_joined_to_reference_data_table_name} {cusip_in_cusip_list_clause}'
print('query:', trade_history_joined_to_reference_data_table_df_query)    # printed so that the exact query can be inspected or re-run by hand
trade_history_joined_to_reference_data_table_df = sqltodf(trade_history_joined_to_reference_data_table_df_query)
trade_history_joined_to_reference_data_table_df.head(10)

If a pickle file of the dataframe created above (e.g., `point_in_time_ref_data_10_31.pkl`) already exists, then we can load it instead of running everything above this line to create the dataframe. Uncomment the below cell to do so.

In [None]:
# import pickle
# with open('point_in_time_ref_data_10_31.pkl', 'rb') as pickle_file:
#     trade_history_joined_to_reference_data_table_df = pickle.load(pickle_file)

For all quantities that are greater than the outstanding amount for that CUSIP, replace the value with the outstanding amount.

In [None]:
# attach the reference data and trade history to each line of the file to be priced; a left merge keeps every line, even if no data was found for its CUSIP
cusip_with_trade_history_and_reference_data = pd.merge(to_be_priced_df, trade_history_joined_to_reference_data_table_df, how='left', on='cusip')

# the following lines of code are inspired by `price_cusips_list(...)` in `finance.py`
# A missing or zero outstanding amount is replaced with infinity, which ensures that the condition of whether the
# quantity is greater than the amount outstanding will always be `False` for these CUSIPs.
outstanding_amount = get_outstanding_amount(cusip_with_trade_history_and_reference_data, batch_pricing=True).fillna(np.inf).replace(0, np.inf)
quantity_greater_than_outstanding_amount = cusip_with_trade_history_and_reference_data['quantity'] > outstanding_amount
capped_quantities = outstanding_amount[quantity_greater_than_outstanding_amount]
cusip_with_trade_history_and_reference_data.loc[quantity_greater_than_outstanding_amount, 'quantity'] = capped_quantities

cusip_with_trade_history_and_reference_data.head(10)

In [None]:
# one list per input to the pricing function, all in the row order of `cusip_with_trade_history_and_reference_data`
cusip_list, quantity_list, trade_type_list = (
    cusip_with_trade_history_and_reference_data[column_name].tolist()
    for column_name in ('cusip', 'quantity', 'trade_type')
)

In [None]:
# show the first 10 items of each list as a sanity check before pricing
print('cusip_list[:10]\n', cusip_list[:10])
print('quantity_list[:10]\n', quantity_list[:10])
print('trade_type_list[:10]\n', trade_type_list[:10])

Use this dataset to make predictions.

In [None]:
# environment variables for TensorFlow
# NOTE(review): `tensorflow` is already imported in the first cell of this notebook; confirm that these variables still take effect when set after the import
os.environ.update({
    'TF_GPU_THREAD_MODE': 'gpu_private',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

Load model.

In [None]:
# after business hours (17 comes from converting 5pm to military time), use the model of the next business day; otherwise, use the model of the same day
MODEL_DATETIME = DATETIME_OF_INTEREST + (BUSINESS_DAY * 1) if DATETIME_OF_INTEREST.time() > time(17, 0, 0) else DATETIME_OF_INTEREST
MODEL_DATE_AS_STRING = MODEL_DATETIME.strftime('%m-%d')    # month and day only, matching the format used in `load_model(...)`

This function finds the appropriate model, either in the automated_training directory, or in a special directory. TODO: clean up the way we store models on cloud storage by unifying the folders and naming convention and adding the year to the name.

In [None]:
# NOTE: despite the name, `load_model(...)` steps back with `BUSINESS_DAY`, i.e., in business days and not in week days
MAX_NUM_WEEK_DAYS_IN_THE_PAST_TO_CHECK = 10    # denotes the maximum number of week days back that we go to search for the model before raising an error

In [None]:
def load_model(folder, bucket='gs://automated_training'):
    '''Loads the archived keras model for `folder`, which must be 'yield_spread_model' or 
    'dollar_price_model'. The search starts at the date of `MODEL_DATETIME` and goes back one 
    business day at a time, for at most `MAX_NUM_WEEK_DAYS_IN_THE_PAST_TO_CHECK` dates. For each 
    date, two locations are tried in this order: `<bucket>/<folder>/<model>` and `<bucket>/<model>`, 
    where `<model>` is `model-MM-DD` for the yield spread model and `dollar-model-MM-DD` for the 
    dollar price model. Returns the first model that loads successfully, and raises a 
    `FileNotFoundError` if none of the locations has a model that can be loaded.
    NOTE: the model name has the month and the day but not the year, so a model from a different 
    year with the same month and day cannot be told apart by its name.'''
    import posixpath    # imported here since it is used only in this function; cloud storage paths always use `/` as the separator, whereas `os.path.join` uses the separator of the operating system

    assert folder in ('yield_spread_model', 'dollar_price_model'), f'Unknown folder: {folder}'
    model_prefix = '' if folder == 'yield_spread_model' else 'dollar-'

    model_date_string = MODEL_DATE_AS_STRING    # ensures that the error message below is well defined even if no date is checked
    for num_business_days_in_the_past in range(MAX_NUM_WEEK_DAYS_IN_THE_PAST_TO_CHECK):
        model_date_string = (MODEL_DATETIME - (BUSINESS_DAY * num_business_days_in_the_past)).strftime('%m-%d')
        model_name = f'{model_prefix}model-{model_date_string}'
        bucket_folder_model_path = posixpath.join(bucket, folder, model_name)    # create path of the form: <bucket>/<folder>/<model>
        base_model_path = posixpath.join(bucket, model_name)    # create path of the form: <bucket>/<model>
        for model_path in (bucket_folder_model_path, base_model_path):    # iterate over possible paths and try to load the model
            print(f'Attempting to load model from {model_path}')
            try:
                model = keras.models.load_model(model_path)
            except Exception as e:    # deliberately broad: any failure to load means that we move on to the next candidate path
                print(f'Model failed to load from {model_path} with exception: {e}')
            else:
                print(f'Model loaded from {model_path}')
                return model

    raise FileNotFoundError(f'No model for {folder} was found from {MODEL_DATE_AS_STRING} to {model_date_string}')

In [None]:
# archived models for `MODEL_DATETIME`; these are the models called by `predict_spread(...)` and `predict_dollar_price(...)` below
yield_spread_model = load_model(folder='yield_spread_model')
dollar_price_model = load_model(folder='dollar_price_model')

Replace `get_data_from_redis(...)` to instead grab data from the file generated in this notebook.

In [None]:
def get_data_from_redis(cusips):
    '''Replacement for `get_data_from_redis(...)` in the deployed code that, instead of reading from 
    the redis, reads the reference data and trade history for each CUSIP from 
    `cusip_with_trade_history_and_reference_data`. `cusips` is a list of CUSIPs; a single CUSIP that 
    is not in a list is also accepted. Returns two dataframes: (1) the data for the CUSIPs that can 
    be priced, and (2) the CUSIPs that cannot be priced along with a `message` stating the reason. 
    Both are indexed by the position of the CUSIP in `cusips`, and are empty dataframes if there are 
    no such CUSIPs.
    NOTE: the data for a CUSIP is looked up by the position of the CUSIP in `cusips` and not by the 
    CUSIP itself, so `cusips` must have the same order as the rows of 
    `cusip_with_trade_history_and_reference_data`.
    NOTE: experiments with parallelization for getting the data from redis did not give any speedup.'''
    # reference_data_redis_client = redis.Redis(host='10.14.140.37', port=6379, db=0)    # do not need this in the notebook since we are not using the redis
    # trade_history_redis_client = redis.Redis(host='10.75.46.228', port=6379, db=0)    # do not need this in the notebook since we are not using the redis
    if not isinstance(cusips, list): cusips = [cusips]    # this means that a single cusip was passed in, but not in a list

    cusips_can_be_priced = []    # one series of reference data per CUSIP that can be priced
    cusips_cannot_be_priced = []    # one series with the CUSIP and the reason per CUSIP that cannot be priced

    def is_boolean_equal_to(value, expected):
        '''Checks whether `value` is a boolean equal to `expected`, where a null value never matches. 
        Used instead of `value is True` or `value is False`, since a value read from a row of a 
        dataframe may be a numpy boolean, for which the identity check is always `False`.'''
        return isinstance(value, (bool, np.bool_)) and bool(value) == expected

    def is_equal_and_not_null(value, target):
        '''Checks whether `value` equals `target`, where a null value never matches. Used instead of 
        `value == target`, since `pd.NA == 16` returns `pd.NA` instead of the expected `False` 
        (leading to Error: boolean value of NA is ambiguous), and instead of `value is 16`, since the 
        identity check is always `False` for a numpy integer or for a float.'''
        return not pd.isna(value) and bool(value == target)

    def missing_important_dates(single_cusip_data):
        '''Checks whether important dates needed for pricing are null. To price a CUSIP, we need 
        `next_coupon_payment_date`, `maturity_date`, `first_coupon_date` and `accrual_date`. However, 
        we only need these features if the CUSIP is neither called nor a zero coupon bond.'''
        interest_payment_frequency = single_cusip_data['interest_payment_frequency']
        coupon_type = single_cusip_data['coupon_type']

        is_called = is_boolean_equal_to(single_cusip_data['is_called'], True)
        is_zero_coupon = is_equal_and_not_null(coupon_type, 17) or is_equal_and_not_null(interest_payment_frequency, 16) or is_equal_and_not_null(single_cusip_data['coupon'], 0)
        needs_important_dates = not is_equal_and_not_null(interest_payment_frequency, 23) and not is_equal_and_not_null(interest_payment_frequency, 16) and not is_equal_and_not_null(coupon_type, 4)
        is_missing_important_dates = pd.isna(single_cusip_data['next_coupon_payment_date']) or \
                                     pd.isna(single_cusip_data['maturity_date']) or \
                                     pd.isna(single_cusip_data['first_coupon_date']) or \
                                     pd.isna(single_cusip_data['accrual_date'])
        return not is_called and (not is_zero_coupon or needs_important_dates) and is_missing_important_dates

    def get_trade_history_yields(trade_history):
        '''Returns the yields of the first `SEQUENCE_LENGTH` trades in `trade_history`.'''
        if len(trade_history) == 0: return []
        trade_history = trade_history[:SEQUENCE_LENGTH]    # only consider the last `SEQUENCE_LENGTH` trades
        return [trade['yield'] for trade in trade_history]

    def get_trade_history_dollar_prices(trade_history):
        '''Returns the dollar prices of the first `SEQUENCE_LENGTH_DOLLAR_PRICE` trades in `trade_history`.'''
        if len(trade_history) == 0: return []
        trade_history = trade_history[:SEQUENCE_LENGTH_DOLLAR_PRICE]    # only consider the last `SEQUENCE_LENGTH_DOLLAR_PRICE` trades
        return [trade['dollar_price'] for trade in trade_history]

    def yield_in_history_is_high(trade_history_yields):
        '''Checks whether the MSRB reported yield is greater than 10, in which case, we 
        should initially decline to price with a descriptive message for the user. 
        TODO: handle pricing for this case, possibly with the dollar price model.'''
        return trade_history_yields != [] and None not in trade_history_yields and any(trade_history_yield > 10 for trade_history_yield in trade_history_yields)

    def dollar_price_in_history_is_null(trade_history_dollar_prices):
        '''Checks whether the MSRB reported dollar price is null, in which case, we should decline to price.'''
        return trade_history_dollar_prices != [] and any([pd.isna(dollar_price) for dollar_price in trade_history_dollar_prices])

    def irregular_coupon_rate(single_cusip_data):
        '''Checks whether the coupon rate is irregular / variable or whether the interest payment 
        frequency is one that we cannot yet handle.'''
        interest_payment_frequency = single_cusip_data['interest_payment_frequency']
        coupon_type = single_cusip_data['coupon_type']
        return interest_payment_frequency not in (1, 2, 3, 5, 16) or coupon_type not in (3, 4, 8, 10, 17, 23, 24)    # `coupon_type == 3` corresponds to bonds that have an initial period with a fixed coupon

    def get_cusip_cannot_be_priced_series(cusip_idx, cusip, message):
        '''Returns the series that records that `cusip` cannot be priced because of `message`. The name 
        of the series is `cusip_idx` in order to preserve original ordering.'''
        return pd.Series({'cusip': cusip, 'message': message}, name=cusip_idx)

    def get_message_if_cannot_be_priced(reference_data, trade_history_yields, trade_history_dollar_prices):
        '''Returns the message for the first reason that the CUSIP cannot be priced, or `None` if 
        there are no problems with this CUSIP. The order of the checks matters, since later checks 
        rely on the features that earlier checks have found to be not null.'''
        if is_boolean_equal_to(reference_data['outstanding_indicator'], False):
            return 'not_outstanding'
        if pd.isna(reference_data['coupon']) or pd.isna(reference_data['interest_payment_frequency']) or pd.isna(reference_data['default_indicator']) or missing_important_dates(reference_data):
            return 'insufficient_data'
        if is_boolean_equal_to(reference_data['default_exists'], True) or is_boolean_equal_to(reference_data['default_indicator'], True):
            return 'defaulted'
        if yield_in_history_is_high(trade_history_yields):
            return 'high_yield_in_history'
        if dollar_price_in_history_is_null(trade_history_dollar_prices):
            return 'null_dollar_price_in_history'
        if irregular_coupon_rate(reference_data):
            return 'irregular_coupon_rate'
        return None

    # TODO: take this function outside of `get_data_from_redis(...)` in `finance.py` taking an argument of `reference_data` and `trade_history_data` to make code much more DRY
    def get_data_for_single_cusip(cusip_idx, cusip):
        '''Get the data for a single CUSIP. Put the data into the correct list 
        based on if the data was found or not.'''
        if len(cusip) == 0: return    # ignore an empty line
        cusip = cusip.strip()        # remove leading and trailing whitespaces from CUSIP column

        if cusip_is_invalid(cusip):    # all cusips must have length >= 8
            cusips_cannot_be_priced.append(get_cusip_cannot_be_priced_series(cusip_idx, cusip, 'invalid'))
            return

        if len(cusip) == 8:
            check_digit = calculate_cusip_check_digit(cusip)
            orig_cusip = cusip    # `orig_cusip` used for print statement
            cusip = cusip + str(check_digit)
            print(f'*** 8 digit CUSIP of {orig_cusip} was converted to 9 digit CUSIP: {cusip} ***')
        try:
            cusip = convert_isin_to_cusip(cusip)
            cusip = fix_cusip_improperly_formatted_from_excel_automatic_scientific_notation(cusip)
            reference_data = cusip_with_trade_history_and_reference_data.loc[cusip_idx].copy()    # access by index of the dataframe with `cusip_idx` to avoid issue with same CUSIP but different `quantity` or `trade_type`; copy so that the assignment of `recent` below never writes to the dataframe
            trade_history_data = reference_data['recent']
            trade_history_yields = get_trade_history_yields(trade_history_data)
            trade_history_dollar_prices = get_trade_history_dollar_prices(trade_history_data)
            if len(trade_history_data) > 0:    # create trade history as a dataframe and convert to numpy to represent the data as it comes from the redis
                trade_history_data = pd.DataFrame.from_records(trade_history_data)[list(FEATURES_FOR_EACH_TRADE_IN_HISTORY.keys())].to_numpy()
            else:
                trade_history_data = np.array([])
            reference_data['recent'] = trade_history_data

            message = get_message_if_cannot_be_priced(reference_data, trade_history_yields, trade_history_dollar_prices)
            if message is None:    # no problems with this cusip
                cusips_can_be_priced.append(reference_data.rename(cusip_idx))    # change name to be `cusip_idx` in order to preserve original ordering
            else:
                cusips_cannot_be_priced.append(get_cusip_cannot_be_priced_series(cusip_idx, cusip, message))
        except Exception as e:    # this means that the cusip was not found in the data file; deliberately broad, since a CUSIP without data fails in different ways depending on which feature is missing
            print(f'{cusip} not in dataframe.\n\tError: {e}')
            cusips_cannot_be_priced.append(get_cusip_cannot_be_priced_series(cusip_idx, cusip, 'not_found'))

    for cusip_idx, cusip in enumerate(cusips):
        get_data_for_single_cusip(cusip_idx, cusip)
    cusips_can_be_priced_df = pd.concat(cusips_can_be_priced, axis=1).T if cusips_can_be_priced else pd.DataFrame()    # list of series to dataframe: https://stackoverflow.com/questions/55478191/list-of-series-to-dataframe
    cusips_cannot_be_priced_df = pd.concat(cusips_cannot_be_priced, axis=1).T if cusips_cannot_be_priced else pd.DataFrame()     # list of series to dataframe: https://stackoverflow.com/questions/55478191/list-of-series-to-dataframe

    return cusips_can_be_priced_df, cusips_cannot_be_priced_df
modules.data_preparation_for_pricing.get_data_from_redis = get_data_from_redis

Use the historical yield spread model and dollar price model for predictions

In [None]:
# passed as `batch_size` to `predict(...)` of both the yield spread model and the dollar price model
BATCH_SIZE = 10000    # empirically determined to give fast batch predictions

In [None]:
def predict_spread(instances: List[np.ndarray]) -> np.ndarray:
    '''Returns yield spread estimates from the yield spread model for the given `instances`.

    In this notebook, `instances` is expected to be the list of input arrays built by 
    `get_inputs_for_nn(...)` (not the list of dictionaries that is necessary for vertex AI), 
    since the model is called directly. The arrays are passed unchanged to 
    `yield_spread_model.predict(...)` with a batch size of `BATCH_SIZE`, and whatever 
    `predict(...)` returns is returned as is (NumPy array(s) of predictions, assuming 
    `yield_spread_model` is a Keras model -- TODO confirm where the model is loaded).'''
    return yield_spread_model.predict(instances, batch_size=BATCH_SIZE)
modules.pricing_functions.predict_spread = predict_spread    # replace the function of the same name in `modules.pricing_functions` with the notebook version defined above


def predict_dollar_price(instances: List[np.ndarray]) -> np.ndarray:
    '''Returns dollar price estimates from the dollar price model for the given `instances`.

    In this notebook, `instances` is expected to be the list of input arrays built by 
    `get_inputs_for_nn(...)` (not the list of dictionaries that is necessary for vertex AI), 
    since the model is called directly. The arrays are passed unchanged to 
    `dollar_price_model.predict(...)` with a batch size of `BATCH_SIZE`, and whatever 
    `predict(...)` returns is returned as is (NumPy array(s) of predictions, assuming 
    `dollar_price_model` is a Keras model -- TODO confirm where the model is loaded).'''
    return dollar_price_model.predict(instances, batch_size=BATCH_SIZE)
modules.pricing_functions.predict_dollar_price = predict_dollar_price    # replace the function of the same name in `modules.pricing_functions` with the notebook version defined above

Both `features_for_input_to_nn(...)` and `get_inputs_for_nn(...)` need to be overridden: the original functions return the inputs for the neural network as a list of dictionaries, as necessary for Vertex AI, whereas here the inputs need to be returned as an array list so that the model can be called directly. This is because we get the model as a file in this notebook instead of calling the currently deployed model on Vertex AI.

In [None]:
def features_for_input_to_nn(df, use_dollar_price_model):
    '''Builds the non-sequence inputs for the neural network as a list of arrays so that 
    the model file loaded in this notebook can be called directly (the deployed version 
    instead produces a list of dictionaries as necessary for vertex AI).

    The first item of the returned list is a single float64 array holding the 
    non-categorical and binary features, one column per feature. It is followed by one 
    float64 array per categorical feature containing that feature's encoded values. 
    `use_dollar_price_model` selects the dollar price feature lists and encoders instead 
    of the yield spread ones.'''
    # fetch the encoders on every call instead of caching them globally, so that updated encoders are always the ones used
    encoders = get_encoders(use_dollar_price_model)

    if use_dollar_price_model:
        numeric_columns = NON_CAT_FEATURES_DOLLAR_PRICE + BINARY_DOLLAR_PRICE
        categorical_columns = CATEGORICAL_FEATURES_DOLLAR_PRICE
    else:
        numeric_columns = NON_CAT_FEATURES + BINARY
        categorical_columns = CATEGORICAL_FEATURES

    # turn each numeric / binary column into a column vector and join the vectors side by side
    numeric_block = np.concatenate(
        [df[column].to_numpy().astype('float64')[:, np.newaxis] for column in numeric_columns],
        axis=-1,
    )
    # each categorical feature is encoded with its own encoder and kept as a separate input
    encoded_blocks = [encoders[column].transform(df[column]).astype('float64') for column in categorical_columns]
    return [numeric_block] + encoded_blocks
modules.data_preparation_for_pricing.features_for_input_to_nn = features_for_input_to_nn


def get_inputs_for_nn(df, use_dollar_price_model):
    '''Assembles the complete list of input arrays for the neural network so that the 
    model file loaded in this notebook can be called directly (the deployed version 
    instead produces a list of dictionaries as necessary for vertex AI).

    The returned list starts with the stacked trade history (taken from the 
    `trade_history_dollar_price` column when `use_dollar_price_model` is set, otherwise 
    from `trade_history`), then the stacked `target_attention_features`, and ends with 
    everything returned by `features_for_input_to_nn(...)`.'''
    history_column = 'trade_history_dollar_price' if use_dollar_price_model else 'trade_history'
    # every row of these columns holds an array; stacking them gives one array per column with the rows along the first axis
    sequence_inputs = [np.stack(df[name].to_numpy()) for name in (history_column, 'target_attention_features')]
    return sequence_inputs + features_for_input_to_nn(df, use_dollar_price_model)
modules.data_preparation_for_pricing.get_inputs_for_nn = get_inputs_for_nn

In [None]:
# price the CUSIPs in `cusip_list`, passing along the quantities, trade types, and `DATETIME_OF_INTEREST`
priced_df = price_cusips_list(cusip_list, quantity_list, trade_type_list, DATETIME_OF_INTEREST)
# the logging preparation is applied first and its result is then passed through the user output preparation
priced_df = prepare_batch_pricing_results_to_output_to_user(prepare_batch_pricing_results_for_logging(priced_df))
priced_df.head(10)    # show the first 10 rows as the cell output

Inspect the results and save the dataframe.

In [None]:
# keep only the rows whose `ytw` equals `NUMERICAL_ERROR` (presumably the placeholder value used when a CUSIP could not be priced -- verify against where `NUMERICAL_ERROR` is defined)
did_not_price = priced_df[priced_df['ytw'] == NUMERICAL_ERROR]
did_not_price    # display these rows as the cell output

In [None]:
# write the results to a CSV file named with `MODEL_DATE_AS_STRING`; `index=False` leaves the dataframe index out of the file
priced_df.to_csv(f'priced_{MODEL_DATE_AS_STRING}.csv', index=False)

Saving to an Excel spreadsheet causes the kernel to crash even though the cell works. Uncomment the cell below if the Excel spreadsheet format is needed.

In [None]:
# priced_df.to_excel(f'priced_{MODEL_DATE_AS_STRING}.xlsx', index=False)    # save the DataFrame to an excel file