# Comparing two CUSIPs
Last updated by Developer on 12/26/2023.

This notebook compares the inputs of two user-specified CUSIPs and determines which inputs are different. This notebook is most effective in the following example situation: CUSIP A prices at 110 and CUSIP B prices at 80, even though both CUSIPs have very similar features, and we want to determine which inputs to the model are different and what could be causing the large price discrepancy.

The core idea is to reuse as much of the deployed code as possible (i.e., the code in `app_engine/demo/server/modules/finance.py`) so that the results stay consistent with what is deployed.

In [None]:
import numpy as np
import pandas as pd

In [None]:
import os
import sys


# In a Jupyter Notebook, the `__file__` variable is not automatically defined because notebooks do not run as standard Python scripts, so it is set by hand
__file__ = os.path.abspath('compare_model_inputs_for_two_cusips.ipynb')
# Get the directory containing the 'app_engine/demo/server' package, resolved relative to this notebook's location
server_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'app_engine', 'demo', 'server'))
if server_dir not in sys.path:    # keep the cell idempotent: re-running it must not append the same directory again
    sys.path.append(server_dir)    # add the directory to sys.path


# Every `modules...` import is placed after `server_dir` has been added to `sys.path` so that the cell 
# also works on a fresh kernel (Restart Kernel -> Run All) instead of relying on previously imported state
from modules.ficc.utils.auxiliary_variables import NON_CAT_FEATURES, \
                                                   BINARY, \
                                                   NON_CAT_FEATURES_DOLLAR_PRICE, \
                                                   BINARY_DOLLAR_PRICE


# `get_creds()` is called before the remaining `modules...` imports, preserving the original statement order
from modules.get_creds import get_creds
get_creds()


from modules.auxiliary_variables import PRICE_BOTH_DIRECTIONS_TO_CORRECT_INVERSION, \
                                        NUMERICAL_ERROR, \
                                        YEAR_MONTH_DAY, \
                                        YEAR_MONTH_DAY_HOUR_MIN_SEC
from modules.auxiliary_functions import get_current_datetime, \
                                        datetime_as_string, \
                                        get_settlement_date, \
                                        get_outstanding_amount
from modules.pricing_functions import get_trade_price_from_yield_spread_model, \
                                      predict_spread, \
                                      predict_dollar_price
from modules.data_preparation_for_pricing import process_data_for_pricing, \
                                                 get_data_from_redis, \
                                                 reverse_direction_concat, \
                                                 pre_processing, \
                                                 get_inputs_for_nn
from modules.batch_pricing import add_ytw_price_calculationdate_coupon, \
                                  prepare_batch_pricing_results_for_logging, \
                                  prepare_batch_pricing_results_to_output_to_user
from modules.exclusions import CUSIP_ERROR_MESSAGE

Set `CUSIP_0` and `CUSIP_1` as the two CUSIPs to compare. Set the `QUANTITY` and `TRADE_TYPE` to be that of the target trades.

In [None]:
# The two CUSIPs whose model inputs are compared; they are priced together in a single batch
CUSIP_0 = '64971XQM3'
CUSIP_1 = '64971XDT2'
QUANTITY = 500    # quantity of the target trade, used for both CUSIPs; NOTE(review): the units are not stated in this notebook, confirm what `price_cusips_list(...)` expects
TRADE_TYPE = 'S'    # P: Purchase from Customer (Bid Side), S: Sale to Customer (Offered Side), D: Inter-Dealer

In [None]:
CUSIP_LIST = [CUSIP_0, CUSIP_1]    # order matters: `compare_inputs(...)` reports position 0 as `CUSIP_0` and position 1 as `CUSIP_1`

We modify `price_cusips_list(...)` and associated functions in order to get the model inputs along with the priced CUSIPs. The model inputs are created in `get_inputs_for_nn(...)`.

In [None]:
COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN = 'inputs_for_nn'    # name of the extra dataframe column that carries the result of `get_inputs_for_nn(...)` alongside each priced row

In [None]:
def check_inversion_and_flip_prices_batch(df):
    '''Inverts dealer buy and dealer sell prices, ytw, calc_date and model inputs for a dataframe with both 
    sides of the trade priced. 

    Each original trade is indexed by the 'id' column, which is used to create a dictionary of the swappable 
    outputs (price, ytw, calc_date and the model inputs) of the other side of the trade. Using 'id' as a key 
    enables multiple trades to have the same cusip without conflict. It is then referenced to invert 
    predictions if predictions cross, i.e., if the dealer sell ('S') price is below the dealer purchase ('P') 
    price.

    `df` must contain the columns 'trade_type', 'original_trade', 'id' and every column in `outputs_to_swap`. 
    Returns the original 'S' / 'P' trades (with crossed predictions replaced by those of the opposite side) 
    followed by the remaining original trades; the hypothetical opposite-side rows are not returned.'''
    S_idx = df.trade_type == 'S'
    P_idx = df.trade_type == 'P'
    SP_idx = S_idx | P_idx
    original_trades = df.original_trade
    outputs_to_swap = ['price', 'ficc_ytw', 'calc_date', COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN]    # this line was changed from the original function; 'price' must remain the first item since it is read positionally below

    # the ids of the original trades are paired positionally with the opposite-side rows, so this relies on both subsets being in the same order
    opposite_side_dict = dict(zip(df[SP_idx & original_trades]['id'], 
                                  df[SP_idx & ~original_trades][outputs_to_swap].to_records(index=False)))

    def invert_row(row):
        '''Replaces the swappable outputs of a single row with those of the other side of the trade if the 
        prices cross, taking reference from a dictionary of the opposite-side outputs keyed by trade id.'''
        opposite_side_outputs = opposite_side_dict[row['id']]
        opposite_side_price = opposite_side_outputs[0]    # 'price' is the first item of `outputs_to_swap`
        if row['trade_type'] == 'S':
            S_price, P_price = row['price'], opposite_side_price
        else:
            S_price, P_price = opposite_side_price, row['price']
        if S_price < P_price:
            # assign one column at a time so that the number of values always matches `outputs_to_swap`, 
            # and so that a container-valued entry (the model inputs) is stored as a single cell value
            for column, value in zip(outputs_to_swap, opposite_side_outputs):
                row[column] = value
        return row
    
    df_SP = df[SP_idx & original_trades].apply(invert_row, axis=1)
    return pd.concat([df_SP, df[~SP_idx & original_trades]])

In [None]:
def get_ytw_dollar_price_for_list(df, current_datetime, quantity_list, trade_type, current_date, settlement_date):
    '''Prices every row of `df` and additionally returns the model inputs that were used for each row. 
    `df`, `quantity_list`, `trade_type`, `current_date`, `settlement_date` and `current_datetime` are 
    forwarded to `process_data_for_pricing(...)`.
    NOTE: This was modified to handle inverted trades. This is done after `process_data_for_pricing(...)` 
    is called to minimize repeated data retrieval and processing.

    Returns a pair of dataframes that are both sorted by index so that their rows line up: 
    (1) the priced dataframe without the model inputs column, and 
    (2) a dataframe with only the model inputs column and the 'model_used' column.
    If `pre_processing(...)` returns a string, that string is returned as is instead of the pair.'''
    df = process_data_for_pricing(df, quantity_list, trade_type, current_date, settlement_date, current_datetime, True)    # get reference data and feature engineering 
    
    if PRICE_BOTH_DIRECTIONS_TO_CORRECT_INVERSION:
        # separate 'S' and 'P' trades from 'D' trades; `.copy()` so that the new columns are added to independent dataframes and not to slices of `df`
        df_SP = df[df.trade_type != 'D'].copy()
        df_D = df[df.trade_type == 'D'].copy()

        df_SP['id'] = range(len(df_SP))    # index each 'S' or 'P' trade with an id for easier identification of rows at the trade-level rather than cusip-level
        df_SP['original_trade'] = True     # `original_trade` flag is used to return only the trades that were priced initially, not the hypothetical prices for the other side of the trade
        df_D['original_trade'] = True
        
        df_SP = reverse_direction_concat(df_SP)
        df = pd.concat([df_SP, df_D])
        del df_SP, df_D

    df = pre_processing(df)

    # NOTE(review): the caller in this notebook unpacks two return values, so a string return is not handled there; confirm what the string represents and handle it in the caller
    if isinstance(df, str): return df

    use_yield_spread_model = df['model_used'] == 'yield_spread'
    df_yield_spread = df[use_yield_spread_model].copy()    # use the yield spread model on these CUSIPs; `.copy()` since columns are assigned below
    df_dollar_price = df[~use_yield_spread_model].copy()    # use the dollar price model on these CUSIPs; `.copy()` since columns are assigned below
    del use_yield_spread_model
    del df

    if len(df_yield_spread) > 0:
        df_list = get_inputs_for_nn(df_yield_spread, use_dollar_price_model=False)
        ys = predict_spread(df_list)
        ys = np.array(ys) / 100
        ys = ys.ravel()    # `np.ravel` returns a contiguous flattened array
        df_yield_spread['yield_spread'] = ys    # used for logging
        yc = np.array(df_yield_spread['ficc_ycl']) / 100    # changing yield_curve_level to ficc_ycl. This now comes from the data package
        ytw = np.add(ys, yc)
        df_yield_spread['ficc_ytw'] = ytw
        df_yield_spread['ficc_ycl'] = yc    # used for logging
        df_yield_spread['price'], df_yield_spread['calc_date'] = zip(*df_yield_spread.apply(get_trade_price_from_yield_spread_model, axis=1))
        df_yield_spread[COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN] = df_list    # this line was added from the original function

    if len(df_dollar_price) > 0:
        df_list_dollar_price = get_inputs_for_nn(df_dollar_price, use_dollar_price_model=True)
        estimated_dollar_price = predict_dollar_price(df_list_dollar_price)    
        estimated_dollar_price = np.array(estimated_dollar_price).ravel()
        df_dollar_price['price'] = estimated_dollar_price
        # the dollar price model only estimates a price, so the yield related outputs are left empty
        df_dollar_price['ficc_ytw'] = None
        df_dollar_price['calc_date'] = None
        df_dollar_price['yield_spread'] = None
        df_dollar_price['ficc_ycl'] = None
        df_dollar_price[COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN] = df_list_dollar_price    # this line was added from the original function
    
    df = pd.concat([df_yield_spread, df_dollar_price])
    if PRICE_BOTH_DIRECTIONS_TO_CORRECT_INVERSION:
        df = check_inversion_and_flip_prices_batch(df)
        df = df[df.original_trade]    # return only the original trades

    # sort once before splitting so that both returned dataframes have their rows in the same order; 
    # otherwise the model inputs keep the concatenation order (yield spread rows first) while the priced dataframe is sorted by index
    df = df.sort_index()
    return df.drop(columns=COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN), df[[COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN, 'model_used']]    # this line was changed from the original function

In [None]:
def price_cusips_list(cusip_list, quantity_list, trade_type_list, current_datetime=None):
    '''This function takes a list of CUSIPs and returns price and YTW estimates for each. The optional 
    argument `current_datetime` is used for calling `price_cusips_list(...)` outside of the `finance.py` 
    and to manually put in a datetime of choice for point in time pricing.

    `quantity_list` and `trade_type_list` are indexed by the index of the dataframes returned from 
    `get_data_from_redis(cusip_list)`, so they must have one entry per item in `cusip_list`.

    Returns a pair: (1) a dataframe sorted by index with one row per CUSIP, where the CUSIPs that were not 
    priced have their output columns filled with error values and an error message, and (2) the dataframe 
    of model inputs and the model used, as returned from `get_ytw_dollar_price_for_list(...)`, which is 
    empty if no CUSIP reached the pricing step.'''
    # default for when no CUSIP reaches the pricing step, so that the `return` below never refers to an unassigned variable
    inputs_for_nn_and_model = pd.DataFrame(columns=[COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN, 'model_used'])

    cusips_can_be_priced_df, cusips_cannot_be_priced_df = get_data_from_redis(cusip_list)
    if len(cusips_cannot_be_priced_df) != 0: print(cusips_cannot_be_priced_df)    # good to have this output to check why certain CUSIPs cannot be priced
    quantity_list = np.array(quantity_list)    # converted to numpy list in order to easily index by list
    trade_type_list = np.array(trade_type_list)    # converted to numpy list in order to easily index by list

    cusip_indices_that_can_be_priced = list(cusips_can_be_priced_df.index.values)
    quantities_for_cusips_that_can_be_priced = quantity_list[cusip_indices_that_can_be_priced]
    cusips_can_be_priced_df['quantity'] = quantities_for_cusips_that_can_be_priced
    cusips_can_be_priced_df['non_log_transformed_quantity'] = quantities_for_cusips_that_can_be_priced    # this is used for later restoring the non-log10 transformed quantities and quantities for CUSIPs not priced to the dataframe
    cusips_can_be_priced_df['trade_type'] = trade_type_list[cusip_indices_that_can_be_priced]
    
    cusip_indices_that_cannot_be_priced = list(cusips_cannot_be_priced_df.index.values)
    cusips_cannot_be_priced_df['quantity'] = quantity_list[cusip_indices_that_cannot_be_priced]
    cusips_cannot_be_priced_df['trade_type'] = trade_type_list[cusip_indices_that_cannot_be_priced]

    def _fill_basic_error_columns(df):
        '''Fills the output columns of `df` with the values used for CUSIPs that were not priced.'''
        df['ytw'] = NUMERICAL_ERROR
        df['ytw_LOGGING_PRECISION'] = NUMERICAL_ERROR
        df['price'] = NUMERICAL_ERROR
        df['yield_spread'] = NUMERICAL_ERROR
        df['ficc_ycl'] = NUMERICAL_ERROR
        df['coupon'] = pd.NA
        df['security_description'] = pd.NA
        df['maturity_date'] = pd.NA
        df['model_used'] = None
        df['reason_for_using_dollar_price_model'] = None
        return df

    def fill_error_columns(df, message_key, series_with_error_message_per_row=None):
        '''Puts the error message for `message_key` into the 'yield_to_worst_date' column and fills the 
        remaining output columns with error values. If `series_with_error_message_per_row` is given, then 
        `CUSIP_ERROR_MESSAGE[message_key]` is called with each of its values to create a message per row.'''
        if len(df) == 0: return df

        if series_with_error_message_per_row is None:
            message = CUSIP_ERROR_MESSAGE[message_key]
        else:
            message = series_with_error_message_per_row.apply(lambda error_message: CUSIP_ERROR_MESSAGE[message_key](error_message))
        
        df['yield_to_worst_date'] = message
        return _fill_basic_error_columns(df)
    
    def fill_all_error_columns(df):
        '''Same as `fill_error_columns(...)`, but the message key of each row is taken from the 'message' 
        column of `df`.'''
        if len(df) == 0: return df
        grouped_by_message = df.groupby('message')
        df['yield_to_worst_date'] = grouped_by_message.message.transform(lambda x: CUSIP_ERROR_MESSAGE[x.name])    # assign a value to each group: https://stackoverflow.com/questions/69951813/groupby-specific-column-then-assign-new-values-base-on-conditions
        return _fill_basic_error_columns(df)

    if len(cusips_can_be_priced_df) != 0:
        if current_datetime is None: current_datetime = get_current_datetime()    # `current_datetime` will only not be `None` if we are doing point in time pricing, and so will be passed in as an optional argument
        current_date = datetime_as_string(current_datetime, precision='day')
        current_datetime = datetime_as_string(current_datetime)
        settlement_date = get_settlement_date(current_date)

        # `.copy()` is used on every slice that is modified afterwards so that the columns are set on an independent dataframe and not on a slice of another dataframe
        settlement_date_after_maturity_date = cusips_can_be_priced_df['maturity_date'] <= settlement_date
        cusips_can_be_priced_df_settlement_date_after_maturity_date = cusips_can_be_priced_df[settlement_date_after_maturity_date].copy()
        cusips_can_be_priced_df_settlement_date_after_maturity_date = fill_error_columns(cusips_can_be_priced_df_settlement_date_after_maturity_date, 'maturing_soon')
        cusips_can_be_priced_df = cusips_can_be_priced_df[~settlement_date_after_maturity_date]

        outstanding_amount = get_outstanding_amount(cusips_can_be_priced_df, batch_pricing=True)
        outstanding_amount = outstanding_amount.fillna(np.inf)    # ensures that the condition of whether the quantity is greater than the amount outstanding will always be `False` if `outstanding_amount` does not exist
        outstanding_amount = outstanding_amount.replace(0, np.inf)    # ensures that the condition of whether the quantity is greater than the amount outstanding will always be `False` if `outstanding_amount` is 0
        quantity_greater_than_outstanding_amount = cusips_can_be_priced_df['quantity'] > outstanding_amount
        cusips_can_be_priced_quantity_greater_than_outstanding_amount = cusips_can_be_priced_df[quantity_greater_than_outstanding_amount].copy()
        cusips_can_be_priced_quantity_greater_than_outstanding_amount = fill_error_columns(cusips_can_be_priced_quantity_greater_than_outstanding_amount, 'quantity_greater_than_outstanding_amount', outstanding_amount[quantity_greater_than_outstanding_amount])
        cusips_can_be_priced_df = cusips_can_be_priced_df[~quantity_greater_than_outstanding_amount]
        
        if len(cusips_can_be_priced_df) > 0:    # only attempt to price cusips if there are any remaining after removing those where the settlement date is after the maturity date and where the quantity is greater than the outstanding_amount
            cusips_can_be_priced_df, inputs_for_nn_and_model = get_ytw_dollar_price_for_list(cusips_can_be_priced_df, 
                                                                                             pd.to_datetime(current_datetime, format=YEAR_MONTH_DAY_HOUR_MIN_SEC), 
                                                                                             cusips_can_be_priced_df['quantity'].values, 
                                                                                             cusips_can_be_priced_df['trade_type'].values, 
                                                                                             pd.to_datetime(current_date, format=YEAR_MONTH_DAY), 
                                                                                             settlement_date)    # this line was changed from the original function
            # converting estimated yield to dollar price
            cusips_can_be_priced_df = add_ytw_price_calculationdate_coupon(cusips_can_be_priced_df)

            cusips_can_be_priced_df = pd.concat([cusips_can_be_priced_df, cusips_can_be_priced_df_settlement_date_after_maturity_date, cusips_can_be_priced_quantity_greater_than_outstanding_amount])
            cusips_can_be_priced_df['quantity'] = cusips_can_be_priced_df['non_log_transformed_quantity']    # put the non-log10 transformed quantity back into the dataframe

            # refuse to price CUSIPs within 60 days of the calc date
            not_null_calc_date = cusips_can_be_priced_df['calc_date'].notnull()
            if not_null_calc_date.sum() > 0:    # only enter if at least one CUSIP has a calc date
                cusips_with_calc_date = cusips_can_be_priced_df[not_null_calc_date].copy()
                cusips_wo_calc_date = cusips_can_be_priced_df[~not_null_calc_date]
                DAYS_TO_CALC_DATE_COLUMN_NAME = 'days_to_calc_date'
                # NOTE(review): `diff_in_days_two_dates` is not imported in any of the visible cells of this notebook, so this line raises a `NameError` unless the name is already defined; import it from the module that the deployed code imports it from
                cusips_with_calc_date[DAYS_TO_CALC_DATE_COLUMN_NAME] = cusips_with_calc_date.apply(lambda row: diff_in_days_two_dates(row['calc_date'], row['settlement_date']), axis=1)
                within_60_days_of_calc_date = cusips_with_calc_date[DAYS_TO_CALC_DATE_COLUMN_NAME] <= 60
                cusips_with_calc_date = cusips_with_calc_date.drop(columns=DAYS_TO_CALC_DATE_COLUMN_NAME)
                cusips_within_60_days_of_calc_date = cusips_with_calc_date[within_60_days_of_calc_date].copy()
                cusips_not_within_60_days_of_calc_date = cusips_with_calc_date[~within_60_days_of_calc_date]
                cusips_within_60_days_of_calc_date = fill_error_columns(cusips_within_60_days_of_calc_date, 'maturing_soon')
                cusips_can_be_priced_df = pd.concat([cusips_wo_calc_date, cusips_not_within_60_days_of_calc_date, cusips_within_60_days_of_calc_date])
        else:    # all of the CUSIPs in the original `cusips_can_be_priced_df` are now in either `cusips_can_be_priced_df_settlement_date_after_maturity_date` or `cusips_can_be_priced_quantity_greater_than_outstanding_amount`
            cusips_can_be_priced_df = pd.concat([cusips_can_be_priced_df_settlement_date_after_maturity_date, cusips_can_be_priced_quantity_greater_than_outstanding_amount])
    cusips_cannot_be_priced_df = fill_all_error_columns(cusips_cannot_be_priced_df)
    cusips_df = pd.concat([cusips_can_be_priced_df, cusips_cannot_be_priced_df])
    return cusips_df.sort_index(), inputs_for_nn_and_model    # this line was changed from the original function

We batch price the CUSIPs in `CUSIP_LIST` at `QUANTITY` and `TRADE_TYPE` instead of individually pricing them one after the other to ensure that the trade datetime is the same.

In [None]:
# One quantity and one trade type per CUSIP; the lengths are derived from `CUSIP_LIST` so that the three lists always stay in sync
priced_df, inputs_for_nn_and_model = price_cusips_list(CUSIP_LIST, [QUANTITY] * len(CUSIP_LIST), [TRADE_TYPE] * len(CUSIP_LIST))
priced_df = prepare_batch_pricing_results_to_output_to_user(prepare_batch_pricing_results_for_logging(priced_df))
priced_df

Extract the NN inputs and the model used for each CUSIP, and print the inputs that differ between the two CUSIPs.

In [None]:
def compare_inputs(inputs_for_nn, model_used):
    '''Prints the model inputs that differ between the two CUSIPs being compared.

    `inputs_for_nn` holds, per CUSIP, a mapping from the name of a model input to its value, and 
    `model_used` holds, per CUSIP, the name of the model that was used. Both must have exactly two entries, 
    where position 0 is reported as `CUSIP_0` and position 1 as `CUSIP_1`. 

    The inputs are only compared if both CUSIPs use the same model ('yield_spread' or 'dollar_price'); 
    otherwise a message is printed and nothing is compared. The trade history of both CUSIPs is always 
    printed in full. Raises a `ValueError` if there are not exactly two entries. Returns `None`.'''
    if len(inputs_for_nn) != 2 or len(model_used) != 2:
        raise ValueError(f'Expected model inputs for exactly two CUSIPs, but got {len(inputs_for_nn)}; check that both CUSIPs could be priced')

    def values_differ(value_0, value_1):
        '''Returns whether two values differ, also supporting array-like values with more than one item.'''
        try:
            return bool(value_0 != value_1)
        except ValueError:    # raised when the comparison is element-wise and hence cannot be reduced to a single boolean
            return not np.array_equal(value_0, value_1)

    def print_itemwise_differences(name, items_0, items_1, label_for_idx):
        '''Prints every position at which `items_0` and `items_1` differ, labeled with `label_for_idx(idx)`.'''
        if len(items_0) != len(items_1):    # report the mismatch instead of failing with an `IndexError`; the common positions are still compared below
            print(f'{name} has different lengths:\t{len(items_0)} vs {len(items_1)}')
        for idx, (item_0, item_1) in enumerate(zip(items_0, items_1)):
            if values_differ(item_0, item_1): print(f'{label_for_idx(idx)} is different:\t{item_0} vs {item_1}')

    inputs_for_nn_0 = inputs_for_nn.iloc[0]
    inputs_for_nn_1 = inputs_for_nn.iloc[1]
    model_0, model_1 = model_used.tolist()
    print(f'{CUSIP_0} using {model_0} model\tvs\t{CUSIP_1} using {model_1} model')
    
    if model_0 == model_1 == 'yield_spread':
        non_cat_and_binary_features_labels = NON_CAT_FEATURES + BINARY
    elif model_0 == model_1 == 'dollar_price':
        non_cat_and_binary_features_labels = NON_CAT_FEATURES_DOLLAR_PRICE + BINARY_DOLLAR_PRICE
    else:
        print('Models are different')
        return None

    def non_cat_and_binary_feature_label(idx):
        '''Returns the feature name for position `idx`, falling back to the position itself if there is no name for it.'''
        if idx < len(non_cat_and_binary_features_labels): return non_cat_and_binary_features_labels[idx]
        return f'NON_CAT_AND_BINARY_FEATURES[{idx}]'
    
    for input_feature, input_value_0 in inputs_for_nn_0.items():
        input_value_1 = inputs_for_nn_1[input_feature]
        if input_feature == 'trade_history_input':    # the trade histories are printed in full instead of being compared item by item
            print(f'Trade history for {CUSIP_0}')
            print(input_value_0)
            print(f'Trade history for {CUSIP_1}')
            print(input_value_1)
        elif input_feature == 'target_attention_input':
            # since it is a one item list, we extract the value
            print_itemwise_differences(f'{input_feature}[0]', input_value_0[0], input_value_1[0], lambda idx: f'{input_feature}[0][{idx}]')
        elif input_feature == 'NON_CAT_AND_BINARY_FEATURES':
            print_itemwise_differences(input_feature, input_value_0, input_value_1, non_cat_and_binary_feature_label)
        else:
            if values_differ(input_value_0, input_value_1): print(f'{input_feature} is different:\t{input_value_0} vs {input_value_1}')

In [None]:
inputs_for_nn, model_used = inputs_for_nn_and_model[COLUMN_NAME_FOR_RESULTS_OF_GET_INPUTS_FOR_NN], inputs_for_nn_and_model['model_used']
compare_inputs(inputs_for_nn, model_used)