In [None]:
'''
 # @ Create Time: 2022-01-14 17:44:00
 # @ Modified by: Gil
 # @ Modified time: 2024-08-12 11:55am PT
 # @ Description: This file implements functions from the pricing module
 
 '''
# ensures that any changes to the modules will be reloaded when this cell is run
%load_ext autoreload
%autoreload 2

import os
import pickle
import numpy as np
import pandas as pd

from google.cloud import bigquery

from ficc.utils.auxiliary_functions import sqltodf
from ficc.utils.process_features import process_features
from ficc.pricing.price import compute_price
from ficc.pricing.yield_rate import compute_yield

In [85]:
# Point the Google client libraries at a service-account key file, but only when
# the environment has not already been configured. The fallback below is a
# machine-specific path; set GOOGLE_APPLICATION_CREDENTIALS before starting the
# kernel to use a different key without editing this cell.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/Users/gil/git/ficc/creds.json')

### Conditions that we tested on:
* `yield > 0`: conceptually, this notebook should work for any yield, but in practice, a zero yield indicates a problem with the MSRB data, and negative yields have not been tested
* `is_non_transaction_based_compensation is false`, `is_lop_or_takedown is false`: conceptually, this notebook should work without these filters, but in practice, away-from-market prices sometimes have anomalous yields reported in the MSRB data
* `callable_at_cav is false`: this notebook will not be correct for bonds callable at cav; future work will incorporate the special case of cav bonds that have call prices from old reference data
* `ref_valid_to_date > timestamp(publish_datetime, "America/New_York")`, `timestamp(publish_datetime, "America/New_York") >= ref_valid_from_date`, `msrb_valid_to_date > publish_datetime`, `msrb_valid_from_date <= publish_datetime`: conditions to correctly join old reference data and MSRB
* `par_call_price = 100`: conceptually this is identical to the `callable_at_cav is false` condition above, but sometimes there are issues in the old reference data
* `interest_payment_frequency is not NULL`: if interest payment frequency is NULL, then the coupon frequency is unknown; possibly the field `coupon_type` could be used to determine the interest payment frequency

In [None]:
# Build the SQL text used to pull trades from the `msrb_sp_joined` table.
#
# `trade_date` is a date string (e.g. '2024-08-01'); it is concatenated verbatim
# between single quotes into the final `trade_date >= '...'` filter, so only pass
# trusted, well-formed date strings.
#
# Active filters: the reference-data and MSRB validity windows must contain
# `publish_datetime`, `interest_payment_frequency` must be one of the listed
# codes, and `default_exists` must be false. The remaining conditions are
# commented out inside the SQL itself.
#
# The returned string is also compared against the query stored in the pickle
# cache, so any edit to this text (even whitespace) forces a fresh download.
query = lambda trade_date: ''' 
SELECT
  IFNULL(settlement_date, assumed_settlement_date) AS settlement_date,
  trade_date,
  cusip,
  dated_date,
  par_traded,
  accrual_date,
  dollar_price,
  issue_price,
  coupon,
  interest_payment_frequency,
  next_call_date,
  par_call_date,
  next_call_price,
  par_call_price,
  maturity_date,
  previous_coupon_payment_date,
  next_coupon_payment_date,
  first_coupon_date,
  coupon_type,
  muni_security_type,
  called_redemption_type,
  refund_date,
  refund_price,
  is_callable,
  is_called,
  call_timing,
  yield,
  rtrs_control_number,
  has_zero_coupons,
  last_period_accrues_from_date,
  call_defeased,
  issue_amount,
  -- needed for `process_features(...)`
  maturity_amount,
  -- needed for `process_features(...)`
  orig_principal_amount,
  -- needed for `process_features(...)`
  max_amount_outstanding,
  -- needed for `process_features(...)`
  delivery_date,
  -- needed for `process_features(...)`
  next_sink_date    -- needed for `process_features(...)`
FROM
  `eng-reactor-287421.jesse_tests.msrb_sp_joined`
  -- `eng-reactor-287421.auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized`
WHERE
  -- yield > 0 and
  -- is_non_transaction_based_compensation is false and
  -- callable_at_cav is false and
  -- is_lop_or_takedown is false and
  DATETIME(ref_valid_to_date) > publish_datetime
  AND publish_datetime >= DATETIME(ref_valid_from_date)
  AND msrb_valid_to_date > publish_datetime
  AND msrb_valid_from_date <= publish_datetime
  AND interest_payment_frequency IN (1, 2, 3, 5, 16, 31) -- Semiannual, Monthly, Annually, Quarterly, AtMat, zcb
  and default_exists is false
  -- par_call_price is not NULL and
  -- par_call_price = 100 and
  -- interest_payment_frequency is not NULL and
  -- interest_payment_frequency = 16 and
  -- coupon > 0 and
  AND trade_date >= \'''' + trade_date + '''\'
'''

In [87]:
# Earliest trade date to pull (inclusive): passed to `query(...)`, where it becomes
# the `trade_date >= '...'` filter.
date = '2024-08-01'

In [88]:
# Name of the local pickle cache; it stores the tuple (query text, query result).
MUNI_DF_FILE_NAME = 'sp_joined.pkl' #'ice_joined.pkl' 

# The SQL text doubles as the cache key.
muni_df_query = query(date)

# Reuse the cached frame only when it was produced by exactly this query.
# NOTE: pickle.load executes arbitrary code; only keep cache files written by this notebook.
using_saved_muni_df = False
if os.path.exists(MUNI_DF_FILE_NAME):
    with open(MUNI_DF_FILE_NAME, 'rb') as file:
        muni_df_query_from_pkl, muni_df = pickle.load(file)
    using_saved_muni_df = (muni_df_query == muni_df_query_from_pkl)

# Cache miss (no file, or the query changed): hit BigQuery and refresh the cache.
if using_saved_muni_df is False:
    bqclient = bigquery.Client()
    muni_df = sqltodf(muni_df_query, bqclient)
    with open(MUNI_DF_FILE_NAME, 'wb') as file:
        pickle.dump((muni_df_query, muni_df), file)

In [89]:
# Keep only the trades that have an accrual date.
has_accrual_date = muni_df['accrual_date'].notna()
muni_df = muni_df[has_accrual_date]

In [None]:
# Spot-check every row for one CUSIP.
muni_df.loc[muni_df['cusip'] == '67868NCJ6']

In [None]:
# Run ficc's `process_features` over the trades and keep its result. The SQL query
# selects maturity_amount, orig_principal_amount, max_amount_outstanding,
# delivery_date and next_sink_date specifically because this call needs them.
muni_df = process_features(muni_df)

In [92]:
# From here on the reported yield lives in the 'ficc_ytw' column; 'yield' no longer exists.
muni_df = muni_df.rename({'yield': 'ficc_ytw'}, axis='columns')

Apply the `compute_price` function to every trade.

In [93]:
# Hold off - Convert all decimals in the df to float
# muni_df = muni_df[~(muni_df.is_callable & pd.isnull(muni_df.next_call_date))]

In [94]:
# Stop-gap (not a great solution): discard trades with no last_period_accrues_from_date.
has_last_period_start = muni_df['last_period_accrues_from_date'].notna()
muni_df = muni_df[has_last_period_start]

In [None]:
# Scratch lookups kept for debugging:
# muni_df[muni_df.cusip == '6500357E2'][['called_redemption_type','call_defeased']]
# muni_df[pd.isna(muni_df.maturity_date)]

# Row count before the manual exclusion below.
print(muni_df.shape[0])

# Drop one CUSIP by hand, then show what remains.
keep_rows = muni_df['cusip'] != '882723J34'
muni_df = muni_df.loc[keep_rows]
muni_df

In [None]:
# Call compute_price once per trade (row) and store whatever it returns, unmodified.
muni_df['price_calc_from_yield'] = muni_df.apply(compute_price, axis=1)

Plot the price from the reference data on the x-axis and our computed price on the y-axis.

In [None]:
# Inspect the frame now that the 'price_calc_from_yield' column has been added.
muni_df

In [None]:
# The computed price is element 0 of each compute_price result.
muni_df['price_from_yield'] = [calc_result[0] for calc_result in muni_df['price_calc_from_yield']]

# Reported price on x, computed price on y.
muni_df.plot(kind='scatter', x='dollar_price', y='price_from_yield', c='DarkBlue')

Return the sum and mean of the errors where the error is defined as the absolute distance between our computed price and the reference price.

In [99]:
# Absolute gap between the computed price and the reported dollar price.
muni_df['price_delta'] = (muni_df['price_from_yield'] - muni_df['dollar_price']).abs()

In [None]:

# Summarize the pricing error twice: over all rows, then over rows with a finite error.
# The f-strings use double quotes outside and single quotes inside, because reusing
# the same quote character inside an f-string is a SyntaxError before Python 3.12.
price_delta = muni_df['price_delta']

print(f"Sum of errors: {np.sum(price_delta)}")
print(f"Mean of errors: {np.mean(price_delta)}")

# Check for infinite and NaN values in 'price_delta'
num_infinite_values = np.isinf(price_delta).sum()
num_nan_values = price_delta.isna().sum()

print(f"{num_infinite_values} infinite values found in 'price_delta'")
print(f"{num_nan_values} NaN values found in 'price_delta'")

# Filter out rows with infinite or NaN values in 'price_delta'
muni_df_filtered = muni_df[~np.isinf(price_delta) & price_delta.notna()]

# Calculate the sum and mean of errors in the filtered DataFrame
sum_errors = np.sum(muni_df_filtered['price_delta'])
mean_errors = np.mean(muni_df_filtered['price_delta'])

print(f"Sum of errors: {sum_errors}")
print(f"Mean of errors: {mean_errors}")

In [None]:
# Bucket the absolute price error into three ranges and count the trades in each.
bins = [0, 0.1, 5, np.inf]

# pd.cut builds right-closed intervals, so without include_lowest=True a delta of
# exactly 0 (a perfect match) falls outside (0, 0.1], becomes NaN and silently
# disappears from the counts below.
binned_price_delta = pd.cut(muni_df['price_delta'], bins=bins, include_lowest=True)

# Count the occurrences in each bin
binned_price_delta.value_counts()


In [None]:
# Trades whose computed price is off by more than 10, worst first.
outlier_columns = ['cusip','ficc_ytw','dollar_price','price_calc_from_yield','price_from_yield','price_delta']
is_large_delta = muni_df['price_delta'] > 10
muni_df.loc[is_large_delta, outlier_columns].sort_values(by='price_delta', ascending=False)

In [103]:
# Persist the priced trades; the comparison cells below read this file back with
# pd.read_pickle('sp.pkl'). Those cells also read 'ice.pkl', which no cell in this
# notebook writes — presumably it comes from a separate run against the other
# reference-data source (TODO confirm).
muni_df.to_pickle('sp.pkl')

In [None]:
# Final look at the frame that was just written to 'sp.pkl'.
muni_df

## S&P Compare:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the relevant pickled data files.
# 'sp.pkl' is written earlier in this notebook by muni_df.to_pickle('sp.pkl');
# 'ice.pkl' must already exist on disk, since nothing in this notebook creates it.
# NOTE: unpickling executes arbitrary code, so only load files you produced yourself.
sp_df = pd.read_pickle('sp.pkl')
ice_df = pd.read_pickle('ice.pkl')

def analyze_and_plot(data, title, outlier_quantile=0.9995):
    """Scatter computed price against reported price and report the pricing error.

    The frame is modified in place: ``price_from_yield`` (element 0 of each
    ``price_calc_from_yield`` entry) and ``price_delta`` (absolute difference
    between ``price_from_yield`` and ``dollar_price``) are added or overwritten.
    Points are shaded by ``price_delta`` and the largest errors are annotated
    with their CUSIP, one label per CUSIP.

    Args:
        data: DataFrame with ``price_calc_from_yield``, ``dollar_price`` and
            ``cusip`` columns.
        title: Title of the figure.
        outlier_quantile: Quantile of ``price_delta`` above which a point is
            labelled. The default 0.9995 labels roughly the top 0.05% of rows.

    Returns:
        The same ``data`` object that was passed in, with the two columns set.
    """
    # matplotlib is already a dependency of this notebook (pyplot); import the
    # colormap class directly instead of reaching through plt.cm.colors.
    from matplotlib.colors import LinearSegmentedColormap

    # Derive the computed price and its absolute error.
    data['price_from_yield'] = [x[0] for x in data['price_calc_from_yield']]
    data['price_delta'] = abs(data['price_from_yield'] - data['dollar_price'])

    # Map the error range onto the darker 70% of the Purples colormap.
    norm = plt.Normalize(data['price_delta'].min(), data['price_delta'].max())
    purple_shades = plt.cm.Purples(np.linspace(0.3, 1, 256))
    custom_cmap = LinearSegmentedColormap.from_list("custom", purple_shades)
    color_values = custom_cmap(norm(data['price_delta']))

    plt.figure(figsize=(12, 8))
    plt.scatter(data['dollar_price'], data['price_from_yield'],
                color=color_values, edgecolors='none', alpha=0.7, s=30)

    # Label the worst errors; sorting first means each CUSIP is labelled at its
    # largest-error trade, and drop_duplicates keeps that first occurrence only.
    threshold = data['price_delta'].quantile(outlier_quantile)
    outliers = (
        data[data['price_delta'] > threshold]
        .sort_values('price_delta', ascending=False)
        .drop_duplicates(subset='cusip')
    )
    for cusip, reported_price, computed_price in zip(
            outliers['cusip'], outliers['dollar_price'], outliers['price_from_yield']):
        plt.annotate(cusip,
                     (reported_price, computed_price),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8, alpha=0.7)

    # Adding plot details
    plt.xlabel('Dollar Price')
    plt.ylabel('Price from Yield')
    plt.title(title)
    plt.tight_layout()
    plt.show()

    # Calculate and print the sum and mean of errors
    sum_errors = np.sum(data['price_delta'])
    mean_errors = np.mean(data['price_delta'])
    print(f'Sum of errors: {sum_errors}')
    print(f'Mean of errors: {mean_errors}')

    return data

# Plot and summarize each data set (analyze_and_plot also adds the price_delta column).
sp_analysis = analyze_and_plot(sp_df, 'S&P Data Analysis')
ice_analysis = analyze_and_plot(ice_df, 'old reference data Data Analysis')

outlier_report_columns = ['cusip', 'ficc_ytw', 'dollar_price', 'price_calc_from_yield', 'price_from_yield', 'price_delta']


def _distinct_outliers(analysis):
    """Return rows with price_delta > 10, worst first, keeping one row per CUSIP."""
    flagged = analysis.loc[analysis['price_delta'] > 10, outlier_report_columns]
    ranked = flagged.sort_values(by='price_delta', ascending=False)
    return ranked.drop_duplicates(subset=['cusip'])


sp_outliers = _distinct_outliers(sp_analysis)
ice_outliers = _distinct_outliers(ice_analysis)

# CUSIPs that are outliers in both data sets, with each side's values suffixed.
common_outliers = sp_outliers.merge(ice_outliers, on='cusip', suffixes=('_sp', '_ice'))

# Display the common outliers
print(common_outliers)

In [None]:
# Load the relevant pickled data files
# Load both result sets and keep one row per CUSIP (the first occurrence in each file).
sp_df = pd.read_pickle('sp.pkl').drop_duplicates(subset=['cusip'])
ice_df = pd.read_pickle('ice.pkl').drop_duplicates(subset=['cusip'])

# CUSIPs in 'sp.pkl' whose price_delta exceeds 1.
sp_lst = sp_df.loc[sp_df['price_delta'] > 1, 'cusip']

# Merge the dataframes on the 'cusip' field; shared columns get _ice / _sp suffixes.
combined_df = pd.merge(ice_df, sp_df, on='cusip', suffixes=('_ice', '_sp'))

print(len(sp_lst))

# Reuse the merge above instead of recomputing the identical join a second time.
merged_df = combined_df.drop_duplicates(subset=['cusip'])

# Same count restricted to CUSIPs that are present in both files.
print(len(merged_df[merged_df.price_delta_sp > 1]))


TBD D


In [None]:
# Load the relevant pickled data files
# Load the relevant pickled data files
sp_df = pd.read_pickle('sp.pkl')
ice_df = pd.read_pickle('ice.pkl')

# Merge the dataframes on the 'cusip' field, then keep the first row per CUSIP.
# NOTE(review): neither frame is de-duplicated before the merge, so a CUSIP with
# many trades in both files produces every pairing before drop_duplicates runs.
# De-duplicating first would be cheaper but changes the resulting row index
# (which is written to the CSV below) — confirm before switching.
combined_df = pd.merge(ice_df, sp_df, on='cusip', suffixes=('_ice', '_sp'))
combined_df = combined_df.drop_duplicates(subset=['cusip'])

# CUSIPs that price acceptably with the ICE data but poorly with the S&P data.
# NOTE(review): the two thresholds differ (<= 10 vs > 1) — confirm this is intended.
ice_good_sp_bad = combined_df[
    (combined_df['price_delta_ice'] <= 10) &
    (combined_df['price_delta_sp'] > 1)
]

# List of key fields to compare (each must exist in both files so that the merge
# produces a `<field>_ice` and a `<field>_sp` column).
key_fields = [
    'dollar_price',
    'price_from_yield',
    'price_delta',
    'next_call_date',
    'interest_payment_frequency',
    'first_coupon_date',
    'maturity_date',
    'dated_date',
    'accrual_date',
    'issue_price',
    'coupon',
    'par_call_date',
    'par_call_price',
    'next_coupon_payment_date',
    'coupon_type',
    'muni_security_type',
    'issue_amount',
    'maturity_amount',
    'delivery_date',
    'next_sink_date'
]

# Side-by-side comparison: cusip, then the _ice / _sp pair for every key field.
comparison_columns = ['cusip'] + [
    f'{field}{suffix}' for field in key_fields for suffix in ('_ice', '_sp')
]
comparison_df = ice_good_sp_bad[comparison_columns].copy()

# Display the comparison
print("Comparison of Key Fields for Outliers")
display(comparison_df)
comparison_df.to_csv('sp_ytp_data_comparison_analysis.csv')

# One record per CUSIP holding only the fields whose two values disagree.
# Records are collected in a list and converted once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
diff_records = []
for record in ice_good_sp_bad.to_dict('records'):
    diff_row = {'cusip': record['cusip']}
    for field in key_fields:
        ice_value = record[f"{field}_ice"]
        sp_value = record[f"{field}_sp"]
        # NaN/NaT never compare equal to themselves, so a field that is missing
        # on both sides must be skipped explicitly or it shows up as a difference.
        if pd.isna(ice_value) and pd.isna(sp_value):
            continue
        if ice_value != sp_value:
            diff_row[f"{field}_ice"] = ice_value
            diff_row[f"{field}_sp"] = sp_value
    diff_records.append(diff_row)
diff_comparison_df = pd.DataFrame(diff_records)

# Display the comparison of fields with differences
print("Comparison of Key Fields for Outliers with Differences")
display(diff_comparison_df)

In [None]:
# Rows where both data sets carry the same reported dollar price.
same_dollar_price = comparison_df['dollar_price_ice'].eq(comparison_df['dollar_price_sp'])
comparison_df.loc[same_dollar_price]

Plot the yield from the reference data on the x-axis and our computed yield on the y-axis.

In [None]:
# No visible cell creates 'yield_and_calc_date', so on a fresh kernel this cell
# would fail with a KeyError. Compute it here when it is missing.
# NOTE(review): assumes compute_yield accepts a trade row the same way
# compute_price does and returns a (yield, calc_date) pair — confirm.
if 'yield_and_calc_date' not in muni_df.columns:
    muni_df['yield_and_calc_date'] = muni_df.apply(compute_yield, axis=1)

muni_df['yield_from_price'] = [x[0] for x in muni_df['yield_and_calc_date']]
muni_df['yield_calc_date'] = [x[1] for x in muni_df['yield_and_calc_date']]

# The reported yield column is renamed from 'yield' to 'ficc_ytw' earlier in the
# notebook; fall back to 'yield' only if that rename has not been run.
reference_yield_col = 'ficc_ytw' if 'ficc_ytw' in muni_df.columns else 'yield'

# when we get close to redemption, it's difficult to compute the yield from the price
muni_df.plot.scatter(x=reference_yield_col, y='yield_from_price', c='DarkBlue')

Return the sum and mean of the errors where the error is defined as the absolute distance between our computed yield and the reference yield.

In [None]:
# The reported yield column is renamed from 'yield' to 'ficc_ytw' earlier in the
# notebook; fall back to 'yield' only if that rename has not been run.
reference_yield_col = 'ficc_ytw' if 'ficc_ytw' in muni_df.columns else 'yield'

# Absolute gap between the computed yield and the reported yield.
muni_df['ytw_delta'] = abs(muni_df['yield_from_price'] - muni_df[reference_yield_col])

# Double quotes outside, single inside: nesting the same quote character in an
# f-string is a SyntaxError before Python 3.12.
print(f"Sum of errors: {np.sum(muni_df['ytw_delta'])}")
print(f"Mean of errors: {np.mean(muni_df['ytw_delta'])}")