In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import time
import gc
import numpy as np
from google.cloud import bigquery
from google.cloud import storage
import gcsfs
from datetime import datetime
import matplotlib.pyplot as plt
import pickle5 as pickle

pd.set_option('display.float_format', lambda x: '%.3f' % x)
import warnings
warnings.simplefilter("ignore", UserWarning)

In [2]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/home/jupyter/ficc/isaac_creds.json"
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
pd.options.mode.chained_assignment = None

bq_client = bigquery.Client()
storage_client = storage.Client()

In [3]:
def load_data_from_pickle(path, bucket = 'isaac_data'):
    if os.path.isfile(path):
        print('File available, loading pickle')
        with open(path, 'rb') as f:
            data = pickle.load(f)
    else:
        print(f'File not available, downloading from cloud storage and saving to {path}')
        fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
        gc_path = os.path.join(bucket, path)
        print(gc_path)
        with fs.open(gc_path) as gf:
            data = pd.read_pickle(gf)
        with open(path, 'wb') as f:
            pickle.dump(data, f)
    return data

In [4]:
df = pd.read_csv('gs://enitre-universe-project/20231106_ficc_500k.csv')

errors = ['We do not provide an evaluated yield since previous MSRB reported yields for this CUSIP are missing or negative.',
          'We are developing an option to display yield to average life for Planned Amortization Class (PAC) bonds.']

In [5]:
(df['1000_sale_to_customer_price'] == -1).sum()/len(df), (df['1000_sale_to_customer_price'] == -1).sum()

(0.6028535781433787, 590943)

In [207]:
def checkcross(df, qty, prediction_col):
    if qty not in [500, 1000]  or prediction_col not in ['price','ytw']:
        raise ValueError(f"{qty} not in [500, 1000] or {prediction_col} not in ['price','ytw']")
        
    crossdiff = df[[f'{qty}_sale_to_customer_{prediction_col}', f'{qty}_purchase_from_customer_{prediction_col}']]
    f1 = crossdiff.isin(errors).any(axis=1) 
    crossdiff = crossdiff[~f1]
    
    for col in crossdiff.columns:
        crossdiff[col] = crossdiff[col].astype(float)
        
    f2 = (crossdiff==-1).any(axis=1)
    crossdiff = crossdiff[~f2]
    
    if prediction_col == 'ytw':
        crossdiff = crossdiff[~f].diff(axis=1).iloc[:,1] < 0
    else:
        crossdiff = crossdiff.diff(axis=1).iloc[:,1] > 0
        
    print(f'{"Col: "+ prediction_col:12} {"Qty: "+ str(qty):12} {"Num Cross: "+ str(crossdiff.sum()):15} {"Cross %: " + str(np.round(100*crossdiff.sum()/len(df), 4)):15} {"Invalid Preds: " + str(sum(f1)+sum(f2)):10}')

In [208]:
for prediction_col in ['ytw']:
    for qty in [500, 1000]:
        checkcross(df, qty, prediction_col)

Col: ytw     Qty: 500     Num Cross: 343  Cross %: 0.035  Invalid Preds: 432464
Col: ytw     Qty: 1000    Num Cross: 6    Cross %: 0.0006 Invalid Preds: 594119


In [186]:
df = pd.read_csv('gs://enitre-universe-project/20231106_ficc_500k.csv')

errors = ['We do not provide an evaluated yield since previous MSRB reported yields for this CUSIP are missing or negative.',
          'We are developing an option to display yield to average life for Planned Amortization Class (PAC) bonds.']

f = df.isin(errors).any(axis=1)

In [187]:
f = (df == -1).any(axis=1)

In [228]:
df[(df['500_sale_to_customer_price'] != -1)] #['500_sale_to_customer_ytw'].value_counts()

Unnamed: 0,cusip,500_sale_to_customer_price,500_sale_to_customer_ytw,500_yield_to_worst_date,500_purchase_from_customer_price,500_purchase_from_customer_ytw,1000_sale_to_customer_price,1000_sale_to_customer_ytw,1000_yield_to_worst_date,1000_purchase_from_customer_price,1000_purchase_from_customer_ytw
1,000379BY2,100.223,4.619,03-01-2024,100.177,4.763,-1.000,-1,The quantity attempting to be priced is larger...,-1.000,-1
2,000379BZ9,101.360,We do not provide an evaluated yield since pre...,,100.973,We do not provide an evaluated yield since pre...,101.361,We do not provide an evaluated yield since pre...,,101.054,We do not provide an evaluated yield since pre...
6,000379CF2,95.536,6.183,09-01-2028,94.912,6.339,95.524,6.186,09-01-2028,94.887,6.346
7,000379CG0,97.620,5.74,09-01-2038,96.022,5.906,97.049,5.799,09-01-2038,95.521,5.958
35,00037CUP3,100.004,4.698,11-15-2023,100.000,4.858,100.003,4.755,11-15-2023,99.999,4.9030000000000005
...,...,...,...,...,...,...,...,...,...,...,...
980211,989786FF0,67.786,5.27,02-01-2042,66.480,5.415,67.940,5.254,02-01-2042,66.602,5.401
980212,989786FG8,66.705,5.437,02-01-2043,65.344,5.587,67.030,5.402,02-01-2043,65.675,5.55
980213,989786FH6,66.667,5.339,02-01-2044,65.319,5.483,66.848,5.32,02-01-2044,65.478,5.465
980214,989786FJ2,64.499,5.472,02-01-2045,63.160,5.617,64.627,5.458,02-01-2045,63.244,5.607


In [None]:
CUSIP_ERROR_MESSAGE = {'invalid': 'CUSIP is invalid', 
                       'not_found': 'CUSIP not supported', 
                       'not_outstanding': 'CUSIP is no longer outstanding', 
                       'defaulted': 'CUSIP has defaulted', 
                       'maturing_before_settlement_date': 'CUSIP is maturing very soon or has already matured', 
                       'not_bonds': 'CUSIP is not supported because we do not support Anticipation Notes, Certificates of Obligation, Warrants, or Commercial Paper', 
                       'insufficient_data': 'One or more of the following dates necessary to compute yield has not been reported for this CUSIP: dated date, interest payment/coupon date, maturity date, coupon (interest) rate', 
                       'negative_yield_in_history': 'MSRB reported yields for this CUSIP are missing or negative. Support for this CUSIP is coming soon!', 
                       'irregular_coupon_rate': 'This CUSIP has an irregular/variable coupon rate or interest payment frequency. Support for this CUSIP is coming soon!', 
                       'high_yield_in_history': 'MSRB reported yields for this CUSIP are abnormally high (greater than 10%). Email us at myles@ficc.ai if you would like to see this CUSIP supported!', 
                       'null_dollar_price_in_history': 'MSRB reported prices are missing for this CUSIP. Support for this CUSIP is coming soon!', 
                       'quantity_greater_than_outstanding_amount': 'The quantity attempting to be priced is larger than the amount outstanding for this CUSIP'}