This notebook corresponds to the cloud function: `train_daily_yield_curve`.

In [1]:
import os
# Point the Google Cloud client libraries at a credentials file for authentication.
# NOTE(review): 'creds.json' is a relative path (resolved against the kernel's working directory) and this
# assignment overwrites any value already present in the environment -- confirm that is intended.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'creds.json'

In [2]:
import numpy as np
import pandas as pd
from pandas.tseries.offsets import CustomBusinessDay
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
import redis
import pickle

from google.cloud import bigquery

In [None]:
# Offset of one business day on the US federal holiday calendar; adding or subtracting it skips over
# holidays (and, with the default week mask, weekends). `main` uses it to shift the S&P effective date forward.
BUSINESS_DAY = CustomBusinessDay(calendar=USFederalHolidayCalendar())    # used to skip over holidays when adding or subtracting business days

In [None]:
# Date (formatted as YYYY-MM-DD) for which `main` fits the yield curve. `main` compares this string against the
# string form of the dates returned by `load_maturity_data`, and those queries only fetch the latest row per table.
TARGET_DATE = '2023-06-16'

In [None]:
# Google Cloud project passed to `pd.read_gbq` and used as the prefix of the BigQuery table ids built in this notebook.
PROJECT_ID = 'eng-reactor-287421'
# BigQuery dataset holding the coefficient, scaler-parameter and shape-parameter tables referenced in this notebook.
DATASET_NAME = 'yield_curves_v2'

In [3]:
# Fully qualified BigQuery tables that `upload_data` appends to: the fitted Nelson-Siegel coefficients and
# the StandardScaler parameters, respectively.
TABLE_ID_MODEL = f'{PROJECT_ID}.{DATASET_NAME}.nelson_siegel_coef_daily'
TABLE_ID_SCALER = f'{PROJECT_ID}.{DATASET_NAME}.standardscaler_parameters_daily' 

# Tables of the `spBondIndex` dataset read by `load_index_data` (one S&P index per table).
# The order matters: entry i here must describe the same index as entry i of `sp_maturity_tables`,
# because `load_index_data` renames its columns positionally with `sp_maturity_tables`.
sp_index_tables = ['sp_12_22_year_national_amt_free_index',
                   'sp_15plus_year_national_amt_free_index',
                   'sp_7_12_year_national_amt_free_municipal_bond_index_yield',
                   'sp_muni_high_quality_index_yield',
                   'sp_high_quality_intermediate_managed_amt_free_municipal_bond_index_yield',
                   'sp_high_quality_short_intermediate_municipal_bond_index_yield',
                   'sp_high_quality_short_municipal_bond_index_yield',
                   'sp_long_term_national_amt_free_municipal_bond_index_yield']

# Tables of the `spBondIndexMaturities` dataset read by `load_maturity_data`; these names are also used as the
# column names of the dataframes returned by both loaders.
# NOTE(review): the last entry keeps the `_municipal_bond_index_yield` suffix (identical to the entry in
# `sp_index_tables`) while the other maturity tables do not -- confirm this is the real table name.
sp_maturity_tables = ['sp_12_22_year_national_amt_free_index',
                      'sp_15plus_year_national_amt_free_index',
                      'sp_7_12_year_national_amt_free_index',
                      'sp_high_quality_index',
                      'sp_high_quality_intermediate_managed_amt_free_index',
                      'sp_high_quality_short_intermediate_index',
                      'sp_high_quality_short_index',
                      'sp_long_term_national_amt_free_municipal_bond_index_yield']

In [4]:
def load_index_data() -> pd.DataFrame:
    '''Load the most recent yield to worst (YTW) of every S&P index into a single dataframe.

    For each table in `sp_index_tables` only the row with the latest `date` is queried. The per-table 
    results are placed side by side, so dates on which a table has no row show up as NaN and are 
    forward filled from the previous date.

    Returns:
        Dataframe indexed by `date` with one column per index. The columns are named after 
        `sp_maturity_tables` (not `sp_index_tables`) so that they line up with the dataframe returned 
        by `load_maturity_data()`. Values are the queried `ytw` multiplied by 100.
    '''
    index_data = []
    for table in sp_index_tables:
        # takes the most recent `date` which refers to the date on which we grabbed the value from S&P
        query = f'''SELECT * FROM `{PROJECT_ID}.spBondIndex.{table}` order by date desc limit 1'''
        df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        df['ytw'] = df['ytw'] * 100    # convert to basis points
        df = df.drop_duplicates('date')    # defensive only: a no-op while the query is limited to one row
        df = df.set_index('date', drop=True)    # `drop=True` removes the column that is to be used as the index
        # keep only the YTW column so that the positional renaming below cannot be thrown off by extra columns
        index_data.append(df[['ytw']])

    df = pd.concat(index_data, axis=1)
    df.columns = sp_maturity_tables    # positional: entry i of `sp_index_tables` corresponds to entry i of `sp_maturity_tables`
    # make the chronological order explicit so that the forward fill always carries older values to newer dates
    df = df.sort_index().ffill(axis=0)
    return df

In [5]:
def load_maturity_data() -> pd.DataFrame:
    '''Load the most recent weighted average maturity of every S&P index into a single dataframe.

    For each table in `sp_maturity_tables` only the row with the latest `effectivedate` is queried. The 
    per-table results are placed side by side, so dates on which a table has no row show up as NaN and 
    are forward filled from the previous date.

    Returns:
        Dataframe indexed by `effectivedate` with one `weightedAverageMaturity` column per index, 
        named after the entries of `sp_maturity_tables`.
    '''
    maturity_data = []
    for table in sp_maturity_tables:
        # takes the most recent `effectivedate` which refers to the date on which we grabbed the value from S&P
        query = f'SELECT * FROM `{PROJECT_ID}.spBondIndexMaturities.{table}` order by effectivedate desc limit 1;'
        df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
        df['effectivedate'] = pd.to_datetime(df['effectivedate'], format='%Y-%m-%d')
        df = df.drop_duplicates('effectivedate')    # defensive only: a no-op while the query is limited to one row
        df = df.set_index('effectivedate', drop=True)    # `drop=True` removes the column that is to be used as the index
        maturity_data.append(df[['weightedAverageMaturity']])

    df = pd.concat(maturity_data, axis=1)
    df.columns = sp_maturity_tables
    # NaN values appear when the latest `effectivedate` differs between tables; sort explicitly so that the
    # forward fill always carries older values to newer dates
    df = df.sort_index().ffill(axis=0)
    return df

In [6]:
def get_maturity_dict(maturity_df: pd.DataFrame, date: str) -> dict:
    '''Return a mapping from index name to weighted average maturity for the row of `maturity_df` labelled `date`.'''
    maturities_on_date = maturity_df.loc[date].T
    return {index_name: maturity
            for index_name, maturity in zip(maturities_on_date.index, maturities_on_date.values)}

In [7]:
def get_yield_curve_maturity_df(index_data: pd.DataFrame, date: str, maturity_dict: dict) -> pd.DataFrame:
    '''Build a dataframe with one row per index, holding its yield to worst on `date` (column `ytw`) 
    and the maturity looked up from `maturity_dict` by index name (column `Weighted_Maturity`).'''
    ytw_frame = pd.DataFrame(index_data.loc[date])    # the row for `date` becomes a single column, indexed by index name
    ytw_frame.columns = ['ytw']
    return ytw_frame.assign(Weighted_Maturity=ytw_frame.index.map(maturity_dict))

In [8]:
def decay_transformation(t: np.array, L: float):
    '''Evaluate the exponential decay term `L * (1 - exp(-t / L)) / t` for maturities `t` and shape parameter `L`. 
    The computation is elementwise; `t == 0` is not special-cased, so the division by `t` is undefined there.'''
    discount = np.exp(-t / L)
    return L * (1 - discount) / t

In [9]:
def laguerre_transformation(t: np.array, L: float):
    '''Evaluate the laguerre term `L * (1 - exp(-t / L)) / t - exp(-t / L)` for maturities `t` and shape parameter `L`. 
    The computation is elementwise; `t == 0` is not special-cased, so the division by `t` is undefined there.'''
    discount = np.exp(-t / L)
    return (L * (1 - discount) / t) - discount

In [10]:
def get_model_inputs(yield_curve_maturity_df: pd.DataFrame, L: int):
    '''Derive the regression inputs from a dataframe with `ytw` and `Weighted_Maturity` columns.

    Returns a pair `(X, y)`: `X` holds the decay (`X1`) and laguerre (`X2`) transforms of the weighted 
    maturities for shape parameter `L`, and `y` is the `ytw` column. The input dataframe is not modified.'''
    maturities = yield_curve_maturity_df['Weighted_Maturity']
    with_features = yield_curve_maturity_df.assign(X1=decay_transformation(maturities, L),
                                                   X2=laguerre_transformation(maturities, L))
    return with_features[['X1', 'X2']], with_features['ytw']

In [11]:
def train_model(X: np.array, Y: float):
    '''Standardize the features `X` and fit a ridge regression of `Y` on them to estimate the Nelson-Siegel coefficients. 
    Returns the fitted scaler (which stores the mean and variance of each feature as attributes) together with the fitted model.'''
    scaler = StandardScaler().fit(X)    # learns the per-feature mean and standard deviation
    standardized_X = scaler.transform(X)    # mean 0 and standard deviation 1 for each feature
    model = Ridge(alpha=0.001, random_state=1)
    model.fit(standardized_X, Y)
    return scaler, model

In [12]:
def getSchema_model():
    '''Return the BigQuery schema of the table storing the nelson siegel coefficients: 
    a required DATE column `date` followed by one required FLOAT column per coefficient.'''
    coefficient_columns = ('const', 'exponential', 'laguerre')
    return [bigquery.SchemaField('date', 'DATE', 'REQUIRED'),
            *(bigquery.SchemaField(column, 'FLOAT', 'REQUIRED') for column in coefficient_columns)]


def getSchema_scaler():
    '''Return the BigQuery schema of the table storing the sklearn StandardScaler's parameters: 
    a required DATE column `date` followed by required FLOAT columns for the mean and std of each feature.'''
    parameter_columns = [f'{feature}_{statistic}'
                         for feature in ('exponential', 'laguerre')
                         for statistic in ('mean', 'std')]
    return [bigquery.SchemaField('date', 'DATE', 'REQUIRED'),
            *(bigquery.SchemaField(column, 'FLOAT', 'REQUIRED') for column in parameter_columns)]

In [13]:
def upload_data(df: pd.DataFrame, table_id: str):
    '''Append the coefficient or scaler dataframe `df` to the BigQuery table `table_id`.

    `table_id` is the path of the bigquery table to upload to and must be either `TABLE_ID_MODEL` or 
    `TABLE_ID_SCALER`, since the schema of the load job is chosen from it.

    Raises:
        ValueError: if `table_id` is not one of the two supported tables.
        Exception: whatever the load job raises is re-raised after a failure message is printed.
    '''
    schema_getters = {TABLE_ID_MODEL: getSchema_model, TABLE_ID_SCALER: getSchema_scaler}
    # validate before creating a client so that a bad `table_id` fails fast with an explanatory message
    if table_id not in schema_getters:
        raise ValueError(f'Unsupported table_id {table_id!r}; expected one of {sorted(schema_getters)}')
    schema = schema_getters[table_id]()

    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(schema=schema, write_disposition='WRITE_APPEND')
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    try:
        job.result()    # blocks until the load job has finished
    except Exception:
        print(f'Failed to Upload to {table_id}')
        raise    # bare `raise` keeps the original traceback
    print(f'Upload Successful to {table_id}')

In [14]:
def load_shape_parameter() -> float:
    '''Fetch the Nelson Siegel shape parameter `L` from the row of the `shape_parameters` table with the latest `date`.'''
    shape_table = f'{PROJECT_ID}.{DATASET_NAME}.shape_parameters'
    # only the row with the most recent `date` is requested
    latest_shape_query = f'''SELECT L FROM `{shape_table}` ORDER BY date DESC LIMIT 1'''
    latest_shape = pd.read_gbq(latest_shape_query, project_id=PROJECT_ID, dialect='standard')
    first_row = latest_shape.loc[0]
    return first_row.values[0]

In [27]:
def main(args):
    '''Fit the Nelson-Siegel coefficients for `TARGET_DATE` and store them in BigQuery and Redis.

    Loads the index YTW values, the weighted average maturities and the shape parameter, fits the 
    regression for `TARGET_DATE`, appends the model coefficients and the scaler parameters to 
    `TABLE_ID_MODEL` and `TABLE_ID_SCALER`, and finally pickles both (plus the shape parameter) into 
    Redis under the business day following `TARGET_DATE`.

    Args:
        args: unused; kept so that the signature stays unchanged for existing callers.

    Returns:
        The string 'SUCCESS'.

    Raises:
        ValueError: if the loaded maturity data has no row for `TARGET_DATE`. This is checked before 
            anything is written, so nothing is uploaded in that case.
    '''
    maturity_data = load_maturity_data()
    index_data = load_index_data()
    L = load_shape_parameter()

    target_date = TARGET_DATE
    available_dates = set(maturity_data.index.astype(str))
    if target_date not in available_dates:
        raise ValueError(f'No S&P maturity data available for {target_date}; available dates: {sorted(available_dates)}')

    print(f'Calculating the coefficients for {target_date}')
    maturity_dict = get_maturity_dict(maturity_data, target_date)
    yield_curve_maturity_df = get_yield_curve_maturity_df(index_data, target_date, maturity_dict)

    # creating the inputs for the model
    X, Y = get_model_inputs(yield_curve_maturity_df, L)
    scaler, model = train_model(X, Y)

    # convert date to a `datetime.date` object, matching the DATE columns of the BigQuery schemas
    effective_date = pd.to_datetime(target_date).date()

    # model parameters: intercept and one coefficient per feature (X1 = exponential, X2 = laguerre)
    coefficient_df = pd.DataFrame({'date': effective_date, 
                                   'const': model.intercept_, 
                                   'exponential': model.coef_[0], 
                                   'laguerre': model.coef_[1]}, index=[0])

    # scaler parameters, used to standardize the data
    scaler_df = pd.DataFrame({'date': effective_date, 
                              'exponential_mean': scaler.mean_[0], 
                              'exponential_std': np.sqrt(scaler.var_[0]), 
                              'laguerre_mean': scaler.mean_[1], 
                              'laguerre_std': np.sqrt(scaler.var_[1])}, index=[0])

    print(f'Uploading Data to {TABLE_ID_MODEL} and {TABLE_ID_SCALER}')
    upload_data(coefficient_df, TABLE_ID_MODEL)
    upload_data(scaler_df, TABLE_ID_SCALER)

    print('Uploading data to redis')
    # S&P publishes its indices at the end of the business day, so if a trade occurs on day 1, it should use the 
    # S&P indices published on day 0 and not day 1, therefore we add a business day to the S&P effective date so 
    # that we use the correct indices
    redis_key = (pd.Timestamp(effective_date) + (BUSINESS_DAY * 1)).strftime('%Y-%m-%d')

    payload = {'nelson_values': coefficient_df.set_index('date'), 
               'scalar_values': scaler_df.set_index('date'), 
               'shape_parameter': L}
    redis_client = redis.Redis(host='10.227.69.60', port=6379, db=0)    # this `redis_client` is used in `ficc/app_engine/demo/server/modules/ficc/utils/yc_data.py` and possibly elsewhere
    # NOTE: the value is a pickle, so readers of this key have to unpickle it and must trust this Redis instance
    value = pickle.dumps(payload, protocol=pickle.HIGHEST_PROTOCOL)
    redis_client.set(redis_key, value)

    return 'SUCCESS'

In [28]:
# Run the whole pipeline once. `main` never reads its argument, so 'test' is only a placeholder value.
# Note that this call writes to BigQuery and Redis.
main('test')

Calculating the coefficients for 2023-06-16
Uploading Data
Upload Successful
Upload Successful
Uploading data to redis


'SUCCESS'