This notebook corresponds to the cloud function: `train_daily_etf_model`.

In [1]:
import os
# Use the local credentials file only as a fallback: when GOOGLE_APPLICATION_CREDENTIALS is already
# set in the environment, that value is kept instead of being overwritten by a hard-coded file name.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'ahmad_creds.json')

In [None]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import Lasso
from google.cloud import bigquery

In [None]:
# ---- BigQuery locations ----
PROJECT_ID = 'eng-reactor-287421'
ETF_DAILY_DATASET = 'ETF_daily_alphavantage'    # dataset queried for the daily ETF prices (one table per ticker)
SP_INDEX_DATASET = 'spBondIndex'                # dataset queried for the S&P index yields (one table per index)
DATASET_NAME = 'yield_curves_v2'                # dataset that receives the fitted model coefficients

# ---- Model hyperparameters ----
# number of days of prior data to train the etf model on; previously tuned hyperparameter
TRAIN_WINDOW_SIZE = 45

# ---- S&P index tables ----
# Maps the name of each yield table to the name of the matching maturity table. The maturity name is the
# standardized key used by `best_funds`, `best_lambdas` and the loaded index data.
sp_index_table_to_sp_maturity_table = {
    'sp_12_22_year_national_amt_free_index':
        'sp_12_22_year_national_amt_free_index',
    'sp_15plus_year_national_amt_free_index':
        'sp_15plus_year_national_amt_free_index',
    'sp_7_12_year_national_amt_free_municipal_bond_index_yield':
        'sp_7_12_year_national_amt_free_index',
    'sp_muni_high_quality_index_yield':
        'sp_high_quality_index',
    'sp_high_quality_intermediate_managed_amt_free_municipal_bond_index_yield':
        'sp_high_quality_intermediate_managed_amt_free_index',
    'sp_high_quality_short_intermediate_municipal_bond_index_yield':
        'sp_high_quality_short_intermediate_index',
    'sp_high_quality_short_municipal_bond_index_yield':
        'sp_high_quality_short_index',
    'sp_long_term_national_amt_free_municipal_bond_index_yield':
        'sp_long_term_national_amt_free_municipal_bond_index_yield',
}
sp_index_tables = list(sp_index_table_to_sp_maturity_table)                 # yield table names, in order
sp_maturity_tables = list(sp_index_table_to_sp_maturity_table.values())     # maturity table names, same order

# ---- ETFs used to model each index ----
# How these ETFs were chosen is described in subsection 1a of section "Why use ETFs?" in
# https://www.notion.so/FICC-Yield-Curve-0e9d3fb1a49a4789826083361257a962
best_funds = {
    'sp_12_22_year_national_amt_free_index': ['FMHI', 'MUB'],
    'sp_15plus_year_national_amt_free_index': ['FMHI', 'MLN', 'MUB', 'TFI', 'SUB', 'SHYD', 'HYMB', 'HYD'],
    'sp_7_12_year_national_amt_free_index': ['TFI', 'PZA', 'ITM', 'MLN'],
    'sp_high_quality_index': ['PZA', 'TFI', 'ITM'],
    'sp_high_quality_intermediate_managed_amt_free_index': ['TFI', 'PZA', 'ITM', 'MLN'],
    'sp_high_quality_short_intermediate_index': ['PZA', 'TFI', 'ITM'],
    'sp_high_quality_short_index': ['PZA', 'HYMB', 'HYD', 'MLN', 'ITM', 'TFI', 'SHYD', 'SHM'],
    'sp_long_term_national_amt_free_municipal_bond_index_yield': ['FMHI', 'MLN', 'MUB', 'SUB'],
}

# LASSO regularization strength (passed as `alpha`) for each index
best_lambdas = {
    'sp_12_22_year_national_amt_free_index': 5.0,
    'sp_15plus_year_national_amt_free_index': 5.0,
    'sp_7_12_year_national_amt_free_index': 1.0,
    'sp_high_quality_index': 1.0,
    'sp_high_quality_intermediate_managed_amt_free_index': 1.0,
    'sp_high_quality_short_intermediate_index': 1.0,
    'sp_high_quality_short_index': 1.0,
    'sp_long_term_national_amt_free_municipal_bond_index_yield': 5.0,
}

# every ETF ticker whose daily prices are loaded
ETFS = [
    'FMHI', 'HYD', 'HYMB', 'IBMK', 'IBML',
    'ITM', 'MLN', 'MMIN', 'MUB', 'PZA',
    'SHM', 'SHYD', 'SMB', 'SUB', 'TFI',
]

In [None]:
def load_etf_data():
    '''Loads the daily etf prices from BigQuery.

    Every ticker in `ETFS` is read from its own table in `ETF_DAILY_DATASET` into a dataframe that is
    sorted by, and indexed on, `Date`, with exactly one row kept per date.

    Returns a dictionary mapping each ETF ticker to its dataframe.'''
    etf_data = {}
    for etf in ETFS:
        query = f'''SELECT DISTINCT * FROM {ETF_DAILY_DATASET}.{etf} '''
        df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')

        df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

        # De-duplicate on the date key while it is still a column. `drop_duplicates()` without a subset
        # ignores the index, so calling it after `set_index` would compare the values only and drop every
        # day whose values repeat those of an earlier day. The first row returned for a date is kept.
        df = df.drop_duplicates('Date')
        df = df.sort_values('Date')
        df = df.set_index('Date', drop=True)    # `drop=True` removes the column that is to be used as the index
        etf_data[etf] = df
    return etf_data


def load_index_yields():
    '''Reads the yield history of every S&P index in `sp_index_tables` from BigQuery.

    Each table is reduced to one row per date, indexed and sorted ascending by `date`, and its `ytw`
    column is multiplied by 100. The result is a dictionary of dataframes keyed by the maturity table
    name that corresponds to each yield table.'''

    def read_yield_table(table_name):
        sql = f'''SELECT DISTINCT * FROM {SP_INDEX_DATASET}.{table_name} '''
        yields = pd.read_gbq(sql, project_id=PROJECT_ID, dialect='standard').drop_duplicates('date')
        yields['date'] = pd.to_datetime(yields['date'], format='%Y-%m-%d')
        # the `date` column becomes the index and is removed from the columns
        yields = yields.sort_values('date', ascending=True).set_index('date', drop=True)
        yields['ytw'] = yields['ytw'] * 100    # convert to basis points
        return yields

    yields_by_maturity_table = {}
    for yield_table in sp_index_tables:
        frame = read_yield_table(yield_table)
        # standardize names between maturity and yield data
        maturity_table = sp_index_table_to_sp_maturity_table[yield_table]
        yields_by_maturity_table[maturity_table] = frame
    return yields_by_maturity_table


def preprocess_data(index_data: dict, etf_data: dict, index_name: str, etf_names: list, date_start='2020-05', var='Close'):
    '''Builds the modelling dataframe for a single S&P index.

    Takes as input the loaded S&P index data and ETF data from bigquery, each stored as a dictionary of
    date-indexed dataframes, the name of a single S&P index and the list of ETFs that are relevant to
    predicting that index. For every ETF the percent change, `pct_change`, of the `{var}_{etf_name}` column
    is computed in basis points; for the index the first difference of `ytw` is stored as `ytw_diff`.
    Both are joined on their date index, restricted to observations from `date_start` onwards (by default
    May 2020, using the Close prices) and rows with missing values are dropped.

    Returns the merged dataframe: one column per ETF plus the columns of the index dataframe and `ytw_diff`.
    The input dictionaries and their dataframes are not modified.'''
    etf_changes = []

    # preprocess etf data by retrieving ETFs of interest and calculating pct_change in basis points
    for etf_name in etf_names:
        etf = etf_data[etf_name]
        # Keep one row per date. `DataFrame.drop_duplicates()` is not used here because it ignores the
        # index: it would drop any day whose values equal those of an earlier day, which loses that
        # observation and makes the next `pct_change` span more than one step.
        etf = etf[~etf.index.duplicated(keep='first')]
        etf_changes.append(etf[f'{var}_{etf_name}'].pct_change() / 0.0001)
    etf = pd.concat(etf_changes, axis=1)

    # preprocess index data by first-differencing ytw (on a copy, so the caller's dataframe is left untouched)
    index = index_data[index_name].copy()
    index['ytw_diff'] = index['ytw'].diff()

    # merge etf and index data on the shared date index
    merged = pd.merge(etf, index, left_index=True, right_index=True).loc[date_start:]
    return merged.dropna()


def get_schema_etf(coefficient_df: pd.DataFrame) -> list:
    '''Gets the bq schema to upload the data to the bq table containing the coefficients
    for the linear model using ETF prices to predict index yield, for each index.

    The schema is a list of `bigquery.SchemaField`: a `Date` field of type DATE followed by one FLOAT
    field per column of `coefficient_df`. A `Date` column already present in `coefficient_df` is not
    repeated as a FLOAT field, so the function can be called before or after the date column is added.'''
    date_field = bigquery.SchemaField('Date', 'DATE')
    coefficient_fields = [bigquery.SchemaField(column, 'FLOAT')
                          for column in coefficient_df.columns
                          if column != 'Date']    # avoid declaring `Date` twice (once as DATE, once as FLOAT)
    return [date_field] + coefficient_fields


def upload_data(df, table_id, schema):
    '''Appends the rows of `df` to the BigQuery table `table_id`.

    The load job is configured with `schema` and the `WRITE_APPEND` disposition, and the function waits
    for the job to finish. A message reporting success or failure is printed; on failure the original
    exception is re-raised after the message.'''
    client = bigquery.Client(project=PROJECT_ID, location='US')
    job_config = bigquery.LoadJobConfig(schema=schema, write_disposition='WRITE_APPEND')
    try:
        # Submitting the job is inside the `try` as well, so that an error raised before a job object is
        # returned is reported in the same way as a failed job.
        job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
        job.result()    # wait for the load job to complete
    except Exception:
        print(f'Failed to Upload to {table_id}')
        raise    # bare `raise` keeps the original traceback
    print(f'Upload Successful to {table_id}')

In [None]:
def main(args):
    '''First, load the index and etf data. Then, for each S&P index, train a LASSO model using 
    the previously identified optimal subset of ETFs and previously identified optimal alpha 
    value to predict yields. Training data size is equal to the window size.'''
    index_data = load_index_yields()
    etf_data = load_etf_data()
    
    for index, best_funds_for_index in best_funds.items(): 
        coefficients_dict = {}    # used to save the coefficients 

        # load data and hyperparameters 
        best_lambda = best_lambdas[index]
        data = preprocess_data(index_data, etf_data, index, best_funds_for_index)

        # get X and Y data
        X = data.drop(['ytw', 'ytw_diff'], axis=1)
        y = data['ytw_diff']
        X_cols = list(X.columns)

        # training data size is the window size 
        X_train = X.iloc[-TRAIN_WINDOW_SIZE:, :]
        y_train = y.iloc[-TRAIN_WINDOW_SIZE:]
        assert len(X_train) == len(y_train)
        
        # get the date to index the model and train the model 
        date = X_train.index.max().date().isoformat()
        lasso = Lasso(alpha=best_lambda, random_state=1, max_iter=5000).fit(X_train, y_train)    # inputs to the Lasso model: ETF prices, output: predicted S&P index value

        # save the coefficients to one row dataframe
        columns = ['constant'] + X_cols
        coefficients = np.hstack([lasso.intercept_, lasso.coef_])
        coefficients_dict[date] = dict(zip(columns, coefficients))
        coefficient_df = pd.DataFrame(coefficients_dict).T
        coefficient_df.index = pd.to_datetime(coefficient_df.index)
        coefficient_df = coefficient_df.reset_index(drop=False).rename({'index': 'Date'}, axis=1)
        
        table_id = PROJECT_ID + '.' + DATASET_NAME + '.' + index
        schema = get_schema_etf(coefficient_df)
        upload_data(coefficient_df, table_id, schema)
        
    return 'SUCCESS'

In [7]:
# Run the full pipeline once. `main` never reads its argument, so 'test' is only a placeholder value.
main('test')

Upload Successful
Upload Successful
Upload Successful
Upload Successful
Upload Successful
Upload Successful
Upload Successful
Upload Successful
