# Fill in missing values for real-time yield curve
Date: 2024-12-05  
Last Edit Date: 2025-01-13  


This notebook was tested on Python 3.10 but should run on any newer version. To run this notebook, use the `requirements_jupyter.txt` found in the `train-minute-yield-curve` cloud function directory. Then, perform the following updates to the notebook:
1. Change `TARGET_DATE` to be the date with missing values for the real-time yield curve that will be filled in with this notebook
2. Put the location of the GCP credentials in `os.environ['GOOGLE_APPLICATION_CREDENTIALS']`
3. Set `UPLOAD_TO_BIGQUERY_AND_REDIS` to `True` to upload the results to BigQuery and Redis; by default, it is set to `False` so that nothing is written, which is desirable during testing
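4. (Optional) if testing frequently, consider setting the `save_data` optional argument in `load_pickled_query_results_if_exists(...)` to `True` to cache the results of the API call (the existing call site does not pass `save_data`, so either pass `save_data=True` there or change the default value)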

In [1]:
import os
import time
import requests
import pickle
from datetime import datetime, timedelta

import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import auxiliary_variables
auxiliary_variables.TESTING = True    # this allows print statements to be used throughout the files; since auxiliary_variables.TESTING has already been updated, subsequent imports will reflect this change, since Python modules are cached after the first import, so the change remains effective

from auxiliary_variables import PROJECT_ID, DATASET_NAME, ALPHA, DAILY_ETF_WEIGHTS_TABLES, ETFs
from auxiliary_functions import previous_business_day
from bigquery_utils import load_daily_etf_prices_bq, load_maturity_bq, load_index_yields_bq, get_scalar_df, load_shape_parameter
from yieldcurve import get_maturity_dict, get_NL_inputs, scale_X, run_NL_ridge, get_coefficient_df, load_scaler_daily_bq, get_scaler_params
from main import YIELD_CURVE_REDIS_CLIENT, get_schema_minute_yield, load_etf_models_bq, get_prediction_for_sp_maturity_table, upload_data_to_bigquery, upload_data_to_redis

In [2]:
# Safety switch: the main loop uploads to BigQuery and Redis only when this is `True`; keep it `False` while testing.
UPLOAD_TO_BIGQUERY_AND_REDIS = False

In [3]:
# Date (YYYY-MM-DD) with missing values for the real-time yield curve.
TARGET_DATE = '2025-01-13'
# Year and month of `TARGET_DATE` (YYYY-MM); used as the `month` parameter of the Alpha Vantage query.
TARGET_YEAR_MONTH = f'{pd.to_datetime(TARGET_DATE):%Y-%m}'

In [4]:
# `strftime` / `strptime` format strings shared by the cells below.
YEAR_MONTH_DAY = '%Y-%m-%d'
HOUR_MIN = '%H:%M'
YEAR_MONTH_DAY_HOUR_MIN = YEAR_MONTH_DAY + ':' + HOUR_MIN    # date and time joined by a colon
YEAR_MONTH_DAY_HOUR_MIN_SEC = YEAR_MONTH_DAY + ' %H:%M:%S'    # date and time joined by a space

In [5]:
# NOTE(review): API keys should not be committed with the notebook. The key is now read from the
# `ALPHA_VANTAGE_KEY` environment variable when it is set; the previously hard-coded key is kept
# only as a fallback so existing runs keep working. Consider rotating it and removing the fallback.
ALPHA_VANTAGE_KEY = os.environ.get('ALPHA_VANTAGE_KEY', 'EZR0IHAAL6MFWX4B')    # TODO: where did we get this from?

In [6]:
def load_pickled_query_results_if_exists(file_name: str, url: str, api_call_function: callable, save_data: bool = False) -> pd.DataFrame:
    '''Return the result of `api_call_function(url)`, reusing a pickled copy under `files/` when one exists for the same URL.

    `file_name` is the name of the pickle file inside the `files/` directory (relative to the working directory).
    `url` is passed to `api_call_function` and is stored next to the result so that a stale cache can be detected.
    `api_call_function` is called with `url` when there is no usable cached result.
    `save_data` determines whether a freshly fetched result is written to the pickle file; the `files/` directory 
    is created if it does not exist yet.
    '''
    file_name = f'files/{file_name}'
    if os.path.exists(file_name):
        print(f'Found {file_name} so will try to load the pickle file')
        # NOTE: unpickling can execute arbitrary code, so only load pickle files that were written by this notebook
        with open(file_name, 'rb') as f:
            saved_url, df = pickle.load(f)

        if saved_url == url:
            print('Saved URL matched the desired URL so will load the dataframe')
            return df
        print(f'Saved URL: {saved_url} does not match desired URL: {url}, so will make API call')

    df = api_call_function(url)
    if save_data:
        print(f'Saving the URL and dataframe in the pickle file: {file_name}')
        os.makedirs(os.path.dirname(file_name), exist_ok=True)    # opening the file for writing fails if the directory is missing
        with open(file_name, 'wb') as f:
            pickle.dump((url, df), f)
    return df

In [7]:
def get_latest_etf_minute_prices_from_alpha_vantage(etf: str, year_month: str, wait: bool = False) -> pd.DataFrame:
    '''Return the 1-minute intraday bars for `etf` in `year_month` from the Alpha Vantage `TIME_SERIES_INTRADAY` endpoint.

    `year_month` is the month to request and is placed in the `month` query parameter (e.g., `2025-01`).
    `wait` is a boolean that determines whether we wait to avoid hitting the upper limit of the API calls.

    The returned dataframe is indexed by timestamp (index name `Date`), holds float values, and every column 
    name is suffixed with `_<etf>` (e.g., `Close_<etf>`). A cached result is used when 
    `load_pickled_query_results_if_exists(...)` finds one for the same URL, in which case no API call (and no wait) happens.

    Raises `requests.HTTPError` for an unsuccessful HTTP status, and `KeyError` when the response does not contain 
    the time series (the message includes the response so that the reason is visible).
    '''
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={etf}&interval=1min&extended_hours=false&adjusted=false&month={year_month}&outputsize=full&apikey={ALPHA_VANTAGE_KEY}'
    print(f'Getting data for {etf} on {year_month} using: {url}')

    def api_call_function(url: str) -> pd.DataFrame:
        num_seconds_to_wait_between_api_calls = 15
        num_seconds_before_request_times_out = 60    # without a timeout, `requests.get` can block indefinitely
        if wait:
            print(f'... waiting {num_seconds_to_wait_between_api_calls} seconds to not hit the upper limit of API calls ...')
            time.sleep(num_seconds_to_wait_between_api_calls)
        r = requests.get(url, timeout=num_seconds_before_request_times_out)
        r.raise_for_status()    # fail on HTTP errors instead of trying to parse an error page
        data = r.json()

        time_series_key = 'Time Series (1min)'    # 'Time Series (1min)' is the name of the time series data
        if time_series_key not in data:
            # presumably an error or rate-limit message from the API; surface it rather than a bare KeyError
            raise KeyError(f'{time_series_key!r} is missing from the Alpha Vantage response for {etf} on {year_month}; response: {data}')
        df = pd.DataFrame(data[time_series_key]).T    # transpose so that each row is one timestamp
        df = df.astype(float)    # the values arrive as strings

        # rename columns to match the column names that are used downstream
        df.index.rename('Date', inplace=True)
        df = df.rename({'1. open': 'Open',
                        '2. high': 'High',
                        '3. low': 'Low',
                        '4. close': 'Close',
                        '5. volume': 'Volume'},
                        axis=1)
        df.columns = df.columns + '_' + etf
        df.index = pd.to_datetime(df.index)
        return df

    pickle_file_name = f'{etf}_{year_month}.pkl'
    return load_pickled_query_results_if_exists(pickle_file_name, url, api_call_function)

In [8]:
def get_latest_etf_minute_prices_for_all_etfs(year_month: str) -> dict:
    '''Return a dictionary mapping each ETF in `ETFs` to its minute price dataframe for `year_month`.'''
    etf_to_minute_prices = {}
    for position, etf in enumerate(ETFs):
        wait_before_call = position > 0    # do not wait on the first call because no other calls have been made yet
        etf_to_minute_prices[etf] = get_latest_etf_minute_prices_from_alpha_vantage(etf, year_month, wait=wait_before_call)
    return etf_to_minute_prices


def get_close_prices_for_date(date: str, etf_to_dataframe: dict, open_time: str = '09:30', close_time: str = '15:59') -> pd.DataFrame:
    '''Return the minute-by-minute close prices on `date` with one column per ETF and one row per minute 
    from `open_time` through `close_time` (both inclusive). The default values for `open_time` and `close_time` 
    are the market open and market close times respectively. Minutes without a quote take the previous 
    minute's value.'''
    # keep only the close price columns of each ETF and place them side by side
    close_columns_for_each_etf = [prices.filter(regex='Close', axis=1) for prices in etf_to_dataframe.values()]
    close_prices = pd.concat(close_columns_for_each_etf, axis=1)
    close_prices.columns = close_prices.columns.str.replace('Close_', '', regex=False)    # leaves just the ETF name as the column name

    def to_datetime_on_date(hour_min: str) -> datetime:
        return datetime.strptime(f'{date} {hour_min}:00', YEAR_MONTH_DAY_HOUR_MIN_SEC)

    every_minute_of_session = pd.date_range(start=to_datetime_on_date(open_time), end=to_datetime_on_date(close_time), freq='min')

    close_prices_on_date = close_prices.loc[date]
    close_prices_for_every_minute = close_prices_on_date.reindex(every_minute_of_session)
    return close_prices_for_every_minute.ffill()    # forward fill because Alpha Vantage prices have some gaps


def is_valid_date_format(date_string: str) -> bool:
    '''
    Return whether `date_string` is a real calendar date written exactly in the zero-padded YYYY-MM-DD format.

    >>> is_valid_date_format('2025-01-13')
    True
    >>> is_valid_date_format('13-01-2025')
    False
    >>> is_valid_date_format('2025/01/13')
    False
    >>> is_valid_date_format('2025-1-3')
    False
    '''
    try:
        # Try to parse the string using the YYYY-MM-DD format
        parsed_date = datetime.strptime(date_string, YEAR_MONTH_DAY)
    except ValueError:
        # If parsing fails, the format is incorrect
        return False
    # `strptime` treats the leading zeros of the month and day as optional, so '2025-1-3' parses successfully; 
    # formatting the parsed date back and comparing rejects such strings, which would otherwise produce 
    # keys that do not match the zero-padded ones
    return parsed_date.strftime(YEAR_MONTH_DAY) == date_string

In [None]:
# Fetch the minute prices of every ETF for the month of `TARGET_DATE`, then keep only the close prices on `TARGET_DATE`.
etf_to_dataframe = get_latest_etf_minute_prices_for_all_etfs(TARGET_YEAR_MONTH)
combined_df_with_etf_quotes = get_close_prices_for_date(TARGET_DATE, etf_to_dataframe)

In [None]:
# Display the close prices (one row per minute, one column per ETF) for a visual sanity check.
combined_df_with_etf_quotes

The below cell requires the `matplotlib` package.

In [11]:
def plot_change_in_etf_throughout_day(combined_df_with_etf_quotes: pd.DataFrame) -> None:
    '''Show one small line chart per column of `combined_df_with_etf_quotes`, titled with the column name.'''
    for column_position, column_name in enumerate(combined_df_with_etf_quotes.columns):
        plt.figure(figsize=(4, 3))    # a new figure for each column so that the charts do not overlap
        values_for_column = combined_df_with_etf_quotes.iloc[:, column_position]
        values_for_column.plot()
        plt.grid(True)
        plt.ylabel('Value')
        plt.xlabel('time')
        plt.title(column_name)
        plt.show()

In [None]:
# Plot each ETF's close price over the day to spot gaps or suspicious values before fitting.
plot_change_in_etf_throughout_day(combined_df_with_etf_quotes)

Set credentials below to access BigQuery.

In [13]:
# Location of the GCP credentials file; change this path to where the credentials are stored on your machine.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/user/ficc/mitas_creds.json'

Load BigQuery data. Taken almost directly from `cloud_functions/train-minute-yield-curve/main.py::main(...)`.

In [14]:
# Load the inputs for the yield curve fit using the project's `*_bq` loader functions imported above.
etf_data = load_daily_etf_prices_bq()    # indexed below as etf_data[fund][f'Close_{fund}']
maturity_df = load_maturity_bq()
scaler_daily_parameters = load_scaler_daily_bq()
index_data = load_index_yields_bq()    # indexed below by the daily ETF weights table name
etf_model_data = load_etf_models_bq()    # indexed below by the daily ETF weights table name

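Get the scaler parameters, scalar values, and maturity data as of the business day before `TARGET_DATE`. Taken almost directly from `cloud_functions/train-minute-yield-curve/main.py::main(...)`. The yield curve is refit for every minute of `TARGET_DATE`, but only the timestamps whose values are missing in Redis are uploaded.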

In [15]:
def get_timestamps_with_missing_yield_curve_values(date: str) -> list[str]:
    '''Return the Redis keys of the market minutes on `date` for which `YIELD_CURVE_REDIS_CLIENT` has no value.

    Every minute from 09:30 through 15:59 (both inclusive) is checked. The returned list is in chronological 
    order and will have strings of the form: YYYY-MM-DD:HH:MM.

    Raises `ValueError` if `date` is not in the YYYY-MM-DD format.'''
    if not is_valid_date_format(date):    # an explicit raise is used because `assert` statements are skipped when Python runs with -O
        raise ValueError(f'{date} is not a valid YYYY-MM-DD format')

    market_start_time = datetime.strptime('09:30', HOUR_MIN)
    market_end_time = datetime.strptime('15:59', HOUR_MIN)
    num_minutes = int((market_end_time - market_start_time) / timedelta(minutes=1)) + 1    # +1 because the market end time is included

    # put `date` before the timestamp to be a valid redis key
    timestamps = [f'{date}:{(market_start_time + timedelta(minutes=minute_offset)).strftime(HOUR_MIN)}' for minute_offset in range(num_minutes)]
    yield_curve_values = YIELD_CURVE_REDIS_CLIENT.mget(timestamps)    # one entry per key, in the same order as `timestamps`
    return [timestamp for (timestamp, yield_curve_value) in zip(timestamps, yield_curve_values) if yield_curve_value is None]

In [None]:
# Find the minutes of `TARGET_DATE` that have no yield curve value in Redis.
timestamps_with_missing_yield_curve_values = get_timestamps_with_missing_yield_curve_values(TARGET_DATE)
print(timestamps_with_missing_yield_curve_values)    # printed while still a list so that the output is in chronological order
timestamps_with_missing_yield_curve_values = set(timestamps_with_missing_yield_curve_values)    # a set gives fast membership checks in the loop below

In [17]:
# Everything in this cell is looked up as of the business day before `TARGET_DATE`.
day_before_target_date = previous_business_day(TARGET_DATE, return_as_string=True)
exponential_mean, exponential_std, laguerre_mean, laguerre_std = get_scaler_params(day_before_target_date, scaler_daily_parameters)    # passed to `scale_X(...)` in the loop below
maturity_dict = get_maturity_dict(maturity_df, day_before_target_date)    # mapped onto the index of the per-minute yield dataframe in the loop below

# Close prices of each ETF (one column per ETF, in the order of `ETFs`) used as the baseline for the intraday change.
# NOTE(review): the slice is open-ended, so it keeps every row from `day_before_target_date` onward rather than 
# only that day; confirm that the daily table has no later rows or that downstream code expects them.
prev_close_data = [etf_data[fund][f'Close_{fund}'].loc[day_before_target_date:] for fund in ETFs]
prev_close_data = pd.concat(prev_close_data, axis=1)

yield_curve_coefficients_for_each_timestamp = []    # filled with one coefficient dataframe per minute by the loop below
tau = load_shape_parameter(day_before_target_date)
scalar_df = get_scalar_df(day_before_target_date)

In [None]:
# For every minute of `TARGET_DATE`: compute the ETF price change relative to the previous close, predict the yield 
# for each daily ETF weights table, fit the ridge model on those yields, and keep the resulting coefficients. 
# The coefficients are uploaded only for the minutes that are missing in Redis and only if `UPLOAD_TO_BIGQUERY_AND_REDIS` is `True`.
for timestamp_to_the_minute, quote_data in tqdm(combined_df_with_etf_quotes.iterrows(), total=len(combined_df_with_etf_quotes)):
    # `quote_data.values` is positional, so this relies on the columns of `combined_df_with_etf_quotes` and `prev_close_data` being in the same ETF order
    intraday_change = ((quote_data.values - prev_close_data) / prev_close_data) * 100 * 100    # first 100 is for percent, and second 100 is for basis points
    
    predicted_ytw = pd.DataFrame()    # gets one column per daily ETF weights table
    for daily_etf_weights_table_name in DAILY_ETF_WEIGHTS_TABLES:
        predicted_ytw[daily_etf_weights_table_name] = get_prediction_for_sp_maturity_table(daily_etf_weights_table_name, 
                                                                                           timestamp_to_the_minute, 
                                                                                           day_before_target_date, 
                                                                                           intraday_change, 
                                                                                           etf_model_data[daily_etf_weights_table_name], 
                                                                                           index_data[daily_etf_weights_table_name], 
                                                                                           verbose=False)

    # transpose so that each table is a row, and name the column labeled 0 `ytw`
    yield_curve_df = predicted_ytw.T.rename({0: 'ytw'}, axis=1)
    yield_curve_df['Weighted_Maturity'] = yield_curve_df.index.map(maturity_dict).astype(float)    # look up the maturity of each table by its name
    X, y = get_NL_inputs(yield_curve_df, tau)
    X = scale_X(X, exponential_mean, exponential_std, laguerre_mean, laguerre_std)
    ridge_model = run_NL_ridge(X, y, scale=False, alpha=ALPHA)    # `scale=False` is passed here; `X` was already passed through `scale_X(...)` on the line above
    coefficient_df = get_coefficient_df(ridge_model, timestamp_to_the_minute)
    yield_curve_coefficients_for_each_timestamp.append(coefficient_df)    # kept for every minute, whether or not it is uploaded
    
    # the timestamp is formatted as YYYY-MM-DD:HH:MM to match the keys in `timestamps_with_missing_yield_curve_values`
    if timestamp_to_the_minute.strftime(YEAR_MONTH_DAY_HOUR_MIN) in timestamps_with_missing_yield_curve_values:    # only perform upload if the timestamp is missing
        if UPLOAD_TO_BIGQUERY_AND_REDIS:
            print(f'{timestamp_to_the_minute} uploaded to BigQuery and Redis since `UPLOAD_TO_BIGQUERY_AND_REDIS` is set to `True`')
            upload_data_to_bigquery(coefficient_df, f'{PROJECT_ID}.{DATASET_NAME}.nelson_siegel_coef_minute', get_schema_minute_yield())
            upload_data_to_redis(timestamp_to_the_minute, coefficient_df, scalar_df, tau)
        else:
            print(f'{timestamp_to_the_minute} is missing and would be uploaded to BigQuery and Redis if `UPLOAD_TO_BIGQUERY_AND_REDIS` were set to `True`')

In [None]:
# Display the coefficients computed for every minute as a single dataframe.
pd.concat(yield_curve_coefficients_for_each_timestamp)