This notebook corresponds to the cloud function: `update_daily_etf_prices`.

In [1]:
import os
# Set the credentials environment variable to a key file path that is relative to the
# current working directory. Presumably this is what the BigQuery calls further down
# authenticate with -- confirm the file is present before running the notebook.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'ahmad_creds.json'

In [None]:
import pandas as pd
import time
import requests
from google.cloud import bigquery

In [None]:
# Key sent as the `apikey` query parameter on every AlphaVantage request.
# NOTE(review): this credential is hard-coded in the notebook; consider loading it from
# an environment variable or a secret manager instead.
API_KEY = 'EZR0IHAAL6MFWX4B'

# ETFs that we are using for our models; each symbol is also the name of its BigQuery table.
etfs = [
    'HYD', 'HYMB', 'IBMJ', 'IBMK', 'IBML', 'ITM',
    'MLN', 'MUB', 'PZA', 'SHM', 'SHYD', 'SMB',
    'SUB', 'TFI', 'VTEB', 'FMHI', 'MMIN',
]

PROJECT_ID = 'eng-reactor-287421'
SP_ETF_DAILY_DATASET = 'ETF_daily_alphavantage'
# Fully qualified `<project>.<dataset>` prefix used to build table ids for uploads.
BQ_PROJECT_DATASET = f'{PROJECT_ID}.{SP_ETF_DAILY_DATASET}'

In [7]:
def load_daily_etf_prices_bq():
    '''Fetches the most recent row of each ETF's bigquery table.

    One query is issued per symbol in the global `etfs` list; the query orders the table by 
    `Date` descending and keeps a single row. Returns a dictionary keyed by ETF name whose 
    values are dataframes indexed by `Date`.'''
    etf_data = {}

    for table_name in etfs:
        # `ORDER BY Date DESC LIMIT 1` keeps only the newest `Date` stored for this ETF
        query = f'''SELECT * FROM {SP_ETF_DAILY_DATASET}.{table_name}  ORDER BY Date DESC LIMIT 1 '''
        latest = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')

        latest['Date'] = pd.to_datetime(latest['Date'], format='%Y-%m-%d')
        # sort chronologically, then promote `Date` from a column to the index
        etf_data[table_name] = latest.sort_values('Date').set_index('Date', drop=True)

    # every ETF must have produced an entry, in the same order as `etfs`
    assert list(etf_data.keys()) == etfs
    return etf_data


def download_daily_prices():
    '''Downloads the full daily price history of every ETF in the global `etfs` list from the AlphaVantage API.

    Returns a dictionary mapping each ETF symbol to a dataframe indexed by `Date` (as datetimes) whose 
    float columns are suffixed with the symbol (ie Open_MUB, High_MUB, Low_MUB, Close_MUB, Volume_MUB).

    Raises `requests.HTTPError` on a non-2xx response and `KeyError` when the JSON payload has no 
    'Time Series (Daily)' entry (the message includes the payload that was returned instead).'''
    dataframes = {}
    for position, etf in enumerate(etfs):
        if position > 0:
            # pause between consecutive requests so that we do not keep hitting the AlphaVantage API,
            # otherwise they will block our access; no pause is needed after the final request
            time.sleep(15)

        print(f'Downloading daily ETF prices for etf: {etf}')
        response = requests.get('https://www.alphavantage.co/query',
                                params={'function': 'TIME_SERIES_DAILY',
                                        'outputsize': 'full',
                                        'symbol': etf,
                                        'apikey': API_KEY,
                                        'adjusted': 'False'},
                                timeout=60)    # without a timeout a stalled connection would hang forever
        response.raise_for_status()
        data = response.json()

        if 'Time Series (Daily)' not in data:
            # the API can answer with a JSON body that lacks the series; surface that body
            # instead of failing with a context-free KeyError
            raise KeyError(f'No "Time Series (Daily)" in AlphaVantage response for {etf}: {data}')

        # the payload is keyed by date, so transpose to get one row per date
        df = pd.DataFrame(data['Time Series (Daily)']).T
        df = df.astype(float)    # values arrive as strings

        df.index.rename('Date', inplace=True)
        df = df.rename({'1. open': 'Open', '2. high': 'High', '3. low': 'Low', '4. close': 'Close', '5. volume': 'Volume'}, axis=1)
        df.columns = df.columns + '_' + etf
        df.index = pd.to_datetime(df.index)
        dataframes[etf] = df

    return dataframes


def get_col_names(df: pd.DataFrame):
    '''Looks up the ETF-specific price column names of `df` (ie Open_MUB, Close_MUB, etc).

    Returns a tuple ordered as (Open, Close, Volume, High, Low), where each entry is the first 
    column whose name matches that field. Raises IndexError if a field has no matching column.'''
    fields = ('Open', 'Close', 'Volume', 'High', 'Low')
    return tuple(df.filter(regex=field).columns[0] for field in fields)


def get_schema(Open: str, Close: str, Volume: str, High: str, Low: str):
    '''Builds the bigquery load configuration used to append rows to an ETF table. 

    The Open, Close, Volume, High and Low column names are passed in because they carry the ETF name. 
    The schema is a DATE column named `Date` followed by those five columns as FLOAT, in that order.'''
    float_type = bigquery.enums.SqlTypeNames.FLOAT
    schema = [bigquery.SchemaField('Date', bigquery.enums.SqlTypeNames.DATE)]
    schema.extend(bigquery.SchemaField(column, float_type) for column in (Open, Close, Volume, High, Low))
    return bigquery.LoadJobConfig(schema=schema, write_disposition='WRITE_APPEND')

In [None]:
def main(args):
    '''Appends to bigquery every downloaded AlphaVantage observation that is newer than what the table already holds.

    First loads the most recent stored row of each ETF table from bigquery, then downloads the daily data from 
    AlphaVantage. For each ETF, the rows dated strictly after the latest stored date are uploaded (so days missed 
    by earlier runs are backfilled and no date is written twice). ETFs with nothing newer to upload are collected 
    and reported instead.

    `args` is not used; it is kept so the entry-point signature stays the same.

    Returns a status string: either the list of ETFs whose data was already available, or a success message.'''
    bq_data = load_daily_etf_prices_bq()
    daily_data = download_daily_prices()
    client = bigquery.Client()

    excluded = []    # list to keep track of ETFs that have no new rows, to avoid uploading duplicates
    for name, df in daily_data.items():    # for each ETF downloaded from AlphaVantage, keep only the unseen rows, then upload if needed
        print(name)
        df = df.sort_index(ascending=True)
        df.index = pd.to_datetime(df.index)

        stored_dates = pd.to_datetime(bq_data[name].index)
        if len(stored_dates) > 0:
            # date of the most recent entry in the bigquery table, compared at day granularity
            last_date = pd.Timestamp(stored_dates.max().date())
            # strictly newer rows only: slicing from the newest downloaded date (as before) uploaded a single
            # row and silently skipped any days that earlier runs had missed
            df = df.loc[df.index.normalize() > last_date]
        # an empty bigquery table has nothing to compare against, so the whole download is uploaded

        if df.empty:    # nothing newer than what is stored, so we move on to the next etf
            excluded.append(name)
            continue

        # retrieve the column names and schema, and then upload to Bigquery
        Open, Close, Volume, High, Low = get_col_names(df)
        job_config = get_schema(Open, Close, Volume, High, Low)
        table_id = BQ_PROJECT_DATASET + '.' + name
        df = df.reset_index(drop=False).sort_values(by='Date')
        job = client.load_table_from_dataframe(df, table_id, job_config=job_config)  
        job.result()    # block until the load job finishes so that failures are raised here

    if excluded:
        return f'Data for {excluded} already available'
    else:
        return 'Upload Successful for all ETFs'

In [8]:
# Manual end-to-end run. The argument is a placeholder: `main` never reads `args`.
main('test')

HYD
HYMB
IBMJ
IBMK
IBML
IBMM
ITM
MLN
MUB
PZA
SHM
SHYD
SMB
SUB
TFI
VTEB
FMHI
MMIN
HYD
HYMB
IBMJ
IBMK
IBML
IBMM
ITM
MLN
MUB
PZA
SHM
SHYD
SMB
SUB
TFI
VTEB
FMHI
MMIN


"Data for ['IBMJ', 'IBMK'] already available"