In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import json
import difflib
import yfinance as yf
from sklearn.linear_model import LinearRegression

# Half-life used for the exponentially weighted statistics in this notebook: it is
# the default `halflife` of `betas` and the half-life of the ETF volume z-score.
# Presumably 63 is the number of trading days in a quarter (about 252 / 4) -- confirm.
days_in_quarter = 63

The next cell reuses the code from `beta_values` to load the previously compiled data containing the holdings per day of each ETF.

In [4]:
# Build `holdings_per_day`: for every ETF, a table with one row per calendar day and
# one column per ticker, holding True where the ticker is held and NaN elsewhere.
# NOTE: this cell is still ad hoc and should be cleaned up; the raw data files may
# need tweaking before that is possible.

# Paths as originally listed (Windows separators).  S000051152 (XLRE) is not in this
# list, so no holdings are loaded for it.
file_list = [
    'data\\S000006408.csv',
    'data\\S000006409.csv',
    'data\\S000006410.csv',
    'data\\S000006411.csv',
    'data\\S000006412.csv',
    'data\\S000006413.csv',
    'data\\S000006414.csv',
    'data\\S000006415.csv',
    'data\\S000006416.csv',
    'data\\S000062095.csv',
]

series_to_ticker_mapping = {
    'S000006408': 'XLY',
    'S000006409': 'XLP',
    'S000006410': 'XLE',
    'S000006411': 'XLF',
    'S000006412': 'XLV',
    'S000006413': 'XLI',
    'S000006414': 'XLB',
    'S000006415': 'XLK',
    'S000006416': 'XLU',
    'S000051152': 'XLRE',
    'S000062095': 'XLC'
}

start_date = np.datetime64('2019-10-01')
end_date = np.datetime64('2024-04-01')

# Every calendar day in [start_date, end_date); the same for all ETFs, so built once.
date_range = np.arange(start_date, end_date)

holdings_per_day = {}

for listed_path in file_list:
    # Keep only the bare file name; its stem is the series id that maps to the ETF ticker.
    file_name = listed_path.split('\\')[-1]
    series = file_name.split('.')[0]
    ticker = series_to_ticker_mapping[series]

    raw = pd.read_csv(f'data/{file_name}')
    raw.index = raw['Tickers']

    # Keep the columns at positions 4 and up, in reverse order; their labels are
    # parsed as dates below.
    snapshots = raw.iloc[:, :3:-1]

    # Ad-hoc fix: some tickers appear on several rows because the associated company
    # name label is not consistent (e.g. LOW in the ETF XLY), so rows sharing a
    # ticker are summed.
    snapshots = snapshots.groupby(snapshots.index).sum()
    snapshots.columns = pd.to_datetime(snapshots.columns)

    # For each day, use the most recent snapshot dated strictly before that day.
    series_holdings_per_day = {
        date: snapshots[snapshots.columns[snapshots.columns < date].max()]
        for date in date_range
    }

    # Rows = days, columns = tickers; True where the snapshot value is non-zero.
    held = pd.DataFrame(series_holdings_per_day).transpose() != 0

    # Replace False by NaN so that only held positions carry a value.
    holdings_per_day[ticker] = held.where(held, np.nan)

These functions were in the beta_values file, but have been amended slightly.  (Please replace the old version of this code in beta_values with this new code.)

In [5]:
def returns(ticker,start_date=None,end_date=None):
    """Return the daily simple returns of `ticker` from Yahoo Finance closing prices.

    The full available price history is downloaded, the time zone is stripped from
    the index, and each day's return is computed as
    (close - previous close) / previous close, so the first value is NaN.

    Parameters
    ----------
    ticker : str
        Symbol passed to `yf.Ticker`.
    start_date, end_date : optional
        Inclusive bounds of the returned window.  Each bound is converted with
        `str(...)` and used as a label for slicing the date index, so
        `np.datetime64`, `pd.Timestamp` and date strings are all accepted.  Either
        bound may be omitted independently; with neither, the whole history is
        returned.

    Returns
    -------
    pd.Series
        Simple returns indexed by time-zone-naive timestamps.
    """
    data = yf.Ticker(ticker).history(period='max')
    data.index = data.index.tz_localize(None)

    close = data['Close']
    previous_close = close.shift(1)
    simple_returns = (close - previous_close) / previous_close

    # A missing bound becomes None, which leaves that side of the slice open.
    first = None if start_date is None else str(start_date)
    last = None if end_date is None else str(end_date)
    return simple_returns.loc[first:last]
    
def log_returns(ticker,start_date=None,end_date=None):
    """Return the daily log returns of `ticker` from Yahoo Finance closing prices.

    The full available price history is downloaded, the time zone is stripped from
    the index, and each day's log return is computed as
    log(close / previous close), so the first value is NaN.

    Parameters
    ----------
    ticker : str
        Symbol passed to `yf.Ticker`.
    start_date, end_date : optional
        Inclusive bounds of the returned window.  Each bound is converted with
        `str(...)` and used as a label for slicing the date index, so
        `np.datetime64`, `pd.Timestamp` and date strings are all accepted.  Either
        bound may be omitted independently; with neither, the whole history is
        returned.

    Returns
    -------
    pd.Series
        Log returns indexed by time-zone-naive timestamps.
    """
    data = yf.Ticker(ticker).history(period='max')
    data.index = data.index.tz_localize(None)

    close = data['Close']
    daily_log_returns = np.log(close / close.shift(1))

    # str() works for every supported bound type, whereas `.astype(str)` exists only
    # on numpy scalars.  A missing bound becomes None and leaves that side open.
    first = None if start_date is None else str(start_date)
    last = None if end_date is None else str(end_date)
    return daily_log_returns.loc[first:last]

def betas(stock, etf, start_date, end_date, L_min=100, halflife=days_in_quarter):
    """Estimate, for each day, the beta of `stock` with respect to `etf`.

    For every day t in `np.arange(start_date, end_date)` a weighted linear
    regression of the stock's returns on the ETF's returns is fitted, using only
    observations dated up to t - 1, so that the beta of day t depends solely on
    returns from before day t.  Observations are weighted with exponentially
    decaying weights so that recent returns count more.

    The regression coefficient measures how much the stock's returns are expected
    to move in response to the ETF's returns.  Stocks with larger betas contribute
    more to the volatility of the ETF; intuitively, stocks with smaller betas are
    likely less related to the themes surrounding most high-volume selloffs of the
    ETF.

    Parameters
    ----------
    stock, etf : str
        Tickers whose full return histories are fetched with `returns`.
    start_date, end_date
        Bounds handed to `np.arange` (end excluded); this notebook passes
        `np.datetime64` values.
    L_min : int
        Minimum history length.  A beta is only fitted when more than `L_min`
        joint observations are available; for newly listed stocks the estimate is
        unstable and not very meaningful, so NaN is returned instead.
    halflife : number
        An observation `halflife` rows older receives half the weight.

    Returns
    -------
    pd.Series
        One beta (or NaN) per day, indexed by the days of the range.
    """
    days = np.arange(start_date, end_date)  # alternative: pd.date_range(start=start_date, end=end_date, freq='D')

    # Keep only the dates on which both return series are available.
    paired_returns = pd.DataFrame({stock: returns(stock), etf: returns(etf)}).dropna()

    # Per-observation decay factor implied by the half-life.
    decay = 0.5 ** (1 / halflife)

    coefficients = []
    for day in days:
        # Restrict to observations through day - 1 (no look-ahead).
        history = paired_returns.loc[:(day - 1)]
        etf_history = history[etf].values.reshape(-1, 1)
        stock_history = history[stock].values
        n_obs = len(stock_history)

        if n_obs <= L_min:
            coefficients.append(np.nan)
            continue

        # The newest observation gets weight decay**0, the oldest decay**(n_obs - 1);
        # the weights are then normalised to sum to one.
        weights = decay ** np.arange(n_obs - 1, -1, -1)
        weights = weights / weights.sum()

        fitted = LinearRegression().fit(etf_history, stock_history, sample_weight=weights)
        coefficients.append(fitted.coef_[0])

    return pd.Series(data=coefficients, index=days)

We now retrieve the previously compiled ETF beta values of the stocks in each ETF.

In [6]:
# ETFs handled in the rest of the notebook (XLRE is commented out).
etf_tickers = {
    'XLY',
    'XLP',
    'XLE',
    'XLF',
    'XLV',
    'XLI',
    'XLB',
    'XLK',
    'XLU',
    #'XLRE',
    'XLC'
}

# ETF ticker -> table of precomputed betas read from data/<ETF>_betas_per_day.csv.
betas_per_day = {}

for etf in etf_tickers:
    etf_betas = pd.read_csv(f"data/{etf}_betas_per_day.csv")

    # The first column of the file becomes the index: parsed as dates and named 'Day'.
    etf_betas = etf_betas.set_index(etf_betas.columns[0])
    etf_betas.index = pd.to_datetime(etf_betas.index).rename('Day')

    betas_per_day[etf] = etf_betas


Function to calculate an exponentially weighted z-score for a time series.

In [7]:
def exp_weighted_z_score(data,halflife):
    """Exponentially weighted z-score of each observation against its own past.

    Each value is compared with the exponentially weighted mean and standard
    deviation computed through the *previous* observation (both are shifted by one
    row), so the current value never enters its own baseline.

    Parameters
    ----------
    data : pd.Series or pd.DataFrame
        Time series to standardise.
    halflife : number
        Half-life passed to `ewm`.

    Returns
    -------
    Same type as `data`
        (data - previous EW mean) / previous EW std; the leading entries, for which
        no previous standard deviation exists, are NaN.
    """
    window = data.ewm(halflife=halflife)
    prior_mean = window.mean().shift(1)
    prior_std = window.std().shift(1)
    return (data - prior_mean) / prior_std

For each ETF, our strategy must answer two questions:

(1) *When to invest?*  The idea of our strategy involves investing after a day during which the ETF volume is statistically large and the ETF return is negative.  We compute these days in `high_volume_neg_return_days` below.

(2) *Which stocks to invest in?*

Once we choose a day $D$ and a stock $S$ within an ETF, we must determine the following: if we invested in the stock $S$ at the *close* of day $D$ and held it for 40 days, how does the stock's return $r_S$ compare with the return $r_{ETF}$ of the ETF over that same period?  Since we want to measure our portfolio's alpha compared with the ETF, we leverage our portfolio with the ETF beta value $\beta_S$ of the stock on day $D$.  Therefore, we define the alpha of the stock's 40-day return versus the ETF as $r_S/\beta_S - r_{ETF}$.  These alphas are computed in `stock_40_day_alpha`.




In [8]:
# Number of rows (trading days in the downloaded price history) a position is held,
# and the volume z-score at or above which ETF volume counts as statistically large.
holding_period_days = 40
volume_z_threshold = 3

# All four dictionaries are keyed by ETF ticker.
stock_log_returns = {}
stock_40_day_returns = {}
stock_40_day_alpha = {}
high_volume_neg_return_days = {}

for etf in etf_tickers:

    # Tickers of the ETF itself followed by every stock in the ETF's holdings table
    # for the time period being considered.
    stocks = holdings_per_day[etf].columns
    tickers = [etf] + stocks.values.tolist()

    # Daily log returns of each of these tickers during the time period.
    etf_stock_log_returns = pd.DataFrame({ticker: log_returns(ticker, start_date=start_date, end_date=end_date) for ticker in tickers})

    # For each day, the simple return from buying at that day's close and holding
    # for the next `holding_period_days` rows: the log returns of rows +1 ... +40 are
    # summed and converted back with exp(.) - 1.  Days without a full forward window
    # come out as NaN.
    etf_stock_40_day_returns = np.exp(sum(etf_stock_log_returns.shift(-i) for i in range(1, holding_period_days + 1))) - 1

    # Alpha of each stock versus the ETF, as defined in the text of this notebook:
    # r_S / beta_S - r_ETF, i.e. the stock's return is first scaled by 1 / beta and
    # the ETF's return over the same window is then subtracted.
    # (The previous code computed (r_S - r_ETF) / beta_S, which also divides the ETF
    # return by beta and therefore disagreed with that definition.)
    etf_40_day_returns = etf_stock_40_day_returns[etf]
    etf_betas = betas_per_day[etf]

    df = etf_stock_40_day_returns[stocks].copy()

    for stock in stocks:
        df[stock] = df[stock] / etf_betas[stock] - etf_40_day_returns

    # Drop the days on which no stock has an alpha.
    df = df.dropna(how='all')

    etf_stock_40_day_alpha = df

    # Flag the days on which the ETF volume is statistically large (z-score at or
    # above the threshold) and the ETF return is negative, restricted to the span of
    # days that have an alpha.
    start = etf_stock_40_day_alpha.index[0]
    end = etf_stock_40_day_alpha.index[-1]

    etf_returns = returns(etf, start_date=start, end_date=end)

    etf_ticker = yf.Ticker(etf)
    etf_vol = etf_ticker.history(period='max').Volume.tz_localize(None)
    etf_vol_z_scores = exp_weighted_z_score(etf_vol, halflife=days_in_quarter).loc[start:end]
    etf_high_volume_neg_return_days = (etf_vol_z_scores >= volume_z_threshold) & (etf_returns < 0)

    # Store everything just computed under the ETF ticker.
    stock_log_returns[etf] = etf_stock_log_returns
    stock_40_day_returns[etf] = etf_stock_40_day_returns
    stock_40_day_alpha[etf] = etf_stock_40_day_alpha
    high_volume_neg_return_days[etf] = etf_high_volume_neg_return_days