In [67]:
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
import numpy as np
import ssl
from itertools import product
import sklearn


# Repeat the prior data mining example to identify the top 10 long/short trades
# using the last 10 years. But apply ML to determine whether we should actually
# take those trades offered by the seasonal strategy. In this case, conditional
# on a signal, assign a 1 or 0 if the strategy would have made money, and use
# ML of this on a number of other indicators:
# 1) Other seasonal indicators: Winrate for last year, last 3 yrs, last 5 yrs
# 2) Estimate of daily volatility (EWM with span parameter = 30)
# 3) Simple momentum indicator for the stock and the SP500 overall: 'up'
#    if EWM with span of 5 (1 week) exceeds the EWM with span of 10 (2 weeks)

# Define the feature functions

# Historical rate of stock going up (i.e. winrate if we are long)
def historical_up_rate(data, symbol, start_date, end_date, first_year, last_year):
    up_list = []
    # Deal with Feb 29: assign start/end dates to Mar 1
    if start_date == '02-29': start_date = '03-01'
    if end_date == '02-29': end_date = '03-01'
    for year in range(first_year, (last_year + 1)):
        full_start_date = str(year) + '-' + start_date
        full_end_date = str(year) + '-' + end_date
        trade_start_date = data.loc[full_start_date, 'Price Date']
        trade_end_date = data.loc[full_end_date, 'Price Date']
        start_price = data.loc[full_start_date, symbol]

        # If price data is missing, skip that year
        if np.isnan(start_price):
            continue
        end_price = data.loc[full_end_date, symbol]
        if np.isnan(end_price):
            continue

        if end_price >= start_price:
            ret = 1
        else:
            ret = 0
        up_list.append(ret)
    return np.mean(up_list)


# Daily volatility with exponentially weighted moving average
def get_ewm_vol(data, symbol, span):
    stock_rets = log(data.symbol / data.symbol.shift(1))
    return stock_rets.ewm(span).std()


# EWMA of simple price: compare between a long/short window to determine
# whether the signal is long
def get_ewm_price(data, symbol, span):
    return data.symbol.ewm(span).mean()

def return_stats(x, risk_free_rate = 0):
    d = {}
    d['N'] = x['Symbol'].count()
    d['avg r'] = x['Return'].mean()
    d['vol'] = x['Return'].std()
    downsides = x[x['Return'] < risk_free_rate]['Return']
    d['downside dev'] = 0 if downsides.count()==0 else downsides.std()
    upsides = x[-x['Return'] < risk_free_rate]['Return']
    d['upside dev'] = 0 if upsides.count()==0 else upsides.std()
    d['up'] = sum(x['Return']>risk_free_rate)
    return pd.Series(d, index = ['N','avg r','vol','downside dev','upside dev',
        'up'])

def seasonal_return(data, symbol, start_date, end_date, first_year, last_year):
    data_list = []
    # Deal with Feb 29: assign start/end dates to Mar 1
    if start_date == '02-29': start_date = '03-01'
    if end_date == '02-29': end_date = '03-01'
    for year in range(first_year, (last_year + 1)):
        full_start_date = str(year) + '-' + start_date
        full_end_date = str(year) + '-' + end_date
        trade_start_date = data.loc[full_start_date, 'Price Date']
        trade_end_date = data.loc[full_end_date, 'Price Date']
        start_price = data.loc[full_start_date, symbol]

        # If price data is missing, skip that year
        if np.isnan(start_price):
            continue
        end_price = data.loc[full_end_date, symbol]
        if np.isnan(end_price):
            continue
        returns = (end_price / start_price) - 1
        data_list.append([symbol, year, trade_start_date, start_price,
                          trade_end_date,
                          end_price, returns])

    df = pd.DataFrame(data_list, columns=['Symbol', 'Year', 'Init Date',
                                          'Init Price', 'Final Date', 'Final Price', 'Return'])
    return df

In [68]:
# Pull the adjusted close prices off Yahoo Finance
df = pd.read_csv("S&P500-Symbols.csv")
tickers = list(df['Symbol'])
start_date = '1989-01-01'
end_date = '2024-01-03'  # Get data a few days past end of year to backfill

# Either pull from Yahoo finance, or for read the pre-downloaded CSV
# data = pd.DataFrame(yf.download(tickers, start_date, end_date)['Adj Close'])
# data.reset_index().to_csv("S&P500-adjusted-close.csv", index=False)
data = pd.read_csv('S&P500-adjusted-close.csv')
data['Date']= pd.to_datetime(data['Date'])
data = data.set_index('Date')
all_dates = pd.date_range(start_date, end_date)
data['Price Date'] = data.index

# Backfill with trading prices for missing dates
data = data.reindex(all_dates, method='bfill')
sp500_dates_added = pd.read_csv("S&P500-Info.csv")[['Symbol','Date added']]

all_stocks = data.columns.drop(labels='Price Date')

# Only keep through end of 2023
data = data[data.index < '2024-01-01']

# Restrict to stocks that have data back in 2009, total of 432
sub_cols = data.columns[data.loc['2009-01-01'].notna()][0:20]
sub_cols = sub_cols.append(pd.Index(['Price Date']))
sub_stocks = data[sub_cols].columns.drop(labels='Price Date')
                                                        
sub_data = data[sub_cols][data.index >= '2009-01-01']

hold_range = [7, 14, 28]  # hold for a fixed number of weeks, up to a month
delay_range = [0, 5, 10]
start_months = list(range(1, 12 + 1))
start_days = ['-01', '-15']
initial_dates = [str(i) + j for i, j in product(start_months, start_days)]


In [71]:
all_returns_list = []

for trade_year in [2019, 2020]:
    start_year = trade_year - 10
    end_year = trade_year - 1


    for initial_date in initial_dates:

        initial_calendar_year = datetime.strptime(
            str(trade_year) + "-" + initial_date, "%Y-%m-%d")

        # Delay refers to how many days after the 1st or 15th we start the trade
        # This technically means we will never start a position on the 29-31st
        for delay in delay_range:
            start_calendar_year = initial_calendar_year + timedelta(days=delay)
            start_calendar = start_calendar_year.strftime('%m-%d')

            for hold_length in hold_range:
                end_calendar = (start_calendar_year + timedelta(days=hold_length)
                                ).strftime('%m-%d')
                stock_returns_list = []

                for stock in sub_stocks:
                    stock_returns_list.append(seasonal_return(sub_data, stock,
                                                              start_calendar, end_calendar, start_year, end_year))

                seasonal_returns = pd.concat(stock_returns_list)
                symbol_stats = seasonal_returns.groupby('Symbol').apply(
                    return_stats, risk_free_rate=0)
                symbol_stats['trade window'] = initial_date
                symbol_stats['start date'] = start_calendar_year
                symbol_stats['end date'] = end_calendar
                symbol_stats['Sharpe Long'] = symbol_stats['avg r'] / symbol_stats['vol']
                symbol_stats['Sharpe Short'] = -symbol_stats['avg r'] / symbol_stats['vol']

                symbol_stats['hold length'] = hold_length
                all_returns_list.append(symbol_stats)

all_returns = pd.concat(all_returns_list)


In [72]:
all_returns

Unnamed: 0_level_0,N,avg r,vol,downside dev,upside dev,up,trade window,start date,end date,Sharpe Long,Sharpe Short,hold length
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,10.0,0.024908,0.047819,0.025198,0.040233,8.0,1-01,2019-01-01,01-08,0.520873,-0.520873,7
AAL,10.0,0.056909,0.056717,0.006012,0.038531,7.0,1-01,2019-01-01,01-08,1.003383,-1.003383,7
AAPL,10.0,-0.000371,0.037324,0.031590,0.008680,6.0,1-01,2019-01-01,01-08,-0.009938,0.009938,7
ABT,10.0,0.002839,0.032413,0.019538,0.017176,7.0,1-01,2019-01-01,01-08,0.087596,-0.087596,7
ACGL,10.0,-0.009261,0.014943,0.011300,0.013221,2.0,1-01,2019-01-01,01-08,-0.619717,0.619717,7
...,...,...,...,...,...,...,...,...,...,...,...,...
AIG,10.0,-0.037151,0.399155,0.149413,0.347461,3.0,12-15,2020-12-25,01-22,-0.093073,0.093073,28
AIZ,10.0,-0.131901,0.155974,0.126455,0.027708,2.0,12-15,2020-12-25,01-22,-0.845657,0.845657,28
AJG,10.0,-0.149574,0.124210,0.077128,0.051427,2.0,12-15,2020-12-25,01-22,-1.204203,1.204203,28
AKAM,10.0,-0.086542,0.288647,0.104769,0.228526,4.0,12-15,2020-12-25,01-22,-0.299820,0.299820,28


In [30]:
sub_cols = data.columns[data.loc['2009-01-01'].notna()]
sub_stocks = data[sub_cols].columns.drop(labels='Price Date')


In [45]:
sub_cols[0:20]

Index(['A', 'AAL', 'AAPL', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP',
       'ADSK', 'AEE', 'AEP', 'AES', 'AFL', 'AIG', 'AIZ', 'AJG', 'AKAM', 'ALB'],
      dtype='object')

In [11]:
data.AAL.shift(1)

1989-01-01      NaN
1989-01-02      NaN
1989-01-03      NaN
1989-01-04      NaN
1989-01-05      NaN
              ...  
2023-12-27    14.11
2023-12-28    13.99
2023-12-29    13.98
2023-12-30    13.74
2023-12-31    13.44
Freq: D, Name: AAL, Length: 12783, dtype: float64

In [60]:
sub_cols = data.columns[data.loc['2009-01-01'].notna()]


In [63]:
sub_cols

Index(['A', 'AAL', 'AAPL', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP',
       ...
       'WY', 'WYNN', 'XEL', 'XOM', 'XRAY', 'YUM', 'ZBH', 'ZBRA', 'ZION',
       'Price Date'],
      dtype='object', length=433)