In [26]:
from statsmodels.regression.rolling import RollingOLS
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import yfinance as yf   
import datetime as dt

#%pip install pandas_ta --upgrade --quiet
import pandas_ta
import warnings
warnings.filterwarnings("ignore")

In [27]:
# Get the S&P 500 tickers from Wikipedia
# This will fetch the current list of S&P 500 companies
sp500Tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
sp500Tickers = sp500Tickers[0]
sp500Tickers['Symbol'] = sp500Tickers['Symbol'].str.replace('.', '-').unique()
sp500Tickers = sp500Tickers['Symbol'].to_list()

# Set the start and end dates for the data
end_date = dt.datetime(2024, 12, 31)
start_date = end_date - dt.timedelta(days=365*8)  # 5 years of data

In [28]:
# download the data
data = yf.download(sp500Tickers, start=start_date, end=end_date)
data

[*********************100%***********************]  503 of 503 completed


Price,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-03,43.559391,26.827242,43.450214,,33.456699,27.224224,101.952766,103.480003,61.632336,35.965237,...,789800,2650200,2703500,2065100,10360600,1596700,4793400,1422533,388200,3579700
2017-01-04,44.130951,26.797215,44.062885,,33.722301,27.417574,102.197884,104.139999,61.504829,35.902950,...,477200,3678500,2598800,2542600,9434200,1703000,2835200,1547884,463200,3316300
2017-01-05,43.606239,26.933496,44.397057,,34.013599,27.138645,100.665886,105.910004,60.620853,35.638222,...,583600,3548000,2507500,2611900,14443200,1846000,4040000,1497517,486600,2469100
2017-01-06,44.964848,27.233757,44.410973,,34.938908,27.408066,101.812706,108.300003,60.858826,34.820656,...,812200,2679300,1858400,1707400,16518100,1561600,2863800,1875630,308300,2845800
2017-01-09,45.105389,27.483208,44.703384,,34.904633,27.011858,100.674637,108.570000,61.147827,34.844013,...,467800,4017600,2776200,1840100,13762300,1090400,1944200,1198199,263300,2123300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-23,133.785095,254.655716,175.105225,134.520004,113.188950,92.089996,355.488953,446.739990,212.989365,49.345444,...,411300,3546700,1312400,2535600,12285100,945800,1479100,1948300,238300,2437800
2024-12-24,135.276031,257.578674,176.685577,134.990005,113.634544,92.669998,358.322815,447.940002,216.161041,49.560802,...,191200,1780100,692800,943900,7807000,379300,533000,458600,88700,1023600
2024-12-26,135.007660,258.396667,175.900314,135.320007,114.139534,92.930000,357.133789,450.160004,216.131317,49.541222,...,258700,1736500,1218900,1394900,9652400,575700,1040900,1277300,140100,2167200
2024-12-27,134.719391,254.974930,174.732224,133.384995,113.862282,92.339996,352.922638,446.480011,215.070786,49.511856,...,310700,2320500,1086700,2015000,11943900,552400,1146300,743400,287200,1800100


In [29]:
df = data.copy()
df = df.stack()
# make Date', 'Ticker' the indexes
df.index.names = ['Date', 'Ticker']
df.columns = df.columns.str.lower()
df

Unnamed: 0_level_0,Price,close,high,low,open,volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,A,43.559391,43.803000,42.856669,43.034691,1739600.0
2017-01-03,AAPL,26.827242,26.868817,26.506193,26.746403,115127600.0
2017-01-03,ABBV,43.450214,43.881861,43.122997,43.805278,9328200.0
2017-01-03,ABT,33.456699,33.482405,32.848397,33.096859,9677300.0
2017-01-03,ACGL,27.224224,27.712353,27.106947,27.522172,942900.0
...,...,...,...,...,...,...
2024-12-30,XYL,115.551468,116.068214,114.438490,115.730340,586800.0
2024-12-30,YUM,132.243332,133.124821,131.728300,132.847500,1144600.0
2024-12-30,ZBH,104.902634,105.848148,104.156173,105.549568,1532000.0
2024-12-30,ZBRA,383.850006,386.959991,378.149994,385.059998,211300.0


### Technical indicators:

- Garman-Klaus:
$$ GKV = \frac{(\ln(High)-\ln(Low))^2}{2}- (2\ln(2)-1)(\ln(AQdj Close)-\ln(Open))^2$$

In [30]:
# Geman-Klass volatility calculation
df['garman_klass_vol'] = 0.5*(np.log(df['high'])-np.log(df['low']))**2 -\
      (2*np.log(2)-1)*(np.log(df['close']-np.log(df['open'])))**2

# RSI calculation using pandas_ta
df['rsi'] = df.groupby(level=1)['close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

# Bolinger Bands calculation using pandas_ta
df['bb_lower'] = \
    df.groupby(level=1)['close'].transform(lambda x: pandas_ta.bbands(close=x, length=20, std=2).iloc[:, 0].T.values)
df['bb_middle']= \
    df.groupby(level=1)['close'].transform(lambda x: pandas_ta.bbands(close=x, length=20, std=2).iloc[:, 1].T.values)
df['bb_upper'] = \
    df.groupby(level=1)['close'].transform(lambda x: pandas_ta.bbands(close=x, length=20, std=2).iloc[:, 2].T.values)
# Normalised ATR calculation using pandas_ta
def compute_ATR(stock_df):
    atr = pandas_ta.atr(high=stock_df['high'], 
                        low=stock_df['low'], 
                        close=stock_df['close'], 
                        length=20)
    return atr.sub(atr.mean()).div(atr.std())

df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_ATR)

#Compute Normalised MACD using pandas_ta
def calc_MACD(stock_df):
    macd = pandas_ta.macd(close=stock_df, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

df['macd'] = df.groupby(level=1, group_keys=False)['close'].apply(calc_MACD)

# Compute dollar-volume
df['dollar_vol'] = df['close'] * df['volume']/1e6 # in millions

df

Unnamed: 0_level_0,Price,close,high,low,open,volume,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd,dollar_vol
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-01-03,A,43.559391,43.803000,42.856669,43.034691,1739600.0,-5.241927,,,,,,,75.775917
2017-01-03,AAPL,26.827242,26.868817,26.506193,26.746403,115127600.0,-3.854205,,,,,,,3088.555974
2017-01-03,ABBV,43.450214,43.881861,43.122997,43.805278,9328200.0,-5.232926,,,,,,,405.312290
2017-01-03,ABT,33.456699,33.482405,32.848397,33.096859,9677300.0,-4.464780,,,,,,,323.770517
2017-01-03,ACGL,27.224224,27.712353,27.106947,27.522172,942900.0,-3.892041,,,,,,,25.669721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-30,XYL,115.551468,116.068214,114.438490,115.730340,586800.0,-8.561229,35.375384,112.122514,121.125829,130.129145,0.668219,-1.585002,67.805601
2024-12-30,YUM,132.243332,133.124821,131.728300,132.847500,1144600.0,-9.075210,46.389789,129.571025,134.685751,139.800478,0.789448,-0.514964,151.365718
2024-12-30,ZBH,104.902634,105.848148,104.156173,105.549568,1532000.0,-8.200897,43.454495,103.925321,107.000076,110.074830,-0.617093,-0.307769,160.710835
2024-12-30,ZBRA,383.850006,386.959991,378.149994,385.059998,211300.0,-13.604915,45.199475,381.008820,399.536504,418.064187,0.047316,-0.332876,81.107506


### Aggregate Indicators and Filter top 150 most liquid stocks (monthly)

- Convert daily data to monthly data

In [31]:
#Create list of columns we will take last monthly value
monthly_cols = [col for col in df.columns.unique() if col not in ['open', 'high', 'low' 
                                                                  , 'volume', 'dollar_vol']]
monthly_cols

['close',
 'garman_klass_vol',
 'rsi',
 'bb_lower',
 'bb_middle',
 'bb_upper',
 'atr',
 'macd']

In [32]:
# Resample dollar-vol monthly
temp1 = df.unstack('Ticker')['dollar_vol'].resample('M').mean().stack('Ticker').to_frame('dollar_vol.M')
temp1

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_vol.M
Date,Ticker,Unnamed: 2_level_1
2017-01-31,A,101.729835
2017-01-31,AAPL,3113.181687
2017-01-31,ABBV,338.538871
2017-01-31,ABT,368.277052
2017-01-31,ACGL,29.688113
...,...,...
2024-12-31,XYL,165.083142
2024-12-31,YUM,206.888270
2024-12-31,ZBH,164.685230
2024-12-31,ZBRA,121.040691


In [33]:
temp2 = df.unstack('Ticker')[monthly_cols].resample('M').last().stack('Ticker')
temp2

Unnamed: 0_level_0,Price,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-31,A,45.883068,-5.401556,,43.585911,44.939544,46.293177,,
2017-01-31,AAPL,28.028299,-3.972144,,26.805017,27.617167,28.429316,,
2017-01-31,ABBV,42.974415,-5.201184,,41.996995,43.386469,44.775942,,
2017-01-31,ABT,36.018272,-4.677740,,33.751229,34.937108,36.122986,,
2017-01-31,ACGL,28.003962,-3.968958,,26.968826,27.667502,28.366179,,
...,...,...,...,...,...,...,...,...,...
2024-12-31,XYL,115.551468,-8.561229,35.375384,112.122514,121.125829,130.129145,0.668219,-1.585002
2024-12-31,YUM,132.243332,-9.075210,46.389789,129.571025,134.685751,139.800478,0.789448,-0.514964
2024-12-31,ZBH,104.902634,-8.200897,43.454495,103.925321,107.000076,110.074830,-0.617093,-0.307769
2024-12-31,ZBRA,383.850006,-13.604915,45.199475,381.008820,399.536504,418.064187,0.047316,-0.332876


In [34]:
df2 = pd.concat([temp1, temp2], axis=1)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_vol.M,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-31,A,101.729835,45.883068,-5.401556,,43.585911,44.939544,46.293177,,
2017-01-31,AAPL,3113.181687,28.028299,-3.972144,,26.805017,27.617167,28.429316,,
2017-01-31,ABBV,338.538871,42.974415,-5.201184,,41.996995,43.386469,44.775942,,
2017-01-31,ABT,368.277052,36.018272,-4.677740,,33.751229,34.937108,36.122986,,
2017-01-31,ACGL,29.688113,28.003962,-3.968958,,26.968826,27.667502,28.366179,,
...,...,...,...,...,...,...,...,...,...,...
2024-12-31,XYL,165.083142,115.551468,-8.561229,35.375384,112.122514,121.125829,130.129145,0.668219,-1.585002
2024-12-31,YUM,206.888270,132.243332,-9.075210,46.389789,129.571025,134.685751,139.800478,0.789448,-0.514964
2024-12-31,ZBH,164.685230,104.902634,-8.200897,43.454495,103.925321,107.000076,110.074830,-0.617093,-0.307769
2024-12-31,ZBRA,121.040691,383.850006,-13.604915,45.199475,381.008820,399.536504,418.064187,0.047316,-0.332876


#### Step 3: 5-year average rolling dollar-vol average

In [35]:
# 5-year average rolling dollar-vol average
df2['dollar_vol.M'] =  (df2.loc[:, 'dollar_vol.M'].unstack('Ticker').rolling(5*12, min_periods=12).mean().stack())

df2['dollar_vol_rank'] = (df2.groupby('Date')['dollar_vol.M'].rank(ascending=False))
# Filter top 150 most liquid stocks
df2 = df2[df2['dollar_vol_rank']<150].drop(['dollar_vol.M', 'dollar_vol_rank'], axis=1)

df2



Unnamed: 0_level_0,Unnamed: 1_level_0,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-12-31,AAPL,39.715725,-4.962637,47.784541,39.286545,40.340343,41.394141,-1.302034,-0.285476
2017-12-31,ABBV,69.786903,-6.757277,55.000855,68.081711,69.829850,71.577988,-1.031019,0.094507
2017-12-31,ABT,50.030010,-5.669368,59.877837,47.172267,49.025373,50.878479,-1.465099,0.170748
2017-12-31,ACN,136.721710,-9.204026,65.003603,130.942321,135.016841,139.091362,-1.237276,0.218456
2017-12-31,ADBE,175.240005,-10.190738,51.008420,169.207075,174.475001,179.742927,-1.414954,-0.172391
...,...,...,...,...,...,...,...,...,...
2024-12-31,VZ,37.686069,-4.807737,33.994203,36.766054,39.358626,41.951198,-0.045572,-1.651882
2024-12-31,WDAY,262.000000,-11.885085,49.133109,255.024137,268.768003,282.511870,1.221471,0.242988
2024-12-31,WFC,69.681808,-6.752939,50.026251,67.605490,70.964894,74.324297,1.887223,-0.263496
2024-12-31,WMT,90.104118,-7.648514,51.567988,90.104275,92.965969,95.827663,3.193829,0.985685


#### Step 4: Calculate monthly returns for different time horizons

- Captures time series dynamics like Momentum patterns

In [36]:
# subset one ticker: AAPL
def calculate_returns(temp1):

    outlier_cut_off = 0.005

    lags = [1, 2, 3, 6, 9, 12] #Months
    for lag in lags:
        temp1[f'returns.{lag}M'] = (temp1['close']
                                    .pct_change(lag)
                                    .pipe(lambda x: x.clip(lower=x.quantile(outlier_cut_off),
                                                        upper=x.quantile(1-outlier_cut_off)))
                                    .add(1)
                                    .pow(1/lag)
                                    .sub(1)
        )
    temp1 = temp1.dropna()
    return temp1


df2 = df2.groupby(level=1, group_keys=False).apply(calculate_returns).dropna()
df2.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd,returns.1M,returns.2M,returns.3M,returns.6M,returns.9M,returns.12M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-12-31,AAPL,37.574974,-4.79897,37.072835,34.785068,39.30008,43.815092,-0.725781,-1.244711,-0.116699,-0.132708,-0.101117,-0.025181,-0.005637,-0.004607
2019-01-31,AAPL,39.647385,-4.957928,53.482555,34.167435,36.678956,39.190477,-0.865458,-0.382622,0.055154,-0.034589,-0.086165,-0.020933,0.001996,0.000748
2019-02-28,AAPL,41.422657,-5.088555,57.086764,39.812174,40.966915,42.121656,-1.133613,0.012537,0.044777,0.049952,-0.008827,-0.04289,-0.007195,-0.001098
2019-03-31,AAPL,45.441734,-5.370764,63.873656,40.834471,43.948926,47.063382,-1.079795,0.171826,0.097026,0.070583,0.065415,-0.027103,0.004128,0.011669
2019-04-30,AAPL,48.006268,-5.539714,60.713631,46.257152,48.08043,49.903708,-1.157427,0.140235,0.056436,0.07654,0.065846,-0.013082,0.007179,0.017591
2019-05-31,AAPL,42.04327,-5.13329,34.632847,40.401967,45.441545,50.481122,-0.956201,-0.927129,-0.124213,-0.03812,0.004969,-0.001953,-0.027496,-0.004168
2019-06-30,AAPL,47.530724,-5.509407,56.282124,42.888483,46.341735,49.794987,-1.034384,-0.040209,0.130519,-0.004965,0.015095,0.03995,-0.013235,0.006858
2019-07-31,AAPL,51.161812,-5.73854,64.75573,47.816128,49.356833,50.897537,-1.087124,0.030495,0.076395,0.103125,0.021447,0.043411,-0.001704,0.010727
2019-08-31,AAPL,50.31963,-5.687658,54.15897,46.792277,49.341136,51.889995,-0.909274,-0.185458,-0.016461,0.02892,0.061729,0.032959,0.018838,-0.005919
2019-09-30,AAPL,53.991024,-5.912642,60.753856,50.289536,52.550668,54.811799,-0.988539,0.015433,0.072961,0.027278,0.043396,0.029148,0.041098,0.000627


#### Step 5: Download Fama-French Factors and Calc Rolling Factor beta

- Calculate exposure of assets to common risk factors
- Five Fama-French Factors:
    - Market risk
    - Size
    - Value
    - Operating Profitability
    - Investment
- Used to assess risk/return profiles of portfolios
- Access using `pandas-datareader`

In [37]:
import pandas_datareader.data as web

In [38]:
start_date

datetime.datetime(2017, 1, 2, 0, 0)

In [39]:
factor_data = web.DataReader('F-F_Research_Data_5_Factors_2x3',
               'famafrench',
               start = 2010)[0]
factor_data = factor_data.drop(columns=['RF'])
factor_data = factor_data.rename(columns={'Mkt-RF': 'mkt_rf', 'SMB': 'smb', 'HML': 'hml', 'RMW': 'rmw', 'CMA': 'cma'})
factor_data.index = factor_data.index.to_timestamp()
factor_data = factor_data.resample('M').last().div(100)

factor_data.index.name = 'Date'
factor_data.head()


Unnamed: 0_level_0,mkt_rf,smb,hml,rmw,cma
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-31,-0.0335,0.004,0.0033,-0.0108,0.0051
2010-02-28,0.0339,0.0149,0.0318,-0.0029,0.0142
2010-03-31,0.063,0.0183,0.0219,-0.0061,0.0174
2010-04-30,0.02,0.0496,0.0296,0.0061,0.0175
2010-05-31,-0.079,0.0008,-0.0248,0.013,-0.0024


In [40]:
tyemp = df2.copy()
tyemp = tyemp.reset_index()
factors = factor_data.copy().reset_index()
tyemp = pd.merge(factors,tyemp[['Date', 'Ticker','returns.1M']], on ='Date')
tyemp.set_index(['Date', 'Ticker'], inplace=True)
tyemp


Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-12-31,AAPL,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.116699
2018-12-31,ABBV,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.022064
2018-12-31,ABT,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.023228
2018-12-31,ACN,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.142901
2018-12-31,ADBE,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.098250
...,...,...,...,...,...,...,...
2024-12-31,VZ,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,-0.106901
2024-12-31,WDAY,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,0.048042
2024-12-31,WFC,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,-0.075620
2024-12-31,WMT,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,-0.018695


In [41]:
factor_data = tyemp.copy()
factor_data = factor_data.sort_index()
factor_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-12-31,AAPL,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.116699
2018-12-31,ABBV,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.022064
2018-12-31,ABT,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.023228
2018-12-31,ACN,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.142901
2018-12-31,ADBE,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.09825


In [42]:
factor_data.xs('AAPL', level=1)

Unnamed: 0_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-31,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.116699
2019-01-31,0.0836,0.0291,-0.0039,-0.0073,-0.0165,0.055154
2019-02-28,0.0342,0.0172,-0.0266,0.0016,-0.0154,0.044777
2019-03-31,0.0110,-0.0352,-0.0414,0.0088,-0.0090,0.097026
2019-04-30,0.0397,-0.0115,0.0213,0.0161,-0.0218,0.056436
...,...,...,...,...,...,...
2024-08-31,0.0161,-0.0355,-0.0110,0.0075,0.0082,0.032353
2024-09-30,0.0173,-0.0092,-0.0277,0.0018,-0.0029,0.017467
2024-10-31,-0.0100,-0.0088,0.0086,-0.0142,0.0098,-0.030429
2024-11-30,0.0649,0.0460,0.0015,-0.0230,-0.0205,0.051707


In [43]:
factor_data.xs('MSFT', level=1)

Unnamed: 0_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-31,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.084047
2019-01-31,0.0836,0.0291,-0.0039,-0.0073,-0.0165,0.028158
2019-02-28,0.0342,0.0172,-0.0266,0.0016,-0.0154,0.077358
2019-03-31,0.0110,-0.0352,-0.0414,0.0088,-0.0090,0.052754
2019-04-30,0.0397,-0.0115,0.0213,0.0161,-0.0218,0.107343
...,...,...,...,...,...,...
2024-08-31,0.0161,-0.0355,-0.0110,0.0075,0.0082,-0.001095
2024-09-30,0.0173,-0.0092,-0.0277,0.0018,-0.0029,0.031548
2024-10-31,-0.0100,-0.0088,0.0086,-0.0142,0.0098,-0.055659
2024-11-30,0.0649,0.0460,0.0015,-0.0230,-0.0205,0.044192


In [44]:
observatoins = factor_data.groupby(level=1).size()

valid_stocks = observatoins[observatoins > 10]
factor_data = factor_data[factor_data.index.get_level_values('Ticker').isin(valid_stocks.index)]
factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-12-31,AAPL,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.116699
2018-12-31,ABBV,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.022064
2018-12-31,ABT,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.023228
2018-12-31,ACN,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.142901
2018-12-31,ADBE,-0.0955,-0.0281,-0.0192,-0.0003,0.0024,-0.098250
...,...,...,...,...,...,...,...
2024-12-31,VZ,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,-0.106901
2024-12-31,WDAY,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,0.048042
2024-12-31,WFC,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,-0.075620
2024-12-31,WMT,-0.0315,-0.0383,-0.0300,0.0190,-0.0121,-0.018695


- Calculate rolling Factor Betas

In [45]:
def rolling_ols_beta(x):
    exog = sm.add_constant(x.drop('returns.1M', axis=1))
    n_regressors = exog.shape[1]
    window = min(24, x.shape[0])
    min_nobs = min(window, n_regressors + 1)
    if window > n_regressors + 1:
        res = RollingOLS(endog=x['returns.1M'], 
                         exog=exog,
                         window=window,
                         min_nobs=min_nobs
                        ).fit(params_only=True).params
        return res.drop('const', axis=1)
    else:
        return pd.DataFrame(index=x.index, columns=exog.columns.drop('const'))

betas = (factor_data.groupby(level=1, group_keys=False)
         .apply(rolling_ols_beta))

betas

Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_rf,smb,hml,rmw,cma
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-31,AAPL,,,,,
2018-12-31,ABBV,,,,,
2018-12-31,ABT,,,,,
2018-12-31,ACN,,,,,
2018-12-31,ADBE,,,,,
...,...,...,...,...,...,...
2024-12-31,VZ,1.228952,-0.864706,0.795097,0.694940,-0.481584
2024-12-31,WDAY,0.948859,-1.275083,0.499369,-2.312068,-1.561540
2024-12-31,WFC,0.456401,-0.018841,0.981487,-2.242127,-0.210725
2024-12-31,WMT,0.739713,0.303660,-0.504227,1.186711,0.700600
