In [None]:
from statsmodels.regression.rolling import RollingOLS
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import yfinance as yf   
import datetime as dt

#%pip install pandas_ta --upgrade --quiet
import pandas_ta
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Get the S&P 500 tickers from Wikipedia
# This will fetch the current list of S&P 500 companies
sp500Tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
sp500Tickers = sp500Tickers[0]
sp500Tickers['Symbol'] = sp500Tickers['Symbol'].str.replace('.', '-').unique()
sp500Tickers = sp500Tickers['Symbol'].to_list()

# Set the start and end dates for the data
end_date = dt.datetime(2024, 12, 31)
start_date = end_date - dt.timedelta(days=365*8)  # 5 years of data

In [62]:
# download the data
data = yf.download(sp500Tickers, start=start_date, end=end_date)
data

[*********************100%***********************]  502 of 502 completed


Price,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-03,43.559391,26.827248,43.450218,,33.456703,27.224224,101.952766,103.480003,61.632328,35.965252,...,789800,2650200,2703500,2065100,10360600,1596700,4793400,1422533,388200,3579700
2017-01-04,44.130943,26.797218,44.062881,,33.722298,27.417574,102.197891,104.139999,61.504818,35.902962,...,477200,3678500,2598800,2542600,9434200,1703000,2835200,1547884,463200,3316300
2017-01-05,43.606243,26.933496,44.397057,,34.013603,27.138645,100.665886,105.910004,60.620846,35.638210,...,583600,3548000,2507500,2611900,14443200,1846000,4040000,1497517,486600,2469100
2017-01-06,44.964848,27.233757,44.410976,,34.938908,27.408066,101.812698,108.300003,60.858856,34.820644,...,812200,2679300,1858400,1707400,16518100,1561600,2863800,1875630,308300,2845800
2017-01-09,45.105389,27.483200,44.703384,,34.904640,27.011858,100.674637,108.570000,61.147827,34.844002,...,467800,4017600,2776200,1840100,13762300,1090400,1944200,1198199,263300,2123300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-23,133.785095,254.655716,175.105225,134.520004,113.188950,92.089996,355.488953,446.739990,212.989349,49.345444,...,411300,3546700,1312400,2535600,12285100,945800,1479100,1948300,238300,2437800
2024-12-24,135.276016,257.578674,176.685577,134.990005,113.634544,92.669998,358.322815,447.940002,216.161057,49.560802,...,191200,1780100,692800,943900,7807000,379300,533000,458600,88700,1023600
2024-12-26,135.007660,258.396667,175.900314,135.320007,114.139534,92.930000,357.133789,450.160004,216.131317,49.541222,...,258700,1736500,1218900,1394900,9652400,575700,1040900,1277300,140100,2167200
2024-12-27,134.719391,254.974930,174.732224,133.384995,113.862282,92.339996,352.922638,446.480011,215.070786,49.511856,...,310700,2320500,1086700,2015000,11943900,552400,1146300,743400,287200,1800100


In [102]:
df = data.copy()
df = df.stack()
# make Date', 'Ticker' the indexes
df.index.names = ['Date', 'Ticker']
df.columns = df.columns.str.lower()
df

Unnamed: 0_level_0,Price,close,high,low,open,volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,A,43.559391,43.803000,42.856669,43.034691,1739600.0
2017-01-03,AAPL,26.827248,26.868822,26.506198,26.746408,115127600.0
2017-01-03,ABBV,43.450218,43.881865,43.123001,43.805282,9328200.0
2017-01-03,ABT,33.456703,33.482408,32.848400,33.096863,9677300.0
2017-01-03,ACGL,27.224224,27.712353,27.106947,27.522172,942900.0
...,...,...,...,...,...,...
2024-12-30,XYL,115.551468,116.068214,114.438490,115.730340,586800.0
2024-12-30,YUM,132.243332,133.124821,131.728300,132.847500,1144600.0
2024-12-30,ZBH,104.902634,105.848148,104.156173,105.549568,1532000.0
2024-12-30,ZBRA,383.850006,386.959991,378.149994,385.059998,211300.0


### Technical indicators:

- Garman-Klaus:
$$ GKV = \frac{(\ln(High)-\ln(Low))^2}{2}- (2\ln(2)-1)(\ln(AQdj Close)-\ln(Open))^2$$

In [103]:
# Geman-Klass volatility calculation
df['garman_klass_vol'] = 0.5*(np.log(df['high'])-np.log(df['low']))**2 -\
      (2*np.log(2)-1)*(np.log(df['close']-np.log(df['open'])))**2

# RSI calculation using pandas_ta
df['rsi'] = df.groupby(level=1)['close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

# Bolinger Bands calculation using pandas_ta
df['bb_lower'] = \
    df.groupby(level=1)['close'].transform(lambda x: pandas_ta.bbands(close=x, length=20, std=2).iloc[:, 0].T.values)
df['bb_middle']= \
    df.groupby(level=1)['close'].transform(lambda x: pandas_ta.bbands(close=x, length=20, std=2).iloc[:, 1].T.values)
df['bb_upper'] = \
    df.groupby(level=1)['close'].transform(lambda x: pandas_ta.bbands(close=x, length=20, std=2).iloc[:, 2].T.values)
# Normalised ATR calculation using pandas_ta
def compute_ATR(stock_df):
    atr = pandas_ta.atr(high=stock_df['high'], 
                        low=stock_df['low'], 
                        close=stock_df['close'], 
                        length=20)
    return atr.sub(atr.mean()).div(atr.std())

df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_ATR)

#Compute Normalised MACD using pandas_ta
def calc_MACD(stock_df):
    macd = pandas_ta.macd(close=stock_df, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

df['macd'] = df.groupby(level=1, group_keys=False)['close'].apply(calc_MACD)

# Compute dollar-volume
df['dollar_vol'] = df['close'] * df['volume']/1e6 # in millions

df

Unnamed: 0_level_0,Price,close,high,low,open,volume,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd,dollar_vol
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-01-03,A,43.559391,43.803000,42.856669,43.034691,1739600.0,-5.241927,,,,,,,75.775917
2017-01-03,AAPL,26.827248,26.868822,26.506198,26.746408,115127600.0,-3.854206,,,,,,,3088.556633
2017-01-03,ABBV,43.450218,43.881865,43.123001,43.805282,9328200.0,-5.232926,,,,,,,405.312325
2017-01-03,ABT,33.456703,33.482408,32.848400,33.096863,9677300.0,-4.464781,,,,,,,323.770554
2017-01-03,ACGL,27.224224,27.712353,27.106947,27.522172,942900.0,-3.892041,,,,,,,25.669721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-30,XYL,115.551468,116.068214,114.438490,115.730340,586800.0,-8.561229,35.375387,112.122514,121.125829,130.129145,0.668220,-1.585002,67.805601
2024-12-30,YUM,132.243332,133.124821,131.728300,132.847500,1144600.0,-9.075210,46.389793,129.571023,134.685751,139.800480,0.789449,-0.514963,151.365718
2024-12-30,ZBH,104.902634,105.848148,104.156173,105.549568,1532000.0,-8.200897,43.454503,103.925319,107.000075,110.074831,-0.617093,-0.307769,160.710835
2024-12-30,ZBRA,383.850006,386.959991,378.149994,385.059998,211300.0,-13.604915,45.199475,381.008820,399.536504,418.064187,0.047316,-0.332876,81.107506


### Aggregate Indicators and Filter top 150 most liquid stocks (monthly)

- Convert daily data to monthly data

In [104]:
#Create list of columns we will take last monthly value
monthly_cols = [col for col in df.columns.unique() if col not in ['open', 'high', 'low' 
                                                                  , 'volume', 'dollar_vol']]
monthly_cols

['close',
 'garman_klass_vol',
 'rsi',
 'bb_lower',
 'bb_middle',
 'bb_upper',
 'atr',
 'macd']

In [105]:
# Resample dollar-vol monthly
temp1 = df.unstack('Ticker')['dollar_vol'].resample('M').mean().stack('Ticker').to_frame('dollar_vol.M')
temp1

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_vol.M
Date,Ticker,Unnamed: 2_level_1
2017-01-31,A,101.729833
2017-01-31,AAPL,3113.181459
2017-01-31,ABBV,338.538902
2017-01-31,ABT,368.277067
2017-01-31,ACGL,29.688113
...,...,...
2024-12-31,XYL,165.083142
2024-12-31,YUM,206.888270
2024-12-31,ZBH,164.685228
2024-12-31,ZBRA,121.040691


In [107]:
temp2 = df.unstack('Ticker')[monthly_cols].resample('M').last().stack('Ticker')
temp2

Unnamed: 0_level_0,Price,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-31,A,45.883060,-5.401556,,43.585909,44.939543,46.293177,,
2017-01-31,AAPL,28.028294,-3.972143,,26.805019,27.617165,28.429311,,
2017-01-31,ABBV,42.974411,-5.201183,,41.997001,43.386472,44.775944,,
2017-01-31,ABT,36.018269,-4.677740,,33.751234,34.937110,36.122985,,
2017-01-31,ACGL,28.003962,-3.968958,,26.968826,27.667502,28.366179,,
...,...,...,...,...,...,...,...,...,...
2024-12-31,XYL,115.551468,-8.561229,35.375387,112.122514,121.125829,130.129145,0.668220,-1.585002
2024-12-31,YUM,132.243332,-9.075210,46.389793,129.571023,134.685751,139.800480,0.789449,-0.514963
2024-12-31,ZBH,104.902634,-8.200897,43.454503,103.925319,107.000075,110.074831,-0.617093,-0.307769
2024-12-31,ZBRA,383.850006,-13.604915,45.199475,381.008820,399.536504,418.064187,0.047316,-0.332876


In [123]:
df2 = pd.concat([temp1, temp2], axis=1)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_vol.M,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-31,A,101.729833,45.883060,-5.401556,,43.585909,44.939543,46.293177,,
2017-01-31,AAPL,3113.181459,28.028294,-3.972143,,26.805019,27.617165,28.429311,,
2017-01-31,ABBV,338.538902,42.974411,-5.201183,,41.997001,43.386472,44.775944,,
2017-01-31,ABT,368.277067,36.018269,-4.677740,,33.751234,34.937110,36.122985,,
2017-01-31,ACGL,29.688113,28.003962,-3.968958,,26.968826,27.667502,28.366179,,
...,...,...,...,...,...,...,...,...,...,...
2024-12-31,XYL,165.083142,115.551468,-8.561229,35.375387,112.122514,121.125829,130.129145,0.668220,-1.585002
2024-12-31,YUM,206.888270,132.243332,-9.075210,46.389793,129.571023,134.685751,139.800480,0.789449,-0.514963
2024-12-31,ZBH,164.685228,104.902634,-8.200897,43.454503,103.925319,107.000075,110.074831,-0.617093,-0.307769
2024-12-31,ZBRA,121.040691,383.850006,-13.604915,45.199475,381.008820,399.536504,418.064187,0.047316,-0.332876


#### Step 3: 5-year average rolling dollar-vol average

In [None]:
# 5-year average rolling dollar-vol average
df2['dollar_vol.M'] =  (df2.loc[:, 'dollar_vol.M'].unstack('Ticker').rolling(5*12, min_periods=12).mean().stack())

df2['dollar_vol_rank'] = (df2.groupby('Date')['dollar_vol.M'].rank(ascending=False))
# Filter top 150 most liquid stocks
df2 = df2[df2['dollar_vol_rank']<150].drop(['dollar_vol.M', 'dollar_vol_rank'], axis=1)

df2



Unnamed: 0_level_0,Unnamed: 1_level_0,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-12-31,AAPL,39.715729,-4.962637,47.784547,39.286550,40.340342,41.394134,-1.302035,-0.285477
2017-12-31,ABBV,69.786903,-6.757277,55.000882,68.081705,69.829845,71.577985,-1.031020,0.094508
2017-12-31,ABT,50.030003,-5.669367,59.877723,47.172277,49.025375,50.878472,-1.465098,0.170746
2017-12-31,ACN,136.721756,-9.204028,65.003683,130.942310,135.016844,139.091379,-1.237277,0.218457
2017-12-31,ADBE,175.240005,-10.190738,51.008420,169.207075,174.475001,179.742927,-1.414954,-0.172391
...,...,...,...,...,...,...,...,...,...
2024-12-31,VZ,38.295414,-4.854933,33.994200,37.360527,39.995016,42.629506,-0.045568,-1.651883
2024-12-31,WDAY,262.000000,-11.885085,49.133109,255.024137,268.768003,282.511870,1.221471,0.242988
2024-12-31,WFC,69.681808,-6.752939,50.026251,67.605490,70.964894,74.324297,1.887223,-0.263496
2024-12-31,WMT,90.104111,-7.648514,51.567969,90.104273,92.965968,95.827664,3.193828,0.985684


#### Step 4: Calculate monthly returns for different time horizons

- Captures time series dynamics like Momentum patterns

In [125]:
# subset one ticker: AAPL
def calculate_returns(temp1):

    outlier_cut_off = 0.005

    lags = [1, 2, 3, 6, 9, 12] #Months
    for lag in lags:
        temp1[f'returns.{lag}M'] = (temp1['close']
                                    .pct_change(lag)
                                    .pipe(lambda x: x.clip(lower=x.quantile(outlier_cut_off),
                                                        upper=x.quantile(1-outlier_cut_off)))
                                    .add(1)
                                    .pow(1/lag)
                                    .sub(1)
        )
    temp1 = temp1.dropna()
    return temp1


df2 = df2.groupby(level=1, group_keys=False).apply(calculate_returns).dropna()
df2.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd,returns.1M,returns.2M,returns.3M,returns.6M,returns.9M,returns.12M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-12-31,AAPL,37.574982,-4.79897,37.072889,34.785068,39.300081,43.815095,-0.725779,-1.24471,-0.116698,-0.132708,-0.101117,-0.025181,-0.005637,-0.004607
2019-01-31,AAPL,39.647388,-4.957928,53.482566,34.16743,36.678953,39.190477,-0.865458,-0.382622,0.055154,-0.034589,-0.086165,-0.020933,0.001996,0.000748
2019-02-28,AAPL,41.422665,-5.088556,57.086853,39.812174,40.966913,42.121653,-1.133613,0.012537,0.044777,0.049952,-0.008827,-0.04289,-0.007195,-0.001098
2019-03-31,AAPL,45.441727,-5.370763,63.873705,40.834467,43.948928,47.063389,-1.079795,0.171827,0.097026,0.070582,0.065415,-0.027103,0.004128,0.011669
2019-04-30,AAPL,48.00626,-5.539714,60.713557,46.257149,48.080427,49.903706,-1.157426,0.140234,0.056436,0.076539,0.065846,-0.013082,0.007179,0.017591
2019-05-31,AAPL,42.04327,-5.13329,34.632825,40.401964,45.441543,50.481122,-0.9562,-0.927129,-0.124213,-0.03812,0.004969,-0.001953,-0.027496,-0.004168
2019-06-30,AAPL,47.53072,-5.509406,56.28209,42.888482,46.341735,49.794987,-1.034383,-0.040209,0.130519,-0.004965,0.015095,0.03995,-0.013236,0.006858
2019-07-31,AAPL,51.1618,-5.738539,64.755668,47.816132,49.356833,50.897535,-1.087123,0.030495,0.076394,0.103125,0.021447,0.043411,-0.001704,0.010727
2019-08-31,AAPL,50.319622,-5.687657,54.158942,46.792277,49.341136,51.889995,-0.909274,-0.185458,-0.016461,0.02892,0.061729,0.032959,0.018838,-0.005919
2019-09-30,AAPL,53.991016,-5.912642,60.753799,50.289533,52.550664,54.811795,-0.988539,0.015432,0.072961,0.027278,0.043396,0.029148,0.041098,0.000627


#### Step 5: Download Fama-French Factors and Calc Rolling Factor beta

- Calculate exposure of assets to common risk factors
- Five Fama-French Factors:
    - Market risk
    - Size
    - Value
    - Operating Profitability
    - Investment
- Used to assess risk/return profiles of portfolios
- Access using `pandas-datareader`

In [13]:
import pandas_datareader.data as web

In [15]:
start_date

datetime.datetime(2017, 1, 2, 0, 0)

In [144]:
factor_data = web.DataReader('F-F_Research_Data_5_Factors_2x3',
               'famafrench',
               start = 2010)[0]
factor_data = factor_data.drop(columns=['RF'])
factor_data = factor_data.rename(columns={'Mkt-RF': 'mkt_rf', 'SMB': 'smb', 'HML': 'hml', 'RMW': 'rmw', 'CMA': 'cma'})
factor_data.index = factor_data.index.to_timestamp()
factor_data = factor_data.resample('M').last().div(100)

factor_data.index.name = 'Date'
factor_data.head()


Unnamed: 0_level_0,mkt_rf,smb,hml,rmw,cma
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-31,-0.0335,0.004,0.0033,-0.0108,0.0051
2010-02-28,0.0339,0.0149,0.0318,-0.0029,0.0142
2010-03-31,0.063,0.0183,0.0219,-0.0061,0.0174
2010-04-30,0.02,0.0496,0.0296,0.0061,0.0175
2010-05-31,-0.079,0.0008,-0.0248,0.013,-0.0024


In [145]:
tyemp = df2.copy()
tyemp = tyemp.reset_index()
factors = factor_data.copy().reset_index()
tyemp = pd.merge(factors,tyemp[['Date', 'Ticker','returns.1M']], on ='Date')
tyemp.set_index(['Date', 'Ticker'], inplace=True)
tyemp


Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-12-31,AAPL,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.116698
2018-12-31,ABBV,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.022064
2018-12-31,ABT,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.023227
2018-12-31,ACN,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.142900
2018-12-31,ADBE,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.098250
...,...,...,...,...,...,...,...
2024-12-31,VZ,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,-0.106901
2024-12-31,WDAY,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,0.048042
2024-12-31,WFC,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,-0.075620
2024-12-31,WMT,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,-0.018696


In [146]:
factor_data = tyemp.copy()
factor_data = factor_data.sort_index()
factor_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-12-31,AAPL,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.116698
2018-12-31,ABBV,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.022064
2018-12-31,ABT,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.023227
2018-12-31,ACN,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.1429
2018-12-31,ADBE,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.09825


In [147]:
factor_data.xs('AAPL', level=1)

Unnamed: 0_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-31,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.116698
2019-01-31,0.0837,0.0290,-0.0039,-0.0070,-0.0168,0.055154
2019-02-28,0.0342,0.0175,-0.0267,0.0016,-0.0156,0.044777
2019-03-31,0.0110,-0.0353,-0.0416,0.0091,-0.0089,0.097026
2019-04-30,0.0397,-0.0111,0.0215,0.0157,-0.0219,0.056436
...,...,...,...,...,...,...
2024-08-31,0.0161,-0.0356,-0.0110,0.0073,0.0082,0.032354
2024-09-30,0.0173,-0.0091,-0.0277,0.0018,-0.0029,0.017467
2024-10-31,-0.0100,-0.0088,0.0086,-0.0140,0.0098,-0.030429
2024-11-30,0.0649,0.0462,0.0015,-0.0231,-0.0205,0.051707


In [148]:
factor_data.xs('MSFT', level=1)

Unnamed: 0_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-31,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.084047
2019-01-31,0.0837,0.0290,-0.0039,-0.0070,-0.0168,0.028158
2019-02-28,0.0342,0.0175,-0.0267,0.0016,-0.0156,0.077357
2019-03-31,0.0110,-0.0353,-0.0416,0.0091,-0.0089,0.052754
2019-04-30,0.0397,-0.0111,0.0215,0.0157,-0.0219,0.107343
...,...,...,...,...,...,...
2024-08-31,0.0161,-0.0356,-0.0110,0.0073,0.0082,-0.001095
2024-09-30,0.0173,-0.0091,-0.0277,0.0018,-0.0029,0.031548
2024-10-31,-0.0100,-0.0088,0.0086,-0.0140,0.0098,-0.055659
2024-11-30,0.0649,0.0462,0.0015,-0.0231,-0.0205,0.044192


In [149]:
observatoins = factor_data.groupby(level=1).size()

valid_stocks = observatoins[observatoins > 15]
facor_data = factor_data[factor_data.index.get_level_values('Ticker').isin(valid_stocks.index)]
factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_rf,smb,hml,rmw,cma,returns.1M
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-12-31,AAPL,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.116698
2018-12-31,ABBV,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.022064
2018-12-31,ABT,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.023227
2018-12-31,ACN,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.142900
2018-12-31,ADBE,-0.0955,-0.0283,-0.0194,-0.0003,0.0026,-0.098250
...,...,...,...,...,...,...,...
2024-12-31,VZ,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,-0.106901
2024-12-31,WDAY,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,0.048042
2024-12-31,WFC,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,-0.075620
2024-12-31,WMT,-0.0315,-0.0384,-0.0300,0.0191,-0.0121,-0.018696


- Calculate rolling Factor Betas

In [150]:
betas = (factor_data.groupby(level=1,
                            group_keys=False)
         .apply(lambda x: RollingOLS(endog=x['returns.1M'], 
                                     exog=sm.add_constant(x.drop('returns.1M', axis=1)),
                                     window=min(24, x.shape[0]),
                                     min_nobs=len(x.columns)+1 if x.shape[0] > len(x.columns)+1 else x.shape[0])
         .fit(params_only=True)
         .params
         .drop('const', axis=1)))

betas

ValueError: min_nobs must be larger than the number of regressors in the model and less than window