In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
import os   
import sys
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize
from scipy import interpolate
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import warnings
import statsmodels.api as sm
from scipy.optimize import fsolve
from scipy.stats import norm
from sklearn.decomposition import PCA
import warnings
from scipy.optimize import fsolve
import numpy.polynomial.polynomial as poly
warnings.filterwarnings(action='ignore')

In [7]:
industries = pd.read_csv('./industry.csv')

In [8]:
industries['industry'] = industries.industry.str.replace('CRSP', '', regex=True)
industries['industry'] = industries.industry.str.replace('Index', '', regex=True)
industries['top_tickers'] = industries['top_tickers'].str.split()
industries = industries.explode('top_tickers').reset_index(drop=True)
industries

Unnamed: 0,industry,top_tickers
0,US Consumer Discretionary,AMZN
1,US Consumer Discretionary,TSLA
2,US Consumer Discretionary,HD
3,US Consumer Discretionary,WMT
4,US Consumer Discretionary,MCD
...,...,...
105,US Utilities,AEP
106,US Utilities,VST
107,US Utilities,D
108,US Utilities,PCG


In [9]:
vols = pd.read_csv('../daily_vol_series.csv')
vols = vols[vols['SYM_ROOT'] == vols['symbol']]

In [10]:
vols = vols.merge(industries, left_on='SYM_ROOT', right_on='top_tickers', how='inner')
vols = vols[['DATE', 'SYM_ROOT', 'industry', 'ivol_t', 'ivol_q']]

In [11]:
vols.head()

Unnamed: 0,DATE,SYM_ROOT,industry,ivol_t,ivol_q
0,2010-01-04,AAPL,US Technology,6.140084e-09,4.3004e-10
1,2010-01-05,AAPL,US Technology,5.997564e-09,3.934783e-10
2,2010-01-06,AAPL,US Technology,5.785301e-09,5.765893e-10
3,2010-01-07,AAPL,US Technology,9.102045e-09,6.42518e-10
4,2010-01-08,AAPL,US Technology,6.838924e-09,5.694237e-10


In [12]:
vols = vols[vols['DATE'] <= '2023-08-31']

In [13]:
vols['month'] = pd.to_datetime(vols['DATE']).dt.month
vols['year'] = pd.to_datetime(vols['DATE']).dt.year
vols['ivol_q_log'] = np.log(vols['ivol_q'])

In [14]:
def moments(df):
    final_dfs = []
    for lag in range(1, 11):
        tmp = df.groupby(['SYM_ROOT', 'industry', 'year', 'month'])['ivol_q_log']\
            .apply(lambda x: (((x - x.shift(lag))**2).sum() / (x.count() - lag)) ).to_frame(f'lag{lag}_mom')
        final_dfs.append(tmp)
    final_df = pd.concat(final_dfs, axis=1)
    return final_df

In [15]:
final = moments(vols)
final.dropna(inplace=True)

In [16]:
betas = pd.DataFrame(columns=['Betas'], index=final.index)
for idx,_ in final.iterrows():
    y = final.loc[idx].values
    X = np.log(range(1, 11)).reshape(-1,1)
    # X = sm.add_constant(X)
    mod = sm.OLS(y, X).fit()
    betas.loc[idx, 'Betas'] = mod.params
betas.reset_index(inplace=True)

In [17]:
betas[betas['Betas']>=0.5]
betas['Realized_H'] = betas['Betas'] * 0.5

In [19]:
betas.groupby('industry')['Realized_H'].mean()

industry
US Consumer Discretionary     0.084805
US Consumer Staples           0.096751
US Energy                     0.082062
US Financials                 0.074574
US Healthcare                 0.101257
US Industrials                0.111189
US Materials                  0.099205
US Media & Communications     0.086026
US Real Estate & REITs        0.130725
US Technology                      inf
US Utilities                  0.098577
Name: Realized_H, dtype: object

In [20]:
betas

Unnamed: 0,SYM_ROOT,industry,year,month,Betas,Realized_H
0,AAPL,US Technology,2010,1,0.27544088153898666,0.13772
1,AAPL,US Technology,2010,2,0.26158834513437185,0.130794
2,AAPL,US Technology,2010,3,0.1420987558389767,0.071049
3,AAPL,US Technology,2010,4,0.27536603200850096,0.137683
4,AAPL,US Technology,2010,5,11.469401456045505,5.734701
...,...,...,...,...,...,...
15469,XOM,US Energy,2023,4,0.04711815985121362,0.023559
15470,XOM,US Energy,2023,5,0.03321159773896469,0.016606
15471,XOM,US Energy,2023,6,0.10502871182051061,0.052514
15472,XOM,US Energy,2023,7,0.1370810235268246,0.068541
