# Factor Construction

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import scipy.stats as stats

from factors import calc_stock_beta, calc_fp_betas, shrink_fp_betas_cross_sectionally, process_ranked_beta_row, process_ew_beta_row, process_value_beta_row
from factors import calc_equal_weights, calc_rank_weights, calc_value_weights, plot_cml, create_ff_bab_df, create_mkt_bab_df


ImportError: cannot import name 'create_market_bab_df' from 'factors' (c:\Users\Ryan\OneDrive\python\Betting-Against-Beta\factors.py)

In [None]:
# returns
# Stock return panel; the 'Date' strings are parsed as day/month/year.
stocks = pd.read_csv(
    'data/stocks-2005.csv',
    index_col='Date',
    parse_dates=True,
    date_parser=lambda x: pd.to_datetime(x, format='%d/%m/%Y'),
)
# Market return series; dates are left to pandas' default parser.
mkt = pd.read_csv('data/mkt-2005.csv', index_col='Date', parse_dates=True)
# Factor file keyed by YYYYMM; every value is divided by 100 on load,
# then only rows from 2005 onwards are kept.
ff = pd.read_csv(
    'data/FF-2x3-5-Factor.csv',
    index_col='Date',
    parse_dates=True,
    date_parser=lambda x: pd.to_datetime(x, format='%Y%m'),
).div(100)
ff = ff[ff.index.year >= 2005]

# Market caps: the first CSV column holds a year; rows are sorted, then the
# index is converted to datetimes.
market_cap_df = pd.read_csv('data/market-cap-df.csv', index_col=0).sort_index()
market_cap_df.index = pd.to_datetime(market_cap_df.index, format='%Y')
# Monthly excess returns: sum within each month, then subtract the RF column.
# The subtraction is positional (a raw array is passed), so ff must have
# exactly one row per resampled month, in the same order.
stocks_m = stocks.resample('M').sum().sub(ff['RF'].to_numpy(), axis=0)
mkt_m = mkt.resample('M').sum().sub(ff['RF'].to_numpy(), axis=0)
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', '#00d5ff']


In [None]:
# calculate FP betas
# calc_fp_betas is applied to each stock column (axis=0) together with the
# market series; rows that still contain any NaN afterwards are dropped.
betas = stocks.apply(calc_fp_betas, market_returns=mkt, axis=0).dropna()
# applymap calls the shrink helper once per cell, i.e. on one scalar at a time.
# NOTE(review): the helper's name suggests a cross-sectional adjustment, but it
# never sees a whole row here — confirm element-wise use is intended.
betas = betas.applymap(shrink_fp_betas_cross_sectionally)
# Keep the last available beta of each calendar month.
m_betas = betas.resample('M').last()

In [None]:
# calculate rank-weights (from the betas alone)
rank_weights = calc_rank_weights(betas)
# calculate equal-weights (from the betas alone)
equal_weights = calc_equal_weights(betas)
# calculate value-weights (from the betas plus the market-cap table)
value_weights = calc_value_weights(betas, market_cap_df)

In [None]:
# split into LS portfolios: dollar-neutral
# Each signed weight matrix is split into two non-negative legs:
#   * "long" leg  (l*): cells with a negative signed weight, sign flipped
#   * "short" leg (s*): cells with a positive signed weight, unchanged
# Every other cell becomes 0.
# NOTE(review): presumably the weighting helpers give low-beta names negative
# weights, which is why the negative side is the long leg — confirm in factors.py.
lrw = rank_weights.applymap(lambda x: -x if x < 0 else 0)
srw = rank_weights.applymap(lambda x: x if x > 0 else 0)
lew = equal_weights.applymap(lambda x: -x if x < 0 else 0)
sew = equal_weights.applymap(lambda x: x if x > 0 else 0)
lvw = value_weights.applymap(lambda x: -x if x < 0 else 0)
svw = value_weights.applymap(lambda x: x if x > 0 else 0)


# calculate portfolio betas
# Weighted sum of the month-end stock betas across each row (one value per date).
lr_betas = lrw.mul(m_betas).sum(axis=1)
sr_betas = srw.mul(m_betas).sum(axis=1)
le_betas = lew.mul(m_betas).sum(axis=1)
se_betas = sew.mul(m_betas).sum(axis=1)
lv_betas = lvw.mul(m_betas).sum(axis=1)
sv_betas = svw.mul(m_betas).sum(axis=1)


# calculate leveraged weights: beta neutral but no longer dollar-neutral
# Each leg is divided row-wise by its own portfolio beta.
llrw = lrw.div(lr_betas, axis=0)
lsrw = srw.div(sr_betas, axis=0)
llew = lew.div(le_betas, axis=0)
lsew = sew.div(se_betas, axis=0)
# The value-weighted legs must be scaled from the value-weighted inputs; using
# the equal-weighted ones here would make the "value" factor a copy of "equal".
llvw = lvw.div(lv_betas, axis=0)
lsvw = svw.div(sv_betas, axis=0)

# factor returns
# NOTE(review): calc_ls_returns is not among the names imported from `factors`
# in the import cell — confirm it is defined/imported before this cell runs.
# Underscore-prefixed results use the unlevered (dollar-neutral) legs.
_lrr, _srr, _rr = calc_ls_returns(lrw, srw, stocks_m)
_ler, _ser, _er = calc_ls_returns(lew, sew, stocks_m)
_lvr, _svr, _vr = calc_ls_returns(lvw, svw, stocks_m)

# Same computation with the beta-scaled (leveraged) legs.
lrr, srr, rr = calc_ls_returns(llrw, lsrw, stocks_m)
ler, ser, er = calc_ls_returns(llew, lsew, stocks_m)
lvr, svr, vr = calc_ls_returns(llvw, lsvw, stocks_m)

# Flat Capital Market Line

* Entire time
* Analysis 

In [None]:
# One beta per stock column, computed by calc_stock_beta from the monthly
# excess returns of the stock and of the market.
capm_betas = stocks_m.apply(calc_stock_beta, market_returns=mkt_m)
# Mean monthly excess return per stock, multiplied by 100.
mean_rets = stocks_m.mean().mul(100)
# One row per stock: its mean return and its beta.
capm_df = pd.DataFrame({'returns':mean_rets, 'betas':capm_betas})

In [None]:
# Scatter of mean return against beta for every stock, with fitted line.
plot_cml(capm_df)

# Betas of LS Portfolio

In [None]:
# Portfolio betas of the rank-weighted long and short legs, one row per date.
ls_betas = pd.DataFrame({'long':lr_betas, 'short':sr_betas})
ls_betas.plot(title='Betas over Time')
plt.show()


# BAB against Fama-French 5 Factor Model

* Regression equation
* Correlation matrix
* Line plot
* Analysis

In [None]:
# Load the BAB return series from CSV and combine it with the factor table.
# NOTE(review): `ff` was divided by 100 on load; confirm the BAB file is in
# the same units before comparing or regressing the two.
bab = pd.read_csv('data/aqr-bab.csv', index_col='Date', parse_dates=True)
rw_bab_ff = create_ff_bab_df(bab,ff)

In [None]:
# OLS of BAB on the five factor columns plus an intercept.
Y = rw_bab_ff['BAB']
X = rw_bab_ff[['Mkt-RF','SMB', 'HML', 'RMW', 'CMA']]
# add_constant inserts the intercept column (named 'const') into X.
X = sm.add_constant(X)
model = sm.OLS(Y, X)
results = model.fit()
# Display the fitted coefficients (intercept first, then the factor loadings).
results.params

In [None]:
# Full statsmodels report for the most recently fitted `results` object.
results.summary()

In [None]:
# Display the combined BAB / factor table.
rw_bab_ff

In [None]:
# Pairwise correlation matrix of BAB and the factor columns.
rw_bab_ff.corr()

In [None]:
# Running sum of each column's returns (additive, not compounded), one line per column.
rw_bab_ff.cumsum().plot(title='BAB vs FF5', color=colors)
# plt.ylabel('returns')

In [None]:
# Per-column mean divided by standard deviation, scaled by sqrt(12).
# NOTE(review): the sqrt(12) scaling assumes monthly observations — confirm.
rw_bab_ff.mean().div(rw_bab_ff.std()).mul(np.sqrt(12)).plot(kind='bar', color=colors, title='Sharpe ratios: BAB vs FF5')

In [None]:
# Skip the first coefficient (the intercept added by add_constant) and plot the rest.
# `results` is rebound by later regression cells, so this must run while it
# still holds the five-factor fit.
results.params[1:].plot(kind='bar',color=colors, title='BAB Tilts')

# BAB against Market Factor

* Regression Equation
* Line plot
* Analysis

In [None]:

# Combine BAB with the 'Mkt-RF' factor column, then show their correlation.
# Later cells read the result's 'BAB' and 'MKT' columns.
bab_mkt = create_mkt_bab_df(bab, ff['Mkt-RF'])
bab_mkt.corr()

In [None]:
# One point per row: market return on x, BAB return on y.
bab_mkt.plot(kind='scatter', x='MKT',y='BAB')

In [None]:
# Running sum of each column (additive, not compounded); colours are assigned in column order.
bab_mkt.cumsum().plot(color=['#00d5ff','tab:blue'])

In [None]:
# OLS of BAB on the market column plus an intercept.
# This rebinds Y, X, model and results from the earlier five-factor fit.
Y = bab_mkt['BAB']
X = bab_mkt['MKT']
X = sm.add_constant(X)
model = sm.OLS(Y, X)
results = model.fit()
results.summary()

In [None]:
# Scatter of BAB against the market with seaborn's fitted regression line drawn in red.
sns.regplot(x='MKT', y='BAB', data=bab_mkt, marker='x', line_kws={"color": "red"})

# Add title and labels
plt.title('BAB vs Mkt-RF')
plt.xlabel('Market')
plt.ylabel('BAB')

# BAB: Value Weighted vs Ranked vs Equal Weighted (BABAB)

* Lineplot
* Analysis

In [None]:
# rr, er and vr are the third value returned by calc_ls_returns for the
# leveraged rank-, equal- and value-weighted legs respectively.
rev_perf = pd.DataFrame({"rank":rr,"equal":er,"value":vr})
# Running sum of each series (additive, not compounded).
rev_perf.cumsum().plot(title='Custom BAB')

# BAB Overweighting Small Cap Stocks (BABAB)

* Turnover by size decile barchart
    * For each time period row, have a dataframe of two columns: decile bucket and turnover
    * Groupby decile and sum
    * Do this for all rows 
    * Plot barchart
* Correlation to size factor scatterplot
* Lineplot with transaction costs
    * Figure out a way to proxy measure transaction costs as a function of market cap

In [None]:
# Last market cap of each calendar year, restricted to 2010-2023 inclusive.
# The year filter is built from the resampled frame's own index so the boolean
# mask always has the same length and order as the rows it selects.
mkt_cap_df = market_cap_df.resample('Y').last()
mkt_cap_df = mkt_cap_df[(mkt_cap_df.index.year >= 2010) & (mkt_cap_df.index.year <= 2023)]

def calculate_decile_ranks(column):
    """Return the 1-based decile bucket (1 = lowest, 10 = highest) of each value.

    ``pd.qcut`` with ``labels=False`` yields 0-based bin numbers for ten
    equal-frequency bins; they are shifted up by one.
    """
    zero_based_bins = pd.qcut(column, q=10, labels=False)
    return zero_based_bins + 1

# Apply the function to each row (axis=1), i.e. bucket the columns against
# each other within every period, and store the result
mkt_cap_df = mkt_cap_df.apply(calculate_decile_ranks, axis=1)

# Absolute period-to-period change in the leveraged rank weights of the long
# leg (rows containing NaN after differencing are dropped), summed per calendar year.
l_turnover = llrw.diff().dropna().abs()
l_turnover = l_turnover.resample('Y').sum()
# Same for the short leg.
s_turnover = lsrw.diff().dropna().abs()
s_turnover = s_turnover.resample('Y').sum()

In [None]:
def calc_turnover(mkt_cap_df, turnover_df):
    """Total turnover per market-cap bucket, summed over all periods.

    For every row (period) of ``turnover_df`` the row of ``mkt_cap_df`` with
    the same index label is looked up, that period's turnover values are
    summed within each bucket, and the per-period sums are then added up
    across periods.

    Parameters
    ----------
    mkt_cap_df : pandas.DataFrame
        Bucket label for each period (row) and asset (column). Must contain
        every index label that appears in ``turnover_df``.
    turnover_df : pandas.DataFrame
        Turnover for each period (row) and asset (column).

    Returns
    -------
    pandas.DataFrame
        A single ``'turnover'`` column indexed by bucket label. Empty when
        ``turnover_df`` has no rows.

    Raises
    ------
    KeyError
        If a period of ``turnover_df`` is missing from ``mkt_cap_df``.
    """
    per_period = []
    for idx, row in turnover_df.iterrows():
        cap_row = mkt_cap_df.loc[idx]
        df = pd.DataFrame({'cap': cap_row, 'turnover': row})
        # Group this period's turnover by its bucket labels.
        per_period.append(pd.DataFrame(df.groupby(cap_row).turnover.sum()))

    # pd.concat raises on an empty list, so return an empty result instead.
    if not per_period:
        return pd.DataFrame({'turnover': pd.Series(dtype=float)})

    stacked = pd.concat(per_period)
    # The same bucket label appears once per period; add those rows together.
    return stacked.groupby(stacked.index).sum()

In [None]:
# Turnover per market-cap decile for the long leg and for the short leg.
l_turnovers = calc_turnover(mkt_cap_df, l_turnover)
s_turnovers = calc_turnover(mkt_cap_df, s_turnover)

In [None]:
# Put both legs side by side. Values are taken positionally and labelled with
# the long leg's index, so both results must list the same deciles in the same order.
turnovers = pd.DataFrame({'long_turnover':l_turnovers.values.flatten(),'short_turnover':s_turnovers.values.flatten()}, index=l_turnovers.index)

In [None]:
# Grouped bars: long and short turnover for each decile.
turnovers.plot(kind='bar', title='Turnover by Market Cap Decile')

In [None]:
# Take the second-to-last row of the market-cap table, divide by 1000, and ask
# describe() for the 0%,10%,...,90% percentiles. The [4:-1] slice keeps only
# those percentile rows (dropping count/mean/std/min in front and max at the end).
# `.T` has no effect on a single row (a Series).
# NOTE(review): the title assumes iloc[-2] is the 2023 row and that /1000 yields USD billions — confirm.
market_cap_df.iloc[-2].T.div(1000).describe(percentiles=np.arange(0,1,0.1))[4:-1].plot(kind='bar', color='tab:purple', title='Market Cap Decile Breakpoints 2023: BN $USD')

# BAB Seasonality

* Sharpe by month
* Drawdown by month

In [None]:
# Group BAB returns by calendar month (1-12) and plot mean/std for each month
# (not annualised).
bab.groupby(bab.index.month).apply(lambda x: x.mean()/x.std()).plot(kind='bar', color='#00d5ff', title='BAB Monthly Sharpe')
plt.xlabel('M')

# Same statistic for the factor table, excluding its first and last columns.
ff.iloc[:,1:-1].groupby(ff.index.month).apply(lambda x: x.mean()/x.std()).plot(kind='bar', title='FF5 monthly Sharpe')
plt.xlabel('M')

# BAB Performance and Funding Liquidity Risk (TED spread)

* Lineplot
* Scatterplot
* Regression

In [None]:
# TED spread series; the 'date' strings are parsed as day/month/year.
ted = pd.read_csv('data/ted-spread.csv',index_col='date', parse_dates=True, date_parser=lambda x: pd.to_datetime(x,format='%d/%m/%Y'))
# Keep the last observation of each month (labelled with the month-end date).
ted = ted.resample('M').last()
# Restrict bab to the dates that also appear in the resampled ted index.
# Note this overwrites `bab`, so cells run afterwards see the filtered series.
bab = bab.loc[ted.index[ted.index.isin(bab.index)]]

In [None]:
# Pair each BAB observation with the TED value on the same date. ted is
# selected by bab's index rather than used whole, so both columns always have
# the same length and dates even when ted covers months that bab does not.
ted_bab = pd.DataFrame({'ted': ted.loc[bab.index].values.flatten(), 'bab': bab.values.flatten()}, index=bab.index)

# shift(-1) moves the NEXT row's TED value onto the current row; the last row
# becomes NaN and is removed by the final dropna().
# NOTE(review): this pairs BAB at t with TED at t+1 (a lead, not a lag) —
# confirm that is the intended specification.
ted_bab['ted'] = ted_bab.ted.shift(-1)
# Base-2 logarithm of the (shifted) TED value.
ted_bab['log_ted'] = np.log2(ted_bab.ted)
ted_bab = ted_bab.dropna()

In [None]:
# Scatter of BAB against the TED column with seaborn's fitted regression line.
sns.regplot(x='ted', y='bab', data=ted_bab)

# Add title and labels
plt.title('BAB vs TED')
plt.xlabel('TED')
# plt.ylabel('BAB')

In [None]:
# Scatter of BAB against the base-2 log of TED with seaborn's fitted regression line.
sns.regplot(x='log_ted', y='bab', data=ted_bab)

# Add title and labels
plt.title('BAB vs Log(TED)')
# The x-axis shows log_ted (computed with np.log2), not the raw TED level.
plt.xlabel('Log2(TED)')
# plt.ylabel('BAB')

In [None]:
# OLS of BAB on the TED column plus an intercept.
# This rebinds Y, X, model and results from the earlier regressions.
Y = ted_bab['bab']
X = ted_bab['ted']
X = sm.add_constant(X)
model = sm.OLS(Y, X)
results = model.fit()
results.summary()

In [None]:
# Pairwise correlations between the ted, bab and log_ted columns.
ted_bab.corr()

# Post Cover

In [None]:


def plot_cml(df):
    """Scatter mean returns against betas and overlay a fitted and a reference line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'betas'`` column (x-axis) and a ``'returns'`` column
        (y-axis). Rows with a missing value in either column are ignored
        when fitting the regression line.

    Returns
    -------
    None
        The chart is drawn with matplotlib and shown via ``plt.show()``.
    """
    # Scatterplot of the raw observations
    df.plot(kind='scatter', x='betas', y='returns', xlim=[0, 3], ylim=[-0.5, 2.5], marker='x')

    # Simple OLS of returns on betas. linregress reports the standard error of
    # the slope itself, which is what the t-statistic needs; the standard error
    # of the mean residual is a different quantity and does not test the slope.
    clean = df[['betas', 'returns']].dropna()
    fit = stats.linregress(
        clean['betas'].to_numpy(dtype=float),
        clean['returns'].to_numpy(dtype=float),
    )
    intercept = fit.intercept
    slope = fit.slope
    t_statistic = slope / fit.stderr
    # For a one-regressor OLS with intercept, R-squared is the squared correlation.
    r_squared = fit.rvalue ** 2

    # Plot the observed (fitted) line over the displayed beta range
    plt.plot([0, 3], [intercept, intercept + 3 * slope], label='Observed CML', color='red')

    # Plot the hard-coded reference line from (0, 0.03) to (3, 2).
    # NOTE(review): an earlier comment called this "slope 1", but these
    # endpoints give a slope of about 0.66 — confirm the intended theoretical line.
    plt.plot([0, 3], [0.03, 2], label='CAPM CML', color='magenta')

    # Display the regression equation on the plot
    plt.annotate(f'E[Ri]-Rf = {intercept:.2f} + {slope:.2f} * (Rm-Rf)\n t-stat: {t_statistic:.2f}\n R2: {r_squared:.2f}', fontsize=10, xy=(0.02, 0.87), xycoords='axes fraction', bbox=dict(facecolor='lightgrey', edgecolor='grey', boxstyle='round'),
                 color='black')

    # Set labels and legend
    plt.xlabel('Betas')
    plt.ylabel('Mean excess return (%)')
    plt.legend()
    plt.title('Observed CML vs Theoretical CML')

    # Show the plot
    plt.show()


# Re-draw two earlier charts: the cumulative BAB vs FF5 lines and the beta/return scatter.
# plot_cml here resolves to the definition in this cell, which shadows the one imported from `factors`.
rw_bab_ff.cumsum().plot(title='BAB vs FF5', color=colors)
plot_cml(capm_df)
