In [76]:
import matplotlib.pyplot as plt
import os
import warnings
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.formula.api as smf
import pandas_datareader.data as web
from datetime import datetime as dt
import pickle

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def get_multi_factor_model_data():
    """Download the Fama-French factor tables and cache them as CSV files.

    Fetches the three-factor, momentum and five-factor (2x3) datasets from the
    'famafrench' pandas-datareader source starting at 1954-01-01, builds the
    four-factor table by joining momentum onto the three factors, and writes
    the three-, four- and five-factor tables to ``data/multi-factor_models``.

    Returns:
        None. The function is called for its side effect of writing CSV files.
    """
    START_DATE = '1954-01-01'
    out_dir = os.path.join('data', 'multi-factor_models')
    # Create the cache directory up front so to_csv cannot fail on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    def _fetch(dataset):
        """Return table 0 of ``dataset`` with an unnamed 'YYYY-MM' string index."""
        frame = web.DataReader(dataset, 'famafrench', start=START_DATE)[0]
        # Index.format() is deprecated in recent pandas. Assigning a plain list
        # keeps the index unnamed, so it is written under an empty header and
        # read back as the 'Unnamed: 0' column that the CSV readers expect.
        # NOTE(review): assumes the source index is period/datetime-like.
        frame.index = frame.index.strftime('%Y-%m').tolist()
        return frame

    # three factors
    df_three_factor = _fetch('F-F_Research_Data_Factors')

    # momentum factor
    df_mom = _fetch('F-F_Momentum_Factor')

    # four factors: the three factors plus momentum
    df_four_factor = df_three_factor.join(df_mom)

    # five factors
    df_five_factor = _fetch('F-F_Research_Data_5_Factors_2x3')

    df_three_factor.to_csv(os.path.join(out_dir, 'F-F_Research_Data_Factors.csv'))
    df_four_factor.to_csv(os.path.join(out_dir, 'Carhart_4_Factors.csv'))
    df_five_factor.to_csv(os.path.join(out_dir, 'F-F_Research_Data_5_Factors_2x3.csv'))

In [2]:
def get_multi_factor_data():
    """Read the cached three-, four- and five-factor tables from disk.

    Each CSV under ``data/multi-factor_models`` is loaded with its
    'Unnamed: 0' column as the index.

    Returns:
        tuple: (three-factor, four-factor, five-factor) DataFrames, in that order.
    """
    base_dir = 'data/multi-factor_models'
    file_names = (
        'F-F_Research_Data_Factors.csv',
        'Carhart_4_Factors.csv',
        'F-F_Research_Data_5_Factors_2x3.csv',
    )
    # Same read options for every table; only the file name differs.
    three, four, five = (
        pd.read_csv(os.path.join(base_dir, name), index_col='Unnamed: 0')
        for name in file_names
    )
    return three, four, five


# Load the cached factor tables once; make_factor_model reads these
# module-level frames when it builds each regression dataset.
df_three_factor, df_four_factor, df_five_factor = get_multi_factor_data()

In [7]:
# Sample window for the factor regressions; both bounds are handed to
# make_factor_model, which uses them as label-slice endpoints.
# NOTE(review): label slicing with .loc includes the end label, so the
# 2022-01 observation is kept -- confirm this is intended.
START_DATE = '2019-01-01'
END_DATE = '2022-01-01'

In [4]:
# Ticker universe: every value in the 'Symbol' column of the SPY info file.
SPY_info_df = pd.read_csv('data/SPY-Info.csv')
ticker_list = SPY_info_df['Symbol'].to_list()

In [8]:
def get_ticker_data(ticker):
    """Load the cached market data for one ticker.

    Args:
        ticker: Symbol whose file ``data/market_data/<ticker>.csv`` is read.

    Returns:
        DataFrame indexed by the file's 'Unnamed: 0' column, parsed as dates.
    """
    csv_path = os.path.join('data/market_data', f'{ticker}.csv')
    return pd.read_csv(csv_path, index_col='Unnamed: 0', parse_dates=True)

Extract the parameters of a fitted model, then merge each ticker's returns with the three-, four-, or five-factor dataset and fit the regression:

In [84]:
def get_model_parameters(model):
    """Extract the coefficient table and fit statistics from a fitted model.

    Args:
        model: Fitted regression result exposing ``summary()``, ``f_pvalue``,
            ``rsquared`` and ``rsquared_adj``.

    Returns:
        tuple: ``(params, f_pvalue, rsquared, rsquared_adj)`` where ``params``
        is the second summary table parsed into a DataFrame (its 'P>|t|'
        column renamed to 'p-value') and the three statistics are rounded to
        three decimals.
    """
    from io import StringIO  # local import keeps this cell self-contained

    coef_html = model.summary().tables[1].as_html()
    # Recent pandas deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by both old and new versions.
    params = pd.read_html(StringIO(coef_html), header=0, index_col=0)[0]
    params = params.rename(columns={'P>|t|': 'p-value'})
    f_pvalue = round(model.f_pvalue, 3)
    rsquared = round(model.rsquared, 3)
    rsquared_adj = round(model.rsquared_adj, 3)

    return params, f_pvalue, rsquared, rsquared_adj

# params, f_pvalue, rsquared, rsquared_adj = get_model_parameters(five_factor_model)


In [91]:
def make_factor_model(model, start, end, ticker):
    """Fit a 3-, 4- or 5-factor OLS regression of one ticker's excess return.

    Monthly returns are computed from the ticker's 'adjclose' column, joined
    to the matching module-level factor table on a 'YYYY-MM' key, restricted
    to ``start``..``end`` and regressed on the factors.

    Args:
        model: Number of factors; must be 3, 4 or 5.
        start: Lower bound of the sample window (label slice, inclusive).
        end: Upper bound of the sample window (label slice, inclusive).
        ticker: Symbol passed to ``get_ticker_data``.

    Returns:
        tuple: ``(params, f_pvalue, rsquared, rsquared_adj)`` as produced by
        ``get_model_parameters``; an empty DataFrame and three NaNs when no
        observations fall inside the window.

    Raises:
        ValueError: If ``model`` is not 3, 4 or 5.
    """
    # Per-model configuration: factor table, positional column names once the
    # return series has been joined on, and the regression formula.
    model_specs = {
        3: (df_three_factor,
            ['mkt', 'smb', 'hml', 'rf', 'rtn'],
            'excess_rtn ~ mkt + smb + hml'),
        4: (df_four_factor,
            ['mkt', 'smb', 'hml', 'rf', 'mom', 'rtn'],
            'excess_rtn ~ mkt + smb + hml + mom'),
        5: (df_five_factor,
            ['mkt', 'smb', 'hml', 'rmw', 'cma', 'rf', 'rtn'],
            'excess_rtn ~ mkt + smb + hml + rmw + cma'),
    }
    if model not in model_specs:
        # Previously an unsupported value fell through to an UnboundLocalError.
        raise ValueError(f'model must be one of {sorted(model_specs)}, got {model!r}')
    model_df, column_names, formula = model_specs[model]

    ticker_df = get_ticker_data(ticker)
    # Month-end price -> simple monthly return. 'M' is the month-end alias;
    # NOTE(review): newer pandas prefers 'ME' -- switch once old versions are dropped.
    y = ticker_df['adjclose'].resample('M') \
                             .last() \
                             .pct_change() \
                             .dropna()
    y.index = y.index.strftime('%Y-%m')
    y.name = 'return'

    model_data = model_df.join(y, how='inner')
    model_data.columns = column_names

    # Every column except the ticker return is divided by 100
    # (presumably the factor tables are quoted in percent -- confirm).
    model_data.loc[:, model_data.columns != 'rtn'] /= 100
    model_data.index = pd.to_datetime(model_data.index, format='%Y-%m')
    # Copy the slice so adding a column below never writes into a view.
    model_data = model_data.loc[start:end].copy()
    model_data['excess_rtn'] = model_data['rtn'] - model_data['rf']

    if not model_data.empty:
        factor_model = smf.ols(formula=formula, data=model_data).fit()
        params, f_pvalue, rsquared, rsquared_adj = get_model_parameters(factor_model)
    else:
        params, f_pvalue, rsquared, rsquared_adj = [pd.DataFrame()] + [np.nan] * 3

    return params, f_pvalue, rsquared, rsquared_adj

Fit the three-, four-, and five-factor models for every ticker and classify each fit as relevant or irrelevant:

In [107]:
def make_ticker_factor_models():
    """Fit every factor model for every ticker, classify the fits and pickle them.

    For each model size (3, 4, 5) and each symbol in ``ticker_list`` the
    regression is fitted over ``START_DATE``..``END_DATE``. A fit is stored as
    "relevant" when all of the following hold, otherwise as "irrelevant":

    * the coefficient table is not empty,
    * the F-test p-value is at most 0.05,
    * at least one row of the coefficient table does not have a p-value
      above 0.05,
    * the intercept's p-value is above 0.05 (not required for the
      three-factor model).

    NOTE(review): the third test also counts the intercept row, and the
    three-factor exemption from the intercept test is kept as found --
    confirm both are intended.

    Both result dictionaries are keyed by ``'<n>_factor_models'`` and then by
    ticker, and are written to ``data/multi-factor_models`` as pickles. One
    summary line per model size is printed.

    Returns:
        None.
    """
    alpha = 0.05  # significance level used by every test below

    relevant_ticker_factor_models = {}
    irrelevant_ticker_factor_models = {}
    for model in [3, 4, 5]:
        # Entries keep the iteration order of ticker_list.
        relevant_factor_models = {}
        irrelevant_factor_models = {}

        for ticker in ticker_list:
            params, f_pvalue, _, rsquared_adj = make_factor_model(model, START_DATE, END_DATE, ticker)
            fit_summary = {'f_pvalue': f_pvalue,
                           'rsquared_adj': rsquared_adj,
                           'params': params}
            is_relevant = (
                not params.empty
                and f_pvalue <= alpha
                and not all(x > alpha for x in params['p-value'])
                and (params.loc['Intercept', 'p-value'] > alpha or model == 3)
            )
            if is_relevant:
                relevant_factor_models[ticker] = fit_summary
            else:
                irrelevant_factor_models[ticker] = fit_summary

        relevant_ticker_factor_models[f'{model}_factor_models'] = relevant_factor_models
        irrelevant_ticker_factor_models[f'{model}_factor_models'] = irrelevant_factor_models

        # Guard the share against an empty ticker universe.
        share = len(relevant_factor_models) / len(ticker_list) * 100 if ticker_list else 0
        print(f"{share:.0f}% " +
              f"of stock returns explained by {model}-factor model")

    # Build the path with os.path.join: the former r'data\multi-factor_models'
    # literal only resolved on Windows and disagreed with the CSV cells.
    out_dir = os.path.join('data', 'multi-factor_models')
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, 'relevant_ticker_factor_models.pickle'), 'wb') as file:
        pickle.dump(relevant_ticker_factor_models, file)

    with open(os.path.join(out_dir, 'irrelevant_ticker_factor_models.pickle'), 'wb') as file:
        pickle.dump(irrelevant_ticker_factor_models, file)

# Fit and classify all models and write both pickles; prints one summary
# line per model size.
make_ticker_factor_models()

Compare predictions to actual results

In [102]:
def get_ticker_factor_models():
    """Load the pickled relevant and irrelevant factor-model dictionaries.

    Reads ``relevant_ticker_factor_models.pickle`` and
    ``irrelevant_ticker_factor_models.pickle`` from
    ``data/multi-factor_models``.

    Returns:
        tuple: ``(relevant_ticker_factor_models, irrelevant_ticker_factor_models)``.

    Raises:
        FileNotFoundError: If either pickle file is missing.
    """
    # os.path.join keeps the path portable: the former r'data\multi-factor_models'
    # literal only resolved on Windows.
    models_dir = os.path.join('data', 'multi-factor_models')

    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this notebook produced itself.
    with open(os.path.join(models_dir, 'relevant_ticker_factor_models.pickle'), 'rb') as file:
        relevant_ticker_factor_models = pickle.load(file)
    with open(os.path.join(models_dir, 'irrelevant_ticker_factor_models.pickle'), 'rb') as file:
        irrelevant_ticker_factor_models = pickle.load(file)

    return relevant_ticker_factor_models, irrelevant_ticker_factor_models
    
# Reload the classified models from the two pickle files on disk.
relevant_ticker_factor_models, irrelevant_ticker_factor_models = get_ticker_factor_models()

In [103]:
# Split each result dictionary into one variable per factor-model size.
(relevant_3_factor_models,
 relevant_4_factor_models,
 relevant_5_factor_models) = (
    relevant_ticker_factor_models[key]
    for key in ('3_factor_models', '4_factor_models', '5_factor_models')
)
(irrelevant_3_factor_models,
 irrelevant_4_factor_models,
 irrelevant_5_factor_models) = (
    irrelevant_ticker_factor_models[key]
    for key in ('3_factor_models', '4_factor_models', '5_factor_models')
)

In [104]:
# Display the three-factor fits classified as relevant.
# NOTE(review): the saved output is a list of (ticker, stats) tuples in
# descending rsquared_adj order, while make_ticker_factor_models stores a
# dict per model size -- the output looks stale; re-run to confirm.
relevant_3_factor_models

[('PRU',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.886,
   'params':              coef  std err       t  p-value  [0.025  0.975]
   Intercept -0.0065    0.006  -1.065    0.295  -0.019   0.006
   mkt        1.4595    0.117  12.499    0.000   1.222   1.697
   smb       -0.0003    0.214  -0.002    0.999  -0.436   0.435
   hml        0.9879    0.119   8.329    0.000   0.747   1.229}),
 ('KEY',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.852,
   'params':              coef  std err      t  p-value  [0.025  0.975]
   Intercept  0.0074    0.007  1.078    0.289  -0.007   0.021
   mkt        1.0223    0.132  7.736    0.000   0.753   1.291
   smb        0.6803    0.242  2.808    0.008   0.187   1.173
   hml        1.2482    0.134  9.297    0.000   0.975   1.521}),
 ('EMR',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.849,
   'params':              coef  std err       t  p-value  [0.025  0.975]
   Intercept -0.0064    0.006  -1.046    0.303  -0.019   0.006
   mkt        1.3913    0.118  11.826    0.000  

In [105]:
# Display the four-factor fits classified as relevant.
# NOTE(review): the saved output is a list of tuples rather than the dict
# the pipeline stores -- presumably from an earlier run; re-run to confirm.
relevant_4_factor_models

[('PRU',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.887,
   'params':              coef  std err       t  p-value  [0.025  0.975]
   Intercept -0.0059    0.006  -0.969    0.340  -0.018   0.006
   mkt        1.3891    0.131  10.615    0.000   1.123   1.656
   smb       -0.0670    0.220  -0.304    0.763  -0.516   0.382
   hml        0.8943    0.143   6.267    0.000   0.604   1.185
   mom       -0.2076    0.178  -1.167    0.252  -0.570   0.155}),
 ('EMR',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.88,
   'params':              coef  std err       t  p-value  [0.025  0.975]
   Intercept -0.0050    0.005  -0.909    0.370  -0.016   0.006
   mkt        1.2228    0.118  10.352    0.000   0.982   1.463
   smb        0.0127    0.199   0.064    0.950  -0.393   0.418
   hml        0.2700    0.129   2.096    0.044   0.008   0.532
   mom       -0.4969    0.161  -3.095    0.004  -0.824  -0.170}),
 ('PH',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.851,
   'params':              coef  std err      t  p-value

In [106]:
# Display the five-factor fits classified as relevant.
# NOTE(review): the saved output is a list of tuples rather than the dict
# the pipeline stores -- presumably from an earlier run; re-run to confirm.
relevant_5_factor_models

[('PRU',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.895,
   'params':              coef  std err       t  p-value  [0.025  0.975]
   Intercept -0.0027    0.006  -0.444    0.660  -0.015   0.010
   mkt        1.4641    0.137  10.677    0.000   1.184   1.744
   smb       -0.2252    0.255  -0.884    0.383  -0.745   0.294
   hml        1.2422    0.169   7.349    0.000   0.897   1.587
   rmw       -0.3996    0.291  -1.374    0.179  -0.993   0.193
   cma       -0.5343    0.312  -1.710    0.097  -1.171   0.103}),
 ('BAC',
  {'f_pvalue': 0.0,
   'rsquared_adj': 0.893,
   'params':              coef  std err       t  p-value  [0.025  0.975]
   Intercept  0.0081    0.006   1.437    0.161  -0.003   0.019
   mkt        1.3481    0.126  10.663    0.000   1.090   1.606
   smb       -0.4083    0.235  -1.738    0.092  -0.887   0.071
   hml        1.2993    0.156   8.337    0.000   0.981   1.617
   rmw       -0.8871    0.268  -3.309    0.002  -1.434  -0.340
   cma       -0.7028    0.288  -2.440    0.021  