In [76]:
import matplotlib.pyplot as plt
import os
import warnings
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.formula.api as smf
import pandas_datareader.data as web
from datetime import datetime as dt
import pickle

warnings.simplefilter(action='ignore', category=FutureWarning)

In [176]:
def get_multi_factor_model_data():
    START_DATE = '1954-01-01'
    # three factors 
    df_three_factor = web.DataReader('F-F_Research_Data_Factors', 'famafrench', start=START_DATE)[0]
    df_three_factor.index = df_three_factor.index.format()

    # momentum factor
    df_mom = web.DataReader('F-F_Momentum_Factor', 'famafrench', start=START_DATE)[0]
    df_mom.index = df_mom.index.format()

    # three factors
    df_four_factor = df_three_factor.join(df_mom)

    # five factors
    df_five_factor = web.DataReader('F-F_Research_Data_5_Factors_2x3', 'famafrench', start=START_DATE)[0]
    df_five_factor.index = df_five_factor.index.format()

    df_three_factor.to_csv('data/multi-factor_models/F-F_Research_Data_Factors.csv')
    df_four_factor.to_csv('data/multi-factor_models/Carhart_4_Factors.csv')
    df_five_factor.to_csv('data/multi-factor_models/F-F_Research_Data_5_Factors_2x3.csv')

get_multi_factor_model_data()

In [2]:
def get_multi_factor_data():
    f = 'data/multi-factor_models'
    df_three_factor = pd.read_csv(os.path.join(f, 'F-F_Research_Data_Factors.csv'), 
                                  index_col='Unnamed: 0')
    df_four_factor = pd.read_csv(os.path.join(f, 'Carhart_4_Factors.csv'), 
                                  index_col='Unnamed: 0')
    df_five_factor = pd.read_csv(os.path.join(f, 'F-F_Research_Data_5_Factors_2x3.csv'), 
                                  index_col='Unnamed: 0')

    return df_three_factor, df_four_factor, df_five_factor


df_three_factor, df_four_factor, df_five_factor = get_multi_factor_data()                               

In [7]:
START_DATE = '2019-01-01'
END_DATE = '2022-01-01'

In [4]:
SPY_info_df = pd.read_csv('data/SPY-Info.csv')
ticker_list = SPY_info_df['Symbol'].to_list()

In [8]:
def get_ticker_data(ticker):
    f = 'data/market_data'
    ticker_df = pd.read_csv(os.path.join(f, f'{ticker}.csv'), index_col='Unnamed: 0',
                            parse_dates=True)

    return ticker_df

Merge the datasets for the four-factor models:

In [84]:
def get_model_parameters(model): 
    params = pd.read_html(model.summary().tables[1].as_html(), header=0, index_col=0)[0]
    params = params.rename(columns={'P>|t|': 'p-value'})
    f_pvalue = round(model.f_pvalue, 3)
    rsquared = round(model.rsquared, 3)
    rsquared_adj = round(model.rsquared_adj, 3)
    
    return params, f_pvalue, rsquared, rsquared_adj

# params, f_pvalue, rsquared, rsquared_adj = get_model_parameters(five_factor_model)


In [91]:
def make_factor_model(model, start, end, ticker):
    ticker_df = get_ticker_data(ticker)
    y = ticker_df['adjclose'].resample('M') \
                             .last() \
                             .pct_change() \
                             .dropna()
    y.index = y.index.strftime('%Y-%m')
    y.name = 'return'

    if model == 3:
        model_df = df_three_factor
        model_data = model_df.join(y, how='inner')
        model_data.columns = ['mkt', 'smb', 'hml', 'rf', 'rtn']
        formula = 'excess_rtn ~ mkt + smb + hml'
    elif model == 4:
        model_df = df_four_factor
        model_data = model_df.join(y, how='inner')
        model_data.columns = ['mkt', 'smb', 'hml', 'rf', 'mom', 'rtn']
        formula = 'excess_rtn ~ mkt + smb + hml + mom'
    elif model == 5:
        model_df = df_five_factor
        model_data = model_df.join(y, how='inner')
        model_data.columns = ['mkt', 'smb', 'hml', 'rmw', 'cma', 'rf', 'rtn']
        formula = 'excess_rtn ~ mkt + smb + hml + rmw + cma'

    model_data.loc[:, model_data.columns != 'rtn'] /= 100
    model_data.index = [pd.to_datetime(x, format='%Y-%m') for x in model_data.index]
    model_data = model_data.loc[start:end]
    model_data['excess_rtn'] = model_data.rtn - model_data.rf

    if not model_data.empty:
        factor_model = smf.ols(formula=formula, data=model_data).fit()
        params, f_pvalue, rsquared, rsquared_adj = get_model_parameters(factor_model)
    else:
        params, f_pvalue, rsquared, rsquared_adj = [pd.DataFrame()] + [np.nan] * 3
    
    return params, f_pvalue, rsquared, rsquared_adj

Merge the datasets for the five-factor models:

In [107]:
def make_ticker_factor_models():
    relevant_ticker_factor_models = {}
    irrelevant_ticker_factor_models = {}
    for model in [3, 4, 5]:
        ticker_factor_models = {}
        relevant_factor_models = {}
        irrelevant_factor_models = {}
    
        for ticker in ticker_list:
            params, f_pvalue, rsquared, rsquared_adj = make_factor_model(model, START_DATE, END_DATE, ticker)
            ticker_factor_models[ticker] = {'f_pvalue': f_pvalue, 
                                            'rsquared_adj': rsquared_adj, 
                                            'params': params}
            if params.empty:
                irrelevant_factor_models[ticker] =  ticker_factor_models[ticker]
            else:
                if f_pvalue <= 0.05 and not all(x > 0.05 for x in params['p-value']) \
                    and (params.loc['Intercept', 'p-value'] > 0.05 or model == 3):
                    relevant_factor_models[ticker] =  ticker_factor_models[ticker]
                else:
                    irrelevant_factor_models[ticker] =  ticker_factor_models[ticker]
            
        # relevant_factor_models = sorted(relevant_factor_models, key=lambda x: x[1]['rsquared_adj'], 
        #                                 reverse=True)
        # irrelevant_factor_models = sorted(irrelevant_factor_models, key=lambda x: x[1]['rsquared_adj'], 
        #                                   reverse=True)
        
        relevant_ticker_factor_models[f'{model}_factor_models'] = relevant_factor_models
        irrelevant_ticker_factor_models[f'{model}_factor_models'] = irrelevant_factor_models

        print(f"{len(relevant_factor_models) / len(ticker_list) * 100:.0f}% " +
              f"of stock returns explained by {model}-factor model")

    f = r'data\multi-factor_models'

    with open(os.path.join(f, f'relevant_ticker_factor_models.pickle'), 'wb') as file:
        pickle.dump(relevant_ticker_factor_models, file)

    with open(os.path.join(f, f'irrelevant_ticker_factor_models.pickle'), 'wb') as file:
        pickle.dump(irrelevant_ticker_factor_models, file)

make_ticker_factor_models()



92% of stock returns explained by 3-factor model




86% of stock returns explained by 4-factor model




85% of stock returns explained by 5-factor model


Compare predictions to actual results

In [111]:
def get_ticker_factor_models():
    f = r'data\multi-factor_models'

    with open(os.path.join(f, 'relevant_ticker_factor_models.pickle'), 'rb') as file:
        relevant_ticker_factor_models = pickle.load(file)
    with open(os.path.join(f, 'irrelevant_ticker_factor_models.pickle'), 'rb') as file:
        irrelevant_ticker_factor_models = pickle.load(file)
    
    return relevant_ticker_factor_models, irrelevant_ticker_factor_models
    
relevant_ticker_factor_models, irrelevant_ticker_factor_models = get_ticker_factor_models()

In [112]:
relevant_3_factor_models = relevant_ticker_factor_models['3_factor_models']
relevant_4_factor_models = relevant_ticker_factor_models['4_factor_models']
relevant_5_factor_models = relevant_ticker_factor_models['5_factor_models']
irrelevant_3_factor_models = irrelevant_ticker_factor_models['3_factor_models']
irrelevant_4_factor_models = irrelevant_ticker_factor_models['4_factor_models']
irrelevant_5_factor_models = irrelevant_ticker_factor_models['5_factor_models']

In [160]:
from datetime import timedelta


dt.strptime(END_DATE, '%Y-%m') + timedelta(days=31)

datetime.datetime(2022, 2, 1, 0, 0)

In [177]:
END_DATE = '2022-01'
ticker = 'GOOGL'
ticker_df = get_ticker_data(ticker)
y = ticker_df['adjclose'].resample('M') \
                            .last() \
                            .pct_change() \
                            .dropna()

df = df_four_factor
df.columns = df.columns.str.lower()
df.columns = df.columns.str.strip()
df = df.rename(columns={'mkt-rf': 'mkt'})
factors = df.loc[END_DATE, df_four_factor.columns != 'rf']
betas = relevant_4_factor_models[ticker]['params'].coef

# pred_date = dt.strptime(END_DATE, '%Y-%m') + timedelta(days=31)
y['2022-02']


2022-02-28   -0.001822
Freq: M, Name: adjclose, dtype: float64

In [175]:
relevant_4_factor_models[ticker]

{'f_pvalue': 0.0,
 'rsquared_adj': 0.496,
 'params':              coef  std err      t  p-value  [0.025  0.975]
 Intercept  0.0096    0.009  1.093    0.282  -0.008   0.027
 mkt        1.0416    0.189  5.513    0.000   0.657   1.426
 smb       -0.1963    0.318 -0.617    0.542  -0.845   0.452
 hml        0.1107    0.206  0.537    0.595  -0.309   0.530
 mom        0.1289    0.257  0.502    0.619  -0.394   0.652}

In [172]:
rtn = 0  # expected return according to model
for x in factors.index:
    # print(x, betas[x])
    rtn += factors[x] * betas[x]

rtn

-4.257873000000001