# Replicate Fama-French Characteristic-sorted portfolios and Factors

Rong Wang, July 2025

In [1]:
# Packages 
import numpy as np
import pandas as pd
import os
import gc
import time
import datetime as dt
from dateutil.relativedelta import relativedelta
from pandas.tseries.offsets import *
from pathlib import Path
from tqdm import tqdm
import wrds
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

--- 
## Data Preparation

### Load FF factors

In [None]:
# 6 factors from Fama-French
ff6 = pd.read_csv('../Data/FF6_factors_monthly.csv')
ff6['MKT'] = ff6['Mkt-RF'] + ff6['RF']
ff6 = ff6.drop(columns=['Mkt-RF','RF'])
ff6.loc[:, ff6.columns != 'date'] = ff6.loc[:, ff6.columns != 'date'] / 100 # remove percentage
ff6

Unnamed: 0,date,SMB,HML,RMW,CMA,MOM,MKT
0,196307,-0.0048,-0.0081,0.0064,-0.0115,0.0101,-0.0012
1,196308,-0.0080,0.0170,0.0040,-0.0038,0.0100,0.0533
2,196309,-0.0043,0.0000,-0.0078,0.0015,0.0012,-0.0130
3,196310,-0.0134,-0.0004,0.0279,-0.0225,0.0313,0.0283
4,196311,-0.0085,0.0173,-0.0043,0.0227,-0.0078,-0.0059
...,...,...,...,...,...,...,...
733,202408,-0.0355,-0.0110,0.0075,0.0082,0.0481,0.0209
734,202409,-0.0092,-0.0277,0.0018,-0.0029,-0.0062,0.0213
735,202410,-0.0088,0.0086,-0.0142,0.0098,0.0296,-0.0061
736,202411,0.0460,0.0015,-0.0230,-0.0205,0.0101,0.0689


### Individual Stock Returns and Characteristics

In [4]:
%%time
# Load stock returns and characteristics data
charc = pd.read_csv('/work/rw196/data/FF_stock_charcs.csv')
charc['date'] = pd.to_datetime(charc['date'].astype(str), format='%Y%m') + MonthEnd(0)
charc['year'] = charc['date'].dt.year
charc['month'] = charc['date'].dt.month
charc = charc.dropna(subset='ret').reset_index(drop=True)
charc

CPU times: user 4.32 s, sys: 477 ms, total: 4.8 s
Wall time: 4.83 s


Unnamed: 0,date,permno,ret,me,bm,op,inv,mom,year,month
0,1960-01-31,10006,0.005155,6.912750e+01,,,,,1960,1
1,1960-01-31,10014,0.138889,1.118788e+01,,,,,1960,1
2,1960-01-31,10022,-0.045455,2.017312e+01,,,,,1960,1
3,1960-01-31,10030,-0.015000,7.998200e+01,,,,,1960,1
4,1960-01-31,10057,-0.020785,2.650000e+01,,,,,1960,1
...,...,...,...,...,...,...,...,...,...,...
3446394,2024-12-31,93397,-0.117446,4.410872e+02,,,,1.361294,2024,12
3446395,2024-12-31,93426,0.021768,2.867095e+02,,,,0.674201,2024,12
3446396,2024-12-31,93429,-0.094742,2.045564e+04,,,,1.223448,2024,12
3446397,2024-12-31,93434,0.133333,1.824916e+01,,,,0.530076,2024,12


NYSE indicators.

In [5]:
nyse = pd.read_csv('../Data/constructed/nyse_indicators.csv')
nyse['date'] = pd.to_datetime(nyse['date'].astype(str), format='%Y%m') + MonthEnd(0)
charc = pd.merge(charc, nyse, on=['date','permno'],how='left')
charc

Unnamed: 0,date,permno,ret,me,bm,op,inv,mom,year,month,isNYSE
0,1960-01-31,10006,0.005155,6.912750e+01,,,,,1960,1,1.0
1,1960-01-31,10014,0.138889,1.118788e+01,,,,,1960,1,1.0
2,1960-01-31,10022,-0.045455,2.017312e+01,,,,,1960,1,1.0
3,1960-01-31,10030,-0.015000,7.998200e+01,,,,,1960,1,1.0
4,1960-01-31,10057,-0.020785,2.650000e+01,,,,,1960,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...
3446394,2024-12-31,93397,-0.117446,4.410872e+02,,,,1.361294,2024,12,
3446395,2024-12-31,93426,0.021768,2.867095e+02,,,,0.674201,2024,12,1.0
3446396,2024-12-31,93429,-0.094742,2.045564e+04,,,,1.223448,2024,12,
3446397,2024-12-31,93434,0.133333,1.824916e+01,,,,0.530076,2024,12,


July to next June dates. Portfolio weights of this holding periods are determined at the end of June.

In [6]:
charc['ffdate'] = charc['date']+MonthEnd(-6)
charc['ffyear'] = charc['ffdate'].dt.year
charc['ffmonth'] = charc['ffdate'].dt.month
charc

Unnamed: 0,date,permno,ret,me,bm,op,inv,mom,year,month,isNYSE,ffdate,ffyear,ffmonth
0,1960-01-31,10006,0.005155,6.912750e+01,,,,,1960,1,1.0,1959-07-31,1959,7
1,1960-01-31,10014,0.138889,1.118788e+01,,,,,1960,1,1.0,1959-07-31,1959,7
2,1960-01-31,10022,-0.045455,2.017312e+01,,,,,1960,1,1.0,1959-07-31,1959,7
3,1960-01-31,10030,-0.015000,7.998200e+01,,,,,1960,1,1.0,1959-07-31,1959,7
4,1960-01-31,10057,-0.020785,2.650000e+01,,,,,1960,1,1.0,1959-07-31,1959,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3446394,2024-12-31,93397,-0.117446,4.410872e+02,,,,1.361294,2024,12,,2024-06-30,2024,6
3446395,2024-12-31,93426,0.021768,2.867095e+02,,,,0.674201,2024,12,1.0,2024-06-30,2024,6
3446396,2024-12-31,93429,-0.094742,2.045564e+04,,,,1.223448,2024,12,,2024-06-30,2024,6
3446397,2024-12-31,93434,0.133333,1.824916e+01,,,,0.530076,2024,12,,2024-06-30,2024,6


### Helper function to compute weighted-average portfolio returns and weights

In [7]:
def wavg(df, ret_name, me_name, method):
    """
    Calculates the weighted average of returns based on different methods.

    This function computes portfolio returns using one of three weighting schemes:
    'ew' for equal-weighted, 'vw' for value-weighted, and 'cvw' for
    capped value-weighted.

    Args:
        df (pd.DataFrame): DataFrame containing stock data, including returns,
                           market equity, and potentially NYSE indicators.
        ret_name (str): The name of the column containing stock returns.
        me_name (str): The name of the column containing market equity values.
        method (str): The weighting method to use. Must be one of 'vw',
                      'ew', or 'cvw'.

    Returns:
        float: The calculated portfolio return for the given period.

    Raises:
        ValueError: If an invalid method is specified, or if 'cvw' is used
                    without an 'isNYSE' column or without any NYSE stocks
                    in the DataFrame.
    """
    rt = df[ret_name].to_numpy()
    
    if method == 'ew':
        return np.mean(rt)  

    elif method == 'vw':
        wt = df[me_name].to_numpy()
        total_wt = np.sum(wt)
        return np.sum(rt * wt) / total_wt

    elif method == 'cvw':
        if 'isNYSE' not in df.columns:
            raise ValueError("df does not have NYSE indicators required for 'cvw' method.")
        
        is_nyse = df['isNYSE'].to_numpy() == 1
        wt = df[me_name].to_numpy()
        cap = np.nanquantile(wt[is_nyse], 0.8) if np.any(is_nyse) else 0        
        capped_wt = np.minimum(wt, cap)
        total_capped = np.sum(capped_wt)
        return np.sum(rt * capped_wt) / total_capped

    else:
        raise ValueError("Method must be one of 'vw', 'ew', or 'cvw'.")

## Univariate-sorted Portfolios

### BM, OP, IV portfolios are updated annually

In [8]:
# Info as of June
june = charc.loc[charc['month']==6, ['date','permno','me','bm','op','inv','isNYSE']].copy().reset_index(drop=True)
june

Unnamed: 0,date,permno,me,bm,op,inv,isNYSE
0,1960-06-30,10006,65.58250,,,,1.0
1,1960-06-30,10014,10.91500,,,,1.0
2,1960-06-30,10030,66.78700,,,,1.0
3,1960-06-30,10057,25.00000,,,,1.0
4,1960-06-30,10102,289.26600,,,,1.0
...,...,...,...,...,...,...,...
286814,2024-06-30,93423,2792.70780,,,,1.0
286815,2024-06-30,93426,375.35564,0.782073,0.166276,-0.989143,1.0
286816,2024-06-30,93429,17882.48924,0.222983,0.272794,-1.069811,
286817,2024-06-30,93434,15.60384,2.277920,-0.261477,-1.136051,


NOTE: Python's _.describe(percentiles=[0.3, 0.7])_ produces slightly different breakpoints copmared to Matlab's _prctile(, [30 70])_ since MATLAB's default method maps each sorted data point to a specific percentile rank and then performs linear interpolation between those ranks. NumPy's default method uses a different linear interpolation scheme based on fractional indices. However, these differences are generally small and have minimal impact on the results.

In [9]:
def rank_prctile(arr, p):
    """
    Calculates percentiles of a 1D array in a way that replicates MATLAB's default prctile function.

    Args:
        arr (np.ndarray): The input 1D array. NaNs are ignored.
        p (list or float): A list or single value of percentiles to compute (0-100).

    Returns:
        np.ndarray or float: An array of the calculated percentiles or a single float if p is a scalar.
    """     
    # Remove NaNs and sort the data, as MATLAB's prctile ignores NaNs
    clean_arr = arr[~np.isnan(arr)]
    if len(clean_arr) == 0:
        if isinstance(p, (list, tuple, np.ndarray)):
            return np.full(len(p), np.nan)
        else:
            return np.nan
            
    sorted_arr = np.sort(clean_arr)
    n = len(sorted_arr)

    # Create the percentile ranks corresponding to each data point
    # MATLAB's method considers the k-th value to be the 100*(k-0.5)/n percentile
    p_rank = 100 * (np.arange(1, n + 1) - 0.5) / n

    # Use linear interpolation (np.interp) to find the values at the desired percentiles
    return np.interp(p, p_rank, sorted_arr)

# Use NYSE breakpoints
nyse_sz = june[june['isNYSE']==1].groupby(['date'])['me'].median().to_frame().reset_index().rename(columns={'me':'sizemedn'})
nyse_bm = june[june['isNYSE']==1].groupby(['date'])['bm'].apply(lambda x: rank_prctile(x.values,[30, 70])).apply(pd.Series).rename(columns={0: 'bm30', 1: 'bm70'}).reset_index()
nyse_op = june[june['isNYSE']==1].groupby(['date'])['op'].apply(lambda x: rank_prctile(x.values,[30, 70])).apply(pd.Series).rename(columns={0: 'op30', 1: 'op70'}).reset_index()
nyse_iv = june[june['isNYSE']==1].groupby(['date'])['inv'].apply(lambda x: rank_prctile(x.values,[30, 70])).apply(pd.Series).rename(columns={0: 'iv30', 1: 'iv70'}).reset_index()

# Merge two breakpoints to the charc data
nyse_breaks = pd.merge(nyse_sz, nyse_bm, how='inner', on=['date'])
nyse_breaks = pd.merge(nyse_breaks, nyse_op, how='inner', on=['date'])
nyse_breaks = pd.merge(nyse_breaks, nyse_iv, how='inner', on=['date'])
june = pd.merge(june, nyse_breaks, how='left', on=['date'])
june

Unnamed: 0,date,permno,me,bm,op,inv,isNYSE,sizemedn,bm30,bm70,op30,op70,iv30,iv70
0,1960-06-30,10006,65.58250,,,,1.0,61.859625,,,,,,
1,1960-06-30,10014,10.91500,,,,1.0,61.859625,,,,,,
2,1960-06-30,10030,66.78700,,,,1.0,61.859625,,,,,,
3,1960-06-30,10057,25.00000,,,,1.0,61.859625,,,,,,
4,1960-06-30,10102,289.26600,,,,1.0,61.859625,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286814,2024-06-30,93423,2792.70780,,,,1.0,3629.474760,0.288861,0.800949,0.133095,0.311143,-1.073445,-0.989102
286815,2024-06-30,93426,375.35564,0.782073,0.166276,-0.989143,1.0,3629.474760,0.288861,0.800949,0.133095,0.311143,-1.073445,-0.989102
286816,2024-06-30,93429,17882.48924,0.222983,0.272794,-1.069811,,3629.474760,0.288861,0.800949,0.133095,0.311143,-1.073445,-0.989102
286817,2024-06-30,93434,15.60384,2.277920,-0.261477,-1.136051,,3629.474760,0.288861,0.800949,0.133095,0.311143,-1.073445,-0.989102


In [9]:
# # Use NYSE breakpoints
# nyse_sz = june[june['isNYSE']==1].groupby(['date'])['me'].median().to_frame().reset_index().rename(columns={'me':'sizemedn'})
# nyse_bm = june[june['isNYSE']==1].groupby(['date'])['bm'].describe(percentiles=[0.3, 0.7]).reset_index()
# nyse_bm = nyse_bm[['date','30%','70%']].rename(columns={'30%':'bm30', '70%':'bm70'})
# nyse_op = june[june['isNYSE']==1].groupby(['date'])['op'].describe(percentiles=[0.3, 0.7]).reset_index()
# nyse_op = nyse_op[['date','30%','70%']].rename(columns={'30%':'op30', '70%':'op70'})
# nyse_iv = june[june['isNYSE']==1].groupby(['date'])['inv'].describe(percentiles=[0.3, 0.7]).reset_index()
# nyse_iv = nyse_iv[['date','30%','70%']].rename(columns={'30%':'iv30', '70%':'iv70'})

# # Merge two breakpoints to the charc data
# nyse_breaks = pd.merge(nyse_sz, nyse_bm, how='inner', on=['date'])
# nyse_breaks = pd.merge(nyse_breaks, nyse_op, how='inner', on=['date'])
# nyse_breaks = pd.merge(nyse_breaks, nyse_iv, how='inner', on=['date'])
# june = pd.merge(june, nyse_breaks, how='left', on=['date'])
# june

In [10]:
# Assign portfolio buckets
ptfassign = june[['date','permno']].copy()

# size buckets
conditions = [
    june['me']<june['sizemedn'],
    june['me']>=june['sizemedn']
]
choices = [1,2] # list(range(1, 3))
ptfassign['sz'] = np.select(conditions, choices, default=np.nan)

# bm buckets
conditions = [
    june['bm']<june['bm30'],
    (june['bm']>=june['bm30']) & (june['bm']<june['bm70']),
    june['bm']>=june['bm70']
]
choices = [1,2,3] # list(range(1, 4))
ptfassign['bm'] = np.select(conditions, choices, default=np.nan)

# op buckets
conditions = [
    june['op']<june['op30'],
    (june['op']>=june['op30']) & (june['op']<june['op70']),
    june['op']>=june['op70']
]
choices = [1,2,3]
ptfassign['op'] = np.select(conditions, choices, default=np.nan)

# inv buckets
conditions = [
    june['inv']<june['iv30'],
    (june['inv']>=june['iv30']) & (june['inv']<june['iv70']),
    june['inv']>=june['iv70']
]
choices = [1,2,3]
ptfassign['iv'] = np.select(conditions, choices, default=np.nan)
ptfassign

Unnamed: 0,date,permno,sz,bm,op,iv
0,1960-06-30,10006,2.0,,,
1,1960-06-30,10014,1.0,,,
2,1960-06-30,10030,2.0,,,
3,1960-06-30,10057,1.0,,,
4,1960-06-30,10102,2.0,,,
...,...,...,...,...,...,...
286814,2024-06-30,93423,1.0,,,
286815,2024-06-30,93426,1.0,2.0,2.0,2.0
286816,2024-06-30,93429,2.0,1.0,2.0,2.0
286817,2024-06-30,93434,1.0,3.0,1.0,1.0


In [11]:
# Merge back with monthly records
# Keep the portfolio assignments from July to next June
ptfassign['ffyear'] = ptfassign['date'].dt.year
ptfassign = pd.merge(charc[['date','permno','ret','me','isNYSE','ffyear']], ptfassign.drop(columns='date'), how='left', on=['permno','ffyear'])
ptfassign

Unnamed: 0,date,permno,ret,me,isNYSE,ffyear,sz,bm,op,iv
0,1960-01-31,10006,0.005155,6.912750e+01,1.0,1959,,,,
1,1960-01-31,10014,0.138889,1.118788e+01,1.0,1959,,,,
2,1960-01-31,10022,-0.045455,2.017312e+01,1.0,1959,,,,
3,1960-01-31,10030,-0.015000,7.998200e+01,1.0,1959,,,,
4,1960-01-31,10057,-0.020785,2.650000e+01,1.0,1959,,,,
...,...,...,...,...,...,...,...,...,...,...
3446394,2024-12-31,93397,-0.117446,4.410872e+02,,2024,1.0,2.0,1.0,3.0
3446395,2024-12-31,93426,0.021768,2.867095e+02,1.0,2024,1.0,2.0,2.0,2.0
3446396,2024-12-31,93429,-0.094742,2.045564e+04,,2024,2.0,1.0,2.0,2.0
3446397,2024-12-31,93434,0.133333,1.824916e+01,,2024,1.0,3.0,1.0,1.0


### MOM portfolios are updated monthly

In [12]:
mom = charc[['date','permno','me','mom','isNYSE']].copy().reset_index(drop=True)
mom

Unnamed: 0,date,permno,me,mom,isNYSE
0,1960-01-31,10006,6.912750e+01,,1.0
1,1960-01-31,10014,1.118788e+01,,1.0
2,1960-01-31,10022,2.017312e+01,,1.0
3,1960-01-31,10030,7.998200e+01,,1.0
4,1960-01-31,10057,2.650000e+01,,1.0
...,...,...,...,...,...
3446394,2024-12-31,93397,4.410872e+02,1.361294,
3446395,2024-12-31,93426,2.867095e+02,0.674201,1.0
3446396,2024-12-31,93429,2.045564e+04,1.223448,
3446397,2024-12-31,93434,1.824916e+01,0.530076,


In [13]:
# Use NYSE breakpoints
nyse_sz = mom[mom['isNYSE']==1].groupby(['date'])['me'].median().to_frame().reset_index().rename(columns={'me':'sizemedn'})
# nyse_mom = mom[mom['isNYSE']==1].groupby(['date'])['mom'].describe(percentiles=[0.3, 0.7]).reset_index()
# nyse_mom = nyse_mom[['date','30%','70%']].rename(columns={'30%':'mom30', '70%':'mom70'})
nyse_mom = mom[mom['isNYSE']==1].groupby(['date'])['mom'].apply(lambda x: rank_prctile(x.values,[30, 70])).apply(pd.Series).rename(columns={0: 'mom30', 1: 'mom70'}).reset_index()

# Merge two breakpoints to the charc data
nyse_breaks = pd.merge(nyse_sz, nyse_mom, how='inner', on=['date'])
mom = pd.merge(mom, nyse_breaks, how='left', on=['date'])
mom

Unnamed: 0,date,permno,me,mom,isNYSE,sizemedn,mom30,mom70
0,1960-01-31,10006,6.912750e+01,,1.0,62.793000,,
1,1960-01-31,10014,1.118788e+01,,1.0,62.793000,,
2,1960-01-31,10022,2.017312e+01,,1.0,62.793000,,
3,1960-01-31,10030,7.998200e+01,,1.0,62.793000,,
4,1960-01-31,10057,2.650000e+01,,1.0,62.793000,,
...,...,...,...,...,...,...,...,...
3446394,2024-12-31,93397,4.410872e+02,1.361294,,4048.003205,0.992977,1.360168
3446395,2024-12-31,93426,2.867095e+02,0.674201,1.0,4048.003205,0.992977,1.360168
3446396,2024-12-31,93429,2.045564e+04,1.223448,,4048.003205,0.992977,1.360168
3446397,2024-12-31,93434,1.824916e+01,0.530076,,4048.003205,0.992977,1.360168


In [14]:
# Assign portfolio buckets
momassign = mom[['date','permno']].copy()

# size buckets
conditions = [
    mom['me']<mom['sizemedn'],
    mom['me']>=mom['sizemedn']
]
choices = [1,2] # list(range(1, 3))
momassign['sz_mom'] = np.select(conditions, choices, default=np.nan)

# mom buckets
conditions = [
    mom['mom']<mom['mom30'],
    (mom['mom']>=mom['mom30']) & (mom['mom']<mom['mom70']),
    mom['mom']>=mom['mom70']
]
choices = [1,2,3] # list(range(1, 4))
momassign['mom'] = np.select(conditions, choices, default=np.nan)
momassign

Unnamed: 0,date,permno,sz_mom,mom
0,1960-01-31,10006,2.0,
1,1960-01-31,10014,1.0,
2,1960-01-31,10022,1.0,
3,1960-01-31,10030,2.0,
4,1960-01-31,10057,1.0,
...,...,...,...,...
3446394,2024-12-31,93397,1.0,3.0
3446395,2024-12-31,93426,1.0,1.0
3446396,2024-12-31,93429,2.0,2.0
3446397,2024-12-31,93434,1.0,1.0


In [15]:
# Merge back with monthly records
# Move date 1M forward such that ptf assignments are 1M lagged 
momassign['date'] = momassign['date'] + pd.DateOffset(months=1) + MonthEnd(0)
ptfassign = pd.merge(ptfassign, momassign, how='left', on=['date','permno'])
ptfassign

Unnamed: 0,date,permno,ret,me,isNYSE,ffyear,sz,bm,op,iv,sz_mom,mom
0,1960-01-31,10006,0.005155,6.912750e+01,1.0,1959,,,,,,
1,1960-01-31,10014,0.138889,1.118788e+01,1.0,1959,,,,,,
2,1960-01-31,10022,-0.045455,2.017312e+01,1.0,1959,,,,,,
3,1960-01-31,10030,-0.015000,7.998200e+01,1.0,1959,,,,,,
4,1960-01-31,10057,-0.020785,2.650000e+01,1.0,1959,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3446394,2024-12-31,93397,-0.117446,4.410872e+02,,2024,1.0,2.0,1.0,3.0,,
3446395,2024-12-31,93426,0.021768,2.867095e+02,1.0,2024,1.0,2.0,2.0,2.0,,
3446396,2024-12-31,93429,-0.094742,2.045564e+04,,2024,2.0,1.0,2.0,2.0,,
3446397,2024-12-31,93434,0.133333,1.824916e+01,,2024,1.0,3.0,1.0,1.0,,


### 1-month lagged market capitalization for portfolio construction

In [16]:
# Lagged mkt cap
ptfassign['lme'] = ptfassign.sort_values(by=['permno','date']).groupby(['permno'])['me'].shift(1)

### Size 2 portfolios

In [17]:
# Value-weigthed return
vwret = ptfassign.dropna(subset=['ret','me','sz']).groupby(['date','sz']).apply(wavg,'ret','me','vw').to_frame().reset_index().rename(columns={0: 'vwret'})
vwret['ptfidx'] = vwret['sz'].astype(int)
vwret

Unnamed: 0,date,sz,vwret,ptfidx
0,1960-07-31,1.0,-0.014044,1
1,1960-07-31,2.0,-0.020274,2
2,1960-08-31,1.0,0.047299,1
3,1960-08-31,2.0,0.033320,2
4,1960-09-30,1.0,-0.056485,1
...,...,...,...,...
1543,2024-10-31,2.0,-0.001159,2
1544,2024-11-30,1.0,0.170274,1
1545,2024-11-30,2.0,0.075275,2
1546,2024-12-31,1.0,-0.039950,1


In [18]:
# Reshape
sport = vwret.pivot(index='date', columns='ptfidx', values='vwret').reset_index()
sport = sport.dropna().reset_index(drop=True)
sport.columns.name = None
sport.columns = ['date' if col == 'date' else f'me_{col}' for col in sport.columns]
sport

Unnamed: 0,date,me_1,me_2
0,1960-07-31,-0.014044,-0.020274
1,1960-08-31,0.047299,0.033320
2,1960-09-30,-0.056485,-0.056019
3,1960-10-31,-0.025153,-0.001105
4,1960-11-30,0.051939,0.051046
...,...,...,...
769,2024-08-31,0.002379,0.027096
770,2024-09-30,0.022291,0.025750
771,2024-10-31,0.021425,-0.001159
772,2024-11-30,0.170274,0.075275


## Bivariate-sorted Portfolios

### Size x BM (2x3) Portfolios

In [19]:
# Value-weigthed return
vwret = ptfassign.dropna(subset=['ret','lme','sz','bm']).groupby(['date','sz','bm']).apply(wavg,'ret','lme','vw').to_frame().reset_index().rename(columns={0: 'vwret'})
vwret['ptfidx'] = (vwret['sz'].astype(int)-1)*3 + vwret['bm'].astype(int)
vwret

Unnamed: 0,date,sz,bm,vwret,ptfidx
0,1961-07-31,1.0,1.0,-0.003153,1
1,1961-07-31,1.0,2.0,0.014445,2
2,1961-07-31,1.0,3.0,0.010803,3
3,1961-07-31,2.0,1.0,0.032384,4
4,1961-07-31,2.0,2.0,0.035461,5
...,...,...,...,...,...
4567,2024-12-31,1.0,2.0,-0.081862,2
4568,2024-12-31,1.0,3.0,-0.071373,3
4569,2024-12-31,2.0,1.0,-0.014364,4
4570,2024-12-31,2.0,2.0,-0.044439,5


In [20]:
# Reshape
sbport = vwret.pivot(index='date', columns='ptfidx', values='vwret').reset_index()
sbport = sbport.dropna().reset_index(drop=True)
sbport.columns.name = None
sbport.columns = ['date' if col == 'date' else f'me_bm_{col}' for col in sbport.columns]
sbport

Unnamed: 0,date,me_bm_1,me_bm_2,me_bm_3,me_bm_4,me_bm_5,me_bm_6
0,1961-07-31,-0.003153,0.014445,0.010803,0.032384,0.035461,0.011100
1,1961-08-31,0.019590,0.005934,0.010608,0.031556,0.018412,0.023950
2,1961-09-30,-0.044093,-0.035206,-0.033088,-0.011135,-0.029606,-0.039385
3,1961-10-31,0.013424,0.037207,0.002503,0.023432,0.038352,0.031648
4,1961-11-30,0.060450,0.064653,0.044685,0.049196,0.038270,0.042738
...,...,...,...,...,...,...,...
757,2024-08-31,-0.009230,-0.024212,-0.017155,0.028454,0.000138,0.028833
758,2024-09-30,0.011261,0.005461,0.005332,0.028737,0.010899,-0.005472
759,2024-10-31,0.000566,-0.014137,-0.010150,-0.011220,-0.000556,0.017041
760,2024-11-30,0.162790,0.108171,0.110996,0.068472,0.054283,0.096789


### Size x Profitability (2x3) Portfolios

In [21]:
# Value-weigthed return
vwret = ptfassign.dropna(subset=['ret','lme','sz','op']).groupby(['date','sz','op']).apply(wavg,'ret','lme','vw').to_frame().reset_index().rename(columns={0: 'vwret'})
vwret['ptfidx'] = (vwret['sz'].astype(int)-1)*3 + vwret['op'].astype(int)
vwret

Unnamed: 0,date,sz,op,vwret,ptfidx
0,1961-07-31,1.0,1.0,0.016557,1
1,1961-07-31,1.0,2.0,0.017912,2
2,1961-07-31,1.0,3.0,0.000581,3
3,1961-07-31,2.0,1.0,0.015187,4
4,1961-07-31,2.0,2.0,0.036406,5
...,...,...,...,...,...
4567,2024-12-31,1.0,2.0,-0.077980,2
4568,2024-12-31,1.0,3.0,-0.081695,3
4569,2024-12-31,2.0,1.0,-0.059625,4
4570,2024-12-31,2.0,2.0,-0.048057,5


In [22]:
# Reshape
spport = vwret.pivot(index='date', columns='ptfidx', values='vwret').reset_index()
spport = spport.dropna().reset_index(drop=True)
spport.columns.name = None
spport.columns = ['date' if col == 'date' else f'me_op_{col}' for col in spport.columns]
spport

Unnamed: 0,date,me_op_1,me_op_2,me_op_3,me_op_4,me_op_5,me_op_6
0,1961-07-31,0.016557,0.017912,0.000581,0.015187,0.036406,0.034161
1,1961-08-31,0.015397,0.008730,0.019760,0.004074,0.025517,0.029444
2,1961-09-30,-0.038009,-0.039219,-0.021181,-0.057920,-0.032340,0.007448
3,1961-10-31,0.007301,0.030436,0.047850,0.009347,0.030076,0.030249
4,1961-11-30,0.070401,0.061868,0.055853,0.035125,0.043231,0.050823
...,...,...,...,...,...,...,...
757,2024-08-31,-0.018519,-0.017405,-0.018345,0.010447,0.019290,0.024460
758,2024-09-30,0.006260,0.007085,0.007997,0.034824,0.018241,0.022005
759,2024-10-31,-0.005536,-0.008780,-0.019793,0.002013,0.007062,-0.015694
760,2024-11-30,0.119916,0.128491,0.121057,0.110400,0.077759,0.054633


### Size x Investment (2x3) Portfolios

In [23]:
# Value-weigthed return
vwret = ptfassign.dropna(subset=['ret','lme','sz','op']).groupby(['date','sz','iv']).apply(wavg,'ret','lme','vw').to_frame().reset_index().rename(columns={0: 'vwret'})
vwret['ptfidx'] = (vwret['sz'].astype(int)-1)*3 + vwret['iv'].astype(int)
vwret

Unnamed: 0,date,sz,iv,vwret,ptfidx
0,1962-07-31,1.0,1.0,0.068128,1
1,1962-07-31,1.0,2.0,0.054153,2
2,1962-07-31,1.0,3.0,0.062499,3
3,1962-07-31,2.0,1.0,0.080624,4
4,1962-07-31,2.0,2.0,0.060489,5
...,...,...,...,...,...
4495,2024-12-31,1.0,2.0,-0.081072,2
4496,2024-12-31,1.0,3.0,-0.060882,3
4497,2024-12-31,2.0,1.0,-0.018154,4
4498,2024-12-31,2.0,2.0,-0.021836,5


In [24]:
# Reshape
siport = vwret.pivot(index='date', columns='ptfidx', values='vwret').reset_index()
siport = siport.dropna().reset_index(drop=True)
siport.columns.name = None
siport.columns = ['date' if col == 'date' else f'me_iv_{col}' for col in siport.columns]
siport

Unnamed: 0,date,me_iv_1,me_iv_2,me_iv_3,me_iv_4,me_iv_5,me_iv_6
0,1962-07-31,0.068128,0.054153,0.062499,0.080624,0.060489,0.035775
1,1962-08-31,0.032099,0.019052,0.032915,0.013913,0.027529,0.021897
2,1962-09-30,-0.074790,-0.058921,-0.077273,-0.063487,-0.033237,-0.059356
3,1962-10-31,-0.038281,-0.031976,-0.033816,0.001922,0.011302,0.014187
4,1962-11-30,0.134455,0.133412,0.137597,0.115967,0.098138,0.113380
...,...,...,...,...,...,...,...
745,2024-08-31,-0.023532,-0.017264,-0.014220,0.017182,0.023754,0.026988
746,2024-09-30,0.002863,0.004488,0.012296,0.025970,0.017968,0.018751
747,2024-10-31,-0.002218,-0.007198,-0.019061,-0.013563,-0.012811,0.026438
748,2024-11-30,0.130602,0.112856,0.127612,0.076872,0.065028,0.048018


### Size x Momemtum (2x3) Portfolios

In [25]:
# Value-weigthed return
vwret = ptfassign.dropna(subset=['ret','lme','sz_mom','mom']).groupby(['date','sz_mom','mom']).apply(wavg,'ret','lme','vw').to_frame().reset_index().rename(columns={0: 'vwret'})
vwret['ptfidx'] = (vwret['sz_mom'].astype(int)-1)*3 + vwret['mom'].astype(int)
vwret

Unnamed: 0,date,sz_mom,mom,vwret,ptfidx
0,1961-01-31,1.0,1.0,0.119091,1
1,1961-01-31,1.0,2.0,0.084503,2
2,1961-01-31,1.0,3.0,0.061583,3
3,1961-01-31,2.0,1.0,0.068033,4
4,1961-01-31,2.0,2.0,0.070938,5
...,...,...,...,...,...
2683,2024-11-30,1.0,2.0,0.106391,2
2684,2024-11-30,1.0,3.0,0.118846,3
2685,2024-11-30,2.0,1.0,0.068488,4
2686,2024-11-30,2.0,2.0,0.063684,5


In [26]:
# Reshape
smport = vwret.pivot(index='date', columns='ptfidx', values='vwret').reset_index()
smport = smport.dropna().reset_index(drop=True)
smport.columns.name = None
smport.columns = ['date' if col == 'date' else f'me_mom_{col}' for col in smport.columns]
smport

Unnamed: 0,date,me_mom_1,me_mom_2,me_mom_3,me_mom_4,me_mom_5,me_mom_6
0,1961-01-31,0.119091,0.084503,0.061583,0.068033,0.070938,0.052350
1,1961-02-28,0.065012,0.066883,0.091130,0.046065,0.024021,0.041469
2,1961-04-30,0.007138,0.012127,0.027366,-0.029567,0.009742,0.011865
3,1961-06-30,-0.061195,-0.042438,-0.052814,-0.026102,-0.027793,-0.027968
4,1961-08-31,0.009489,0.009620,0.018911,0.018905,0.023533,0.036999
...,...,...,...,...,...,...,...
443,2024-04-30,-0.081157,-0.070547,-0.073173,-0.025750,-0.034546,-0.047991
444,2024-06-30,-0.024324,-0.010859,-0.032542,0.034069,0.020691,0.052745
445,2024-08-31,-0.048701,-0.007348,0.008531,-0.010266,0.026762,0.023797
446,2024-09-30,0.005371,0.002992,0.002540,0.037717,0.018352,0.023199


## FF Factors

In [27]:
ff_rep = pd.merge(sbport, spport, how='outer', on=['date']).sort_values(by='date')
ff_rep = pd.merge(ff_rep, siport, how='outer', on=['date']).sort_values(by='date')
ff_rep = pd.merge(ff_rep, smport, how='outer', on=['date']).sort_values(by='date')
ff_rep['SMB_bm'] = (ff_rep['me_bm_1']+ff_rep['me_bm_2']+ff_rep['me_bm_3'])/3 - (ff_rep['me_bm_4']+ff_rep['me_bm_5']+ff_rep['me_bm_6'])/3
ff_rep['SMB_pf'] = (ff_rep['me_op_1']+ff_rep['me_op_2']+ff_rep['me_op_3'])/3 - (ff_rep['me_op_4']+ff_rep['me_op_5']+ff_rep['me_op_6'])/3
ff_rep['SMB_iv'] = (ff_rep['me_iv_1']+ff_rep['me_iv_2']+ff_rep['me_iv_3'])/3 - (ff_rep['me_iv_4']+ff_rep['me_iv_5']+ff_rep['me_iv_6'])/3
ff_rep['SMB'] = (ff_rep['SMB_bm']+ff_rep['SMB_pf']+ff_rep['SMB_iv'])/3
# ff_rep['SMB'] = (ff_rep['me_bm_1']+ff_rep['me_bm_2']+ff_rep['me_bm_3'])/3 - (ff_rep['me_bm_4']+ff_rep['me_bm_5']+ff_rep['me_bm_6'])/3
ff_rep['HML'] = (ff_rep['me_bm_3']+ff_rep['me_bm_6'])/2 - (ff_rep['me_bm_1']+ff_rep['me_bm_4'])/2
ff_rep['RMW'] = (ff_rep['me_op_3']+ff_rep['me_op_6'])/2 - (ff_rep['me_op_1']+ff_rep['me_op_4'])/2
ff_rep['CMA'] = (ff_rep['me_iv_3']+ff_rep['me_iv_6'])/2 - (ff_rep['me_iv_1']+ff_rep['me_iv_4'])/2
ff_rep['MOM'] = (ff_rep['me_mom_3']+ff_rep['me_mom_6'])/2 - (ff_rep['me_mom_1']+ff_rep['me_mom_4'])/2
ff_rep = ff_rep[['date','SMB','HML','RMW','CMA','MOM']]
ff_rep['date'] = ff_rep['date'].dt.year * 100 + ff_rep['date'].dt.month
ff_rep

Unnamed: 0,date,SMB,HML,RMW,CMA,MOM
0,196101,,,,,-0.036596
1,196102,,,,,0.010761
2,196104,,,,,0.030830
3,196106,,,,,0.003258
4,196107,,-0.003665,0.001499,,
...,...,...,...,...,...,...
761,202408,-0.037715,-0.003773,0.007093,0.009559,0.045647
762,202409,-0.012098,-0.020069,-0.005541,0.001107,-0.008674
763,202410,-0.009446,0.008772,-0.015982,0.011579,
764,202411,0.052249,-0.011739,-0.027313,-0.015921,-0.000028


In [28]:
print(ff6.set_index('date')['SMB'].corr(ff_rep.set_index('date')['SMB']))
print(ff6.set_index('date')['HML'].corr(ff_rep.set_index('date')['HML']))
print(ff6.set_index('date')['RMW'].corr(ff_rep.set_index('date')['RMW']))
print(ff6.set_index('date')['CMA'].corr(ff_rep.set_index('date')['CMA']))
print(ff6.set_index('date')['MOM'].corr(ff_rep.set_index('date')['MOM']))

0.9971669169246585
0.9823954076383838
0.981181988880758
0.9807960719458096
0.9967178664439557
