# Data Preparation

##### Rong Wang

This notebook prepares the data for multiple asset pricing applications.

In [1]:
# Packages 
import numpy as np
import pandas as pd
import os
import gc
import time
import datetime as dt
from dateutil.relativedelta import relativedelta
from pandas.tseries.offsets import *
from joblib import Parallel, delayed
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

### Individual stock returns and characteristics 

Individual stock characteristics data used in [Kelly, Pruitt, and Su (2019)](https://www.sciencedirect.com/science/article/abs/pii/S0304405X19301151), [Gu, Kelly, and Xiu (2020)](https://academic.oup.com/rfs/article/33/5/2223/5758276), and [Gu, Kelly, and Xiu (2021)](https://www.sciencedirect.com/science/article/abs/pii/S0304407620301998) is available on the websites of [Prof Dacheng Xiu](https://dachxiu.chicagobooth.edu/) or [Shihao Gu](https://shihaogu.com/), with the current version ending in 2021. Instead, I opt to use a different data source: [Global Factor Data](https://jkpfactors.com/stock-char) organized by Prof. Theis Jensen, Bryan Kelly, and Lasse Heje Pedersen, which offers the most up-to-date individual stock returns and firm characteristics.

The data sample ranges from Jan 1960 to Dec 2024.   

In [2]:
%%time
# Load stock returns and characteristics data (1959-2024)
charc = pd.read_parquet('/work/rw196/data/JKP_stock_charcs.parquet')
charc = charc.rename(columns={"eom": "date"})
charc['date'] = pd.to_datetime(charc['date'])+MonthEnd(0)
charc

CPU times: user 14.3 s, sys: 8.33 s, total: 22.6 s
Wall time: 10.2 s


Unnamed: 0,excntry,date,permno,size_grp,me,ret,ret_exc,sic,age,aliq_at,...,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,z_score,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,USA,1962-02-28,,,,,,,26.0,0.736970,...,0.055493,0.691228,0.807338,-0.002817,,,,,,
1,USA,1962-01-31,,,,,,,1.0,,...,,,,,,,,,,
2,USA,1962-01-31,,,,,,,25.0,,...,,,0.743086,,,,,,,
3,USA,1962-01-31,,mega,3.727224e+02,,,,145.0,0.972656,...,,,0.735235,0.011927,,,,,,
4,USA,1962-01-31,,,,,,,37.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3911126,USA,2024-08-31,93436.0,mega,6.840044e+05,-0.077390,-0.081809,3711.0,236.0,0.849340,...,0.062915,0.504812,0.647934,-0.054016,0.029518,0.360574,9.472730,0.000715,0.000828,0.000661
3911127,USA,2024-09-30,93436.0,mega,8.390474e+05,0.221942,0.217323,3711.0,237.0,0.849340,...,0.062915,0.504812,0.647934,-0.054016,0.028794,0.372432,9.472730,0.000715,0.000826,0.000695
3911128,USA,2024-10-31,93436.0,mega,8.020335e+05,-0.045025,-0.048932,3711.0,238.0,0.851467,...,0.021696,0.197611,0.662048,-0.052716,0.027153,0.382452,10.166609,0.000831,0.001147,0.000753
3911129,USA,2024-11-30,93436.0,mega,1.107984e+06,0.381469,0.377514,3711.0,239.0,0.851467,...,0.021696,0.197611,0.662048,-0.052716,0.028598,0.416990,10.166609,0.000860,0.000964,0.000804


In [3]:
# List of characteristics
charc_list = [col for col in charc.columns if col not in 
                ['excntry','date','permno','size_grp','me','ret','ret_exc','sic']]
len(charc_list)

153

#### Lag characteristics by 1 period 

Portfolio assignments and factor constructions are based on 1-month lagged characteristics. Instead of looping through every single characteristic column, I shift the return and date forward by 1 month to achieve the same goal.

In [4]:
# Sort by permno and date before computing lead return
charc = charc.sort_values(['permno', 'date'])

# Now compute the 1-month lead return
charc['ret'] = charc.groupby('permno')['ret'].shift(-1)

# Move date 1M forward such that all characteristics (including size_grp, me, and sic) are 1M lagged 
charc['date'] = charc['date'] + pd.DateOffset(months=1)

In [None]:
# # Should NOT do this filtering (look-ahead bias)
# # Count number of non-missing returns per stock during 1960-2024
# ret_counts_by_permno = charc[charc['date']>='1960-01-01'].groupby('permno')['ret'].apply(lambda x: x.notna().sum())
# # Filter firms with at least 60 non-missing return months during 1960-2024
# allPermnos = ret_counts_by_permno[ret_counts_by_permno >= 60].index.to_numpy().astype(int)
# charc = charc[charc['permno'].isin(allPermnos)].copy()

In [5]:
# Drop obs without ret/permno
charc = charc.dropna(subset=['ret','permno']).reset_index(drop=True)
charc = charc.drop(columns=['excntry','ret_exc']).reset_index(drop=True)
charc

Unnamed: 0,date,permno,size_grp,me,ret,sic,age,aliq_at,aliq_mat,ami_126d,...,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,z_score,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,1986-02-28,10000.0,micro,1.610000e+01,-0.257143,3990.0,25.0,,,,...,,,,,,,,,0.005172,
1,1986-03-28,10000.0,micro,1.196000e+01,0.365385,3990.0,26.0,,,,...,,,0.365969,,,,,,0.007663,
2,1986-04-30,10000.0,micro,1.633000e+01,-0.098592,3990.0,27.0,,,,...,,,0.365969,,,,,,0.007436,
3,1986-05-30,10000.0,micro,1.517200e+01,-0.222656,3990.0,28.0,,,,...,,,0.365969,,0.000787,1.086926,,0.007351,0.007655,
4,1986-06-30,10000.0,nano,1.179386e+01,-0.005025,3990.0,29.0,,,,...,,,0.757813,,0.000763,1.122440,15.228146,0.007469,0.007128,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3658838,2024-08-31,93436.0,mega,7.413801e+05,-0.077390,3711.0,235.0,0.849340,0.083090,0.000001,...,0.062915,0.504812,0.647934,-0.054016,0.031009,0.334198,9.472730,0.000703,0.000453,0.000633
3658839,2024-09-30,93436.0,mega,6.840044e+05,0.221942,3711.0,236.0,0.849340,0.085964,0.000002,...,0.062915,0.504812,0.647934,-0.054016,0.029518,0.360574,9.472730,0.000715,0.000828,0.000661
3658840,2024-10-30,93436.0,mega,8.390474e+05,-0.045025,3711.0,237.0,0.849340,0.088406,0.000002,...,0.062915,0.504812,0.647934,-0.054016,0.028794,0.372432,9.472730,0.000715,0.000826,0.000695
3658841,2024-11-30,93436.0,mega,8.020335e+05,0.381469,3711.0,238.0,0.851467,0.113783,0.000001,...,0.021696,0.197611,0.662048,-0.052716,0.027153,0.382452,10.166609,0.000831,0.001147,0.000753


In [6]:
# Unique dates and stocks
allDates = charc['date'].dt.year * 100 + charc['date'].dt.month
allDates = allDates.drop_duplicates().sort_values().to_numpy()
allPermnos = charc['permno'].drop_duplicates().sort_values().to_numpy().astype(int)
len(allDates), len(allPermnos)

(780, 28828)

In [7]:
# All numeric months and stocks shared across all models
np.savetxt('/work/rw196/data/ALL/allDates.csv', allDates, fmt='%d', delimiter=',')
np.savetxt('/work/rw196/data/ALL/allPermnos.csv', allPermnos, fmt='%d', delimiter=',')

### Univariate-sorted portfolios and weights

I following the same procedure in [Jensen, Kelly, and Pedersen (2023)](https://onlinelibrary.wiley.com/doi/full/10.1111/jofi.13249) to construct portfolios and factors. See details in their paper and data appendix.

For each characteristic, I keep the two univariate-sorted corner portfolios — top and bottom decile portfolios. [Lettau and Pelger (2020)](https://academic.oup.com/rfs/article-abstract/33/5/2274/5756219) shows that most of the relevant information is contained in the extreme first and tenth decile portfolios. In summary, I have 153*2 = 306 univariate-sorted portfolios in total.

Note that the timing of my portfolio construction is slightly different from what the literature does. While Fama-French form portfolios annually at the end of each June and JKP form portfolios on a monthly basis, I construct portfolios based on the characteristics information at the end of each December to be consistent with my rolling testing periods. Consistent with JKP, I compute capped value-weighted returns and store portfolio weights on individual stocks. [Fama and French (2008)](https://onlinelibrary.wiley.com/doi/10.1111/j.1540-6261.2008.01371.x) and [Hou, Xue, and Zhang (2020)](https://academic.oup.com/rfs/article-abstract/33/5/2019/5236964?redirectedFrom=fulltext) discuss the importance of using NYSE breakpoints to make portfolio sorts robust to outliers and skewed distributions. 

Information as of December. Since I have lagged the characteristics, I extract the observations in January.

In [8]:
jan = charc[charc['date'].dt.month==1].copy().reset_index(drop=True)
jan

Unnamed: 0,date,permno,size_grp,me,ret,sic,age,aliq_at,aliq_mat,ami_126d,...,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,z_score,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,1987-01-31,10000.0,nano,1.981547e+00,-0.212121,3990.0,36.0,,,1.371849e+01,...,,,0.571950,,0.001537,1.499872,2.617846,1.317024,1.913112,0.846164
1,1987-01-31,10001.0,nano,6.937000e+00,-0.035714,4920.0,12.0,,,1.629133e+01,...,,,0.711161,,0.001059,1.270361,3.312806,2.630911,6.689263,1.953846
2,1988-01-31,10001.0,nano,5.828000e+00,0.063830,4924.0,24.0,0.533838,0.538235,2.407606e+01,...,-0.003908,-0.147436,0.729152,-0.035511,0.001309,3.310462,2.765372,3.778838,5.730556,3.990824
3,1989-01-31,10001.0,nano,6.362250e+00,0.019608,4924.0,36.0,0.556474,0.620230,2.305792e+01,...,0.010908,0.226950,0.756384,0.012697,0.000302,1.857314,2.987471,8.110663,6.008550,7.809863
4,1990-01-31,10001.0,micro,1.034775e+01,-0.018519,4924.0,48.0,0.892288,0.968528,7.079995e+00,...,-0.277350,-4.262417,0.826893,0.023270,0.001707,3.721716,2.569062,1.836945,0.007793,2.421445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304501,2020-01-31,93436.0,mega,7.574366e+04,0.555160,3711.0,180.0,0.699106,0.239572,8.650000e-06,...,-0.193711,-9.362176,0.644313,0.002224,0.046513,0.525003,1.744269,0.000204,0.000189,0.000180
304502,2021-01-31,93436.0,mega,6.773402e+05,0.124506,3711.0,192.0,0.779729,0.243884,1.480000e-06,...,-0.089619,-9.275171,0.689222,0.000259,0.066533,0.539604,5.360048,0.000272,0.000334,0.000212
304503,2022-01-31,93436.0,mega,1.092191e+06,-0.113609,3711.0,204.0,0.969883,0.052417,1.040000e-06,...,-0.045987,-1.162769,0.699649,0.006570,0.022622,0.420708,15.080915,0.000915,0.000832,0.000780
304504,2023-01-31,93436.0,mega,3.897415e+05,0.406235,3711.0,216.0,0.843883,0.041459,1.410000e-06,...,0.006787,0.048865,0.699544,0.008947,0.028228,0.394399,15.576261,0.000594,0.000327,0.000673


Helper functions.

In [9]:
# Calculates breakpoints to divide data into a specified number of equal-sized groups
def get_equal_group_breakpoints(arr, num_groups: int):
    """
    This function determines the values that split a dataset into `num_groups`
    portfolios, each containing an equal number of observations.

    Args:
        arr (np.ndarray): The input 1D array of data. NaNs are ignored.
        num_groups (int): The number of equal-sized groups to create (e.g., 3 for terciles).

    Returns:
        np.ndarray: An array of the calculated breakpoints. The number of breakpoints
                    will be `num_groups` - 1.
    """
    # Remove NaNs
    clean_arr = arr[~np.isnan(arr)]
        
    # If the array is empty after cleaning, return NaNs for the breakpoints
    if len(clean_arr) == 0:
        return np.full(num_groups - 1, np.nan)

    # Sort the data
    sorted_arr = np.sort(clean_arr)
    n = len(sorted_arr)

    # Determine the desired percentiles for the breakpoints
    #    For `num_groups = 3`, we need percentiles at [33.33, 66.67]
    #    For `num_groups = 5`, we need percentiles at [20, 40, 60, 80]
    p_desired = 100 * np.arange(1, num_groups) / num_groups

    # Create the percentile ranks corresponding to each data point
    #    It considers the k-th value to be the 100*(k-0.5)/n percentile (same as Matlab convention)
    p_rank = 100 * (np.arange(1, n + 1)-0.5) / n

    # Use linear interpolation to find the data values at the desired percentiles.
    #    This finds the breakpoints.
    breakpoints = np.interp(p_desired, p_rank, sorted_arr)

    return breakpoints

In [10]:
# Calculates the weighted average of returns and the corresponding weights
def getPtfRetsWts(df: pd.DataFrame, ret_name: str, me_name: str, method: str) -> tuple[float, pd.Series]:
    """
    This function computes portfolio returns and weights using one of three schemes:
    'ew' for equal-weighted, 'vw' for value-weighted, and 'cvw' for
    capped value-weighted.

    Args:
        df (pd.DataFrame): DataFrame containing stock data.
        ret_name (str): The name of the column containing stock returns.
        me_name (str): The name of the column containing market equity values.
        method (str): The weighting method ('vw', 'ew', or 'cvw').

    Returns:
        A tuple containing:
        - float: The calculated portfolio return for the given period.
        - pd.Series: The weights for each stock in the portfolio.

    Raises:
        ValueError: If an invalid method is specified or for 'cvw' issues.
    """
    # At least 5 stocks in each of the long and short legs
    if len(df) < 5:
        return np.nan, pd.Series(dtype=float)
        
    rt = df[ret_name]
    me = df[me_name]
    
    if method == 'ew':
        # Equal weights for all stocks
        weights = pd.Series(1 / len(df), index=df.index)
        portfolio_return = rt.mean()

    elif method == 'vw':
        # Value weights based on market equity
        total_me = me.sum()
        weights = me / total_me
        portfolio_return = (rt * weights).sum()

    elif method == 'cvw':
        if 'size_grp' not in df.columns:
            raise ValueError("Data does not have size_grp indicators required for 'cvw' method.")
        
        # Cap market equity at the 80th percentile of NYSE stocks
        nyse80 = df['size_grp'] == 'mega'
        cap = me[nyse80].min() if np.any(nyse80) else np.nan  
        capped_me = me.clip(upper=cap)
        
        # Calculate weights based on capped market equity
        total_capped_me = capped_me.sum()
        weights = capped_me / total_capped_me
        portfolio_return = (rt * weights).sum()

    else:
        raise ValueError("Method must be one of 'vw', 'ew', or 'cvw'.")
        
    return portfolio_return, weights

#### Assign portfolio buckets

In [11]:
# Portfolio sorts are based on non-micro stocks (i.e., larger than NYSE 20th percentile)
non_micro = jan[jan['size_grp'].isin(['mega', 'large','small'])]

# Initialize the portfolio assigment matrix
ptfassign = jan[['date','permno']].copy()
ptfassign['year'] = ptfassign['date'].dt.year

In [12]:
%%time
# Loop through the charcs
for ch in tqdm(charc_list, desc='Processing', colour='green'):
    
    # Sort non-micro stocks into three groups of equal numbers
    breaks = non_micro.groupby(['date'])[ch].apply(lambda x: get_equal_group_breakpoints(x.values,3)).apply(pd.Series).rename(columns={0: f'{ch}1', 1: f'{ch}2'}).reset_index()
    
    # Merge breakpoints to the original data
    breaks = pd.merge(jan, breaks, how='left', on=['date'])
    
    # Assign portfolio buckets
    conditions = [
        breaks[ch]<=breaks[f'{ch}1'],
        (breaks[ch]>breaks[f'{ch}1']) & (breaks[ch]<=breaks[f'{ch}2']),
        breaks[ch]>breaks[f'{ch}2']
    ]
    choices = [1,2,3] # list(range(1, 4))
    ptfassign[ch] = np.select(conditions, choices, default=np.nan)

Processing: 100%|[32m██████████[0m| 153/153 [01:11<00:00,  2.13it/s]

CPU times: user 48 s, sys: 22.9 s, total: 1min 10s
Wall time: 1min 11s





In [13]:
# Merge back with monthly records
charc['year'] = charc['date'].dt.year
ptfassign = pd.merge(charc[['date','permno','me','ret','size_grp','year']], ptfassign.drop(columns='date'), how='left', on=['permno','year'])
ptfassign

Unnamed: 0,date,permno,me,ret,size_grp,year,age,aliq_at,aliq_mat,ami_126d,...,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,z_score,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,1986-02-28,10000.0,1.610000e+01,-0.257143,micro,1986,,,,,...,,,,,,,,,,
1,1986-03-28,10000.0,1.196000e+01,0.365385,micro,1986,,,,,...,,,,,,,,,,
2,1986-04-30,10000.0,1.633000e+01,-0.098592,micro,1986,,,,,...,,,,,,,,,,
3,1986-05-30,10000.0,1.517200e+01,-0.222656,micro,1986,,,,,...,,,,,,,,,,
4,1986-06-30,10000.0,1.179386e+01,-0.005025,nano,1986,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3658838,2024-08-31,93436.0,7.413801e+05,-0.077390,mega,2024,2.0,3.0,1.0,1.0,...,3.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,1.0,1.0
3658839,2024-09-30,93436.0,6.840044e+05,0.221942,mega,2024,2.0,3.0,1.0,1.0,...,3.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,1.0,1.0
3658840,2024-10-30,93436.0,8.390474e+05,-0.045025,mega,2024,2.0,3.0,1.0,1.0,...,3.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,1.0,1.0
3658841,2024-11-30,93436.0,8.020335e+05,0.381469,mega,2024,2.0,3.0,1.0,1.0,...,3.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,1.0,1.0


#### Compute portfolio returns and weights

In [14]:
def compute_charc_port(ch):

    local_returns_list = []
    local_weights_list = []
    ptfassign_ch = ptfassign[ptfassign[ch]!=2][['date','permno','ret','me','size_grp',ch]].copy()
    groups = ptfassign_ch.dropna(subset=['ret', 'me', ch]).groupby(['date', ch])

    for group_keys, group_df in groups:
        dt, ptfid = group_keys
        group_df = group_df.set_index('permno')
        # Compute the portfolio return and its weights
        ptfret, ptfwts = getPtfRetsWts(group_df, 'ret', 'me', 'cvw')
        local_returns_list.append({
            'date': dt,
            'portfolio': f'{ch}_{ptfid.astype(int)}',
            'wret': ptfret
        })
        
        for permno, wt in ptfwts.items():
            local_weights_list.append({
                'date': dt,
                'portfolio': f'{ch}_{ptfid.astype(int)}',
                'permno': permno,
                'wts': wt
            })
            
    return local_returns_list, local_weights_list

Parallel processing saves a lot of time, but the kernel remains busy after the procedure. This is probably becasue the child processes created by joblib did not shut down cleanly, leading to a deadlock where the main kernel is stuck waiting for child processes that will never respond. Hence, I will need to manually interrupt the kernel to proceed.

In [15]:
%%time
# Parallel processing
with Parallel(n_jobs=-1) as parallel:
    results = parallel(delayed(compute_charc_port)(ch) for ch in charc_list)

# Combine the results from all processes
returns_list = []
weights_list = []
for r_list, w_list in results:
    returns_list.extend(r_list)
    weights_list.extend(w_list)

CPU times: user 12min 39s, sys: 1min 46s, total: 14min 26s
Wall time: 15min 31s


In [15]:
# %%time
# # Store results
# returns_list = []
# weights_list = []

# # Loop through the charcs
# for ch in tqdm(charc_list, desc='Processing', colour='green'):

#     # Focus on the top and bottom decile portfolios
#     ptfassign_ch = ptfassign[ptfassign[ch]!=2][['date','permno','ret','me','size_grp',ch]].copy()

#     # Group the data (dropping missing values)
#     groups = ptfassign_ch.dropna(subset=['ret', 'me', ch]).groupby(['date', ch])

#     # Loop through groups to get both returns and weights
#     for group_keys, group_df in groups:
#         dt, ptfid = group_keys
#         group_df = group_df.set_index('permno')
        
#         # Compute portfolio returns and weights
#         ptfret, ptfwts = getPtfRetsWts(group_df, 'ret', 'me', 'cvw')
        
#         # Append the aggregated return for the group
#         returns_list.append({
#             'date': dt,
#             'portfolio': f'{ch}_{ptfid.astype(int)}',
#             'wret': ptfret
#         })
        
#         # Append the detailed weights Series for all stocks in the group
#         for permno, wt in ptfwts.items():
#             weights_list.append({
#                 'date': dt,
#                 'portfolio': f'{ch}_{ptfid.astype(int)}',
#                 'permno': permno,
#                 'wts': wt
#             })

Processing: 100%|[32m██████████[0m| 153/153 [30:30<00:00, 11.96s/it]

CPU times: user 28min 24s, sys: 1min 56s, total: 30min 21s
Wall time: 30min 30s





Portfolio returns.

In [16]:
# Concatenate to long format
returns_df = pd.DataFrame(returns_list)

# Reshape to wide format
port_unisort = returns_df.pivot(index='date', columns='portfolio', values='wret').reset_index()
port_unisort.columns.name = None

# Numeric date
port_unisort['date'] = pd.to_datetime(port_unisort['date']).dt.year * 100 + pd.to_datetime(port_unisort['date']).dt.month
port_unisort

Unnamed: 0,date,age_1,age_3,aliq_at_1,aliq_at_3,aliq_mat_1,aliq_mat_3,ami_126d_1,ami_126d_3,at_be_1,...,turnover_var_126d_1,turnover_var_126d_3,z_score_1,z_score_3,zero_trades_126d_1,zero_trades_126d_3,zero_trades_21d_1,zero_trades_21d_3,zero_trades_252d_1,zero_trades_252d_3
0,196001,-0.047032,-0.050836,-0.069434,-0.054530,-0.070681,-0.051854,-0.060388,-0.034125,-0.063587,...,-0.060813,-0.047267,,,-0.061824,-0.037269,-0.057823,-0.047078,-0.063355,-0.038492
1,196002,0.023137,0.005951,-0.001587,0.022649,0.011000,0.004022,0.013248,0.006624,-0.005441,...,0.011557,0.013140,,,0.017850,0.008040,0.016760,0.017306,0.017188,0.007046
2,196003,-0.004808,-0.021344,-0.023611,-0.016225,-0.008327,-0.050818,-0.019740,-0.011444,-0.021467,...,-0.018194,-0.016157,,,-0.039341,-0.008124,-0.045374,-0.002540,-0.039737,-0.008587
3,196004,-0.004962,-0.012011,-0.025596,0.003943,-0.012870,-0.013045,-0.011309,-0.010760,-0.023802,...,-0.009918,-0.018789,,,-0.024552,-0.007307,-0.029668,-0.002430,-0.026675,-0.002491
4,196005,0.044509,0.024084,0.019758,0.066012,0.043058,0.023484,0.040704,0.021671,0.031179,...,0.025761,0.035282,,,0.062162,0.022761,0.063672,0.029310,0.054415,0.022333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,202408,0.007426,0.017541,0.022528,0.009962,0.022897,0.001692,0.020682,-0.009375,0.000262,...,0.013790,0.012317,0.016999,0.014259,-0.004875,0.024004,-0.005831,0.023878,-0.002913,0.027029
776,202409,0.026409,0.013624,0.023380,0.013892,0.014199,0.015504,0.019817,0.014928,0.013489,...,0.016517,0.020748,0.026744,0.019296,0.023520,0.015336,0.026022,0.014438,0.024406,0.014579
777,202410,0.008542,-0.015402,-0.016373,-0.004497,-0.016103,-0.004915,-0.009729,-0.000728,-0.017910,...,-0.011797,0.002133,0.006997,-0.017445,-0.003719,-0.008202,-0.000554,-0.009618,-0.002451,-0.009412
778,202411,0.120158,0.064593,0.062062,0.111678,0.065340,0.093281,0.075602,0.100348,0.075756,...,0.069689,0.103682,0.089003,0.081660,0.118860,0.066617,0.124971,0.063439,0.118444,0.062895


Portfolio weights.

In [17]:
%%time
# Concatenate to long format
weights_df = pd.DataFrame(weights_list)  

# Reshape to wide format
port_unisort_wts = weights_df.pivot(index=['date','permno'], columns='portfolio', values='wts').reset_index()
port_unisort_wts.columns.name = None

# Numeric date
port_unisort_wts['date'] = pd.to_datetime(port_unisort_wts['date']).dt.year * 100 + pd.to_datetime(port_unisort_wts['date']).dt.month
port_unisort_wts

CPU times: user 13min 21s, sys: 44.3 s, total: 14min 6s
Wall time: 14min 7s


Unnamed: 0,date,permno,age_1,age_3,aliq_at_1,aliq_at_3,aliq_mat_1,aliq_mat_3,ami_126d_1,ami_126d_3,...,turnover_var_126d_1,turnover_var_126d_3,z_score_1,z_score_3,zero_trades_126d_1,zero_trades_126d_3,zero_trades_21d_1,zero_trades_21d_3,zero_trades_252d_1,zero_trades_252d_3
0,196001,10006.0,,0.001461,,,,0.006098,,,...,,,,,0.002329,,0.002574,,0.002257,
1,196001,10014.0,,0.000209,,,,,,0.000518,...,,,,,0.000333,,0.000368,,0.000322,
2,196001,10022.0,,0.000449,,,,,,0.001115,...,,,,,0.000716,,0.000791,,0.000694,
3,196001,10030.0,,0.001725,,,,,,,...,,,,,0.002750,,,,0.002665,
4,196001,10057.0,,0.000575,,,,,,0.001427,...,,,,,,0.001033,,0.000634,,0.001007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3514788,202412,93397.0,,,0.00008,,,,,0.000270,...,,0.000121,,,,0.000071,,,,0.000071
3514789,202412,93426.0,,,,,,0.000065,,0.000151,...,,,,,,0.000040,,,,0.000040
3514790,202412,93427.0,,,,0.001738,,,,,...,,0.002053,,0.001317,,,,,,
3514791,202412,93434.0,0.000004,,,,,0.000004,,0.000009,...,,0.000004,0.000003,,,0.000002,,0.000002,,0.000002


In [18]:
%%time
# Verification
# Group by date and calculate the sum for each portfolio column
wts_sums = port_unisort_wts.drop(columns='permno').groupby('date').sum()

# Use np.allclose to check if all portfolio weights sum to close to 1
# This is more robust than checking for exact equality with 1
is_verified = np.allclose(wts_sums.replace(0, 1), 1)

if is_verified:
    print("✅ Success! For each date, all portfolio weights sum to 1.")
else:
    print("❌ Failure!")

✅ Success! For each date, all portfolio weights sum to 1.
CPU times: user 17.9 s, sys: 1.76 s, total: 19.7 s
Wall time: 19.7 s


### Pre-specified factors and weights

#### Compute factor returns and weights

In [19]:
%%time
# Initialize the DataFrames
fac = port_unisort[['date']]
fac_wts = port_unisort_wts[['date','permno']]

# Loop through the charcs
for ch in tqdm(charc_list, desc='Processing', colour='green'):

    # Determine the long and short legs
    if port_unisort[f'{ch}_1'].mean() >= port_unisort[f'{ch}_3'].mean():
        fac[ch] = port_unisort[f'{ch}_1'] - port_unisort[f'{ch}_3']
        fac_wts[ch] = port_unisort_wts[f'{ch}_1'].fillna(0) - port_unisort_wts[f'{ch}_3'].fillna(0)
    else:
        fac[ch] = port_unisort[f'{ch}_3'] - port_unisort[f'{ch}_1']
        fac_wts[ch] = port_unisort_wts[f'{ch}_3'].fillna(0) - port_unisort_wts[f'{ch}_1'].fillna(0)
        
    # Replace 0 wts to NaN
    fac_wts.loc[fac_wts[ch] == 0, ch] = np.nan

Processing: 100%|[32m██████████[0m| 153/153 [00:11<00:00, 12.85it/s]

CPU times: user 22.7 s, sys: 5.33 s, total: 28.1 s
Wall time: 11.9 s





Add a market factor.

In [20]:
mktassign = charc[['date','permno','me','ret']]
mktassign['date'] = pd.to_datetime(mktassign['date']).dt.year * 100 + pd.to_datetime(mktassign['date']).dt.month

# Value-weighted returns
mktassign['vwts'] = mktassign.groupby('date')['me'].transform(lambda x: x / x.sum())

# Market factor
mktassign['vwret'] = mktassign['ret']*mktassign['vwts']
mkt = mktassign.groupby('date')['vwret'].sum().reset_index().rename(columns={'vwret': 'mkt'})

In [21]:
# Merge into factor returns and weights
fac = pd.merge(mkt[mkt['date']>=196001], fac, on='date',how='left')
fac_wts = pd.merge(mktassign[['date','permno','vwts']].rename(columns={'vwts': 'mkt'}).dropna(subset='mkt'), fac_wts, on=['date','permno'], how='left')

#### Compare JKP/FF factors and self-constructed factors

In [22]:
# 6 factors from JKP
jkp = pd.read_csv('../Data/raw/JKP_factors_monthly.csv') 
jkp = jkp.pivot(index='date', columns='name', values='ret').reset_index() # long to wide
jkp = jkp[['date','market_equity','be_me','ope_be','at_gr1','ret_12_1']]
jkp['date'] = pd.to_datetime(jkp['date']).dt.year * 100 + pd.to_datetime(jkp['date']).dt.month
jkp

name,date,market_equity,be_me,ope_be,at_gr1,ret_12_1
0,192601,0.027025,,,,
1,192602,-0.029599,,,,
2,192603,-0.051052,,,,
3,192604,0.005405,,,,
4,192605,-0.005611,,,,
...,...,...,...,...,...,...
1183,202408,-0.050941,-0.020158,0.012547,-0.009432,0.029552
1184,202409,-0.018374,-0.019773,-0.006242,0.003045,0.010313
1185,202410,0.001539,-0.004121,-0.016460,-0.015150,0.024211
1186,202411,0.019527,-0.015056,-0.033224,-0.029310,0.031126


In [23]:
# Compare to JKP factors
for ch in ['market_equity','be_me','ope_be','at_gr1','ret_12_1']:
    print(jkp.set_index('date')[ch].corr(fac.set_index('date')[ch]))

0.9617274553258629
0.9491408556933988
0.9575032177059046
0.888130689295986
0.7860240773081487


In [24]:
# 6 factors from Fama-French
ff6 = pd.read_csv('../Data/raw/FF6_factors_monthly.csv')
ff6['MKT'] = ff6['Mkt-RF'] + ff6['RF']
ff6 = ff6.drop(columns=['Mkt-RF','RF'])
ff6.loc[:, ff6.columns != 'date'] = ff6.loc[:, ff6.columns != 'date'] / 100 # remove percentage
ff6

Unnamed: 0,date,SMB,HML,RMW,CMA,MOM,MKT
0,196307,-0.0048,-0.0081,0.0064,-0.0115,0.0101,-0.0012
1,196308,-0.0080,0.0170,0.0040,-0.0038,0.0100,0.0533
2,196309,-0.0043,0.0000,-0.0078,0.0015,0.0012,-0.0130
3,196310,-0.0134,-0.0004,0.0279,-0.0225,0.0313,0.0283
4,196311,-0.0085,0.0173,-0.0043,0.0227,-0.0078,-0.0059
...,...,...,...,...,...,...,...
733,202408,-0.0355,-0.0110,0.0075,0.0082,0.0481,0.0209
734,202409,-0.0092,-0.0277,0.0018,-0.0029,-0.0062,0.0213
735,202410,-0.0088,0.0086,-0.0142,0.0098,0.0296,-0.0061
736,202411,0.0460,0.0015,-0.0230,-0.0205,0.0101,0.0689


In [25]:
# Compare to JKP factors
charc_list_FF = ['MKT','SMB','HML','RMW','CMA','MOM']
for i, ch in enumerate(['mkt','market_equity','be_me','ope_be','at_gr1','ret_12_1']):
    print(ff6.set_index('date')[charc_list_FF[i]].corr(fac.set_index('date')[ch]))

0.9991895845789045
0.9102969900529643
0.8621332846723144
0.873955850259698
0.7798650093351083
0.7530529775762329


The least correlation occurs for the momentum factor becasue both JKP and FF update MOM on a monthly basis, while I only form portfolios and factors at the end of each December.

### Characteristics Processing

#### Keep charcs whose factors are available during 1960-2024

I end up using the remaining 136 characteristics.

In [26]:
fac = fac.dropna(axis=1)
charc_list = [col for col in fac.columns if col not in ['date','mkt']]
len(charc_list)

136

In [27]:
# Update univariate-sorted portfolios returns
cols_to_keep = [f"{ch}_1" for ch in charc_list] + [f"{ch}_3" for ch in charc_list]
port_unisort = port_unisort[['date'] + cols_to_keep]
port_unisort.to_csv('/work/rw196/data/ALL/port.csv', index=False)
port_unisort

Unnamed: 0,date,age_1,aliq_at_1,aliq_mat_1,ami_126d_1,at_be_1,at_gr1_1,at_me_1,at_turnover_1,be_gr1a_1,...,seas_6_10na_3,taccruals_at_3,taccruals_ni_3,tangibility_3,tax_gr1a_3,turnover_126d_3,turnover_var_126d_3,zero_trades_126d_3,zero_trades_21d_3,zero_trades_252d_3
0,196001,-0.047032,-0.069434,-0.070681,-0.060388,-0.063587,-0.058072,-0.069614,-0.042738,-0.058721,...,-0.072839,-0.054950,-0.059555,-0.072869,-0.050325,-0.058638,-0.047267,-0.037269,-0.047078,-0.038492
1,196002,0.023137,-0.001587,0.011000,0.013248,-0.005441,-0.000771,0.018321,0.009373,0.000393,...,0.003138,-0.000625,-0.000952,-0.004252,0.030932,0.017396,0.013140,0.008040,0.017306,0.007046
2,196003,-0.004808,-0.023611,-0.008327,-0.019740,-0.021467,-0.027271,-0.009326,0.002598,-0.034723,...,-0.029674,-0.014980,-0.020649,-0.022520,-0.005391,-0.041011,-0.016157,-0.008124,-0.002540,-0.008587
3,196004,-0.004962,-0.025596,-0.012870,-0.011309,-0.023802,-0.012569,-0.004867,-0.006033,-0.023422,...,-0.025013,-0.019918,-0.014922,-0.024171,0.001462,-0.023671,-0.018789,-0.007307,-0.002430,-0.002491
4,196005,0.044509,0.019758,0.043058,0.040704,0.031179,0.020258,0.060285,0.016005,0.020508,...,0.032680,0.037148,0.036444,0.022104,0.056530,0.059767,0.035282,0.022761,0.029310,0.022333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,202408,0.007426,0.022528,0.022897,0.020682,0.000262,0.010976,0.019784,0.014754,0.015884,...,0.021574,0.008771,0.007986,0.003213,0.012435,-0.004919,0.012317,0.024004,0.023878,0.027029
776,202409,0.026409,0.023380,0.014199,0.019817,0.013489,0.019309,0.024078,0.017015,0.022434,...,0.013495,0.021671,0.021279,0.014702,0.014579,0.023523,0.020748,0.015336,0.014438,0.014579
777,202410,0.008542,-0.016373,-0.016103,-0.009729,-0.017910,-0.006667,-0.010337,0.010780,-0.003724,...,-0.017993,-0.012164,-0.016109,-0.005905,-0.008569,-0.003717,0.002133,-0.008202,-0.009618,-0.009412
778,202411,0.120158,0.062062,0.065340,0.075602,0.075756,0.081271,0.088793,0.091281,0.090722,...,0.073068,0.082653,0.078910,0.082868,0.087287,0.118841,0.103682,0.066617,0.063439,0.062895


In [28]:
# Update univariate-sorted portfolios weights
port_unisort_wts = port_unisort_wts[['date','permno'] + cols_to_keep]
port_unisort_wts.to_parquet('/work/rw196/data/ALL/port_wts.parquet', engine='pyarrow')
port_unisort_wts

Unnamed: 0,date,permno,age_1,aliq_at_1,aliq_mat_1,ami_126d_1,at_be_1,at_gr1_1,at_me_1,at_turnover_1,...,seas_6_10na_3,taccruals_at_3,taccruals_ni_3,tangibility_3,tax_gr1a_3,turnover_126d_3,turnover_var_126d_3,zero_trades_126d_3,zero_trades_21d_3,zero_trades_252d_3
0,196001,10006.0,,,,,0.003561,,,,...,0.001858,,0.003679,0.003253,,0.002580,,,,
1,196001,10014.0,,,,,,,,,...,,,,,,0.000368,,,,
2,196001,10022.0,,,,,,,,,...,,,,,,0.000793,,,,
3,196001,10030.0,,,,,,,,,...,,,,,,0.003046,,,,
4,196001,10057.0,,,,,,,,,...,0.000731,,,,,,,0.001033,0.000634,0.001007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3514788,202412,93397.0,,0.00008,,,0.000096,0.000095,,,...,0.000105,0.000084,0.000082,,0.000078,,0.000121,0.000071,,0.000071
3514789,202412,93426.0,,,,,0.000054,,,,...,0.000059,0.000047,0.000046,0.000061,0.000043,,,0.000040,,0.000040
3514790,202412,93427.0,,,,,0.001627,,0.001129,,...,0.001783,0.001431,0.001400,0.001864,0.001318,,0.002053,,,
3514791,202412,93434.0,0.000004,,,,,,,,...,,0.000003,0.000003,,,,0.000004,0.000002,0.000002,0.000002


In [29]:
# Update factor returns
fac.to_csv('/work/rw196/data/ALL/fac.csv', index=False)
fac

Unnamed: 0,date,mkt,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,at_turnover,...,seas_6_10na,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,196001,-0.066326,-0.003805,-0.014904,0.018827,0.026264,0.000516,-0.002328,0.029854,-0.014650,...,0.037136,-0.016555,-0.011538,-0.012014,0.019637,0.013319,0.013546,0.024554,0.010745,0.024863
1,196002,0.014506,-0.017186,-0.024236,-0.006978,-0.006623,0.027669,-0.020722,-0.013344,0.023110,...,0.009387,0.014400,0.019653,-0.022051,0.033315,-0.003439,0.001584,-0.009810,0.000546,-0.010142
2,196003,-0.012739,-0.016536,-0.007386,-0.042492,0.008296,-0.001878,-0.020172,-0.009212,-0.026398,...,0.016789,-0.028961,-0.016577,-0.005659,0.028135,0.040148,0.002037,0.031216,0.042834,0.031150
3,196004,-0.015187,-0.007049,-0.029539,-0.000175,0.000549,0.016022,-0.010853,0.000748,0.000857,...,0.017616,0.006858,0.006004,-0.017920,0.029906,0.021199,-0.008871,0.017245,0.027238,0.024184
4,196005,0.034137,-0.020425,-0.046254,-0.019574,-0.019032,0.008592,-0.023093,-0.050051,0.041204,...,-0.006972,-0.011593,-0.006931,-0.032269,0.030661,-0.035743,0.009521,-0.039401,-0.034362,-0.032082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,202408,0.021757,0.010115,0.012566,-0.021205,-0.030057,0.021849,-0.004602,-0.008796,-0.012887,...,-0.019945,0.004030,0.013897,-0.026233,0.000982,0.028964,-0.001473,0.028880,0.029709,0.029942
776,202409,0.020778,-0.012784,0.009488,0.001305,-0.004889,0.006770,0.002830,-0.007496,0.002218,...,0.006525,0.001691,-0.000900,-0.004900,-0.004655,-0.008174,0.004231,-0.008184,-0.011584,-0.009827
777,202410,-0.006856,-0.023943,-0.011877,0.011188,0.009001,0.022456,0.001394,0.012422,-0.039773,...,0.011068,0.008863,0.018310,0.005529,0.012005,-0.004480,0.013930,-0.004484,-0.009064,-0.006961
778,202411,0.067769,-0.055565,-0.049616,0.027941,0.024746,0.020035,-0.008820,0.003387,-0.012774,...,0.004542,0.007711,0.012744,0.014406,0.026132,-0.052214,0.033993,-0.052243,-0.061531,-0.055549


In [30]:
# Update factor weights
fac_wts = fac_wts[['date','permno','mkt'] + charc_list]
fac_wts.to_parquet('/work/rw196/data/ALL/fac_wts.parquet', engine='pyarrow')
fac_wts

Unnamed: 0,date,permno,mkt,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,...,seas_6_10na,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,198602,10000.0,0.000007,,,,,,,,...,,,,,,,,,,
1,198603,10000.0,0.000005,,,,,,,,...,,,,,,,,,,
2,198604,10000.0,0.000007,,,,,,,,...,,,,,,,,,,
3,198605,10000.0,0.000006,,,,,,,,...,,,,,,,,,,
4,198606,10000.0,0.000005,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3656456,202408,93436.0,0.012662,,-0.004357,-0.002927,-0.001643,-0.004006,-0.003475,-0.002830,...,-0.004413,-0.003511,-0.003438,0.004615,,-0.004395,-0.002301,-0.004397,-0.004706,-0.004199
3656457,202409,93436.0,0.011459,,-0.004430,-0.002963,-0.001650,-0.004087,-0.003520,-0.002868,...,-0.004491,-0.003557,-0.003488,0.004718,,-0.004524,-0.002326,-0.004525,-0.004828,-0.004304
3656458,202410,93436.0,0.013787,,-0.004445,-0.002971,-0.001658,-0.004086,-0.003519,-0.002856,...,-0.004496,-0.003595,-0.003520,0.004711,,-0.004579,-0.002335,-0.004580,-0.004873,-0.004365
3656459,202411,93436.0,0.013288,,-0.004429,-0.002976,-0.001656,-0.004114,-0.003520,-0.002854,...,-0.004477,-0.003577,-0.003503,0.004718,,-0.004537,-0.002342,-0.004538,-0.004814,-0.004328


In [31]:
# Update individual characteristics
charc = charc[['date','permno','size_grp','sic','ret']+charc_list]
charc['date'] = charc['date'].dt.year * 100 + charc['date'].dt.month

In [32]:
# Size groups
size_grp = charc[['date','permno','size_grp']].copy()
size_grp.to_parquet('/work/rw196/data/ALL/size_grp.parquet', engine='pyarrow')

In [33]:
# Individual returns
ret_long = charc[['date','permno','ret']].copy()
ret_long.to_parquet('/work/rw196/data/ALL/ret_long.parquet', engine='pyarrow')
ret = ret_long.pivot(index='date', columns='permno', values='ret') 
ret.to_parquet('/work/rw196/data/ALL/ret.parquet', engine='pyarrow')
ret

permno,10000.0,10001.0,10002.0,10003.0,10005.0,10006.0,10007.0,10008.0,10009.0,10010.0,...,93426.0,93427.0,93428.0,93429.0,93430.0,93432.0,93433.0,93434.0,93435.0,93436.0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
196001,,,,,,0.005155,,,,,...,,,,,,,,,,
196002,,,,,,0.046154,,,,,...,,,,,,,,,,
196003,,,,,,-0.059553,,,,,...,,,,,,,,,,
196004,,,,,,-0.081794,,,,,...,,,,,,,,,,
196005,,,,,,0.048851,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202408,,,,,,,,,,,...,-0.194809,0.104688,,,,,,-0.316753,,-0.077390
202409,,,,,,,,,,,...,-0.061934,-0.029592,,,,,,-0.329953,,0.221942
202410,,,,,,,,,,,...,-0.103861,0.019159,,,,,,-0.345661,,-0.045025
202411,,,,,,,,,,,...,-0.010340,-0.026518,,,,,,2.065217,,0.381469


In [34]:
# Save the raw characteristics without transformating or filling NaNs
charc = charc[['date','permno','sic']+charc_list]
charc.to_parquet('/work/rw196/data/ALL/charc_raw.parquet', engine='pyarrow')
charc

Unnamed: 0,date,permno,sic,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,...,seas_6_10na,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,198602,10000.0,3990.0,25.0,,,,,,,...,,,,,,,,,0.005172,
1,198603,10000.0,3990.0,26.0,,,,,,0.060201,...,,,,0.365969,,,,,0.007663,
2,198604,10000.0,3990.0,27.0,,,,,,0.044091,...,,,,0.365969,,,,,0.007436,
3,198605,10000.0,3990.0,28.0,,,,,,0.047456,...,,,,0.365969,,0.000787,1.086926,0.007351,0.007655,
4,198606,10000.0,3990.0,29.0,,,,1.835994,,0.107259,...,,,,0.757813,,0.000763,1.122440,0.007469,0.007128,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3658838,202408,93436.0,3711.0,235.0,0.849340,0.083090,0.000001,1.696635,0.257886,0.147328,...,0.005636,0.062915,0.504812,0.647934,-0.054016,0.031009,0.334198,0.000703,0.000453,0.000633
3658839,202409,93436.0,3711.0,236.0,0.849340,0.085964,0.000002,1.696635,0.257886,0.159686,...,0.009071,0.062915,0.504812,0.647934,-0.054016,0.029518,0.360574,0.000715,0.000828,0.000661
3658840,202410,93436.0,3711.0,237.0,0.849340,0.088406,0.000002,1.696635,0.257886,0.130179,...,0.005787,0.062915,0.504812,0.647934,-0.054016,0.028794,0.372432,0.000715,0.000826,0.000695
3658841,202411,93436.0,3711.0,238.0,0.851467,0.113783,0.000001,1.697539,0.245510,0.140682,...,0.011291,0.021696,0.197611,0.662048,-0.052716,0.027153,0.382452,0.000831,0.001147,0.000753


#### Fill missing characteristics

In [2]:
# charc = pd.read_parquet('/work/rw196/data/ALL/charc_raw.parquet')
# charc_list = [col for col in charc.columns if col not in ['date','permno','sic']]
# charc

Unnamed: 0,date,permno,sic,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,...,seas_6_10na,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,198602,10000.0,3990.0,25.0,,,,,,,...,,,,,,,,,0.005172,
1,198603,10000.0,3990.0,26.0,,,,,,0.060201,...,,,,0.365969,,,,,0.007663,
2,198604,10000.0,3990.0,27.0,,,,,,0.044091,...,,,,0.365969,,,,,0.007436,
3,198605,10000.0,3990.0,28.0,,,,,,0.047456,...,,,,0.365969,,0.000787,1.086926,0.007351,0.007655,
4,198606,10000.0,3990.0,29.0,,,,1.835994,,0.107259,...,,,,0.757813,,0.000763,1.122440,0.007469,0.007128,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3658838,202408,93436.0,3711.0,235.0,0.849340,0.083090,0.000001,1.696635,0.257886,0.147328,...,0.005636,0.062915,0.504812,0.647934,-0.054016,0.031009,0.334198,0.000703,0.000453,0.000633
3658839,202409,93436.0,3711.0,236.0,0.849340,0.085964,0.000002,1.696635,0.257886,0.159686,...,0.009071,0.062915,0.504812,0.647934,-0.054016,0.029518,0.360574,0.000715,0.000828,0.000661
3658840,202410,93436.0,3711.0,237.0,0.849340,0.088406,0.000002,1.696635,0.257886,0.130179,...,0.005787,0.062915,0.504812,0.647934,-0.054016,0.028794,0.372432,0.000715,0.000826,0.000695
3658841,202411,93436.0,3711.0,238.0,0.851467,0.113783,0.000001,1.697539,0.245510,0.140682,...,0.011291,0.021696,0.197611,0.662048,-0.052716,0.027153,0.382452,0.000831,0.001147,0.000753


In [None]:
# Inspect characteristics with the most missing values
# charc[charc_list].isna().sum().sort_values(ascending = False).head(10)

Fill missing values with the _cross-sectional median_ by industry at each month. 

In [3]:
%%time
# Create 'sic2' and fill missing values with 99 (nonclassifiable establishmnt)
charc['sic2'] = pd.to_numeric(charc['sic'], errors='coerce').floordiv(100).astype('Int64')
charc['sic2'] = charc['sic2'].fillna(99) 

def fill_missing_charc(df, char_name):
    # Create a copy to avoid modifying the original slice
    char_series = df[char_name].copy()
    # Step 1: Fill NaNs using (date, sic2) medians
    fill_values = df.loc[df['sic2'] != 99].groupby(['date', 'sic2'])[char_name].transform('median')
    char_series.fillna(fill_values, inplace=True)
    # Step 2: Fill any remaining NaNs using the cross-sectional median
    fill_values_cs = df.groupby('date')[char_name].transform('median')
    char_series.fillna(fill_values_cs, inplace=True)
    return char_series

# Parallel processing
with Parallel(n_jobs=-1) as parallel:
    results_list = parallel(delayed(fill_missing_charc)(charc, ch) for ch in charc_list)
                            
charc_filled = pd.concat(results_list, axis=1)
charc[charc_list] = charc_filled

CPU times: user 12.5 s, sys: 10.3 s, total: 22.7 s
Wall time: 1min 32s


In [None]:
# %%time
# # Fill NaNs simply with cross-sectional median
# for ch in tqdm(charc_list, desc='Processing', colour='green'):
#      charc[ch] = charc.groupby('date')[ch].transform(lambda x: x.fillna(x.median()))

In [4]:
# NO missing characteristics after the nan-filling
assert charc[charc_list].isna().sum().sum() == 0, "There are still missing values in some characteristics!"

#### Transform the stock characteristics

[Kelly, Pruitt, and Su (2019)](https://www.sciencedirect.com/science/article/abs/pii/S0304405X19301151), [Gu, Kelly, and Xiu (2021)](https://www.sciencedirect.com/science/article/abs/pii/S0304407620301998), and some other papers map the characteristics into the $(-1,1)$ intervel. [Kozak, Nagel, and Santosh (2020)](https://www.sciencedirect.com/science/article/abs/pii/S0304405X19301655) further normalize the ranks such that the characteristics-managed portfolios are zero-investment and have fixed leverage. 

1. For each period, rank all stock characteristics across the cross-section with ties. This transformation helps reduce the influence of outliers on the results.
2. Normalize each rank-transformed characteristic by first subtracting its cross-sectional mean and then dividing by the sum of the absolute deviations from this mean across all stocks. This ensures the resulting portfolio is zero-investment and fully allocated, with weights summing in absolute value to one (i.e., 50% long and 50% short, 1x gross leverage).

Since I follow the first approach that simply transforms the characteristics into the $(-1,1)$ intervel for two reasons:
- Since the characteristics are mainly used for estimation, it is not necessary to produce interpretable characteristics-managed portfolios. These portfolios may not be economically interesting because they consist of all existing stocks with minimal weights.
- KNS procedure produces very small values after transformation as they can be interpreted as portfolio weights. The tiny characteristics values unstablize model estimation. 

Note that the transformed characteristics are only used for model estimation. 

In [5]:
%%time
def rank_and_normalize_by_group(df, char_name):
    # Group by date FIRST, then rank the specific characteristic series
    ranks = df.groupby('date')[char_name].rank(pct=True)
    # Normalize the ranked series
    normalized_series = 2 * (ranks - 0.5)
    return normalized_series

# Process in parallel
with Parallel(n_jobs=-1) as parallel:
    results_list = parallel(delayed(rank_and_normalize_by_group)(charc, ch) for ch in charc_list)
                            
charc_filled = pd.concat(results_list, axis=1)
charc[charc_list] = charc_filled
charc

CPU times: user 10.7 s, sys: 7.37 s, total: 18.1 s
Wall time: 37.4 s


Unnamed: 0,date,permno,sic,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,...,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,zero_trades_126d,zero_trades_21d,zero_trades_252d,sic2
0,198602,10000.0,3990.0,-0.832030,0.358892,0.566304,0.163040,0.102856,-0.093676,-0.141959,...,-0.236484,-0.189901,0.329310,-0.532302,-0.097756,0.005610,0.047093,-0.136688,0.196702,39
1,198603,10000.0,3990.0,-0.822200,0.435152,0.558559,0.109808,0.167942,0.182560,-0.985382,...,-0.224035,-0.184600,-0.837498,-0.482747,-0.054734,0.064933,0.041815,0.210267,0.085671,39
2,198604,10000.0,3990.0,-0.811471,0.444426,0.454947,0.302562,0.165111,0.221449,-0.991515,...,-0.267945,-0.290344,-0.838113,-0.290005,0.084677,0.077550,0.107416,0.221449,0.186832,39
3,198605,10000.0,3990.0,-0.798883,0.233621,0.440494,0.018283,-0.032165,-0.280684,-0.986795,...,-0.305400,-0.271542,-0.892331,0.428983,-0.438971,-0.357373,-0.086169,0.222617,0.290164,39
4,198606,10000.0,3990.0,-0.778114,0.298874,0.522105,0.229282,-0.248613,-0.292822,-0.945873,...,-0.385611,-0.396705,0.562279,0.432342,-0.468482,-0.311145,-0.075139,0.124559,0.146243,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3658838,202408,93436.0,3711.0,0.073115,0.719657,-0.931899,-0.998329,-0.397535,0.765197,-0.928557,...,0.780238,0.701274,0.268018,-0.958220,0.796950,-0.986213,-0.798203,-0.876332,-0.827449,37
3658839,202409,93436.0,3711.0,0.072880,0.718534,-0.934660,-0.998325,-0.395602,0.761675,-0.912042,...,0.778429,0.705969,0.268272,-0.958115,0.788901,-0.973194,-0.789319,-0.780105,-0.818639,37
3658840,202410,93436.0,3711.0,0.073739,0.717227,-0.942017,-0.998319,-0.395378,0.761345,-0.931933,...,0.775630,0.704202,0.271849,-0.957563,0.787815,-0.961765,-0.788655,-0.794538,-0.809664,37
3658841,202411,93436.0,3711.0,0.076098,0.714526,-0.903721,-0.998318,-0.380702,0.757410,-0.926845,...,0.648098,0.613622,0.312592,-0.956695,0.771284,-0.959638,-0.772125,-0.787681,-0.788522,37


In [20]:
# %%time
# # KNS procedure
# def scale_and_normalize(s, ch):
#     """
#     Performs a two-step normalization on a pandas Series. 
#     -> s_normalized = (rk - mean(rk)) / sum(abs(rk - mean(rk)))
#     1. Rank the characteristic with ties.
#     2. Centers the ranked characteristic and divides by the sum of absolute deviations.
#     """
#     # Step 1: cross-sectionally rank the characteristic with ties
#     charc_ranks = s.rank(method='average')

#     # Step 2: Normalize by sum of absolute deviations
#     mean_ranks = charc_ranks.mean()
#     centered_ranks = charc_ranks - mean_ranks
#     scaler = np.abs(centered_ranks).sum()
    
#     # Handle the edge case where all values are the same (scaler is zero)
#     if scaler == 0:
#         print(f"Warning: Characteristic '{ch}' has all identical values for this date group. Returning NaNs.")
#         return np.full(s.shape, np.nan)
#     else:
#         return centered_ranks / scaler

# # Apply scaling and normalization
# for ch in tqdm(charc_list, desc='Processing', colour='green'):
#     charc[ch] = charc.groupby('date')[ch].transform(lambda x: scale_and_normalize(x, ch))

Processing: 100%|[32m██████████[0m| 136/136 [02:25<00:00,  1.07s/it]

CPU times: user 2min 24s, sys: 81.2 ms, total: 2min 24s
Wall time: 2min 25s





In [21]:
# # Verification
# # Group by date and calculate the sum for each characteristic column
# wts_sums = charc.groupby('date')[charc_list].sum()
# wts_abs_sums = charc.groupby('date')[charc_list].apply(lambda x: x.abs().sum())

# # Use np.allclose to check if all values in the DataFrame are close to zero.
# # This is more robust than checking for exact equality with 0
# is_verified = np.allclose(wts_sums, 0) and np.allclose(wts_abs_sums, 1)

# if is_verified:
#     print("✅ Success! For each date, all columns sum to 0 AND their absolute values sum to 1.")
# else:
#     print("❌ Failure! One or both conditions were not met.")

✅ Success! For each date, all columns sum to 0 AND their absolute values sum to 1.


In [6]:
# Save the characteristics after transformation
charc = charc.drop(columns=['sic','sic2'])
charc.to_parquet('/work/rw196/data/ALL/charc.parquet', engine='pyarrow')
charc

Unnamed: 0,date,permno,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,at_turnover,...,seas_6_10na,taccruals_at,taccruals_ni,tangibility,tax_gr1a,turnover_126d,turnover_var_126d,zero_trades_126d,zero_trades_21d,zero_trades_252d
0,198602,10000.0,-0.832030,0.358892,0.566304,0.163040,0.102856,-0.093676,-0.141959,0.465998,...,-0.116117,-0.236484,-0.189901,0.329310,-0.532302,-0.097756,0.005610,0.047093,-0.136688,0.196702
1,198603,10000.0,-0.822200,0.435152,0.558559,0.109808,0.167942,0.182560,-0.985382,0.471868,...,0.098249,-0.224035,-0.184600,-0.837498,-0.482747,-0.054734,0.064933,0.041815,0.210267,0.085671
2,198604,10000.0,-0.811471,0.444426,0.454947,0.302562,0.165111,0.221449,-0.991515,0.453928,...,0.182420,-0.267945,-0.290344,-0.838113,-0.290005,0.084677,0.077550,0.107416,0.221449,0.186832
3,198605,10000.0,-0.798883,0.233621,0.440494,0.018283,-0.032165,-0.280684,-0.986795,0.373117,...,-0.025055,-0.305400,-0.271542,-0.892331,0.428983,-0.438971,-0.357373,-0.086169,0.222617,0.290164
4,198606,10000.0,-0.778114,0.298874,0.522105,0.229282,-0.248613,-0.292822,-0.945873,-0.626492,...,0.253824,-0.385611,-0.396705,0.562279,0.432342,-0.468482,-0.311145,-0.075139,0.124559,0.146243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3658838,202408,93436.0,0.073115,0.719657,-0.931899,-0.998329,-0.397535,0.765197,-0.928557,0.573010,...,-0.657823,0.780238,0.701274,0.268018,-0.958220,0.796950,-0.986213,-0.798203,-0.876332,-0.827449
3658839,202409,93436.0,0.072880,0.718534,-0.934660,-0.998325,-0.395602,0.761675,-0.912042,0.570681,...,-0.425759,0.778429,0.705969,0.268272,-0.958115,0.788901,-0.973194,-0.789319,-0.780105,-0.818639
3658840,202410,93436.0,0.073739,0.717227,-0.942017,-0.998319,-0.395378,0.761345,-0.931933,0.569328,...,-0.660084,0.775630,0.704202,0.271849,-0.957563,0.787815,-0.961765,-0.788655,-0.794538,-0.809664
3658841,202411,93436.0,0.076098,0.714526,-0.903721,-0.998318,-0.380702,0.757410,-0.926845,0.546353,...,0.577885,0.648098,0.613622,0.312592,-0.956695,0.771284,-0.959638,-0.772125,-0.787681,-0.788522
