In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Unifies the GHZ (2017) stock zoo used by both papers and tags usage for Goyenko & Zhang (2022) as “Both (alone & with options),”
# reflecting that they run stock-only and combined (stock+option)
pd.read_csv('../factor_zoo/combined_stock_characteristics.csv')

In [None]:
# Reconstructs the GHZ (2017) “factor zoo” as an expanded dictionary 
#(≈90 entries here; GHZ report 94). It follows the table labeling in the appendix
# of Goyenko & Zhang
pd.read_csv('factor_zoo/ghz_factor_zoo_full.csv')

In [4]:
# Full Option Zoo from Goyenko & Zhang (9 option characteristics + 15 risk-neutral moments) labeled Type=Option,
# then appends the GHZ stock zoo labeled Type=Stock
feature_zoo = pd.read_csv('../factor_zoo/option_zoo_plus_factor_zoo.csv')
option_zoo = (feature_zoo[feature_zoo['Type']=='Option'].drop(['Type', 'Reference'], axis=1)
        .rename(columns={'Characteristic': 'charcteristic', 'Group': 'category'}))  
option_zoo

Unnamed: 0,charcteristic,category
0,avg_impl_volatility,ATM implied volatility
1,cvol,Change in call IV
2,pvol,Change in put IV
3,cw,Call–Put IV spread
4,hviv,Hist vol – Implied vol
5,med_hq,Borrowing fee (PC parity)
6,opt_baspread,Option bid-ask illiquidity
7,os,Option-to-stock volume ratio
8,skewness,Volatility skew
9,volq1,RN variance (1m)


In [2]:
acc_ratios = pd.read_csv('../data/acc_ratios.csv')
acc_ratios

Unnamed: 0,Variable,Description
0,netdebt_me,Net debt-to-price
1,rd_sale,R&D-to-sales
2,ebit_bev,Return on net operating assets
3,ebit_sale,Profit margin
4,ni_be,Return on equity
5,niq_be,Quarterly return on equity
6,ocf_at,Operating cash flow to assets
7,gp_at,Gross profits-to-assets
8,niq_at,Quarterly return on assets
9,op_at,Operating profits-to-book assets


In [5]:
stock_zoo =  ['be_me','market_equity','turnover_126d','gp_at','cop_at','inv_gr1','noa_at','ivol_capm_21d'] + ['size_log_me', 'bm','rev_1m','mom_7_12','inv_at','ivol','rvol','turnover','gp_at','op_at','ebit_sale','f_score','rd_me','inv_gr1','capx_gr1','capx_gr2','capx_gr3','noa_gr1a','ret_6_1','ret_9_1','ret_12_1','ret_12_7','market_equity','turnover_126d','bidaskhl_21d','zero_trades_126d']

lst_to_fetch = option_zoo['charcteristic'].to_list() + stock_zoo + acc_ratios['Variable'].to_list()

In [20]:
feature_glossary = pd.read_csv('../stock_char_appendix/table1.csv')
feature_glossary[feature_glossary['Acronym'].isin(lst_to_fetch)].sort_values('Acronym')

Unnamed: 0,Feature,Acronym,Reference
9,Book-to-market equity,be_me,Rosenberg Reid and Lanstein (1985)
15,The high-low bid-ask spread,bidaskhl_21d,Corwin and Schultz (2012)
18,CAPEX growth (2 years),capx_gr2,Anderson and Garcia-Feijoo (2006)
19,CAPEX growth (3 years),capx_gr3,Anderson and Garcia-Feijoo (2006)
24,Cash-based operating profits-to-book assets,cop_at,
33,Dividend yield,div12m_me,Litzenberger and Ramaswamy (1979)
40,Return on net operating assets,ebit_bev,Soliman (2008)
41,Profit margin,ebit_sale,Soliman (2008)
49,Pitroski F-score,f_score,Piotroski (2000)
52,Gross profits-to-assets,gp_at,Novy-Marx (2013)


In [7]:
ml_data = pd.read_csv('../data/ml_data.csv')

In [25]:
ml_data.tic.nunique()

1568

In [13]:
import random
random.shuffle(lst_to_fetch)
dct = {'charc_to_fetch': lst_to_fetch,
       'tic' : ml_data['tic'].unique().tolist()
       }

In [14]:
dct

{'charc_to_fetch': ['capx_gr1',
  'kurtq2',
  'capx_gr2',
  'capx_gr3',
  'skewness',
  'f_score',
  'inv_gr1',
  'skewq4',
  'op_at',
  'div12m_me',
  'market_equity',
  'ret_12_1',
  'noa_at',
  'kurtq4',
  'cw',
  'sale_me',
  'gp_at',
  'size_log_me',
  'kurtq5',
  'niq_be',
  'ni_me',
  'mom_7_12',
  'cop_at',
  'rev_1m',
  'ret_6_1',
  'volq6',
  'turnover_126d',
  'inv_gr1',
  'ebit_sale',
  'skewq6',
  'skewq1',
  'skewq5',
  'ebit_bev',
  'bidaskhl_21d',
  'pvol',
  'inv_at',
  'volq4',
  'avg_impl_volatility',
  'opt_baspread',
  'kurtq1',
  'noa_gr1a',
  'volq1',
  'netdebt_me',
  'rd_me',
  'ni_be',
  'ret_9_1',
  'gp_at',
  'ocf_at',
  'be_me',
  'turnover',
  'ivol_capm_21d',
  'rd_me',
  'market_equity',
  'be_me',
  'volq5',
  'skewq2',
  'ebit_sale',
  'kurtq6',
  'turnover_126d',
  'ret_12_7',
  'med_hq',
  'ivol',
  'rvol',
  'rd_sale',
  'volq2',
  'hviv',
  'cvol',
  'zero_trades_126d',
  'gp_at',
  'op_at',
  'os',
  'niq_at',
  'bm'],
 'tic': ['JJSF',
  'PLXS',
 

In [19]:
import json

# Assuming dct is your dictionary
with open('../data/fetch_frm_wrds.json', 'w') as f:
    json.dump(dct, f)

In [20]:
with open('../data/fetch_frm_wrds.json', 'r') as f:
    dct_loaded = json.load(f)

In [23]:
dct_loaded.keys()


dict_keys(['charc_to_fetch', 'tic'])

In [8]:
# 25 stock variables drawn from Gu, Kelly & Xiu (2020) and Green, Hand & Zhang (2017)
stock_char_short = pd.read_csv('../factor_zoo/stock_characteristics.csv')
stock_char_short

Unnamed: 0,Characteristic,Reference,Group
0,mom1m,"Gu, Kelly & Xiu (2020); Green, Hand & Zhang (2...",Price trends / reversals
1,mom6m,"Green, Hand & Zhang (2017)",Price trends
2,mom12m,"Gu, Kelly & Xiu (2020); Green, Hand & Zhang (2...",Price trends
3,mom36m,"Gu, Kelly & Xiu (2020)",Price trends (long-term reversal)
4,chmom,"Gu, Kelly & Xiu (2020)",Price trends
5,indmom,"Gu, Kelly & Xiu (2020)",Industry momentum
6,maxret,"Gu, Kelly & Xiu (2020)",Price trends / extreme past return
7,turn,"Gu, Kelly & Xiu (2020); Green, Hand & Zhang (2...",Liquidity / turnover
8,std_turn,"Gu, Kelly & Xiu (2020)",Liquidity / turnover volatility
9,mvel1,"Gu, Kelly & Xiu (2020)",Size / liquidity


In [None]:
factor_glossary = pd.read_csv('./table1/table1.csv')
factor_features = pd.read_csv('./data/factor_char_list.csv').values.flatten().tolist()

# Distinguish referenced and non referenced factors
factors_no_ref = list(set(factor_features)-set(factor_glossary["Acronym"].values))
factors_with_ref = list(set(factor_features).intersection(set(factor_glossary["Acronym"].values)))

# List refrenced factors
idx_factor_features = factor_glossary.index[factor_glossary["Acronym"].isin(factors_with_ref)].tolist()
factor_glossary.iloc[idx_factor_features].reset_index().drop('index', axis=1).rename(columns={'Feature':'Referenced Factors'})

In [None]:
# Check if all factor features are in acc_ratios
acc_ratios = pd.read_csv('./data/acc_ratios.csv')
set(factor_features).intersection(set(acc_ratios.Variable)) == set(acc_ratios.Variable)

True

In [None]:
# Load the short list of features used in modeling
features_short = pd.read_csv('data/referenced_predictive_factors.csv')
features_short

Unnamed: 0,Predictive_Var,Feature,Acronym,Reference,Category,Empirical_Rank
0,market_equity,Market Equity,market_equity,Banz (1981),Size,3
1,dolvol_126d,Dollar trading volume,dolvol_126d,"Brennan, Chordia & Subrahmanyam (1998)",Liquidity,1
2,turnover_126d,Share turnover,turnover_126d,"Datar, Naik & Radcliffe (1998)",Turnover,2
3,rvol_21d,Idiosyncratic volatility (FF3 residuals),ivol_ff3_21d,Ang et al. (2006),Volatility,4
4,ni_ivol,Earnings volatility,ni_ivol,Francis et al. (2004),Volatility,5
5,beta_60m,Market Beta,beta_60m,Fama & MacBeth (1973),Beta,6
6,z_score,Altman Z-score,z_score,Dichev (1998),Distress,7
7,f_score,Piotroski F-score,f_score,Piotroski (2000),Fundamentals Quality,8


In [None]:
l = features_short['Acronym'].tolist()
l.extend(acc_ratios.Variable.tolist())


In [None]:
import polars as p
df = pl.scan_csv('MAIN DATA and SUPPORTING CODES/ret_sample.csv').select(l).collect().to_pandas()

In [14]:
df.shape

(6401414, 23)