In [1]:
import polars as pl 
import pandas as pd
import numpy as np
import requests

## Fetch SP500 companies meta data

In [2]:
# Collect the SP500 from the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} # Mimic a browser to avoid 403 Error
tables = pd.read_html(url, storage_options={'User-Agent': headers['User-Agent']})
sp_meta1 = tables[0]
sp_meta2 = tables[1]
sp_meta2.columns = sp_meta2.columns.map('_'.join)
sp500 = set(sp_meta1.Symbol).difference(set(sp_meta2.Removed_Ticker)).union(set(sp_meta2.Added_Ticker))
sp_meta1.rename(columns={'Symbol':'tic', 'GICS Sector':'gics_sector', 'GICS Sub-Industry':'gics_sub_industy',
                         'Headquarters Location':'hq_loc', 'Founded':'founded_date'}, inplace=True)
 

sp_meta1.drop(['Security', 'Date added', 'CIK'], axis=1, inplace=True)
sp_meta1['founded_date'] = sp_meta1["founded_date"].str.findall(r"\b\d{4}\b").apply(lambda x: min(map(int, x)) if x else None)
sp_meta1['hq_loc'] = sp_meta1['hq_loc'].str.split(',').apply(lambda x: x[-1])
sp_meta = sp_meta1
del sp_meta1, sp_meta2


## Define the investment universe data

In [None]:
data_dir = '~/Documents/folio/gill-hack/2025/ASSET MANAGEMENT HACKATHON 2025/data/'
data = pd.read_csv(data_dir + 'modeling_data.csv')
cmpny_us = (data[data.fic == 'USA'][
             ['gvkey','iid','tic', 'conm', 'market_equity']]
                .drop_duplicates(subset=['gvkey','iid','tic', 'conm'])
                .dropna()
                .sort_values(by='market_equity', ascending=False)
                .reset_index(drop=True)
           )



# Consolidate with the meta data
cmpny_us = cmpny_us.merge(sp_meta, how='left', on=['tic'])


In [None]:
# Filter the available US stocks with a market equity above .25 quantile criterion
invst_univ = cmpny_us[cmpny_us['market_equity']>=cmpny_us['market_equity'].quantile(0.25)]

In [None]:
invst_univ_data = data[
                        data['gvkey'].isin(invst_univ['gvkey']) & data['iid'].isin(invst_univ['iid'])
                  ].drop(['tpci', 'fic'], axis=1)


In [None]:
invst_univ_data.tic.nunique()

In [None]:
invst_univ_data.info()

In [10]:
pl.from_pandas(invst_univ_data).write_csv('../data/invst_univ_data.csv')

## Consolidate with SP500 meta data

In [None]:
keys = set(invst_univ_data.columns).intersection(set(cmpny_us.columns))
invst_univ_data = invst_univ_data.merge(cmpny_us, how='left', on=list(keys))
invst_univ_data = invest_univ_data.loc[:, ['gvkey', 'iid', 'id', 'date', 'tic', 'conm', 'market_equity', 'gics_sector', 'gics_sub_industy',
       'hq_loc', 'founded_date', 'year', 'month','dolvol_126d', 'turnover_126d', 'ivol_ff3_21d', 'ni_ivol', 'beta_60m',
       'z_score', 'f_score', 'netdebt_me', 'rd_sale', 'ebit_bev', 'ebit_sale',
       'ni_be', 'niq_be', 'ocf_at', 'gp_at', 'niq_at', 'op_at', 'rd_me',
       'be_me', 'div12m_me', 'ni_me', 'sale_me', 'rf_feature', 'mgmt_feature',
       'target_ret']]


In [None]:
pl.from_pandas( invst_univ_data).write_csv('../data/invst_univ_data.csv')

## Consolidate with SIC dummies

In [106]:
# Read the data fecthed using the feth from edgar pipelin
sic_dummies_lnktabl = pd.read_csv('../data/invst_univ_companies_with_sic.csv')
sic_dummies_lnktabl.head()

Unnamed: 0,tic,conm,SIC,SIC_desc,SIC2
0,XOM,EXXON MOBIL CORP,2911.0,Petroleum Refining,29.0
1,CHFC,CHEMICAL FINANCIAL CORP,,,
2,GR,GOODRICH CORP,,,
3,PEP,PEPSICO INC,2080.0,Beverages,20.0
4,TXN,TEXAS INSTRUMENTS INC,3674.0,Semiconductors & Related Devices,36.0


In [107]:
sic_dummies_lnktabl = sic_dummies_lnktabl.merge(invst_univ_data[['conm', 'tic', 'gvkey', 'iid']], 
                                                on=['conm', 'tic'], 
                                                how='left', 
                                                validate='one_to_many').drop_duplicates().dropna()

sic_dummies_lnktabl = sic_dummies_lnktabl.astype({'SIC2': int, 'SIC': int})

In [108]:
sic_dummies_lnktabl

Unnamed: 0,tic,conm,SIC,SIC_desc,SIC2,gvkey,iid
0,XOM,EXXON MOBIL CORP,2911,Petroleum Refining,29,4503,01
11,PEP,PEPSICO INC,2080,Beverages,20,8479,01
12,TXN,TEXAS INSTRUMENTS INC,3674,Semiconductors & Related Devices,36,10499,01
23,BOH,BANK OF HAWAII CORP,6022,State Commercial Banks,60,2005,01
31,BA,BOEING CO,3721,Aircraft,37,2285,01
...,...,...,...,...,...,...,...
9696,VLTO,VERALTO CORP,3825,Instruments For Meas & Testing of Electricity...,38,43197,01
9697,LINE,LINEAGE INC,6798,Real Estate Investment Trusts,67,50389,01
9698,PMT,PENNYMAC MORTGAGE INVEST TR,6798,Real Estate Investment Trusts,67,183324,01
9699,INN,SUMMIT HOTEL PROPERTIES INC,6798,Real Estate Investment Trusts,67,185396,01


In [None]:
# 
cols = ['tic', 'conm']

for col in cols:
    filled = (
    invst_univ_data
      .groupby(['gvkey','iid'])[col]
      .transform(lambda s: s.ffill().bfill())
)
    invst_univ_data[col] = filled.infer_objects(copy=False).astype('string')

invst_univ_data = invst_univ_data.merge(sic_dummies_lnktabl, on=['conm', 'tic', 'gvkey', 'iid'], how='left', validate='many_to_one').drop_duplicates().dropna(subset=['SIC2'])
    

In [142]:
invst_univ_data.dropna(subset=['SIC2'])

Unnamed: 0,gvkey,iid,id,date,tic,conm,market_equity,dolvol_126d,turnover_126d,ivol_ff3_21d,...,ni_me,sale_me,rf_feature,mgmt_feature,target_ret,year,month,SIC,SIC_desc,SIC2
1,9728,02,comp_009728_02,20050228,MUX,MCEWEN INC,7.569090,1.479429e+04,0.001550,0.026080,...,-0.101333,0.000000,,,0.027027,2005,2,1040.0,Gold and Silver Ores,10.0
2,13071,01,comp_013071_01,20050228,RDNT,RADNET INC,19.320290,3.125634e+04,0.001401,0.046515,...,-0.407965,7.162004,,,-0.212766,2005,2,8071.0,Services-Medical Laboratories,80.0
3,15240,02,comp_015240_02,20050228,WWR,WESTWATER RESOURCES INC,112.778970,3.941216e+05,0.004194,0.030881,...,-0.129590,0.000000,,,0.017241,2005,2,1000.0,Metal Mining,10.0
4,17602,01,comp_017602_01,20050228,BHRB,BURKE HERBERT FINL SRVS CORP,,4.156433e+04,,0.004553,...,,,,,0.000000,2005,2,6021.0,National Commercial Banks,60.0
5,17877,01,comp_017877_01,20050228,CHMG,CHEMUNG FINANCIAL CORP,115.602175,6.325633e+04,0.000557,0.009080,...,,,,,0.022998,2005,2,6022.0,State Commercial Banks,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515561,315318,01,comp_315318_01,20250630,ESI,ELEMENT SOLUTIONS INC,5184.735520,5.925649e+07,0.010482,,...,0.046791,0.473872,,,0.063407,2025,6,2890.0,Miscellaneous Chemical Products,28.0
515562,316056,01,comp_316056_01,20250630,ALLE,ALLEGION PLC,12279.192300,1.264881e+08,0.011089,,...,0.048660,0.307203,,,0.013758,2025,6,7381.0,"Services-Detective, Guard & Armored Car Services",73.0
515563,317264,01,comp_317264_01,20250630,LPG,DORIAN LPG LTD,913.093680,1.790935e+07,0.018638,,...,0.176673,0.016281,,,0.138720,2025,6,4412.0,Deep Sea Foreign Transportation of Freight,44.0
515564,326688,01,comp_326688_01,20250630,NVT,NVENT ELECTRIC PLC,10833.048800,1.308619e+08,0.012945,,...,0.022228,0.277493,,,0.113222,2025,6,3550.0,Special Industry Machinery (No Metalworking Ma...,35.0


In [143]:
pl.from_pandas(invst_univ_data).write_csv('../data/invst_univ_data_with_sic.csv')

## Explore the source of NAs

In [135]:
invst_univ_data.drop(['rf_feature', 'mgmt_feature'], axis=1).dropna()

Unnamed: 0,gvkey,iid,id,date,tic,conm,market_equity,dolvol_126d,turnover_126d,ivol_ff3_21d,...,niq_at,op_at,rd_me,be_me,div12m_me,ni_me,sale_me,target_ret,year,month
17343,11903,01,crsp_10025,20050228,AEPI,AEP INDUSTRIES INC,1.590375e+02,3.598615e+05,0.002923,0.039046,...,-0.003451,0.118364,0.011318,0.277796,0.000000,-0.108911,4.060459,0.082228,2005,2
17357,12141,01,crsp_10107,20050228,MSFT,MICROSOFT CORP,2.859322e+05,5.725156e+08,0.006475,0.005927,...,0.027363,0.224901,0.026922,0.265608,0.120160,0.028265,0.132231,-0.039574,2005,2
17364,12053,01,crsp_10147,20050228,EMC.2,EMC CORP/MA,3.139126e+04,1.631799e+08,0.005432,0.013269,...,0.015048,0.162282,0.027338,0.358809,0.000000,0.024553,0.246381,-0.033588,2005,2
17368,12181,01,crsp_10200,20050228,RGEN,REPLIGEN CORP,6.917020e+01,3.639339e+05,0.005309,0.041756,...,-0.059221,0.038262,0.084097,0.359664,0.000000,-0.111999,0.109006,-0.213043,2005,2
17371,1408,01,crsp_10225,20050228,BEAM.2,BEAM INC,1.210597e+04,4.059626e+07,0.003729,0.014250,...,0.024446,0.164309,0.004799,0.237437,0.015079,0.053750,0.489568,-0.031555,2005,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6236404,179841,01,crsp_93345,20241231,CDXS,CODEXIS INC,3.727112e+02,1.772361e+06,0.007160,0.040815,...,-0.152147,-0.003720,0.127595,0.164865,0.000000,-0.204877,0.163370,0.041485,2024,12
6236407,183974,01,crsp_93356,20241231,SPSC,SPS COMMERCE INC,7.253254e+03,3.664191e+07,0.005164,0.009399,...,0.021471,0.206998,0.008017,0.097947,0.000000,0.009911,0.080488,-0.047030,2024,12
6236410,184258,01,crsp_93372,20241231,PLOW,DOUGLAS DYNAMICS INC,5.979036e+02,4.998834e+06,0.008427,0.007979,...,0.042175,0.123704,0.016861,0.442073,0.045469,0.048260,0.959892,-0.075898,2024,12
6236413,185138,01,crsp_93426,20241231,VPG,VISHAY PRECISION GROUP INC,2.806015e+02,2.024922e+06,0.006262,0.020405,...,0.009873,0.110314,0.072701,1.177898,0.000000,0.074843,1.188604,0.021768,2024,12


In [113]:
most_na = ['SIC',
 'SIC2',
 'SIC_desc',
 'conm',
 'mgmt_feature', # reviuse if links with (gvkey, iid) would yield less NAs
 'rd_me', # just drop it
 'rd_sale', # just drop it
 'rf_feature',
 'tic']

(invst_univ_data.merge(sic_dummies_lnktabl, on=['conm', 'tic', 'gvkey', 'iid'], how='left')
    .loc[:, 
                  ['gvkey', 'iid', 'date', 'tic', 'conm', 'market_equity',
                    'SIC', 'SIC_desc', 'SIC2', 'year',
                    'month', 'dolvol_126d', 'turnover_126d', 'ivol_ff3_21d', 'ni_ivol',
                    'beta_60m', 'z_score', 'f_score', 'netdebt_me', 'rd_sale', 'ebit_bev',
                    'ebit_sale', 'ni_be', 'niq_be', 'ocf_at', 'gp_at', 'niq_at', 'op_at',
                    'rd_me', 'be_me', 'div12m_me', 'ni_me', 'sale_me', 'rf_feature',
                    'mgmt_feature', 'target_ret']]
    
).drop_duplicates().dropna()

Unnamed: 0,gvkey,iid,date,tic,conm,market_equity,SIC,SIC_desc,SIC2,year,...,niq_at,op_at,rd_me,be_me,div12m_me,ni_me,sale_me,rf_feature,mgmt_feature,target_ret
127,4503,01,20050228,XOM,EXXON MOBIL CORP,332886.863760,2911.0,Petroleum Refining,29.0,2005,...,0.031383,0.211692,0.001856,0.345754,0.020746,0.070775,0.738599,0.000000,0.000038,0.232171
184,10499,01,20050228,TXN,TEXAS INSTRUMENTS INC,40105.137668,3674.0,Semiconductors & Related Devices,36.0,2005,...,0.035078,0.350809,0.048348,0.323126,0.003991,0.046977,0.304126,0.000000,0.000034,0.140457
221,2285,01,20050228,BA,BOEING CO,42483.606919,3721.0,Aircraft,37.0,2005,...,0.008069,0.105456,0.044817,0.224604,0.015076,0.064990,1.231040,0.000000,0.018250,0.091304
238,6774,01,20050228,LMT,LOCKHEED MARTIN CORP,25591.100168,3760.0,Guided Missiles & Space Vehicles & Parts,37.0,2005,...,0.011700,0.102462,0.035286,0.282911,0.015853,0.048376,1.349649,0.000000,0.009066,0.028715
245,8530,01,20050228,PFE,PFIZER INC,181948.838050,2834.0,Pharmaceutical Preparations,28.0,2005,...,0.027492,0.225675,0.047964,0.440481,0.028351,0.049866,0.283459,0.000000,0.000030,0.096027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504805,24844,03,20241031,SPOK,SPOK HOLDINGS INC,305.175848,4812.0,Radiotelephone Communications,48.0,2024,...,0.018605,0.175467,0.036068,0.524842,0.082666,0.055001,0.461223,0.026945,0.003403,0.033864
504963,174236,01,20241031,SEM,SELECT MEDICAL HOLDINGS CORP,4533.832131,8060.0,Services-Hospitals,80.0,2024,...,0.005538,0.090353,0.000000,0.331126,0.014244,0.047480,1.394028,0.000211,0.007393,-0.080011
504968,183366,01,20241031,H,HYATT HOTELS CORP,6838.193663,7011.0,Hotels & Motels,70.0,2024,...,0.040676,0.060159,0.000000,0.541810,0.003921,0.100026,0.979937,0.000165,0.002930,-0.044350
504971,12785,01,20241031,PPC,PILGRIM'S PRIDE CORP,10919.467919,2015.0,Poultry Slaughtering and Processing,20.0,2024,...,0.017779,0.125684,0.000522,0.355481,0.000000,0.044948,1.608002,0.000474,0.003991,0.051900


In [43]:
invst_univ_data.groupby(['gvkey', 'iid']).size()

gvkey   iid
1004    01     244
1034    01      46
1045    01      83
        04     137
1050    01     244
              ... 
326688  01      84
328795  01      78
333070  01      15
339965  01      51
343180  01      41
Length: 3409, dtype: int64

In [None]:
# Collect the number of data points per security
df = (invst_univ_data
           .groupby(['gvkey','iid'])
           .agg(n_tic=('tic','size'), # size() counts NAs included, count() only NAs
                n_unique=('tic','nunique'),
                tic=('tic','first'),
                conm=('conm', 'first'),
                min_date=('date','min'),
                max_date=('date','max'))
           .sort_values('n_tic', ascending=False)
           .reset_index()
     )

In [92]:
df.dropna()

Unnamed: 0,gvkey,iid,n_tic,n_unique,tic,conm,min_date,max_date
0,157307,01,245,1,ISBA,ISABELLA BANK CORP,20050228,20250630
1,153616,01,245,1,EFSI,EAGLE FINANCIAL SERVICES INC,20050228,20250630
2,1004,01,244,1,AIR,AAR CORP,20050228,20250630
3,63456,01,244,1,CSV,CARRIAGE SERVICES INC,20050228,20250630
4,63447,01,244,1,GES,GUESS INC,20050228,20250630
...,...,...,...,...,...,...,...,...
3400,22459,01,5,1,UE,URBAN EDGE PROPERTIES,20250228,20250630
3401,30293,01,5,1,ESS,ESSEX PROPERTY TRUST,20250228,20250630
3402,18468,01,5,1,PAGP,PLAINS GP HOLDINGS LP,20250228,20250630
3403,183324,01,5,1,PMT,PENNYMAC MORTGAGE INVEST TR,20250228,20250630


In [None]:

invst_univ_data.set_index(['gvkey','iid']).loc[(8901,'01')]

  invst_univ_data.set_index(['gvkey','iid']).loc[(8901,'01')]


Unnamed: 0_level_0,Unnamed: 1_level_0,id,date,tic,conm,market_equity,dolvol_126d,turnover_126d,ivol_ff3_21d,ni_ivol,beta_60m,...,rd_me,be_me,div12m_me,ni_me,sale_me,rf_feature,mgmt_feature,target_ret,year,month
gvkey,iid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
8901,01,crsp_66683,20050228,,,741.926746,1.731443e+06,0.002787,0.014391,0.064323,0.796267,...,,0.248985,0.004646,0.036146,0.436523,,,0.014347,2005,2
8901,01,crsp_66683,20050331,,,750.844811,2.112068e+06,0.003133,0.026980,0.064323,0.825528,...,,0.246028,0.005742,0.035717,0.431338,,,-0.127012,2005,3
8901,01,crsp_66683,20050429,,,660.248522,2.297776e+06,0.003322,0.026177,0.064323,0.782323,...,,0.279787,0.006529,0.040618,0.490524,,,-0.042791,2005,4
8901,01,crsp_66683,20050531,,,632.097418,2.414081e+06,0.003425,0.018815,0.060484,0.832617,...,,0.305915,0.006820,0.055012,0.537563,,,0.018569,2005,5
8901,01,crsp_66683,20050630,,,639.998890,2.145315e+06,0.003072,0.020468,0.060484,0.841322,...,,0.302138,0.008102,0.054333,0.530926,,,0.145565,2005,6
8901,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8901,01,comp_008901_01,20250228,RES,RPC INC,1317.778360,1.038197e+07,0.007673,,0.169543,,...,,0.855608,0.026098,0.090259,1.118677,0.028769,0.003206,-0.083624,2025,2
8901,01,comp_008901_01,20250331,,,1199.543760,1.017502e+07,0.007605,,0.169543,,...,,0.939942,0.028654,0.099155,1.228941,,,-0.014337,2025,3
8901,01,comp_008901_01,20250430,,,1188.291500,9.717867e+06,0.007459,,0.169543,,...,,0.948843,0.028926,0.100094,1.240579,,,-0.140000,2025,4
8901,01,comp_008901_01,20250530,,,1043.272450,9.347323e+06,0.007589,,0.162604,,...,,1.089342,0.032946,0.087651,1.356308,,,-0.053439,2025,5


## Collect Non US companies

In [None]:
# How many securirties have tpci information
(data[['gvkey','iid','fic','tpci']].drop_duplicates()
       .groupby('fic')
       .agg(lambda df: df.dropna()
                         .count()
                         
            )
).rename(columns={col: col+'_count' for col in ['gvkey','iid', 'tpci'] })

In [11]:
# No text data available for any of the foreign companies
data.loc[data[(data.fic !='USA')][['rf_feature', 'mgmt_feature']].dropna().index]

Unnamed: 0,gvkey,iid,id,tpci,date,tic,conm,fic,market_equity,dolvol_126d,...,rd_me,be_me,div12m_me,ni_me,sale_me,rf_feature,mgmt_feature,target_ret,year,month
1187545,21216,01C,comp_021216_01C,0.0,20090331,GQM.H,GOLDEN QUEEN MINING CONS LTD,CAN,28.359696,25067.15024,...,,0.14803,,-0.133239,0.0,0.012001,0.002423,0.104464,2009,3
1190207,157080,01C,comp_157080_01C,0.0,20090331,RVM.,REVETT MINING CO INC,CAN,6.003634,28258.061869,...,,10.169172,,-0.75483,6.000218,0.01732,0.004206,-0.0476,2009,3
1482424,21216,01C,comp_021216_01C,0.0,20100331,GQM.H,GOLDEN QUEEN MINING CONS LTD,CAN,81.859705,84248.930808,...,,0.038913,,-0.033315,0.0,0.010856,0.002522,0.014359,2010,3
1777356,21216,01C,comp_021216_01C,0.0,20110331,GQM.H,GOLDEN QUEEN MINING CONS LTD,CAN,275.58123,152815.0896,...,,0.028405,,-0.031024,0.0,0.07942,0.00179,-0.015715,2011,3


### tpci seems not to be credible as a chatgpt search revealed where for canada, REVETT MINING CO INC has been acquired and whose ticker is defunct

In [25]:
data[(data.fic == 'CAN') & (~data.tpci.isna())].drop_duplicates()

Unnamed: 0,gvkey,iid,id,tpci,date,tic,conm,fic,market_equity,dolvol_126d,...,rd_me,be_me,div12m_me,ni_me,sale_me,rf_feature,mgmt_feature,target_ret,year,month
1187545,21216,01C,comp_021216_01C,0.0,20090331,GQM.H,GOLDEN QUEEN MINING CONS LTD,CAN,28.359696,25067.15024,...,,0.14803,,-0.133239,0.0,0.012001,0.002423,0.104464,2009,3
1190207,157080,01C,comp_157080_01C,0.0,20090331,RVM.,REVETT MINING CO INC,CAN,6.003634,28258.061869,...,,10.169172,,-0.75483,6.000218,0.01732,0.004206,-0.0476,2009,3
1482424,21216,01C,comp_021216_01C,0.0,20100331,GQM.H,GOLDEN QUEEN MINING CONS LTD,CAN,81.859705,84248.930808,...,,0.038913,,-0.033315,0.0,0.010856,0.002522,0.014359,2010,3
1777356,21216,01C,comp_021216_01C,0.0,20110331,GQM.H,GOLDEN QUEEN MINING CONS LTD,CAN,275.58123,152815.0896,...,,0.028405,,-0.031024,0.0,0.07942,0.00179,-0.015715,2011,3


# Anonymize Columns

In [4]:
# Select only numerical columns
numeric_cols = data.select_dtypes(include=[np.number]).columns.to_list()
for itm in ['date', 'target_ret', 'year', 'month']:
    numeric_cols.remove(itm)


In [6]:
char_list = ['market_equity', 'dolvol_126d',
       'turnover_126d', 'ivol_ff3_21d', 'ni_ivol', 'beta_60m', 'z_score',
       'f_score', 'netdebt_me', 'rd_sale', 'ebit_bev', 'ebit_sale', 'ni_be',
       'niq_be', 'ocf_at', 'gp_at', 'niq_at', 'op_at', 'rd_me', 'be_me',
       'div12m_me', 'ni_me', 'sale_me']

df1 = data[[col for col in data.columns if col not in numeric_cols]]
df1 = df1.drop(['year', 'month'], axis=1)

In [7]:
df2 = data[numeric_cols].div(data[numeric_cols].max(), axis=1
         ).mul(10000, axis=1
         ).rename(columns={'rf_feature':'txt_feat1', 'mgmt_feature':'txt_feat2'}
         ).rename(columns={col: f'quant_feat{i}' for i, col in enumerate(char_list)}
         )


In [8]:
pd.concat([df1,df2], axis=1).dropna().head(12)

Unnamed: 0,date,tic,conm,fic,target_ret,quant_feat0,quant_feat1,quant_feat2,quant_feat3,quant_feat4,...,quant_feat15,quant_feat16,quant_feat17,quant_feat18,quant_feat19,quant_feat20,quant_feat21,quant_feat22,txt_feat1,txt_feat2
17590,20050228,XOM,EXXON MOBIL CORP,USA,0.232171,916.302044,2669.904376,0.078678,76.294386,1.614773e-14,...,0.028086,0.028587,0.030658,0.000253,0.012854,0.007557,0.011234,0.01567,0.0,8.550785
17627,20050228,GR,GOODRICH CORP,USA,0.079592,11.228838,102.114906,0.230355,170.575807,1.390029e-14,...,0.022471,0.007659,0.014953,0.010428,0.011817,0.00846,0.005498,0.0238,0.0,2298.450883
17707,20050228,TXN,TEXAS INSTRUMENTS INC,USA,0.140457,110.39312,1450.456629,0.320773,282.794459,4.782894e-14,...,0.042106,0.031952,0.050806,0.0066,0.012012,0.001454,0.007457,0.006452,0.0,7.695706
17765,20050228,BA,BOEING CO,USA,0.091304,116.940078,821.622024,0.160381,162.461613,1.750541e-14,...,0.016243,0.00735,0.015273,0.006118,0.00835,0.005491,0.010316,0.026118,0.0,4074.87637
17794,20050228,LMT,LOCKHEED MARTIN CORP,USA,0.028715,70.441883,580.482524,0.196789,169.025984,1.456548e-14,...,0.009884,0.010657,0.014839,0.004817,0.010517,0.005774,0.007679,0.028634,0.0,2024.398239
17806,20050228,PFE,PFIZER INC,USA,0.096027,500.831094,2768.950728,0.174684,139.066472,4.774918e-14,...,0.03426,0.025043,0.032683,0.006548,0.016375,0.010327,0.007915,0.006014,0.0,6.698115
18112,20050228,GNTX,GENTEX CORP,USA,0.002365,7.250993,158.518043,0.508051,289.751086,5.737711e-15,...,0.026919,0.028102,0.035846,0.001549,0.01093,0.006864,0.006815,0.004054,0.0,331.057875
18130,20050228,GGG,GRACO INC,USA,0.084432,6.783919,43.814537,0.153732,217.462446,2.365028e-14,...,0.090184,0.078435,0.078485,0.001142,0.003423,0.019462,0.006599,0.004989,0.0,960.395616
18132,20050228,B.2,BARNES GROUP INC,USA,0.047953,1.635118,8.822907,0.122346,128.767681,2.987445e-15,...,0.038979,0.00982,0.014614,0.001149,0.021499,0.011329,0.01001,0.03441,0.0,1225.042398
18179,20050228,HNI,HNI CORP,USA,0.084882,6.239196,22.513922,0.081571,181.676854,8.936659e-15,...,0.07411,0.032832,0.035048,0.001553,0.011966,0.005175,0.007886,0.018748,0.0,767.432912
