correct tickers from CRSP

# 1) Setup

## 1a) Library functions

In [25]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from time import time
from datetime import datetime, timedelta
from copy import copy

print("done")

done


## 1b) Helper functions

In [26]:
def ListUnion(l1, l2):
    union = list(set(l1) | set(l2))
    return union

# 2) CRSP Permnos
- https://wrds-www.wharton.upenn.edu/data-dictionary/crsp_a_indexes/dsp500list/
- Log in to the SAS cloud and navigate to: /wrds/crsp/sasdata/a_indexes/dsp500list.sas7bdat
- convert permnos to IBES tickers here https://wrds-www.wharton.upenn.edu/pages/get-data/linking-suite-wrds/ibes-crsp-link/

In [27]:
#crsp sas
wrds_tickers = pd.read_sas("CorrectMembers/dsp500list.sas7bdat")
wrds_tickers

Unnamed: 0,PERMNO,start,ending
0,10006.0,1957-03-01,1984-07-18
1,10030.0,1957-03-01,1969-01-08
2,10049.0,1925-12-31,1932-10-01
3,10057.0,1957-03-01,1992-07-02
4,10078.0,1992-08-20,2010-01-28
...,...,...,...
2007,93159.0,2012-07-31,2016-03-29
2008,93246.0,2021-03-22,2021-12-31
2009,93422.0,2010-07-01,2015-06-30
2010,93429.0,2017-03-01,2021-12-31


In [28]:
relevant_tickers = wrds_tickers[(wrds_tickers["ending"] >= "2000-01-01")]
tickers = relevant_tickers["PERMNO"]
relevant_tickers

Unnamed: 0,PERMNO,start,ending
4,10078.0,1992-08-20,2010-01-28
6,10104.0,1989-08-03,2021-12-31
7,10107.0,1994-06-07,2021-12-31
8,10108.0,2002-07-22,2005-08-11
10,10137.0,2000-12-11,2011-02-25
...,...,...,...
2007,93159.0,2012-07-31,2016-03-29
2008,93246.0,2021-03-22,2021-12-31
2009,93422.0,2010-07-01,2015-06-30
2010,93429.0,2017-03-01,2021-12-31


In [29]:

with open("CorrectMembers/SPX_CRSPMembers.txt", 'w') as f:
    for item in tickers:
        item = int(item)
        f.write("%s\n" % item)
print("done")

done


# 3) Financial Ratios
- https://wrds-www.wharton.upenn.edu/pages/get-data/financial-ratios-suite-wrds/financial-ratios-with-ibes-subscription/financial-ratios-firm-level-ibes/
- Also use this to get conversion from PERMNO to TICKER
- create month label to merge with macro data


In [30]:
df_FundamentalRatios = pd.read_csv("CorrectMembers/FundamentalRatios.csv")
df_FundamentalRatios["month"] = pd.to_datetime(df_FundamentalRatios["public_date"]).dt.to_period('M')
df_FundamentalRatios.rename(columns={'cusip':'CUSIP'}, inplace=True)
df_FundamentalRatios.sort_values("month")

Unnamed: 0,gvkey,permno,adate,qdate,public_date,bm,evm,pe_exi,ps,pcf,...,curr_ratio,cash_conversion,sale_nwc,accrual,ptb,PEG_1yrforward,PEG_ltgforward,TICKER,CUSIP,month
0,12136,10078,1999/06/30,1999/09/30,2000/01/31,0.071,18.262,109.115,9.857,49.139,...,2.071,43.674,4.039,0.169,23.938,4.380,5.407,SUNW,86681010,2000-01
117049,24782,77182,1999/06/30,1999/09/30,2000/01/31,0.620,8.963,,0.718,-84.244,...,2.769,115.147,3.257,-0.048,1.761,,,PRGO,71429010,2000-01
99266,11818,68304,1999/06/30,1999/09/30,2000/01/31,0.936,23.345,8.563,0.582,-163.540,...,,2629.440,,-0.005,1.111,,,BSC,07390210,2000-01
151661,13440,85991,1998/12/31,1999/09/30,2000/01/31,0.201,-47.967,-1734.400,91.839,100.064,...,7.520,,0.109,0.109,11.265,-3.532,-41.379,LVLT,52729N10,2000-01
53352,5709,32870,1999/10/31,1999/10/31,2000/01/31,0.273,10.811,18.300,0.863,11.989,...,2.076,44.500,8.096,0.048,3.445,1.730,1.755,HRL,44045210,2000-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106822,14624,75573,2019/12/31,2020/09/30,2020/12/31,1.752,6.619,,0.155,2.408,...,1.131,34.062,30.767,0.141,0.860,,,ODP,88337F10,2020-12
106570,12540,75510,2019/11/30,2020/08/31,2020/12/31,0.048,40.330,62.987,19.292,45.085,...,1.138,,172.458,0.069,20.351,,,ADBE,00724F10,2020-12
13925,171007,13447,2019/12/31,2020/09/30,2020/12/31,0.028,150.269,155.929,25.442,70.545,...,1.159,23.755,9.495,0.132,40.165,,,NOW,81762P10,2020-12
108429,16478,75825,2019/12/31,2020/09/30,2020/12/31,1.190,10.119,,2.379,5.110,...,1.536,,6.531,0.165,1.166,,,EOG,26875P10,2020-12


In [31]:
# get tickers
tickers = df_FundamentalRatios["TICKER"].unique()
with open("CorrectMembers/SPX_CRSPMembers_Tickers.txt", 'w') as f:
    for item in tickers:
        f.write("%s\n" % item)
print("done")

done


In [32]:
cusips = df_FundamentalRatios["CUSIP"].unique()
with open("CorrectMembers/SPX_CRSPMembers_Cusips.txt", 'w') as f:
    for item in cusips:
        f.write("%s\n" % item)
print("done")

done


# 4) Price Data
- compustat: https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/security-monthly/
- Optionmetrics: https://wrds-www.wharton.upenn.edu/pages/get-data/optionmetrics/ivy-db-us/securities/security-prices/

In [33]:
df_PriceData = pd.read_csv("CorrectMembers/PriceData.csv")
df_PriceData["month"] = pd.to_datetime(df_PriceData["datadate"]).dt.to_period('M')
df_PriceData.rename(columns={'tic':'TICKER'}, inplace=True)
df_PriceData["cusip"] = df_PriceData["cusip"].str[:-1]
df_PriceData.rename(columns={'cusip':'CUSIP'}, inplace=True)
df_PriceData.sort_values(["month","TICKER"])

Unnamed: 0,gvkey,iid,datadate,TICKER,CUSIP,ajexm,cshtrm,curcdm,prccm,trfm,trt1m,cshom,ggroup,gind,gsector,gsubind,month
86184,10795,01,2000/01/31,3UALAQ,90254950,1.0,20473200.0,USD,57.125,2.1210,-26.3497,53524000.0,2030.0,203020.0,20.0,20302010.0,2000-01
93057,11609,02,2000/01/31,3WWYWB,98252620,2.5,,USD,77.750,1.0253,-5.3333,24130000.0,3020.0,302020.0,30.0,30202030.0,2000-01
39710,5597,02,2000/01/31,4741B,42786630,2.0,,,,,,,3020.0,302020.0,30.0,30202030.0,2000-01
61627,7938,02,2000/01/31,4749B,62957920,1.0,,,,,,,1010.0,101020.0,10.0,10102050.0,2000-01
60378,7906,02,2000/01/31,4764B,65410620,8.0,,,,,,,2520.0,252030.0,25.0,25203020.0,2000-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165583,144559,01,2021/12/31,ZBH,98956P10,1.0,41701268.0,USD,127.040,1.0875,6.4214,208908000.0,3510.0,351010.0,35.0,35101010.0,2021-12
120055,24405,01,2021/12/31,ZBRA,98920710,1.0,6187041.0,USD,595.200,1.0000,1.0904,53441000.0,4520.0,452030.0,45.0,45203010.0,2021-12
186160,187039,01,2021/12/31,ZG,98954M10,1.0,21375982.0,USD,62.220,3.0946,14.9880,61374000.0,6010.0,601020.0,60.0,60102040.0,2021-12
93955,11687,01,2021/12/31,ZION,98970110,1.0,27241303.0,USD,63.160,3.5754,0.1268,156463000.0,4010.0,401010.0,40.0,40101015.0,2021-12


# 5) Analyst estimates
-  https://wrds-www.wharton.upenn.edu/pages/get-data/ibes-thomson-reuters/ibes-academic/unadjusted-summary/price-target/

In [34]:
df_AnalystEstimates = pd.read_csv("CorrectMembers/AnalystPredictions.csv")
df_AnalystEstimates["month"] = pd.to_datetime(df_AnalystEstimates["STATPERS"]).dt.to_period('M')
df_AnalystEstimates.rename(columns={'TICKER':'tic'}, inplace=True)
df_AnalystEstimates.rename(columns={'OFTIC':'TICKER'}, inplace=True)
df_AnalystEstimates.sort_values(["TICKER","month"])

Unnamed: 0,TICKER,tic,CUSIP,CNAME,STATPERS,NUMEST,NUMUP4W,NUMDOWN4W,NUMUP1M,NUMDOWN1M,MEANPTG,MEDPTG,STDEV,PTGHIGH,PTGLOW,CURR,month
0,A,AT1,00846U10,AGILENT TECHNOLOGIES INC,2000/01/20,7,4,0,3,0,72.143,80.0,16.547,90.0,55.0,USD,2000-01
1,A,AT1,00846U10,AGILENT TECHNOLOGIES INC,2000/02/17,7,2,0,2,0,77.857,85.0,16.036,90.0,55.0,USD,2000-02
2,A,AT1,00846U10,AGILENT TECHNOLOGIES INC,2000/03/16,8,6,0,6,0,156.250,172.5,28.504,180.0,110.0,USD,2000-03
3,A,AT1,00846U10,AGILENT TECHNOLOGIES INC,2000/04/20,8,0,1,0,1,150.000,160.0,28.031,175.0,110.0,USD,2000-04
4,A,AT1,00846U10,AGILENT TECHNOLOGIES INC,2000/05/18,8,0,3,0,3,139.375,125.0,28.715,175.0,110.0,USD,2000-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187644,ZTS,ZOTS,98978V10,ZOETIS INC,2021/08/19,13,9,0,9,0,225.000,227.0,14.259,240.0,184.0,USD,2021-08
187645,ZTS,ZOTS,98978V10,ZOETIS INC,2021/09/16,13,0,0,0,0,225.000,227.0,14.259,240.0,184.0,USD,2021-09
187646,ZTS,ZOTS,98978V10,ZOETIS INC,2021/10/14,12,0,0,0,0,224.583,226.0,14.811,240.0,184.0,USD,2021-10
187647,ZTS,ZOTS,98978V10,ZOETIS INC,2021/11/18,11,7,0,7,0,236.545,245.0,21.398,258.0,184.0,USD,2021-11


In [35]:
print(df_AnalystEstimates["tic"].nunique())
print(df_AnalystEstimates["TICKER"].nunique())
print(df_PriceData["TICKER"].nunique())
print(df_FundamentalRatios["TICKER"].nunique())

1121
1071
1103
1186


# 6) Merge dataframes + Macro data

In [36]:
len(ListUnion(df_FundamentalRatios["TICKER"].unique(),df_AnalystEstimates["TICKER"].unique()))

1187

In [37]:
#merge
df_Merge = copy(df_FundamentalRatios)
df_Merge = df_Merge.merge(df_AnalystEstimates, on = ["TICKER","month"], how = "inner")
df_Merge = df_Merge.merge(df_PriceData, on = ["gvkey","month"], how = "inner")
df_Merge.sort_values("month")

Unnamed: 0,gvkey,permno,adate,qdate,public_date,bm,evm,pe_exi,ps,pcf,...,cshtrm,curcdm,prccm,trfm,trt1m,cshom,ggroup,gind,gsector,gsubind
0,12142,10104,1999/05/31,1999/11/30,2000/01/31,0.037,23.669,25.486,15.247,72.985,...,7.715501e+08,USD,49.9531,1.0000,-10.8477,2.821529e+09,4510.0,451030.0,45.0,45103020.0
99452,12338,76565,1999/09/30,1999/09/30,2000/01/31,0.484,7.552,23.623,2.476,19.940,...,5.461200e+07,USD,13.9375,1.0000,4.2056,2.408580e+08,3510.0,351020.0,35.0,35102020.0
99620,23809,76605,1999/08/31,1999/11/30,2000/01/31,0.322,8.822,15.441,0.859,12.397,...,8.664700e+06,USD,26.2500,1.0000,-18.7621,1.381880e+08,2550.0,255040.0,25.0,25504050.0
100119,23877,76619,1998/12/31,1999/09/30,2000/01/31,0.969,9.164,5.653,0.196,-20.337,...,2.935800e+06,USD,7.2500,1.0000,7.4074,5.921300e+07,3510.0,351020.0,35.0,35102030.0
100276,23943,76624,1998/12/31,1999/09/30,2000/01/31,0.033,51.883,104.335,49.282,124.440,...,3.959850e+07,USD,180.5000,1.0000,12.5926,6.805600e+07,4530.0,453010.0,45.0,45301020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66032,6730,50876,2019/12/31,2020/09/30,2020/12/31,0.052,20.147,27.633,6.957,24.294,...,8.128802e+07,USD,168.8400,5.1526,15.9217,9.565820e+08,3520.0,352020.0,35.0,35202010.0
154904,163946,90829,2019/12/31,2020/09/30,2020/12/31,0.613,7.573,,2.034,6.662,...,4.327228e+07,USD,38.7100,1.2927,3.7802,2.139160e+08,1510.0,151010.0,15.0,15101030.0
66284,6994,51263,2019/12/31,2020/09/30,2020/12/31,2.192,9.471,,0.312,6.241,...,8.136950e+06,USD,13.3100,14.6013,19.8020,3.455900e+07,2010.0,201060.0,20.0,20106010.0
155121,31249,90926,2019/12/31,2020/09/30,2020/12/31,0.457,-7.832,,57.863,-16.202,...,4.567375e+07,USD,11.2800,1.0000,-7.7678,3.307100e+07,3520.0,352010.0,35.0,35201010.0


In [38]:
df_Merge.columns

Index(['gvkey', 'permno', 'adate', 'qdate', 'public_date', 'bm', 'evm',
       'pe_exi', 'ps', 'pcf', 'dpr', 'npm', 'roe', 'aftret_invcapx',
       'debt_assets', 'de_ratio', 'quick_ratio', 'curr_ratio',
       'cash_conversion', 'sale_nwc', 'accrual', 'ptb', 'PEG_1yrforward',
       'PEG_ltgforward', 'TICKER_x', 'CUSIP_x', 'month', 'tic', 'CUSIP_y',
       'CNAME', 'STATPERS', 'NUMEST', 'NUMUP4W', 'NUMDOWN4W', 'NUMUP1M',
       'NUMDOWN1M', 'MEANPTG', 'MEDPTG', 'STDEV', 'PTGHIGH', 'PTGLOW', 'CURR',
       'iid', 'datadate', 'TICKER_y', 'CUSIP', 'ajexm', 'cshtrm', 'curcdm',
       'prccm', 'trfm', 'trt1m', 'cshom', 'ggroup', 'gind', 'gsector',
       'gsubind'],
      dtype='object')

In [39]:
#add in index inclusion
wrds_tickers.rename(columns={'PERMNO':'permno'}, inplace=True)
df_Merge = df_Merge.merge(wrds_tickers, on = ["permno"], how = "left")
df_Merge["is_member"] = ( (df_Merge["public_date"] >= df_Merge["start"]) & (df_Merge["public_date"] <= df_Merge["ending"]) )

In [40]:
#(df_Merge.count()/len(df_Merge)).sort_values(ascending = False)

In [41]:
temp = df_Merge.pop('month')
df_Merge.insert(0, 'month', temp)
df_Merge.insert(0, 'TICKER', df_Merge["TICKER_x"])
df_Merge = df_Merge.drop(labels = ["sale_nwc", "cash_conversion", "curr_ratio", "quick_ratio", "PEG_ltgforward", "dpr", "PEG_1yrforward", 
                        "qdate", "public_date", "TICKER_x", "TICKER_y","CUSIP_x", "CUSIP_y", "datadate", "iid","STATPERS","tic", "adate"],axis = 1)
df_Merge

Unnamed: 0,TICKER,month,gvkey,permno,bm,evm,pe_exi,ps,pcf,npm,...,trfm,trt1m,cshom,ggroup,gind,gsector,gsubind,start,ending,is_member
0,ORCL,2000-01,12142,10104,0.037,23.669,25.486,15.247,72.985,0.155,...,1.0,-10.8477,2.821529e+09,4510.0,451030.0,45.0,45103020.0,1989-08-03,2021-12-31,True
1,ORCL,2000-02,12142,10104,0.037,23.669,37.883,22.592,108.144,0.155,...,1.0,48.6393,2.821529e+09,4510.0,451030.0,45.0,45103020.0,1989-08-03,2021-12-31,True
2,ORCL,2000-03,12142,10104,0.037,23.669,39.828,23.752,113.696,0.155,...,1.0,5.1347,2.821529e+09,4510.0,451030.0,45.0,45103020.0,1989-08-03,2021-12-31,True
3,ORCL,2000-04,12142,10104,0.020,34.204,62.451,23.393,91.705,0.197,...,1.0,2.4019,2.838409e+09,4510.0,451030.0,45.0,45103020.0,1989-08-03,2021-12-31,True
4,ORCL,2000-05,12142,10104,0.020,34.204,56.152,20.805,81.560,0.197,...,1.0,-10.0860,2.838409e+09,4510.0,451030.0,45.0,45103020.0,1989-08-03,2021-12-31,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172771,TSLA,2020-08,184996,93436,0.049,35.254,258.197,18.062,171.660,0.014,...,1.0,74.1452,9.318100e+08,2510.0,251020.0,25.0,25102010.0,2020-12-21,2021-12-31,False
172772,TSLA,2020-09,184996,93436,0.049,35.254,222.285,15.820,150.352,0.014,...,1.0,-13.9087,9.335400e+08,2510.0,251020.0,25.0,25102010.0,2020-12-21,2021-12-31,False
172773,TSLA,2020-10,184996,93436,0.049,35.254,201.057,14.308,135.979,0.014,...,1.0,-9.5499,9.479010e+08,2510.0,251020.0,25.0,25102010.0,2020-12-21,2021-12-31,False
172774,TSLA,2020-11,184996,93436,0.039,53.275,494.500,19.095,123.713,0.020,...,1.0,46.2736,9.479010e+08,2510.0,251020.0,25.0,25102010.0,2020-12-21,2021-12-31,False


In [42]:
macro = ["CPI", "FedFundsTargetRate", "GDP", "MedianHomeSalesPrice", 
         "NonFarmPayrolls", "PMI", "PPI", "PrivateHousingStarts", "Unemployment"]

for ratio in macro:
    readname = "MacroData/df_US_" + ratio + ".pkl"
    df_temp = pd.read_pickle(readname)
    if ratio == "PPI":   #PPI had different format
        df_temp = df_temp[1:]
        df_temp.columns = ["year", "m", "month", "PPI"]
        df_temp = df_temp[["month", "PPI"]]
        df_temp["month"] = pd.to_datetime(df_temp["month"]).dt.to_period('M')
    elif ratio == "GDP":
        df_temp.columns = ["date", ratio]
        df_temp["date"] = pd.to_datetime(df_temp["date"])   #format date
        df_temp["month"] = df_temp['date'].dt.to_period('M')   #nearest month
        df_temp = df_temp[["month", ratio]]
        df_temp = df_temp.set_index('month').resample('M').interpolate().reset_index()
    else:
        df_temp.columns = ["date", ratio]
        df_temp["date"] = pd.to_datetime(df_temp["date"])   #format date
        df_temp["month"] = df_temp['date'].dt.to_period('M')   #nearest month
        df_temp = df_temp[["month", ratio]]
    df_Merge = df_Merge.merge(df_temp, on = ["month"], how = "left")

df_Merge.to_pickle("df_Merge.pkl")
df_Merge

Unnamed: 0,TICKER,month,gvkey,permno,bm,evm,pe_exi,ps,pcf,npm,...,is_member,CPI,FedFundsTargetRate,GDP,MedianHomeSalesPrice,NonFarmPayrolls,PMI,PPI,PrivateHousingStarts,Unemployment
0,ORCL,2000-01,12142,10104,0.037,23.669,25.486,15.247,72.985,0.155,...,True,2.74,5.500000,1.290394e+07,163500,131005000,56.7,128.3,1.94,4.0
1,ORCL,2000-02,12142,10104,0.037,23.669,37.883,22.592,108.144,0.155,...,True,3.22,5.741379,1.291960e+07,162400,131124000,55.8,129.8,2.03,4.1
2,ORCL,2000-03,12142,10104,0.037,23.669,39.828,23.752,113.696,0.155,...,True,3.76,5.838710,1.293525e+07,165100,131596000,54.9,130.8,1.89,4.0
3,ORCL,2000-04,12142,10104,0.020,34.204,62.451,23.393,91.705,0.197,...,True,3.07,6.000000,1.301375e+07,162600,131888000,54.7,130.7,1.90,3.8
4,ORCL,2000-05,12142,10104,0.020,34.204,56.152,20.805,81.560,0.197,...,True,3.19,6.258065,1.309225e+07,164700,132105000,53.2,131.6,1.84,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172771,TSLA,2020-08,184996,93436,0.049,35.254,258.197,18.062,171.660,0.014,...,False,1.31,0.250000,1.812658e+07,325500,141149000,55.6,194.3,1.47,8.4
172772,TSLA,2020-09,184996,93436,0.049,35.254,222.285,15.820,150.352,0.014,...,False,1.37,0.250000,1.856077e+07,344400,141865000,55.7,195.5,1.54,7.9
172773,TSLA,2020-10,184996,93436,0.049,35.254,201.057,14.308,135.979,0.014,...,False,1.18,0.250000,1.862978e+07,346900,142545000,58.8,196.5,1.61,6.9
172774,TSLA,2020-11,184996,93436,0.039,53.275,494.500,19.095,123.713,0.020,...,False,1.17,0.250000,1.869878e+07,350800,142809000,57.7,198.3,1.65,6.7


In [43]:
(df_Merge.count()/len(df_Merge)).sort_values(ascending = False)

TICKER                  1.000000
gsubind                 1.000000
MEDPTG                  1.000000
PTGHIGH                 1.000000
month                   1.000000
CURR                    1.000000
CUSIP                   1.000000
ggroup                  1.000000
gind                    1.000000
gsector                 1.000000
is_member               1.000000
NUMDOWN1M               1.000000
CPI                     1.000000
FedFundsTargetRate      1.000000
GDP                     1.000000
MedianHomeSalesPrice    1.000000
NonFarmPayrolls         1.000000
PMI                     1.000000
PPI                     1.000000
PrivateHousingStarts    1.000000
MEANPTG                 1.000000
PTGLOW                  1.000000
NUMUP1M                 1.000000
CNAME                   1.000000
gvkey                   1.000000
permno                  1.000000
NUMDOWN4W               1.000000
Unemployment            1.000000
NUMEST                  1.000000
NUMUP4W                 1.000000
ajexm     

# 7) Format data for ML
- something wrong with prices here!

In [44]:
df_all = copy(df_Merge)
# returns
df_all["mcap"] = df_all["prccm"] * df_all["cshom"]
df_all["price_adjusted"] = df_all["prccm"] /df_all["ajexm"]

#melt, then pivot to create price df
prices = df_all.melt(id_vars = ["TICKER", "month"], value_vars = "price_adjusted", var_name = "price_adjusted")
prices = prices.pivot_table(values = "value", index = "month", columns = "TICKER")

#rets over diff periods
periods = [-1, 1,3,6,9,12]
label = ["pred_target","return_1M","return_3M","return_6M","return_9M","return_12M"]
for j in range(len(periods)):
    if j> 0:
        i = periods[j]
        ret = prices/prices.shift(i) - 1
        df_melt = ret.melt(var_name = "TICKER", value_name = label[j], ignore_index = False)
        df_all = df_all.merge(df_melt, on = ["TICKER", "month"],how = "left")
    else:
        ret = (prices/prices.shift() - 1).shift(-1)
        df_melt = ret.melt(var_name = "TICKER", value_name = label[j], ignore_index = False)
        df_all = df_all.merge(df_melt, on = ["TICKER", "month"],how = "left")
        
df_all

Unnamed: 0,TICKER,month,gvkey,permno,bm,evm,pe_exi,ps,pcf,npm,...,PrivateHousingStarts,Unemployment,mcap,price_adjusted,pred_target,return_1M,return_3M,return_6M,return_9M,return_12M
0,ORCL,2000-01,12142,10104,0.037,23.669,25.486,15.247,72.985,0.155,...,1.94,4.0,1.409441e+11,24.97655,0.486394,,,,,
1,ORCL,2000-02,12142,10104,0.037,23.669,37.883,22.592,108.144,0.155,...,2.03,4.1,2.094985e+11,37.12500,0.051347,0.486394,,,,
2,ORCL,2000-03,12142,10104,0.037,23.669,39.828,23.752,113.696,0.155,...,1.89,4.0,2.202556e+11,39.03125,0.024019,0.051347,,,,
3,ORCL,2000-04,12142,10104,0.020,34.204,62.451,23.393,91.705,0.197,...,1.90,3.8,2.268953e+11,39.96875,-0.100860,0.024019,0.600251,,,
4,ORCL,2000-05,12142,10104,0.020,34.204,56.152,20.805,81.560,0.197,...,1.84,4.0,2.040106e+11,35.93750,0.169565,-0.100860,-0.031987,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172771,TSLA,2020-08,184996,93436,0.049,35.254,258.197,18.062,171.660,0.014,...,1.47,8.4,4.643396e+11,498.32000,-0.139087,0.741452,1.983952,2.729996,6.551676,10.043837
172772,TSLA,2020-09,184996,93436,0.049,35.254,222.285,15.820,150.352,0.014,...,1.54,7.9,4.004980e+11,429.01000,-0.095499,-0.139087,0.986507,3.093607,4.127650,7.905426
172773,TSLA,2020-10,184996,93436,0.049,35.254,201.057,14.308,135.979,0.014,...,1.61,6.9,3.678235e+11,388.04000,0.462736,-0.095499,0.356063,1.481455,1.982308,5.160930
172774,TSLA,2020-11,184996,93436,0.039,53.275,494.500,19.095,123.713,0.020,...,1.65,6.7,5.380286e+11,567.60000,0.243252,0.462736,0.139027,2.398802,3.248567,7.601564


In [45]:
df_nn = copy(df_all)
#Convert indices to 0-2
df_nn["PMI"] = df_nn["PMI"]/100
df_nn["PPI"] = df_nn["PPI"].astype(float)/100

#one hot encode sectors
df_nn["gsector"] = df_nn["gsector"]/5 - 2

#growth measures
df_nn.rename(columns={'cshtrm':'volume'}, inplace=True)
growth_measures = ["GDP", "MedianHomeSalesPrice", "PrivateHousingStarts","NonFarmPayrolls", "volume"]
for ratio in growth_measures:
    if ratio == "volume":  #no need for percentage. Normalise to a fraction
        df_nn[ratio] = df_nn.groupby("TICKER")[ratio].apply(lambda x: (x.diff()[1:]/x.shift(1)))
    else:
        df_nn[ratio] = df_nn.groupby("TICKER")[ratio].apply(lambda x: (100* x.diff()[1:]/x.shift(1)))

# medptg - adjusted to a return: slightly weird denominator for normalisation
df_nn["MEDPTG"] = np.exp((df_nn["MEDPTG"] - df_nn["price_adjusted"])/df_nn["MEDPTG"])

# convert analyst up/down to percentage
df_nn["PCTUP4W"] = df_nn["NUMUP4W"]/df_nn["NUMEST"]   #use 4 weeks since there are granularity issues with 1M (e.g. NUMDOWN1M > NUMEST for index 180834)
df_nn["PCTDOWN4W"] = df_nn["NUMDOWN4W"]/df_nn["NUMEST"]

#remove first 12 months & last month
df_nn = df_nn[(df_nn["month"]>= "2001-01") & (df_nn["month"]<= "2020-11")].reset_index(drop = True)

#remove index non-members
#df_nn = df_nn[df_nn["is_member"] == True]

#remove unnecessary columns
valid_cols = ["TICKER", "month", "pred_target","price_adjusted","mcap",
              "debt_assets", "de_ratio", "evm", "pe_exi", 
              "roe", "npm","ps", "ptb", "pcf", "aftret_invcapx",
              "CPI", "PMI", "PPI", "FedFundsTargetRate", "GDP", 
              "MedianHomeSalesPrice", "PrivateHousingStarts","NonFarmPayrolls", "Unemployment",
            "MEDPTG", "PCTUP4W", "PCTDOWN4W", "volume",
             "return_1M", "return_3M", "return_6M", "return_9M", "return_12M"]

df_nn = df_nn[valid_cols]
#fill nans
df_nn.iloc[:,2:]=df_nn.iloc[:,2:].replace(np.nan, df_nn.iloc[:,2:].mean(axis = 0))

#save
df_nn.to_pickle("df_NeuralNetworkFeatures.pkl")
df_nn



Unnamed: 0,TICKER,month,pred_target,price_adjusted,mcap,debt_assets,de_ratio,evm,pe_exi,roe,...,Unemployment,MEDPTG,PCTUP4W,PCTDOWN4W,volume,return_1M,return_3M,return_6M,return_9M,return_12M
0,ORCL,2001-01,-0.347639,29.125,1.630257e+11,0.466,0.874,49.313,25.774,1.350,...,4.2,1.423015,0.000000,0.043478,0.120038,0.002151,-0.117424,-0.225270,-0.271306,0.166094
1,ORCL,2001-02,-0.211579,19.000,1.063515e+11,0.466,0.874,49.313,16.814,1.350,...,4.2,1.782074,0.000000,0.000000,-0.105515,-0.347639,-0.283019,-0.582131,-0.471304,-0.488215
2,ORCL,2001-03,0.078772,14.980,8.384977e+10,0.466,0.874,49.313,13.257,1.350,...,4.3,1.869180,0.000000,0.086957,0.549890,-0.211579,-0.484559,-0.619556,-0.643599,-0.616205
3,ORCL,2001-04,-0.053218,16.160,9.071537e+10,0.461,0.855,42.100,14.624,1.210,...,4.4,1.786502,0.000000,0.000000,-0.207634,0.078772,-0.445150,-0.510303,-0.570141,-0.595684
4,ORCL,2001-05,0.241830,15.300,8.588770e+10,0.461,0.855,42.100,13.846,1.210,...,4.3,1.733253,0.000000,0.100000,-0.074699,-0.053218,-0.194737,-0.422642,-0.663505,-0.574261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164283,TSLA,2020-07,0.741452,286.152,2.666393e+11,0.755,3.732,26.151,-287.000,-0.025,...,10.2,1.863100,0.533333,0.000000,0.475571,0.325011,0.829897,1.199241,3.543249,4.921775
164284,TSLA,2020-08,-0.139087,498.320,4.643396e+11,0.736,3.310,35.254,258.197,0.054,...,8.4,1.921954,0.357143,0.000000,3.115364,0.741452,1.983952,2.729996,6.551676,10.043837
164285,TSLA,2020-09,-0.095499,429.010,4.004980e+11,0.736,3.310,35.254,222.285,0.054,...,7.9,0.665917,0.571429,0.000000,0.103446,-0.139087,0.986507,3.093607,4.127650,7.905426
164286,TSLA,2020-10,0.462736,388.040,3.678235e+11,0.736,3.310,35.254,201.057,0.054,...,6.9,0.745674,0.310345,0.068966,-0.515345,-0.095499,0.356063,1.481455,1.982308,5.160930
