In [1]:
"""
filename: estimize_signals_eadates.ipynb

Get earnings announcement (EA) dates, actualized

Inputs: raw data CSV files sent over privately in a URL on 4/12/2022
These are 
-combined_estimates_new.csv
-combined_consensus_new.csv
-signals ZIP file

This particular EA processing file uses signals time-series csv. 

Output: estimize_signals_eadates.parquet

NOTE: hcusip is the 8-character CUSIP (the first 8 characters of hcusip9), and hcusip9 is the full 9-character CUSIP. 
This naming is standardized across all data feeds going forward. 

"""
# Imports here 
import pandas as pd 
from datetime import date, timedelta, datetime, time
from dateutil import relativedelta
import time
from optparse import OptionParser
import numpy as np
import sqlite3
import pytz
import requests

# Locations: everything hangs off the OneDrive root.
# out_pth is relative to that root; raw_pth is already a full path.
onedrive = "C://Users/clj585/OneDrive - Northwestern University/"
raw_pth = onedrive + "data_feeds/estimize/data/Estimize_CSVS/"
out_pth = "data_feeds/estimize/data/"

# Read the raw signal time-series file and show it
signal_csv = raw_pth + "signal_time_series.csv"
sig = pd.read_csv(signal_csv)
sig

Unnamed: 0,ticker,cusip,fiscal_date,reports_at,as_of,type,signal
0,SCHN,806882106,Q1 2012,2012-01-09T06:00:00-05:00,2012-01-02T07:00:00-05:00,pre,-74.246120
1,MOS,61945C103,Q4 2011,2012-01-04T16:00:00-05:00,2012-01-02T07:00:00-05:00,pre,6.414791
2,SHLM,808194104,Q1 2012,2012-01-05T16:00:00-05:00,2012-01-02T07:00:00-05:00,pre,-8.154923
3,MON,61166W101,Q1 2012,2012-01-05T06:00:00-05:00,2012-01-02T07:00:00-05:00,pre,-19.214873
4,CMC,201723103,Q1 2012,2012-01-06T06:00:00-05:00,2012-01-02T07:00:00-05:00,pre,-51.218437
...,...,...,...,...,...,...,...
1999742,ZVO,98979V102,Q4 2021,2022-03-29T16:00:00-04:00,2022-03-31T14:00:00-04:00,post,-52.937729
1999743,KMX,143130102,Q4 2022,2022-04-12T06:00:00-04:00,2022-03-31T14:00:00-04:00,pre,-2.254750
1999744,PLAY,238337109,Q4 2021,2022-03-28T16:00:00-04:00,2022-03-31T14:00:00-04:00,post,-19.726969
1999745,RH,74967X103,Q4 2021,2022-03-29T16:00:00-04:00,2022-03-31T14:00:00-04:00,post,-10.345937


In [2]:
"""
Test for the point-in-time stuff using typical HNZ and 
KHC tickers examples. 

"""
sig[sig['ticker']=="HNZ"]

Unnamed: 0,ticker,cusip,fiscal_date,reports_at,as_of,type,signal
15426,HNZ,423074AL7,Q3 2012,2012-02-17T06:00:00-05:00,2012-02-03T07:00:00-05:00,pre,-20.923058
15789,HNZ,423074AL7,Q3 2012,2012-02-17T06:00:00-05:00,2012-02-03T14:00:00-05:00,pre,-20.923058
16270,HNZ,423074AL7,Q3 2012,2012-02-17T06:00:00-05:00,2012-02-06T07:00:00-05:00,pre,-20.987815
16819,HNZ,423074AL7,Q3 2012,2012-02-17T06:00:00-05:00,2012-02-06T14:00:00-05:00,pre,-20.987815
17324,HNZ,423074AL7,Q3 2012,2012-02-17T06:00:00-05:00,2012-02-07T07:00:00-05:00,pre,-21.868299
...,...,...,...,...,...,...,...
174859,HNZ,423074AL7,Q3 2013,2013-02-21T17:10:38-05:00,2013-02-19T14:00:00-05:00,pre,-30.644268
175837,HNZ,423074AL7,Q3 2013,2013-02-21T17:10:38-05:00,2013-02-20T07:00:00-05:00,pre,-31.060862
176442,HNZ,423074AL7,Q3 2013,2013-02-21T17:10:38-05:00,2013-02-20T14:00:00-05:00,pre,-31.060862
176832,HNZ,423074AL7,Q3 2013,2013-02-21T17:10:38-05:00,2013-02-21T07:00:00-05:00,pre,-34.237829


In [3]:
# Same spot check for the KHC rows
sig.loc[sig['ticker'].eq("KHC")]

Unnamed: 0,ticker,cusip,fiscal_date,reports_at,as_of,type,signal
99299,KHC,500754106,Q2 2012,2012-08-02T16:00:00-04:00,2012-08-03T07:00:00-04:00,post,-27.020613
99837,KHC,500754106,Q2 2012,2012-08-02T16:00:00-04:00,2012-08-03T14:00:00-04:00,post,-27.020613
100259,KHC,500754106,Q2 2012,2012-08-02T16:00:00-04:00,2012-08-06T07:00:00-04:00,post,-28.198453
100468,KHC,500754106,Q2 2012,2012-08-02T16:00:00-04:00,2012-08-06T14:00:00-04:00,post,-28.198453
101015,KHC,500754106,Q2 2012,2012-08-02T16:00:00-04:00,2012-08-07T07:00:00-04:00,post,-28.198453
...,...,...,...,...,...,...,...
1982889,KHC,500754106,Q4 2021,2022-02-16T06:00:00-05:00,2022-02-17T14:00:00-05:00,post,26.392473
1984000,KHC,500754106,Q4 2021,2022-02-16T06:00:00-05:00,2022-02-18T07:00:00-05:00,post,26.389688
1984060,KHC,500754106,Q4 2021,2022-02-16T06:00:00-05:00,2022-02-18T14:00:00-05:00,post,26.389688
1985528,KHC,500754106,Q4 2021,2022-02-16T06:00:00-05:00,2022-02-21T07:00:00-05:00,post,26.389688


In [4]:
"""
Rename appropriate columns, transform data and get date and time information 
into columns, and get other fiscal calendar info. 

"""

# Keep and rename columns 
colmap = {"ticker": "hticker", "cusip": "hcusip9", 
         "reports_at" : "ea_datetime", "as_of":"datetime"}
sig = sig.rename(columns = colmap)

# Change dtypes 
sig[['hticker','hcusip9', 'fiscal_date', 
     'type', 'ea_datetime', "datetime"]] = sig[['hticker','hcusip9', 'fiscal_date', 
                                                'type', 'ea_datetime', "datetime"]].astype("string")

# Fiscal calendar information 
sig['fyear'] = (sig['fiscal_date'].str[3:7]).astype("Int64")
sig['fqtr'] = sig['fiscal_date'].str[1:2]

# Get date column (as of) and actual earn announce date, approximate time pull out 
sig['date'] = pd.to_datetime(sig['datetime'].str[0:10])
sig['ea_date'] = pd.to_datetime(sig['ea_datetime'].str[0:10])
sig['ea_time'] = (pd.to_datetime(sig['ea_datetime'].str[11:19])).dt.time#strftime("%H:%M:%S")

sig = sig[['date', 'hticker', 'hcusip9', 'ea_datetime', 'fyear', 
           'fqtr', "ea_date", "ea_time", 'type', 'signal']]
sig

Unnamed: 0,date,hticker,hcusip9,ea_datetime,fyear,fqtr,ea_date,ea_time,type,signal
0,2012-01-02,SCHN,806882106,2012-01-09T06:00:00-05:00,2012,1,2012-01-09,06:00:00,pre,-74.246120
1,2012-01-02,MOS,61945C103,2012-01-04T16:00:00-05:00,2011,4,2012-01-04,16:00:00,pre,6.414791
2,2012-01-02,SHLM,808194104,2012-01-05T16:00:00-05:00,2012,1,2012-01-05,16:00:00,pre,-8.154923
3,2012-01-02,MON,61166W101,2012-01-05T06:00:00-05:00,2012,1,2012-01-05,06:00:00,pre,-19.214873
4,2012-01-02,CMC,201723103,2012-01-06T06:00:00-05:00,2012,1,2012-01-06,06:00:00,pre,-51.218437
...,...,...,...,...,...,...,...,...,...,...
1999742,2022-03-31,ZVO,98979V102,2022-03-29T16:00:00-04:00,2021,4,2022-03-29,16:00:00,post,-52.937729
1999743,2022-03-31,KMX,143130102,2022-04-12T06:00:00-04:00,2022,4,2022-04-12,06:00:00,pre,-2.254750
1999744,2022-03-31,PLAY,238337109,2022-03-28T16:00:00-04:00,2021,4,2022-03-28,16:00:00,post,-19.726969
1999745,2022-03-31,RH,74967X103,2022-03-29T16:00:00-04:00,2021,4,2022-03-29,16:00:00,post,-10.345937


In [5]:
"""
Ticker nonsense again... especially the problem is dealing
with chaotic "defunct" flags, in caps, not capitalized, with
and without dashes, before and after ticker... 

"""
sig[sig['hticker'].str.contains("DEFUNCT")]

# Create this defunct flag on those that have it... save the info here in this column 
sig['defunct_flag'] = np.where(sig['hticker'].str.contains("DEFUNCT", case=False), 1, 0)

# Column kept track of original haphazard marking by estimize, can cut from ticker
sig['hticker'] = np.where(sig['defunct_flag']==1, 
                          sig['hticker'].str.replace(' - DEFUNCT', ''), 
                          sig['hticker'])

# Defunct in this file either marked with "DEFUNCT" or " - DEFUNCT"... 
sig['hticker'] = np.where(sig['hticker'].str.contains("DEFUNCT - ", case = False), 
                          sig['hticker'].str.replace('DEFUNCT - ', ''), 
                          sig['hticker'])

sig['hticker'] = np.where(sig['hticker'].str.contains("-defunct", case = False), 
                          sig['hticker'].str.replace('-defunct', ''), 
                          sig['hticker'])

sig['hticker'] = np.where(sig['hticker'].str.contains("-Defunct"), 
                          sig['hticker'].str.replace('-Defunct', ''), 
                          sig['hticker'])

sig['hticker'] = np.where(sig['hticker'].str.contains(" - Defunct", case = False), 
                          sig['hticker'].str.replace(' - Defunct', ''), 
                          sig['hticker'])

sig['hticker'] = np.where(sig['hticker'].str.contains(" - defunct", case = False), 
                          sig['hticker'].str.replace(' - defunct', ''), 
                          sig['hticker'])

sig['hticker'] = np.where(sig['hticker'].str.contains("DEFUNCT", case=False), 
                          sig['hticker'].str.replace('DEFUNCT', ''), 
                          sig['hticker'])

# Now can break apart into root and suffix 
sig[["root", 'suffix']] = sig['hticker'].str.split("-", n=1, expand = True)
sig['suffix'] = sig['suffix'].fillna("")

sig[['suffix', 'root', 'hticker']] = sig[['suffix', 'root', 'hticker']].astype("string")
sig

Unnamed: 0,date,hticker,hcusip9,ea_datetime,fyear,fqtr,ea_date,ea_time,type,signal,defunct_flag,root,suffix
0,2012-01-02,SCHN,806882106,2012-01-09T06:00:00-05:00,2012,1,2012-01-09,06:00:00,pre,-74.246120,0,SCHN,
1,2012-01-02,MOS,61945C103,2012-01-04T16:00:00-05:00,2011,4,2012-01-04,16:00:00,pre,6.414791,0,MOS,
2,2012-01-02,SHLM,808194104,2012-01-05T16:00:00-05:00,2012,1,2012-01-05,16:00:00,pre,-8.154923,0,SHLM,
3,2012-01-02,MON,61166W101,2012-01-05T06:00:00-05:00,2012,1,2012-01-05,06:00:00,pre,-19.214873,0,MON,
4,2012-01-02,CMC,201723103,2012-01-06T06:00:00-05:00,2012,1,2012-01-06,06:00:00,pre,-51.218437,0,CMC,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999742,2022-03-31,ZVO,98979V102,2022-03-29T16:00:00-04:00,2021,4,2022-03-29,16:00:00,post,-52.937729,0,ZVO,
1999743,2022-03-31,KMX,143130102,2022-04-12T06:00:00-04:00,2022,4,2022-04-12,06:00:00,pre,-2.254750,0,KMX,
1999744,2022-03-31,PLAY,238337109,2022-03-28T16:00:00-04:00,2021,4,2022-03-28,16:00:00,post,-19.726969,0,PLAY,
1999745,2022-03-31,RH,74967X103,2022-03-29T16:00:00-04:00,2021,4,2022-03-29,16:00:00,post,-10.345937,0,RH,


In [6]:
"""
Create before trade hours and after trading hours flag. 

"""
# Time of day flag/multi-level indicator - no missing time information! 
sig['time_of_day'] = np.where(pd.to_datetime(sig['ea_time'].astype("string"))<"09:30", 
                              "PO", 
                              np.where(pd.to_datetime(sig['ea_time'].astype("string"))>="16:00", 
                                       "PC", "RH"))

# Further get rid of stray dashes in the hticker column (if no suffix)
sig['hticker'] = np.where((sig['hticker'].str.contains("-")) & (sig['suffix'] == ""), 
                         sig['hticker'].str.replace("-", ""), sig['hticker'])
sig

Unnamed: 0,date,hticker,hcusip9,ea_datetime,fyear,fqtr,ea_date,ea_time,type,signal,defunct_flag,root,suffix,time_of_day
0,2012-01-02,SCHN,806882106,2012-01-09T06:00:00-05:00,2012,1,2012-01-09,06:00:00,pre,-74.246120,0,SCHN,,PO
1,2012-01-02,MOS,61945C103,2012-01-04T16:00:00-05:00,2011,4,2012-01-04,16:00:00,pre,6.414791,0,MOS,,PC
2,2012-01-02,SHLM,808194104,2012-01-05T16:00:00-05:00,2012,1,2012-01-05,16:00:00,pre,-8.154923,0,SHLM,,PC
3,2012-01-02,MON,61166W101,2012-01-05T06:00:00-05:00,2012,1,2012-01-05,06:00:00,pre,-19.214873,0,MON,,PO
4,2012-01-02,CMC,201723103,2012-01-06T06:00:00-05:00,2012,1,2012-01-06,06:00:00,pre,-51.218437,0,CMC,,PO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999742,2022-03-31,ZVO,98979V102,2022-03-29T16:00:00-04:00,2021,4,2022-03-29,16:00:00,post,-52.937729,0,ZVO,,PC
1999743,2022-03-31,KMX,143130102,2022-04-12T06:00:00-04:00,2022,4,2022-04-12,06:00:00,pre,-2.254750,0,KMX,,PO
1999744,2022-03-31,PLAY,238337109,2022-03-28T16:00:00-04:00,2021,4,2022-03-28,16:00:00,post,-19.726969,0,PLAY,,PC
1999745,2022-03-31,RH,74967X103,2022-03-29T16:00:00-04:00,2021,4,2022-03-29,16:00:00,post,-10.345937,0,RH,,PC


In [7]:
"""
Export out the sorted table

"""
# Sort in case 
sig = sig.sort_values(by = ['hticker', 'ea_date', 'date'])

# Create another hcusip column, this time is the 8-digit one
sig['hcusip'] = sig['hcusip9'].str[0:8]

# Export out to parquet 
sig.to_parquet(onedrive + out_pth + "estimize_signals_eadates.parquet")
sig

Unnamed: 0,date,hticker,hcusip9,ea_datetime,fyear,fqtr,ea_date,ea_time,type,signal,defunct_flag,root,suffix,time_of_day,hcusip
13470,2012-02-01,A,00846U101,2012-02-15T16:00:00-05:00,2012,1,2012-02-15,16:00:00,pre,65.204644,0,A,,PC,00846U10
13848,2012-02-01,A,00846U101,2012-02-15T16:00:00-05:00,2012,1,2012-02-15,16:00:00,pre,65.204644,0,A,,PC,00846U10
14552,2012-02-02,A,00846U101,2012-02-15T16:00:00-05:00,2012,1,2012-02-15,16:00:00,pre,65.514943,0,A,,PC,00846U10
14829,2012-02-02,A,00846U101,2012-02-15T16:00:00-05:00,2012,1,2012-02-15,16:00:00,pre,65.514943,0,A,,PC,00846U10
15643,2012-02-03,A,00846U101,2012-02-15T16:00:00-05:00,2012,1,2012-02-15,16:00:00,pre,65.727105,0,A,,PC,00846U10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1944321,2021-11-16,ZYNE,98986X109,2021-11-15T06:00:00-05:00,2021,3,2021-11-15,06:00:00,post,-4.063074,0,ZYNE,,PO,98986X10
1944532,2021-11-17,ZYNE,98986X109,2021-11-15T06:00:00-05:00,2021,3,2021-11-15,06:00:00,post,-4.063074,0,ZYNE,,PO,98986X10
1944630,2021-11-17,ZYNE,98986X109,2021-11-15T06:00:00-05:00,2021,3,2021-11-15,06:00:00,post,-4.063074,0,ZYNE,,PO,98986X10
1944917,2021-11-18,ZYNE,98986X109,2021-11-15T06:00:00-05:00,2021,3,2021-11-15,06:00:00,post,-4.063074,0,ZYNE,,PO,98986X10


In [None]:
"""
(Code ended at above cell).

OPTIONAL: 
Run extensive checks here (as necessary for checking/debug). 

"""

sig[sig['hticker'].str.contains("-")]
sig[sig['hticker'].str.contains("defunct", case=False)]
sig[sig['root'].str.contains("defunct", case=False)]
sig[sig['suffix'].str.contains("defunct", case=False)]
sig[(sig['hticker'].str.contains("-")) & (sig['suffix']=="")]
sig[sig['hticker'].str.contains("-")].drop_duplicates(subset="hticker")
sig[sig['defunct_flag']==1]

In [None]:
# Any ticker still carrying a "defunct" tag (any casing)?
sig.loc[sig['hticker'].str.contains("defunct", case=False)]

In [None]:
# Any root still carrying a "defunct" tag (any casing)?
sig.loc[sig['root'].str.contains("defunct", case=False)]

In [None]:
# Any suffix still carrying a "defunct" tag (any casing)?
sig.loc[sig['suffix'].str.contains("defunct", case=False)]

In [None]:
# Tickers that contain a dash even though their suffix is empty
sig.loc[sig['hticker'].str.contains("-") & sig['suffix'].eq("")]

In [None]:
# One row per distinct ticker that contains a dash
sig.loc[sig['hticker'].str.contains("-")].drop_duplicates(subset="hticker")

In [None]:
# All rows flagged as defunct
sig.loc[sig['defunct_flag'].eq(1)]