In [184]:
# -*- coding: utf-8 -*-
"""
1_wsh_build_ranges.ipynb

Build date ranges for identifying tuple using the datebreaks data from FTP sources and 
historically collected data in Dropbox/OneDrive, broken down by event type. 

Output: wsh_names.parquet file 

@author: clj585
"""

# Imports and setup 
import pandas as pd
import os, glob
import datatable as dt
from datetime import datetime, timedelta
import zipfile
from datatable import Frame
import pyarrow.parquet as pq
import numpy as np 
import pysftp                # This is not FTP! This is Python's equivalent to Cyberduck 
import fnmatch               # Import for file matching certain pattern/Regex

# Local directory for the WSH parquet files (the SFTP cell below downloads into it).
# Written as a raw string: the previous literal only worked because Python passes
# unknown escapes such as "\c", "\D" and "\W" through unchanged, which newer
# interpreters warn about. The resulting value is identical.
wsh_pth = r"C:\Users\clj585\Dropbox\WSH\DB"

# Bare file names of the parquet files already present in wsh_pth.
# Globbing inside wsh_pth (rather than the kernel's current directory) keeps the
# "what is still missing?" comparison against the remote listing correct no
# matter where the notebook was started from.
local_list = [os.path.basename(found) for found in glob.glob(os.path.join(wsh_pth, "*.parquet"))]

## IGNORE ##
# Test sample ED (earnings identified) file 
#filepth = 'C://Users/clj585/Downloads/file_1-3-2022_1-4-2022_ED.parquet'
#blah = pd.read_parquet(filepth)

In [185]:
"""
SFTP (not FTP) into the KLC directory with the WSH automated/populated files and
download every "*DB.parquet" file whose name is not already in local_list.
"""
import getpass  # local import: only this cell needs it

HOST = "klc.northwestern.edu"
USER = "clj585"                              # ToDo: fill in own credentials!

# The password is deliberately never written into the notebook. Reuse PASS when
# an earlier cell defined it; otherwise prompt for it instead of failing with a
# NameError further down.
PASS = globals().get("PASS") or getpass.getpass("SFTP password for {}@{}: ".format(USER, HOST))

# sftp.get() without a local path writes into the current working directory.
os.chdir(wsh_pth)

with pysftp.Connection(HOST, username=USER, password=PASS) as sftp:
    with sftp.cd('/kellogg/data/wall_street_horizon/parquet'):
        kis_list = sftp.listdir()

        # Remote names that are not present locally; sorted so that the download
        # (and the printed log) follows a stable order between runs.
        dl_list = sorted(set(kis_list) - set(local_list))

        # Only the "DB" parquet files are fetched; anything else is skipped.
        for filename in dl_list:
            if fnmatch.fnmatch(filename, "*DB.parquet"):
                print(filename)
                sftp.get(filename)         # get a remote file
# No explicit sftp.close() here: leaving the `with` block closes the connection.

In [186]:
"""
Join parquet files populated from SFTP with the original data held in Dropbox.

Reads every "db*.csv" file in hist_pth and every "*.parquet" file in wsh_pth and
stacks them into a single frame named `db`.
"""

# Raw string: same value as before, without relying on invalid escape sequences
# ("\c", "\D", "\W", "\h") being passed through unchanged.
hist_pth = r"C:\Users\clj585\Dropbox\WSH\hist_20210902"

# Historical snapshot files; sorted so the concatenation order is reproducible.
os.chdir(hist_pth)
db_hist_files = sorted(glob.glob("db*.csv"))
db_list_dfs = [pd.read_csv(os.path.join(hist_pth, csv_name)) for csv_name in db_hist_files]

# Parquet files in the local WSH directory. The working directory is left at
# wsh_pth afterwards, exactly as before.
os.chdir(wsh_pth)
db_list_dfs.extend(
    pd.read_parquet(os.path.join(wsh_pth, parquet_name))
    for parquet_name in sorted(glob.glob("*.parquet"))
)

# Row labels are kept as they come from each source file (no ignore_index), so
# the resulting index contains repeated labels.
db = pd.concat(db_list_dfs)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [187]:
# Preview the combined frame (historical CSV files plus the parquet files).
db

Unnamed: 0,event_id,company_id,stock_symbol,company_name,status,record_status,stock_exchange,quarter,fiscal_year,echangetype,...,option_expiration_code,option_expiration_date,isin,filing_due_date,confidence_indicator,prelim_earnings_date,change_reason,announcement_url,prior_earnings_date,return_time
0,3VAF0HL6,1228,HXL,Hexcel,ACTIVE,PREV,NYSE,Q4,2005.0,NQ,...,,,,,,,,,,
1,3VAF2659,11728,TMI,"Team, Inc.",ACTIVE,PREV,NYSE,Q2,2006.0,NQ,...,,,,,,,,,,
2,3VAF2XGP,1431682492,DPZ,,ACTIVE,PREV,NYSE,Q4,2005.0,NQ,...,,,,,,,,,,
3,3VAF2XGP,1431682492,DPZ,,ACTIVE,PREV,NYSE,Q4,2005.0,DTT,...,,,,,,,,,,
4,3VAF34B1,1431682825,MPW,Medical Properties Trust Inc.,ACTIVE,PREV,NYSE,Q4,2005.0,NQ,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,432U77BV,1431684820,CA:TRZ,Transat A.T. Inc.,,,,Q4,2021,NQ,...,,,CA89351T4019,,,,,,12/09/2021,01/09/2022 11:24:34 AM
289,42XQ83IF,1431686569,BRX,Brixmor Property Group Inc.,,,,Q3,2021,DTV,...,,,US11120U1051,,,,,,11/03/2021,01/09/2022 11:24:34 AM
290,4IQK1TUV,1431699194,CADL,"Candel Therapeutics, Inc.",,,,Q3,2021,NQ,...,,,US1374041093,,,,,,12/02/2021,01/09/2022 11:24:34 AM
291,4IQK1TUV,1431699194,CADL,"Candel Therapeutics, Inc.",,,,Q3,2021,DTT,...,,,US1374041093,,,,,,12/02/2021,01/09/2022 11:24:34 AM


In [188]:
# List every available column; the following cell keeps only the identifier
# and time-information columns.
db.columns

Index(['event_id', 'company_id', 'stock_symbol', 'company_name', 'status',
       'record_status', 'stock_exchange', 'quarter', 'fiscal_year',
       'echangetype', 'earnings_date', 'time_of_day', 'earnings_date_status',
       'audit_source', 'disclaimer', 'same_store_sales', 'created', 'updated',
       'quarter_end_date', 'total_days_changed', 'confirmed_date_zscore',
       'option_expiration_code', 'option_expiration_date', 'isin',
       'filing_due_date', 'confidence_indicator', 'prelim_earnings_date',
       'change_reason', 'announcement_url', 'prior_earnings_date',
       'return_time'],
      dtype='object')

In [189]:
# Narrow the frame down to the identifier and timestamp columns used below.
keep_cols = [
    'company_id', 'stock_symbol', 'isin', 'company_name', 'fiscal_year',
    'earnings_date', "created", 'updated', "return_time",
]
db = db[keep_cols]
db

Unnamed: 0,company_id,stock_symbol,isin,company_name,fiscal_year,earnings_date,created,updated,return_time
0,1228,HXL,,Hexcel,2005.0,01/12/2006,03/30/2005 09:08:29 AM,01/02/2006 12:00:00 AM,
1,11728,TMI,,"Team, Inc.",2006.0,01/05/2006,01/02/2006 12:00:00 AM,01/02/2006 12:00:00 AM,
2,1431682492,DPZ,,,2005.0,01/02/2006,01/02/2006 07:21:38 AM,01/02/2006 07:21:38 AM,
3,1431682492,DPZ,,,2005.0,02/09/2006,01/02/2006 07:21:38 AM,01/02/2006 07:21:39 AM,
4,1431682825,MPW,,Medical Properties Trust Inc.,2005.0,01/02/2006,01/02/2006 07:27:49 AM,01/02/2006 07:27:49 AM,
...,...,...,...,...,...,...,...,...,...
288,1431684820,CA:TRZ,CA89351T4019,Transat A.T. Inc.,2021,12/09/2021,06/14/2019 06:21:36 AM,09/10/2021 01:07:53 AM,01/09/2022 11:24:34 AM
289,1431686569,BRX,US11120U1051,Brixmor Property Group Inc.,2021,11/01/2021,04/30/2019 07:49:24 AM,09/09/2021 04:25:12 PM,01/09/2022 11:24:34 AM
290,1431699194,CADL,US1374041093,"Candel Therapeutics, Inc.",2021,12/02/2021,09/08/2021 08:43:34 AM,09/09/2021 01:06:25 AM,01/09/2022 11:24:34 AM
291,1431699194,CADL,US1374041093,"Candel Therapeutics, Inc.",2021,11/15/2021,09/08/2021 08:43:34 AM,09/09/2021 07:16:29 AM,01/09/2022 11:24:34 AM


In [190]:
# Rename the identifier columns and give every column its working dtype.
colmap = {"company_id": "wsh_id", "stock_symbol": "hticker", "isin": "hcusip", "company_name": "name"}
db = db.rename(columns=colmap)

# Text columns first (the timestamp-like ones are parsed at the end of the cell).
text_cols = ["hticker", "hcusip", "name", "earnings_date", "created", "updated", "return_time"]
db[text_cols] = db[text_cols].astype('string')

# Integer columns: the id is routed through str, the fiscal year through float
# before landing in the nullable Int64 dtype.
db["wsh_id"] = db["wsh_id"].astype(str).astype(int)
db['fiscal_year'] = db['fiscal_year'].astype(float).astype("Int64")

# Parse the three timestamp columns.
for stamp_col in ('updated', 'created', 'return_time'):
    db[stamp_col] = pd.to_datetime(db[stamp_col])

db
    

Unnamed: 0,wsh_id,hticker,hcusip,name,fiscal_year,earnings_date,created,updated,return_time
0,1228,HXL,,Hexcel,2005,01/12/2006,2005-03-30 09:08:29,2006-01-02 00:00:00,NaT
1,11728,TMI,,"Team, Inc.",2006,01/05/2006,2006-01-02 00:00:00,2006-01-02 00:00:00,NaT
2,1431682492,DPZ,,,2005,01/02/2006,2006-01-02 07:21:38,2006-01-02 07:21:38,NaT
3,1431682492,DPZ,,,2005,02/09/2006,2006-01-02 07:21:38,2006-01-02 07:21:39,NaT
4,1431682825,MPW,,Medical Properties Trust Inc.,2005,01/02/2006,2006-01-02 07:27:49,2006-01-02 07:27:49,NaT
...,...,...,...,...,...,...,...,...,...
288,1431684820,CA:TRZ,CA89351T4019,Transat A.T. Inc.,2021,12/09/2021,2019-06-14 06:21:36,2021-09-10 01:07:53,2022-01-09 11:24:34
289,1431686569,BRX,US11120U1051,Brixmor Property Group Inc.,2021,11/01/2021,2019-04-30 07:49:24,2021-09-09 16:25:12,2022-01-09 11:24:34
290,1431699194,CADL,US1374041093,"Candel Therapeutics, Inc.",2021,12/02/2021,2021-09-08 08:43:34,2021-09-09 01:06:25,2022-01-09 11:24:34
291,1431699194,CADL,US1374041093,"Candel Therapeutics, Inc.",2021,11/15/2021,2021-09-08 08:43:34,2021-09-09 07:16:29,2022-01-09 11:24:34


In [191]:
# Row-wise first and last timestamp across the three time columns; the nan-aware
# reducers skip missing entries (return_time is NaT for the historical rows).
stamps = db[['updated', 'created', "return_time"]].values
db['earliest'] = np.nanmin(stamps, axis=1)
db['latest'] = np.nanmax(stamps, axis=1)

# Missing ISINs become empty strings before hcusip is used as a grouping key.
db['hcusip'] = db['hcusip'].fillna('')

# One row per (wsh_id, hticker, hcusip): first sighting in date1, last in date2.
id_keys = ['wsh_id', 'hticker', 'hcusip']
by_identity = db.groupby(id_keys, as_index=False)
date1 = by_identity['earliest'].min()
date2 = by_identity['latest'].max()

db

Unnamed: 0,wsh_id,hticker,hcusip,name,fiscal_year,earnings_date,created,updated,return_time,earliest,latest
0,1228,HXL,,Hexcel,2005,01/12/2006,2005-03-30 09:08:29,2006-01-02 00:00:00,NaT,2005-03-30 09:08:29,2006-01-02 00:00:00
1,11728,TMI,,"Team, Inc.",2006,01/05/2006,2006-01-02 00:00:00,2006-01-02 00:00:00,NaT,2006-01-02 00:00:00,2006-01-02 00:00:00
2,1431682492,DPZ,,,2005,01/02/2006,2006-01-02 07:21:38,2006-01-02 07:21:38,NaT,2006-01-02 07:21:38,2006-01-02 07:21:38
3,1431682492,DPZ,,,2005,02/09/2006,2006-01-02 07:21:38,2006-01-02 07:21:39,NaT,2006-01-02 07:21:38,2006-01-02 07:21:39
4,1431682825,MPW,,Medical Properties Trust Inc.,2005,01/02/2006,2006-01-02 07:27:49,2006-01-02 07:27:49,NaT,2006-01-02 07:27:49,2006-01-02 07:27:49
...,...,...,...,...,...,...,...,...,...,...,...
288,1431684820,CA:TRZ,CA89351T4019,Transat A.T. Inc.,2021,12/09/2021,2019-06-14 06:21:36,2021-09-10 01:07:53,2022-01-09 11:24:34,2019-06-14 06:21:36,2022-01-09 11:24:34
289,1431686569,BRX,US11120U1051,Brixmor Property Group Inc.,2021,11/01/2021,2019-04-30 07:49:24,2021-09-09 16:25:12,2022-01-09 11:24:34,2019-04-30 07:49:24,2022-01-09 11:24:34
290,1431699194,CADL,US1374041093,"Candel Therapeutics, Inc.",2021,12/02/2021,2021-09-08 08:43:34,2021-09-09 01:06:25,2022-01-09 11:24:34,2021-09-08 08:43:34,2022-01-09 11:24:34
291,1431699194,CADL,US1374041093,"Candel Therapeutics, Inc.",2021,11/15/2021,2021-09-08 08:43:34,2021-09-09 07:16:29,2022-01-09 11:24:34,2021-09-08 08:43:34,2022-01-09 11:24:34


In [192]:
# Earliest timestamp per (wsh_id, hticker, hcusip) group.
date1

Unnamed: 0,wsh_id,hticker,hcusip,earliest
0,2,EW,,2006-01-30 00:00:00
1,2,EW,US28176E1082,2018-05-19 05:51:43
2,4,HBC,,2006-05-15 12:55:53
3,4,HSBC,,2013-11-13 23:10:14
4,4,HSBC,US4042804066,2018-02-26 13:55:23
...,...,...,...,...
23770,1431700731,543396.IN,INE982J01020,2022-03-14 11:47:25
23771,1431700735,LFLY,US52178J1051,2022-03-22 12:19:44
23772,1431700737,2351.TW,TW0002351004,2022-03-22 14:38:36
23773,1431700748,GLS,US36850R2040,2022-03-22 12:11:20


In [193]:
# Latest timestamp per (wsh_id, hticker, hcusip) group.
date2

Unnamed: 0,wsh_id,hticker,hcusip,latest
0,2,EW,,2018-07-27 00:52:20
1,2,EW,US28176E1082,2022-01-27 19:00:11
2,4,HBC,,2013-11-13 23:10:14
3,4,HSBC,,2018-08-07 00:50:17
4,4,HSBC,US4042804066,2022-02-23 19:00:40
...,...,...,...,...
23770,1431700731,543396.IN,INE982J01020,2022-03-14 20:00:14
23771,1431700735,LFLY,US52178J1051,2022-03-22 20:00:15
23772,1431700737,2351.TW,TW0002351004,2022-03-22 20:00:15
23773,1431700748,GLS,US36850R2040,2022-03-22 20:00:11


In [194]:
#date1[date1['wsh_id']==1228]

In [195]:
#date2[date2['wsh_id']==1228]

In [196]:
#date1[date1['hticker']=="AA"]

In [197]:
#date2[date2['hticker']=="AA"]

In [198]:
# Merging / combining
#
# Pair each (wsh_id, hticker, hcusip) key's earliest timestamp (date1) with its
# latest timestamp (date2) through a scratch SQLite file, then derive day-level
# start/end dates as "YYYY-MM-DD" strings.

import sqlite3

sql = """SELECT a.*,b.latest
FROM date1 as a, date2 as b
where (a.wsh_id=b.wsh_id)
AND (a.hcusip = b.hcusip and a.hticker=b.hticker)
order  by a.wsh_id, a.hticker, a.hcusip;"""

# "wsh.db" is created in the current working directory. The connection is
# closed even when one of the steps fails, so the file handle is not left open.
con = sqlite3.connect("wsh.db")
try:
    date1.to_sql("date1", con, index=False, if_exists='replace')
    date2.to_sql("date2", con, index=False, if_exists='replace')
    db_fin = pd.read_sql_query(sql, con)
finally:
    con.close()

# The timestamps come back from SQLite as text. Parse them in one vectorised
# pass; unlike a per-row strptime with a fixed format this also copes with
# missing values and with timestamps that carry fractional seconds.
db_fin['start_date'] = pd.to_datetime(db_fin['earliest']).dt.strftime("%Y-%m-%d")
db_fin['end_date'] = pd.to_datetime(db_fin['latest']).dt.strftime("%Y-%m-%d")
db_fin

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date
0,2,EW,,2006-01-30 00:00:00,2018-07-27 00:52:20,2006-01-30,2018-07-27
1,2,EW,US28176E1082,2018-05-19 05:51:43,2022-01-27 19:00:11,2018-05-19,2022-01-27
2,4,HBC,,2006-05-15 12:55:53,2013-11-13 23:10:14,2006-05-15,2013-11-13
3,4,HSBC,,2013-11-13 23:10:14,2018-08-07 00:50:17,2013-11-13,2018-08-07
4,4,HSBC,US4042804066,2018-02-26 13:55:23,2022-02-23 19:00:40,2018-02-26,2022-02-23
...,...,...,...,...,...,...,...
23770,1431700731,543396.IN,INE982J01020,2022-03-14 11:47:25,2022-03-14 20:00:14,2022-03-14,2022-03-14
23771,1431700735,LFLY,US52178J1051,2022-03-22 12:19:44,2022-03-22 20:00:15,2022-03-22,2022-03-22
23772,1431700737,2351.TW,TW0002351004,2022-03-22 14:38:36,2022-03-22 20:00:15,2022-03-22,2022-03-22
23773,1431700748,GLS,US36850R2040,2022-03-22 12:11:20,2022-03-22 20:00:11,2022-03-22,2022-03-22


In [199]:
# Order the ranges by company id, then by when each range starts and ends.
sort_keys = ['wsh_id', 'start_date', 'end_date']
db_fin = db_fin.sort_values(by=sort_keys)
db_fin

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date
0,2,EW,,2006-01-30 00:00:00,2018-07-27 00:52:20,2006-01-30,2018-07-27
1,2,EW,US28176E1082,2018-05-19 05:51:43,2022-01-27 19:00:11,2018-05-19,2022-01-27
2,4,HBC,,2006-05-15 12:55:53,2013-11-13 23:10:14,2006-05-15,2013-11-13
3,4,HSBC,,2013-11-13 23:10:14,2018-08-07 00:50:17,2013-11-13,2018-08-07
4,4,HSBC,US4042804066,2018-02-26 13:55:23,2022-02-23 19:00:40,2018-02-26,2022-02-23
...,...,...,...,...,...,...,...
23770,1431700731,543396.IN,INE982J01020,2022-03-14 11:47:25,2022-03-14 20:00:14,2022-03-14,2022-03-14
23771,1431700735,LFLY,US52178J1051,2022-03-22 12:19:44,2022-03-22 20:00:15,2022-03-22,2022-03-22
23772,1431700737,2351.TW,TW0002351004,2022-03-22 14:38:36,2022-03-22 20:00:15,2022-03-22,2022-03-22
23773,1431700748,GLS,US36850R2040,2022-03-22 12:11:20,2022-03-22 20:00:11,2022-03-22,2022-03-22


In [200]:
# Give db_fin proper dtypes: text columns become pandas strings, and the four
# date-like columns are then parsed into datetimes.
fin_text_cols = ["hticker", "hcusip", "earliest", "latest", "start_date", 'end_date']
db_fin[fin_text_cols] = db_fin[fin_text_cols].astype('string')

for date_col in ('latest', 'earliest', 'start_date', 'end_date'):
    db_fin[date_col] = pd.to_datetime(db_fin[date_col])

db_fin

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date
0,2,EW,,2006-01-30 00:00:00,2018-07-27 00:52:20,2006-01-30,2018-07-27
1,2,EW,US28176E1082,2018-05-19 05:51:43,2022-01-27 19:00:11,2018-05-19,2022-01-27
2,4,HBC,,2006-05-15 12:55:53,2013-11-13 23:10:14,2006-05-15,2013-11-13
3,4,HSBC,,2013-11-13 23:10:14,2018-08-07 00:50:17,2013-11-13,2018-08-07
4,4,HSBC,US4042804066,2018-02-26 13:55:23,2022-02-23 19:00:40,2018-02-26,2022-02-23
...,...,...,...,...,...,...,...
23770,1431700731,543396.IN,INE982J01020,2022-03-14 11:47:25,2022-03-14 20:00:14,2022-03-14,2022-03-14
23771,1431700735,LFLY,US52178J1051,2022-03-22 12:19:44,2022-03-22 20:00:15,2022-03-22,2022-03-22
23772,1431700737,2351.TW,TW0002351004,2022-03-22 14:38:36,2022-03-22 20:00:15,2022-03-22,2022-03-22
23773,1431700748,GLS,US36850R2040,2022-03-22 12:11:20,2022-03-22 20:00:11,2022-03-22,2022-03-22


In [201]:
# Check the column dtypes of db_fin after the conversions above.
db_fin.dtypes

wsh_id                 int64
hticker               string
hcusip                string
earliest      datetime64[ns]
latest        datetime64[ns]
start_date    datetime64[ns]
end_date      datetime64[ns]
dtype: object

In [202]:
# Split the ISIN (stored in hcusip for now) into pieces and derive cusip9/cusip8.
#   prefix_hcusip : characters 0-1
#   suffix_hcusip : everything after the first two characters
#   cusip9/cusip8 : the 9 / 8 characters that follow the prefix
# NOTE(review): every row gets cusip9/cusip8, including rows whose prefix is not
# "US" (the displayed output shows IN, TW, JP, ...) -- confirm that downstream
# code filters on prefix_hcusip where a real CUSIP is required.
db_fin['prefix_hcusip'] = db_fin['hcusip'].str[0:2]
# Open-ended slice: the previous upper bound was the number of rows in the
# frame, which only worked while the frame had more rows than an ISIN has
# characters.
db_fin['suffix_hcusip'] = db_fin['hcusip'].str[2:]
db_fin['cusip9'] = db_fin['hcusip'].str[2:11]
db_fin['cusip8'] = db_fin['hcusip'].str[2:10]

In [203]:
# Inspect the ISIN remainder (everything after the two-character prefix).
db_fin['suffix_hcusip']

0                  
1        28176E1082
2                  
3                  
4        4042804066
            ...    
23770    E982J01020
23771    52178J1051
23772    0002351004
23773    36850R2040
23774    4831042043
Name: suffix_hcusip, Length: 23775, dtype: string

In [204]:
# Inspect the two-character ISIN prefix.
db_fin['prefix_hcusip']

0          
1        US
2          
3          
4        US
         ..
23770    IN
23771    US
23772    TW
23773    US
23774    US
Name: prefix_hcusip, Length: 23775, dtype: string

In [205]:
# Inspect the derived 8-character code (hcusip characters 2-9).
db_fin['cusip8']

0                
1        28176E10
2                
3                
4        40428040
           ...   
23770    E982J010
23771    52178J10
23772    00023510
23773    36850R20
23774    48310420
Name: cusip8, Length: 23775, dtype: string

In [206]:
# Inspect the derived 9-character code (hcusip characters 2-10).
db_fin['cusip9']

0                 
1        28176E108
2                 
3                 
4        404280406
           ...    
23770    E982J0102
23771    52178J105
23772    000235100
23773    36850R204
23774    483104204
Name: cusip9, Length: 23775, dtype: string

In [207]:
# Rows whose ticker starts with a "!!" marker.
double_bang = db_fin['hticker'].str[:2].eq("!!")
db_fin.loc[double_bang]

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date,prefix_hcusip,suffix_hcusip,cusip9,cusip8
627,985,!!FINL,,2018-04-02 06:12:35,2018-06-19 06:50:10,2018-04-02,2018-06-19,,,,
21221,1431696010,!!HNNMY,US4258831050,2019-09-19 11:23:38,2019-09-19 11:26:05,2019-09-19,2019-09-19,US,4258831050,425883105,42588310
21643,1431696726,!!DCP.ZA,ZAE000227831,2020-02-24 11:00:18,2020-02-24 11:04:49,2020-02-24,2020-02-24,ZA,E000227831,E00022783,E0002278
21954,1431697113,!!MEG,US6151111019,2020-07-24 09:35:08,2020-07-24 09:40:11,2020-07-24,2020-07-24,US,6151111019,615111101,61511110


In [208]:
# Drop the "!!" marker from the front of the ticker. The pattern is anchored
# with "^" so that only a leading marker is removed; the unanchored pattern
# would also have deleted "!!" occurring anywhere inside a ticker.
db_fin['hticker'] = db_fin['hticker'].replace(r'^!!', '', regex=True)
db_fin

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date,prefix_hcusip,suffix_hcusip,cusip9,cusip8
0,2,EW,,2006-01-30 00:00:00,2018-07-27 00:52:20,2006-01-30,2018-07-27,,,,
1,2,EW,US28176E1082,2018-05-19 05:51:43,2022-01-27 19:00:11,2018-05-19,2022-01-27,US,28176E1082,28176E108,28176E10
2,4,HBC,,2006-05-15 12:55:53,2013-11-13 23:10:14,2006-05-15,2013-11-13,,,,
3,4,HSBC,,2013-11-13 23:10:14,2018-08-07 00:50:17,2013-11-13,2018-08-07,,,,
4,4,HSBC,US4042804066,2018-02-26 13:55:23,2022-02-23 19:00:40,2018-02-26,2022-02-23,US,4042804066,404280406,40428040
...,...,...,...,...,...,...,...,...,...,...,...
23770,1431700731,543396.IN,INE982J01020,2022-03-14 11:47:25,2022-03-14 20:00:14,2022-03-14,2022-03-14,IN,E982J01020,E982J0102,E982J010
23771,1431700735,LFLY,US52178J1051,2022-03-22 12:19:44,2022-03-22 20:00:15,2022-03-22,2022-03-22,US,52178J1051,52178J105,52178J10
23772,1431700737,2351.TW,TW0002351004,2022-03-22 14:38:36,2022-03-22 20:00:15,2022-03-22,2022-03-22,TW,0002351004,000235100,00023510
23773,1431700748,GLS,US36850R2040,2022-03-22 12:11:20,2022-03-22 20:00:11,2022-03-22,2022-03-22,US,36850R2040,36850R204,36850R20


In [209]:
# Re-run the "!!" prefix check -- the result should now be empty.
still_double_bang = db_fin['hticker'].str[:2].eq("!!")
db_fin.loc[still_double_bang]

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date,prefix_hcusip,suffix_hcusip,cusip9,cusip8


In [210]:
# Next, look for a single "!" in front of hticker, which denotes the reuse case.
single_bang = db_fin['hticker'].str[:1].eq("!")
db_fin.loc[single_bang]

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date,prefix_hcusip,suffix_hcusip,cusip9,cusip8
5,5,!SHLM,,2018-05-19 05:51:43,2018-08-22 07:04:09,2018-05-19,2018-08-22,,,,
17,30,!KERX,US4925151015,2018-05-19 05:51:43,2018-12-13 06:50:46,2018-05-19,2018-12-13,US,4925151015,492515101,49251510
47,91,!BEL,BMG1154H1079,2018-05-19 05:51:43,2019-04-18 14:18:17,2018-05-19,2019-04-18,BM,G1154H1079,G1154H107,G1154H10
51,103,!MDCO,US5846881051,2018-05-19 05:51:43,2020-01-07 06:33:33,2018-05-19,2020-01-07,US,5846881051,584688105,58468810
54,123,!ARQL,US04269E1073,2018-05-19 05:51:43,2020-01-21 07:07:20,2018-05-19,2020-01-21,US,04269E1073,04269E107,04269E10
...,...,...,...,...,...,...,...,...,...,...,...
22305,1431697565,!KDS.NL,NL0011323407,2020-12-24 13:58:36,2021-04-21 06:59:03,2020-12-24,2021-04-21,NL,0011323407,001132340,00113234
22332,1431697598,!8586.JP,JP3786600001,2020-11-17 11:00:43,2021-04-01 09:17:35,2020-11-17,2021-04-01,JP,3786600001,378660000,37866000
22520,1431697896,!8184.JP,JP3356800007,2021-01-14 12:17:06,2021-03-26 07:13:21,2021-01-14,2021-03-26,JP,3356800007,335680000,33568000
22559,1431697946,!BFF.IT,IT0005244402,2021-02-01 10:30:59,2021-02-01 10:42:18,2021-02-01,2021-02-01,IT,0005244402,000524440,00052444


In [211]:
# Cut these as well: remove the leading "!" marker(s) from the ticker. Anchoring
# the pattern with "^" limits the change to the prefix instead of deleting every
# "!" that appears anywhere in the ticker.
db_fin['hticker'] = db_fin['hticker'].replace(r'^!+', '', regex=True)
db_fin

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date,prefix_hcusip,suffix_hcusip,cusip9,cusip8
0,2,EW,,2006-01-30 00:00:00,2018-07-27 00:52:20,2006-01-30,2018-07-27,,,,
1,2,EW,US28176E1082,2018-05-19 05:51:43,2022-01-27 19:00:11,2018-05-19,2022-01-27,US,28176E1082,28176E108,28176E10
2,4,HBC,,2006-05-15 12:55:53,2013-11-13 23:10:14,2006-05-15,2013-11-13,,,,
3,4,HSBC,,2013-11-13 23:10:14,2018-08-07 00:50:17,2013-11-13,2018-08-07,,,,
4,4,HSBC,US4042804066,2018-02-26 13:55:23,2022-02-23 19:00:40,2018-02-26,2022-02-23,US,4042804066,404280406,40428040
...,...,...,...,...,...,...,...,...,...,...,...
23770,1431700731,543396.IN,INE982J01020,2022-03-14 11:47:25,2022-03-14 20:00:14,2022-03-14,2022-03-14,IN,E982J01020,E982J0102,E982J010
23771,1431700735,LFLY,US52178J1051,2022-03-22 12:19:44,2022-03-22 20:00:15,2022-03-22,2022-03-22,US,52178J1051,52178J105,52178J10
23772,1431700737,2351.TW,TW0002351004,2022-03-22 14:38:36,2022-03-22 20:00:15,2022-03-22,2022-03-22,TW,0002351004,000235100,00023510
23773,1431700748,GLS,US36850R2040,2022-03-22 12:11:20,2022-03-22 20:00:11,2022-03-22,2022-03-22,US,36850R2040,36850R204,36850R20


In [212]:
# Verify once more that no ticker starts with "!" -- expected to be empty.
still_single_bang = db_fin['hticker'].str[:1].eq("!")
db_fin.loc[still_single_bang]

Unnamed: 0,wsh_id,hticker,hcusip,earliest,latest,start_date,end_date,prefix_hcusip,suffix_hcusip,cusip9,cusip8


In [213]:
# Persist the finished name/date-range table.  ToDo: change path dir!
wsh_names_path = "C://Users/clj585/Dropbox/WSH/wsh_names.parquet"
db_fin.to_parquet(wsh_names_path)