In [None]:
import os
import pickle
import pandas as pd
from tqdm.notebook import tqdm

# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole session;
# pandas' default for this option is 'warn'.
pd.options.mode.chained_assignment = None

In [None]:
# bond_issues
# Merge every ';'-delimited file found in the source folder into a single CSV.
base_dir = 'data/source/bond_issues/'
acc = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order of the
# combined output reproducible between runs.
for file in sorted(os.listdir(base_dir)):
    path = os.path.join(base_dir, file)
    # skip sub-directories (for instance a notebook checkpoint folder), which read_csv cannot parse
    if not os.path.isfile(path):
        continue
    curr_bond_issues = pd.read_csv(path, delimiter=';')
    # uniform column names: e.g. 'ISSUE_ID' / 'issue_id' -> 'IssueId'
    curr_bond_issues.columns = [
        x.lower().replace('_', ' ').title().replace(' ', '') for x in curr_bond_issues.columns
    ]
    acc.append(curr_bond_issues)

# combine dfs (rows stacked; columns are aligned by their uniformed names)
bond_issues = pd.concat(acc, axis=0)

# drop empty columns
bond_issues = bond_issues.dropna(how='all', axis=1)

# drop duplicates along whole dataset
bond_issues = bond_issues.drop_duplicates()

# save (create the output folder first so a fresh checkout does not fail here)
os.makedirs('data/processed', exist_ok=True)
bond_issues.to_csv('data/processed/bond_issues.csv', index=False)

In [None]:
# bond_issuers
# Load the issuers file, then in one chain: uniform the column names,
# drop fully duplicated rows and drop columns that contain no data at all.
bond_issuers = (
    pd.read_csv('data/source/bond_issuers.csv', delimiter=',', low_memory=False)
    .rename(columns=lambda name: name.lower().replace('_', ' ').title().replace(' ', ''))
    .drop_duplicates()
    .dropna(how='all', axis=1)
)

# remove commas from string values
cols = ['CusipName', 'LegalName', 'Addr1', 'Addr2', 'City', 'Zipcode', 'Province', 'MainPhone', 'MainFax', 'Note']
bond_issuers[cols] = bond_issuers[cols].apply(lambda text: text.str.replace(',', ''))

# cast float(NaNs) to int: the nullable 'Int64' dtype keeps missing values as <NA>
int_cols = ['SicCode', 'ParentId', 'NaicsCode', 'IndustryGroup']
bond_issuers = bond_issuers.astype({name: 'Int64' for name in int_cols})

# save
bond_issuers.to_csv('data/processed/bond_issuers.csv', index=False)

In [None]:
# bond ratings
bond_ratings = pd.read_csv('data/source/bond_ratings.csv', delimiter=';', low_memory=False)

# uniform column names: e.g. 'RATING_DATE' / 'rating_date' -> 'RatingDate'
bond_ratings.columns = [x.lower().replace('_', ' ').title().replace(' ', '') for x in bond_ratings.columns]

# drop duplicates: for rows sharing (CompleteCusip, RatingDate, RatingType) keep the one that
# sorts first by InvestmentGrade (ascending, missing values last).
# A stable sort is requested explicitly: the default quicksort may reorder rows with equal
# InvestmentGrade, which would make the surviving duplicate arbitrary. With mergesort, ties
# keep their original file order.
bond_ratings = bond_ratings.sort_values(by=['InvestmentGrade'], kind='mergesort')
bond_ratings = bond_ratings.drop_duplicates(subset=['CompleteCusip', 'RatingDate', 'RatingType'], keep='first')

# rating mapping
# NOTE(security): pickle.load can execute arbitrary code; only load 'rating_eq.pickle' if it
# comes from a trusted source.
with open('rating_eq.pickle', 'rb') as handle:
    rating_eq = pickle.load(handle)
# presumably rating_eq maps rating labels to integer categories - TODO confirm;
# ratings without a mapping become <NA> thanks to the nullable 'Int64' dtype
bond_ratings['RatingCategory'] = bond_ratings['Rating'].map(rating_eq).astype('Int64')

# save (create the output folder first so a fresh checkout does not fail here)
os.makedirs('data/processed', exist_ok=True)
bond_ratings.to_csv('data/processed/bond_ratings.csv', index=False)

In [None]:
# trace
# Clean every ';'-delimited file in the source folder and write it, under the same file name,
# to the processed folder.
base_dir = 'data/source/trace/'
trace_out_dir = 'data/processed/trace/'
os.makedirs(trace_out_dir, exist_ok=True)

# Keep regular files only (a sub-directory such as a notebook checkpoint folder would crash
# read_csv) and sort them so the processing order is reproducible.
trace_files = sorted(
    name for name in os.listdir(base_dir) if os.path.isfile(os.path.join(base_dir, name))
)
for file in tqdm(trace_files, desc="Processing records"):
    curr_trace = pd.read_csv(os.path.join(base_dir, file), delimiter=';', low_memory=False)

    # drop the two columns treated as blank by this pipeline
    curr_trace = curr_trace.drop(['asof_cd', 'spcl_trd_fl'], axis=1)

    # replace commas with dots in the bond symbol; regex=False makes the literal match
    # explicit and independent of the pandas version's default
    curr_trace['bond_sym_id'] = curr_trace['bond_sym_id'].str.replace(',', '.', regex=False)

    # uniform column names: e.g. 'bond_sym_id' -> 'BondSymId'
    curr_trace.columns = [x.lower().replace('_', ' ').title().replace(' ', '') for x in curr_trace.columns]

    # save
    curr_trace.to_csv(os.path.join(trace_out_dir, file), index=False)

In [None]:
# crspc factors
# Read the factors file and parse its 'Date' column (stored as YYYYMMDD) into datetimes.
crspc_factors = pd.read_csv(
    'data/source/crspc/factors.csv',
    delimiter=',',
    low_memory=False,
).assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m%d'))

# rename columns (positional: the frame must have exactly these eight columns, in this order)
factor_names = ['Date', 'MktRf', 'Smb', 'Hml', 'Rmw', 'Cma', 'Rf', 'Rm']
crspc_factors.columns = factor_names

# save
crspc_factors.to_csv('data/processed/crspc/factors.csv', index=False)

In [None]:
# column selection and renaming
# `col` lists the source columns to keep; `rcol` holds their new names, position by position.
col = [
    'LPERMNO', 'LPERMCO', 'datadate', 'tic', 'cusip', 'conm', 'div', 'curcdd', 'cshoc', 'cshtrd', 'eps', 'epsmo', 'prccd',
    'prchd', 'prcld', 'prcod', 'prcstd', 'trfd', 'exchg', 'secstat', 'loc', 'naics', 'sic', 'ipodate'
]
rcol = [
    'LPermNo', 'LPermCo', 'DataDate', 'Tic', 'Cusip', 'CoNm', 'Div', 'CurCdd', 'Cshoc', 'CshTrd', 'Eps', 'EpsMo', 'PrcCd', 
    'PrcHd', 'PrcLd', 'PrcOd', 'PrcStd', 'TrFd', 'Exchg', 'SecStat', 'Loc', 'Naics', 'Sic', 'IpoDate'
]

# crspc daily source files, in the order in which their chunks are numbered
crspc_daily_files = [
    'data/source/crspc/CRSPC_02-12.csv',
    'data/source/crspc/CRSPC_13-June22.csv',
]

# create the output folder first so a fresh checkout does not fail on the first write
daily_out_dir = 'data/processed/crspc/securities_daily'
os.makedirs(daily_out_dir, exist_ok=True)

# One loop handles both files. `cnt` is the index of the next output file and keeps running
# across the source files, so the second file continues where the first one stopped without
# relying on a leftover loop variable. After the loop it holds the total number of files written.
cnt = 0
for source_file in crspc_daily_files:
    # read in chunks of 500000 rows to bound memory use
    crspc_daily = pd.read_csv(source_file, chunksize=500000, delimiter=',', low_memory=False)
    for chunk in crspc_daily:
        # explicit copy: the assignments below then modify an independent frame and do not
        # depend on the chained-assignment warning being disabled
        chunk = chunk[col].copy()
        chunk.columns = rcol

        # transform date column (stored as YYYYMMDD)
        chunk['IpoDate'] = pd.to_datetime(chunk['IpoDate'], format='%Y%m%d')

        # save
        chunk.to_csv('{}/securities{}.csv'.format(daily_out_dir, cnt), index=False)
        cnt += 1