In [None]:
import os
import pickle
import pandas as pd
from tqdm.notebook import tqdm

# Notebook-wide pandas setting: turn off the chained-assignment check, so pandas
# neither warns nor raises when it detects an assignment on a possible copy.
pd.options.mode.chained_assignment = None  # default='warn'

### Bond Issues
---

In [None]:
# Build a single, de-duplicated bond-issues table from every raw file found in
# `base_dir` and write it to data/processed/bond_issues.csv.
acc = []
base_dir = 'data/source/bond_issues/'

# os.listdir() returns entries in arbitrary order and also yields hidden entries
# and sub-directories (e.g. a `.ipynb_checkpoints` folder). Keep only regular,
# non-hidden files and sort them so every run reads the same inputs in the same
# order -- the order matters for the `keep='first'` de-duplication below.
source_files = sorted(
    name for name in os.listdir(base_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(base_dir, name))
)

# columns stored as YYYYMMDD in the raw files
date_cols = ['DatedDate', 'FirstInterestDate', 'LastInterestDate']

for file in source_files:

    # bond issues
    curr_bond_issues = pd.read_csv(os.path.join(base_dir, file), delimiter=';')

    # uniform column names, e.g. 'some_column' -> 'SomeColumn'
    curr_bond_issues.columns = [
        x.lower().replace('_', ' ').title().replace(' ', '') for x in curr_bond_issues.columns
    ]

    # transform dates; values that do not match YYYYMMDD are coerced to NaT
    for col in date_cols:
        curr_bond_issues[col] = pd.to_datetime(
            curr_bond_issues[col], format='%Y%m%d', errors='coerce'
        )

    # local save
    acc.append(curr_bond_issues)

# combine dfs
bond_issues = pd.concat(acc, axis=0)

# drop empty columns
bond_issues = bond_issues.dropna(how='all', axis=1)

# drop duplicates along the whole dataset
bond_issues = bond_issues.drop_duplicates()

# drop duplicate Cusips: keep the row with the earliest FirstInterestDate.
# A stable sort keeps rows with equal dates in their concatenation order, so the
# row that survives `keep='first'` is well defined instead of depending on the
# internals of the default (unstable) quicksort.
bond_issues = bond_issues.sort_values(by=['FirstInterestDate'], kind='mergesort')
bond_issues = bond_issues.drop_duplicates(
    subset=['CompleteCusip'], keep='first'
)

# save (create the output folder on a fresh checkout)
os.makedirs('data/processed', exist_ok=True)
bond_issues.to_csv('data/processed/bond_issues.csv', index=False)

### Bond Issuers
---

In [None]:
# Clean the bond-issuers table and write it to data/processed/bond_issuers.csv.

# bond issuers
bond_issuers = pd.read_csv('data/source/bond_issuers.csv', delimiter=',', low_memory=False)

# uniform column names, e.g. 'some_column' -> 'SomeColumn'
bond_issuers.columns = [
    x.lower().replace('_', ' ').title().replace(' ', '') for x in bond_issuers.columns
]

# drop duplicates along the whole dataset
bond_issuers = bond_issuers.drop_duplicates()

# drop empty columns
bond_issuers = bond_issuers.dropna(how='all', axis=1)

# remove commas from string values
cols = ['CusipName', 'LegalName', 'Addr1', 'Addr2', 'City', 'Zipcode', 'Province', 'MainPhone', 'MainFax', 'Note']
for col in cols:
    # The dropna() just above removes any column that is entirely empty, so a
    # listed column may legitimately be absent -- skip it instead of raising KeyError.
    if col not in bond_issuers.columns:
        continue
    # A column that only holds digits is parsed as numeric: it has no `.str`
    # accessor and cannot contain commas, so there is nothing to clean.
    if pd.api.types.is_numeric_dtype(bond_issuers[col]):
        continue
    # regex=False: ',' is a literal, independent of the pandas-version default
    bond_issuers[col] = bond_issuers[col].str.replace(',', '', regex=False)

# cast float(NaNs) to int: the nullable 'Int64' dtype keeps missing values as <NA>
int_cols = ['SicCode', 'ParentId', 'NaicsCode', 'IndustryGroup']
for col in int_cols:
    # same guard as above: an all-empty column was already dropped
    if col in bond_issuers.columns:
        bond_issuers[col] = bond_issuers[col].astype('Int64')

# save (create the output folder on a fresh checkout)
os.makedirs('data/processed', exist_ok=True)
bond_issuers.to_csv('data/processed/bond_issuers.csv', index=False)

### Bond Ratings
---

In [None]:
# Build a single bond-ratings table from every raw file found in `base_dir`,
# add a numeric 'RatingCategory' column and write the result to
# data/processed/bond_ratings.csv.
acc = []
base_dir = 'data/source/bond_ratings/'

# rating mapping: identical for every file, so it is loaded once up front
# instead of being re-read from disk on each loop iteration.
# NOTE(review): pickle.load() can execute arbitrary code embedded in the file --
# only use a 'rating_eq.pickle' that comes from a trusted source.
with open('rating_eq.pickle', 'rb') as handle:
    rating_eq = pickle.load(handle)

# os.listdir() returns entries in arbitrary order and also yields hidden entries
# and sub-directories (e.g. a `.ipynb_checkpoints` folder). Keep only regular,
# non-hidden files and process them in a fixed order.
source_files = sorted(
    name for name in os.listdir(base_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(base_dir, name))
)

for file in source_files:

    # bond ratings
    curr_bond_ratings = pd.read_csv(os.path.join(base_dir, file), delimiter=';', low_memory=False)

    # uniform column names, e.g. 'some_column' -> 'SomeColumn'
    curr_bond_ratings.columns = [
        x.lower().replace('_', ' ').title().replace(' ', '') for x in curr_bond_ratings.columns
    ]

    # drop duplicates: one row per (cusip, date, rating type), preferring the
    # smallest 'InvestmentGrade' value. The stable sort keeps tied rows in file
    # order, so the row that survives `keep='first'` is well defined.
    # NOTE(review): this key-based de-duplication is applied per file only; after
    # concatenation just fully identical rows are removed -- confirm that is intended.
    curr_bond_ratings = curr_bond_ratings.sort_values(by=['InvestmentGrade'], kind='mergesort')
    curr_bond_ratings = curr_bond_ratings.drop_duplicates(
        subset=['CompleteCusip', 'RatingDate', 'RatingType'], keep='first'
    )

    # map each rating to its category; ratings missing from the mapping become <NA>
    curr_bond_ratings['RatingCategory'] = curr_bond_ratings['Rating'].map(rating_eq).astype('Int64')

    # local save
    acc.append(curr_bond_ratings)

# combine dfs
bond_ratings = pd.concat(acc, axis=0)

# drop duplicates along the whole dataset
bond_ratings = bond_ratings.drop_duplicates()

# save (create the output folder on a fresh checkout)
os.makedirs('data/processed', exist_ok=True)
bond_ratings.to_csv('data/processed/bond_ratings.csv', index=False)

### Trace
---

In [None]:
# Clean every raw TRACE file in `base_dir` and write it, under the same file
# name, to data/processed/trace/.
base_dir = 'data/source/trace/'

# make sure the output folder exists on a fresh checkout
os.makedirs('data/processed/trace', exist_ok=True)

# os.listdir() returns entries in arbitrary order and also yields hidden entries
# and sub-directories (e.g. a `.ipynb_checkpoints` folder). Keep only regular,
# non-hidden files and process them in a fixed order.
source_files = sorted(
    name for name in os.listdir(base_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(base_dir, name))
)

for file in tqdm(source_files, desc="Processing records"):

    # trace
    curr_trace = pd.read_csv(os.path.join(base_dir, file), delimiter=';', low_memory=False)

    # drop the 'asof_cd' and 'spcl_trd_fl' columns (described as blank by the
    # original note; not verified here)
    curr_trace = curr_trace.drop(['asof_cd', 'spcl_trd_fl'], axis=1)

    # replace commas with periods in the bond symbol (commas are substituted, not
    # removed); regex=False treats ',' as a literal on every pandas version
    curr_trace['bond_sym_id'] = curr_trace['bond_sym_id'].str.replace(',', '.', regex=False)

    # uniform column names, e.g. 'bond_sym_id' -> 'BondSymId'
    curr_trace.columns = [x.lower().replace('_', ' ').title().replace(' ', '') for x in curr_trace.columns]

    # save
    curr_trace.to_csv('data/processed/trace/{}'.format(file), index=False)

### CRSPC Factors
---

In [None]:
# Load the raw CRSPC factor table, parse its dates, give the columns their
# final names and write the result to data/processed/crspc/factors.csv.
crspc_factors = pd.read_csv(
    'data/source/crspc/factors.csv',
    delimiter=',',
    low_memory=False,
)

# Parse the YYYYMMDD values of 'Date' into datetimes. This runs before the
# rename below, so the source file must already call this column 'Date'.
crspc_factors = crspc_factors.assign(
    Date=pd.to_datetime(crspc_factors['Date'], format='%Y%m%d')
)

# Positional rename: the file must contain exactly these eight columns, in
# this order (pandas raises on a length mismatch).
crspc_factors.columns = [
    'Date',
    'MktRf',
    'Smb',
    'Hml',
    'Rmw',
    'Cma',
    'Rf',
    'Rm',
]

# Persist the processed table without the index column.
crspc_factors.to_csv('data/processed/crspc/factors.csv', index=False)

### CRSPC Daily Securities
---

In [None]:
# Clean every raw CRSPC daily-securities file in `base_dir` and write it to
# data/processed/crspc/securities_daily/securities<NN>.csv, where <NN> is the
# last two characters of the input file name before its first '.'.
base_dir = 'data/source/crspc/securities_daily/'

# make sure the output folder exists on a fresh checkout
os.makedirs('data/processed/crspc/securities_daily', exist_ok=True)

# os.listdir() returns entries in arbitrary order and also yields hidden entries
# and sub-directories (e.g. a `.ipynb_checkpoints` folder). Keep only regular,
# non-hidden files and process them in a fixed order.
source_files = sorted(
    name for name in os.listdir(base_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(base_dir, name))
)

# output suffix -> input file that produced it, used to detect name collisions
written_suffixes = {}

for file in tqdm(source_files, desc="Processing records"):

    # The output name is derived from only two characters of the input name, so
    # two inputs can map to the same output. Fail loudly (before doing any work
    # on the file) instead of silently overwriting the earlier result.
    suffix = file.split('.')[0][-2:]
    if suffix in written_suffixes:
        raise ValueError(
            "Input files {!r} and {!r} would both be written to 'securities{}.csv'".format(
                written_suffixes[suffix], file, suffix
            )
        )
    written_suffixes[suffix] = file

    crspc_daily = pd.read_csv(os.path.join(base_dir, file), delimiter=';', low_memory=False)

    # positional rename: each file must contain exactly these 28 columns, in
    # this order (pandas raises on a length mismatch)
    crspc_daily.columns = [
        'GvKey', 'IId', 'DataDate', 'Tic', 'Cusip', 'CoNml', 'PrcCd', 'PrcHd', 'PrcLd', 'PrcOd', 'PrcStd', 
        'Exchg', 'Exchange', 'Sic', 'Industry', 'Naics', 'Eps', 'EpsMo', 'TrFd', 'Loc', 'Div', 'Divd', 'DivdPayDateInd',
        'Divsp', 'DivdPayDate', 'DivspPayDate', 'AnncDate', 'IpoDate'
    ]

    # drop duplicates
    crspc_daily = crspc_daily.drop_duplicates()

    # replace commas with periods in 'CoNml'; regex=False treats ',' as a
    # literal on every pandas version
    crspc_daily['CoNml'] = crspc_daily['CoNml'].str.replace(',', '.', regex=False)

    # save
    crspc_daily.to_csv('data/processed/crspc/securities_daily/securities{}.csv'.format(suffix), index=False)