In [None]:
import os
import csv
import pickle
import pandas as pd
from tqdm.notebook import tqdm

# Turn off pandas' chained-assignment check for the whole notebook
# (the library default for this option is 'warn').
pd.set_option('mode.chained_assignment', None)

### Bond Issuers

In [None]:
# Bond issuers: load the raw issuer table, normalise it and write the cleaned copy.
bond_issuers = pd.read_csv(
    'data/source/bond_issuers.csv', delimiter=',', low_memory=False, encoding='utf-8'
)

# uniform column names: underscore-separated headers become CamelCase
# (a header such as 'cusip_name' turns into 'CusipName')
bond_issuers.columns = [
    x.lower().replace('_', ' ').title().replace(' ', '') for x in bond_issuers.columns
]

# drop duplicates along the whole dataset
bond_issuers = bond_issuers.drop_duplicates()

# drop empty columns
bond_issuers = bond_issuers.dropna(how='all', axis=1)

# remove commas from string values
cols = [
    'CusipName', 'LegalName', 'Addr1', 'Addr2', 'City',
    'Zipcode', 'Province', 'MainPhone', 'MainFax', 'Note'
]
# The empty-column drop above may have removed some of the listed columns, so
# only the survivors are cleaned; indexing a dropped column would raise KeyError.
for col in cols:
    if col in bond_issuers.columns:
        # regex=False: the comma is matched as a literal character
        bond_issuers[col] = bond_issuers[col].str.replace(',', '', regex=False)

# cast float(NaNs) to int: the nullable 'Int64' dtype keeps missing values as <NA>.
# Same guard as above: a code column that was entirely empty no longer exists.
int_cols = ['SicCode', 'ParentId', 'NaicsCode', 'IndustryGroup']
bond_issuers = bond_issuers.astype(
    {name: 'Int64' for name in int_cols if name in bond_issuers.columns}
)

# save
bond_issuers.to_csv('data/processed/bond_issuers.csv', index=False)

### Bond Issues

In [None]:
# Bond issues: read every file in base_dir, normalise it, combine the pieces,
# keep one row per CompleteCusip and write the result.
acc = []
base_dir = 'data/source/bond_issues/'

# columns parsed as YYYYMMDD dates; values that do not parse become NaT
date_cols = ['DatedDate', 'FirstInterestDate', 'LastInterestDate']

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined frame (and therefore the de-duplication below) reproducible.
for file in sorted(os.listdir(base_dir)):

    # bond issues
    curr_bond_issues = pd.read_csv(
        os.path.join(base_dir, file), delimiter=';', encoding='utf-8'
    )

    # uniform column names
    curr_bond_issues.columns = [
        x.lower().replace('_', ' ').title().replace(' ', '') for x in curr_bond_issues.columns
    ]

    # transform dates
    curr_bond_issues = curr_bond_issues.assign(**{
        name: pd.to_datetime(curr_bond_issues[name], format='%Y%m%d', errors='coerce')
        for name in date_cols
    })

    # local save
    acc.append(curr_bond_issues)

# combine dfs
bond_issues = pd.concat(acc, axis=0)

# drop empty columns
bond_issues = bond_issues.dropna(how='all', axis=1)

# drop duplicate Cusips: keep the row with the earliest FirstInterestDate
# (missing dates sort last). The default single-column sort is not stable, so
# ties would be kept arbitrarily; mergesort preserves the incoming order.
bond_issues = bond_issues.sort_values(by=['FirstInterestDate'], kind='mergesort')
bond_issues = bond_issues.drop_duplicates(
    subset=['CompleteCusip'], keep='first'
)

# save
bond_issues.to_csv('data/processed/bond_issues.csv', index=False)

### Bond Ratings

In [None]:
# Bond ratings: read every file in base_dir, de-duplicate it, attach a numeric
# rating category, combine the pieces and write the result.
acc = []
base_dir = 'data/source/bond_ratings/'

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('pickle/rating_eq.pickle', 'rb') as handle:
    rating_eq = pickle.load(handle)

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined output reproducible.
for file in tqdm(sorted(os.listdir(base_dir)), desc='Processing records'):

    # bond ratings
    curr_bond_ratings = pd.read_csv(
        os.path.join(base_dir, file), delimiter=';', low_memory=False, encoding='utf-8'
    )

    # uniform column names
    curr_bond_ratings.columns = [
        x.lower().replace('_', ' ').title().replace(' ', '') for x in curr_bond_ratings.columns
    ]

    # drop duplicates: per (CompleteCusip, RatingDate, RatingType) keep the row
    # that sorts first on InvestmentGrade. The default single-column sort is
    # not stable, so ties would be kept arbitrarily; mergesort keeps file order.
    curr_bond_ratings = curr_bond_ratings.sort_values(by=['InvestmentGrade'], kind='mergesort')
    curr_bond_ratings = curr_bond_ratings.drop_duplicates(
        subset=['CompleteCusip', 'RatingDate', 'RatingType'], keep='first'
    )

    # rating mapping: translate 'Rating' through rating_eq; values without a
    # match come back missing and are set to 0
    curr_bond_ratings['RatingCategory'] = (
        curr_bond_ratings['Rating'].map(rating_eq).astype('Int64').fillna(0)
    )

    # local save
    acc.append(curr_bond_ratings)

# combine dfs
bond_ratings = pd.concat(acc, axis=0)

# drop duplicates along the whole dataset (rows identical in every column)
bond_ratings = bond_ratings.drop_duplicates()

# save
bond_ratings.to_csv('data/processed/bond_ratings.csv', index=False)

### Bond Returns

In [None]:
# Bond returns: read every file in base_dir, normalise headers, turn the
# formatted numeric text columns into numbers, combine and write the result.
acc = []
base_dir = 'data/source/bond_returns/'


def _strip_chars(series, chars):
    """Return `series` with every character in `chars` removed.

    Each character is matched literally (regex=False). This matters for '$':
    as a regular expression it is the end-of-string anchor, so a regex replace
    leaves the dollar sign in place and the later integer cast fails.
    """
    for ch in chars:
        series = series.str.replace(ch, '', regex=False)
    return series


# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined output reproducible.
for file in tqdm(sorted(os.listdir(base_dir)), desc='Processing records'):

    # bond returns
    curr_bond_ret = pd.read_csv(
        os.path.join(base_dir, file), delimiter=',', low_memory=False, encoding='utf-8'
    )

    # uniform columns names: capitalise each underscore-separated part and join
    columns = [
        ''.join(x.lower().capitalize() for x in col.split('_')) for col in curr_bond_ret.columns
    ]
    curr_bond_ret.columns = columns
    # headers without underscores come out with a single capital; restore the
    # inner capital for the coupon columns
    curr_bond_ret = curr_bond_ret.rename(
        columns = {
            'Coupamt': 'CoupAmt',
            'Coupacc': 'CoupAcc',
            'Nextcoup': 'NextCoup',
            'Remcoups': 'RemCoups',
            'Coupmonth': 'CoupMonth',
            'Multicoups': 'MultiCoups'
        }
    )

    # remove punctuation from columns: commas plus the '%' / '$' symbol are
    # stripped before the numeric cast; BondSymId has its commas turned into dots
    curr_bond_ret = curr_bond_ret.assign(
        BondSymId = curr_bond_ret['BondSymId'].str.replace(',', '.', regex=False),
        Yield = _strip_chars(curr_bond_ret['Yield'], ',%').astype(float),
        RetEom = _strip_chars(curr_bond_ret['RetEom'], ',%').astype(float),
        RetLdm = _strip_chars(curr_bond_ret['RetLdm'], ',%').astype(float),
        RetL5m = _strip_chars(curr_bond_ret['RetL5m'], ',%').astype(float),
        TSpread = _strip_chars(curr_bond_ret['TSpread'], ',%').astype(float),
        TVolume = _strip_chars(curr_bond_ret['TVolume'], ',$').astype('Int64'),
        TDvolume = _strip_chars(curr_bond_ret['TDvolume'], ',$').astype('Int64')
    )

    # drop duplicates along the whole dataset
    curr_bond_ret = curr_bond_ret.drop_duplicates()

    # local save
    acc.append(curr_bond_ret)

# combine dfs
bond_ret = pd.concat(acc, axis=0)

# drop duplicates along the whole dataset (rows repeated across files)
bond_ret = bond_ret.drop_duplicates()

# save
bond_ret.to_csv('data/processed/bond_returns.csv', index=False)

### CRSPC Factors

In [None]:
# Read the raw factor table.
crspc_factors = pd.read_csv(
    'data/source/crspc/factors.csv',
    delimiter=',',
    low_memory=False,
    encoding='utf-8',
)

# Parse the YYYYMMDD values of 'Date' into datetimes; the column keeps its position.
crspc_factors = crspc_factors.assign(
    Date=pd.to_datetime(crspc_factors['Date'], format='%Y%m%d')
)

# Relabel the columns positionally with their CamelCase names.
factor_columns = ['Date', 'MktRf', 'Smb', 'Hml', 'Rmw', 'Cma', 'Rf', 'Rm']
crspc_factors.columns = factor_columns

# Write the processed table.
crspc_factors.to_csv('data/processed/crspc/factors.csv', index=False)

### CRSPC Daily Securities

In [None]:
base_dir = 'data/source/crspc/securities_daily/'

# CamelCase labels applied positionally to every daily file
daily_columns = [
    'GvKey', 'IId', 'DataDate', 'Tic', 'Cusip', 'CoNml', 'PrcCd', 'PrcHd', 'PrcLd',
    'PrcOd', 'PrcStd', 'Exchg', 'Exchange', 'Sic', 'Industry', 'Naics', 'Eps', 'EpsMo',
    'TrFd', 'Loc', 'Div', 'Divd', 'DivdPayDateInd', 'Divsp', 'DivspPayDate', 'AnncDate', 'IpoDate',
]

# Each source file is cleaned on its own and written under the same file name.
for file in tqdm(os.listdir(base_dir), desc='Processing records'):

    # read one daily securities file
    crspc_daily = pd.read_csv(
        f'{base_dir}{file}',
        delimiter=';',
        low_memory=False,
        encoding='utf-8',
    )

    # relabel the columns, then discard fully identical rows
    crspc_daily.columns = list(daily_columns)
    crspc_daily = crspc_daily.drop_duplicates()

    # company names: commas are replaced by dots
    crspc_daily = crspc_daily.assign(
        CoNml=crspc_daily['CoNml'].str.replace(',', '.')
    )

    # write the processed file
    crspc_daily.to_csv(f'data/processed/crspc/securities_daily/{file}', index=False)

In [None]:
base_dir = 'data/source/crspc/'

# raw column labels to keep, in output order
columns = [
    'GVKEY', 'LPERMNO', 'iid', 'datadate', 'tic', 'cusip', 'conm', 'prccd', 'prchd', 'prcld',
    'prcod', 'prcstd', 'exchg', 'exchange', 'sic', 'industry', 'naics', 'eps', 'epsmo',
    'trfd', 'loc', 'div', 'divd', 'divdpaydateind', 'divsp', 'divsppaydate', 'anncdate', 'ipodate',
]

# CamelCase label for each entry of `columns`, position by position
camel_case = [
    'GvKey', 'LPermNo', 'IId', 'DataDate', 'Tic', 'Cusip', 'CoNml', 'PrcCd', 'PrcHd', 'PrcLd',
    'PrcOd', 'PrcStd', 'Exchg', 'Exchange', 'Sic', 'Industry', 'Naics', 'Eps', 'EpsMo',
    'TrFd', 'Loc', 'Div', 'Divd', 'DivdPayDateInd', 'Divsp', 'DivspPayDate', 'AnncDate', 'IpoDate',
]
rename_map = dict(zip(columns, camel_case))

# The two named files are cleaned one at a time and written under the same name.
for file in ['CRSPC_12-2002.csv', 'CRSPC_06-2022.csv']:

    # read the file
    crspc_daily = pd.read_csv(
        f'{base_dir}{file}',
        delimiter=';',
        low_memory=False,
        encoding='utf-8',
    )

    # keep the wanted columns, relabel them, discard fully identical rows
    crspc_daily = (
        crspc_daily[columns]
        .rename(columns=rename_map)
        .drop_duplicates()
    )

    # IpoDate: YYYYMMDD values parsed into datetimes; CoNml: commas replaced by dots
    crspc_daily = crspc_daily.assign(
        IpoDate=pd.to_datetime(crspc_daily['IpoDate'], format='%Y%m%d'),
        CoNml=crspc_daily['CoNml'].str.replace(',', '.'),
    )

    # write the processed file
    crspc_daily.to_csv(f'data/processed/crspc/{file}', index=False)

### Trace

In [None]:
base_dir = 'data/source/trace/'

# Each source file is cleaned on its own and written under the same file name.
for file in tqdm(os.listdir(base_dir), desc='Processing records'):

    # read one trace file
    curr_trace = pd.read_csv(
        f'{base_dir}{file}',
        delimiter=';',
        low_memory=False,
        encoding='utf-8',
    )

    # remove the two columns treated as blank
    curr_trace = curr_trace.drop(columns=['asof_cd', 'spcl_trd_fl'])

    # bond symbol ids: commas are replaced by dots (done while the column
    # still carries its raw snake_case label)
    curr_trace = curr_trace.assign(
        bond_sym_id=curr_trace['bond_sym_id'].str.replace(',', '.')
    )

    # snake_case headers -> CamelCase
    curr_trace = curr_trace.rename(
        columns=lambda label: label.lower().replace('_', ' ').title().replace(' ', '')
    )

    # write the processed file
    curr_trace.to_csv(f'data/processed/trace/{file}', index=False)