## Cleans TRACE Corporate Bond Data

In [1]:
# Imports
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from dateutil.relativedelta import *
from pandas.tseries.offsets import *

In [2]:
# Pulls only the first 1000-row chunk of prof Yoshio's data to use as a working sample
with pd.read_csv("prof_yoshio_data/trace_credit_spreads.csv", chunksize=1000) as reader:
    # Advancing the chunked reader once yields the first `chunksize` rows
    data = next(reader)

In [3]:
# Column names, non-null counts and dtypes of the sample before any type cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cusip_id                1000 non-null   object 
 1   date                    1000 non-null   int64  
 2   price                   1000 non-null   float64
 3   volume                  1000 non-null   int64  
 4   ISSUE_ID                1000 non-null   int64  
 5   ISSUER_ID               1000 non-null   int64  
 6   PROSPECTUS_ISSUER_NAME  1000 non-null   object 
 7   MATURITY                1000 non-null   int64  
 8   SECURITY_LEVEL          1000 non-null   object 
 9   OFFERING_AMT            1000 non-null   int64  
 10  OFFERING_DATE           1000 non-null   int64  
 11  OFFERING_PRICE          852 non-null    float64
 12  INTEREST_FREQUENCY      1000 non-null   int64  
 13  COUPON                  1000 non-null   float64
 14  standard                1000 non-null   i

In [174]:
# data.memory_usage(deep=True) \
# .gt(8000)

As expected, the "object" (i.e. string) columns account for the majority of the memory. The remaining columns (float64/int64) use 8 bytes (64 bits) per value; many of these can be downcast to 32-, 16-, or even 8-bit types.

In [183]:
# Work on a copy so the raw sample in `data` keeps its original header
df = data.copy()
# Lower-case every column name so later lookups can use one consistent spelling
df.columns = [name.lower() for name in df.columns]

In [184]:
def cols(col_names, data_type):
    '''
    Maps every column name to one shared dtype, ready for DataFrame.astype.

    col_names: iterable of column names (list, tuple, generator, ...)
    data_type: dtype, or dtype string, assigned to every name
    Returns a dict {column name: data_type}, keys in input order.
    '''
    # dict.fromkeys iterates the names directly, so any iterable works
    # (not only sequences that support len() and integer indexing).
    return dict.fromkeys(col_names, data_type)

In [191]:
def clean_data_types(df):
    '''
    Casts the TRACE credit-spread columns to compact dtypes.

    Column names are lower-cased before anything else, so the frame may be
    passed either straight from ``pd.read_csv`` (the file's header mixes
    upper- and lower-case names) or with its columns already lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding, after lower-casing, every column named in this
        function. The caller's frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame with lower-case column names, the rating columns in
        ``dropped_cols`` removed and the listed columns cast.

    Raises
    ------
    KeyError
        If a column that is dropped or cast is missing from ``df``.
    '''
    # Normalise the header first: every name below is lower-case, and both
    # ``drop`` and ``astype`` raise KeyError on a label they cannot find.
    df = df.rename(columns=str.lower)

    # Rating columns that are removed rather than cast
    dropped_cols = ["sp_rat", "moody_rat", "rating", 'rating_date_spr', 'rating_date_mdy']
    df = df.drop(dropped_cols, axis=1)

    # String/text columns
    string_cols = ['cusip_id', 'cusip', 'prospectus_issuer_name']
    strings = cols(string_cols, 'string[pyarrow]')

    # NOTE(review): astype('bool') turns every non-zero / non-empty value into
    # True; this assumes the flags are stored as 0/1 -- confirm against the CSV.
    bool_cols = ['callable', 'make_whole', 'conv', 'junior']
    bools = cols(bool_cols, 'bool')

    category_cols = ['security_level', 'rating_spr', 'rating_mdy', 'standard', 'putop', 'industry_code']
    categories = cols(category_cols, 'category')

    # Float/Integer columns
    int8_cols = ['interest_frequency']
    int8s = cols(int8_cols, 'int8')

    int16_cols = ['sic_code']
    int16s = cols(int16_cols, 'int16')

    # NOTE(review): int32 tops out at 2,147,483,647 -- confirm that volume and
    # the amount columns never exceed it on the full file.
    int32_cols = ['volume', 'offering_amt', 'amount_outstanding', 'issue_id', 'issuer_id']
    int32s = cols(int32_cols, 'int32')

    float32_cols = ['price', 'offering_price', 'coupon', 'accrued_interest', 'tau',
                    'age', 'ytm', 'duration', 'tr_dirty_price', 'tr_ytm', 'cs']
    float32s = cols(float32_cols, 'float32')

    # Dates are kept as integers here.
    # formula to convert to datetime: df['date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))
    date_cols = ['date', 'maturity', 'offering_date', 'month1']
    dates = cols(date_cols, 'int32')

    # Merge the per-dtype mappings into one {column: dtype} dict and cast in one pass
    types = strings | bools | categories | int8s | int16s | int32s | float32s | dates

    return df.astype(types)

In [192]:
# Replace the sample frame with its type-cleaned version.
# NOTE: not re-runnable as-is -- on an already-cleaned df the rating columns are
# gone and the drop inside clean_data_types raises KeyError; rebuild df first.
df = clean_data_types(df=df)
# df.memory_usage(deep=True)

In [198]:
# Dtypes and memory footprint after cleaning, for comparison with data.info() on the raw sample
df.info()
# Author's observation from this run: memory usage decreased by about half

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 35 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   cusip_id                1000 non-null   string  
 1   date                    1000 non-null   int32   
 2   price                   1000 non-null   float32 
 3   volume                  1000 non-null   int32   
 4   issue_id                1000 non-null   int32   
 5   issuer_id               1000 non-null   int32   
 6   prospectus_issuer_name  1000 non-null   string  
 7   maturity                1000 non-null   int32   
 8   security_level          1000 non-null   category
 9   offering_amt            1000 non-null   int32   
 10  offering_date           1000 non-null   int32   
 11  offering_price          852 non-null    float32 
 12  interest_frequency      1000 non-null   int8    
 13  coupon                  1000 non-null   float32 
 14  standard                1

In [None]:
# Reads the complete file (not just the 1000-row sample) and reports its deep memory footprint
data = pd.read_csv("prof_yoshio_data/trace_credit_spreads.csv")
print(f"{data.memory_usage(deep=True).sum()/1000} KB")

In [None]:
# Cleans data
# The raw header mixes upper- and lower-case names, while clean_data_types
# looks columns up by lower-case name, so lower-case the header before cleaning.
cleaned_data = clean_data_types(data.rename(columns=str.lower))
# Footprint of the cleaned frame, for comparison with the raw figure printed above
print(cleaned_data.memory_usage(deep=True).sum()/1000, 'KB')

In [None]:
# Writes the cleaned frame to disk as CSV
# NOTE(review): CSV is plain text, so the compact dtypes set above are not stored
# in the file, and to_csv writes the index as a leading unnamed column by default.
cleaned_data.to_csv(path_or_buf='cleaned_data.csv')