# bondret

> Retrieve and process data from WRDS Bond Returns File 

Since this is a proprietary dataset, the documentation below cannot show any of the data that is retrieved or generated; only column names are shown.

In [None]:
#| default_exp wrds.bondret

In [None]:
#|exports
from __future__ import annotations
from typing import List

import pandas as pd
import numpy as np

import pandasmore as pdm
from finsets.wrds import wrds_api

In [None]:
#| exports
# Name and landing page of the data provider for this dataset
PROVIDER = 'Wharton Research Data Services (WRDS)'
URL = 'https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-bond-returns/'
# WRDS library and table that `list_all_vars` describes and `get_raw_data` queries
LIBRARY = 'wrdsapps'
TABLE = 'bondret'
# Table (in the same library) that `get_raw_data` joins on `cusip` to obtain `permno` and `permco`
LINK_TABLE = 'bondcrsp_link'
# Frequency passed as `freq` to `pdm.setup_panel` in `process_raw_data` ('M' presumably means monthly -- confirm against pandasmore)
FREQ = 'M'
# Not referenced by the code visible here; presumably first/last year of available data (None = no upper bound) -- TODO confirm
MIN_YEAR = 2002
MAX_YEAR = None
# Entity identifier column; the raw name is passed as `panel_ids` in `process_raw_data`
ENTITY_ID_IN_RAW_DSET = 'permno'
ENTITY_ID_IN_CLEAN_DSET = 'permno'
# Time column; the raw name is passed as `time_var` in `process_raw_data`. The clean name evaluates to 'Mdate'
TIME_VAR_IN_RAW_DSET = 'date'
TIME_VAR_IN_CLEAN_DSET = f'{FREQ}date'

In [None]:
#| export
def list_all_vars() -> pd.DataFrame:
    """Collects names of all available variables from WRDS `{LIBRARY}.{TABLE}`.

    Returns a DataFrame with columns `name`, `type`, `wrds_library` and `wrds_table`
    (one row per variable reported by `describe_table`).
    """

    # Open the connection *before* the `try`: if connecting fails there is nothing to close,
    # and the real connection error propagates instead of being masked by an
    # UnboundLocalError on `db` inside `finally`.
    db = wrds_api.Connection()
    try:
        df = db.describe_table(LIBRARY,TABLE).assign(wrds_library=LIBRARY, wrds_table=TABLE)
    finally:
        # Always release the connection, even if describing the table fails
        db.close()

    return df[['name','type','wrds_library','wrds_table']].copy()

In [None]:
#| eval: false
all_vars = list_all_vars()

Loading library list...
Done
Approximately 3854028 rows in wrdsapps.bondret.


In [None]:
#| eval: false
all_vars.head()

Unnamed: 0,name,type,wrds_library,wrds_table
0,date,DATE,wrdsapps,bondret
1,issue_id,DOUBLE PRECISION,wrdsapps,bondret
2,cusip,VARCHAR(9),wrdsapps,bondret
3,bond_sym_id,VARCHAR(14),wrdsapps,bondret
4,bsym,DOUBLE PRECISION,wrdsapps,bondret


In [None]:
#| export
def parse_varlist(vars: List[str]=None,
                  required_vars = None,
                  ) -> str:
    """Validates that `vars` are available in `{LIBRARY}.{TABLE}` table and adds a. prefixes to variable names to feed into an SQL query

    `vars` may be a list of variable names, a single variable name, or '*' / None to request
    every available variable. `required_vars` (optional list) are always included and are
    placed first; the remaining requested variables follow in the order given, without duplicates.

    Returns a comma-separated string of prefixed column names (e.g. 'a.cusip,a.date').
    Raises `ValueError` if any requested or required variable is not in the table.
    """

    # Get all available variables and add the table-alias prefixes needed for the SQL query
    prefix_mapping = {TABLE: 'a.'}
    all_avail_vars = list_all_vars().drop_duplicates(subset='name',keep='first').copy()
    all_avail_vars['w_prefix'] = [prefix_mapping[tbl] + name
                                  for tbl, name in zip(all_avail_vars['wrds_table'], all_avail_vars['name'])]

    if vars is None or (isinstance(vars, str) and vars == '*'):
        return ','.join(list(all_avail_vars['w_prefix']))

    # A bare string is one variable name, not an iterable of single characters
    if isinstance(vars, str): vars = [vars]
    required_vars = [] if required_vars is None else list(required_vars)

    # Required vars first, then requested vars; dict.fromkeys de-duplicates while keeping
    # a deterministic order (a plain `set` would shuffle the column order between runs)
    vars_to_get = list(dict.fromkeys([*required_vars, *vars]))

    # Validate variables to be downloaded (make sure that they are in the target database)
    prefixed_name = dict(zip(all_avail_vars['name'], all_avail_vars['w_prefix']))
    invalid_vars = [v for v in vars_to_get if v not in prefixed_name]
    if invalid_vars: raise ValueError(f"These vars are not in the database: {invalid_vars}")

    # Look up the prefixed name of each requested variable
    return ','.join(prefixed_name[v] for v in vars_to_get)

In [None]:
#| eval: false
parse_varlist()

Loading library list...
Done
Approximately 3854028 rows in wrdsapps.bondret.


'a.date,a.issue_id,a.cusip,a.bond_sym_id,a.bsym,a.isin,a.company_symbol,a.bond_type,a.security_level,a.conv,a.offering_date,a.offering_amt,a.offering_price,a.principal_amt,a.maturity,a.treasury_maturity,a.coupon,a.day_count_basis,a.dated_date,a.first_interest_date,a.last_interest_date,a.ncoups,a.amount_outstanding,a.r_sp,a.r_mr,a.r_fr,a.n_sp,a.n_mr,a.n_fr,a.rating_num,a.rating_cat,a.rating_class,a.t_date,a.t_volume,a.t_dvolume,a.t_spread,a.t_yld_pt,a.yield,a.price_eom_flg,a.price_eom,a.price_ldm,a.price_l5m,a.gap,a.coupmonth,a.nextcoup,a.coupamt,a.coupacc,a.multicoups,a.ret_eom,a.ret_ldm,a.ret_l5m,a.tmt,a.remcoups,a.duration,a.defaulted,a.default_date,a.default_type,a.reinstated,a.reinstated_date'

In [None]:
#| export
def get_raw_data(
        vars: List[str]='*', # Downloads all available variables by default
        required_vars: List[str]=None, # List of variables that will get downloaded, even if not in `vars`. If None, `['cusip','date']` is used
        nrows: int=None, #Number of rows to download. If None, full dataset will be downloaded
        start_date: str=None, # Start date in MM/DD/YYYY format
        end_date: str=None #End date in MM/DD/YYYY format
) -> pd.DataFrame:
    """Downloads `vars` from `start_date` to `end_date` from WRDS `{LIBRARY}.{TABLE}` library and adds PERMNO and PERMCO

    PERMNO and PERMCO come from an inner join with `{LIBRARY}.{LINK_TABLE}` on `cusip`, keeping
    only rows whose date falls inside the link's validity window (an open-ended link is treated
    as valid through the current date).
    """

    # Build the default here rather than in the signature to avoid a shared mutable default
    if required_vars is None: required_vars = ['cusip','date']

    wrds_api.validate_dates([start_date, end_date])
    vars = parse_varlist(vars, required_vars=required_vars)

    # `date` is qualified with the bond-returns alias (`a.`) everywhere so the query stays
    # unambiguous regardless of which columns the link table has
    sql_string=f"""SELECT c.permno as permno, c.permco as permco, {vars}
                    FROM {LIBRARY}.{TABLE} AS a
                    INNER JOIN {LIBRARY}.{LINK_TABLE} AS c ON a.cusip = c.cusip 
                    WHERE a.date BETWEEN c.link_startdt AND COALESCE(c.link_enddt, CURRENT_DATE)
                """
    # Optional filters are passed as named query parameters, never interpolated into the SQL text
    if start_date is not None: sql_string += r" AND a.date >= %(start_date)s"
    if end_date is not None: sql_string += r" AND a.date <= %(end_date)s"
    if nrows is not None: sql_string += r" LIMIT %(nrows)s"
    
    return wrds_api.download(sql_string,
                             params={'start_date':start_date, 'end_date':end_date, 'nrows':nrows})

In [None]:
#| eval: false
#raw = get_raw_data(nrows=1000)

In [None]:
#| eval: false
raw.head(0)

Unnamed: 0,permno,permco,date,issue_id,cusip,bond_sym_id,bsym,isin,company_symbol,bond_type,...,ret_ldm,ret_l5m,tmt,remcoups,duration,defaulted,default_date,default_type,reinstated,reinstated_date


In [None]:
#| export
def process_raw_data(
        df: pd.DataFrame=None,  # Must contain `permno` and `date` columns
        clean_kwargs: dict=None,  # Params to pass to `pdm.setup_panel` other than `panel_ids`, `time_var`, and `freq`
) -> pd.DataFrame:
    """Applies `pandasmore.setup_panel` to `df`

    Before the panel is set up, the `cusip` column (if present) is converted to a categorical;
    this conversion is applied to the `df` object that was passed in.
    By default `drop_index_duplicates=False` and `panel_ids_toint=False` are passed to
    `setup_panel`; either can be overridden through `clean_kwargs`.

    Raises `ValueError` if `df` is None.
    """

    if df is None:
        raise ValueError("`df` must be a DataFrame containing the raw data (e.g. the output of `get_raw_data`)")

    # Defaults first, so caller-supplied values win instead of raising a duplicate-keyword TypeError
    setup_kwargs = {
        'drop_index_duplicates': False, # Multiple bonds (cusips) per permno per date
        'panel_ids_toint': False,
    }
    if clean_kwargs: setup_kwargs.update(clean_kwargs)

    # Change some variables to categorical
    for col in ['cusip']:
        if col in df.columns:
            df[col] = df[col].astype('string').astype('category')

    # Set up panel structure
    df = pdm.setup_panel(df,
                         panel_ids=ENTITY_ID_IN_RAW_DSET, time_var=TIME_VAR_IN_RAW_DSET, freq=FREQ,
                         **setup_kwargs)
    return df 

In [None]:
#| eval: false
df_clean = process_raw_data(raw)

In [None]:
#| eval: false
df_clean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,permco,issue_id,cusip,bond_sym_id,bsym,isin,company_symbol,bond_type,...,ret_ldm,ret_l5m,tmt,remcoups,duration,defaulted,default_date,default_type,reinstated,reinstated_date
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
26403,2002-07,2002-07-31,2002-07-31,20587,383238.0,00072AAA8,DIS.GA,,US00072AAA88,DIS,CDEB,...,,,6.641667,14.0,4.797721,N,,,,
26403,2002-08,2002-08-31,2002-08-31,20587,383238.0,00072AAA8,DIS.GA,,US00072AAA88,DIS,CDEB,...,,0.015265,6.555556,13.0,4.928041,N,,,,
26403,2002-10,2002-10-31,2002-10-31,20587,383238.0,00072AAA8,DIS.GA,,US00072AAA88,DIS,CDEB,...,,,6.386111,13.0,4.721617,N,,,,
26403,2002-11,2002-11-30,2002-11-30,20587,383238.0,00072AAA8,DIS.GA,,US00072AAA88,DIS,CDEB,...,,0.062946,6.302778,13.0,4.714069,N,,,,
26403,2003-02,2003-02-28,2003-02-28,20587,383238.0,00072AAA8,DIS.GA,,US00072AAA88,DIS,CDEB,...,,,6.052778,12.0,4.673783,N,,,,


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()