In [2]:
import pandas as pd
import numpy as np
import numpy_financial as npf
import geopandas as gpd
import os
import dask
import dask.dataframe as dd
import itertools
from itertools import chain
from math import sqrt, floor, ceil, isnan
import multiprocess
import importlib
from importlib import reload
from collections import Counter
from fuzzywuzzy import process, fuzz
import time
import warnings
import datetime
from datetime import datetime
from datetime import date
warnings.filterwarnings("error")

pd.options.display.max_columns = 500
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 400


# 1. Clean Mergent Data

Notes:
- Gao, Murphy, and Qi (2019) provide step-by-step guidance on how to calculate the call-adjusted yield **spread** from Mergent data, which makes it an ideal reference.

## 1.1 Import data

In [3]:
# Note:
# (1) "coupon_code_c" = FXD, ODF, OID, OIP (& zero-coupon bond) correspond to fixed rate bonds

# Import data. Rows that cannot be parsed are skipped (on_bad_lines='skip'), so the frames may hold fewer rows than the
# raw files.
bondinfo = pd.read_csv('../RawData/Mergent/bondinfo.csv',on_bad_lines='skip',encoding="ISO-8859-1",low_memory=False)

issueinfo = pd.read_csv('../RawData/Mergent/issueinfo.csv',on_bad_lines='skip',encoding="ISO-8859-1",low_memory=False)
# In cases such as "LEXINGTON-FAYETTE", actually two cities/counties are referred to. Replace '-' to facilitate later parsing
for issuer_name_col in ['issuer_long_name_c','issuer_short_name_c']:
    issueinfo[issuer_name_col] = issueinfo[issuer_name_col].str.replace('-',' ',regex=False)

# Clean "offering type": blank, placeholder and stray values are all recoded to 'MISSING'
invalid_offering_types = [' ','LTD','*','18980000.00000']
issueinfo.loc[issueinfo['offering_type_c'].isin(invalid_offering_types),'offering_type_c'] = 'MISSING'

# Clean "issue description": collapse the free-text description into GENERAL OBLIGATION / REVENUE / OTHERS.
# na=False makes a missing description count as "no keyword found" (it ends up in OTHERS) instead of producing a mask
# with missing values, which .loc refuses to index with.
issue_description = issueinfo['issue_description_c'].str.strip().str.upper()
is_general_obligation = issue_description.str.contains('GENERAL OBLIGATION',regex=False,na=False)
# NOTE(review): 'UNLIMITED TAX' and 'SCHOOL' are mapped to REVENUE exactly as before. Unlimited-tax bonds are commonly
# general obligations - confirm that this mapping is intended.
is_revenue = (
    issue_description.str.contains('REVENUE',regex=False,na=False)
    | issue_description.str.contains('UNLIMITED TAX',regex=False,na=False)
    | issue_description.str.contains('SCHOOL',regex=False,na=False)
)
# np.select takes the first matching condition, so GENERAL OBLIGATION wins when a description mentions both
issueinfo['issue_description_Processed'] = np.select(
    [is_general_obligation,is_revenue],
    ['GENERAL OBLIGATION','REVENUE'],
    default='OTHERS')

In [None]:
# Notes:
# (1) Some counties, including some that issue municipal bonds, are not part of any CBSA. See, for example, Harlan County in
# https://en.wikipedia.org/wiki/Kentucky_statistical_areas.
# (2) To prevent an issue by a single county whose name contains "&" from being identified as an issue by multiple counties,
# and also to facilitate the parsing of cities, I pre-process both the county/city data and the issuer data by (a) joining
# the words of any multi-word county/city name with "_", and (b) replacing those multi-word names in the issuer data with
# the same "_"-joined versions.

###################
# List of issuers #
###################

# One row per distinct (long name, short name, state) combination, with the number of issues it accounts for.
# Naming the counts Series before reset_index() yields an 'n_issues' column regardless of what the pandas version
# calls the counts by default (0 in older releases, 'count' from pandas 2.0 on), which a rename of column 0 misses.
issuer_key_columns = ['issuer_long_name_c','issuer_short_name_c','state_c']
issuers = issueinfo.value_counts(issuer_key_columns).rename('n_issues').reset_index()
# Create a unique ID for each issuer
issuers['issuer_id'] = range(len(issuers))
# Inner merge: issues whose key combination is absent from `issuers` are dropped.
# NOTE(review): re-running this cell merges a second time and produces suffixed duplicate columns.
issueinfo = issueinfo.merge(issuers,on=issuer_key_columns)

%run -i SCRIPT_us_states.py

###############
# Import CBSA #
###############

# "CSA" is for metropolitan and "CBSA" includes also those micropolitan
CBSA = pd.read_excel("../RawData/MSA/CBSA.xlsx",skiprows=[0,1])
CBSA = CBSA[CBSA['County/County Equivalent'].notna()]

# Add state abbreviations: the name -> abbreviation mapping becomes a two-column lookup table
# (presumably the mapping is defined by SCRIPT_us_states.py - it is not created in this cell)
us_state_to_abbrev = pd.DataFrame.from_dict(us_state_to_abbrev,orient='index').reset_index()
us_state_to_abbrev.columns = ['State Name','State']
CBSA = CBSA.rename(columns={'County/County Equivalent':'County'}).merge(
    us_state_to_abbrev,on='State Name',how='outer',indicator=True)
# Keep only rows found on both sides of the merge
CBSA = CBSA[CBSA['_merge']=='both'].drop(columns=['_merge'])
# Merge is perfect

# Normalize county names:
#   upper case -> drop " COUNTY" -> " AND " becomes " & " (scenarios like "Lewis and Clark County")
#   -> spaces become "_" -> periods removed
CBSA['County'] = (
    CBSA['County']
    .str.upper()
    .str.replace(' COUNTY','')
    .str.replace(' AND ',' & ')
    .str.replace(' ','_')
    .str.replace('.','',regex=False)
)

###################
# Import counties #
###################

# Complete list of counties, including those not part of CSA
all_counties = pd.read_csv(
    "../RawData/MSA/fips-by-state.csv",sep=',',encoding="ISO-8859-1",low_memory=False
).rename(columns={'name':'County','state':'State'})

# Same normalization as for the CBSA county names: upper case, no " COUNTY", " AND " -> " & ", "_" for spaces, no periods
all_counties['County'] = (
    all_counties['County']
    .str.upper()
    .str.replace(' COUNTY','')
    .str.replace(' AND ',' & ')
    .str.replace(' ','_')
    .str.replace('.','',regex=False)
)

#######################################
# Import cities and city equivalences #
#######################################

# (https://github.com/grammakov/USA-cities-and-states/blob/master/us_cities_states_counties.csv)
all_cities = pd.read_csv("../RawData/MSA/us_cities_states_counties.csv",sep='|',encoding="ISO-8859-1",low_memory=False)

# Treat every alias as a city name of its own: stack the alias rows under the canonical rows, then de-duplicate
all_cities_alias = all_cities.drop(columns=['City']).rename(columns={'City alias':'City'})
all_cities = pd.concat([all_cities.drop(columns=['City alias']),all_cities_alias]).drop_duplicates()

all_cities['City'] = all_cities['City'].str.upper()
# For both name columns: " AND " -> " & " (scenarios like "Lewis and Clark County"), "_" for spaces, periods removed
for place_column in ['County','City']:
    all_cities[place_column] = (
        all_cities[place_column]
        .str.replace(' AND ',' & ')
        .str.replace(' ','_')
        .str.replace('.','',regex=False)
    )
all_cities = all_cities.rename(columns={'State short':'State'})

##########################
# Import school district #
##########################

all_schooldistrcts = pd.read_csv("../RawData/MSA/school-districts_lea_directory.csv",low_memory=False)
all_schooldistrcts = (
    all_schooldistrcts[['lea_name','state_mailing','county_name']]
    .drop_duplicates()
    .rename(columns={'lea_name':'SchoolDistrict','state_mailing':'State','county_name':'County'})
)

# County names get the same normalization as the county tables
all_schooldistrcts['County'] = (
    all_schooldistrcts['County']
    .str.upper()
    .str.replace(' COUNTY','',regex=False)
    .str.replace(' AND ',' & ',regex=False)
    .str.replace(' ','_',regex=False)
    .str.replace('.','',regex=False)
)
# District names keep their spaces (no "_" joining)
all_schooldistrcts['SchoolDistrict'] = (
    all_schooldistrcts['SchoolDistrict']
    .str.upper()
    .str.replace(' AND ',' & ',regex=False)
    .str.replace('.','',regex=False)
)

all_schooldistrcts = all_schooldistrcts[all_schooldistrcts['SchoolDistrict'].notna()]
# Drop rows whose county is one of the codes '-1' / '-2' (presumably missing / not-applicable markers - confirm
# against the data dictionary). These two filters used to be evaluated without being assigned, so they had no effect.
# Rows with a missing county are kept, as a plain != comparison would keep them.
all_schooldistrcts = all_schooldistrcts[~all_schooldistrcts['County'].isin(['-1','-2'])].copy()

# A version of school district name that has subfixes removed, which will be used to merge with issuer data.
# The fragments are removed one after another, in exactly this order, wherever they occur in the name (not only at
# the end), so the order matters.
# NOTE(review): ' ELEMENTARY SCHOOL' can never match because ' ELEMENTARY' is removed earlier in the list; the order
# is deliberately left unchanged.
school_district_fragments = [
    ' INC',' CORP',
    ' SCHOOL DISTRICT',' DISTRICT',' LOCAL SCHOOL DIST',' SCHOOL DIST',' SCHOOL DIS',' SCH DIST',' SCH DIS',
    ' ELEMENTARY',' ELEM',' ISD',' DISTRIC',' LOCAL SD',' SD',' CSD',' SCHS',
    ' AREA SCHOOLS',' MUNICIPAL SCHOOLS',' PUBLIC SCHOOLS',' SCHOOLS',
    ' TOWNSHIP',' CHARTER SCHOOL',' HIGH SCHOOL',' PREPARATORY SCHOOL',' ELEMENTARY SCHOOL',' SCHOOL',
]
district_no_suffix = all_schooldistrcts['SchoolDistrict'].str.replace('-',' ',regex=False)
for fragment in school_district_fragments:
    district_no_suffix = district_no_suffix.str.replace(fragment,'',regex=False)
all_schooldistrcts['SchoolDistrict_NoSubfix'] = district_no_suffix


In [None]:
# When matching with school districts:
# (1) Clean numbers from both "all_schooldistrcts" and "issuer" names, including those that start with "#".
# (2) Clean state names, usually with three characters, from "issuer" names
# Spot check: display the districts whose suffix-free name contains 'RAYMOND'
all_schooldistrcts.loc[all_schooldistrcts['SchoolDistrict_NoSubfix'].str.contains('RAYMOND')]

## 1.2 Match Mergent issuer with county

In [None]:
##########################################
# Round 1: Match by county or city names #
##########################################

def proc_list(issuers):
    """Round 1 of the issuer-to-county match: locate each issuer from county/city names in its long name.

    Uses the notebook-level reference tables ``all_counties`` (columns 'County', 'State') and ``all_cities``
    (columns 'City', 'County', 'State'), whose multi-word names are joined with "_".

    Steps, per issuer:
      1. Multi-word county names, then multi-word city names, of the issuer's own state are replaced in the issuer
         name by their "_"-joined form, so that every place name is a single space-delimited token.
      2. "<name> CNTY ..." : the text before " CNTY " is accepted as the county if it is exactly one county of the state.
         "<a> & <b> ... CNTYS ..." : up to five names before " CNTYS " are checked the same way and stored in
         'County', 'County_2', ..., 'County_5' by position.
      3. If 'County' is still empty, the first token of the name - as is, and after removing leading direction words -
         is compared with the county names of the state; a unique match is accepted.
      4. If 'County' is still empty, the same two tokens are compared with the city names of the state; the first
         matching row supplies both 'City' and 'County'.

    Parameters
    ----------
    issuers : pandas.DataFrame
        Must contain 'issuer_long_name_c' (strings) and 'state_c'. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``issuers`` with the columns 'issuer_long_name_c_Processed', 'County', 'County_2', 'County_3',
        'County_4', 'County_5' and 'City' added. Fields without a match stay None.
    """

    issuers = issuers.copy()
    issuers['issuer_long_name_c_Processed'] = issuers['issuer_long_name_c']

    def _join_multiword_names(reference,name_col):
        # Replace "A B" by "A_B" in the names of issuers located in the same state as the multi-word place "A_B".
        # Restricting to the same state addresses cases like "Thief River Falls" and "River Falls", which are in
        # different states.
        # na=False: a missing place name is simply not multi-word (a mask with missing values cannot be used to index).
        multiword = reference[reference[name_col].str.contains('_',regex=False,na=False)]
        # The same (name, state) pair can occur on several rows; one pass per pair is enough, first-seen order is kept
        pairs = multiword[[name_col,'State']].drop_duplicates()
        for joined_name,state in pairs.itertuples(index=False):
            in_state = issuers['state_c']==state
            if not in_state.any():
                continue
            issuers.loc[in_state,'issuer_long_name_c_Processed'] = \
                issuers.loc[in_state,'issuer_long_name_c_Processed'].str.replace(
                    joined_name.replace('_',' '),joined_name,regex=False)

    # Counties first, then cities - the order decides which name wins when they overlap
    _join_multiword_names(all_counties,'County')
    _join_multiword_names(all_cities,'City')

    # Initialize fields
    county_columns = ['County','County_2','County_3','County_4','County_5']
    for new_column in county_columns+['City']:
        issuers[new_column] = None

    # Leading direction words, checked once each and in this order
    direction_prefixes = ['NORTHEAST','SOUTHEAST','NORTHWEST','SOUTHWEST','EAST','WEST','NORTH','SOUTH','CENTRAL']

    def _strip_direction_prefixes(name):
        # A prefix is removed only when it is a word of its own, i.e. followed by a space. Comparing the leading
        # characters alone would also cut names that merely start with those letters
        # ("WESTCHESTER" -> "HESTER", "EASTON" -> "N").
        for prefix in direction_prefixes:
            if name.startswith(prefix+' '):
                name = name[len(prefix)+1:]
        return name

    def _is_unique_county(name,state):
        # True when exactly one row of all_counties carries this county name in this state
        return ((all_counties['County']==name)&(all_counties['State']==state)).sum()==1

    for idx,row in issuers.iterrows():

        long_name = row['issuer_long_name_c_Processed']
        state = row['state_c']

        ###########################################################################################
        # Handle those clean cases where county names and the key word "CNTY" are in issuer names #
        ###########################################################################################

        if ' CNTY ' in long_name:
            # Issue by one single county
            name_county = long_name.split(' CNTY ')[0]
            if _is_unique_county(name_county,state):
                issuers.at[idx,'County'] = name_county
        elif ' CNTYS ' in long_name:
            # Issue by multiple counties: "A & B C CNTYS ..." -> ['A','B','C']. The i-th name goes to the i-th county
            # column; names beyond the fifth are ignored.
            name_counties = long_name.split(' CNTYS ')[0].replace(' & ',' ').split(' ')
            for county_column,name_county in zip(county_columns,name_counties):
                if _is_unique_county(name_county,state):
                    issuers.at[idx,county_column] = name_county

        ##############################################################################
        # Match by county or city or city equivalence or school district or hospital #
        ##############################################################################

        # Conduct this as long as a match by county has not been identified. Direction prefixes (NEWS) are handled here.
        first_token = long_name.split(' ')[0]
        first_token_noloc = _strip_direction_prefixes(long_name).split(' ')[0]

        # Match by county again. This addresses cases where key words "CNTY" are not in issuer names
        if pd.isna(issuers.at[idx,'County']):
            county_hits = all_counties[
                ((all_counties['County']==first_token)|(all_counties['County']==first_token_noloc))
                &(all_counties['State']==state)]
            if len(county_hits)==1:
                issuers.at[idx,'County'] = county_hits['County'].iloc[0]

        # Match by city
        if pd.isna(issuers.at[idx,'County']):
            city_hits = all_cities[
                ((all_cities['City']==first_token)|(all_cities['City']==first_token_noloc))
                &(all_cities['State']==state)]
            # Note that there could be duplicate entries in "all_cities"; the first matching row is used
            if len(city_hits)>=1:
                matched_city = city_hits.iloc[0]
                # Record the city that actually matched, so that 'City' and 'County' come from the same row (the
                # unstripped first token could be just a direction word such as "NORTH")
                issuers.at[idx,'City'] = matched_city['City']
                issuers.at[idx,'County'] = matched_city['County']

    return issuers

# Dry run on the first ten issuers, only to learn the output column layout that dask needs as `meta`
meta_columns = list(proc_list(issuers.head(10)).columns)
# Split the issuers into 20 partitions and process them in 20 worker processes
issuers_dd = dd.from_pandas(issuers,npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    issuers = issuers_dd.map_partitions(
        proc_list,
        meta=pd.DataFrame(columns=meta_columns),
    ).compute()

In [None]:
######################################################################################################################
# Round 2: Match by school district or hospital names or universities or Community Development District (in Florida) #
######################################################################################################################


In [None]:
# Add CSA information

def proc_list(issuers):
    """Attach CSA/CBSA codes and titles to issuers through their matched county.

    For every issuer, the notebook-level ``CBSA`` table is searched for rows whose 'County' and 'State' equal the
    issuer's 'County' and 'state_c'. Only when exactly one row is found are its 'CSA Code', 'CSA Title', 'CBSA Code'
    and 'CBSA Title' copied over; otherwise the four fields stay None.

    Parameters
    ----------
    issuers : pandas.DataFrame
        Must contain 'County' and 'state_c'. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``issuers`` with the four CSA/CBSA columns added.
    """

    issuers = issuers.copy()

    area_fields = ['CSA Code','CSA Title','CBSA Code','CBSA Title']
    for field in area_fields:
        issuers[field] = None

    for idx,issuer in issuers.iterrows():
        same_county = CBSA['County']==issuer['County']
        same_state = CBSA['State']==issuer['state_c']
        hits = CBSA[same_county&same_state]
        # No match, or an ambiguous one: leave the fields empty
        if len(hits)!=1:
            continue
        for field in area_fields:
            issuers.at[idx,field] = hits[field].iloc[0]

    return issuers

# Dry run on the first ten issuers, only to learn the output column layout that dask needs as `meta`
meta_columns = list(proc_list(issuers.head(10)).columns)
# Split the issuers into 20 partitions and process them in 20 worker processes
issuers_dd = dd.from_pandas(issuers,npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    issuers = issuers_dd.map_partitions(
        proc_list,
        meta=pd.DataFrame(columns=meta_columns),
    ).compute()

In [None]:
# Merge issuer information into the issues dataset (inner merge on the issuer ID)
issuer_geography_columns = ['issuer_id','County','City','CSA Code','CBSA Code']
issueinfo = issueinfo.merge(issuers[issuer_geography_columns],on=['issuer_id'])

# Bond level: parse the dates (unparseable values become NaT) and measure settlement-to-maturity in days
for date_column in ['maturity_date_d','settlement_date_d']:
    bondinfo[date_column] = pd.to_datetime(bondinfo[date_column],errors='coerce')
bondinfo['dif_dates'] = (bondinfo['maturity_date_d']-bondinfo['settlement_date_d']).dt.days

# Keep only bonds with a numeric issue ID, stored as integer so that it can serve as the merge key
bondinfo['issue_id_l'] = pd.to_numeric(bondinfo['issue_id_l'],errors='coerce')
bondinfo = bondinfo[bondinfo['issue_id_l'].notna()].copy()
bondinfo['issue_id_l'] = bondinfo['issue_id_l'].astype(int)
bondinfo = bondinfo[[
    'issue_id_l','maturity_id_l',
    'offering_yield_f','coupon_f','coupon_code_c','offering_price_f',
    'total_maturity_offering_amt_f',
    'maturity_date_d','dif_dates',
    'tax_code_c']]

# Issue level: the fields carried onto each bond, de-duplicated
issueinfo_nodup = issueinfo[[
    'issuer_id','issue_id_l',
    'offering_date_d',
    'County','City','CSA Code','CBSA Code',
    'offering_type_c','issue_description_c','issue_description_Processed'
    ]].drop_duplicates()

# Bond-level dataset: every bond together with the information of its issue
Mergent = bondinfo.merge(issueinfo_nodup,on='issue_id_l')
Mergent['sale_year'] = pd.to_datetime(Mergent['offering_date_d']).dt.year

# Export data (note: the file extension is spelled '.parque')
Mergent.to_parquet('../RawData/Mergent/Mergent_bondlevel.parque')

## 1.3 Calculate yield of synthetic risk-free bond

Notes:
- Without the coupon rate, it is impossible to calculate the price (and hence the yield) of the synthetic Treasury bond. For example, consider Bond A, which pays \\$5 one year from now and \\$105 (the final \\$5 coupon plus the \\$100 principal) two years from now, and Bond B, which pays $100 \times (1+5\%)^2 = 110.25$ dollars two years from now. Priced at par (\\$100), these two bonds have the same yield of 5%. Suppose that the Treasury zero-coupon yield is 0% at one year and 20% at two years. Then the price of the first synthetic bond is higher than that of the second, and the yield of the first synthetic bond is lower than that of the second. In other words, without the coupon rate I do not know when the cash flows arrive, so I do not know which component of the risk-free rate should be teased out of the municipal bond yield. An INACCURATE approximation is to simply subtract the Treasury yield from the municipal bond yield, but it is erroneous to do so.

Luckily except for later parts of the sample, coupon rate is usually available.

- I only handle fixed rate bonds, and (for the moment) skip zero-coupon bonds


In [None]:
%%time

Mergent = Mergent.copy()

# Treasury yield: keep the days on which the 1-year zero-coupon yield is available
feds200628 = pd.read_csv("../RawData/FedBOG/feds200628.csv", header=9)
feds200628 = feds200628[feds200628['SVENY01'].notna()]

# Keep the date and the 1- to 30-year yields; drop the zero padding from the column names (SVENY01 -> SVENY1)
columns = ['Date']+['SVENY'+str(tenor).zfill(2) for tenor in range(1,31)]
new_columns = ['Date']+['SVENY'+str(tenor) for tenor in range(1,31)]
feds200628 = feds200628[columns].rename(columns=dict(zip(columns,new_columns)))

# Dates parsed into the future (after 2050-01-01) are moved back by one century
feds200628['Date'] = pd.to_datetime(feds200628['Date'])
threshold_date = pd.to_datetime('2050-01-01')
feds200628['Date'] = feds200628['Date'].map(
    lambda day: day - pd.DateOffset(years=100) if day > threshold_date else day)

# [first day, last day] of the periods in which yields are used up to 7, 10, 15, 20 and 30 years respectively
max_year_7 = [pd.Timestamp('1961-06-14'),pd.Timestamp('1971-08-15')]
max_year_10 = [pd.Timestamp('1971-08-16'),pd.Timestamp('1971-11-14')]
max_year_15 = [pd.Timestamp('1971-11-15'),pd.Timestamp('1981-07-01')]
max_year_20 = [pd.Timestamp('1981-07-02'),pd.Timestamp('1985-11-24')]
max_year_30 = [pd.Timestamp('1985-11-25'),pd.Timestamp('2023-11-03')]

# Filled in bond by bond in the next step
Mergent['sync_bond_yield'] = None

def proc_list(Mergent_part):
    """Compute, for each fixed-rate bond, the yield of a synthetic Treasury bond with the same cash flows.

    For every row with coupon code OIP, FXD or OID and non-missing coupon, maturity and offering date:
      1. the offering date must fall into one of the ``max_year_*`` windows and the maturity must not exceed the
         number of years attached to that window (times 365 days);
      2. the bond is given semiannual cash flows: coupon_f/2 per period, plus 100 in the last period;
      3. cash flow k (paid after (k+1)/2 years) is discounted with the ``feds200628`` yield, read as a percentage
         with annual compounding, of the curve date closest to the offering date: the 1-year yield for the first
         payment, the n-year yield for a payment after n years, and the average of the two neighbouring yields for
         a payment after n+0.5 years;
      4. the per-period internal rate of return of (-price, cash flows) is annualized as (1+irr)**2-1 and stored in
         'sync_bond_yield' (a decimal, e.g. 0.05).

    Uses the notebook-level objects ``feds200628`` and ``max_year_7`` ... ``max_year_30``.

    Parameters
    ----------
    Mergent_part : pandas.DataFrame
        Must contain 'coupon_code_c', 'coupon_f', 'dif_dates' (days), 'offering_date_d' (text formatted as
        "%Y-%m-%d %H:%M:%S") and 'sync_bond_yield'. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``Mergent_part`` with 'sync_bond_yield' filled in where it could be computed.
    """

    Mergent_part = Mergent_part.copy()

    # NOTE(review): the import cell also lists ODF as a fixed-rate coupon code; it is not handled here - confirm.
    fixed_rate_codes = ('OIP','FXD','OID')

    # ([first day, last day], longest maturity in years that can be discounted in that period)
    curve_windows = [(max_year_7,7),(max_year_10,10),(max_year_15,15),(max_year_20,20),(max_year_30,30)]

    def _is_missing(value):
        # Covers None/NaN/NaT as well as the literal text 'nan'
        return pd.isna(value) or str(value)=='nan'

    for idx,row in Mergent_part.iterrows():

        # Only handle if bond is fixed rate
        if row['coupon_code_c'] not in fixed_rate_codes:
            continue

        # Skip if key info is missing. A missing offering date is skipped as well; it cannot be parsed.
        if _is_missing(row['coupon_f']) or _is_missing(row['dif_dates']) or _is_missing(row['offering_date_d']):
            continue

        offering_date = datetime.strptime(row['offering_date_d'],"%Y-%m-%d %H:%M:%S")
        coupon_rate = float(row['coupon_f'])
        maturity = row['dif_dates']

        # Determine if synthetic bond can be constructed. Cannot do so if the length of zero-coupon yields is not long
        # enough. Both ends of each window are included: the windows are consecutive calendar days, so a strict
        # comparison at the start would leave an offering dated exactly on a window's first day in no window at all.
        # NOTE(review): the cutoff years*365 ignores leap days, so a bond with a full 30 calendar years between
        # settlement and maturity (more than 10950 days) is excluded - confirm that this is intended.
        rf_available = any(
            window[0]<=offering_date<=window[1] and maturity<=max_years*365
            for window,max_years in curve_windows)
        if not rf_available:
            continue

        # Obtain the treasury zero-coupon yield curve at the closest date. Taking the position of the smallest gap
        # avoids copying and sorting the whole table for every bond; if two dates are equally close, the one that
        # comes first in feds200628 is used.
        date_gap = (feds200628['Date']-offering_date).abs()
        curve = feds200628.iloc[date_gap.argmin()]

        # Semiannual cash flows per 100 of face value; at least one payment
        N_coupons = int(np.max([1,np.around(maturity/(365/2))]))
        cf = [coupon_rate/2]*N_coupons
        cf[-1] += 100

        # Zero-coupon yield (in percent) for each payment date
        zero_yields = []
        for cf_idx in range(N_coupons):
            year = ceil(cf_idx/2)
            if cf_idx==0:
                # First payment (after half a year): the 1-year yield is the shortest one available
                zero_yields.append(curve['SVENY1'])
            elif cf_idx%2==1:
                # Payment after a whole number of years
                zero_yields.append(curve['SVENY'+str(year)])
            else:
                # Payment between two whole years: average of the neighbouring yields
                zero_yields.append((curve['SVENY'+str(year)]+curve['SVENY'+str(year+1)])/2)
        discount_factor = [
            (1/(1+zero_yields[cf_idx]/100))**((cf_idx+1)/2) for cf_idx in range(N_coupons)]

        # Bond price and yield of synthetic bond
        sync_bond_price = np.dot(cf,discount_factor)
        sync_bond_yield = (1+npf.irr([-sync_bond_price]+cf))**2-1

        # Record data
        Mergent_part.at[idx,'sync_bond_yield'] = sync_bond_yield

    return Mergent_part

# Dry run on the first ten bonds, only to learn the output column layout that dask needs as `meta`
meta_columns = list(proc_list(Mergent.head(10)).columns)
# Split the bonds into 40 partitions and process them in 40 worker processes
Mergent_dd = dd.from_pandas(Mergent,npartitions=40)
with dask.config.set(scheduler='processes',num_workers=40):
    Mergent = Mergent_dd.map_partitions(
        proc_list,
        meta=pd.DataFrame(columns=meta_columns),
    ).compute()

## 1.4 Export data

In [None]:
# Write the bond-level dataset, now including 'sync_bond_yield', to the clean-data folder.
# NOTE(review): the extension is spelled '.parque'; left unchanged because other code may read this exact path.
Mergent.to_parquet('../CleanData/Mergent/0E_Mergent_bondlevel.parque')