In [2]:
import pandas as pd
import numpy as np
import numpy_financial as npf
import statsmodels.api as sm
import geopandas as gpd
import os
import dask
import dask.dataframe as dd
import itertools
from itertools import chain
from math import sqrt, floor, ceil, isnan
import multiprocess
import multiprocessing
import importlib
from importlib import reload
from collections import Counter
from fuzzywuzzy import process, fuzz
import time
import warnings
import datetime
from datetime import datetime
from datetime import date
warnings.filterwarnings("error")

pd.options.display.max_columns = 500
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 400


Estimate a statistical model of each issuance fee (advisor fee, insurance premium, ratings fee), expressed as a ratio of the fee to the principal amount of the issue:
$$
\begin{aligned}
\text{Cost} ={}& \gamma_1\frac{\text{County Income}}{\text{National Average Income}}+\gamma_2\frac{\text{County Population}}{\text{National Average County Population}} \\
&+\delta_{\text{maturity bracket}}+\delta_{\text{amount bracket}} \\
&+\theta_{\text{method of sales}}+\theta_{\text{tax status}}+\theta_{\text{source of repayment}} \\
&+\beta_{\text{if prior relationship}}+\beta_{\text{if refunding issue}}
\end{aligned}
$$

# 1. California

## 1.1 Import data

In [2]:
#-------------------------------------------------------#
# Load the California fee data and tag county and state #
#-------------------------------------------------------#

CaliforniaData = pd.read_csv('../RawData/California/CDA_All_Data.csv')

# Upper-cased county name plus a constant state code
CaliforniaData = CaliforniaData.assign(
    County=lambda frame: frame['Issuer County'].str.upper(),
    State='CA')

#-------------------------------------------#
# Each fee as a share of the principal amount #
#-------------------------------------------#

principal_amount = CaliforniaData['Principal Amount']
for ratio_name, fee_name in [
        ('AdvisorFeeRatio', 'Financial Advisor Fee'),
        ('CRFeeRatio', 'Rating Agency Fee'),
        ('InsureFeeRatio', 'Credit Enhancement Fee')]:
    CaliforniaData[ratio_name] = CaliforniaData[fee_name]/principal_amount

# Winsorize data
def winsorize_column(frame, column, lower_pct=1, upper_pct=99):
    """Clamp `frame[column]` in place to its [lower_pct, upper_pct] percentile range.

    The percentiles are computed over the non-missing values only, and missing
    values stay missing. Values above the upper percentile are set to it, values
    below the lower percentile are set to it.

    Args:
        frame: DataFrame holding the column; modified in place.
        column: name of a numeric column of `frame`.
        lower_pct: lower percentile (0-100) used as the floor.
        upper_pct: upper percentile (0-100) used as the ceiling.

    Returns:
        None.
    """
    observed = frame[column].dropna()
    # Nothing to clamp when the fee is never reported; also avoids calling
    # np.percentile on an empty array.
    if observed.empty:
        return
    lower_limit, upper_limit = np.percentile(observed, [lower_pct, upper_pct])
    frame[column] = frame[column].clip(lower=lower_limit, upper=upper_limit)

for fee_ratio in ('AdvisorFeeRatio', 'CRFeeRatio', 'InsureFeeRatio'):
    winsorize_column(CaliforniaData, fee_ratio)

# Parse both date columns, then express maturity as the difference between the
# calendar year of final maturity and the calendar year of sale
for date_column in ('Sale Date', 'Final Maturity Date'):
    CaliforniaData[date_column] = pd.to_datetime(CaliforniaData[date_column])
CaliforniaData['year'] = CaliforniaData['Sale Date'].dt.year
CaliforniaData['year_maturity'] = CaliforniaData['Final Maturity Date'].dt.year
CaliforniaData['maturity_in_years'] = CaliforniaData['year_maturity'].sub(CaliforniaData['year'])

#---------------------------------------------------------------------------------------------------------------------------#
# Whether the issuer has worked with the CRA/financial advisor/insurer in prior years (to capture frequent issuer discount) #
#---------------------------------------------------------------------------------------------------------------------------#

def _distinct_named(values):
    """Return the distinct entries of a column, without None / NaN placeholders."""
    return [item for item in values.unique()
            if item!=None and str(item)!='nan' and str(item)!='None']

def _carries_rating(rating):
    """True when a rating cell is neither 'NOT RATED' nor missing."""
    return rating!='NOT RATED' and rating!=None and str(rating)!='nan'

CaliforniaData_gb = CaliforniaData.groupby('Issuer')

for prior_flag in ('GPF_if_prior_insurer', 'GPF_if_prior_advisor', 'GPF_if_prior_rater'):
    CaliforniaData[prior_flag] = None

for idx,row in CaliforniaData.iterrows():

    # Issues by the same issuer sold in strictly earlier calendar years
    earlier_issues = CaliforniaData_gb.get_group(row['Issuer'])
    earlier_issues = earlier_issues[earlier_issues['year']<row['year']]

    # Same guarantor / financial advisor as on one of the earlier issues
    CaliforniaData.at[idx,'GPF_if_prior_insurer'] = \
        row['Guarantor'] in _distinct_named(earlier_issues['Guarantor'])
    CaliforniaData.at[idx,'GPF_if_prior_advisor'] = \
        row['Financial Advisor'] in _distinct_named(earlier_issues['Financial Advisor'])

    # An agency counts as a prior rater when it rated some earlier issue AND rates this one
    rated_by_same_agency = False
    for agency_column in ('S and P Rating', 'Moody Rating', 'Fitch Rating'):
        earlier_ratings = [item for item in _distinct_named(earlier_issues[agency_column])
                           if item!='NOT RATED']
        if len(earlier_ratings)>0 and _carries_rating(row[agency_column]):
            rated_by_same_agency = True
    CaliforniaData.at[idx,'GPF_if_prior_rater'] = rated_by_same_agency



#----------------------------------------------------#
# Recreate other variables to be consistent with GPF #
#----------------------------------------------------#

# Each rule is (new GPF column, source column, {source value: GPF code}).
# Source values that appear in no table keep the initial None.
gpf_recode_rules = [
    ('GPF_security_type', 'Debt Type', {
        'Tax and Revenue Anticipation Note': 'GO',
        'General Obligation Bond': 'GO',
        'Special Assessment Bond': 'GO',
        'Certificate of Participation/Leases': 'RV',
        'Limited Tax Obligation Bond (Special Tax Bonds)': 'GO',
        'Conduit Revenue Bond': 'RV',
        'Public Enterprise Revenue Bond': 'RV',
        'Tax Allocation Bond': 'GO',
        'Revenue Bond': 'RV',
        'Public Lease Revenue Bond': 'RV',
        'Marks-Roos Loan': 'GO',
        'Commercial Paper': 'GO',
        'Conduit Revenue Note or Loan (Private Obligor)': 'RV',
        'Bond Anticipation Note': 'GO',
        'Capital Lease': 'RV',
        'Pension Obligation Bonds': 'GO',
        'Other Note': 'GO',
        'Sales Tax Revenue Bond': 'GO',
        'Tax Allocation Note': 'GO',
        'Revenue Anticipation Note': 'RV',
        'Other Bond': 'GO',
        'State Agency Loan': 'GO',
        'Loan from bank/other institution': 'GO',
        'Tax Anticipation Note': 'GO',
        'Grant Anticipation Note': 'GO',
        'Other Debt': 'GO',
        'Promissory Note': 'GO',
        'General Obligation Note': 'GO',
        'Revenue Anticipation Warrant': 'RV',
    }),
    ('GPF_taxable_code', 'Federally Taxable', {
        'Federal Tax Flag: E': 'E',
        'Federal Tax Flag: T': 'T',
        'Subject to Alternative Minimum Tax': 'A',
        'Federal Tax Flag: ET': 'E',
        'Federal Tax Flag: TE': 'T',
    }),
    ('GPF_Bid', 'Sale Type (Comp/Neg)', {
        'Neg': 'N',
        'Comp': 'C',
    }),
]

for gpf_column, source_column, code_by_value in gpf_recode_rules:
    CaliforniaData[gpf_column] = None
    for source_value, gpf_code in code_by_value.items():
        CaliforniaData.loc[CaliforniaData[source_column]==source_value, gpf_column] = gpf_code

# Adjust inflation
# FPCPITOTLZGUSA holds one annual inflation rate (in percent) per DATE.
FPCPITOTLZGUSA = pd.read_csv("../RawData/StLouisFed/FPCPITOTLZGUSA.csv")
FPCPITOTLZGUSA['year'] = FPCPITOTLZGUSA['DATE'].str[:4].astype(int)
# Most recent year first, so the cumulative product below runs backwards in time
FPCPITOTLZGUSA = FPCPITOTLZGUSA.sort_values('year',ascending=False).reset_index(drop=True)

# scaler(latest year) = 1; scaler(year t) = product of (1 + inflation/100) over all
# later years. Computed as a float cumulative product shifted down by one row, so
# the column is numeric rather than object dtype. skipna=False lets a missing rate
# propagate to all earlier years instead of being skipped.
annual_growth = FPCPITOTLZGUSA['FPCPITOTLZGUSA']/100+1
FPCPITOTLZGUSA['scaler'] = annual_growth.cumprod(skipna=False).shift(1,fill_value=1.0)
FPCPITOTLZGUSA = FPCPITOTLZGUSA[['scaler','year']]

# Inner join (pandas default): issues whose sale year has no row in the inflation
# table are dropped here.
CaliforniaData = CaliforniaData.merge(FPCPITOTLZGUSA,on=['year'])
CaliforniaData['GPF_amount_inf_adjusted'] = CaliforniaData['Principal Amount']*CaliforniaData['scaler']

def _in_half_open_range(values, lower, upper):
    """Boolean mask of lower < values <= upper; None leaves that side unbounded."""
    if lower is None:
        return values<=upper
    if upper is None:
        return values>lower
    return (values>lower)&(values<=upper)

# (exclusive lower bound, inclusive upper bound, label) on the inflation-adjusted amount
amount_brackets = [
    (None, 1*1000000, 'Less than 1M'),
    (1*1000000, 5*1000000, '1M to 5M'),
    (5*1000000, 10*1000000, '5M to 10M'),
    (10*1000000, 50*1000000, '10M to 50M'),
    (50*1000000, 100*1000000, '50M to 100M'),
    (100*1000000, None, 'Greater than 100M'),
]
CaliforniaData['GPF_amount_bracket'] = None
for lower_bound, upper_bound, bracket_label in amount_brackets:
    CaliforniaData.loc[
        _in_half_open_range(CaliforniaData['GPF_amount_inf_adjusted'], lower_bound, upper_bound),
        'GPF_amount_bracket'] = bracket_label

# (exclusive lower bound, inclusive upper bound, label) on maturity in years.
# The spelling of the first label is kept exactly as is because it is a data value.
maturity_brackets = [
    (None, 2, 'Less then 2y'),
    (2, 5, '2y to 5y'),
    (5, 10, '5y to 10y'),
    (10, 20, '10y to 20y'),
    (20, 30, '20y to 30y'),
    (30, 40, '30y to 40y'),
    (40, None, 'Greater than 40y'),
]
CaliforniaData['GPF_maturity_bracket'] = None
for lower_bound, upper_bound, bracket_label in maturity_brackets:
    CaliforniaData.loc[
        _in_half_open_range(CaliforniaData['maturity_in_years'], lower_bound, upper_bound),
        'GPF_maturity_bracket'] = bracket_label

# An issue is a refunding when a positive refunding amount is reported
CaliforniaData['GPF_if_refunding'] = CaliforniaData['Refunding Amount'].gt(0)

#-----------------------#
# Keep useful variables #
#-----------------------#

california_columns_kept = [
    'Issuer', 'County', 'State', 'year',
    'AdvisorFeeRatio', 'CRFeeRatio', 'InsureFeeRatio',
    'GPF_if_prior_insurer', 'GPF_if_prior_advisor', 'GPF_if_prior_rater',
    'GPF_security_type', 'GPF_taxable_code', 'GPF_Bid',
    'GPF_amount_inf_adjusted', 'GPF_amount_bracket', 'GPF_maturity_bracket',
    'GPF_if_refunding',
]
CaliforniaData = CaliforniaData[california_columns_kept]

# 2. Texas

In [3]:
TexasData = pd.read_excel('../RawData/Texas/Local Fees with Maturity FY03-23 (Renping) (1.25.24).xlsb')

#############################
# Pin down county of issuer #
#############################

# Note that some issuers might not belong to any county

# The code below is adapted from the code for pre-processing Mergent data

# Notes:
# (1) Some counties, including those that issue municipal bonds, are not part of any CBSA. See Harlan County as in
# https://en.wikipedia.org/wiki/Kentucky_statistical_areas.
# (2) To avoid issues from counties with "&" in between being identified as issues from multiple counties, and also to
# facilitate parsing of cities, I pre-process both county/city data and issuer data by (1) adding "_"s to any county/city
# names with more than one word, and (2) replacing phrases of names in issuer data with versions that have the different
# words connected with "_" in between.

###################
# List of issuers #
###################

# One row per government name with its number of rows in TexasData. The count Series is
# named explicitly before reset_index, because its default name differs across pandas
# versions (unnamed -> column 0 in 1.x, 'count' in 2.x) and a rename keyed on 0 would
# silently do nothing on 2.x.
issuers = TexasData.value_counts(['GovernmentName']).rename('n_issues').reset_index()
# Create a unique ID for each issuer
issuers['issuer_id'] = range(0,len(issuers))
# Attach issuer_id (and n_issues) to every fee row before the names are upper-cased below
TexasData = TexasData.merge(issuers,on=['GovernmentName'])
issuers['GovernmentName'] = issuers['GovernmentName'].str.upper()
issuers['State'] = 'TX'

%run -i SCRIPT_us_states.py

###################
# Import counties #
###################

# Complete list of counties, including those not part of CSA
all_counties = pd.read_csv(
    "../RawData/MSA/fips-by-state.csv",sep=',',encoding="ISO-8859-1",low_memory=False
).rename(columns={'name':'County','state':'State'})

# Normalise county names: upper case, no " COUNTY" suffix, "&" for "AND",
# words joined by "_", periods removed
county_label = all_counties['County'].str.upper()
for county_old, county_new in [(' COUNTY',''), (' AND ',' & '), (' ','_')]:
    county_label = county_label.str.replace(county_old,county_new)
all_counties['County'] = county_label.str.replace('.','',regex=False)

# Keep only Texas
all_counties = all_counties[all_counties['State']=='TX']

#######################################
# Import cities and city equivalences #
#######################################

# (https://github.com/grammakov/USA-cities-and-states/blob/master/us_cities_states_counties.csv)
all_cities = pd.read_csv("../RawData/MSA/us_cities_states_counties.csv",sep='|',encoding="ISO-8859-1",low_memory=False)

# Stack the alias names under the primary names so that both are available as "City"
all_cities_alias = all_cities.drop(columns=['City']).rename(columns={'City alias':'City'})
all_cities = pd.concat([all_cities.drop(columns=['City alias']),all_cities_alias]).drop_duplicates()
all_cities['City'] = all_cities['City'].str.upper()

# For both name columns: "&" for "AND" (facilitates matching in scenarios like
# "Lewis and Clark County"), words joined by "_", periods removed
for place_column in ('County', 'City'):
    place_label = all_cities[place_column].str.replace(' AND ',' & ').str.replace(' ','_')
    all_cities[place_column] = place_label.str.replace('.','',regex=False)

all_cities = all_cities.rename(columns={'State short':'State'})

# Keep only Texas
all_cities = all_cities[all_cities['State']=='TX']

##########################
# Import school district #
##########################

all_schooldistrcts = pd.read_csv("../RawData/MSA/school-districts_lea_directory.csv",low_memory=False)
all_schooldistrcts = all_schooldistrcts[['lea_name','state_mailing','county_name']]
all_schooldistrcts = all_schooldistrcts.drop_duplicates()
all_schooldistrcts = all_schooldistrcts.rename(
    columns={'lea_name':'SchoolDistrict','state_mailing':'State','county_name':'County'})

# County names: upper case, no " COUNTY" suffix, "&" for "AND", words joined by "_", periods removed
county_label = all_schooldistrcts['County'].str.upper()
for county_old, county_new in [(' COUNTY',''), (' AND ',' & '), (' ','_')]:
    county_label = county_label.str.replace(county_old,county_new)
all_schooldistrcts['County'] = county_label.str.replace('.','',regex=False)

# School district names: upper case, "&" for "AND", periods removed
district_label = all_schooldistrcts['SchoolDistrict'].str.upper().str.replace(' AND ',' & ')
all_schooldistrcts['SchoolDistrict'] = district_label.str.replace('.','',regex=False)

all_schooldistrcts = all_schooldistrcts[~pd.isnull(all_schooldistrcts['SchoolDistrict'])]
# Drop districts whose county is one of the placeholder codes '-1' / '-2'. The result
# has to be assigned back: a bare filter expression leaves the frame unchanged, and the
# placeholder would then be written into "County" when matching issuers to districts.
all_schooldistrcts = all_schooldistrcts[~all_schooldistrcts['County'].isin(['-1','-2'])].copy()

# A version of school district name that has suffixes removed, which will be used to merge with issuer data.
# The tokens are removed one after another in exactly this order; the same sequence is applied to the
# issuer names in Round 2, so the two lists must be changed together.
school_suffixes = (
    ' INC', ' CORP', ' SCHOOL DISTRICT', ' DISTRICT', ' LOCAL SCHOOL DIST', ' SCHOOL DIST', ' SCHOOL DIS',
    ' SCH DIST', ' SCH DIS', ' ELEMENTARY', ' ELEM', ' ISD', ' ISDA', ' DISTRIC', ' LOCAL SD', ' SD', ' CSD',
    ' SCHS', ' AREA SCHOOLS', ' MUNICIPAL SCHOOLS', ' PUBLIC SCHOOLS', ' SCHOOLS', ' TOWNSHIP',
    ' CHARTER SCHOOL', ' HIGH SCHOOL', ' PREPARATORY SCHOOL', ' ELEMENTARY SCHOOL', ' SCHOOL',
)
district_nosuffix = all_schooldistrcts['SchoolDistrict'].str.replace('-',' ')
for school_suffix in school_suffixes:
    district_nosuffix = district_nosuffix.str.replace(school_suffix,'')
all_schooldistrcts['SchoolDistrict_NoSubfix'] = district_nosuffix

# Keep only Texas
all_schooldistrcts = all_schooldistrcts[all_schooldistrcts['State']=='TX']

##########################################
# Round 1: Match by county or city names #
##########################################

def proc_list(issuers):
    """Assign a county (and, where matched through a city, a city) to each issuer by name.

    Relies on the notebook-level tables `all_counties` (columns 'County', 'State') and
    `all_cities` (columns 'City', 'County', 'State'), whose multi-word names are joined
    with "_".

    Args:
        issuers: DataFrame with at least the columns 'GovernmentName' (upper case) and
            'State'. It is not modified; a copy is processed.

    Returns:
        A copy of `issuers` in which multi-word county/city names inside 'GovernmentName'
        are joined with "_", plus two new columns, 'County' and 'City', that are None
        where no match was found.
    """

    issuers = issuers.copy()

    def _join_multiword_names(places, column):
        # Replace every multi-word name of `places[column]` found in an issuer name of the
        # same state by its "_"-joined version.
        for _, place in places[places[column].str.contains('_')].iterrows():
            same_state = issuers['State']==place['State']
            issuers.loc[same_state,'GovernmentName'] = \
                issuers.loc[same_state,'GovernmentName'].str.replace(
                place[column].replace('_',' '),place[column],regex=False)

    def _strip_location_prefixes(name):
        # Drop leading compass/location words one after another, in this fixed order. Each
        # hit also removes the single character that follows the prefix.
        for prefix in ('NORTHEAST','SOUTHEAST','NORTHWEST','SOUTHWEST',
                       'EAST','WEST','NORTH','SOUTH','CENTRAL'):
            if name[:len(prefix)]==prefix:
                name = name[len(prefix)+1:]
        return name

    # Pre-process issuer data and replace phrases of names in issuer data with versions that have different words connected
    # with "_" in between. This facilitates parsing and matching. Note that in this step I add "_" to distinguish blocks in
    # issuer names only if the specific issuer is in the same state as the entity for which the name has multiple words. This
    # addresses cases like "Thief River Falls" and "River Falls" which are in different states
    _join_multiword_names(all_counties,'County')
    _join_multiword_names(all_cities,'City')

    # Initialize fields
    issuers['County'] = None
    issuers['City'] = None

    for idx,row in issuers.iterrows():

        issuer_long_name = row['GovernmentName']
        # First word of the name, with and without leading location words
        first_word = issuer_long_name.split(' ')[0]
        first_word_noloc = _strip_location_prefixes(issuer_long_name).split(' ')[0]

        #############################################################
        # Clean case: the whole issuer name is exactly a county name #
        #############################################################

        all_counties_frag = all_counties[
            (all_counties['County']==issuer_long_name)&(all_counties['State']==row['State'])].reset_index()
        if len(all_counties_frag)==1:
            issuers.at[idx,'County'] = issuer_long_name

        ##########################################################
        # Otherwise match the first word of the name to a county #
        ##########################################################

        # Only a unique county match is accepted
        if issuers.at[idx,'County'] is None:
            all_counties_frag = all_counties[
                ((all_counties['County']==first_word)|(all_counties['County']==first_word_noloc))
                &(all_counties['State']==row['State'])].reset_index()
            if len(all_counties_frag)==1:
                issuers.at[idx,'County'] = all_counties_frag['County'][0]

        ########################################################
        # Otherwise match the first word of the name to a city #
        ########################################################

        if issuers.at[idx,'County'] is None:
            all_cities_frag = all_cities[
                ((all_cities['City']==first_word)|(all_cities['City']==first_word_noloc))
                &(all_cities['State']==row['State'])].reset_index()
            # Note that there could be duplicate entries in "all_cities"; the first one is used
            if len(all_cities_frag)>=1:
                # Take city and county from the same matched row, so that the recorded city
                # is the one actually matched (it may be the prefix-stripped word)
                issuers.at[idx,'City'] = all_cities_frag['City'][0]
                issuers.at[idx,'County'] = all_cities_frag['County'][0]

    return issuers

# Run proc_list over ten partitions of the issuer list with the process-based scheduler.
# The output column layout handed to dask is taken from a run on the first ten issuers.
issuers_dd = dd.from_pandas(issuers, npartitions=10)
meta_columns = list(proc_list(issuers[:10]).columns)
issuers_meta = pd.DataFrame(columns=meta_columns)
with dask.config.set(scheduler='processes',num_workers=10):
    issuers = issuers_dd.map_partitions(proc_list,meta=issuers_meta).compute()



##############################################
# Round 2: Match using school district names #
##############################################

# Sometimes school district name can be not related to county/city name. E.g., Calallen Independent School District is a school 
# district located in northwest Corpus Christi, Texas in northern Nueces County.

for idx,row in issuers.iterrows():

    if str(row['County'])=='None':

        GovernmentName = row['GovernmentName']
        GovernmentName = GovernmentName.replace('-',' ')
        GovernmentName = GovernmentName.replace(' INC','')
        GovernmentName = GovernmentName.replace(' CORP','')
        GovernmentName = GovernmentName.replace(' SCHOOL DISTRICT','')
        GovernmentName = GovernmentName.replace(' DISTRICT','')
        GovernmentName = GovernmentName.replace(' LOCAL SCHOOL DIST','')
        GovernmentName = GovernmentName.replace(' SCHOOL DIST','')
        GovernmentName = GovernmentName.replace(' SCHOOL DIS','')
        GovernmentName = GovernmentName.replace(' SCH DIST','')
        GovernmentName = GovernmentName.replace(' SCH DIS','')
        GovernmentName = GovernmentName.replace(' ELEMENTARY','')
        GovernmentName = GovernmentName.replace(' ELEM','')
        GovernmentName = GovernmentName.replace(' ISD','')
        GovernmentName = GovernmentName.replace(' ISDA','')
        GovernmentName = GovernmentName.replace(' DISTRIC','')
        GovernmentName = GovernmentName.replace(' LOCAL SD','')
        GovernmentName = GovernmentName.replace(' SD','')
        GovernmentName = GovernmentName.replace(' CSD','')
        GovernmentName = GovernmentName.replace(' SCHS','')
        GovernmentName = GovernmentName.replace(' AREA SCHOOLS','')
        GovernmentName = GovernmentName.replace(' MUNICIPAL SCHOOLS','')
        GovernmentName = GovernmentName.replace(' PUBLIC SCHOOLS','')
        GovernmentName = GovernmentName.replace(' SCHOOLS','')
        GovernmentName = GovernmentName.replace(' TOWNSHIP','')
        GovernmentName = GovernmentName.replace(' CHARTER SCHOOL','')
        GovernmentName = GovernmentName.replace(' HIGH SCHOOL','')
        GovernmentName = GovernmentName.replace(' PREPARATORY SCHOOL','')
        GovernmentName = GovernmentName.replace(' ELEMENTARY SCHOOL','')
        GovernmentName = GovernmentName.replace(' SCHOOL','')
        
        all_schooldistrcts_frag = all_schooldistrcts[(all_schooldistrcts['SchoolDistrict_NoSubfix']==GovernmentName)
            &(all_schooldistrcts['State']==row['State'])]\
            .reset_index()

        if len(all_schooldistrcts_frag)>=1:
            issuers.at[idx,'County'] = all_schooldistrcts_frag['County'][0]

# Attach the matched county to every fee record (default inner join on the issuer id)
issuer_counties = issuers[['issuer_id','County']]
TexasData = TexasData.merge(issuer_counties,on='issuer_id')

# Most governments can be matched to a county. The "County" field still uses "_" as a
# separator at this point, so swap it for a space
TexasData['County'] = TexasData['County'].str.replace('_',' ')

In [4]:
%%time

#----------------------------#
# Keep only fees of interest #
#----------------------------#

# Fee categories retained for the analysis
fees_of_interest = [
    'FinancialAdvisor',
    'Rating',
    'SpreadExpenses',
    'BondInsurance',
    'NetUnderwritersSpread',
    ]
TexasData = TexasData[TexasData['FeeName'].isin(fees_of_interest)].copy()

# "SpreadExpenses" records are relabelled so they fall under "NetUnderwritersSpread"
is_spread_expense = TexasData['FeeName']=='SpreadExpenses'
TexasData.loc[is_spread_expense,'FeeName'] = 'NetUnderwritersSpread'

#---------------------------------------------------------------------------------------------------------------------------#
# Whether the issuer has worked with the CRA/financial advisor/insurer in prior years (to capture frequent issuer discount) #
#---------------------------------------------------------------------------------------------------------------------------#

TexasData_gb = TexasData.groupby('IssuerName')

TexasData['GPF_if_prior_insurer'] = None
TexasData['GPF_if_prior_advisor'] = None
TexasData['GPF_if_prior_rater'] = None

# Fee type -> column that records whether the firm was already used by the issuer in an earlier fiscal year.
# Fee types absent from this table (e.g. the underwriter spread) leave the flags untouched.
prior_flag_columns = {
    'BondInsurance':'GPF_if_prior_insurer',
    'Rating':'GPF_if_prior_rater',
    'FinancialAdvisor':'GPF_if_prior_advisor',
    }

for idx,row in TexasData.iterrows():

    # All records of this issuer from strictly earlier fiscal years
    earlier_records = TexasData_gb.get_group(row['IssuerName'])
    earlier_records = earlier_records[earlier_records['FiscalYearIssuance']<row['FiscalYearIssuance']]

    flag_column = prior_flag_columns.get(row['FeeName'])
    if flag_column is None:
        continue

    # Firms previously hired for the same kind of fee, ignoring missing names
    same_fee_records = earlier_records[earlier_records['FeeName']==row['FeeName']]
    prior_firms = [
        firm for firm in same_fee_records['FirmName'].unique()
        if firm!=None and str(firm)!='nan' and str(firm)!='None']

    # Edit for all issues of the issuer in the same year (a later row of the same
    # issuer-year overwrites the value written by an earlier row)
    same_issuer_year = (TexasData['IssuerName']==row['IssuerName'])\
        &(TexasData['FiscalYearIssuance']==row['FiscalYearIssuance'])
    TexasData.loc[same_issuer_year,flag_column] = row['FirmName'] in prior_firms

#-----------------------------#
# Convert data to issue level #
#-----------------------------#

# Columns that jointly identify one issue, and one fee type within an issue
issue_keys = ['GovernmentName','GovernmentType','IssuerName','IssuanceName','FiscalYearIssuance','ClosingDate']
issue_fee_keys = issue_keys+['FeeName']

# Aggregate within an issue: total fee paid per (issue, fee type).
# The aggregation is named by string: passing the builtin `sum` raises a FutureWarning on
# recent pandas, and this notebook turns warnings into errors.
TexasData_sum = TexasData.groupby(issue_fee_keys).agg({'ActualFee':'sum'})
TexasData_sum = TexasData_sum.reset_index()

# Keep one record per (issue, fee type) and attach the summed fee to it
TexasData = TexasData.drop(columns=['ActualFee'])
TexasData = TexasData.drop_duplicates(subset=issue_fee_keys)
TexasData = TexasData.merge(TexasData_sum,on=issue_fee_keys)

# Pivot table to wide: one row per issue, one column per fee type
TexasData_wide = TexasData.pivot(index=issue_keys,columns='FeeName',values='ActualFee')
TexasData_wide = TexasData_wide.reset_index()

# Drop the fee-level fields, keep one record of issue-level attributes per issue,
# and join those attributes onto the wide fee table
TexasData = TexasData.drop(columns=['FeeName','FirmName','FeeType','IssuanceFeeComments','ActualFee'])
TexasData = TexasData.drop_duplicates(subset=issue_keys)
TexasData = TexasData_wide.merge(TexasData,on=issue_keys)

# Rename to have consistent variable names with California
texas_to_california_names = {
    'IssuerName':'Issuer',
    'BondInsurance':'Credit Enhancement Fee',
    'FinancialAdvisor':'Financial Advisor Fee',
    'Rating':'Rating Agency Fee',
    'ActualPar':'Principal Amount',
    'FiscalYearIssuance':'year',
    'YearsToMaturity':'maturity_in_years'}
TexasData = TexasData.rename(columns=texas_to_california_names)
TexasData['State'] = 'TX'

#------------------------#
# Generate fee variables #
#------------------------#

# (ratio column, fee column): each fee is expressed as a share of the principal amount
fee_ratio_sources = [
    ('AdvisorFeeRatio','Financial Advisor Fee'),
    ('CRFeeRatio','Rating Agency Fee'),
    ('InsureFeeRatio','Credit Enhancement Fee'),
    ]
for ratio_column,fee_column in fee_ratio_sources:
    TexasData[ratio_column] = TexasData[fee_column]/TexasData['Principal Amount']

# Winsorize data: cap each ratio at its 1st and 99th percentile, computed over non-missing values.
# Missing ratios compare False against both limits, so they are left untouched.
for ratio_column,fee_column in fee_ratio_sources:
    observed_ratios = TexasData[ratio_column][TexasData[ratio_column].notna()]
    upper_limit = np.percentile(observed_ratios,99)
    lower_limit = np.percentile(observed_ratios,1)
    TexasData.loc[TexasData[ratio_column]>upper_limit,ratio_column] = upper_limit
    TexasData.loc[TexasData[ratio_column]<lower_limit,ratio_column] = lower_limit

#----------------------------------------------#
# Recreate variables to be consistent with GPF #
#----------------------------------------------#

# The prior-relationship flag is missing when an issue did not pay the corresponding fee;
# such gaps are filled with True
for flag_column in ['GPF_if_prior_insurer','GPF_if_prior_advisor','GPF_if_prior_rater']:
    TexasData.loc[pd.isnull(TexasData[flag_column]),flag_column] = True

# Security type code: left as None for any pledge type other than the two listed
TexasData['GPF_security_type'] = None
for pledge_type,security_code in [('GO','GO'),('REV','RV')]:
    TexasData.loc[TexasData['PledgeType']==pledge_type,'GPF_security_type'] = security_code

# Every Texas issue is assigned taxable code "E"
TexasData['GPF_taxable_code'] = 'E'

# Method of sale code: left as None for any sale type other than the two listed
TexasData['GPF_Bid'] = None
for sale_type,bid_code in [('Negotiated','N'),('Competitive','C')]:
    TexasData.loc[TexasData['SaleType']==sale_type,'GPF_Bid'] = bid_code

# Adjust inflation
FPCPITOTLZGUSA = pd.read_csv("../RawData/StLouisFed/FPCPITOTLZGUSA.csv")
FPCPITOTLZGUSA = FPCPITOTLZGUSA\
    .assign(year=FPCPITOTLZGUSA['DATE'].str[:4].astype(int))\
    .sort_values('year',ascending=False)\
    .reset_index(drop=True)

# Rows run from the latest year backwards. The latest year gets a scaler of 1; every earlier
# year compounds the percentage rate recorded on the row just above it (the following year).
inflation_rates = FPCPITOTLZGUSA['FPCPITOTLZGUSA']
scaler = 1
scalers = []
for position in range(len(inflation_rates)):
    if position>0:
        scaler = scaler*(inflation_rates.iloc[position-1]/100+1)
    scalers.append(scaler)
# Stored with object dtype, as produced by filling a None-initialised column cell by cell
FPCPITOTLZGUSA['scaler'] = pd.Series(scalers,index=FPCPITOTLZGUSA.index,dtype=object)
FPCPITOTLZGUSA = FPCPITOTLZGUSA[['scaler','year']]

# Scale the principal amount by the inflation scaler of the issuance year
TexasData = TexasData.merge(FPCPITOTLZGUSA,on=['year'])
TexasData['GPF_amount_inf_adjusted'] = TexasData['Principal Amount']*TexasData['scaler']

# (exclusive lower bound, inclusive upper bound, label); bounds are in millions, None means unbounded
amount_brackets = [
    (None,1,'Less than 1M'),
    (1,5,'1M to 5M'),
    (5,10,'5M to 10M'),
    (10,50,'10M to 50M'),
    (50,100,'50M to 100M'),
    (100,None,'Greater than 100M'),
    ]

TexasData['GPF_amount_bracket'] = None
adjusted_amount = TexasData['GPF_amount_inf_adjusted']
for lower_bound,upper_bound,label in amount_brackets:
    if lower_bound is None:
        in_bracket = adjusted_amount<=upper_bound*1000000
    elif upper_bound is None:
        in_bracket = adjusted_amount>lower_bound*1000000
    else:
        in_bracket = (adjusted_amount>lower_bound*1000000)&(adjusted_amount<=upper_bound*1000000)
    TexasData.loc[in_bracket,'GPF_amount_bracket'] = label

# (exclusive lower bound, inclusive upper bound, label); bounds are in years, None means unbounded.
# The labels are reused verbatim as regression column names further down, so they are kept as they are.
maturity_brackets = [
    (None,2,'Less then 2y'),
    (2,5,'2y to 5y'),
    (5,10,'5y to 10y'),
    (10,20,'10y to 20y'),
    (20,30,'20y to 30y'),
    (30,40,'30y to 40y'),
    (40,None,'Greater than 40y'),
    ]

TexasData['GPF_maturity_bracket'] = None
for lower_bound,upper_bound,label in maturity_brackets:
    if lower_bound is None:
        in_bracket = TexasData['maturity_in_years']<=upper_bound
    elif upper_bound is None:
        in_bracket = TexasData['maturity_in_years']>lower_bound
    else:
        in_bracket = (TexasData['maturity_in_years']>lower_bound)&(TexasData['maturity_in_years']<=upper_bound)
    TexasData.loc[in_bracket,'GPF_maturity_bracket'] = label

# An issue counts as refunding when its refunding par is positive (missing values give False)
TexasData['GPF_if_refunding'] = TexasData['RefundingPar'].gt(0)

#-----------------------#
# Keep useful variables #
#-----------------------#

columns_to_keep = [
    'Issuer','County','State','year',
    'AdvisorFeeRatio', 'CRFeeRatio','InsureFeeRatio',
    'GPF_if_prior_insurer', 'GPF_if_prior_advisor', 'GPF_if_prior_rater',
    'GPF_security_type', 'GPF_taxable_code', 'GPF_Bid',
    'GPF_amount_inf_adjusted', 'GPF_amount_bracket', 'GPF_maturity_bracket',
    'GPF_if_refunding']
TexasData = TexasData[columns_to_keep]

CPU times: user 7min 43s, sys: 15.8 s, total: 7min 59s
Wall time: 8min 5s


# 3. Combine California and Texas, and predict costs

In [5]:
#------------------------------#
# Combine California and Texas #
#------------------------------#

CostSample = pd.concat([TexasData,CaliforniaData])

######################################
# Merge in county level demographics #
######################################

County_Composite = pd.read_csv("../CleanData/Demographics/0C_County_Composite.csv")

# Get national average county-level income and population (mean across counties, by year)
County_Composite_yearlyavg = County_Composite\
    .groupby('year')\
    .agg({'pop':'mean','inc':'mean'})\
    .rename(columns={'pop':'pop_yearlyavg','inc':'inc_yearlyavg'})
County_Composite = County_Composite.merge(County_Composite_yearlyavg,on=['year'])

# Express each county's population and income relative to that year's average
for measure in ['pop','inc']:
    County_Composite[measure+'_to_avg'] = County_Composite[measure]/County_Composite[measure+'_yearlyavg']

# Inner join: issues without a matching state/county/year record are dropped
county_ratios = County_Composite[['State','County','year','pop_to_avg','inc_to_avg']]
CostSample = CostSample.merge(county_ratios,on=['State','County','year'])

# Convert True or False into 0 and 1
for flag_column in ['GPF_if_prior_insurer','GPF_if_prior_advisor','GPF_if_prior_rater','GPF_if_refunding']:
    CostSample[flag_column] = CostSample[flag_column].astype(int)

CostSample.to_csv('../CleanData/California/0H_SumStats.csv')

In [6]:
##############
# Import GPF #
##############

GPF = pd.read_csv("../CleanData/SDC/0A_GPF.csv",low_memory=False)

# Indicator columns for security type, method of sale and tax status. These are the columns
# multiplied by the fitted dummy coefficients when fees are predicted for GPF.
GPF['is_security_type_GO'] = GPF['security_type']=='GO'
GPF['is_security_type_RV'] = GPF['security_type']=='RV'
GPF['is_Bid_C'] = GPF['Bid']=='C'
GPF['is_Bid_N'] = GPF['Bid']=='N'
GPF['is_taxable_code_A'] = GPF['taxable_code']=='A'
GPF['is_taxable_code_E'] = GPF['taxable_code']=='E'
GPF['is_taxable_code_T'] = GPF['taxable_code']=='T'

# Attach the inflation scaler of the sale year. The outer merge with an indicator keeps every
# GPF row (also those without a scaler) and lets scaler-only years be discarded afterwards.
GPF = GPF.merge(FPCPITOTLZGUSA.rename(columns={'year':'sale_year'}),on=['sale_year'],how='outer',indicator=True)
GPF = GPF[GPF['_merge']!='right_only']
GPF = GPF.drop(columns=['_merge'])
# NOTE(review): the factor of 1,000,000 suggests `amount` is recorded in millions - confirm
GPF['amount_inf_adjusted'] = GPF['amount']*GPF['scaler']*1000000

# Amount brackets: exclusive lower bound, inclusive upper bound.
# The first bracket includes exactly 1M, matching the "Less than 1M" bracket of the Texas cost
# sample (<=1M). With a strict "<", an issue of exactly 1M would belong to no bracket at all.
GPF['is_amount_Less_than_1M'] = GPF['amount_inf_adjusted']<=1*1000000
GPF['is_amount_1M_to_5M'] = (GPF['amount_inf_adjusted']>1*1000000)&(GPF['amount_inf_adjusted']<=5*1000000)
GPF['is_amount_5M_to_10M'] = (GPF['amount_inf_adjusted']>5*1000000)&(GPF['amount_inf_adjusted']<=10*1000000)
GPF['is_amount_10M_to_50M'] = (GPF['amount_inf_adjusted']>10*1000000)&(GPF['amount_inf_adjusted']<=50*1000000)
GPF['is_amount_50M_to_100M'] = (GPF['amount_inf_adjusted']>50*1000000)&(GPF['amount_inf_adjusted']<=100*1000000)
GPF['is_amount_Greater_than_100M'] = GPF['amount_inf_adjusted']>100*1000000

# Average maturity converted to whole years (avg_maturity is divided by 365, then rounded)
GPF['maturity_in_years'] = np.round(GPF['avg_maturity']/365)

# Maturity brackets: exclusive lower bound, inclusive upper bound.
# Because maturity is rounded to whole years, a value of exactly 2 is attainable. The first
# bracket therefore includes 2, matching the "Less then 2y" bracket of the Texas cost sample
# (<=2). With a strict "<", 2-year issues would belong to no bracket and be predicted as
# if they were in the omitted base category.
GPF['is_maturity_Less_than_2y'] = GPF['maturity_in_years']<=2
GPF['is_maturity_2y_to_5y'] = (GPF['maturity_in_years']>2)&(GPF['maturity_in_years']<=5)
GPF['is_maturity_5y_to_10y'] = (GPF['maturity_in_years']>5)&(GPF['maturity_in_years']<=10)
GPF['is_maturity_10y_to_20y'] = (GPF['maturity_in_years']>10)&(GPF['maturity_in_years']<=20)
GPF['is_maturity_20y_to_30y'] = (GPF['maturity_in_years']>20)&(GPF['maturity_in_years']<=30)
GPF['is_maturity_30y_to_40y'] = (GPF['maturity_in_years']>30)&(GPF['maturity_in_years']<=40)
GPF['is_maturity_Greater_than_40y'] = GPF['maturity_in_years']>40

# Convert True or False into 0 and 1
for flag_column in ['if_prior_insurer','if_prior_advisor','if_prior_rater']:
    GPF[flag_column] = GPF[flag_column].astype(int)

# Any "Refunding" value other than "N" is treated as a refunding issue
GPF['if_refunding'] = (GPF['Refunding']!='N').astype(int)

# Dummy variable for which year it is
sale_years = list(GPF['sale_year'].unique())
for year in sale_years:
    GPF['if_year_'+str(year)] = GPF['sale_year']==year
column_years = ['if_year_'+str(year) for year in sale_years]

# Merge in county income/population data. The outer merge with an indicator keeps GPF rows
# without a county match and lets county-only records be discarded afterwards.
county_ratios_by_sale_year = County_Composite[['State','County','year','pop_to_avg','inc_to_avg']]\
    .rename(columns={'year':'sale_year'})
GPF = GPF.merge(county_ratios_by_sale_year,on=['State','County','sale_year'],how='outer',indicator=True)
GPF = GPF[GPF['_merge']!='right_only'].drop(columns=['_merge'])


## 3.1 Model with no year FEs

In [7]:
################################################
# Advisors fee/Credit rating fee/Insurance fee #
################################################

fee_names = [
    ['Advisor','if_prior_advisor'],
    ['CR','if_prior_rater'],
    ['Insure','if_prior_insurer'],
    ]

# Categorical regressors that are expanded into dummies
bracket_columns = ['GPF_maturity_bracket','GPF_amount_bracket','GPF_Bid','GPF_security_type','GPF_taxable_code']

# (fitted dummy coefficient, matching indicator column in GPF), in the order the terms enter the prediction
dummy_terms = [
    ('20y to 30y','is_maturity_20y_to_30y'),
    ('2y to 5y','is_maturity_2y_to_5y'),
    ('30y to 40y','is_maturity_30y_to_40y'),
    ('5y to 10y','is_maturity_5y_to_10y'),
    ('Greater than 40y','is_maturity_Greater_than_40y'),
    ('Less then 2y','is_maturity_Less_than_2y'),
    ('1M to 5M','is_amount_1M_to_5M'),
    ('50M to 100M','is_amount_50M_to_100M'),
    ('5M to 10M','is_amount_5M_to_10M'),
    ('Greater than 100M','is_amount_Greater_than_100M'),
    ('Less than 1M','is_amount_Less_than_1M'),
    ('N','is_Bid_N'),
    ('RV','is_security_type_RV'),
    ('E','is_taxable_code_E'),
    ('T','is_taxable_code_T'),
    ]

for fee_name,if_prior_relationship in fee_names:

    ratio_column = fee_name+'FeeRatio'
    hat_column = fee_name+'FeeRatio_hat'

    # Run regression in the combined cost sample
    # To make sure that order of first category is fixed
    CostSample = CostSample.sort_values(bracket_columns)
    maturity_bracket = pd.get_dummies(CostSample['GPF_maturity_bracket'], drop_first=True)
    amount_bracket = pd.get_dummies(CostSample['GPF_amount_bracket'], drop_first=True)
    Bid = pd.get_dummies(CostSample['GPF_Bid'], drop_first=True)
    security_type = pd.get_dummies(CostSample['GPF_security_type'], drop_first=True)
    taxable_code = pd.get_dummies(CostSample['GPF_taxable_code'], drop_first=True)
    dummy_frames = [maturity_bracket,amount_bracket,Bid,security_type,taxable_code]

    # Keep issues where this fee ratio is observed, and rescale the ratio by 10,000
    CostSample_RegData = pd.concat([CostSample]+dummy_frames,axis=1)
    CostSample_RegData = CostSample_RegData.dropna(subset=[ratio_column])
    CostSample_RegData[ratio_column] = CostSample_RegData[ratio_column]*10000

    # Winsorize at the 1st/99th percentile (no missing ratios remain after the dropna above)
    upper_limit = np.percentile(CostSample_RegData[ratio_column],99)
    lower_limit = np.percentile(CostSample_RegData[ratio_column],1)
    CostSample_RegData.loc[CostSample_RegData[ratio_column]>upper_limit,ratio_column] = upper_limit
    CostSample_RegData.loc[CostSample_RegData[ratio_column]<lower_limit,ratio_column] = lower_limit

    # Only issues with a strictly positive fee enter the regression
    CostSample_RegData = CostSample_RegData[CostSample_RegData[ratio_column]>0.00001]

    regressors = ['pop_to_avg','inc_to_avg','GPF_'+if_prior_relationship,'GPF_if_refunding']
    for dummies in dummy_frames:
        regressors = regressors+list(dummies.columns)
    X = CostSample_RegData[regressors]
    y = CostSample_RegData[ratio_column]

    CostSample_RegData.to_csv("../CleanData/California/0H_CostSample_RegData_"+fee_name+"FeeRatio.csv")

    model = sm.OLS(y, sm.add_constant(X))
    result = model.fit()

    # Predict would-be costs: intercept plus coefficient times the corresponding GPF column
    linear_terms = [
        ('pop_to_avg','pop_to_avg'),
        ('inc_to_avg','inc_to_avg'),
        ('GPF_'+if_prior_relationship,if_prior_relationship),
        ('GPF_if_refunding','if_refunding'),
        ]
    prediction = result.params['const']
    for param_name,gpf_column in linear_terms+dummy_terms:
        prediction = prediction+result.params[param_name]*GPF[gpf_column]
    GPF[hat_column] = prediction

    # Negative predictions are floored at zero
    GPF.loc[GPF[hat_column]<0,hat_column] = 0
    GPF[hat_column] = GPF[hat_column].astype(float)

    # Winsorize predictions at the 1st/99th percentile of the non-missing values;
    # missing predictions compare False against both limits and stay missing
    available_predictions = GPF[hat_column][GPF[hat_column].notna()]
    upper_limit = np.percentile(available_predictions,99)
    lower_limit = np.percentile(available_predictions,1)
    GPF.loc[GPF[hat_column]>upper_limit,hat_column] = upper_limit
    GPF.loc[GPF[hat_column]<lower_limit,hat_column] = lower_limit


## 3.2 Model with year FEs

Notes:
- Prediction is done using data where each fee is non-zero.

In [8]:
# Dummy for years: one 0/1 column per year present in the cost sample
sample_years = list(CostSample['year'].unique())
for year in sample_years:
    CostSample['if_year_'+str(year)] = (CostSample['year']==year).astype(int)
column_years_CostSample = ['if_year_'+str(year) for year in sample_years]


In [9]:
################################################
# Advisors fee/Credit rating fee/Insurance fee #
################################################

# [fee name, name of the prior-relationship flag]; the flag is called 'GPF_'+name in the
# cost sample and just name in GPF
fee_names = [
    ['Advisor','if_prior_advisor'],
    ['CR','if_prior_rater'],
    ['Insure','if_prior_insurer'],
    ]

for item in fee_names:

    fee_name = item[0]
    if_prior_relationship = item[1]

    # Run regression in the combined cost sample
    # To make sure that order of first category is fixed
    CostSample = CostSample.sort_values(['GPF_maturity_bracket','GPF_amount_bracket','GPF_Bid','GPF_security_type','GPF_taxable_code'])
    maturity_bracket = pd.get_dummies(CostSample['GPF_maturity_bracket'], drop_first=True)
    amount_bracket = pd.get_dummies(CostSample['GPF_amount_bracket'], drop_first=True)
    Bid = pd.get_dummies(CostSample['GPF_Bid'], drop_first=True)
    security_type = pd.get_dummies(CostSample['GPF_security_type'], drop_first=True)
    taxable_code = pd.get_dummies(CostSample['GPF_taxable_code'], drop_first=True)
    
    # Keep issues where this fee ratio is observed, and rescale the ratio by 10,000
    CostSample_RegData = pd.concat([CostSample,maturity_bracket,amount_bracket,Bid,security_type,taxable_code],axis=1)
    CostSample_RegData = CostSample_RegData.dropna(subset=[fee_name+'FeeRatio'])
    CostSample_RegData[fee_name+'FeeRatio'] = CostSample_RegData[fee_name+'FeeRatio']*10000
    
    # Winsorize the ratio at its 1st/99th percentile
    upper_limit = np.percentile(CostSample_RegData[fee_name+'FeeRatio'][np.logical_not(np.isnan(CostSample_RegData[fee_name+'FeeRatio']))],99)
    lower_limit = np.percentile(CostSample_RegData[fee_name+'FeeRatio'][np.logical_not(np.isnan(CostSample_RegData[fee_name+'FeeRatio']))],1)
    CostSample_RegData.loc[
        (CostSample_RegData[fee_name+'FeeRatio']>upper_limit)&(np.logical_not(np.isnan(CostSample_RegData[fee_name+'FeeRatio']))),
        fee_name+'FeeRatio'] = upper_limit
    CostSample_RegData.loc[
        (CostSample_RegData[fee_name+'FeeRatio']<lower_limit)&(np.logical_not(np.isnan(CostSample_RegData[fee_name+'FeeRatio']))),
        fee_name+'FeeRatio'] = lower_limit
    
    # Only issues with a strictly positive fee enter the regression
    CostSample_RegData = CostSample_RegData[CostSample_RegData[fee_name+'FeeRatio']>0.00001]
    
    # Year 2000 is chosen as the left-out group
    column_years_CostSample_in_regression = [item for item in column_years_CostSample if item!='if_year_2000']
    # Further, throw out columns for which there is no observation for this type of fee
    years_in_sample = ['if_year_'+str(year) for year in list(CostSample_RegData['year'].unique())]
    column_years_CostSample_in_regression = set(column_years_CostSample_in_regression).intersection(set(years_in_sample))
    # Sorted so that the regressor order does not depend on set iteration order
    column_years_CostSample_in_regression = sorted(column_years_CostSample_in_regression)
    # "column_years_CostSample_in_regression" is a list of year dummies used in the regression
    
    X = CostSample_RegData[['pop_to_avg','inc_to_avg','GPF_'+if_prior_relationship,'GPF_if_refunding']
        +list(maturity_bracket.columns)
        +list(amount_bracket.columns)
        +list(Bid.columns)
        +list(security_type.columns)
        +list(taxable_code.columns)
        +column_years_CostSample_in_regression]
    y = CostSample_RegData[fee_name+'FeeRatio']
    
    # One file per fee. A fixed "Advisor" file name would make every fee overwrite the same
    # file, leaving it with the sample of whichever fee ran last.
    CostSample_RegData.to_csv("../CleanData/California/0H_CostSample_RegData_"+fee_name+"FeeRatio.csv")
    
    model = sm.OLS(y, sm.add_constant(X))
    result = model.fit()
    
    # Predict would-be costs: intercept plus coefficient times the corresponding GPF column
    GPF[fee_name+'FeeRatio_hat_model_timeFE'] = result.params['const']+\
        result.params['pop_to_avg']*GPF['pop_to_avg']+\
        result.params['inc_to_avg']*GPF['inc_to_avg']+\
        result.params['GPF_'+if_prior_relationship]*GPF[if_prior_relationship]+\
        result.params['GPF_if_refunding']*GPF['if_refunding']+\
        result.params['20y to 30y']*GPF['is_maturity_20y_to_30y']+\
        result.params['2y to 5y']*GPF['is_maturity_2y_to_5y']+\
        result.params['30y to 40y']*GPF['is_maturity_30y_to_40y']+\
        result.params['5y to 10y']*GPF['is_maturity_5y_to_10y']+\
        result.params['Greater than 40y']*GPF['is_maturity_Greater_than_40y']+\
        result.params['Less then 2y']*GPF['is_maturity_Less_than_2y']+\
        result.params['1M to 5M']*GPF['is_amount_1M_to_5M']+\
        result.params['50M to 100M']*GPF['is_amount_50M_to_100M']+\
        result.params['5M to 10M']*GPF['is_amount_5M_to_10M']+\
        result.params['Greater than 100M']*GPF['is_amount_Greater_than_100M']+\
        result.params['Less than 1M']*GPF['is_amount_Less_than_1M']+\
        result.params['N']*GPF['is_Bid_N']+\
        result.params['RV']*GPF['is_security_type_RV']+\
        result.params['E']*GPF['is_taxable_code_E']+\
        result.params['T']*GPF['is_taxable_code_T']
    
    # Add effects of years
    for column in column_years_CostSample_in_regression:
        GPF[fee_name+'FeeRatio_hat_model_timeFE'] = GPF[fee_name+'FeeRatio_hat_model_timeFE']+result.params[column]*GPF[column]
    
    # Negative predictions are floored at zero
    GPF.loc[GPF[fee_name+'FeeRatio_hat_model_timeFE']<0,fee_name+'FeeRatio_hat_model_timeFE'] = 0
    GPF[fee_name+'FeeRatio_hat_model_timeFE'] = GPF[fee_name+'FeeRatio_hat_model_timeFE'].astype(float)
    
    # Throw out predicted values for years out of training sample range
    GPF.loc[~GPF['sale_year'].isin(list(CostSample_RegData['year'].unique())),fee_name+'FeeRatio_hat_model_timeFE'] = None
    
    # Winsorize the remaining predictions at their 1st/99th percentile
    upper_limit = np.percentile(GPF[fee_name+'FeeRatio_hat_model_timeFE']\
        [np.logical_not(np.isnan(GPF[fee_name+'FeeRatio_hat_model_timeFE']))],99)
    lower_limit = np.percentile(GPF[fee_name+'FeeRatio_hat_model_timeFE']\
        [np.logical_not(np.isnan(GPF[fee_name+'FeeRatio_hat_model_timeFE']))],1)
    GPF.loc[(GPF[fee_name+'FeeRatio_hat_model_timeFE']>upper_limit)
        &(np.logical_not(np.isnan(GPF[fee_name+'FeeRatio_hat_model_timeFE']))),fee_name+'FeeRatio_hat_model_timeFE'] = \
        upper_limit
    GPF.loc[(GPF[fee_name+'FeeRatio_hat_model_timeFE']<lower_limit)
        &(np.logical_not(np.isnan(GPF[fee_name+'FeeRatio_hat_model_timeFE']))),fee_name+'FeeRatio_hat_model_timeFE'] = \
        lower_limit



# 4. Generate measure of total financing cost

Obtain a modified version of TIC, i.e. the rate that sets the discounted value of all future payments equal to (purchase price - the three costs above). Here the purchase price equals the reoffering price minus the gross spread.

The calculation is done using dollar values (in millions), to handle different bonds within an issue and their weights.

In [10]:
# The two fee-estimate variants handled by proc_list, as
# (suffix of the predicted fee-ratio columns, suffix of the output mod_tic columns)
_MOD_TIC_VARIANTS = (('', ''), ('_model_timeFE', '_timeFE'))
# Prefixes of the three predicted fee-ratio columns ('<prefix>FeeRatio_hat<suffix>')
_FEE_PREFIXES = ('Advisor', 'CR', 'Insure')


def _fee_is_available(value):
    """Return True unless `value` is None or prints as 'nan'/'None' (a missing fee estimate)."""
    return value is not None and str(value) not in ('nan', 'None')


def _solve_mod_tic(row, reoffering_total, tranche_cash_flows, fee_suffix, charged_fees):
    """Return the annualized modified TIC of one issue for one fee-estimate variant.

    Parameters
    ----------
    row : the issue's row; 'amount', 'gross_spread' and the three
        '<prefix>FeeRatio_hat<fee_suffix>' columns are read.
    reoffering_total : reoffering price of the whole issue, in the same unit as 'amount'.
    tranche_cash_flows : one list of semiannual cash flows per tranche, each starting
        with a period-0 entry of 0.
    fee_suffix : '' or '_model_timeFE', selecting which predicted fee ratios to use.
    charged_fees : dict keyed by fee prefix; a truthy value means the fee is deducted.
    """
    issue_amount = float(row['amount'])
    # Gross spread is represented in the unit of every $1000
    purchase_price = reoffering_total-row['gross_spread']/1000*issue_amount
    # Other fees are represented in the unit of basis points
    for prefix in _FEE_PREFIXES:
        if charged_fees[prefix]:
            purchase_price = purchase_price-row[prefix+'FeeRatio_hat'+fee_suffix]/10000*issue_amount

    # Add up the tranches period by period, padding shorter tranches with zeros
    horizon = max(len(flows) for flows in tranche_cash_flows)
    padded = [flows+[0]*(horizon-len(flows)) for flows in tranche_cash_flows]
    issue_cash_flow = [sum(period) for period in zip(*padded)]
    # The net proceeds are the period-0 outflow from the investor's perspective
    issue_cash_flow[0] = issue_cash_flow[0]-purchase_price

    # npf.irr returns a semiannual rate; compound it to an annual rate
    return (1+npf.irr(issue_cash_flow))**2-1


def _store_mod_tic(GPF, idx, row, mod_tic, out_suffix):
    """Write the modified TIC and its spreads over the benchmark yields into row `idx` of GPF."""
    GPF.loc[idx,'mod_tic'+out_suffix] = mod_tic
    if str(row['sync_treasury_bond_avg_yield'])!='nan':
        GPF.loc[idx,'mod_tic_spread_treasury'+out_suffix] = mod_tic-row['sync_treasury_bond_avg_yield']
    if str(row['MMA_avg_yield'])!='nan':
        GPF.loc[idx,'mod_tic_spread_MMA'+out_suffix] = mod_tic-row['MMA_avg_yield']


def proc_list(GPF):
    """Compute the modified TIC of every bond issue in GPF.

    For each issue with complete tranche-level data, the modified TIC is the
    annualized internal rate of return that equates the semiannual debt-service
    payments with (reoffering price - gross spread - applicable predicted fees).
    Two variants are computed, one from the '*FeeRatio_hat' predictions and one
    from the '*FeeRatio_hat_model_timeFE' predictions.

    Parameters
    ----------
    GPF : pandas.DataFrame with one row per issue. Multi-tranche issues store
        per-tranche values as newline-separated strings.

    Returns
    -------
    A copy of GPF with six added columns: 'mod_tic', 'mod_tic_spread_treasury',
    'mod_tic_spread_MMA' and the same three with the '_timeFE' suffix. Cells are
    left as None when the value could not be computed. The input is not modified.

    Notes
    -----
    A price_or_yield value below 20 is treated as a yield in percent, a value
    strictly between 80 and 120 as a price per $100 par; anything else makes the
    issue undetermined and it is skipped.
    A missing insured amount is treated as "not insured". Previously the NaN
    ratio was truthy and the insurance premium was deducted for such issues.
    """

    GPF = GPF.copy()
    for _, out_suffix in _MOD_TIC_VARIANTS:
        for column in ('mod_tic','mod_tic_spread_treasury','mod_tic_spread_MMA'):
            GPF[column+out_suffix] = None

    # Handle case by case of each variable being missing, and within each case allow for multiple maturities
    for idx,row in GPF.iterrows():

        # Do nothing if the issue contains bonds with irregular type of coupon payments
        if row['IF_irregular_coupon_type']==True:
            continue

        # Do nothing if dated date is missing
        if str(row['dated_date']) in ('nan','NaT'):
            continue

        # Fee-estimate variants for which all three predicted fee ratios are available
        ready_variants = [
            (fee_suffix,out_suffix) for fee_suffix,out_suffix in _MOD_TIC_VARIANTS
            if all([_fee_is_available(row[prefix+'FeeRatio_hat'+fee_suffix]) for prefix in _FEE_PREFIXES])
            ]

        # Whether the bond issue is using credit ratings, insurance, or financial advisor
        insured_ratio = row['insured_amount']/row['amount']
        charged_fees = {
            'Advisor': row['if_advisor']=="Yes",
            'CR': row['has_Moodys'] or row['has_Fitch'],
            # NaN is truthy in Python, so test explicitly for a non-missing, non-zero ratio
            'Insure': bool(pd.notna(insured_ratio) and insured_ratio!=0),
            }

        # Only issues with complete and mutually consistent tranche-level data are processed
        data_is_complete = row['IF_has_maturity_date'] and row['IF_has_coupon_rate'] \
            and row['IF_has_price_or_yield'] and row['IF_has_amount_by_maturity'] \
            and row['IF_has_gross_spread'] \
            and row['IF_num_bonds_all_consistent']
        if not data_is_complete:
            continue

        # Note that function "npf.npv" starts with a period 0 cash flow

        # The calculation below is based on dollar amounts of the whole bond issue, rather than every $100 par value

        # Case A: If single maturity
        if row['N_maturity_date']==1:

            days_to_maturity = (datetime.strptime(row['maturity_date'],"%Y-%m-%d %H:%M:%S")-
                datetime.strptime(row['dated_date'],"%Y-%m-%d %H:%M:%S")).days
            # Semiannual coupons: one payment roughly every 182 days
            n_coupon = round(days_to_maturity/182)

            # Obtain the reoffering price of the whole issue.
            # Assume that if a number is more than 80 and less than 120, it is issuing price. If less than 20, it is issuing
            # yield. Otherwise, undetermined
            quote = float(row['price_or_yield'])
            if quote<20:
                reoffering_yield = quote/100
                par_amount = float(row['amount'])
                coupon_payment = par_amount*float(row['coupon_rate'])/100/2
                reoffering_total = npf.npv(sqrt(1+reoffering_yield)-1,
                    [0]+[coupon_payment]*(n_coupon-1)+[par_amount+coupon_payment])
            elif quote>80 and quote<120:
                reoffering_total = quote/100*float(row['amount'])
            else:
                continue

            if not ready_variants:
                continue

            semiannual_coupon = float(row['coupon_rate'])/2/100*float(row['amount'])
            tranche_cash_flows = [[0]+[semiannual_coupon]*(n_coupon-1)+[float(row['amount'])+semiannual_coupon]]

        # Case B: If multiple maturity
        else:
            # If number of tranches not consistent across fields, skip
            if row['N_price_or_yield']!=row['N_coupon_rate'] or row['N_price_or_yield']!=row['N_maturity_date']:
                continue

            maturity_dates = row['maturity_date'].split('\n')
            coupon_rates = row['coupon_rate'].split('\n')
            quotes = row['price_or_yield'].split('\n')
            tranche_amounts = row['amount_by_maturity'].split('\n')
            dated_date = datetime.strptime(row['dated_date'],"%Y-%m-%d %H:%M:%S")

            reoffering_prices = []
            tranche_cash_flows = [] # Cash flow of the whole bond, not every $100 par value
            for tranche in range(len(coupon_rates)):
                days_to_maturity = (datetime.strptime(maturity_dates[tranche],"%m/%d/%y")-dated_date).days
                coupon_rate = float(coupon_rates[tranche])
                quote = float(quotes[tranche])
                n_coupon = round(days_to_maturity/182)
                tranche_amount = float(tranche_amounts[tranche].replace(',',''))
                semiannual_coupon = coupon_rate/2/100*tranche_amount
                tranche_cash_flows.append([0]+[semiannual_coupon]*(n_coupon-1)+[tranche_amount+semiannual_coupon])
                if quote<20:
                    reoffering_yield = quote/100
                    # Below is a value per every $100 par value, which needs to be converted to the value of the whole bond
                    price_per_100 = npf.npv(sqrt(1+reoffering_yield)-1,[0]+[coupon_rate/2]*(n_coupon-1)+[100+coupon_rate/2])
                    reoffering_prices.append(price_per_100/100*tranche_amount)
                elif quote>80 and quote<120:
                    reoffering_prices.append(quote/100*tranche_amount)
                else:
                    reoffering_prices.append(None)

            # Skip the issue if the price of any tranche is undetermined
            if any(price is None for price in reoffering_prices):
                continue

            # Aggregate across tranches
            reoffering_total = np.sum(reoffering_prices)

        for fee_suffix,out_suffix in ready_variants:
            mod_tic = _solve_mod_tic(row,reoffering_total,tranche_cash_flows,fee_suffix,charged_fees)
            _store_mod_tic(GPF,idx,row,mod_tic,out_suffix)

    return GPF

# Work on a fresh copy of the frame before partitioning it
GPF = GPF.copy()
# Learn the output column layout from an empty slice: proc_list only adds columns, so no rows
# are needed. This is deterministic, does not run the IRR solver, and (unlike `sample(10)`)
# also works when GPF has fewer than 10 rows.
meta_columns = list(proc_list(GPF.head(0)).columns)
# Split the issues into 20 partitions and process them in 20 worker processes
GPF_dd = dd.from_pandas(GPF, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    GPF = GPF_dd.map_partitions(proc_list,meta=pd.DataFrame(columns=meta_columns)).compute()


In [11]:
# Winsorize every modified-TIC measure at its 1st and 99th percentiles.
# Percentiles are computed on observed values only, and missing values stay missing.

vars_to_winsor = [
    measure+suffix
    for suffix in ('','_timeFE')
    for measure in ('mod_tic','mod_tic_spread_treasury','mod_tic_spread_MMA')
    ]

for var in vars_to_winsor:
    # The columns come back from proc_list as object dtype (None for missing); make them numeric
    GPF[var] = pd.to_numeric(GPF[var],errors='coerce')
    observed = GPF[var].dropna()
    upper_limit = np.percentile(observed,99)
    lower_limit = np.percentile(observed,1)
    # clip caps values above/below the limits and leaves NaN untouched
    GPF[var] = GPF[var].clip(lower=lower_limit,upper=upper_limit)


# 5. Export data

In [12]:
#--------------------------------------------------#
# Export a version of GPF with imputed other costs #
#--------------------------------------------------#

# (Re)build one indicator column per sale year, so that the complete set of year
# indicators can be dropped by name below
sale_years = list(GPF['sale_year'].unique())
column_years = ['if_year_'+str(year) for year in sale_years]
for year,column in zip(sale_years,column_years):
    GPF[column] = GPF['sale_year']==year

# Columns removed before export, together with the year indicators
# NOTE(review): these look like regressors built for the fee models — confirm none is needed downstream
columns_to_drop = [
    # Security type, bid method and tax status indicators
    'is_security_type_GO', 'is_security_type_RV', 'is_Bid_C', 'is_Bid_N',
    'is_taxable_code_A', 'is_taxable_code_E', 'is_taxable_code_T',
    # Inflation adjustment and amount brackets
    'scaler', 'amount_inf_adjusted',
    'is_amount_Less_than_1M', 'is_amount_1M_to_5M', 'is_amount_5M_to_10M',
    'is_amount_10M_to_50M', 'is_amount_50M_to_100M', 'is_amount_Greater_than_100M',
    # Maturity and maturity brackets
    'maturity_in_years',
    'is_maturity_Less_than_2y', 'is_maturity_2y_to_5y', 'is_maturity_5y_to_10y',
    'is_maturity_10y_to_20y', 'is_maturity_20y_to_30y', 'is_maturity_30y_to_40y',
    'is_maturity_Greater_than_40y',
    # County population and income ratios
    'pop_to_avg', 'inc_to_avg',
    ]

GPF = GPF.drop(columns=columns_to_drop+column_years)
GPF.to_csv("../CleanData/SDC/0A_GPF.csv")

# 6. Export a Version of GPF for OLS

In [3]:
# Reload the issue-level data exported in the previous section, so this section can be run on its own.
# NOTE(review): the export cell calls to_csv without index=False, so the saved index presumably
# comes back here as an extra unnamed first column — confirm nothing downstream depends on it.
GPF = pd.read_csv("../CleanData/SDC/0A_GPF.csv",low_memory=False)

In [4]:
# Adjust inflation
# Build a cumulative price scaler from the annual inflation series (values in percent):
# the most recent year in the file has scaler 1, and each earlier year's scaler is the
# next year's scaler multiplied by (1 + next year's inflation rate).
FPCPITOTLZGUSA = pd.read_csv("../RawData/StLouisFed/FPCPITOTLZGUSA.csv")
FPCPITOTLZGUSA['year'] = FPCPITOTLZGUSA['DATE'].str[:4].astype(int)
FPCPITOTLZGUSA = FPCPITOTLZGUSA.sort_values('year',ascending=False).reset_index(drop=True)

# Growth factor applied when stepping from one row to the next (i.e. one year back in time)
growth = (FPCPITOTLZGUSA['FPCPITOTLZGUSA']/100+1).shift(1)
if len(growth)>0:
    growth.iloc[0] = 1.0 # The most recent year is the base year
# np.cumprod (rather than Series.cumprod) so that a missing inflation value propagates as NaN
# to all earlier years instead of being skipped. The result is a float column.
FPCPITOTLZGUSA['scaler'] = np.cumprod(growth.to_numpy())

FPCPITOTLZGUSA = FPCPITOTLZGUSA[['scaler','year']]
# NOTE(review): hard-coded scaler for 2023, presumably assuming 3.2% inflation in a year the
# file does not cover — confirm, and remove this row if the raw file is updated to include 2023
# (otherwise the year would appear twice and duplicate rows in later merges).
FPCPITOTLZGUSA = pd.concat([FPCPITOTLZGUSA,pd.DataFrame([{'scaler':1/(1+3.2/100),'year':2023}])])

HHI_byCSA = pd.read_csv('../CleanData/SDC/1A_HHI_byCSA.csv')

In [12]:
# Build the regression sample: keep the variables used in the OLS analysis, attach the
# inflation scaler by year and the HHI measures by CSA and year, then export.
# Both merges use pandas' default inner join, so issues without a matching year or
# CSA-year record are dropped.
GPF_OLS = (
    GPF[[
        # Geography and timing
        'CSA Code','CBSA Code','sale_year',
        # Cost and yield measures
        'gross_spread','avg_yield',
        'treasury_avg_spread','MMA_avg_spread',
        'mod_tic','mod_tic_spread_treasury','mod_tic_spread_MMA',
        'mod_tic_timeFE','mod_tic_spread_treasury_timeFE','mod_tic_spread_MMA_timeFE',
        'underpricing_15to30','underpricing_15to60',
        # Issuer and issue characteristics
        'Issuer','issuer_type',
        'if_advisor','if_dual_advisor','Bid','taxable_code','security_type','amount','avg_maturity',
        'has_Moodys','has_Fitch','rating_Moodys','rating_Fitch','insured_amount',
        'use_of_proceeds_general','if_callable','CB_Eligible',
        'TBB_n_bidders',
        # Predicted fee ratios
        'AdvisorFeeRatio_hat','CRFeeRatio_hat','InsureFeeRatio_hat',
        'AdvisorFeeRatio_hat_model_timeFE','CRFeeRatio_hat_model_timeFE','InsureFeeRatio_hat_model_timeFE',
        'State','County',
        ]]
    .rename(columns={'sale_year':'calendar_year'})
    .merge(FPCPITOTLZGUSA,left_on='calendar_year',right_on='year')
    .merge(HHI_byCSA,on=['CSA Code','calendar_year'])
    )
GPF_OLS.to_csv('../CleanData/SDC/1A_GPF_OLS.csv')

In [16]:
# Number: Total amount of issuance in 2022
# The sum of 'amount' is divided by 1000 and written, rounded to a whole number, to a .tex file
# NOTE(review): presumably this converts $ millions to $ billions — confirm the unit of 'amount'
issued_2022 = GPF.loc[GPF['sale_year']==2022,'amount'].sum()/1000
amount = f"{issued_2022:.0f}"
with open('../Draft/nums/Amount_MunicipalBond.tex','w') as file:
    file.write(amount)