In [2]:
import pandas as pd
import numpy as np
import numpy_financial as npf
import statsmodels.api as sm
import geopandas as gpd
import os
import pickle
import dask
import dask.dataframe as dd
import itertools
from itertools import chain
from math import sqrt, floor, ceil, isnan
import multiprocess
import multiprocessing
import importlib
from importlib import reload
from collections import Counter
from fuzzywuzzy import process, fuzz
import time
import warnings
import datetime
from datetime import datetime
from datetime import date
import re
# Turn every warning into an exception for the rest of the notebook.
warnings.filterwarnings("error")

# Widen the pandas display limits so wide/long frames are shown in full.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 400

# (Re)load the project helper. After a previous run the name `FUN_proc_name` is bound to the
# object imported by the `from ... import` line below, so drop that binding first, import the
# module, reload it, and then re-import the helper from the reloaded module.
try:
    del FUN_proc_name
except NameError:
    # First run in this kernel: nothing to delete.
    pass
import FUN_proc_name
importlib.reload(FUN_proc_name)
from FUN_proc_name import FUN_proc_name

# Upper-case US state name -> two-letter postal abbreviation (the 50 states, alphabetical).
state_abbreviations = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY",
}


# 1. SDC Global Public Finance

Notes:
- From 2020 onwards, many bonds have yields below 1%, which are also lower than the NIC (when that field is available).
- SDC GPF does not have any field named "offering price" or "reoffering price", only a field named "price/yield", which can hold either a price or a yield. However, Cestau, Green, Hollifield, and Schürhoff ("The Cost Burden of Negotiated Sales Restrictions") state that "We obtain information about bonds and issuers from SDC Platinum. These data include issuer characteristics such as name, state, type, **reoffering prices or yields for each bond issue**, and issue characteristics such as issue description, maturity, sale date, coupon, coupon type, call schedule, taxable status, bank qualified indicator, ratings, a refunding indicator, and sinking fund provisions." Hence the "price/yield" field in the raw data is evidently the reoffering price or yield.
- "The TIC is defined as the discount rate which equates the principal and semi-annual interest payments on the bonds to the purchase price paid by the underwriter to the issuer". And "the purchase price of a bond issue is the aggregate par amount of all the maturities, plus accrued interest and original issue premium (or less any original issue discount), and **less the underwriter’s discount**." Hence the TIC implies the purchase price paid by the underwriter, and comparing that implied price with the initial trading price lets me construct my own version of the underwriting spread.

## 1.1. Import data

In [None]:
###############
# Import data #
###############

# Note that Pandas is unable to read excel formulas. I copy and value-only paste for all cells in "GPF.xlsx" files

# Every SDC export is stored as "../RawData/SDC/GPF_Full_<YYYYMMDD>_<YYYYMMDD>.xlsx" and covers
# one date range within a single calendar year. The ranges below are (start MMDD, end MMDD).
_GPF_ANNUAL = (("0101", "1231"),)
_GPF_SEMIANNUAL = (("0101", "0630"), ("0701", "1231"))
_GPF_QUARTERLY = (("0101", "0331"), ("0401", "0630"), ("0701", "0930"), ("1001", "1231"))
_GPF_FOUR_MONTHLY = (("0101", "0430"), ("0501", "0831"), ("0901", "1231"))
# Years whose exports do not follow the regular annual / semi-annual / quarterly split.
_GPF_IRREGULAR_SPLITS = {
    1980: (("0101", "0331"), ("0401", "1231")),
    1990: (("0101", "0331"), ("0401", "0831"), ("0901", "1231")),
    1991: (("0101", "0430"), ("0501", "0831"), ("0901", "1031"), ("1101", "1231")),
    1992: _GPF_FOUR_MONTHLY,
    1993: (("0101", "0430"), ("0501", "0630"), ("0701", "0930"), ("1001", "1231")),
    1996: _GPF_FOUR_MONTHLY,
    2023: (("0101", "0430"), ("0501", "0831"), ("0901", "1031")),  # last export ends 2023-10-31
}


def _gpf_export_splits(year):
    """Return the (start MMDD, end MMDD) date ranges of the SDC export files for `year`.

    Years up to 1982 have one annual file, 1983-1989 two semi-annual files, and later years
    four quarterly files, except for the years listed in `_GPF_IRREGULAR_SPLITS`.
    """
    if year in _GPF_IRREGULAR_SPLITS:
        return _GPF_IRREGULAR_SPLITS[year]
    if year <= 1982:
        return _GPF_ANNUAL
    if year <= 1989:
        return _GPF_SEMIANNUAL
    return _GPF_QUARTERLY


# Export names in chronological order, e.g. "GPF_Full_19670101_19671231".
GPF_export_names = [
    "GPF_Full_%d%s_%d%s" % (year, start, year, end)
    for year in range(1967, 2024)
    for start, end in _gpf_export_splits(year)
]
# Guard against an accidental edit of the schedule above: there are 163 export files.
assert len(GPF_export_names) == 163

# Read each export (the first spreadsheet row is skipped) and bind it to a notebook-level
# variable named after the file, so that later cells can keep referring to `GPF_Full_<range>`.
for export_name in GPF_export_names:
    globals()[export_name] = pd.read_excel("../RawData/SDC/" + export_name + ".xlsx", skiprows=[0])


In [None]:
# %%script false --no-raise-error
# Stack all SDC exports in chronological order (one row of names per calendar year) into a
# single frame with a fresh 0..n-1 index.
GPF = pd.concat([
    GPF_Full_19670101_19671231,
    GPF_Full_19680101_19681231,
    GPF_Full_19690101_19691231,
    GPF_Full_19700101_19701231,
    GPF_Full_19710101_19711231,
    GPF_Full_19720101_19721231,
    GPF_Full_19730101_19731231,
    GPF_Full_19740101_19741231,
    GPF_Full_19750101_19751231,
    GPF_Full_19760101_19761231,
    GPF_Full_19770101_19771231,
    GPF_Full_19780101_19781231,
    GPF_Full_19790101_19791231,
    GPF_Full_19800101_19800331, GPF_Full_19800401_19801231,
    GPF_Full_19810101_19811231,
    GPF_Full_19820101_19821231,
    GPF_Full_19830101_19830630, GPF_Full_19830701_19831231,
    GPF_Full_19840101_19840630, GPF_Full_19840701_19841231,
    GPF_Full_19850101_19850630, GPF_Full_19850701_19851231,
    GPF_Full_19860101_19860630, GPF_Full_19860701_19861231,
    GPF_Full_19870101_19870630, GPF_Full_19870701_19871231,
    GPF_Full_19880101_19880630, GPF_Full_19880701_19881231,
    GPF_Full_19890101_19890630, GPF_Full_19890701_19891231,
    GPF_Full_19900101_19900331, GPF_Full_19900401_19900831, GPF_Full_19900901_19901231,
    GPF_Full_19910101_19910430, GPF_Full_19910501_19910831, GPF_Full_19910901_19911031, GPF_Full_19911101_19911231,
    GPF_Full_19920101_19920430, GPF_Full_19920501_19920831, GPF_Full_19920901_19921231,
    GPF_Full_19930101_19930430, GPF_Full_19930501_19930630, GPF_Full_19930701_19930930, GPF_Full_19931001_19931231,
    GPF_Full_19940101_19940331, GPF_Full_19940401_19940630, GPF_Full_19940701_19940930, GPF_Full_19941001_19941231,
    GPF_Full_19950101_19950331, GPF_Full_19950401_19950630, GPF_Full_19950701_19950930, GPF_Full_19951001_19951231,
    GPF_Full_19960101_19960430, GPF_Full_19960501_19960831, GPF_Full_19960901_19961231,
    GPF_Full_19970101_19970331, GPF_Full_19970401_19970630, GPF_Full_19970701_19970930, GPF_Full_19971001_19971231,
    GPF_Full_19980101_19980331, GPF_Full_19980401_19980630, GPF_Full_19980701_19980930, GPF_Full_19981001_19981231,
    GPF_Full_19990101_19990331, GPF_Full_19990401_19990630, GPF_Full_19990701_19990930, GPF_Full_19991001_19991231,
    GPF_Full_20000101_20000331, GPF_Full_20000401_20000630, GPF_Full_20000701_20000930, GPF_Full_20001001_20001231,
    GPF_Full_20010101_20010331, GPF_Full_20010401_20010630, GPF_Full_20010701_20010930, GPF_Full_20011001_20011231,
    GPF_Full_20020101_20020331, GPF_Full_20020401_20020630, GPF_Full_20020701_20020930, GPF_Full_20021001_20021231,
    GPF_Full_20030101_20030331, GPF_Full_20030401_20030630, GPF_Full_20030701_20030930, GPF_Full_20031001_20031231,
    GPF_Full_20040101_20040331, GPF_Full_20040401_20040630, GPF_Full_20040701_20040930, GPF_Full_20041001_20041231,
    GPF_Full_20050101_20050331, GPF_Full_20050401_20050630, GPF_Full_20050701_20050930, GPF_Full_20051001_20051231,
    GPF_Full_20060101_20060331, GPF_Full_20060401_20060630, GPF_Full_20060701_20060930, GPF_Full_20061001_20061231,
    GPF_Full_20070101_20070331, GPF_Full_20070401_20070630, GPF_Full_20070701_20070930, GPF_Full_20071001_20071231,
    GPF_Full_20080101_20080331, GPF_Full_20080401_20080630, GPF_Full_20080701_20080930, GPF_Full_20081001_20081231,
    GPF_Full_20090101_20090331, GPF_Full_20090401_20090630, GPF_Full_20090701_20090930, GPF_Full_20091001_20091231,
    GPF_Full_20100101_20100331, GPF_Full_20100401_20100630, GPF_Full_20100701_20100930, GPF_Full_20101001_20101231,
    GPF_Full_20110101_20110331, GPF_Full_20110401_20110630, GPF_Full_20110701_20110930, GPF_Full_20111001_20111231,
    GPF_Full_20120101_20120331, GPF_Full_20120401_20120630, GPF_Full_20120701_20120930, GPF_Full_20121001_20121231,
    GPF_Full_20130101_20130331, GPF_Full_20130401_20130630, GPF_Full_20130701_20130930, GPF_Full_20131001_20131231,
    GPF_Full_20140101_20140331, GPF_Full_20140401_20140630, GPF_Full_20140701_20140930, GPF_Full_20141001_20141231,
    GPF_Full_20150101_20150331, GPF_Full_20150401_20150630, GPF_Full_20150701_20150930, GPF_Full_20151001_20151231,
    GPF_Full_20160101_20160331, GPF_Full_20160401_20160630, GPF_Full_20160701_20160930, GPF_Full_20161001_20161231,
    GPF_Full_20170101_20170331, GPF_Full_20170401_20170630, GPF_Full_20170701_20170930, GPF_Full_20171001_20171231,
    GPF_Full_20180101_20180331, GPF_Full_20180401_20180630, GPF_Full_20180701_20180930, GPF_Full_20181001_20181231,
    GPF_Full_20190101_20190331, GPF_Full_20190401_20190630, GPF_Full_20190701_20190930, GPF_Full_20191001_20191231,
    GPF_Full_20200101_20200331, GPF_Full_20200401_20200630, GPF_Full_20200701_20200930, GPF_Full_20201001_20201231,
    GPF_Full_20210101_20210331, GPF_Full_20210401_20210630, GPF_Full_20210701_20210930, GPF_Full_20211001_20211231,
    GPF_Full_20220101_20220331, GPF_Full_20220401_20220630, GPF_Full_20220701_20220930, GPF_Full_20221001_20221231,
    GPF_Full_20230101_20230430, GPF_Full_20230501_20230831, GPF_Full_20230901_20231031,
]).copy().reset_index(drop=True)

# Divide the "Lead Manager" field into many subfields
# Note that "Co-Managers" have other manager information, which is not utilized here
# (the field is split on newlines; piece i becomes the column "raw_name_GPF_<i>")
new_columns = GPF['Lead Manager'].str.split('\n', expand=True)
raw_name_GPF_colnames = ['raw_name_GPF_'+str(column) for column in new_columns.columns]
new_columns.columns = raw_name_GPF_colnames
GPF = pd.concat([GPF,new_columns],axis=1)
# Modify mistakes in sale time: sale dates later than 2050-01-01 are moved back by 100 years
threshold_date = pd.to_datetime('2050-01-01')
GPF['Sale\nDate'] = GPF['Sale\nDate'].apply(lambda x: x - pd.DateOffset(years=100) if x > threshold_date else x)
# Note that such errors do not exist for dated date
# Add a year
GPF['sale_year'] = pd.to_datetime(GPF['Sale\nDate']).dt.year

# strip all the columns of blanks
columns = [item.strip() for item in GPF.columns]
GPF.columns = columns
# Choose columns to keep & reorder columns
# .copy() makes the selection an independent frame, so the column assignments below can never
# raise SettingWithCopyWarning (warnings are escalated to errors at the top of this notebook).
GPF = GPF[[
    "Amount\n   of   \n Issue  \n($ mils)",
    "Amount\n   of   \nMaturity\n($ mils)",
    "Bid",
    "Bk \n Elig",
    "Bond\nBuyer\nUOP.1",
    "Call\nIssue",
    "County",
    "Coupon Maturity",
    "Coupon Type.1",
    "Coupon\n   of\nMaturity",
    'Credit Enhancer.1',
    'Credit\nEnhance\n ment\n Type',
    'Credit\nEnhance\n ment\n Type.1',
    'Credit Enhancer.2',
    "Cusip",
    'Dated Date',
    'Dated\nDate',
    "Financial Advisor.1",
    "Financial Advisor.2",
    "Financial\nAdvisor\nDeal(Y/N)",
    "Fitch\nInsured\nLong Term\nRating",
    "Fitch\nInsured\nShort Term\nRating",
    "General Use of Proceeds",
    "Gross\nSpread",
    "Insured\nAmount",
    "Issuer Type\nDescription",
    "Issuer",
    "Issuer\nType",
    "Lead Manager",
    "Main Use of Proceeds",
    "Maturity Amount",
    "Maturity Date",
    "Maturity",
    "Maturity\n  Year",
    "Moody's\nInsured\nLong Term\nRating",
    "Moody's\nInsured\nShort Term\nRating",
    "Net\nInterest\n  Cost",
    "Price/\n Yield\n  of\nMaturity",
    'Refunding',
    'Refunding\n  Amount\n ($ mils)',
    'Dealno of\nRefunded\n  Issue',
    'Refunded Issue',
    'Ref-\nType',
    'Refd',
    "Sale\nDate",
    "Security\n  Type",
    "State",
    "Taxable\n Code",
    'True\nInterest\n  Cost',
    "Yield Amount",
    "sale_year",
    ]+raw_name_GPF_colnames].copy()
# Keep the county exactly as reported before formatting it
GPF['County_raw'] = GPF['County']
# Format county: upper-case, then drop the " COUNTY" part of the name.
# Series.replace() only swaps cells that are exactly equal to ' COUNTY', so it never removed
# the suffix; the substring removal needs the .str accessor.
GPF['County'] = GPF['County'].str.upper()
GPF['County'] = GPF['County'].str.replace(' COUNTY','',regex=False)

In [None]:
# Raw SDC column header -> short name used in the rest of the notebook.
# Columns that are not listed here keep their raw header.
_GPF_rename_map = {
    "Amount\n   of   \n Issue  \n($ mils)": "amount",
    "Amount\n   of   \nMaturity\n($ mils)": "amount_by_maturity",
    "Bk \n Elig": "CB_Eligible",
    "Bond\nBuyer\nUOP.1": "use_of_proceeds_BB",
    "Call\nIssue": "if_callable",
    "Coupon Maturity": "TOM_coupon_rate",
    "Coupon Type.1": "coupon_type",
    "Coupon\n   of\nMaturity": "coupon_rate",
    "Credit Enhancer.1": "credit_enhancer_short",
    "Credit Enhancer.2": "credit_enhancer_long",
    "Cusip": "cusip",
    "Dated Date": "dated_date",
    "Financial Advisor.1": "advisor_short",
    "Financial Advisor.2": "advisor_long",
    "Financial\nAdvisor\nDeal(Y/N)": "if_advisor",
    "Fitch\nInsured\nLong Term\nRating": "Fitch_ILTR",
    "Fitch\nInsured\nShort Term\nRating": "Fitch_ISTR",
    "General Use of Proceeds": "use_of_proceeds_general",
    "Gross\nSpread": "gross_spread",
    "Insured\nAmount": "insured_amount",
    "Issuer Type\nDescription": "issuer_type_full",
    "Issuer\nType": "issuer_type",
    "Lead Manager": "lead_manager",
    "Main Use of Proceeds": "use_of_proceeds_main",
    "Maturity Amount": "TOM_amount_by_maturity",
    "Maturity Date": "TOM_maturity_date",
    # Somehow this rather than "Maturity" is more often non-missing
    "Maturity\n  Year": "maturity_date",
    "Moody's\nInsured\nLong Term\nRating": "Moodys_ILTR",
    "Moody's\nInsured\nShort Term\nRating": "Moodys_ISTR",
    "Net\nInterest\n  Cost": "net_interest_cost",
    "Price/\n Yield\n  of\nMaturity": "price_or_yield",
    "Sale\nDate": "sale_date",
    "Security\n  Type": "security_type",
    "Taxable\n Code": "taxable_code",
    "True\nInterest\n  Cost": "true_interest_cost",
    "Yield Amount": "TOM_price_or_yield",
}
GPF = GPF.rename(_GPF_rename_map, axis="columns")


### 1.1.1 Import and merge with TBB data

In [None]:
#-------------#
# Import data #
#-------------#

# One pickle per year (2008-2024) under ../RawData/BondBuyer/WebPages, each converted to a frame.
# NOTE(security): pickle.load can execute arbitrary code stored in the file, so these pickles
# must come from a trusted source.
BBIssueDataAll = []
for year in range(2008,2025):
    with open('../RawData/BondBuyer/WebPages/BBIssueData'+str(year)+'.pkl', 'rb') as file:
        BBIssueData = pd.DataFrame(pickle.load(file))
    BBIssueDataAll.append(BBIssueData)
BBIssueDataAll = pd.concat(BBIssueDataAll).reset_index(drop=True)

# When a bond issue has more than one effective rate, NIC, or TIC, such an issue is recorded as multiple data points in
# GPF. Hence, reformat and explode data
CASE_EFF_RATE_COLUMNS = [
    'CaseEffRate_amounts',
    'CaseEffRate_purchasers',
    'CaseEffRate_coupon_rates',
    'CaseEffRate_purchase_price_minus_pars',
    'CaseEffRate_effective_rates',
    ]


def _first_or_none(values):
    """Return the first element of `values`, or None when `values` is None or empty."""
    if values is None or len(values) == 0:
        return None
    return values[0]


BBIssueDataAll['len_CaseEffRate_amounts'] = BBIssueDataAll['CaseEffRate_amounts'].apply(lambda x: len(x) if x is not None else 0)
BBIssueDataAll_multi = BBIssueDataAll[BBIssueDataAll['len_CaseEffRate_amounts']>1].copy()
BBIssueDataAll_single = BBIssueDataAll[BBIssueDataAll['len_CaseEffRate_amounts']<=1].copy()
# Issues with several entries: one row per entry. The columns are exploded under their own
# names; exploding renamed copies ('a'..'e') without renaming them back left these rows with
# empty CaseEffRate_* columns after the concat below.
BBIssueDataAll_multi = BBIssueDataAll_multi.explode(CASE_EFF_RATE_COLUMNS)
# Issues with at most one entry: unwrap the one-element lists into scalars.
for column in CASE_EFF_RATE_COLUMNS:
    BBIssueDataAll_single[column] = BBIssueDataAll_single[column].apply(_first_or_none)
BBIssueDataAll = pd.concat([BBIssueDataAll_single,BBIssueDataAll_multi])

BBIssueDataAll['sale_date'] = pd.to_datetime(BBIssueDataAll['sale_date'])
# Handle an error case: one dated date holds a fragment of page markup that ends with the
# actual date, so replace it with that date before parsing.
text = '</p> <table class="tblExcelHide" id="tblRCSalesResult0"><tbody></tbody></table> <p class="FootRCSalesResultcls" '+\
    'id="divFootRCSalesResult0" style="display:none"></p> <p class="RCSalesResultcls" id="divRCSalesResult1">Oct 10, 2023'
BBIssueDataAll.loc[BBIssueDataAll['dated_date']==text,'dated_date'] = 'Oct 10, 2023'
BBIssueDataAll['dated_date'] = pd.to_datetime(BBIssueDataAll['dated_date'])


In [None]:

#-----------------------------#
# Match and put data into GPF #
#-----------------------------#

# Go over each issue in BBIssueDataAll,
# (1) Find all issues in SDC with the same state, sales date, and dated date.
# Allow margin of error of seven days
# (2) Find the issue with the smallest difference in amount.
# (3) Check if amount is within margin of error (e.g., 0.05 million).
# (4) If still not unique, check similarity in terms of issuer name.

BBIssueDataAll['GPF_no_match'] = None
BBIssueDataAll['GPF_multiple_match'] = None

GPF['CaseEffRate_amounts'] = None
GPF['CaseEffRate_purchasers'] = None
GPF['CaseEffRate_coupon_rates'] = None
GPF['CaseEffRate_purchase_price_minus_pars'] = None
GPF['CaseEffRate_effective_rates'] = None
GPF['CaseEffRate_lines_other_bidders'] = None

GPF['CaseTIC_purchaser'] = None
GPF['CaseTIC_purchase_price'] = None
GPF['CaseTIC_TIC'] = None
GPF['CaseTIC_lines_other_bidders'] = None

GPF['CaseNIC_purchaser'] = None
GPF['CaseNIC_purchase_price'] = None
GPF['CaseNIC_NIC'] = None
GPF['CaseNIC_lines_other_bidders'] = None

GPF_pre2008 = GPF[GPF['sale_year']<2008]
GPF_post2008 = GPF[GPF['sale_year']>=2008]
data_for_parallel = []
for year in range(2008,2024):
    data_for_parallel = data_for_parallel+\
        [(GPF[GPF['sale_year']==year],BBIssueDataAll[BBIssueDataAll['notice_year']==year],state_abbreviations)]

# Re-import the matching routine so that edits to FUN_0A_Match_TBB_GPF.py are picked up without
# restarting the kernel. The name is deleted first because, after the "from ... import" below, it
# refers to the function rather than the module.
try:
    del(FUN_0A_Match_TBB_GPF)
except NameError:
    # Nothing to delete the first time this cell runs in a kernel
    pass
import FUN_0A_Match_TBB_GPF
importlib.reload(FUN_0A_Match_TBB_GPF)
from FUN_0A_Match_TBB_GPF import FUN_0A_Match_TBB_GPF

# Run the matching for every year in parallel. Each element of "data_for_parallel" is unpacked into the
# positional arguments of the routine, and the results come back in the same order as the inputs.
if __name__ == '__main__':
    with multiprocessing.Pool(processes = 10) as p:
        results_from_parallel = p.starmap(FUN_0A_Match_TBB_GPF, data_for_parallel)

# Collect results: every worker returns a pair, whose first element goes back into GPF and whose
# second element goes back into BBIssueDataAll. The pre-2008 issues, which were not sent to the
# workers, are appended to GPF unchanged.
GPF_parts = [year_result[0] for year_result in results_from_parallel]
BBIssueDataAll_parts = [year_result[1] for year_result in results_from_parallel]
GPF = pd.concat(GPF_parts+[GPF_pre2008])
BBIssueDataAll = pd.concat(BBIssueDataAll_parts)

# When TIC/NIC is provided in TBB, almost all the time it is available in GPF
# GPF[['net_interest_cost','true_interest_cost','CaseTIC_TIC']][~pd.isnull(GPF['CaseTIC_TIC'])].sample(20)
# GPF[['net_interest_cost','true_interest_cost','CaseNIC_NIC']][~pd.isnull(GPF['CaseNIC_NIC'])].sample(20)

# On the contrary, TIC/NIC is usually missing when effective rate (which is TIC, but for very short term bonds) is provided
# GPF[['net_interest_cost','true_interest_cost','CaseEffRate_effective_rates']]\
# [~pd.isnull(GPF['CaseEffRate_effective_rates'])].sample(20)

# Put effective rates from TBB to TIC

# Strip the percent sign from the effective rates that are present ...
has_effective_rate = ~pd.isnull(GPF['CaseEffRate_effective_rates'])
GPF.loc[has_effective_rate,'CaseEffRate_effective_rates'] = \
    GPF.loc[has_effective_rate,'CaseEffRate_effective_rates'].str.replace('%','')

# ... and convert what is left to numbers. The mask is recomputed because the string operation
# above yields missing values for entries that are not strings
has_effective_rate = ~pd.isnull(GPF['CaseEffRate_effective_rates'])
GPF.loc[has_effective_rate,'CaseEffRate_effective_rates'] = \
    GPF.loc[has_effective_rate,'CaseEffRate_effective_rates'].astype(float)

# Use the effective rate as TIC only where GPF has no TIC of its own, and flag those issues
GPF['if_true_interest_cost_fromTBB'] = False
tic_taken_from_tbb = pd.isnull(GPF['true_interest_cost'])&(~pd.isnull(GPF['CaseEffRate_effective_rates']))
GPF.loc[tic_taken_from_tbb,'if_true_interest_cost_fromTBB'] = True
GPF.loc[tic_taken_from_tbb,'true_interest_cost'] = \
    GPF.loc[tic_taken_from_tbb,'CaseEffRate_effective_rates']

# Count number of bidders, as the number of entries in the "lines_other_bidders" field of whichever
# case is available. If more than one case is available, the later one in the list below prevails.
GPF['TBB_n_bidders'] = None
bidder_line_columns = [
    'CaseEffRate_lines_other_bidders',
    'CaseTIC_lines_other_bidders',
    'CaseNIC_lines_other_bidders',
]
for idx,row in GPF.iterrows():
    for bidder_column in bidder_line_columns:
        lines_other_bidders = row[bidder_column]
        # Only text can be evaluated. Missing entries are None (the value these columns were initialized
        # with) or NaN, and passing either of them to "eval" raises a TypeError
        if isinstance(lines_other_bidders,str) and lines_other_bidders!='nan':
            # NOTE(review): "eval" executes arbitrary code. This assumes the text was produced by this
            # project's own matching routine; consider "ast.literal_eval" if it is always a plain literal
            GPF.at[idx,'TBB_n_bidders'] = len(eval(lines_other_bidders))


## 1.2. Calculate yield

### 1.2.1 Pre-process data for yield calculation

Notes:
- Note that coupon type is tricky: I do not have a Thompson-researched version of coupon type, so I cannot reliably construct a bond-level variable "coupon type". Therefore, I retain the original "coupon_type" variable and use it to determine, at the issue level, whether all bonds are fixed-rate/zero-coupon or whether any bond has some other coupon type. I do not calculate yield if a bond issue has any bond whose "coupon_type" is something other than fixed-rate or zero-coupon, so yields of floating-rate bonds are not calculated.


In [None]:
########################
# Initialize variables #
########################

# Columns filled in by the yield calculation, all empty to begin with
yield_output_columns = [
    'yield_by_maturity_list',          # Yield of each maturity
    'avg_yield',                       # Weighted average of yield across all maturities
    'maturity_by_maturity_list',       # Maturity of each tranche
    'avg_maturity',                    # Weighted average maturity
    'amount_by_maturity_list',         # Amount by maturity
    'IF_price_or_yield_determined',    # Whether it can be determined if the record is price or yield
    'IF_n_tranches_not_match',         # Whether the number of tranches differs across the coupon, price/yield
                                       # and maturity fields
]
for output_column in yield_output_columns:
    GPF[output_column] = None

# Whether the bond has any coupon type other than "fixed-rate". Unlike the columns above, this one
# starts out as False rather than empty
GPF['IF_irregular_coupon_type'] = False

# Use a clean 0..n-1 index from here on
GPF = GPF.reset_index(drop=True)

############################
# Handle exceptional cases #
############################

def proc_list(GPF):
    """Clean the per-tranche fields of a chunk of bond issues before yields are calculated.

    The fields "coupon_rate", "maturity_date", "price_or_yield" and "amount_by_maturity", and their
    "TOM_" counterparts, hold one entry per tranche separated by newlines. This function fills in
    entries that can be inferred, removes stray separators and "None" entries, flags issues with
    coupon types other than fixed rate or zero coupon, and finally replaces the fields by their
    "TOM_" counterparts for issues sold in 2003 or later. Every step works on one row at a time.

    Parameters
    ----------
    GPF : pandas.DataFrame
        Bond issues. Besides the eight fields above, the columns "coupon_type", "Maturity",
        "sale_year" and "IF_irregular_coupon_type" are used.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy with the same columns. The input is not modified.
    """

    GPF = GPF.copy()

    def _is_missing(value):
        # None, NaN, or the text "nan"/"None"
        return value is None or str(value) in ('nan','None')

    def _is_missing_or_has_none(value):
        # As above, but also when any of the tranche entries reads "None"
        return _is_missing(value) or 'None' in str(value)

    # (1) Handle zero-coupon cases. If "Zero Coupon" is one of the coupon types, there are other coupon types, and the number of
    # coupon rates is exactly "total number of maturities" minus "number of zero coupon bonds", fill in the places of zero coupon
    # bonds to have a coupon rate of 0. Note that I do not do this for the Thompson-researched version of data, as the "coupon
    # type" variable is not available there
    for idx,row in GPF.iterrows():

        coupon_type_original = str(row['coupon_type'])
        coupon_type = coupon_type_original.split('\n')

        # Handle cases where just one bond
        if coupon_type_original=="Zero Coupon":
            GPF.at[idx,'coupon_rate'] = 0

        # When there are multiple bonds. Note that no need to handle if there is just one bond and it is not zero coupon,
        # or if none of the bonds is zero coupon
        elif len(coupon_type)>1 and "Zero Coupon" in coupon_type and not _is_missing(row['coupon_rate']):
            coupon_rate = str(row['coupon_rate']).split('\n')
            n_zero_coupon = coupon_type.count("Zero Coupon")
            if len(coupon_type)==len(coupon_rate)+n_zero_coupon:
                # Walk through the coupon types in order, taking the next reported rate for every bond
                # that is not zero coupon and inserting "0" for every bond that is
                reported_rates = iter(coupon_rate)
                coupon_rate_filled = \
                    ["0" if item=="Zero Coupon" else next(reported_rates) for item in coupon_type]
                GPF.at[idx,'coupon_rate'] = '\n'.join(coupon_rate_filled)

    # (2) Assume that the coupon rate applies to all maturities if there is one coupon rate but multiple tranches
    for idx,row in GPF.iterrows():
        if row['coupon_rate']!=None and str(row['coupon_rate'])!='nan':
            if '\n' in str(row['maturity_date']) and '\n' not in str(row['coupon_rate']):
                n_tranches = row['maturity_date'].count('\n')+1
                GPF.at[idx,'coupon_rate'] = '\n'.join([str(row['coupon_rate'])]*n_tranches)

    # (3) Assume that the price/yield applies to all maturities if there is one price/yield rate but multiple tranches.
    # Make this edit only if the maturities are all identical. Otherwise, it is more likely a data error and do not impute.
    for idx,row in GPF.iterrows():
        if row['price_or_yield']!=None and str(row['price_or_yield'])!='nan':
            if '\n' in str(row['maturity_date']):
                maturity_date = row['maturity_date'].split('\n')
                if_same = all(element == maturity_date[0] for element in maturity_date)
                if if_same and ('\n' not in str(row['price_or_yield'])):
                    GPF.at[idx,'price_or_yield'] = '\n'.join([str(row['price_or_yield'])]*len(maturity_date))

    # (4) Take the value of "Maturity" to populate "Maturity\n  Year" (i.e., "maturity_date") if the latter is missing
    for idx,row in GPF.iterrows():
        if _is_missing_or_has_none(row['maturity_date']):
            GPF.at[idx,'maturity_date'] = row['Maturity']

    # (5) Remove if beginning or end of field is '\n'. Both ends are handled one after the other, so that a field
    # with a separator at both ends loses both
    separated_fields = [
        'price_or_yield','maturity_date','coupon_rate','amount_by_maturity',
        'TOM_price_or_yield','TOM_maturity_date','TOM_coupon_rate','TOM_amount_by_maturity',
    ]
    for idx,row in GPF.iterrows():
        for field in separated_fields:
            value = row[field]
            if not isinstance(value,str):
                continue
            if value[:1]=='\n':
                value = value[1:]
            if value[-1:]=='\n':
                value = value[:-1]
            if value!=row[field]:
                GPF.at[idx,field] = value

    # (6) Remove if beginning or end or middle of field "maturity_date" is 'None'
    for idx,row in GPF.iterrows():
        value = row['maturity_date']
        if isinstance(value,str):
            if value[:5]=='None\n':
                value = value[5:]
            if value[-5:]=='\nNone':
                value = value[:-5]
            GPF.at[idx,'maturity_date'] = value.replace('None\n','')

    # (7) After the prior step, there are cases where "\n" is not in "maturity_date" and "maturity_date" is a string. To avoid
    # incompatibility, convert type
    for idx,row in GPF.iterrows():
        value = row['maturity_date']
        if isinstance(value,str) and '\n' not in value and not _is_missing(value):
            GPF.at[idx,'maturity_date'] = datetime.strptime(value,"%m/%d/%y")

    # (8) Drop if two or three "\n" come adjacent. Repeat until no adjacent pair is left: one pass turns three
    # separators into two, not one
    for idx,row in GPF.iterrows():
        for field in ('price_or_yield','coupon_rate'):
            value = row[field]
            if isinstance(value,str) and '\n\n' in value:
                while '\n\n' in value:
                    value = value.replace('\n\n','\n')
                GPF.at[idx,field] = value

    # (9) Put value of data from the non-Thompson-researched version to the Thompson-researched version, if the latter is missing.
    # For consistency, make this change for all related fields, or do not do so at all
    for idx,row in GPF.iterrows():
        if _is_missing_or_has_none(row['TOM_maturity_date']) \
            and _is_missing(row['TOM_coupon_rate']) \
            and _is_missing(row['TOM_price_or_yield']) \
            and _is_missing_or_has_none(row['TOM_amount_by_maturity']):
            GPF.at[idx,'TOM_maturity_date'] = row['maturity_date']
            GPF.at[idx,'TOM_coupon_rate'] = row['coupon_rate']
            GPF.at[idx,'TOM_price_or_yield'] = row['price_or_yield']
            GPF.at[idx,'TOM_amount_by_maturity'] = row['amount_by_maturity']

    # (10) Mark out if a bond issue has a bond that is not fixed rate or zero coupon. It is extremely uncommon for an issue to have both
    # fixed rate/zero coupon and irregular (variable rate) types of bonds. Therefore, I do not consider cases where one issue has both
    for idx,row in GPF.iterrows():
        other_coupon_types = \
            [item for item in row['coupon_type'].split('\n') if item not in ('Fixed Rate','Zero Coupon')]
        if len(other_coupon_types)>0:
            GPF.at[idx,'IF_irregular_coupon_type'] = True

    # (11) In the prior steps, I put non-Thompson data to those fields if Thompson-researched data are missing. In addition, I use
    # Thompson-researched data completely for any issues post 2003. This has to be done for every row: without a loop of its
    # own, only the last row visited in step (10) would be affected.
    # NOTE(review): steps (6) and (7) are not applied to "TOM_maturity_date", so values copied here may still
    # contain "None" entries or be a single date stored as text -- confirm this is acceptable downstream
    for idx,row in GPF.iterrows():
        if row['sale_year']>=2003:
            GPF.at[idx,'maturity_date'] = row['TOM_maturity_date']
            GPF.at[idx,'coupon_rate'] = row['TOM_coupon_rate']
            GPF.at[idx,'price_or_yield'] = row['TOM_price_or_yield']
            GPF.at[idx,'amount_by_maturity'] = row['TOM_amount_by_maturity']

    return GPF

# Run the clean-up on the first ten rows only, to learn the columns of its output. Dask is given an
# empty frame with these columns as the description of the result
meta_columns = proc_list(GPF[:10]).columns.tolist()
output_template = pd.DataFrame(columns=meta_columns)

# Split the data into 20 partitions and clean them in 20 worker processes
GPF_dd = dd.from_pandas(GPF, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    GPF = GPF_dd.map_partitions(proc_list,meta=output_template).compute()


In [None]:
# Create columns that describe data availability for a particular bond issue
def proc_list(GPF):
    """Record, for every bond issue, which fields are available and how many tranches each holds.

    Fourteen columns are added: "IF_has_..." flags for maturity date, coupon rate, price/yield, amount
    by maturity, TIC, NIC and gross spread; "N_..." counts of newline-separated entries in the coupon
    rate, price/yield, maturity date and amount fields; and three "IF_num_bonds_..." flags saying
    whether those counts agree. Issues with an irregular coupon type or without a dated date are
    skipped, and all of their new columns stay empty.

    Parameters
    ----------
    GPF : pandas.DataFrame
        Bond issues, one per row.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with the new columns appended. The input is not modified.
    """

    GPF = GPF.copy()

    added_columns = [
        'IF_has_maturity_date','IF_has_coupon_rate','IF_has_price_or_yield','IF_has_amount_by_maturity',
        'IF_has_tic','IF_has_nic','IF_has_gross_spread',
        'N_coupon_rate','N_price_or_yield','N_maturity_date','N_amount',
        'IF_num_bonds_all_consistent','IF_num_bonds_yield_mat_amt_consistent','IF_num_bonds_mat_amt_consistent',
    ]
    for column in added_columns:
        GPF[column] = None

    def _exists(value):
        # Not None, not NaN, and not the text "nan"/"None"
        return value!=None and str(value)!='nan' and str(value)!='None'

    def _exists_without_none_entry(value):
        # In addition, none of the tranche entries reads "None"
        return _exists(value) and 'None' not in str(value)

    def _exists_as_number(value):
        # In addition, the value is a number rather than text
        return _exists_without_none_entry(value) and (isinstance(value,int)|isinstance(value,float))

    def _n_entries(value):
        # Entries are separated by newlines, so a field without any separator counts as one entry
        return str(value).count('\n')+1

    # Handle case by case of each variable being missing, and within each case allow for multiple maturities
    for idx,row in GPF.iterrows():

        # Do nothing if the issue contains bonds with irregular type of coupon payments
        if row['IF_irregular_coupon_type']==True:
            continue

        # Do nothing if dated date is missing
        if str(row['dated_date']) in ('nan','NaT'):
            continue

        # Number of entries in certain fields
        n_coupon = _n_entries(row['coupon_rate'])
        n_yield = _n_entries(row['price_or_yield'])
        n_maturity = _n_entries(row['maturity_date'])
        n_amount = _n_entries(row['amount_by_maturity'])

        issue_summary = {
            # Whether certain fields exist
            'IF_has_maturity_date': _exists_without_none_entry(row['maturity_date']),
            'IF_has_coupon_rate': _exists(row['coupon_rate']),
            'IF_has_price_or_yield': _exists(row['price_or_yield']),
            'IF_has_amount_by_maturity': _exists_without_none_entry(row['amount_by_maturity']),
            'IF_has_tic': _exists_as_number(row['true_interest_cost']),
            'IF_has_nic': _exists_as_number(row['net_interest_cost']),
            'IF_has_gross_spread': _exists_as_number(row['gross_spread']),
            'N_coupon_rate': n_coupon,
            'N_price_or_yield': n_yield,
            'N_maturity_date': n_maturity,
            'N_amount': n_amount,
            'IF_num_bonds_all_consistent':
                (n_coupon==n_yield) and (n_coupon==n_maturity) and (n_coupon==n_amount),
            # The following indicator can be applied when coupon is not available
            'IF_num_bonds_yield_mat_amt_consistent':
                (n_yield==n_maturity) and (n_yield==n_amount),
            # The following indicator can be applied when coupon and yield/price is not available
            'IF_num_bonds_mat_amt_consistent':
                (n_maturity==n_amount),
        }
        for column,value in issue_summary.items():
            GPF.at[idx,column] = value

    return GPF


# Run the function on the first ten rows only, to learn the columns of its output. Dask is given an
# empty frame with these columns as the description of the result
meta_columns = proc_list(GPF[:10]).columns.tolist()
output_template = pd.DataFrame(columns=meta_columns)

# Split the data into 20 partitions and process them in 20 worker processes
GPF_dd = dd.from_pandas(GPF, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    GPF = GPF_dd.map_partitions(proc_list,meta=output_template).compute()


### 1.2.2 Calculate yield at issuance

In [None]:
%%time

def proc_list(GPF):
    """Calculate yield at issuance and average maturity for every bond issue in a chunk.

    Issues with an irregular coupon type or without a dated date are skipped. For the others, each
    entry of "price_or_yield" is read as a yield (in percent) if it is below 20 and as a price (per
    100 of par) if it is between 80 and 120. A price is converted into a yield by solving for the
    internal rate of return of semiannual coupon payments (one every 182 days of maturity) and
    annualizing it; this requires the coupon rate. Anything else is left undetermined.

    Results are written to "avg_yield" (weighted by amount times maturity), "avg_maturity" (in days,
    weighted by amount), "yield_by_maturity_list", "maturity_by_maturity_list" and
    "amount_by_maturity_list", and problems are flagged in "IF_price_or_yield_determined" and
    "IF_n_tranches_not_match".

    Parameters
    ----------
    GPF : pandas.DataFrame
        Bond issues carrying the "IF_has_...", "N_..." and "IF_num_bonds_..." columns that describe
        data availability, in addition to the raw fields and the output columns named above.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with the output columns filled in. The input is not modified.
    """

    GPF = GPF.copy()

    # Handle case by case of each variable being missing, and within each case allow for multiple maturities
    for idx,row in GPF.iterrows():

        # Do nothing if the issue contains bonds with irregular type of coupon payments
        if row['IF_irregular_coupon_type']==True:
            continue

        # Do nothing if dated date is missing
        if str(row['dated_date'])=='nan' or str(row['dated_date'])=='NaT':
            continue

        # Extract values that describe data availability for a particular bond issue

        IF_has_maturity_date = row['IF_has_maturity_date']
        IF_has_coupon_rate = row['IF_has_coupon_rate']
        IF_has_price_or_yield = row['IF_has_price_or_yield']
        IF_has_amount_by_maturity = row['IF_has_amount_by_maturity']

        N_coupon_rate = row['N_coupon_rate']
        N_price_or_yield = row['N_price_or_yield']
        N_maturity_date = row['N_maturity_date']

        IF_num_bonds_all_consistent = row['IF_num_bonds_all_consistent']
        IF_num_bonds_yield_mat_amt_consistent = row['IF_num_bonds_yield_mat_amt_consistent']


        ##########
        # Case 1 #
        ##########

        # Case 1: "coupon_rate","maturity_date","price_or_yield" are all available
        if IF_has_maturity_date and IF_has_coupon_rate and IF_has_price_or_yield and IF_has_amount_by_maturity \
            and IF_num_bonds_all_consistent:

            # Case 1A: If single maturity
            if N_maturity_date==1 :
                maturity = (row['maturity_date']-row['dated_date']).days
                GPF.loc[idx,'avg_maturity'] = maturity
                GPF.loc[idx,'maturity_by_maturity_list'] = [[maturity]]
                GPF.loc[idx,'amount_by_maturity_list'] = [[row['amount_by_maturity']]]
                # Convert once and use the number throughout, so that a value stored as text is handled
                # in the arithmetic below in the same way as in the comparisons
                price_or_yield = float(row['price_or_yield'])
                # Assume that if a number is more than 80 and less than 120, it is issuing price. If less than 20, it is issuing
                # yield. Otherwise, undetermined
                if price_or_yield<20:
                    GPF.loc[idx,'avg_yield'] = price_or_yield/100
                    GPF.loc[idx,'yield_by_maturity_list'] = [[GPF.loc[idx,'avg_yield']]]
                elif price_or_yield>80 and price_or_yield<120:
                    coupon_rate = float(row['coupon_rate'])
                    # Number of coupons to be paid
                    n_coupon = round(maturity/182)
                    # Cash flows per 100 of par: pay the price now, receive half the coupon every period and
                    # par with the last coupon. The resulting rate is per half year, hence the squaring
                    GPF.loc[idx,'avg_yield'] = \
                        (1+npf.irr([-price_or_yield]+[coupon_rate/2]*(n_coupon-1)+[100+coupon_rate/2]))\
                        **2-1
                    GPF.loc[idx,'yield_by_maturity_list'] = [[GPF.loc[idx,'avg_yield']]]
                else:
                    GPF.loc[idx,'IF_price_or_yield_determined'] = False

            # Case 1B: If multiple maturity
            else:
                # If number of tranches not consistent across fields, skip
                if N_price_or_yield!=N_coupon_rate:
                    GPF.loc[idx,'IF_n_tranches_not_match'] = True
                elif N_price_or_yield!=N_maturity_date:
                    GPF.loc[idx,'IF_n_tranches_not_match'] = True
                else:
                    maturity_dates = row['maturity_date'].split('\n')
                    coupon_rates = row['coupon_rate'].split('\n')
                    prices_or_yields = row['price_or_yield'].split('\n')
                    amounts_by_maturity = row['amount_by_maturity'].split('\n')
                    maturities = []
                    yields = []
                    amounts = []
                    for tranch in range(0,len(coupon_rates)):
                        maturity = (datetime.strptime(maturity_dates[tranch],"%m/%d/%y")\
                            -row['dated_date']).days
                        coupon_rate = float(coupon_rates[tranch])
                        price_or_yield = float(prices_or_yields[tranch])
                        n_coupon = round(maturity/182)
                        amount = float(amounts_by_maturity[tranch].replace(',',''))
                        maturities = maturities+[maturity]
                        amounts = amounts+[amount]
                        if price_or_yield<20:
                            yields = yields+[price_or_yield/100]
                        elif price_or_yield>80 and price_or_yield<120:
                            tranch_yield = (1+npf.irr([-price_or_yield]+[coupon_rate/2]*(n_coupon-1)+[100+coupon_rate/2]))**2-1
                            yields = yields+[tranch_yield]
                        else:
                            yields = yields+[None]
                            GPF.loc[idx,'IF_price_or_yield_determined'] = False
                    # Averages are calculated only if every tranche could be read as price or yield
                    if GPF.at[idx,'IF_price_or_yield_determined']!=False:
                        amountsXmaturities = np.multiply(np.array(amounts),np.array(maturities))
                        avg_yield = np.sum(np.multiply(np.array(yields),np.array(amountsXmaturities)))/np.sum(amountsXmaturities)
                        GPF.loc[idx,'avg_yield'] = avg_yield
                        GPF.loc[idx,'avg_maturity'] = np.dot(maturities,amounts)/np.sum(amounts)
                    GPF.at[idx,'maturity_by_maturity_list'] = maturities
                    GPF.at[idx,'yield_by_maturity_list'] = yields
                    GPF.at[idx,'amount_by_maturity_list'] = amounts


        ##########
        # Case 2 #
        ##########

        # Case 2: "coupon_rate" is not available, but "price_or_yield" and "maturity_date" is
        elif IF_has_maturity_date and (not IF_has_coupon_rate) and IF_has_price_or_yield and IF_has_amount_by_maturity \
            and IF_num_bonds_yield_mat_amt_consistent:

            # Case 2A: If single maturity
            if N_maturity_date==1:
                maturity = (row['maturity_date']-row['dated_date']).days
                GPF.loc[idx,'avg_maturity'] = maturity
                GPF.loc[idx,'maturity_by_maturity_list'] = [[maturity]]
                GPF.loc[idx,'amount_by_maturity_list'] = [[row['amount_by_maturity']]]
                price_or_yield = float(row['price_or_yield'])
                # Assume that if a number is less than 20, it is issuing yield.
                if price_or_yield<20:
                    GPF.loc[idx,'avg_yield'] = price_or_yield/100
                    GPF.loc[idx,'yield_by_maturity_list'] = [[GPF.loc[idx,'avg_yield']]]
                # Cannot do anything if coupon rate is unavailable and only price is given
                else:
                    continue

            # Case 2B: If multiple maturity
            else:
                # If number of tranches not consistent across fields, skip
                if N_price_or_yield!=N_maturity_date:
                    GPF.loc[idx,'IF_n_tranches_not_match'] = True
                else:
                    maturity_dates = row['maturity_date'].split('\n')
                    prices_or_yields = row['price_or_yield'].split('\n')
                    amounts_by_maturity = row['amount_by_maturity'].split('\n')
                    maturities = []
                    yields = []
                    amounts = []
                    for tranch in range(0,len(maturity_dates)):
                        maturity = (datetime.strptime(maturity_dates[tranch],"%m/%d/%y")-row['dated_date']).days
                        price_or_yield = float(prices_or_yields[tranch])
                        amount = float(amounts_by_maturity[tranch].replace(',',''))
                        maturities = maturities+[maturity]
                        amounts = amounts+[amount]
                        if price_or_yield<20:
                            yields = yields+[price_or_yield/100]
                        else:
                            # Note that it is impossible to calculate yield if only price is available for one tranch, or if I cannot
                            # decide whether it is price or yield
                            yields = yields+[None]
                    if len(yields)>0:
                        if None not in yields:
                            amountsXmaturities = np.multiply(np.array(amounts),np.array(maturities))
                            avg_yield = np.sum(np.multiply(np.array(yields),np.array(amountsXmaturities)))/np.sum(amountsXmaturities)
                            GPF.loc[idx,'avg_yield'] = avg_yield
                        else:
                            GPF.loc[idx,'avg_yield'] = None
                        GPF.loc[idx,'avg_maturity'] = np.dot(maturities,amounts)/np.sum(amounts)
                    GPF.at[idx,'maturity_by_maturity_list'] = maturities
                    GPF.at[idx,'yield_by_maturity_list'] = yields
                    GPF.at[idx,'amount_by_maturity_list'] = amounts

    return GPF

GPF = GPF.copy()

# Run the function on the first ten rows only, to learn the columns of its output. Dask is given an
# empty frame with these columns as the description of the result
meta_columns = proc_list(GPF[:10]).columns.tolist()
output_template = pd.DataFrame(columns=meta_columns)

# Split the data into 20 partitions and process them in 20 worker processes
GPF_dd = dd.from_pandas(GPF, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    GPF = GPF_dd.map_partitions(proc_list,meta=output_template).compute()


### 1.2.3 Calculate underwriting spread based on yield and TIC or NIC

Notes:
- With TIC and coupon rate, I can calculate the purchase price (i.e., proceeds of the issuer). Then with price, or yield which implies price, I can obtain the reoffering price. The difference is the stipulated underwriting spread.
- Without coupon rate, I cannot reliably stipulate underwriting spread.  
    - E.g., if I have both reoffering yield and TIC, then  
=> reoffering price = value of future coupon and principal payment, discounted by reoffering yield  
=> purchase price = value of future coupon and principal payment, discounted by TIC  
So how large the difference of the two is depends on the size of the coupon.  
Still, I can do some stipulation of the coupon rate based on similar bonds, and have a guess of the underwriting spread.  
    - If I have reoffering price and TIC, then, however,  
=> purchase price = value of future coupon and principal payment, discounted by TIC  
This will be much more sensitive to what I assume about coupon rate and hence not reliable.  
    - The benefit of these stipulations is very low, given that, in most cases, when TIC is available, coupon rate is already available. Hence, I do not do them.
- When TIC and NIC are both available, use TIC. If only NIC is available and coupon rate is available, I can calculate the underwriter's discount as  
NIC = (Total interest payment + Discount of reoffering price relative to par - Premium of reoffering price relative to par + underwriter's discount)/bond years of this issue,  
noting that every term above is in dollar amounts for the whole issue.
Then, the underwriter's discount can be converted into an amount per $100 of par value.  
I can also calculate a TIC based on that, which is to be compared with the NIC.

In [None]:
def proc_list(GPF):
    """Impute the underwriting (gross) spread of every bond issue from its TIC or, failing that, its NIC.

    Works on a copy of ``GPF`` (one row per bond issue) and adds three columns, initialised to None:

    - ``gross_spread_tic_based``: reoffering price minus the present value of the bond's coupons and
      principal discounted at the TIC, per 100 of par (Case 1, TIC available).
    - ``gross_spread_nic_based``: underwriter's discount backed out of the NIC formula, per 100 of par
      (Case 2, TIC missing and NIC available).
    - ``tic_nic_based``: the TIC implied by the NIC-based discount (Case 2 only).

    Conventions used throughout:

    - Coupons are semiannual; the number of coupons is round(days from dated date to maturity / 182).
    - A ``price_or_yield`` below 20 is read as a yield in percent, one strictly between 80 and 120 as a
      price per 100 of par; anything else is undetermined.
    - For issues with several tranches, ``coupon_rate``, ``price_or_yield``, ``maturity_date`` and
      ``amount_by_maturity`` are newline-separated strings with one entry per tranche.

    Rows with an irregular coupon type, a missing dated date, or a TIC/NIC above 20% are left untouched.

    Returns the processed copy.
    """

    GPF = GPF.copy()
    GPF['gross_spread_tic_based'] = None
    GPF['gross_spread_nic_based'] = None
    GPF['tic_nic_based'] = None

    # Handle case by case of each variable being missing, and within each case allow for multiple maturities
    for idx,row in GPF.iterrows():

        # Do nothing if the issue contains bonds with irregular type of coupon payments
        if row['IF_irregular_coupon_type']==True:
            continue
    
        # Do nothing if dated date is missing
        if str(row['dated_date'])=='nan' or str(row['dated_date'])=='NaT':
            continue
    
        # Extract values that describe data availability for a particular bond issue

        IF_has_maturity_date = row['IF_has_maturity_date']
        IF_has_coupon_rate = row['IF_has_coupon_rate']
        IF_has_price_or_yield = row['IF_has_price_or_yield']
        IF_has_amount_by_maturity = row['IF_has_amount_by_maturity']
        IF_has_tic = row['IF_has_tic']
        IF_has_nic = row['IF_has_nic']
        
        # Number of tranche entries found in each field (N_amount is read but not used below)
        N_coupon_rate = row['N_coupon_rate']
        N_price_or_yield = row['N_price_or_yield']
        N_maturity_date = row['N_maturity_date']
        N_amount = row['N_amount']
    
        # Consistency flags; only IF_num_bonds_all_consistent is used below
        IF_num_bonds_all_consistent = row['IF_num_bonds_all_consistent']
        IF_num_bonds_yield_mat_amt_consistent = row['IF_num_bonds_yield_mat_amt_consistent']
        IF_num_bonds_mat_amt_consistent = row['IF_num_bonds_mat_amt_consistent']
    
        # Extract TIC, one number for each bond issue
        # tic is the annual rate as a fraction; tic_half is the equivalent semiannual rate used to discount coupons
        tic = None
        tic_half = None
        if isinstance(row['true_interest_cost'],float) or isinstance(row['true_interest_cost'],int):
            tic = row['true_interest_cost']/100
            tic_half = sqrt(1+tic)-1

        # Extract NIC, one number for each bond issue
        nic = None
        if isinstance(row['net_interest_cost'],float) or isinstance(row['net_interest_cost'],int):
            nic = row['net_interest_cost']/100

        # Note that function "npf.npv" start with a period 0 cash flow
        # (hence the leading 0 in every cash-flow list passed to npf.npv below)


        ##########
        # Case 1 #
        ##########
    
        # Case 1: "coupon_rate","maturity_date","price_or_yield", "true_interest_cost" are all available
        if IF_has_maturity_date and IF_has_coupon_rate and IF_has_price_or_yield and IF_has_amount_by_maturity and IF_has_tic \
            and IF_num_bonds_all_consistent:

            # Skip implausibly high TIC (above 20%)
            # NOTE(review): if IF_has_tic is True but true_interest_cost is not a Python float/int, tic is still
            # None here and this comparison raises TypeError - confirm upstream cleaning guarantees the type.
            if tic>0.2:
                continue
    
            # Case 1A: If single maturity
            if N_maturity_date==1:
                maturity = (row['maturity_date']-row['dated_date']).days
                n_coupon = round(maturity/182)
                # Purchase price per 100 par: coupons and principal discounted at the semiannual TIC
                npv_by_tic = npf.npv(tic_half,[0]+[row['coupon_rate']/2]*(n_coupon-1)+[100+row['coupon_rate']/2])
                # Assume that if a number is more than 80 and less than 120, it is issuing price. If less than 20, it is issuing
                # yield. Otherwise, undetermined (nothing is recorded for this issue)
                if float(row['price_or_yield'])<20:
                    reoffering_yield = row['price_or_yield']/100
                    npv_by_reoffering_yield = npf.npv(sqrt(1+reoffering_yield)-1,[0]+[row['coupon_rate']/2]*(n_coupon-1)+\
                        [100+row['coupon_rate']/2])
                    GPF.loc[idx,'gross_spread_tic_based'] = npv_by_reoffering_yield-npv_by_tic
                elif float(row['price_or_yield'])>80 and float(row['price_or_yield'])<120:
                    npv_by_reoffering_yield = row['price_or_yield']
                    GPF.loc[idx,'gross_spread_tic_based'] = npv_by_reoffering_yield-npv_by_tic
    
            # Case 1B: If multiple maturity
            else:
                # If number of tranches not consistent across fields, skip
                if N_price_or_yield!=N_coupon_rate:
                    continue
                elif N_price_or_yield!=N_maturity_date:
                    continue
                else:
                    # Per-tranche purchase prices, reoffering prices (both per 100 par) and par amounts
                    npv_by_tics = []
                    npv_by_reoffering_yields = []
                    amounts = []
                    for tranch in range(0,row['coupon_rate'].count('\n')+1):
                        maturity = (datetime.strptime(row['maturity_date'].split('\n')[tranch],"%m/%d/%y")\
                            -row['dated_date']).days
                        coupon_rate = float(row['coupon_rate'].split('\n')[tranch])
                        price_or_yield = float(row['price_or_yield'].split('\n')[tranch])
                        n_coupon = round(maturity/182)
                        amount = float(row['amount_by_maturity'].split('\n')[tranch].replace(',',''))
                        amounts = amounts+[amount]
                        npv_by_tic = npf.npv(tic_half,[0]+[coupon_rate/2]*(n_coupon-1)+[100+coupon_rate/2])
                        npv_by_tics = npv_by_tics+[npv_by_tic]
                        if price_or_yield<20:
                            reoffering_yield = price_or_yield/100
                            npv_by_reoffering_yield = npf.npv(sqrt(1+reoffering_yield)-1,[0]+[coupon_rate/2]*(n_coupon-1)+\
                                [100+coupon_rate/2])
                            npv_by_reoffering_yields = npv_by_reoffering_yields+[npv_by_reoffering_yield]
                        elif price_or_yield>80 and price_or_yield<120:
                            npv_by_reoffering_yield = price_or_yield
                            npv_by_reoffering_yields = npv_by_reoffering_yields+[npv_by_reoffering_yield]
                        else:
                            # Neither a plausible yield nor a plausible price: flag the issue and mark the tranche
                            npv_by_reoffering_yields = npv_by_reoffering_yields+[None]
                            GPF.loc[idx,'IF_price_or_yield_determined'] = False

                # Average the gross spread across tranches
                # (only reached from the consistent branch above, since the other two branches "continue")
                if None in npv_by_reoffering_yields:
                    continue
                else:
                    # In the following averaging, as underwriter's discount is a one-time cost, averaging does not use maturity
                    # as part of the weight
                    GPF.loc[idx,'gross_spread_tic_based'] = \
                        np.dot(np.array(npv_by_reoffering_yields)-np.array(npv_by_tics),np.array(amounts))/ \
                        np.sum(np.array(amounts))

    
        ##########
        # Case 2 #
        ##########
    
        # Case 2: "coupon_rate","maturity_date","price_or_yield" are all available. "true_interest_cost" is not available, while
        # "net_interest_cost" is available

        # The calculation below is based on dollar amounts of the whole bond issue, rather than every $100 par value
        
        if IF_has_maturity_date and IF_has_coupon_rate and IF_has_price_or_yield and IF_has_amount_by_maturity \
            and (not IF_has_tic) and IF_has_nic \
            and IF_num_bonds_all_consistent:

            # Skip implausibly high NIC (above 20%)
            # NOTE(review): raises TypeError if nic is None while IF_has_nic is True - confirm upstream typing.
            if nic>0.2:
                continue

            # Case 2A: If single maturity
            if N_maturity_date==1:
    
                maturity = (row['maturity_date']-row['dated_date']).days
                n_coupon = round(maturity/182)
    
                # Obtain the discount of reoffering price relative to par value
                # Assume that if a number is more than 80 and less than 120, it is issuing price. If less than 20, it is issuing
                # yield. Otherwise, undetermined
                if float(row['price_or_yield'])<20:
                    reoffering_yield = row['price_or_yield']/100
                    npv_by_reoffering_yield = npf.npv(sqrt(1+reoffering_yield)-1,
                        [0]+[row['amount']*row['coupon_rate']/100/2]*(n_coupon-1)+[row['amount']+row['amount']*row['coupon_rate']/100/2])
                elif float(row['price_or_yield'])>80 and float(row['price_or_yield'])<120:
                    npv_by_reoffering_yield = row['price_or_yield']/100*row['amount']
                else:
                    continue
    
                # Calculate underwriter discount implied by NIC
                # Discount of reoffering price relative to par value
                # (computed as reoffering value minus par, so it is positive when the bond is sold at a premium)
                discount_reoff_to_par = npv_by_reoffering_yield-row['amount']
                total_interest = row['coupon_rate']/100/2*row['amount']*n_coupon
                # The sum of the product of each year's maturity value and the number of years to its maturity
                bondyear = row['amount']*(n_coupon/2)
                underwriter_discount = nic*bondyear-total_interest+discount_reoff_to_par
                underwriter_discount_100par = underwriter_discount/row['amount']*100
    
                # Calculate TIC implied by NIC
                # Period-0 flow is minus the issuer's proceeds (par + premium - underwriter discount); the semiannual
                # IRR is then annualised
                tic_nic_based = (1+npf.irr([-(row['amount']+discount_reoff_to_par-underwriter_discount)]+\
                    [row['coupon_rate']/2/100*row['amount']]*(n_coupon-1)+[row['amount']+row['coupon_rate']/2/100*row['amount']]))\
                    **2-1
    
                GPF.loc[idx,'gross_spread_nic_based'] = underwriter_discount_100par
                GPF.loc[idx,'tic_nic_based'] = tic_nic_based
    
    
            # Case 2B: If multiple maturity
            else:
                # If number of tranches not consistent across fields, skip
                if N_price_or_yield!=N_coupon_rate:
                    continue
                elif N_price_or_yield!=N_maturity_date:
                    continue
                else:
                    bondyears = []
                    total_interests = []
                    discount_reoff_to_pars = []
                    cash_flows = [] # Cash flow of the whole bond, not every $100 par value
                    amounts = []
                    for tranch in range(0,row['coupon_rate'].count('\n')+1):
                        maturity = (datetime.strptime(row['maturity_date'].split('\n')[tranch],"%m/%d/%y")\
                            -row['dated_date']).days
                        coupon_rate = float(row['coupon_rate'].split('\n')[tranch])
                        price_or_yield = float(row['price_or_yield'].split('\n')[tranch])
                        n_coupon = round(maturity/182)
                        amount = float(row['amount_by_maturity'].split('\n')[tranch].replace(',',''))
                        amounts = amounts+[amount]
                        cash_flows = cash_flows+[[0]+[coupon_rate/2/100*amount]*(n_coupon-1)+[amount+coupon_rate/2/100*amount]]
                        if price_or_yield<20:
                            reoffering_yield = price_or_yield/100
                            # Below is a value per every $100 par value, which needs to be converted to the value of the whole bond
                            npv_by_reoffering_yield = npf.npv(sqrt(1+reoffering_yield)-1,[0]+[coupon_rate/2]*(n_coupon-1)+[100+coupon_rate/2])
                            discount_reoff_to_par = npv_by_reoffering_yield/100*amount-amount
                            total_interest = coupon_rate/2/100*amount*n_coupon
                            bondyear = amount*(n_coupon/2)
                            bondyears = bondyears+[bondyear]
                            total_interests = total_interests+[total_interest]
                            discount_reoff_to_pars = discount_reoff_to_pars+[discount_reoff_to_par]
                        elif price_or_yield>80 and price_or_yield<120:
                            npv_by_reoffering_yield = price_or_yield
                            discount_reoff_to_par = npv_by_reoffering_yield/100*amount-amount
                            total_interest = coupon_rate/2/100*amount*n_coupon
                            bondyear = amount*(n_coupon/2)
                            bondyears = bondyears+[bondyear]
                            total_interests = total_interests+[total_interest]
                            discount_reoff_to_pars = discount_reoff_to_pars+[discount_reoff_to_par]
                        else:
                            # Neither a plausible yield nor a plausible price: flag the issue and mark the tranche
                            bondyears = bondyears+[None]
                            total_interests = total_interests+[None]
                            discount_reoff_to_pars = discount_reoff_to_pars+[None]
                            GPF.loc[idx,'IF_price_or_yield_determined'] = False
    
                # Aggregate across tranches and calculate the NIC implied underwriter discount and TIC
                if None in bondyears:
                    continue
                else:
                    # underwriter_discount of the whole issue, not in every $100 par value
                    underwriter_discount = nic*np.sum(bondyears)-np.sum(total_interests)+np.sum(discount_reoff_to_pars)
                    underwriter_discount_100par = underwriter_discount/np.sum(amounts)*100
                    GPF.loc[idx,'gross_spread_nic_based'] = underwriter_discount_100par
    
                    # Sum the tranche cash flows period by period (shorter tranches padded with zeros), put minus
                    # the issuer's proceeds in period 0, and annualise the semiannual IRR
                    max_length = max(len(lst) for lst in cash_flows)
                    padded_lists = [lst+[0]*(max_length-len(lst)) for lst in cash_flows]
                    cash_flow = [sum(elements) for elements in zip(*padded_lists)]
                    cash_flow[0] = cash_flow[0]-(np.sum(amounts)+np.sum(discount_reoff_to_pars)-underwriter_discount)
                    tic_nic_based = (1+npf.irr(cash_flow))**2-1
                    GPF.loc[idx,'tic_nic_based'] = tic_nic_based

    return GPF

GPF = GPF.copy()
# Run proc_list on a 10-row sample only to learn the output column names, which dask needs as "meta"
meta_columns = list(proc_list(GPF.sample(10)).columns)
# Process the full frame in 20 partitions on 20 worker processes and gather the result back into pandas
GPF_dd = dd.from_pandas(GPF, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    GPF = GPF_dd.map_partitions(proc_list,meta=pd.DataFrame(columns=meta_columns)).compute()


### 1.2.4 Calculate spread relative to treasury

#### 1.2.4.1 Calculate yield of synthetic risk-free treasury bond

Notes:
- Without the coupon rate, it is impossible to calculate the price (yield) of the synthetic treasury bond. For example, consider Bond A: Pays \\$5 one year from now, and \\$5 two years from now, and Bond B: Pays \\$100*(1+5%)^2 two years from now. These two have the same yield. Suppose that the treasury yield in one year is 0% and in two years is 20%. Then the price of the first synthetic bond is higher than that of the second, and the yield of the first synthetic bond is lower than that of the second. In other words, without the coupon rate, I do not know when the cash flows are going to come, so I do not know which component of the risk-free rate I should tease out of the municipal bond yield. An INACCURATE approximation is simply to use the yield of the municipal bond minus that of the treasury bond, but it is erroneous to do so.
- Luckily except for later parts of the sample, coupon rate is usually available.


In [None]:
def _sync_treasury_cash_flows(coupon_rate, maturity_days, zero_yields):
    """Cash flows, per 100 of par, of a synthetic Treasury bond mimicking one municipal bond.

    Parameters
    ----------
    coupon_rate : float
        Annual coupon rate in percent; half of it is paid every six months.
    maturity_days : int or float
        Number of days between the dated date and the maturity date.
    zero_yields : sequence of float
        Treasury zero-coupon yields in percent, with ``zero_yields[m-1]`` the m-year yield (m = 1..30).

    Returns
    -------
    list
        ``[-price, cf_1, ..., cf_N]`` where ``cf_k`` is the payment after ``k`` half-years and ``price`` is
        the sum of the payments discounted with the zero-coupon curve.
    """
    N_coupons = int(np.max([1,np.around(maturity_days/(365/2))]))

    # Semiannual coupons, with principal repaid together with the last coupon
    cf = [coupon_rate/2]*N_coupons
    cf[N_coupons-1] = cf[N_coupons-1]+100

    # Zero-coupon yield applicable to each payment date
    payment_yields = []
    for cf_idx in range(0,N_coupons):
        if cf_idx==0:
            # Payment after half a year: the curve has no 6-month point, so the 1-year yield is used
            payment_yields.append(zero_yields[0])
        elif cf_idx%2==1:
            # Payment after a whole number of years: use that year's yield
            payment_yields.append(zero_yields[ceil(cf_idx/2)-1])
        else:
            # Payment after k+0.5 years: average the k-year and (k+1)-year yields
            payment_yields.append((zero_yields[ceil(cf_idx/2)-1]+zero_yields[ceil(cf_idx/2)])/2)

    # NOTE(review): the feds200628 documentation describes the SVENY series as continuously compounded
    # yields, whereas the discounting here compounds them annually - confirm the approximation is intended.
    discount_factor = [(1/(1+payment_yields[disc_idx]/100))**((disc_idx+1)/2) for disc_idx in range(0,N_coupons)]

    sync_bond_price = np.sum(np.dot(cf,discount_factor))
    return [-sync_bond_price]+cf


def proc_list(GPF):
    """Attach the yield of a synthetic, maturity- and coupon-matched Treasury bond to each bond issue.

    Works on a copy of ``GPF`` and adds two columns, initialised to None:

    - ``sync_treasury_bond_yield_by_maturity_list``: one synthetic yield per tranche (None for a tranche whose
      ``price_or_yield`` lies strictly between 20 and 80, or whose maturity is missing or above 30 years).
    - ``sync_treasury_bond_avg_yield``: for a single-maturity issue, that tranche's yield; for a multi-maturity
      issue, the annualised IRR of the summed tranche cash flows, set only when every tranche has a yield.

    The Treasury zero-coupon curve is read from ``../RawData/FedBOG/feds200628.csv`` and the curve on the date
    closest to the issue's dated date is used (if two curve dates are equally close, the earlier one).

    Rows are skipped when the dated date is missing or before 1985-11-25 (the start of the period for which
    the code treats 30 years of curve as available), when the issue has an irregular coupon type, when coupon
    rate / maturity date / price or yield / amount by maturity is unavailable, or when the number of tranches
    is not consistent across fields.

    Returns the processed copy.
    """

    GPF = GPF.copy()

    # Treasury yield
    feds200628 = pd.read_csv("../RawData/FedBOG/feds200628.csv", header=9)
    feds200628 = feds200628[~pd.isnull(feds200628['SVENY01'])]
    columns = ['Date']+ \
        ['SVENY0'+str(i) for i in range(1,10)]+ \
        ['SVENY'+str(i) for i in range(10,31)]
    feds200628 = feds200628[columns]
    # Drop the zero padding (SVENY01 -> SVENY1) so that the column name maps directly to the maturity in years
    sveny_columns = ['SVENY'+str(i) for i in range(1,31)]
    feds200628.columns = ['Date']+sveny_columns
    feds200628['Date'] = pd.to_datetime(feds200628['Date'])
    # Dates parsed into the future (after 2050) are shifted back by a century
    threshold_date = pd.to_datetime('2050-01-01')
    feds200628['Date'] = feds200628['Date'].apply(lambda x: x - pd.DateOffset(years=100) if x > threshold_date else x)

    # Loop invariants: curve dates, and a (n_dates x 30) array whose column m-1 holds the m-year yield
    curve_dates = feds200628['Date']
    curve_yields = feds200628[sveny_columns].to_numpy()

    # First date from which the code treats the full 30-year curve as available
    first_30y_curve_date = pd.Timestamp(1985,11,25,0,0,0)

    # Initialize yield of synthetic bond
    GPF['sync_treasury_bond_yield_by_maturity_list'] = None
    GPF['sync_treasury_bond_avg_yield'] = None

    for idx,row in GPF.iterrows():

        dated_date = row['dated_date']

        # Do nothing if dated date is missing
        if str(dated_date)=='nan' or str(dated_date)=='NaT':
            continue

        # For all bond issues before 30 years length of treasury yield is available, do not construct an equivalent treasury security
        if dated_date<first_30y_curve_date:
            continue

        # Do nothing if the issue contains bonds with irregular type of coupon payments
        if row['IF_irregular_coupon_type']==True:
            continue

        # All of coupon rate, maturity date, price or yield and amount by maturity must be available
        if not (row['IF_has_coupon_rate'] and row['IF_has_maturity_date'] and
                row['IF_has_price_or_yield'] and row['IF_has_amount_by_maturity']):
            continue

        # Both the single- and the multi-maturity calculation require consistent tranche counts
        if not row['IF_num_bonds_all_consistent']:
            continue

        N_coupon_rate = row['N_coupon_rate']

        # Obtain the treasury zero-coupon yield curve at the closest date. Taking the position of the smallest
        # absolute date difference avoids copying and sorting the whole curve table for every issue.
        closest = (curve_dates-dated_date).abs().to_numpy().argmin()
        zero_yields = curve_yields[closest]

        # If single maturity
        if N_coupon_rate==1:

            # A number strictly between 20 and 80 is neither a plausible yield nor a plausible price
            price_or_yield = float(row['price_or_yield'])
            if price_or_yield>20 and price_or_yield<80:
                continue

            coupon_rate = float(row['coupon_rate'])
            maturity = (row['maturity_date']-dated_date).days

            # The curve stops at 30 years
            if maturity>30*365:
                continue

            # Semiannual IRR of the synthetic bond, annualised
            cf = _sync_treasury_cash_flows(coupon_rate,maturity,zero_yields)
            sync_bond_yield = (1+npf.irr(cf))**2-1

            # Record data
            GPF.at[idx,'sync_treasury_bond_yield_by_maturity_list'] = [sync_bond_yield]
            GPF.at[idx,'sync_treasury_bond_avg_yield'] = sync_bond_yield

        # If multiple maturity, go over bond by bond
        elif N_coupon_rate>1:

            # Tranche fields are newline-separated strings; split them once rather than once per tranche
            prices_or_yields = row['price_or_yield'].split('\n')
            coupon_rates = row['coupon_rate'].split('\n')
            maturities = row['maturity_by_maturity_list']

            sync_bond_yield_by_maturity = []
            cfs = []

            for bond_idx in range(0,row['N_maturity_date']):

                price_or_yield = float(prices_or_yields[bond_idx])
                if price_or_yield>20 and price_or_yield<80:
                    sync_bond_yield_by_maturity.append(None)
                    continue

                coupon_rate = float(coupon_rates[bond_idx])
                maturity = maturities[bond_idx]

                # No synthetic bond when the maturity is unknown or beyond the end of the curve
                if maturity is None or maturity>30*365:
                    sync_bond_yield_by_maturity.append(None)
                    continue

                cf = _sync_treasury_cash_flows(coupon_rate,maturity,zero_yields)
                cfs.append(cf)
                sync_bond_yield_by_maturity.append((1+npf.irr(cf))**2-1)

            GPF.at[idx,'sync_treasury_bond_yield_by_maturity_list'] = sync_bond_yield_by_maturity

            # Issue-level yield: IRR of the tranche cash flows summed period by period (shorter tranches padded
            # with zeros), only when every tranche has a synthetic bond.
            # NOTE(review): each tranche enters per 100 of par, i.e. equally weighted; amount_by_maturity is
            # not used here - confirm that is intended.
            if len(cfs)>0 and None not in sync_bond_yield_by_maturity:
                max_length = max(len(lst) for lst in cfs)
                padded_lists = [lst+[0]*(max_length-len(lst)) for lst in cfs]
                cf = [sum(elements) for elements in zip(*padded_lists)]
                GPF.at[idx,'sync_treasury_bond_avg_yield'] = (1+npf.irr(cf))**2-1

    return GPF

# Run proc_list on the first 10 rows only to learn the output column names, which dask needs as "meta"
meta_columns = list(proc_list(GPF[:10]).columns)
# Process the full frame in 40 partitions on 40 worker processes and gather the result back into pandas
GPF_dd = dd.from_pandas(GPF, npartitions=40)
with dask.config.set(scheduler='processes',num_workers=40):
    GPF = GPF_dd.map_partitions(proc_list,meta=pd.DataFrame(columns=meta_columns)).compute()


#### 1.2.4.2 Calculate spread

In [None]:
# Calculate spread. Note that I calculate spread a bit differently from Li and Zhu: Theirs is from the perspective of a taxed
# individual, while mine is from the perspective of a non-taxed individual

# Tax rate (as a fraction) for each calendar year 1967-2023; it is looked up by the issue's sale year when
# yields are put on an after-tax basis in the spread calculation.
# NOTE(review): the values look like the top federal marginal individual income tax rate - confirm the source.
tax_rate = {
    1967:0.700,1968:0.700,1969:0.700,1970:0.700,1971:0.700,1972:0.700,1973:0.700,
    1974:0.700,1975:0.700,1976:0.700,1977:0.700,1978:0.700,1979:0.700,1980:0.700,
    1981:0.700,1982:0.500,1983:0.500,1984:0.500,1985:0.500,1986:0.500,1987:0.385,
    1988:0.280,1989:0.280,1990:0.280,1991:0.310,1992:0.310,1993:0.396,1994:0.396,
    1995:0.396,1996:0.396,1997:0.396,1998:0.396,1999:0.396,2000:0.396,2001:0.391,
    2002:0.386,2003:0.350,2004:0.350,2005:0.350,2006:0.350,2007:0.350,2008:0.350,
    2009:0.350,2010:0.350,2011:0.350,2012:0.350,2013:0.396,2014:0.396,2015:0.396,
    2016:0.396,2017:0.396,2018:0.370,2019:0.370,2020:0.370,2021:0.370,2022:0.370,
    2023:0.370,
    }

# Give GPF a fresh 0..n-1 index and create the two output columns, empty (None) for every issue
GPF = GPF.reset_index(drop=True)
GPF['treasury_spread_by_maturity_list'] = None
GPF['treasury_avg_spread'] = None

def proc_list(GPF):
    """Compute each issue's tax-adjusted spread over its synthetic Treasury bond.

    Works on a copy of ``GPF`` (the input frame is not modified) and fills two columns, which are created
    with None if they do not exist yet:

    - ``treasury_spread_by_maturity_list``: one spread per tranche, or None for a tranche whose municipal or
      synthetic Treasury yield is missing or whose ``taxable_code`` is not one of 'E', 'A', 'T'.
    - ``treasury_avg_spread``: average of the tranche spreads weighted by amount x maturity, set only when
      every tranche has a spread and the weights do not sum to zero.

    With t the tax rate of the sale year (module-level ``tax_rate``):

    - taxable_code 'E':        spread = muni yield - treasury yield * (1 - t)
    - taxable_code 'A' or 'T': spread = muni yield * (1 - t) - treasury yield * (1 - t)

    Raises KeyError if a spread has to be computed for a ``sale_year`` that is not in ``tax_rate``.

    Returns the processed copy.
    """

    # Work on a copy so the caller's frame (or a slice of it) is never mutated
    GPF = GPF.copy()
    for column in ('treasury_spread_by_maturity_list','treasury_avg_spread'):
        if column not in GPF.columns:
            GPF[column] = None

    for idx,row in GPF.iterrows():

        sync_yields = row['sync_treasury_bond_yield_by_maturity_list']
        bond_yields = row['yield_by_maturity_list']

        # Nothing to compute unless both the synthetic Treasury yields and the municipal yields exist
        if sync_yields is None or bond_yields is None:
            continue

        taxable_code = row['taxable_code']
        spread_by_maturity = []
        for bond_idx in range(0,len(sync_yields)):
            sync_yield = sync_yields[bond_idx]
            bond_yield = bond_yields[bond_idx]
            if sync_yield is None or bond_yield is None:
                spread_by_maturity.append(None)
            # Adjust for tax here
            elif taxable_code=='E':
                spread_by_maturity.append(
                    bond_yield
                    -sync_yield*(1-tax_rate[row['sale_year']]))
            # This used to read "=='A' or 'T'", which is always true and therefore applied the taxable
            # formula to every code other than 'E'
            elif taxable_code in ('A','T'):
                spread_by_maturity.append(
                    bond_yield*(1-tax_rate[row['sale_year']])
                    -sync_yield*(1-tax_rate[row['sale_year']]))
            else:
                # Unknown or missing tax status: keep the list aligned with the tranches
                spread_by_maturity.append(None)
        GPF.at[idx,'treasury_spread_by_maturity_list'] = spread_by_maturity

        # Issue-level average only when every tranche has a spread
        if None in spread_by_maturity:
            continue
        amountsXmaturities = np.multiply(np.array(row['amount_by_maturity_list']),np.array(row['maturity_by_maturity_list']))
        if np.sum(amountsXmaturities)==0:
            continue
        avg_spread = np.sum(np.multiply(np.array(spread_by_maturity),np.array(amountsXmaturities)))/np.sum(amountsXmaturities)
        GPF.at[idx,'treasury_avg_spread'] = avg_spread

    return GPF

# Run proc_list on the first 10 rows only to learn the output column names, which dask needs as "meta"
meta_columns = list(proc_list(GPF[:10]).columns)
# Process the full frame in 40 partitions on 40 worker processes and gather the result back into pandas
GPF_dd = dd.from_pandas(GPF, npartitions=40)
with dask.config.set(scheduler='processes',num_workers=40):
    GPF = GPF_dd.map_partitions(proc_list,meta=pd.DataFrame(columns=meta_columns)).compute()


### 1.2.5 Calculate spread relative to MMA AAA curve

Following Goldsmith-Pinkham et al. and Liang. Note that, unlike when constructing the synthetic treasury bond, I do not consider coupon rates below; instead, I match directly on maturity and take the difference between the reoffering yield and the matched point on the AAA curve on the **sale date**.

In [None]:
def _mma_yield_at(maturity_days, curve_maturity_days, curve_yields):
    """Return the curve yield whose maturity (in days) is closest to ``maturity_days``.

    ``curve_maturity_days`` and ``curve_yields`` are aligned arrays for a single curve date. If two curve
    points are equally close, the first one in the arrays is used.
    """
    return curve_yields[np.absolute(maturity_days-curve_maturity_days).argmin()]


def proc_list(GPF, MMAICurve=None):
    """Match every tranche to the MMA AAA curve on the sale date and compute its spread to the curve.

    Parameters
    ----------
    GPF : pandas.DataFrame
        One row per bond issue. Reads ``N_coupon_rate``, ``N_maturity_date``, ``sale_date`` and the per-tranche
        lists ``yield_by_maturity_list``, ``maturity_by_maturity_list`` (days) and ``amount_by_maturity_list``.
        The frame itself is not modified.
    MMAICurve : pandas.DataFrame, optional
        Curve with columns ``Date``, ``Maturity`` (years) and ``Mid Price`` (yield in percent). When omitted it
        is loaded from ``../RawData/Bloomberg/MMAI.xlsx`` (sheets ``Sheet1`` .. ``Sheet30``, one per maturity),
        as before; passing it lets a caller load the workbook once instead of once per call.

    Returns
    -------
    pandas.DataFrame
        Copy of ``GPF`` with four added columns (None where not computed):
        ``MMA_yield_by_maturity_list``, ``MMA_spread_by_maturity_list`` (tranche yield minus matched curve
        yield; None for tranches with missing maturity, maturity of 30.5 years or more, or missing yield),
        ``MMA_avg_yield`` and ``MMA_avg_spread`` (weighted by amount x maturity; for multi-maturity issues they
        are set only when every tranche has a spread).
    """

    # Work on a copy so the caller's frame (or a dask partition) is never mutated
    GPF = GPF.copy()

    if MMAICurve is None:
        curve_sheets = []
        for maturity in range(1,31):
            MMAI = pd.read_excel("../RawData/Bloomberg/MMAI.xlsx",sheet_name='Sheet'+str(maturity))
            MMAI['Maturity'] = maturity
            curve_sheets.append(MMAI)
        MMAICurve = pd.concat(curve_sheets)
        MMAICurve = MMAICurve.sort_values(['Date','Maturity']).reset_index(drop=True)

    # Initialize the matched curve yields and spreads
    GPF['MMA_yield_by_maturity_list'] = None
    GPF['MMA_spread_by_maturity_list'] = None
    GPF['MMA_avg_yield'] = None
    GPF['MMA_avg_spread'] = None

    # Maturities at or beyond this many days have no corresponding point on the 1-30 year curve
    max_maturity_days = 365*30.5

    for idx,row in GPF.iterrows():

        N_coupon_rate = row['N_coupon_rate']
        bond_yields = row['yield_by_maturity_list']
        maturities = row['maturity_by_maturity_list']
        amounts = row['amount_by_maturity_list']

        if bond_yields is None or maturities is None or amounts is None:
            continue
        if row['sale_date'] is None:
            continue

        # Obtain the MMA yield curve at the sale date
        MMAICurve_oneday = MMAICurve[MMAICurve['Date']==row['sale_date']]
        if len(MMAICurve_oneday)==0:
            continue
        # Aligned arrays for that day, so each tranche needs only an argmin instead of a full re-sort
        curve_maturity_days = MMAICurve_oneday['Maturity'].to_numpy()*365
        curve_yields = MMAICurve_oneday['Mid Price'].to_numpy()/100

        # If single maturity
        if N_coupon_rate==1:

            maturity = maturities[0]
            if maturity is None:
                continue
            if maturity>=max_maturity_days:
                continue

            MMA_yield = _mma_yield_at(maturity,curve_maturity_days,curve_yields)
            if bond_yields[0] is not None:
                MMA_spread = bond_yields[0]-MMA_yield
            else:
                MMA_spread = None

            GPF.at[idx,'MMA_yield_by_maturity_list'] = [MMA_yield]
            GPF.at[idx,'MMA_spread_by_maturity_list'] = [MMA_spread]
            GPF.at[idx,'MMA_avg_yield'] = MMA_yield
            GPF.at[idx,'MMA_avg_spread'] = MMA_spread

        # If multiple maturity, go over bond by bond
        elif N_coupon_rate>1:

            MMA_yield_by_maturity = []
            MMA_spread_by_maturity = []

            for bond_idx in range(0,row['N_maturity_date']):

                maturity = maturities[bond_idx]
                if maturity is None or maturity>=max_maturity_days:
                    MMA_yield_by_maturity.append(None)
                    MMA_spread_by_maturity.append(None)
                    continue

                MMA_yield = _mma_yield_at(maturity,curve_maturity_days,curve_yields)
                MMA_yield_by_maturity.append(MMA_yield)
                if bond_yields[bond_idx] is not None:
                    MMA_spread_by_maturity.append(bond_yields[bond_idx]-MMA_yield)
                else:
                    MMA_spread_by_maturity.append(None)

            GPF.at[idx,'MMA_yield_by_maturity_list'] = MMA_yield_by_maturity
            GPF.at[idx,'MMA_spread_by_maturity_list'] = MMA_spread_by_maturity

            # Check for missing tranches BEFORE building the weights: a None maturity cannot be multiplied
            # by an amount, and previously made np.multiply raise for exactly the tranches handled above
            if None in MMA_spread_by_maturity:
                GPF.at[idx,'MMA_avg_spread'] = None
                continue
            amountsXmaturities = np.multiply(np.array(amounts),np.array(maturities))
            if np.sum(amountsXmaturities)==0:
                continue
            avg_yield = np.sum(np.multiply(np.array(MMA_yield_by_maturity),np.array(amountsXmaturities)))/np.sum(amountsXmaturities)
            avg_spread = np.sum(np.multiply(np.array(MMA_spread_by_maturity),np.array(amountsXmaturities)))/np.sum(amountsXmaturities)
            GPF.at[idx,'MMA_avg_yield'] = avg_yield
            GPF.at[idx,'MMA_avg_spread'] = avg_spread

    return GPF

# Run proc_list on a 1000-row sample only to learn the output column names, which dask needs as "meta"
meta_columns = list(proc_list(GPF.sample(1000)).columns)
# Process the full frame in 40 partitions on 40 worker processes and gather the result back into pandas
GPF_dd = dd.from_pandas(GPF, npartitions=40)
with dask.config.set(scheduler='processes',num_workers=40):
    GPF = GPF_dd.map_partitions(proc_list,meta=pd.DataFrame(columns=meta_columns)).compute()


## 1.3 Export a central "GPF" dataset

In [None]:
# Reorder columns: dates, interest costs and the yield/spread fields lead; every other column follows in
# alphabetical order. The reordered frame is then written out as the central GPF dataset.
first_columns = [
    'dated_date','net_interest_cost','true_interest_cost',
    'maturity_date','coupon_rate','price_or_yield','amount',
    'maturity_by_maturity_list','amount_by_maturity_list',
    'yield_by_maturity_list',
    'sync_treasury_bond_yield_by_maturity_list',
    'treasury_spread_by_maturity_list',
    'MMA_yield_by_maturity_list','MMA_spread_by_maturity_list',
    'avg_yield','avg_maturity',
    'sync_treasury_bond_avg_yield','MMA_avg_yield',
    'treasury_avg_spread','MMA_avg_spread',
    'gross_spread','gross_spread_tic_based','gross_spread_nic_based','tic_nic_based',
    'IF_price_or_yield_determined','IF_irregular_coupon_type',
]
remaining_columns = sorted(column for column in GPF.columns if column not in first_columns)
GPF = GPF[first_columns+remaining_columns]
GPF.to_csv("../CleanData/SDC/0A_GPF.csv")

# 2. Quantity of issuance

Obtain yearly quantity of debt at county level, by aggregate and also by 
1. The method of placement
2. The use of proceeds
3. Type of borrowing entity


In [None]:
%%script false --no-raise-error

try:
    del(FUN_0A_GetQ_byPlacement)
except:
    pass
import FUN_0A_GetQ_byPlacement
importlib.reload(FUN_0A_GetQ_byPlacement)
from FUN_0A_GetQ_byPlacement import FUN_0A_GetQ_byPlacement

try:
    del(FUN_0A_GetQ_byUsageBB)
except:
    pass
import FUN_0A_GetQ_byUsageBB
importlib.reload(FUN_0A_GetQ_byUsageBB)
from FUN_0A_GetQ_byUsageBB import FUN_0A_GetQ_byUsageBB

try:
    del(FUN_0A_GetQ_byUsageGeneral)
except:
    pass
import FUN_0A_GetQ_byUsageGeneral
importlib.reload(FUN_0A_GetQ_byUsageGeneral)
from FUN_0A_GetQ_byUsageGeneral import FUN_0A_GetQ_byUsageGeneral

try:
    del(FUN_0A_GetQ_byUsageMain)
except:
    pass
import FUN_0A_GetQ_byUsageMain
importlib.reload(FUN_0A_GetQ_byUsageMain)
from FUN_0A_GetQ_byUsageMain import FUN_0A_GetQ_byUsageMain

try:
    del(FUN_0A_GetQ_byIssuerType)
except:
    pass
import FUN_0A_GetQ_byIssuerType
importlib.reload(FUN_0A_GetQ_byIssuerType)
from FUN_0A_GetQ_byIssuerType import FUN_0A_GetQ_byIssuerType

In [None]:
%%script false --no-raise-error

# %%time

# The aggregations below are run one sale year at a time for speed
Years = list(range(1967,2023))

# Keep only the fields the county-level quantity aggregations read
amount_fields = [
    'State', 'County', 'sale_year', 'amount',
    'issuer_type_full', 'Bid',
    'use_of_proceeds_BB', 'use_of_proceeds_general', 'use_of_proceeds_main',
]
GPFAmount = GPF[amount_fields].copy()

# Drop rows whose State is the literal string 'nan' or one of the excluded codes
excluded_states = ['nan', 'AS', 'DC', 'FF', 'GU', 'MR', 'PR', 'TT', 'VI']
GPFAmount = GPFAmount[~GPFAmount['State'].isin(excluded_states)]

# Renumber the rows first, then discard records without a county
GPFAmount = GPFAmount.reset_index(drop=True)
GPFAmount = GPFAmount[GPFAmount['County'].notnull()]

def proc_list(GPFAmount):
    """Split multi-county records into one record per county.

    A `County` value that names several counties (separated by '/' or by
    ' AND ') is expanded into one row per county, and the row's `amount`
    is divided equally among them. Rows naming a single county are passed
    through unchanged. County names are stripped of surrounding whitespace.

    Parameters
    ----------
    GPFAmount : pandas.DataFrame
        Must contain a string column `County` (no nulls) and a numeric
        column `amount`. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Same columns as the input, freshly numbered from 0. An empty input
        (e.g. an empty dask partition) yields an empty frame with the
        input's columns instead of failing on a missing 'County' column.

    NOTE(review): this reuses the name `proc_list`, which an earlier cell
    also passes to map_partitions for GPF; whichever definition ran last wins.
    """
    GPFAmount = GPFAmount.copy()
    if GPFAmount.empty:
        # Nothing to split; keep the schema so downstream concatenation still lines up
        return GPFAmount.reset_index(drop=True)

    # Normalise the ' AND ' separator to '/' (literal match, not a regex)
    GPFAmount['County'] = GPFAmount['County'].str.replace(' AND ', '/', regex=False)

    records = []
    for _, row in GPFAmount.iterrows():
        if '/' not in row['County']:
            records.append(dict(row))
            continue
        counties = row['County'].split('/')
        # Equal share of the issue amount for every county named on the record
        share = row['amount'] / len(counties)
        for county in counties:
            record = dict(row)
            record['County'] = county
            record['amount'] = share
            records.append(record)

    GPFAmount_New = pd.DataFrame(records)
    GPFAmount_New['County'] = GPFAmount_New['County'].str.strip()
    return GPFAmount_New

# Run proc_list on the first 10 rows only to learn its output column names,
# which dask needs as the `meta` of map_partitions.
meta_columns = list(proc_list(GPFAmount[:10]).columns)
# Request 40 partitions and split multi-county rows in separate worker processes.
GPFAmount_dd = dd.from_pandas(GPFAmount, npartitions=40)
with dask.config.set(scheduler='processes',num_workers=40):
    # meta is an empty frame that carries only the column names (no dtypes are declared).
    # The expanded result replaces GPFAmount.
    GPFAmount = GPFAmount_dd.map_partitions(proc_list,meta=pd.DataFrame(columns=meta_columns)).compute()

#-------------------------------------------------#
# Yearly county-level aggregations (process pool) #
#-------------------------------------------------#

def _aggregate_by_year(aggregator, tasks, out_path, processes=10):
    """Run `aggregator` over all tasks in a process pool, stack the pieces and save them.

    Parameters
    ----------
    aggregator : callable
        Invoked as ``aggregator(*task)`` for every tuple in `tasks`. Each call
        must return an object `pd.concat` accepts (presumably a DataFrame --
        the FUN_0A_GetQ_* modules are not visible here, TODO confirm).
    tasks : list of tuple
        Argument tuples, here ``(year, GPFAmount)``.
    out_path : str
        Destination parquet file.
    processes : int, default 10
        Size of the worker pool.

    Returns
    -------
    The concatenated result, after it has been written to `out_path`.
    """
    with multiprocessing.Pool(processes = processes) as p:
        yearly_results = p.starmap(aggregator, tasks)
    combined = pd.concat(yearly_results)
    combined.to_parquet(out_path)
    return combined

# Every aggregation receives the same (year, full frame) argument tuples
input_list = [(year,GPFAmount) for year in Years]

# Pools are only created in the main module. The concat/export steps live under the
# same guard so a non-main import never reaches a concat on an undefined name.
if __name__ == '__main__':

    # Method of placement
    StateXCountyXBid = _aggregate_by_year(
        FUN_0A_GetQ_byPlacement, input_list,
        "../CleanData/SDC/0A_StateXCountyXBid.parquet")

    # Use of proceeds
    StateXCountyXUsageBB = _aggregate_by_year(
        FUN_0A_GetQ_byUsageBB, input_list,
        "../CleanData/SDC/0A_StateXCountyXUsageBB.parquet")
    StateXCountyXUsageGeneral = _aggregate_by_year(
        FUN_0A_GetQ_byUsageGeneral, input_list,
        "../CleanData/SDC/0A_StateXCountyXUsageGeneral.parquet")
    StateXCountyXUsageMain = _aggregate_by_year(
        FUN_0A_GetQ_byUsageMain, input_list,
        "../CleanData/SDC/0A_StateXCountyXUsageMain.parquet")

    # Type of issuer
    StateXCountyXIssuerType = _aggregate_by_year(
        FUN_0A_GetQ_byIssuerType, input_list,
        "../CleanData/SDC/0A_StateXCountyXIssuerType.parquet")
