### Notes:
- There are several ways costs are reported in TBB:
    - "Purchased \$5,436,000.00 by Commerce Cap Mkts, at 3.5\%, plus \$52,354.12, effective rate 2.5369%."  
    The coupon rate is 3.5\%, and there is a premium (purchase price minus par value) of \$52,354.12, which brings the NIC to 2.5369%.  
    Used for short-term notes.
    - "Winning bid: Stifel Nicolaus, at 100.1031, TIC 4.253%."  
    Each bidder bids a TIC (true interest cost).
    - "Winning bid: Roosevelt & Cross, at n/a, NIC 4.6755%."  
    Each bidder bids a NIC (net interest cost).
- The reoffering yield is often missing in TBB data ("NRO") because these bonds have been fully subscribed and were not offered to the public (https://www.bondbuyer.com/news/msrb-limit-use-of-nro). The yields are, however, reported later, after the first day of trading. I will likely not use them and will instead rely on the price/yield in SDC.

In [1]:
import pandas as pd
import numpy as np
import re
import os
import sys
import time
import pickle
import warnings

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains

# Widen the pandas display limits so that long parsed text and wide tables
# can be inspected in the notebook without truncation.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 400)

# Upper-case names of the 50 U.S. states, in alphabetical order. The parsing
# step searches page text for these names to attribute each issue to a state.
all_states = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS",
    "CALIFORNIA", "COLORADO", "CONNECTICUT",
    "DELAWARE",
    "FLORIDA",
    "GEORGIA",
    "HAWAII",
    "IDAHO", "ILLINOIS", "INDIANA", "IOWA",
    "KANSAS", "KENTUCKY",
    "LOUISIANA",
    "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", "MINNESOTA",
    "MISSISSIPPI", "MISSOURI", "MONTANA",
    "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY", "NEW MEXICO",
    "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA",
    "OHIO", "OKLAHOMA", "OREGON",
    "PENNSYLVANIA",
    "RHODE ISLAND",
    "SOUTH CAROLINA", "SOUTH DAKOTA",
    "TENNESSEE", "TEXAS",
    "UTAH",
    "VERMONT", "VIRGINIA",
    "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING",
]


# 1. Download and parse data

In [6]:
%%script false --no-raise-error

# Obtain web address from the index files
index_files = os.listdir('../../RawData/BondBuyer/IndexFiles')
index_files = [item for item in index_files if item[-4:]=='html']

# Process (download, parse, and save) year by year.
# The first four characters of each index file name are taken to be the year;
# sorting makes the processing order reproducible from run to run.
years = sorted({item[:4] for item in index_files})

# Every link of interest appears in the index files as <a href="..." title="...">.
# Compiled once here because it is reused for every index file.
href_pattern = re.compile(r'href="([^"]+)" title=')

#-----------------#
# Initiate driver #
#-----------------#

# Best effort: close a browser window left over from a previous run of this cell.
# On the first run `driver` does not exist yet (NameError); a stale session may
# also raise. `Exception` (rather than a bare except) lets Ctrl-C through.
try:
    driver.close()
except Exception:
    pass
driver_path = "../../RawData/BondBuyer/chromedriver.exe"
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service)
driver.set_page_load_timeout(60)
# Best effort: a slow or timed-out landing page is fine, because the only
# purpose of this request is to show the site so the user can log in by hand.
try:
    driver.get('https://www.bondbuyer.com/')
except Exception:
    pass
time.sleep(60) # Manually log in here

# Process year by year
for year in years:

    #---------------------#
    # Extract web address #
    #---------------------#

    index_files_oneyear = [item for item in index_files if item[:4]==year]

    web_addresses = []
    for index_file in index_files_oneyear:

        with open('../../RawData/BondBuyer/IndexFiles/'+index_file, 'r', encoding='utf-8') as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, 'html.parser')

        # Keep only anchors whose serialized form has `title=` right after `href`
        for a_tag in soup.find_all('a'):
            match = href_pattern.search(str(a_tag))
            if match:
                web_addresses.append(match.group(1))

    #---------------#
    # Download data #
    #---------------#

    BBData = []

    for web_address in web_addresses:
        driver.get(web_address)
        time.sleep(3) # Fixed pause after each page load before reading the source
        BBData.append({'source':web_address,'text':driver.page_source})

    # One pickle per year: a list of {'source': url, 'text': raw page HTML}
    with open('../../RawData/BondBuyer/WebPages/BBData'+year+'.pkl', 'wb') as file:
        pickle.dump(BBData, file)


In [8]:
# %%script false --no-raise-error

def _first_group(pattern, text, group=1):
    """Return capture group `group` of the first match of `pattern` in `text`, or None without a match."""
    found = re.search(pattern, text)
    return found.group(group) if found else None


def _other_bidder_lines(lines, keyword):
    """Return the lines after the 'Other bidders' heading that look like losing bids.

    A line is kept when it is non-empty, at most 100 characters long, and
    contains `keyword`. If several lines contain 'Other bidders', the last one
    is used as the heading. Returns None when there is no such heading.
    """
    selected = None
    for index, heading in enumerate(lines):
        if 'Other bidders' in heading:
            selected = [
                item for item in lines[index+1:]
                # Exclude lines that are not other bidders
                if item != '' and len(item) <= 100 and keyword in item
            ]
    return selected


def _parse_winning_bid(lines, label):
    """Parse a "Winning bid: <purchaser>, at <price>, <label> <rate>%." line.

    `label` is 'TIC' or 'NIC'. Returns the tuple
    (purchaser, purchase_price, rate, lines_other_bidders). All four are None
    unless exactly one line contains both 'Winning bid:' and `label`.
    """
    candidates = [item for item in lines if ('Winning bid:' in item) and (label in item)]
    if len(candidates) != 1:
        return None, None, None, None

    line = candidates[0]
    purchaser = _first_group(r'Winning bid: (.*?), at', line)

    # Anchor on ", at " (the same delimiter that ends the purchaser name) so
    # that an "at " inside the purchaser name, e.g. "Great Pacific", is not
    # mistaken for the start of the price.
    purchase_price = _first_group(r', at (.*?), ' + label, line)
    if purchase_price is None:
        # Fall back to the looser pattern for lines without the comma
        purchase_price = _first_group(r'at (.*?), ' + label, line)

    rate = _first_group(label + r' (\d+\.\d+%)', line)

    return purchaser, purchase_price, rate, _other_bidder_lines(lines, label)


def proc_one_issue(lines):
    """Extract winning-bid and other-bidder information from the lines of one bond issue.

    Three reporting formats are recognised:

    * Case 1 (effective rate), possibly with several purchasers:
      "Purchased $1,782,470.00 by Janney Montgomery, at 4.5%, plus n/a, effective rate n/a."
    * Case 2 (TIC): "Winning bid: BMO Capital Markets, at 99.0000, TIC 6.4338%."
    * Case 3 (NIC): "Winning bid: BMO Capital Markets, at n/a, NIC 5.8176%."

    Parameters
    ----------
    lines : list of str
        Text lines of one issue.

    Returns
    -------
    dict
        The 'CaseEffRate_*' entries are lists with one element per 'Purchased '
        line (None if there is no such line). The 'CaseTIC_*' and 'CaseNIC_*'
        entries are strings. The '*_lines_other_bidders' entries are lists of
        raw lines. Every value that cannot be extracted is None; all values
        are kept as strings exactly as they appear in the text, apart from the
        purchased amount, from which '.00', '$' and ',' are removed.
    """

    #--------#
    # Case 1 #
    #--------#

    CaseEffRate_amounts = None
    CaseEffRate_purchasers = None
    CaseEffRate_coupon_rates = None
    CaseEffRate_purchase_price_minus_pars = None
    CaseEffRate_effective_rates = None
    CaseEffRate_lines_other_bidders = None

    purchased_lines = [item for item in lines if 'Purchased ' in item]

    if purchased_lines:
        CaseEffRate_amounts = []
        CaseEffRate_purchasers = []
        CaseEffRate_coupon_rates = []
        CaseEffRate_purchase_price_minus_pars = []
        CaseEffRate_effective_rates = []

        # One entry per purchaser, to handle multiple winners (with potentially different prices)
        for line in purchased_lines:
            amount = _first_group(r'Purchased (.*?) by', line)
            if amount is not None:
                amount = amount.replace('.00','').strip()
                amount = amount.replace('$','').strip()
                amount = amount.replace(',','').strip()
            CaseEffRate_amounts.append(amount)

            CaseEffRate_purchasers.append(_first_group(r'by (.*?), at', line))
            CaseEffRate_coupon_rates.append(_first_group(r'by (.*?) at (.*?),', line, group=2))
            CaseEffRate_purchase_price_minus_pars.append(_first_group(r'plus (.*?), effective rate', line))
            # An effective rate reported as "n/a" does not match and is stored as None
            CaseEffRate_effective_rates.append(_first_group(r'effective rate (\d+\.\d+%)', line))

        # Find information on other bidders
        CaseEffRate_lines_other_bidders = _other_bidder_lines(lines, 'Effective Rate')

    #--------#
    # Case 2 #
    #--------#

    (CaseTIC_purchaser,
     CaseTIC_purchase_price,
     CaseTIC_tic,
     CaseTIC_lines_other_bidders) = _parse_winning_bid(lines, 'TIC')

    #--------#
    # Case 3 #
    #--------#

    (CaseNIC_purchaser,
     CaseNIC_purchase_price,
     CaseNIC_nic,
     CaseNIC_lines_other_bidders) = _parse_winning_bid(lines, 'NIC')

    OneIssueData = {
    'CaseEffRate_amounts':CaseEffRate_amounts,
    'CaseEffRate_purchasers':CaseEffRate_purchasers,
    'CaseEffRate_coupon_rates':CaseEffRate_coupon_rates,
    'CaseEffRate_purchase_price_minus_pars':CaseEffRate_purchase_price_minus_pars,
    'CaseEffRate_effective_rates':CaseEffRate_effective_rates,
    'CaseEffRate_lines_other_bidders':CaseEffRate_lines_other_bidders,
    'CaseTIC_purchaser':CaseTIC_purchaser,
    'CaseTIC_purchase_price':CaseTIC_purchase_price,
    'CaseTIC_TIC':CaseTIC_tic,
    'CaseTIC_lines_other_bidders':CaseTIC_lines_other_bidders,
    'CaseNIC_purchaser':CaseNIC_purchaser,
    'CaseNIC_purchase_price':CaseNIC_purchase_price,
    'CaseNIC_NIC':CaseNIC_nic,
    'CaseNIC_lines_other_bidders':CaseNIC_lines_other_bidders,
    }

    return OneIssueData


In [None]:
# %%script false --no-raise-error

#########################
# Process and save data #
#########################

# How the page is organized, i.e., how different issues are separated. There are three cases:
# (a) No special formatting (https://data.bondbuyer.com/salesresults/GetDetails/2000)
# (b) State and issuers are in "h3" (https://data.bondbuyer.com/salesresults/GetDetails/1000)
# (c) State is in bold, while issuer is in italics (https://data.bondbuyer.com/salesresults/GetDetails/10000)

# Parsing is based on the line with amount and date, hence not affected by how the page is organized.
# There are two cases:
# (1) "Dec 27, 2012 . . . . . . $3,040,324"
# (2) "1-Mar-22  $171,345,000"

def _clean_amount(raw_amount):
    """Remove '.00', '$' and ',' (and surrounding whitespace) from an amount string."""
    cleaned = raw_amount.replace('.00','').strip()
    cleaned = cleaned.replace('$','').strip()
    cleaned = cleaned.replace(',','').strip()
    return cleaned


# Date as written in format (2), e.g. "1-Mar-22"
date_pattern = r'\b\d{1,2}-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}\b'
# Header line of format (1), e.g. "<b>Dec 27, 2012 . . . . . . $3,040,324</b>"
dotted_header_pattern = r"<b>(.{1,99})\. \. \. \. \. \. (.*?)</b>"
# Separators between sale date and amount in format (2), tried in this order
date_amount_separators = ('&nbsp;&nbsp;', '&nbsp;', '\t', ' ')

for year in range(2008,2025):

    # NOTE: pickle must only be used on trusted files; these are the files
    # written by the download step of this project.
    with open('../../RawData/BondBuyer/WebPages/BBData'+str(year)+'.pkl', 'rb') as file:
        BBData = pickle.load(file)

    BBIssueData = []

    for webpage in BBData:

        # Skip one date with formatting error, for which data will be manually coded
        if webpage['source']=='https://data.bondbuyer.com/salesresults/GetDetails/9589':
            continue

        webpage['text'] = webpage['text'].replace('<br><br>','<br>')
        page_text = webpage['text']

        soup = BeautifulSoup(page_text, 'html.parser')

        headline_paragraph = soup.find('p', class_='Headlinecls').text

        # The headline starts with the notice date as "month.day.two-digit year:"
        headline_parts = headline_paragraph.split('.')
        notice_month = int(headline_parts[0])
        notice_day = int(headline_parts[1])
        notice_year = int(headline_parts[2].split(':')[0])+2000

        # Extract data only for competitive sales
        if "competitive sales" not in headline_paragraph:
            continue

        #----------------------------------------------------------------------------------------#
        # Parse documents into segments of each bond issue, with corresponding state information #
        #----------------------------------------------------------------------------------------#

        BondIssues = []
        matched_substrings = []

        # Case 1: find lines with amount and date (unique for each issue)
        matches = list(re.finditer(dotted_header_pattern, page_text))
        if matches:
            matched_substrings = [(match.group(), match.start(), match.end()) for match in matches]

        # Case 2: find lines with amount and date (unique for each issue).
        # When both formats are found on a page, the case 2 matches take precedence.
        matches = list(re.finditer(date_pattern, page_text))
        if matches:
            matched_substrings = [(match.group(), match.start(), match.end()) for match in matches]

        if matched_substrings:

            # The issuer is the element immediately before the date line. It is found by
            # searching the reversed text, hence the reversed tags in the patterns.
            if '<h3>' in page_text:
                reversed_issuer_pattern = r">3h/<(.*?)>3h<"
            else:
                reversed_issuer_pattern = r">rb<(.*?)>rb<"

            # All state names on the page, located once instead of once per issue
            state_hits = [
                (hit.group(), hit.start(), hit.end())
                for state_name in all_states
                for hit in re.finditer(re.escape(state_name), page_text)
            ]

            # Process issues one by one
            for i, issue in enumerate(matched_substrings):
                issue_start = issue[1]

                # Extract issuer, for either format
                issuer_match = re.search(reversed_issuer_pattern, page_text[:issue_start][::-1])
                if issuer_match is None:
                    raise ValueError(
                        'No issuer found before offset '+str(issue_start)+' in '+webpage['source'])
                issuer = issuer_match.group()[::-1]

                # The immediate state name before that is the state. Pick the occurrence that
                # ends last; among occurrences ending at the same place prefer the one that
                # starts first, so that "WEST VIRGINIA" is not read as "VIRGINIA" and
                # "ARKANSAS" is not read as "KANSAS".
                preceding_hits = [hit for hit in state_hits if hit[2] <= issue_start]
                if not preceding_hits:
                    raise ValueError(
                        'No state found before offset '+str(issue_start)+' in '+webpage['source'])
                state = max(preceding_hits, key=lambda hit: (hit[2], -hit[1]))[0]

                # Tuples of state, issuer, and issue information
                # Note that issue information can contain the next issuer or state, which is totally okay given how it is parsed
                if i < len(matched_substrings)-1:
                    issue_text = page_text[issue_start:matched_substrings[i+1][1]]
                else:
                    issue_text = page_text[issue_start:]
                BondIssues.append([state, issuer, issue_text])

        #------------------------------------------#
        # Go over segments and extract information #
        #------------------------------------------#

        for state, issuer_html, paragraph in BondIssues:

            issuer = BeautifulSoup(issuer_html,'html.parser').get_text()
            paragraph = paragraph.replace('<br/>','<br>')
            lines = paragraph.split('<br>')

            #-----------------------------------------------#
            # Find the line with sale date and total amount #
            #-----------------------------------------------#

            # Reset for every issue, so that an issue without a recognizable date line
            # is recorded with missing values rather than the previous issue's values
            sale_date = None
            amount = None

            # Format 1: "1-Mar-22  $171,345,000"
            lines_format1 = [line for line in lines if re.search(date_pattern, line)]
            # Format 2: "Nov 19, 2013 . . . . . . $220,000"
            lines_format2 = [line for line in lines if '. . . . . .' in line]

            if len(lines_format1)==1:
                date_line = lines_format1[0]
                for separator in date_amount_separators:
                    if separator in date_line:
                        pieces = date_line.split(separator)
                        sale_date = pieces[0].replace('<b>','').strip()
                        amount = _clean_amount(pieces[1].replace('</b>','').strip())
                        break
                else:
                    # No known separator: the line holds the date only
                    sale_date = date_line.replace('<b>','').strip()
                    amount = None

            # If both formats are detected, format 2 overrides format 1
            if len(lines_format2)==1:
                pieces = lines_format2[0].split('. . . . . .')
                sale_date = pieces[0].replace('<b>','').strip()
                amount = _clean_amount(pieces[1].replace('</b>','').strip())

            #-------------------------------#
            # Find the line with dated date #
            #-------------------------------#

            lines_with_dated_date = [line for line in lines if 'Dated ' in line]
            dated_date = None
            if lines_with_dated_date:
                dated_date = lines_with_dated_date[0].replace('Dated ','').strip()
                dated_date = dated_date.replace('.','').strip()

            OneIssueData = proc_one_issue(lines)
            OneIssueData['source'] = webpage['source']
            OneIssueData['issuer'] = issuer
            OneIssueData['state'] = state
            OneIssueData['amount'] = amount
            OneIssueData['sale_date'] = sale_date
            OneIssueData['notice_month'] = notice_month
            OneIssueData['notice_day'] = notice_day
            OneIssueData['notice_year'] = notice_year
            OneIssueData['dated_date'] = dated_date
            BBIssueData.append(OneIssueData)

    # Save once per year, after all pages of that year have been processed
    with open('../../RawData/BondBuyer/WebPages/BBIssueData'+str(year)+'.pkl', 'wb') as file:
        pickle.dump(BBIssueData, file)
