In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import os
import dask
import dask.dataframe as dd
import itertools
from itertools import chain
from math import sqrt, floor, ceil, isnan
import multiprocess
import importlib
from importlib import reload
from collections import Counter
from fuzzywuzzy import process, fuzz
import time
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import warnings
import pyreadstat
import bisect
warnings.filterwarnings("error")

pd.options.display.max_columns = 500
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 400

# A customized winsorisation function that handles None values correctly
# The percentiles are taken and winsorisation are done on non-None values only
def winsor2(series,cutoffs):

    import numpy as np
    import scipy as sp
    
    IsNone = np.isnan(series).copy()
    IsNotNone = np.logical_not(IsNone).copy()
    series_NotNonePart = sp.stats.mstats.winsorize(series[IsNotNone],limits=(cutoffs[0],cutoffs[1]))
    series_new = series.copy()
    series_new[IsNone] = np.nan
    series_new[IsNotNone] = series_NotNonePart

    return series_new


# 1. Calculate Bond-Level Underpricing

In [2]:
initial_underpricing_allyears = pd.DataFrame()

for year in range(2005,2023):

    # Process every two adjacent years' data, to make sure 30 days post offering are within the sample

    # For unknown reasons, cannot download data as "dta" with correct dates in Feb 2024
    if year not in [2016,2022,2023]:
        part1,meta1 = pyreadstat.read_dta('../RawData/MSRB/MSRB_'+str(year)+'.dta')
    else:
        part1 = pd.read_csv('../RawData/MSRB/MSRB_'+str(year)+'.csv',low_memory=False)
        part1['trade_date'] = pd.to_datetime(part1['trade_date'], format='%d/%m/%Y',errors='coerce')
        part1['dated_date'] = pd.to_datetime(part1['dated_date'], format='%d/%m/%Y',errors='coerce')
        part1['maturity_date'] = pd.to_datetime(part1['maturity_date'], format='%d/%m/%Y',errors='coerce')
        part1['settlement_date'] = pd.to_datetime(part1['settlement_date'], format='%d/%m/%Y',errors='coerce')
        part1['rtrs_publish_date'] = pd.to_datetime(part1['rtrs_publish_date'], format='%d/%m/%Y',errors='coerce')
    if year+1 not in [2016,2022,2023]:
        part2,meta2 = pyreadstat.read_dta('../RawData/MSRB/MSRB_'+str(year+1)+'.dta')
    else:
        part2 = pd.read_csv('../RawData/MSRB/MSRB_'+str(year+1)+'.csv',low_memory=False)
        part2['trade_date'] = pd.to_datetime(part2['trade_date'], format='%d/%m/%Y',errors='coerce')
        part2['dated_date'] = pd.to_datetime(part2['dated_date'], format='%d/%m/%Y',errors='coerce')
        part2['maturity_date'] = pd.to_datetime(part2['maturity_date'], format='%d/%m/%Y',errors='coerce')
        part2['settlement_date'] = pd.to_datetime(part2['settlement_date'], format='%d/%m/%Y',errors='coerce')
        part2['rtrs_publish_date'] = pd.to_datetime(part2['rtrs_publish_date'], format='%d/%m/%Y',errors='coerce')
    TradeData = pd.concat([part1,part2])

    print('Processed year '+str(year))

    # Identify new offerings in the first year
    
    start_datetime = pd.to_datetime(str(year)+'-01-01 00:00:00')
    end_datetime = pd.to_datetime(str(year)+'-12-31 23:59:59')
    TradeData = TradeData[~pd.isnull(TradeData['dated_date'])]
    TradeData = TradeData[TradeData['dated_date']!=None]
    TradeData = TradeData[(TradeData['dated_date']>=start_datetime)&(TradeData['dated_date']<=end_datetime)]
    if TradeData['par_traded'].astype(str).str.contains('[a-zA-Z]').sum()>0:
        TradeData['par_traded'] = TradeData['par_traded'].str.replace('1MM+','1000000',regex=False)
    TradeData['par_traded'] = TradeData['par_traded'].astype('float')
    
    # Calculate initial underpricing for each new offering
    
    def proc_list(TradeData):
    
        TradeData = TradeData.reset_index()
        TradeData_gb = TradeData.groupby('cusip')
    
        new_offerings = list(TradeData['cusip'].unique())
        new_offerings = pd.DataFrame(new_offerings,columns=['cusip'])
    
        initial_underpricing = []
        
        for idx,row in new_offerings.iterrows():
        
            TradeData_onecusip = TradeData_gb.get_group(row['cusip'])
            
            # If "offer_price_takedown_indicator" is available, use the first "trade_date" within those with "Y" as initial day
            TradeData_onecusip_initial = TradeData_onecusip[TradeData_onecusip['offer_price_takedown_indicator']=='Y']
            if len(TradeData_onecusip_initial)>0:
                TradeData_onecusip_initial = TradeData_onecusip_initial.sort_values('trade_date',ascending=True)
                TradeData_onecusip_initial = TradeData_onecusip_initial.reset_index(drop=True)
                offering_date = TradeData_onecusip_initial['trade_date'][0]
            # Otherwise, use the first "trade_date"
            else:
                TradeData_onecusip = TradeData_onecusip.sort_values('trade_date',ascending=True)
                TradeData_onecusip = TradeData_onecusip.reset_index(drop=True)
                offering_date = TradeData_onecusip['trade_date'][0]
            
            # Obtain initial trading day price
            TradeData_onecusip_initial = TradeData_onecusip[TradeData_onecusip['trade_date']==offering_date]
            offering_date_price = np.dot(TradeData_onecusip_initial['par_traded'],TradeData_onecusip_initial['dollar_price'])/\
                np.sum(TradeData_onecusip_initial['par_traded'])
            
            # Obtain average trading price in the [15,60] window
            start_date = offering_date + pd.Timedelta(days=15)
            end_date = offering_date + pd.Timedelta(days=60)
            TradeData_onecusip_15to60 = TradeData_onecusip[(TradeData_onecusip['trade_date']>=start_date)&
                (TradeData_onecusip['trade_date']<=end_date)]
            if len(TradeData_onecusip_15to60)>0:
                date_15to60_price = np.dot(TradeData_onecusip_15to60['par_traded'],TradeData_onecusip_15to60['dollar_price'])/\
                    np.sum(TradeData_onecusip_15to60['par_traded'])
            else:
                date_15to60_price = None

            # Obtain average trading price in the [15,30] window
            start_date = offering_date + pd.Timedelta(days=15)
            end_date = offering_date + pd.Timedelta(days=30)
            TradeData_onecusip_15to30 = TradeData_onecusip[(TradeData_onecusip['trade_date']>=start_date)&
                (TradeData_onecusip['trade_date']<=end_date)]
            if len(TradeData_onecusip_15to30)>0:
                date_15to30_price = np.dot(TradeData_onecusip_15to30['par_traded'],TradeData_onecusip_15to30['dollar_price'])/\
                    np.sum(TradeData_onecusip_15to30['par_traded'])
            else:
                date_15to30_price = None

            initial_underpricing = initial_underpricing+[{'cusip':row['cusip'],'offering_date':offering_date,
                'offering_date_price':offering_date_price,'date_15to60_price':date_15to60_price,'date_15to30_price':date_15to30_price}]
        
        initial_underpricing = pd.DataFrame(initial_underpricing)
        return initial_underpricing
    
    TradeData = TradeData.set_index('cusip')
    TradeData_dd = dd.from_pandas(TradeData, npartitions=20)
    with dask.config.set(scheduler='processes',num_workers=20):
        initial_underpricing = TradeData_dd.map_partitions(proc_list, \
            meta=pd.DataFrame(columns=['cusip','offering_date','offering_date_price','date_15to60_price','date_15to30_price'])).compute()
    TradeData = TradeData.reset_index()

    initial_underpricing_allyears = pd.concat([initial_underpricing_allyears,initial_underpricing])


Processed year 2005
Processed year 2006
Processed year 2007
Processed year 2008
Processed year 2009
Processed year 2010
Processed year 2011
Processed year 2012
Processed year 2013
Processed year 2014
Processed year 2015
Processed year 2016
Processed year 2017
Processed year 2018
Processed year 2019
Processed year 2020
Processed year 2021
Processed year 2022


Processed year 2009


Processed year 2010


Processed year 2011


Processed year 2012


Processed year 2013


Processed year 2014


Processed year 2015


Processed year 2016


Processed year 2017


Processed year 2018


Processed year 2019


Processed year 2020


Processed year 2021


Processed year 2022


In [3]:
initial_underpricing_allyears.to_parquet('../CleanData/MSRB/0F_initial_underpricing_allyears.parquet')

# 2. Add Underpricing to GPF Data

In [4]:
initial_underpricing_allyears = pd.read_parquet('../CleanData/MSRB/0F_initial_underpricing_allyears.parquet')
GPF = pd.read_csv("../CleanData/SDC/0A_GPF.csv",low_memory=False)

In [5]:
# Convert to dictionary
initial_underpricing_allyears['underpricing_15to60'] = \
    initial_underpricing_allyears['date_15to60_price']-initial_underpricing_allyears['offering_date_price']
initial_underpricing_allyears['underpricing_15to30'] = \
    initial_underpricing_allyears['date_15to30_price']-initial_underpricing_allyears['offering_date_price']
initial_underpricing_allyears['underpricing_15to60'] = \
    winsor2(initial_underpricing_allyears['underpricing_15to60'],cutoffs=[0.01,0.01])
initial_underpricing_allyears['underpricing_15to30'] = \
    winsor2(initial_underpricing_allyears['underpricing_15to30'],cutoffs=[0.01,0.01])

dict_underpricing_15to60 = initial_underpricing_allyears.set_index('cusip').to_dict(orient='dict')['underpricing_15to60']
dict_underpricing_15to30 = initial_underpricing_allyears.set_index('cusip').to_dict(orient='dict')['underpricing_15to30']

cusips_in_MSRB = list(dict_underpricing_15to60.keys())
cusips_in_MSRB.sort()


In [6]:
%%time

def proc_list(GPF):
    
    GPF = GPF.copy()
    GPF['underpricing_15to60'] = None
    GPF['underpricing_15to30'] = None
    
    for idx,row in GPF.iterrows():
    
        cusip = str(row['cusip'])
        underpricing_15to60 = None
        underpricing_15to30 = None
    
        if cusip=='nan' or cusip=='None':
            continue
        
        if '\n' not in cusip:
        
            if cusip in cusips_in_MSRB:
                underpricing_15to60 = dict_underpricing_15to60[cusip]
                underpricing_15to30 = dict_underpricing_15to30[cusip]
                if str(underpricing_15to60)=='nan':
                    underpricing_15to60 = None
                if str(underpricing_15to30)=='nan':
                    underpricing_15to30 = None
    
        else:

            cusip = cusip.split('\n')

            underpricing_15to60_list = []
            underpricing_15to30_list = []
            for cusip_onebond in cusip:
                index = bisect.bisect_left(cusips_in_MSRB,cusip_onebond)
                is_in_list = index<len(cusips_in_MSRB) and cusips_in_MSRB[index]==cusip_onebond
                if is_in_list:
                    underpricing_15to60_list = underpricing_15to60_list+[dict_underpricing_15to60[cusip_onebond]]
                    underpricing_15to30_list = underpricing_15to30_list+[dict_underpricing_15to30[cusip_onebond]]
            underpricing_15to60_list = [item for item in underpricing_15to60_list if str(item)!='nan']
            underpricing_15to30_list = [item for item in underpricing_15to30_list if str(item)!='nan']    

            # Take simple average. Most of the time number of items is not aligned in CUSIP and in amount by maturity
            if len(underpricing_15to60_list)>0:
                underpricing_15to60 = np.mean(underpricing_15to60_list)
            else:
                underpricing_15to60 = None
            if len(underpricing_15to30_list)>0:
                underpricing_15to30 = np.mean(underpricing_15to30_list)
            else:
                underpricing_15to30 = None
    
        GPF.at[idx,'underpricing_15to60'] = underpricing_15to60
        GPF.at[idx,'underpricing_15to30'] = underpricing_15to30

    return GPF

output_columns = proc_list(GPF[:10]).columns # Process one year to get columns
GPF_dd = dd.from_pandas(GPF, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    GPF = GPF_dd.map_partitions(proc_list, \
        meta=pd.DataFrame(columns=output_columns)).compute()


CPU times: user 24.9 s, sys: 5.81 s, total: 30.7 s
Wall time: 3h 18min 3s


In [None]:
GPF.to_csv("../CleanData/SDC/0A_GPF.csv")

# 3. Construct trading yield at the bond (CUSIP) X calendar month level

Put CSA and "sale_year" (year of underwriting) into the sample, which will be used for constructing an event study sample. In the event study sample, treated and control is defined with the year of underwriting. Post indicates whether the year of underwriting is post consolidation, rather than based on the month of trading. Market-level factor is controlled with time fixed effects of the month of trading.

In [7]:
# TBD!!

initial_underpricing_allyears = pd.DataFrame()

for year in range(2005,2023):

    # Process every two adjacent years' data, to make sure 30 days post offering are within the sample

    # For unknown reasons, cannot download data as "dta" with correct dates in Feb 2024
    if year not in [2016,2022,2023]:
        part1,meta1 = pyreadstat.read_dta('../RawData/MSRB/MSRB_'+str(year)+'.dta')
    else:
        part1 = pd.read_csv('../RawData/MSRB/MSRB_'+str(year)+'.csv',low_memory=False)
        part1['trade_date'] = pd.to_datetime(part1['trade_date'], format='%d/%m/%Y',errors='coerce')
        part1['dated_date'] = pd.to_datetime(part1['dated_date'], format='%d/%m/%Y',errors='coerce')
        part1['maturity_date'] = pd.to_datetime(part1['maturity_date'], format='%d/%m/%Y',errors='coerce')
        part1['settlement_date'] = pd.to_datetime(part1['settlement_date'], format='%d/%m/%Y',errors='coerce')
        part1['rtrs_publish_date'] = pd.to_datetime(part1['rtrs_publish_date'], format='%d/%m/%Y',errors='coerce')
    if year+1 not in [2016,2022,2023]:
        part2,meta2 = pyreadstat.read_dta('../RawData/MSRB/MSRB_'+str(year+1)+'.dta')
    else:
        part2 = pd.read_csv('../RawData/MSRB/MSRB_'+str(year+1)+'.csv',low_memory=False)
        part2['trade_date'] = pd.to_datetime(part2['trade_date'], format='%d/%m/%Y',errors='coerce')
        part2['dated_date'] = pd.to_datetime(part2['dated_date'], format='%d/%m/%Y',errors='coerce')
        part2['maturity_date'] = pd.to_datetime(part2['maturity_date'], format='%d/%m/%Y',errors='coerce')
        part2['settlement_date'] = pd.to_datetime(part2['settlement_date'], format='%d/%m/%Y',errors='coerce')
        part2['rtrs_publish_date'] = pd.to_datetime(part2['rtrs_publish_date'], format='%d/%m/%Y',errors='coerce')
    TradeData = pd.concat([part1,part2])

    print('Processed year '+str(year))

    # Identify new offerings in the first year
    
    start_datetime = pd.to_datetime(str(year)+'-01-01 00:00:00')
    end_datetime = pd.to_datetime(str(year)+'-12-31 23:59:59')
    TradeData = TradeData[~pd.isnull(TradeData['dated_date'])]
    TradeData = TradeData[TradeData['dated_date']!=None]
    if TradeData['par_traded'].astype(str).str.contains('[a-zA-Z]').sum()>0:
        TradeData['par_traded'] = TradeData['par_traded'].str.replace('1MM+','1000000',regex=False)
    TradeData['par_traded'] = TradeData['par_traded'].astype('float')
    
    # Calculate initial underpricing for each new offering
    
    def proc_list(TradeData):
    
        TradeData = TradeData.reset_index()
        TradeData_gb = TradeData.groupby('cusip')
    
        new_offerings = list(TradeData['cusip'].unique())
        new_offerings = pd.DataFrame(new_offerings,columns=['cusip'])
    
        initial_underpricing = []
        
        for idx,row in new_offerings.iterrows():
        
            TradeData_onecusip = TradeData_gb.get_group(row['cusip'])
            
            # If "offer_price_takedown_indicator" is available, use the first "trade_date" within those with "Y" as initial day
            TradeData_onecusip_initial = TradeData_onecusip[TradeData_onecusip['offer_price_takedown_indicator']=='Y']
            if len(TradeData_onecusip_initial)>0:
                TradeData_onecusip_initial = TradeData_onecusip_initial.sort_values('trade_date',ascending=True)
                TradeData_onecusip_initial = TradeData_onecusip_initial.reset_index(drop=True)
                offering_date = TradeData_onecusip_initial['trade_date'][0]
            # Otherwise, use the first "trade_date"
            else:
                TradeData_onecusip = TradeData_onecusip.sort_values('trade_date',ascending=True)
                TradeData_onecusip = TradeData_onecusip.reset_index(drop=True)
                offering_date = TradeData_onecusip['trade_date'][0]
            
            # Obtain initial trading day price
            TradeData_onecusip_initial = TradeData_onecusip[TradeData_onecusip['trade_date']==offering_date]
            offering_date_price = np.dot(TradeData_onecusip_initial['par_traded'],TradeData_onecusip_initial['dollar_price'])/\
                np.sum(TradeData_onecusip_initial['par_traded'])
            
            # Obtain average trading price in the [15,60] window
            start_date = offering_date + pd.Timedelta(days=15)
            end_date = offering_date + pd.Timedelta(days=60)
            TradeData_onecusip_15to60 = TradeData_onecusip[(TradeData_onecusip['trade_date']>=start_date)&
                (TradeData_onecusip['trade_date']<=end_date)]
            if len(TradeData_onecusip_15to60)>0:
                date_15to60_price = np.dot(TradeData_onecusip_15to60['par_traded'],TradeData_onecusip_15to60['dollar_price'])/\
                    np.sum(TradeData_onecusip_15to60['par_traded'])
            else:
                date_15to60_price = None

            # Obtain average trading price in the [15,30] window
            start_date = offering_date + pd.Timedelta(days=15)
            end_date = offering_date + pd.Timedelta(days=30)
            TradeData_onecusip_15to30 = TradeData_onecusip[(TradeData_onecusip['trade_date']>=start_date)&
                (TradeData_onecusip['trade_date']<=end_date)]
            if len(TradeData_onecusip_15to30)>0:
                date_15to30_price = np.dot(TradeData_onecusip_15to30['par_traded'],TradeData_onecusip_15to30['dollar_price'])/\
                    np.sum(TradeData_onecusip_15to30['par_traded'])
            else:
                date_15to30_price = None

            initial_underpricing = initial_underpricing+[{'cusip':row['cusip'],'offering_date':offering_date,
                'offering_date_price':offering_date_price,'date_15to60_price':date_15to60_price,'date_15to30_price':date_15to30_price}]
        
        initial_underpricing = pd.DataFrame(initial_underpricing)
        return initial_underpricing
    
    TradeData = TradeData.set_index('cusip')
    TradeData_dd = dd.from_pandas(TradeData, npartitions=20)
    with dask.config.set(scheduler='processes',num_workers=20):
        initial_underpricing = TradeData_dd.map_partitions(proc_list, \
            meta=pd.DataFrame(columns=['cusip','offering_date','offering_date_price','date_15to60_price','date_15to30_price'])).compute()
    TradeData = TradeData.reset_index()

    initial_underpricing_allyears = pd.concat([initial_underpricing_allyears,initial_underpricing])
