In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import os
import dask
import dask.dataframe as dd
import itertools
from itertools import chain
from math import sqrt, floor, ceil, isnan
import multiprocess
import importlib
from importlib import reload
from collections import Counter
from fuzzywuzzy import process, fuzz
import time
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import warnings
# Turn every warning raised from here on into an exception
warnings.filterwarnings("error")

# Raise the pandas display limits (number of columns/rows shown, and column width)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 400

# A customized winsorisation function that handles missing values (None/NaN) correctly
# The percentiles are taken and winsorisation is done on non-missing values only
def winsor2(series,cutoffs):
    """Winsorise the non-missing entries of `series`; missing entries come back as NaN.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Values to winsorise. Missing entries may be NaN or None.
    cutoffs : sequence of two floats
        (lower, upper) tail fractions, passed as `limits` to
        `scipy.stats.mstats.winsorize`.

    Returns
    -------
    Same type as `series`
        A copy of `series` in which the non-missing values are winsorised and the
        missing positions hold NaN. The input is not modified.
    """
    import numpy as np
    import pandas as pd
    # Import the function explicitly: `scipy.stats` is a submodule and is not
    # guaranteed to be reachable as an attribute after a bare `import scipy`
    from scipy.stats.mstats import winsorize

    # pd.isna recognises both NaN and None (np.isnan raises on object dtype)
    is_missing = np.asarray(pd.isna(series))
    is_present = np.logical_not(is_missing)

    series_new = series.copy()
    series_new[is_missing] = np.nan

    # Nothing to winsorise: winsorize() cannot handle an empty input
    if not is_present.any():
        return series_new

    present_values = np.asarray(series[is_present])
    if present_values.dtype == object:
        # Object arrays appear when None was used as the missing marker
        present_values = present_values.astype(float)

    winsorised = winsorize(present_values,limits=(cutoffs[0],cutoffs[1]))
    # Write back a plain ndarray rather than the masked array winsorize returns
    series_new[is_present] = np.asarray(winsorised)

    return series_new


# 1. Import Data

In [2]:
# GPF, plus the lists of its name-related columns
GPF = pd.read_csv("../CleanData/SDC/0A_GPF.csv",low_memory=False)
raw_name_GPF_colnames = [colname for colname in GPF.columns if 'raw_name_GPF_' in colname]
name_GPF_colnames = [colname for colname in GPF.columns if colname.startswith('name_GPF_')]
parent_name_GPF_colnames = [colname for colname in GPF.columns if 'parent_name_' in colname]

# CBs in SOD; branch deposits arrive as text with thousands separators
SOD = pd.read_csv('../CleanData/FDIC/0I_SOD.csv')
SOD['DEPSUMBR'] = SOD['DEPSUMBR'].str.replace(',','').astype(int)

# M&As among CBs in SOD: take the year from the completion date, drop deals whose
# target and buyer carry the same name, and keep only the columns used later
SNL_in_SOD = pd.read_csv('../CleanData/FDIC/0I_SNL_in_SOD.csv')
SNL_in_SOD['year'] = SNL_in_SOD['Completion Date'].str.slice(stop=4).astype(int)
SNL_in_SOD = SNL_in_SOD.loc[SNL_in_SOD['Target']!=SNL_in_SOD['Buyer'],:]
SNL_in_SOD = SNL_in_SOD.loc[:,['Target','Buyer','year']]

# Areas affected by underwriter M&A
CSA_affected = pd.read_parquet('../CleanData/MAEvent/1B_CSA_affected.parquet')
CBSA_affected = pd.read_parquet('../CleanData/MAEvent/1B_CBSA_affected.parquet')

# 2. Summary Stats

In [3]:
%%time

SNL_in_SOD_withchars = SNL_in_SOD.copy()

SNL_in_SOD_withchars['both_active'] = False
SNL_in_SOD_withchars['both_active_overlap_CSA'] = False

# M&As where both sides are active (present in SOD) right before merger
def proc_list(SNL_in_SOD_withchars):
    """Flag deals whose Target and Buyer both appear in SOD in the year before the deal.

    Parameters
    ----------
    SNL_in_SOD_withchars : pandas.DataFrame
        Deals with columns 'Target', 'Buyer', 'year' and an existing 'both_active'
        column. Reads the notebook-level frame `SOD` (columns 'year', 'name').

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which 'both_active' is set to True for every deal
        whose Target and Buyer are both found in SOD['name'] for year-1. The frame
        passed in is left untouched.
    """
    # Work on a copy so that neither the probing slice below nor a dask partition
    # is modified through this function
    SNL_in_SOD_withchars = SNL_in_SOD_withchars.copy()

    # Names present in SOD, cached per year so that SOD is filtered once per year
    # instead of once per deal; a set makes each membership test O(1)
    names_by_year = {}
    for idx,row in SNL_in_SOD_withchars.iterrows():
        prior_year = row['year']-1
        if prior_year not in names_by_year:
            names_by_year[prior_year] = set(SOD.loc[SOD['year']==prior_year,'name'])
        names = names_by_year[prior_year]
        if (row['Target'] in names) and (row['Buyer'] in names):
            SNL_in_SOD_withchars.at[idx,'both_active'] = True
    return SNL_in_SOD_withchars

output_columns = proc_list(SNL_in_SOD_withchars[:3]).columns # Process the first rows to get columns
SNL_in_SOD_withchars_dd = dd.from_pandas(SNL_in_SOD_withchars, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    SNL_in_SOD_withchars = SNL_in_SOD_withchars_dd.map_partitions(proc_list,
        meta=pd.DataFrame(columns=output_columns)).compute()


CPU times: user 10.3 s, sys: 2.89 s, total: 13.2 s
Wall time: 55.6 s


In [4]:
# M&As where both sides are active before merger and have market overlap in terms of CSA
def proc_list(SNL_in_SOD_withchars):
    """Flag deals whose Target and Buyer share at least one CSA in the year before the deal.

    Parameters
    ----------
    SNL_in_SOD_withchars : pandas.DataFrame
        Deals with columns 'Target', 'Buyer', 'year' and an existing
        'both_active_overlap_CSA' column. Reads the notebook-level frame `SOD`
        (columns 'year', 'name', 'CSA Code').

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which 'both_active_overlap_CSA' is set to True when
        some CSA (missing codes ignored) contains SOD rows for both the Target and
        the Buyer in year-1. The frame passed in is left untouched.
    """
    # Work on a copy so that neither the probing slice below nor a dask partition
    # is modified through this function
    SNL_in_SOD_withchars = SNL_in_SOD_withchars.copy()

    # For each year: bank name -> set of CSA codes where the bank has SOD rows.
    # Built once per year, replacing a scan over every CSA for every deal.
    CSAs_by_name_by_year = {}
    for idx,row in SNL_in_SOD_withchars.iterrows():
        prior_year = row['year']-1
        if prior_year not in CSAs_by_name_by_year:
            SOD_oneyear = SOD[SOD['year']==prior_year]
            CSAs_by_name = {}
            for name,CSA in zip(SOD_oneyear['name'],SOD_oneyear['CSA Code']):
                # Rows without a CSA code cannot create an overlap
                if str(CSA)!='nan':
                    CSAs_by_name.setdefault(name,set()).add(CSA)
            CSAs_by_name_by_year[prior_year] = CSAs_by_name
        CSAs_by_name = CSAs_by_name_by_year[prior_year]

        Target_CSAs = CSAs_by_name.get(row['Target'],set())
        Buyer_CSAs = CSAs_by_name.get(row['Buyer'],set())
        # If for any CSA there is overlap, then there is overlap
        if not Target_CSAs.isdisjoint(Buyer_CSAs):
            SNL_in_SOD_withchars.at[idx,'both_active_overlap_CSA'] = True
    return SNL_in_SOD_withchars

output_columns = proc_list(SNL_in_SOD_withchars[:3]).columns # Process the first rows to get columns
SNL_in_SOD_withchars_dd = dd.from_pandas(SNL_in_SOD_withchars, npartitions=20)
with dask.config.set(scheduler='processes',num_workers=20):
    SNL_in_SOD_withchars = SNL_in_SOD_withchars_dd.map_partitions(proc_list,
        meta=pd.DataFrame(columns=output_columns)).compute()


In [5]:
# Count the flagged deals and format the counts with thousands separators
n_CBMA_both_active = format((SNL_in_SOD_withchars['both_active']==True).sum(),',')
n_CBMA_both_active_overlap_CSA = format((SNL_in_SOD_withchars['both_active_overlap_CSA']==True).sum(),',')

# Number: Number of M&As where both sides are active #
# Number: Number of M&As where both sides are active and have geographic overlap #
for tex_path,tex_number in [
    ('../Draft/nums/n_CBMA_both_active.tex',n_CBMA_both_active),
    ('../Draft/nums/n_CBMA_both_active_overlap_CSA.tex',n_CBMA_both_active_overlap_CSA),
    ]:
    with open(tex_path,'w') as file:
        file.write(str(tex_number))

In [6]:
# Display the count of deals with CSA overlap (already formatted as a string above)
n_CBMA_both_active_overlap_CSA

'1,424'

# 3. Identify CB Merger Episode

## 3.1 Using CSA

In [7]:
#--------------------------------------------------------------#
# CB merger episodes by CSA, for Delta HHI > 20 / 30 / 50 / 100 #
#--------------------------------------------------------------#

# Note that deposits are not noisy. Use HHI implied by just one year.

def find_CB_CSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff,first_year=1995,last_year=2022):
    """Identify commercial-bank merger episodes within each CSA.

    For each CSA and each candidate year, the deposit HHI of year-1 is compared
    with the HHI obtained after relabelling every Target's deposits to its Buyer,
    using the deals completed in [year, year+3] whose Target and Buyer both have
    SOD rows in that CSA in year-1. When the implied increase exceeds
    `hhi_dif_cutoff`, an episode starting in `year` is recorded and the following
    four years are skipped for that CSA.

    Parameters
    ----------
    SOD : pandas.DataFrame
        Branch-level deposits with columns 'CSA Code', 'year', 'name', 'DEPSUMBR'.
    SNL_in_SOD : pandas.DataFrame
        Deals with columns 'Target', 'Buyer', 'year'.
    hhi_dif_cutoff : float
        Minimum implied HHI increase, with HHI computed from shares in [0,1]
        (e.g. 0.002 is the "Delta HHI > 20" version).
    first_year, last_year : int
        Inclusive range of candidate episode start years.

    Returns
    -------
    pandas.DataFrame
        One row per episode with columns 'CSA Code', 'episode_start_year', 'hhi_dif'.
        The columns are present even when no episode is found.
    """
    CB_episodes = []
    CSAs = [item for item in SOD['CSA Code'].unique() if str(item)!='nan']
    for CSA in CSAs:

        # Filter to this CSA once rather than twice for every year
        SOD_CSA = SOD[SOD['CSA Code']==CSA]
        episode_start_year = 1900

        for year in range(first_year,last_year+1):

            # If this year is still within the last merger episode
            if year<=episode_start_year+4:
                continue

            SOD_CSA_prior = SOD_CSA[SOD_CSA['year']==year-1]

            # Get SNL deals of interest: both sides present in this CSA in the prior year
            Banks_in_SOD = list(SOD_CSA_prior['name'].unique())
            SNL_in_SOD_relevant = SNL_in_SOD[
                (SNL_in_SOD['Target'].isin(Banks_in_SOD))
                &(SNL_in_SOD['Buyer'].isin(Banks_in_SOD))
                &(SNL_in_SOD['year']>=year)
                &(SNL_in_SOD['year']<=year+3)]
            if len(SNL_in_SOD_relevant)==0:
                continue

            # HHI in the year prior to M&A.
            # 'sum' is passed as a string: the builtin `sum` triggers a FutureWarning
            # on recent pandas, which this notebook escalates to an exception.
            SOD_prior = SOD_CSA_prior.groupby('name').agg({'DEPSUMBR':'sum'}).reset_index()
            hhi_prior = np.sum((SOD_prior['DEPSUMBR']/np.sum(SOD_prior['DEPSUMBR']))**2)

            # HHI implied by the deals: move each target's deposits to its buyer
            # (deals are applied one by one, in the row order of SNL_in_SOD)
            for idx,row in SNL_in_SOD_relevant.iterrows():
                SOD_prior.loc[SOD_prior['name']==row['Target'],'name'] = row['Buyer']
            SOD_post = SOD_prior.groupby('name').agg({'DEPSUMBR':'sum'})
            hhi_post = np.sum((SOD_post['DEPSUMBR']/np.sum(SOD_post['DEPSUMBR']))**2)

            hhi_dif = hhi_post-hhi_prior
            if hhi_dif>hhi_dif_cutoff:
                episode_start_year = year
                CB_episodes.append({'CSA Code':CSA,'episode_start_year':year,'hhi_dif':hhi_dif})

    return pd.DataFrame(CB_episodes,columns=['CSA Code','episode_start_year','hhi_dif'])

# List of CSA codes in SOD (kept as a notebook-level variable)
CSAs = SOD['CSA Code'].unique()
CSAs = [item for item in CSAs if str(item)!='nan']

# Version 1: CSA, Delta HHI > 20
CB_CSA_episodes_DeltaHHI20 = find_CB_CSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.002)

# Version 2: CSA, Delta HHI > 30
CB_CSA_episodes_DeltaHHI30 = find_CB_CSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.003)

# Version 3: CSA, Delta HHI > 50
CB_CSA_episodes_DeltaHHI50 = find_CB_CSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.005)

# Version 4: CSA, Delta HHI > 100
CB_CSA_episodes_DeltaHHI100 = find_CB_CSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.01)

In [8]:
# Number: Number of within-market CB M&As #
# (row count of the Delta HHI > 100 CSA episodes, with thousands separators)
n_CB_CSA_episodes_DeltaHHI100 = format(len(CB_CSA_episodes_DeltaHHI100),',')
with open('../Draft/nums/n_CB_CSA_episodes_DeltaHHI100.tex','w') as file:
    file.write(n_CB_CSA_episodes_DeltaHHI100)

## 3.2 Using CBSA

In [9]:
#---------------------------------------------------------------#
# CB merger episodes by CBSA, for Delta HHI > 20 / 30 / 50 / 100 #
#---------------------------------------------------------------#

# Note that deposits are not noisy. Use HHI implied by just one year.

def find_CB_CBSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff,first_year=1995,last_year=2022):
    """Identify commercial-bank merger episodes within each CBSA.

    For each CBSA and each candidate year, the deposit HHI of year-1 is compared
    with the HHI obtained after relabelling every Target's deposits to its Buyer,
    using the deals completed in [year, year+3] whose Target and Buyer both have
    SOD rows in that CBSA in year-1. When the implied increase exceeds
    `hhi_dif_cutoff`, an episode starting in `year` is recorded and the following
    four years are skipped for that CBSA.

    Parameters
    ----------
    SOD : pandas.DataFrame
        Branch-level deposits with columns 'CBSA Code', 'year', 'name', 'DEPSUMBR'.
    SNL_in_SOD : pandas.DataFrame
        Deals with columns 'Target', 'Buyer', 'year'.
    hhi_dif_cutoff : float
        Minimum implied HHI increase, with HHI computed from shares in [0,1]
        (e.g. 0.002 is the "Delta HHI > 20" version).
    first_year, last_year : int
        Inclusive range of candidate episode start years.

    Returns
    -------
    pandas.DataFrame
        One row per episode with columns 'CBSA Code', 'episode_start_year', 'hhi_dif'.
        The columns are present even when no episode is found.
    """
    CB_episodes = []
    CBSAs = [item for item in SOD['CBSA Code'].unique() if str(item)!='nan']
    for CBSA in CBSAs:

        # Filter to this CBSA once rather than twice for every year
        SOD_CBSA = SOD[SOD['CBSA Code']==CBSA]
        episode_start_year = 1900

        for year in range(first_year,last_year+1):

            # If this year is still within the last merger episode
            if year<=episode_start_year+4:
                continue

            SOD_CBSA_prior = SOD_CBSA[SOD_CBSA['year']==year-1]

            # Get SNL deals of interest: both sides present in this CBSA in the prior year
            Banks_in_SOD = list(SOD_CBSA_prior['name'].unique())
            SNL_in_SOD_relevant = SNL_in_SOD[
                (SNL_in_SOD['Target'].isin(Banks_in_SOD))
                &(SNL_in_SOD['Buyer'].isin(Banks_in_SOD))
                &(SNL_in_SOD['year']>=year)
                &(SNL_in_SOD['year']<=year+3)]
            if len(SNL_in_SOD_relevant)==0:
                continue

            # HHI in the year prior to M&A.
            # 'sum' is passed as a string: the builtin `sum` triggers a FutureWarning
            # on recent pandas, which this notebook escalates to an exception.
            SOD_prior = SOD_CBSA_prior.groupby('name').agg({'DEPSUMBR':'sum'}).reset_index()
            hhi_prior = np.sum((SOD_prior['DEPSUMBR']/np.sum(SOD_prior['DEPSUMBR']))**2)

            # HHI implied by the deals: move each target's deposits to its buyer
            # (deals are applied one by one, in the row order of SNL_in_SOD)
            for idx,row in SNL_in_SOD_relevant.iterrows():
                SOD_prior.loc[SOD_prior['name']==row['Target'],'name'] = row['Buyer']
            SOD_post = SOD_prior.groupby('name').agg({'DEPSUMBR':'sum'})
            hhi_post = np.sum((SOD_post['DEPSUMBR']/np.sum(SOD_post['DEPSUMBR']))**2)

            hhi_dif = hhi_post-hhi_prior
            if hhi_dif>hhi_dif_cutoff:
                episode_start_year = year
                CB_episodes.append({'CBSA Code':CBSA,'episode_start_year':year,'hhi_dif':hhi_dif})

    return pd.DataFrame(CB_episodes,columns=['CBSA Code','episode_start_year','hhi_dif'])

# List of CBSA codes in SOD (kept as a notebook-level variable)
CBSAs = SOD['CBSA Code'].unique()
CBSAs = [item for item in CBSAs if str(item)!='nan']

# Version 1: CBSA, Delta HHI > 20
CB_CBSA_episodes_DeltaHHI20 = find_CB_CBSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.002)

# Version 2: CBSA, Delta HHI > 30
CB_CBSA_episodes_DeltaHHI30 = find_CB_CBSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.003)

# Version 3: CBSA, Delta HHI > 50
CB_CBSA_episodes_DeltaHHI50 = find_CB_CBSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.005)

# Version 4: CBSA, Delta HHI > 100
CB_CBSA_episodes_DeltaHHI100 = find_CB_CBSA_episodes(SOD,SNL_in_SOD,hhi_dif_cutoff=0.01)

## 3.3 Apply the restriction criteria: No significant IB mergers

In [10]:
def drop_contaminated_CSA_episodes(episodes,CSA_affected):
    """Remove CB merger episodes whose CSA also had a sizeable underwriter M&A nearby in time.

    An episode is flagged as contaminated when `CSA_affected` contains, for the same
    'CSA Code', a row with acquiror_market_share_N_avg > 0.01 and
    target_market_share_N_avg + other_targets_market_share_N_avg > 0.01 whose
    'sale_year' lies in [episode_start_year-1, episode_start_year+4].

    Parameters
    ----------
    episodes : pandas.DataFrame
        Episodes with columns 'CSA Code' and 'episode_start_year'.
    CSA_affected : pandas.DataFrame
        Rows with columns 'CSA Code', 'sale_year' and the three market-share
        columns named above.

    Returns
    -------
    pandas.DataFrame
        A copy of the uncontaminated episodes, with an 'if_contaminated' column
        (False on every remaining row). `episodes` itself is not modified.
    """
    episodes = episodes.copy()
    episodes['if_contaminated'] = False
    if len(episodes)==0:
        return episodes

    # The market-share condition does not depend on the episode: apply it once
    CSA_affected_sig = CSA_affected[(CSA_affected['acquiror_market_share_N_avg']>0.01)&
        (CSA_affected['target_market_share_N_avg']+CSA_affected['other_targets_market_share_N_avg']>0.01)]
    affected_years_by_CSA = {}
    for CSA,sale_year in zip(CSA_affected_sig['CSA Code'],CSA_affected_sig['sale_year']):
        affected_years_by_CSA.setdefault(CSA,set()).add(sale_year)

    if_contaminated = []
    for CSA,episode_start_year in zip(episodes['CSA Code'],episodes['episode_start_year']):
        # range() needs an integer; do not rely on the dtype the column happens to have
        episode_start_year = int(episode_start_year)
        # Note that market share is on a rolling basis of every three years, so I only need to start from year -1
        episode_years = set(range(episode_start_year-1,episode_start_year+5))
        affected_years = affected_years_by_CSA.get(CSA,set())
        if_contaminated.append(len(episode_years.intersection(affected_years))>0)
    episodes['if_contaminated'] = if_contaminated

    return episodes[~episodes['if_contaminated']]


CB_CSA_episodes_DeltaHHI20 = drop_contaminated_CSA_episodes(CB_CSA_episodes_DeltaHHI20,CSA_affected)

CB_CSA_episodes_DeltaHHI30 = drop_contaminated_CSA_episodes(CB_CSA_episodes_DeltaHHI30,CSA_affected)

CB_CSA_episodes_DeltaHHI50 = drop_contaminated_CSA_episodes(CB_CSA_episodes_DeltaHHI50,CSA_affected)

CB_CSA_episodes_DeltaHHI100 = drop_contaminated_CSA_episodes(CB_CSA_episodes_DeltaHHI100,CSA_affected)

In [11]:
def drop_contaminated_CBSA_episodes(episodes,CBSA_affected):
    """Remove CB merger episodes whose CBSA also had a sizeable underwriter M&A nearby in time.

    An episode is flagged as contaminated when `CBSA_affected` contains, for the same
    'CBSA Code', a row with acquiror_market_share_N_avg > 0.01 and
    target_market_share_N_avg + other_targets_market_share_N_avg > 0.01 whose
    'sale_year' lies in [episode_start_year-1, episode_start_year+4].

    Parameters
    ----------
    episodes : pandas.DataFrame
        Episodes with columns 'CBSA Code' and 'episode_start_year'.
    CBSA_affected : pandas.DataFrame
        Rows with columns 'CBSA Code', 'sale_year' and the three market-share
        columns named above.

    Returns
    -------
    pandas.DataFrame
        A copy of the uncontaminated episodes, with an 'if_contaminated' column
        (False on every remaining row). `episodes` itself is not modified.
    """
    episodes = episodes.copy()
    episodes['if_contaminated'] = False
    if len(episodes)==0:
        return episodes

    # The market-share condition does not depend on the episode: apply it once
    CBSA_affected_sig = CBSA_affected[(CBSA_affected['acquiror_market_share_N_avg']>0.01)&
        (CBSA_affected['target_market_share_N_avg']+CBSA_affected['other_targets_market_share_N_avg']>0.01)]
    affected_years_by_CBSA = {}
    for CBSA,sale_year in zip(CBSA_affected_sig['CBSA Code'],CBSA_affected_sig['sale_year']):
        affected_years_by_CBSA.setdefault(CBSA,set()).add(sale_year)

    if_contaminated = []
    for CBSA,episode_start_year in zip(episodes['CBSA Code'],episodes['episode_start_year']):
        # range() needs an integer; do not rely on the dtype the column happens to have
        episode_start_year = int(episode_start_year)
        # Note that market share is on a rolling basis of every three years, so I only need to start from year -1
        episode_years = set(range(episode_start_year-1,episode_start_year+5))
        affected_years = affected_years_by_CBSA.get(CBSA,set())
        if_contaminated.append(len(episode_years.intersection(affected_years))>0)
    episodes['if_contaminated'] = if_contaminated

    return episodes[~episodes['if_contaminated']]


CB_CBSA_episodes_DeltaHHI20 = drop_contaminated_CBSA_episodes(CB_CBSA_episodes_DeltaHHI20,CBSA_affected)

CB_CBSA_episodes_DeltaHHI30 = drop_contaminated_CBSA_episodes(CB_CBSA_episodes_DeltaHHI30,CBSA_affected)

CB_CBSA_episodes_DeltaHHI50 = drop_contaminated_CBSA_episodes(CB_CBSA_episodes_DeltaHHI50,CBSA_affected)

CB_CBSA_episodes_DeltaHHI100 = drop_contaminated_CBSA_episodes(CB_CBSA_episodes_DeltaHHI100,CBSA_affected)

# 4. Assemble a Treatment-Control Matched Sample

In [12]:
# There are multiple versions of the episode definition (by market share or HHI, cutoff on implied HHI increases, etc.).
# I go over each version here.
#
# Each entry is [episodes DataFrame, N_matches, file path]:
#   - N_matches is the number of control CSAs collected per episode in the loop below
#   - the file path is read into `file_path` below (presumably the output location — its use is not shown here)

episodes_files = [
    [CB_CSA_episodes_DeltaHHI20,1,
        '../CleanData/MAEvent/CB_CSA_episodes_DeltaHHI20.csv',
    ],
    [CB_CSA_episodes_DeltaHHI30,1,
        '../CleanData/MAEvent/CB_CSA_episodes_DeltaHHI30.csv',
    ],
    [CB_CSA_episodes_DeltaHHI50,1,
        '../CleanData/MAEvent/CB_CSA_episodes_DeltaHHI50.csv',
    ],
    [CB_CSA_episodes_DeltaHHI100,1,
        '../CleanData/MAEvent/CB_CSA_episodes_DeltaHHI100.csv',
    ],
    ]

for episodes_file in episodes_files:

    episodes = episodes_file[0].copy()
    N_matches = episodes_file[1]
    file_path = episodes_file[2]

    ########################################
    # Find control for each merger episode #
    ########################################
    
    # State demographics to be used in merger
    CSA_POP = pd.read_csv("../CleanData/Demographics/0C_CSA_Pop.csv")
    CSA_INC = pd.read_csv("../CleanData/Demographics/0C_CSA_Inc.csv")
    CSA_Data = CSA_POP.merge(CSA_INC,on=['CSA Code','year'])
    CSA_Data = CSA_Data[['CSA Code','year','inc','pop']]
    Same_State_CSA_pairs = pd.read_csv("../CleanData/Demographics/0C_Same_State_CSA_pairs.csv")
    
    def calculate_distance(row,weightingmat):
        return sp.spatial.distance.mahalanobis((row['inc'],row['pop']),\
            (row['treated_inc'],row['treated_pop']),weightingmat)
    
    # For every episode, pick the N_matches CSAs closest to the treated CSA in (inc, pop) Mahalanobis
    # distance, skipping the treated CSA itself, CSAs paired with it in Same_State_CSA_pairs, and CSAs
    # that have a material merger event within [-4,+4] years of the episode start.
    episodes['control'] = None

    # Merger events large enough to disqualify a CSA from serving as a control. The thresholds do not
    # depend on the episode or on the candidate, so this filter is applied once.
    CSA_affected_material = CSA_affected[(CSA_affected['acquiror_market_share_N_avg']>0.01)&
        (CSA_affected['target_market_share_N_avg']+CSA_affected['other_targets_market_share_N_avg']>0.01)]

    for idx,row in episodes.iterrows():

        # Demographics of every CSA in the episode's start year
        CSA_Data_oneyear = CSA_Data[CSA_Data['year']==row['episode_start_year']].copy()

        # Demographic data of the treated CSA; without it no distance can be computed and the
        # episode keeps control=None
        CSA_Data_oneyear_frag = CSA_Data_oneyear[CSA_Data_oneyear['CSA Code']==row['CSA Code']]
        if len(CSA_Data_oneyear_frag)==0:
            continue
        episode_pop = CSA_Data_oneyear_frag['pop'].iloc[0]
        episode_inc = CSA_Data_oneyear_frag['inc'].iloc[0]

        # Find a match
        CSA_Data_oneyear['treated_pop'] = episode_pop
        CSA_Data_oneyear['treated_inc'] = episode_inc
        # Get weighting matrix
        # NOTE(review): the winsorised 'inc'/'pop' overwrite the raw columns, so the distances below
        # compare winsorised candidate values with the treated CSA's raw values - confirm this is intended.
        CSA_Data_oneyear['inc'] = winsor2(CSA_Data_oneyear['inc'],cutoffs=[0.05,0.05])
        CSA_Data_oneyear['pop'] = winsor2(CSA_Data_oneyear['pop'],cutoffs=[0.05,0.05])
        cov = CSA_Data_oneyear[['inc','pop']].cov()
        invcov = np.linalg.inv(cov)
        CSA_Data_oneyear['dist'] = CSA_Data_oneyear.apply(calculate_distance, axis=1,weightingmat=invcov)
        CSA_Data_oneyear = CSA_Data_oneyear.sort_values('dist').reset_index(drop=True)

        # CSAs that are themselves treated at some point in the event window
        window_years = list(range(row['episode_start_year']-4,row['episode_start_year']+5))
        treated_in_window = CSA_affected_material.loc[
            CSA_affected_material['sale_year'].isin(window_years),'CSA Code'].unique()
        # Other CSAs in the same state
        Same_State_CSAs = list(Same_State_CSA_pairs[Same_State_CSA_pairs['CSA_1']==row['CSA Code']]['CSA_2'])

        # Eligible candidates, still ordered from nearest to farthest
        is_eligible = (
            (CSA_Data_oneyear['CSA Code']!=row['CSA Code'])&
            (~CSA_Data_oneyear['CSA Code'].isin(Same_State_CSAs))&
            (~CSA_Data_oneyear['CSA Code'].isin(treated_in_window))
            )
        control = CSA_Data_oneyear.loc[is_eligible,'CSA Code'].tolist()
        # A non-positive N_matches places no cap on the number of controls
        if N_matches>0:
            control = control[:N_matches]

        # Leave 'control' as None when nothing qualifies, so that the episode is counted and
        # excluded below instead of lingering with an empty list
        if len(control)>0:
            episodes.at[idx,'control'] = control

    # Exclude cases where a match cannot be found
    print('A control cannot be found for '+str(np.sum(pd.isnull(episodes['control'])))+' episodes.')
    episodes = episodes[~pd.isnull(episodes['control'])]


    #############################################
    # Expand to include an event time dimension #
    #############################################
    
    # Give every episode the nine event-time offsets -4..+4 (the column is added to `episodes` itself),
    # then unfold them so that each row is one (episode, event year) pair
    episodes['year_to_merger'] = [list(range(-4,5))]*len(episodes)
    episodes_Exploded = episodes.explode('year_to_merger')
    # Calendar year corresponding to each event-time offset
    episodes_Exploded['calendar_year'] = (
        episodes_Exploded['episode_start_year']+episodes_Exploded['year_to_merger'])

    
    ################################
    # Assemble a regression sample #
    ################################

    #------------------------#
    # Issue level, using GPF #
    #------------------------#

    # Build the issue-level sample: for every (episode, event year) take the GPF issues of the treated
    # CSA and of each of its control CSAs in that calendar year, tag them, and keep the pair only when
    # both sides have at least one issue.
    reg_sample = []

    # Issue characteristics carried from GPF into the regression sample
    issue_cols = [
        'CSA Code','sale_year','State','County',
        'issuer_type','Issuer',
        'avg_maturity','amount',
        'avg_yield','treasury_avg_spread','MMA_avg_spread',
        'gross_spread','gross_spread_tic_based','gross_spread_nic_based',
        'mod_tic','mod_tic_spread_treasury','mod_tic_spread_MMA',
        'mod_tic_timeFE','mod_tic_spread_treasury_timeFE','mod_tic_spread_MMA_timeFE',
        'underpricing_15to60','underpricing_15to30',
        'Bid','taxable_code','security_type','if_advisor','if_dual_advisor','if_refunding',
        'amount_bracket','mat_bracket','use_short','has_ratings',
        'use_of_proceeds_BB','use_of_proceeds_main','use_of_proceeds_general',
        'has_Moodys','has_Fitch','rating_Moodys','rating_Fitch','insured_amount',
        'AdvisorFeeRatio_hat','CRFeeRatio_hat','InsureFeeRatio_hat',
        'AdvisorFeeRatio_hat_model_timeFE','CRFeeRatio_hat_model_timeFE','InsureFeeRatio_hat_model_timeFE',
        'if_callable','CB_Eligible',
        'num_relationship',
        ]+name_GPF_colnames+parent_name_GPF_colnames
    # Underwriter (and underwriter-parent) name columns compared against the merging banks
    underwriter_cols = name_GPF_colnames+parent_name_GPF_colnames
    merger_cols = ['acquiror','target','acquiror_parent','target_parent',
        'acquiror_market_share_N_avg','target_market_share_N_avg','other_targets_market_share_N_avg']

    for idx,row in episodes_Exploded.iterrows():

        # An episode without controls contributes nothing to the sample
        if row['control'] is None:
            continue

        # Event characteristics - strength. Dollar-based shares take precedence over count-based ones;
        # a measure that this version of the episodes does not carry is recorded as None.
        market_share_avg = {}
        for party in ['acquiror','target','other_targets']:
            if party+'_market_share_Dollar_avg' in episodes_Exploded.columns:
                market_share_avg[party] = row[party+'_market_share_Dollar_avg']
            else:
                market_share_avg[party] = row.get(party+'_market_share_N_avg')
        hhi_dif = row.get('hhi_dif')
        max_sum_share = row.get('max_sum_share')
        max_min_share = row.get('max_min_share')
        mean_sum_share = row.get('mean_sum_share')

        # The treated CSA first, followed by its controls; both kinds of area are processed identically
        # up to the treatment indicator and the strength measures
        areas = [(row['CSA Code'],1)]+[(item,0) for item in row['control']]
        GPF_Seg_Treated = None
        control_segments = []
        for area_code,treated in areas:

            GPF_Seg = GPF.loc[
                (GPF['sale_year']==row['calendar_year'])&(GPF['CSA Code']==area_code),
                issue_cols].copy()

            #------------------------------------#
            # Some cross-sectional heterogeneity #
            #------------------------------------#

            # Mergers with positive acquiror and target shares that affect this particular area
            # (the control area itself for controls) at any time in [-4,+4] around the episode start
            mergers = CSA_affected.loc[
                (CSA_affected['CSA Code']==area_code)&
                (CSA_affected['sale_year']>=row['episode_start_year']-4)&
                (CSA_affected['sale_year']<=row['episode_start_year']+4),
                merger_cols]
            mergers = mergers[(mergers['acquiror_market_share_N_avg']>0)&
                (mergers['target_market_share_N_avg']+mergers['other_targets_market_share_N_avg']>0)]
            # Missing names are dropped first: isin() treats NaN as equal to NaN, so a merger with a
            # missing (parent) name would otherwise flag every issue that has an empty underwriter slot
            target_names = [name for name in
                list(mergers['target'])+list(mergers['target_parent']) if pd.notnull(name)]
            acquiror_names = [name for name in
                list(mergers['acquiror'])+list(mergers['acquiror_parent']) if pd.notnull(name)]
            # Whether any underwriter (or its parent) is the target bank in M&A
            GPF_Seg['bank_is_target'] = GPF_Seg[underwriter_cols].isin(target_names).any(axis=1)
            # Whether any underwriter (or its parent) is the acquiror bank in M&A
            GPF_Seg['bank_is_acquiror'] = GPF_Seg[underwriter_cols].isin(acquiror_names).any(axis=1)

            GPF_Seg['treated'] = treated
            GPF_Seg['episode_start_year'] = row['episode_start_year']
            GPF_Seg['year_to_merger'] = row['year_to_merger']
            GPF_Seg['calendar_year'] = row['calendar_year']
            GPF_Seg['treated_csa'] = row['CSA Code'] # Used for constructing cohort X issuer FEs
            if treated==1:
                # Strength measures are attached to treated observations only
                GPF_Seg['acquiror_market_share_avg'] = market_share_avg['acquiror']
                GPF_Seg['target_market_share_avg'] = market_share_avg['target']
                GPF_Seg['other_targets_market_share_avg'] = market_share_avg['other_targets']
                GPF_Seg['hhi_dif'] = hhi_dif
                GPF_Seg['max_sum_share'] = max_sum_share
                GPF_Seg['max_min_share'] = max_min_share
                GPF_Seg['mean_sum_share'] = mean_sum_share
                GPF_Seg_Treated = GPF_Seg
            else:
                GPF_Seg['hhi_dif'] = hhi_dif
                # Collect non-empty control segments in a list and concatenate once, rather than growing
                # an empty placeholder frame (a pattern that newer pandas versions deprecate)
                if len(GPF_Seg)>0:
                    control_segments.append(GPF_Seg)

        if len(GPF_Seg_Treated)>0 and len(control_segments)>0:
            GPF_Seg_Control = pd.concat(control_segments)
            reg_sample.extend([GPF_Seg_Treated,GPF_Seg_Control])

    # Stack all treated/control segments into a single issue-level table
    reg_sample = pd.concat(reg_sample)

    # County-by-year demographics, with 'year' renamed so that it lines up with the sample's calendar year
    County_Composite = pd.read_csv("../CleanData/Demographics/0C_County_Composite.csv")
    County_Composite = County_Composite.loc[:,['year','State','County','black_ratio','pop']]
    County_Composite = County_Composite.rename(columns={'year':'calendar_year'})

    # Outer merge, then discard the county-years that matched no issue in the sample
    reg_sample = reg_sample.merge(County_Composite,how='outer',indicator=True,
        on=['State','County','calendar_year'])
    county_only_rows = reg_sample['_merge']=='right_only'
    reg_sample = reg_sample[~county_only_rows].drop(columns=['_merge'])
    reg_sample.to_csv(file_path)


A control cannot be found for 0 episodes.
A control cannot be found for 0 episodes.
A control cannot be found for 0 episodes.
A control cannot be found for 0 episodes.


In [13]:
# Episodes can be defined in several ways (by market share or by HHI, with different cutoffs on the implied
# HHI increase, etc.). Every version listed here is processed in turn by the loop below.
# Each entry is [episodes DataFrame, number of control areas matched per episode, path of the output csv].

episodes_files = [
    [CB_CBSA_episodes_DeltaHHI20, 1, '../CleanData/MAEvent/CB_CBSA_episodes_DeltaHHI20.csv'],
    [CB_CBSA_episodes_DeltaHHI30, 1, '../CleanData/MAEvent/CB_CBSA_episodes_DeltaHHI30.csv'],
    [CB_CBSA_episodes_DeltaHHI50, 1, '../CleanData/MAEvent/CB_CBSA_episodes_DeltaHHI50.csv'],
    [CB_CBSA_episodes_DeltaHHI100, 1, '../CleanData/MAEvent/CB_CBSA_episodes_DeltaHHI100.csv'],
]

# CBSA demographics (population and income by year) used to match each treated CBSA to controls, and the
# list of CBSA pairs located in the same state. None of these tables is modified inside the loop, so they
# are read once here rather than once per version of the episodes.
CBSA_POP = pd.read_csv("../CleanData/Demographics/0C_CBSA_Pop.csv")
CBSA_INC = pd.read_csv("../CleanData/Demographics/0C_CBSA_Inc.csv")
CBSA_Data = CBSA_POP.merge(CBSA_INC,on=['CBSA Code','year'])
CBSA_Data = CBSA_Data[['CBSA Code','year','inc','pop']]
Same_State_CBSA_pairs = pd.read_csv("../CleanData/Demographics/0C_Same_State_CBSA_pairs.csv")

for episodes_file in episodes_files:

    # Work on a copy so that the episodes table defined earlier is left untouched
    episodes = episodes_file[0].copy()
    N_matches = episodes_file[1]
    file_path = episodes_file[2]

    ########################################
    # Find control for each merger episode #
    ########################################
    
    def calculate_distance(row,weightingmat):
        """Mahalanobis distance between a candidate area and the treated area.

        row: mapping/Series holding the candidate's 'inc' and 'pop' together with the
            treated area's 'treated_inc' and 'treated_pop'.
        weightingmat: matrix passed to scipy as the inverse covariance of (inc, pop).
        """
        candidate_point = (row['inc'],row['pop'])
        treated_point = (row['treated_inc'],row['treated_pop'])
        return sp.spatial.distance.mahalanobis(candidate_point,treated_point,weightingmat)
    
    # For every episode, pick the N_matches CBSAs closest to the treated CBSA in (inc, pop) Mahalanobis
    # distance, skipping the treated CBSA itself, CBSAs paired with it in Same_State_CBSA_pairs, and CBSAs
    # that have a material merger event within [-4,+4] years of the episode start.
    episodes['control'] = None

    # Merger events large enough to disqualify a CBSA from serving as a control. The thresholds do not
    # depend on the episode or on the candidate, so this filter is applied once.
    CBSA_affected_material = CBSA_affected[(CBSA_affected['acquiror_market_share_N_avg']>0.01)&
        (CBSA_affected['target_market_share_N_avg']+CBSA_affected['other_targets_market_share_N_avg']>0.01)]

    for idx,row in episodes.iterrows():

        # Demographics of every CBSA in the episode's start year
        CBSA_Data_oneyear = CBSA_Data[CBSA_Data['year']==row['episode_start_year']].copy()

        # Demographic data of the treated CBSA; without it no distance can be computed and the
        # episode keeps control=None
        CBSA_Data_oneyear_frag = CBSA_Data_oneyear[CBSA_Data_oneyear['CBSA Code']==row['CBSA Code']]
        if len(CBSA_Data_oneyear_frag)==0:
            continue
        episode_pop = CBSA_Data_oneyear_frag['pop'].iloc[0]
        episode_inc = CBSA_Data_oneyear_frag['inc'].iloc[0]

        # Find a match
        CBSA_Data_oneyear['treated_pop'] = episode_pop
        CBSA_Data_oneyear['treated_inc'] = episode_inc
        # Get weighting matrix
        # NOTE(review): the winsorised 'inc'/'pop' overwrite the raw columns, so the distances below
        # compare winsorised candidate values with the treated CBSA's raw values - confirm this is intended.
        CBSA_Data_oneyear['inc'] = winsor2(CBSA_Data_oneyear['inc'],cutoffs=[0.05,0.05])
        CBSA_Data_oneyear['pop'] = winsor2(CBSA_Data_oneyear['pop'],cutoffs=[0.05,0.05])
        cov = CBSA_Data_oneyear[['inc','pop']].cov()
        invcov = np.linalg.inv(cov)
        CBSA_Data_oneyear['dist'] = CBSA_Data_oneyear.apply(calculate_distance, axis=1,weightingmat=invcov)
        CBSA_Data_oneyear = CBSA_Data_oneyear.sort_values('dist').reset_index(drop=True)

        # CBSAs that are themselves treated at some point in the event window
        window_years = list(range(row['episode_start_year']-4,row['episode_start_year']+5))
        treated_in_window = CBSA_affected_material.loc[
            CBSA_affected_material['sale_year'].isin(window_years),'CBSA Code'].unique()
        # Other CBSAs in the same state
        Same_State_CBSAs = list(Same_State_CBSA_pairs[Same_State_CBSA_pairs['CBSA_1']==row['CBSA Code']]['CBSA_2'])

        # Eligible candidates, still ordered from nearest to farthest
        is_eligible = (
            (CBSA_Data_oneyear['CBSA Code']!=row['CBSA Code'])&
            (~CBSA_Data_oneyear['CBSA Code'].isin(Same_State_CBSAs))&
            (~CBSA_Data_oneyear['CBSA Code'].isin(treated_in_window))
            )
        control = CBSA_Data_oneyear.loc[is_eligible,'CBSA Code'].tolist()
        # A non-positive N_matches places no cap on the number of controls
        if N_matches>0:
            control = control[:N_matches]

        # Leave 'control' as None when nothing qualifies, so that the episode is counted and
        # excluded below instead of lingering with an empty list
        if len(control)>0:
            episodes.at[idx,'control'] = control

    # Exclude cases where a match cannot be found
    print('A control cannot be found for '+str(np.sum(pd.isnull(episodes['control'])))+' episodes.')
    episodes = episodes[~pd.isnull(episodes['control'])]


    #############################################
    # Expand to include an event time dimension #
    #############################################
    
    # Give every episode the nine event-time offsets -4..+4 (the column is added to `episodes` itself),
    # then unfold them so that each row is one (episode, event year) pair
    episodes['year_to_merger'] = [list(range(-4,5))]*len(episodes)
    episodes_Exploded = episodes.explode('year_to_merger')
    # Calendar year corresponding to each event-time offset
    episodes_Exploded['calendar_year'] = (
        episodes_Exploded['episode_start_year']+episodes_Exploded['year_to_merger'])

    
    ################################
    # Assemble a regression sample #
    ################################

    #------------------------#
    # Issue level, using GPF #
    #------------------------#

    # Build the issue-level sample: for every (episode, event year) take the GPF issues of the treated
    # CBSA and of each of its control CBSAs in that calendar year, tag them, and keep the pair only when
    # both sides have at least one issue.
    reg_sample = []

    # Issue characteristics carried from GPF into the regression sample
    issue_cols = [
        'CBSA Code','sale_year','State','County',
        'issuer_type','Issuer',
        'avg_maturity','amount',
        'avg_yield','treasury_avg_spread','MMA_avg_spread',
        'gross_spread','gross_spread_tic_based','gross_spread_nic_based',
        'mod_tic','mod_tic_spread_treasury','mod_tic_spread_MMA',
        'mod_tic_timeFE','mod_tic_spread_treasury_timeFE','mod_tic_spread_MMA_timeFE',
        'underpricing_15to60','underpricing_15to30',
        'Bid','taxable_code','security_type','if_advisor','if_dual_advisor','if_refunding',
        'amount_bracket','mat_bracket','use_short','has_ratings',
        'use_of_proceeds_BB','use_of_proceeds_main','use_of_proceeds_general',
        'has_Moodys','has_Fitch','rating_Moodys','rating_Fitch','insured_amount',
        'AdvisorFeeRatio_hat','CRFeeRatio_hat','InsureFeeRatio_hat',
        'AdvisorFeeRatio_hat_model_timeFE','CRFeeRatio_hat_model_timeFE','InsureFeeRatio_hat_model_timeFE',
        'if_callable','CB_Eligible',
        'num_relationship',
        ]+name_GPF_colnames+parent_name_GPF_colnames
    # Underwriter (and underwriter-parent) name columns compared against the merging banks
    underwriter_cols = name_GPF_colnames+parent_name_GPF_colnames
    merger_cols = ['acquiror','target','acquiror_parent','target_parent',
        'acquiror_market_share_N_avg','target_market_share_N_avg','other_targets_market_share_N_avg']

    for idx,row in episodes_Exploded.iterrows():

        # An episode without controls contributes nothing to the sample
        if row['control'] is None:
            continue

        # Event characteristics - strength. Dollar-based shares take precedence over count-based ones;
        # a measure that this version of the episodes does not carry is recorded as None.
        market_share_avg = {}
        for party in ['acquiror','target','other_targets']:
            if party+'_market_share_Dollar_avg' in episodes_Exploded.columns:
                market_share_avg[party] = row[party+'_market_share_Dollar_avg']
            else:
                market_share_avg[party] = row.get(party+'_market_share_N_avg')
        hhi_dif = row.get('hhi_dif')
        max_sum_share = row.get('max_sum_share')
        max_min_share = row.get('max_min_share')
        mean_sum_share = row.get('mean_sum_share')

        # The treated CBSA first, followed by its controls; both kinds of area are processed identically
        # up to the treatment indicator and the strength measures
        areas = [(row['CBSA Code'],1)]+[(item,0) for item in row['control']]
        GPF_Seg_Treated = None
        control_segments = []
        for area_code,treated in areas:

            GPF_Seg = GPF.loc[
                (GPF['sale_year']==row['calendar_year'])&(GPF['CBSA Code']==area_code),
                issue_cols].copy()

            #------------------------------------#
            # Some cross-sectional heterogeneity #
            #------------------------------------#

            # Mergers with positive acquiror and target shares that affect this particular area
            # (the control area itself for controls) at any time in [-4,+4] around the episode start
            mergers = CBSA_affected.loc[
                (CBSA_affected['CBSA Code']==area_code)&
                (CBSA_affected['sale_year']>=row['episode_start_year']-4)&
                (CBSA_affected['sale_year']<=row['episode_start_year']+4),
                merger_cols]
            mergers = mergers[(mergers['acquiror_market_share_N_avg']>0)&
                (mergers['target_market_share_N_avg']+mergers['other_targets_market_share_N_avg']>0)]
            # Missing names are dropped first: isin() treats NaN as equal to NaN, so a merger with a
            # missing (parent) name would otherwise flag every issue that has an empty underwriter slot
            target_names = [name for name in
                list(mergers['target'])+list(mergers['target_parent']) if pd.notnull(name)]
            acquiror_names = [name for name in
                list(mergers['acquiror'])+list(mergers['acquiror_parent']) if pd.notnull(name)]
            # Whether any underwriter (or its parent) is the target bank in M&A
            GPF_Seg['bank_is_target'] = GPF_Seg[underwriter_cols].isin(target_names).any(axis=1)
            # Whether any underwriter (or its parent) is the acquiror bank in M&A
            GPF_Seg['bank_is_acquiror'] = GPF_Seg[underwriter_cols].isin(acquiror_names).any(axis=1)

            GPF_Seg['treated'] = treated
            GPF_Seg['episode_start_year'] = row['episode_start_year']
            GPF_Seg['year_to_merger'] = row['year_to_merger']
            GPF_Seg['calendar_year'] = row['calendar_year']
            GPF_Seg['treated_cbsa'] = row['CBSA Code'] # Used for constructing cohort X issuer FEs
            if treated==1:
                # Strength measures are attached to treated observations only
                GPF_Seg['acquiror_market_share_avg'] = market_share_avg['acquiror']
                GPF_Seg['target_market_share_avg'] = market_share_avg['target']
                GPF_Seg['other_targets_market_share_avg'] = market_share_avg['other_targets']
                GPF_Seg['hhi_dif'] = hhi_dif
                GPF_Seg['max_sum_share'] = max_sum_share
                GPF_Seg['max_min_share'] = max_min_share
                GPF_Seg['mean_sum_share'] = mean_sum_share
                GPF_Seg_Treated = GPF_Seg
            else:
                GPF_Seg['hhi_dif'] = hhi_dif
                # Collect non-empty control segments in a list and concatenate once, rather than growing
                # an empty placeholder frame (a pattern that newer pandas versions deprecate)
                if len(GPF_Seg)>0:
                    control_segments.append(GPF_Seg)

        if len(GPF_Seg_Treated)>0 and len(control_segments)>0:
            GPF_Seg_Control = pd.concat(control_segments)
            reg_sample.extend([GPF_Seg_Treated,GPF_Seg_Control])
    
    # Stack all treated/control segments into a single issue-level table
    reg_sample = pd.concat(reg_sample)

    # County-by-year demographics, with 'year' renamed so that it lines up with the sample's calendar year
    County_Composite = pd.read_csv("../CleanData/Demographics/0C_County_Composite.csv")
    County_Composite = County_Composite.loc[:,['year','State','County','black_ratio','pop']]
    County_Composite = County_Composite.rename(columns={'year':'calendar_year'})

    # Outer merge, then discard the county-years that matched no issue in the sample
    reg_sample = reg_sample.merge(County_Composite,how='outer',indicator=True,
        on=['State','County','calendar_year'])
    county_only_rows = reg_sample['_merge']=='right_only'
    reg_sample = reg_sample[~county_only_rows].drop(columns=['_merge'])
    reg_sample.to_csv(file_path)


A control cannot be found for 4 episodes.
A control cannot be found for 3 episodes.
A control cannot be found for 3 episodes.
A control cannot be found for 2 episodes.
