In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import os
import dask
import dask.dataframe as dd
import itertools
from itertools import chain
from math import sqrt, floor, ceil, isnan
import multiprocess
import importlib
from importlib import reload
from collections import Counter
from fuzzywuzzy import process, fuzz
import time
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import multiprocessing
import warnings
# Promote every warning (including pandas FutureWarning / SettingWithCopyWarning)
# to an exception, so that questionable operations stop the notebook instead of
# passing silently.
warnings.filterwarnings("error")

# Widen pandas' display limits so that wide / long frames are shown in full.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 400

# A customized winsorisation function that handles None values correctly
# The percentiles are taken and winsorisation are done on non-None values only
def winsor2(series,cutoffs):
    """Winsorise the non-missing values of ``series`` and leave NaNs where they are.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric data that may contain NaN.
    cutoffs : sequence of two floats
        ``(lower, upper)`` tail fractions, forwarded as ``limits`` to
        ``scipy.stats.mstats.winsorize``. The fractions are taken over the
        non-missing observations only.

    Returns
    -------
    same type as ``series``
        A copy of ``series`` with its non-missing values winsorised. The input
        is not modified. If there is no non-missing value the copy is returned
        unchanged.
    """
    # Local imports keep the function self-contained. Importing the submodule
    # explicitly avoids relying on ``scipy.stats`` having been loaded elsewhere.
    import numpy as np
    from scipy.stats import mstats

    is_missing = np.isnan(series)
    has_value = np.logical_not(is_missing)

    result = series.copy()
    if not has_value.any():
        # winsorize cannot handle an empty sample; there is nothing to clip anyway.
        return result

    clipped = mstats.winsorize(series[has_value],limits=(cutoffs[0],cutoffs[1]))
    # winsorize returns a masked array; write back its plain values.
    result[has_value] = np.asarray(clipped)

    return result

# Drop any existing binding of the name, then import and reload the helper
# module so that the function bound below comes from the freshly re-executed
# module source.
try:
    del FUN_1E_CBSA_to_State_hhidif
except NameError:
    # The name is not bound yet (e.g. first run in a fresh kernel).
    pass
import FUN_1E_CBSA_to_State_hhidif
importlib.reload(FUN_1E_CBSA_to_State_hhidif)
from FUN_1E_CBSA_to_State_hhidif import FUN_1E_CBSA_to_State_hhidif


# 1. Import data

In [2]:
# GPF issuance data, plus the groups of name columns found in it
GPF = pd.read_csv("../CleanData/SDC/0A_GPF.csv", low_memory=False)
raw_name_GPF_colnames = [name for name in GPF.columns if 'raw_name_GPF_' in name]
name_GPF_colnames = [name for name in GPF.columns if name.startswith('name_GPF_')]
parent_name_GPF_colnames = [name for name in GPF.columns if 'parent_name_' in name]

# Parent relationship (name_GPF x sale_year -> parent_name)
GPF_names = pd.read_parquet('../CleanData/SDC/0H_GPF_Parent.parquet')

# All M&As, with the acquiror's parent attached; acquiror x year pairs that
# have no parent record are dropped by the inner merge
MA = (
    pd.read_parquet('../CleanData/SDC/0B_M&A.parquet')
    .merge(
        GPF_names.rename(columns={'name_GPF':'acquiror','parent_name':'acquiror_parent'}),
        on=['acquiror','sale_year'],
    )
    .reset_index(drop=True)
)

#-------------#
# Import CBSA #
#-------------#

# Lookup from full state / territory name to its two-letter abbreviation:
# the 50 states, the District of Columbia and six territories / outlying areas.
# Further down in this cell the name is rebound to a two-column DataFrame
# ('State Name', 'State') that is merged onto CBSAData.
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

# Author's note: "CSA" covers metropolitan areas only, whereas "CBSA" also
# includes micropolitan areas.
CBSAData = pd.read_excel("../RawData/MSA/CBSA.xlsx",skiprows=[0,1])
# Drop rows that carry no county (or county equivalent) name
CBSAData = CBSAData[~pd.isnull(CBSAData['County/County Equivalent'])]

# Add state abbreviations.
# The lookup name is rebound here from a dict to a two-column DataFrame.
us_state_to_abbrev = pd.DataFrame.from_dict(us_state_to_abbrev,orient='index').reset_index()
us_state_to_abbrev.columns = ['State Name','State']
CBSAData = CBSAData.rename(columns={'County/County Equivalent':'County'})
# Outer merge with indicator, then keep only rows matched on both sides
CBSAData = CBSAData.merge(us_state_to_abbrev,on='State Name',how='outer',indicator=True)
CBSAData = CBSAData[CBSAData['_merge']=='both'].drop(columns=['_merge'])
# Author's note: merge is perfect

# Normalise county names. Every replacement is a literal one, so regex=False
# is passed throughout; the result then does not depend on the pandas
# version's default for `regex`.
CBSAData['County'] = (
    CBSAData['County']
    .str.upper()
    .str.replace(' COUNTY','',regex=False)
    .str.replace(' AND ',' & ',regex=False)
    .str.replace('.','',regex=False)
)
CBSAData['CSA Code'] = CBSAData['CSA Code'].astype(float)
CBSAData['CBSA Code'] = CBSAData['CBSA Code'].astype(float)


In [3]:
#-----------------------------------------------------#
# State Gov Fin data based on local government survey #
#-----------------------------------------------------#

# Construct a state X year amount of transfers to school districts/special districts/other governments
GovFinData = pd.read_csv('../CleanData/GovFinSurvey/0G_GovFinData.csv', low_memory=False)

def _sum_by_state_year(data, renamed_totals):
    """Sum selected columns of ``data`` by State x Year4.

    ``renamed_totals`` maps each source column to the name its total gets in
    the result. The returned frame has one row per State x year, with 'Year4'
    cast to int and renamed to 'year'.
    """
    # The string 'sum' selects the groupby sum directly. Passing the builtin
    # `sum` raises a FutureWarning on recent pandas, and this notebook
    # promotes warnings to errors.
    totals = data.groupby(['State','Year4']).agg({column:'sum' for column in renamed_totals})
    totals = totals.reset_index()
    totals['Year4'] = totals['Year4'].astype(int)
    return totals.rename(columns={'Year4':'year',**renamed_totals})

# Type Code 4 -> special-district totals (per the variable names)
GovFinData_part = GovFinData[GovFinData['Type Code']==4]
SpecialDistFund = _sum_by_state_year(GovFinData_part,{
    'Total State IG Revenue':'Transfer to Special Dist',
    'Total LTD Issued':'Total LTD Issued by Special Dist',
    'Total Expenditure':'Total Expenditure by Special Dist'})

# Type Code 5 -> school-district totals (per the variable names)
GovFinData_part = GovFinData[GovFinData['Type Code']==5]
SchoolDistFund = _sum_by_state_year(GovFinData_part,{'Total State IG Revenue':'Transfer to School Dist'})

# Type Codes 1, 2 and 3 pooled -> "MTC" totals
GovFinData_part = GovFinData[(GovFinData['Type Code']==1)|(GovFinData['Type Code']==2)|(GovFinData['Type Code']==3)]
MTCFund = _sum_by_state_year(GovFinData_part,{'Total State IG Revenue':'Transfer to MTC'})

# Population by CBSA and by CSA, with the year column renamed to 'sale_year'
pop_by_CBSA = pd.read_csv("../CleanData/Demographics/0C_CBSA_Pop.csv").rename(columns={'year':'sale_year'})
pop_by_CSA = pd.read_csv("../CleanData/Demographics/0C_CSA_Pop.csv").rename(columns={'year':'sale_year'})

# States that appear in GPF, excluding Hawaii, missing values and the stray
# codes 'N', 'F' and 'N\nN'
States = [
    State for State in GPF['State'].unique()
    if State is not None
    and str(State)!='nan'
    and State not in ('HI','N','F','N\nN')
]

#-----------------------------------------------------#
# State Gov Fin data based on state government survey #
#-----------------------------------------------------#

# Read the state-level survey, key it on integer 'calendar_year', and attach
# the three local-government aggregates. Each merge is an inner join on
# State x calendar_year.
StateGovFinData = (
    pd.read_csv('../CleanData/GovFinSurvey/0G_StateGovFinData.csv',low_memory=False)
    .astype({'Year4':int})
    .rename(columns={'Year4':'calendar_year'})
    .merge(SpecialDistFund.rename(columns={'year':'calendar_year'}),on=['State','calendar_year'])
    .merge(SchoolDistFund.rename(columns={'year':'calendar_year'}),on=['State','calendar_year'])
    .merge(MTCFund.rename(columns={'year':'calendar_year'}),on=['State','calendar_year'])
)

#--------------------------------------------------------------#
# Amount of debt issued by state authorities, according to GPF #
#--------------------------------------------------------------#

GPF_Raw = pd.read_csv("../CleanData/SDC/0A_GPF.csv",low_memory=False)
# Rows whose 'County' field marks the issuer as the state or a state
# authority, totalled by State x sale_year. 'sum' is passed as a string:
# the builtin `sum` raises a FutureWarning on recent pandas, and this
# notebook promotes warnings to errors.
GPF_State = (
    GPF_Raw[(GPF_Raw['County']=='STATE AUTHORITY')|(GPF_Raw['County']=='STATE')]
    .groupby(['State','sale_year'])
    .agg({'amount':'sum'})
    .reset_index()
    .rename(columns={'sale_year':'calendar_year','amount':'amount_GPF'})
)
# Outer merge, then drop GPF-only rows. The '_merge' indicator column is
# left in StateGovFinData, as before.
StateGovFinData = StateGovFinData.merge(GPF_State,on=['State','calendar_year'],how='outer',indicator=True)
StateGovFinData = StateGovFinData[StateGovFinData['_merge']!='right_only']

# 2. Construct the state-level implied increase in HHI, Using CBSA

Notes:
- For each state $\times$ year, calculate the weighted average implied Delta HHI using the prior three years' data. A state might have multiple CBSAs, each with certain implied Delta HHI. I calculate the average using the population as the weight. Then, I look for state $\times$ year where this Delta HHI is above a certain threshold. I say such states are treated in those years.
- I use CBSA instead of CSA, as CBSA provides more complete coverage of all places in a state.

In [4]:
%%time

# Calculate increase in HHI for every state X year

# Issues without a sale year cannot be assigned to a state X year cell
GPF = GPF[GPF['sale_year'].notna()]

# One task per state; every task receives the same shared inputs
input_list = [
    (State,GPF,MA,raw_name_GPF_colnames,pop_by_CBSA)
    for State in States
]

if __name__ == '__main__':
    with multiprocessing.Pool(processes=10) as p:
        CBSA_all_state_hhi_dif = p.starmap(FUN_1E_CBSA_to_State_hhidif,input_list)

# Stack the per-state results and keep only rows with a computed HHI change
CBSA_all_state_hhi_dif = pd.concat(CBSA_all_state_hhi_dif)
CBSA_all_state_hhi_dif = CBSA_all_state_hhi_dif[CBSA_all_state_hhi_dif['state_hhi_dif'].notna()]

CPU times: user 40.5 s, sys: 6.33 s, total: 46.9 s
Wall time: 2min 37s


# 3. State-level episode based on CBSA to state level HHI

In [5]:
# Find state level M&A episodes
#
# For each state, scan the years 1970-2022 in order. A year starts an episode
# when the state's implied HHI change in that year exceeds 0.01. The four
# years after an episode start are skipped. Because the tracker starts at
# 1970, no episode can start before 1975.

CBSA_all_state_hhi_dif = pd.DataFrame(CBSA_all_state_hhi_dif)

episode_records = []

for State in States:

    # Filter once per state instead of rescanning the full frame every year
    hhi_dif_one_state = CBSA_all_state_hhi_dif[CBSA_all_state_hhi_dif['State']==State]

    episode_start_year = 1970
    for year in range(1970,2023):

        # If this year is still within the last merger episode
        if year<=episode_start_year+4:
            continue

        # Check if market share in the episode is high enough.
        # Anything other than exactly one row for the state X year counts as
        # "no change".
        hhi_dif_one_year = hhi_dif_one_state.loc[hhi_dif_one_state['year']==year,'state_hhi_dif']
        if len(hhi_dif_one_year)==1:
            state_hhi_dif = hhi_dif_one_year.iloc[0]
        else:
            state_hhi_dif = 0
        if state_hhi_dif>0.01:
            # An episode is identified
            episode_records.append({
                'episode_start_year':year,
                'State':State,
                'state_hhi_dif':state_hhi_dif,
                })
            episode_start_year = year

# Naming the columns keeps the frame well-formed even if no episode was found
State_episodes_impliedHHI_N = pd.DataFrame(
    episode_records,columns=['episode_start_year','State','state_hhi_dif'])


In [6]:
# Generate state level income and population
#
# For every state X year: total population of the state's CBSAs and their
# population-weighted average income.

# NOTE(review): this rebinds `pop_by_CBSA`. The earlier cell renamed its
# 'year' column to 'sale_year'; after this cell the name refers to a frame
# keyed on 'year'. Re-running the HHI cell after this one would pass this
# frame to FUN_1E_CBSA_to_State_hhidif. Confirm which column name it expects.
pop_by_CBSA = pd.read_csv("../CleanData/Demographics/0C_CBSA_Pop.csv")
inc_by_CBSA = pd.read_csv("../CleanData/Demographics/0C_CBSA_Inc.csv")

# Slice both tables by year once, rather than once per state X year
inc_by_year = {year:inc_by_CBSA[inc_by_CBSA['year']==year] for year in range(1970,2023)}
pop_by_year = {year:pop_by_CBSA[pop_by_CBSA['year']==year] for year in range(1970,2023)}

popincinfo = []
for State in States:
    CBSAsinState = CBSAData[CBSAData['State']==State][['CBSA Code']].drop_duplicates()
    for year in range(1970,2023):
        one_stateyear = CBSAsinState.merge(inc_by_year[year],on='CBSA Code')
        one_stateyear = one_stateyear.merge(pop_by_year[year],on='CBSA Code')
        if len(one_stateyear)==0:
            # No CBSA of this state has both income and population this year
            continue
        pop = np.sum(one_stateyear['pop'])
        inc = np.dot(one_stateyear['pop'],one_stateyear['inc'])/pop
        popincinfo.append({'State':State,'year':year,'pop':pop,'inc':inc})
popincinfo = pd.DataFrame(popincinfo,columns=['State','year','pop','inc'])

In [7]:
# Number of matches to use
n_matches = 1

def calculate_distance(row,weightingmat):
    """Mahalanobis distance between a candidate and the treated state.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'inc' and 'pop' for the candidate, and 'treated_inc' and
        'treated_pop' for the treated state.
    weightingmat : 2x2 array-like
        Inverse covariance matrix of (inc, pop), passed as ``VI`` to
        ``scipy.spatial.distance.mahalanobis``.

    Returns
    -------
    float
        The distance between (inc, pop) and (treated_inc, treated_pop).
    """
    # Explicit submodule import: do not rely on `scipy.spatial` having been
    # loaded as a side effect of some other import.
    from scipy.spatial.distance import mahalanobis

    candidate = (row['inc'],row['pop'])
    treated = (row['treated_inc'],row['treated_pop'])
    return mahalanobis(candidate,treated,weightingmat)

# Years in which each state has an implied HHI change above 0.001. A state
# cannot serve as a control for an episode whose event window contains one of
# those years. This is computed once up front because it does not depend on
# the episode being matched.
affected_rows = CBSA_all_state_hhi_dif[CBSA_all_state_hhi_dif['state_hhi_dif']>0.001]
affected_years_by_state = {
    state:set(years) for state,years in affected_rows.groupby('State')['year']
}

State_episodes_impliedHHI_N['control'] = None
for idx,row in State_episodes_impliedHHI_N.iterrows():

    # Pop/inc information for a certain year
    one_year = popincinfo[popincinfo['year']==row['episode_start_year']].copy()

    # Demographic data of the treated state
    stateyear = one_year[one_year['State']==row['State']]
    if len(stateyear)==0:
        # No demographics for the treated state in this year: leave unmatched
        continue
    episode_pop = stateyear['pop'].iloc[0]
    episode_inc = stateyear['inc'].iloc[0]

    # Find a match
    one_year['treated_pop'] = episode_pop
    one_year['treated_inc'] = episode_inc
    # Get weighting matrix: inverse covariance of winsorised income and
    # population. The treated values above were taken before winsorisation.
    one_year['inc'] = winsor2(one_year['inc'],cutoffs=[0.05,0.05])
    one_year['pop'] = winsor2(one_year['pop'],cutoffs=[0.05,0.05])
    cov = one_year[['inc','pop']].cov()
    invcov = np.linalg.inv(cov)
    one_year['dist'] = one_year.apply(calculate_distance, axis=1,weightingmat=invcov)
    one_year = one_year.sort_values('dist').reset_index(drop=True)
    # Remove oneself from potential matches
    one_year = one_year[one_year['State']!=row['State']]

    # Take the nearest candidates that are untreated throughout the event
    # window [-4, +4] around the episode start
    event_window = set(range(row['episode_start_year']-4,row['episode_start_year']+5))
    control = []
    for candidate in one_year['State']:
        if not event_window.isdisjoint(affected_years_by_state.get(candidate,())):
            # This potential control is treated
            continue
        # This potential control is not treated => Good control
        control.append(candidate)
        if len(control)==n_matches:
            break

    if len(control)>0:
        State_episodes_impliedHHI_N.at[idx,'control'] = control
    else:
        State_episodes_impliedHHI_N.at[idx,'control'] = None

print("There are a total of "+str(len(State_episodes_impliedHHI_N))+" episodes.")
print("Match cannot be found for "+str(np.sum(pd.isnull(State_episodes_impliedHHI_N['control'])))+" episodes.")
# Keep only the episodes for which a control was found. The explicit copy
# makes the result an independent frame. Without it, adding columns below to a
# filtered slice can raise SettingWithCopyWarning as soon as a row is actually
# dropped, and this notebook promotes warnings to errors.
State_episodes_impliedHHI_N = State_episodes_impliedHHI_N[~pd.isnull(State_episodes_impliedHHI_N['control'])].copy()


#############################################
# Expand to include an event time dimension #
#############################################

# Running index identifying each treated episode (cohort)
State_episodes_impliedHHI_N['cohort_idx'] = np.arange(len(State_episodes_impliedHHI_N))
# Same object, not a copy: 'year_to_merger' is therefore also added to
# State_episodes_impliedHHI_N, as before.
episodes_Exploded = State_episodes_impliedHHI_N
# One row per episode X event year in [-4, +4]
episodes_Exploded['year_to_merger'] = [list(range(-4,5))]*len(episodes_Exploded)
episodes_Exploded = episodes_Exploded.explode('year_to_merger')
episodes_Exploded['calendar_year'] = episodes_Exploded['episode_start_year']+episodes_Exploded['year_to_merger']


################################
# Assemble a regression sample #
################################

# Treated rows: the episode states themselves.
# .copy() makes the column subset an independent frame before a column is
# added to it; otherwise pandas can raise SettingWithCopyWarning, which this
# notebook promotes to an error.
episodes_Exploded_Treated = episodes_Exploded[['episode_start_year','State','state_hhi_dif','year_to_merger','calendar_year','cohort_idx']].copy()
episodes_Exploded_Treated['Treated'] = 1
episodes_Exploded_Treated = episodes_Exploded_Treated.reset_index(drop=True)

# Control rows: one frame per match rank (1st, 2nd, ... nearest control).
# The k-th control of an episode replaces the treated state in 'State'.
# Episodes with fewer than k+1 controls contribute no rows at that rank.
# A single loop handles every value of n_matches, so each rank's frame always
# ends up in the final sample.
control_frames = []
for match_rank in range(n_matches):
    episodes_Exploded_Control = episodes_Exploded[['episode_start_year','control','state_hhi_dif','year_to_merger','calendar_year','cohort_idx']].copy()
    episodes_Exploded_Control['Treated'] = 0
    episodes_Exploded_Control = episodes_Exploded_Control.reset_index(drop=True)
    episodes_Exploded_Control['control'] = [
        matches[match_rank] if len(matches)>match_rank else None
        for matches in episodes_Exploded_Control['control']
    ]
    episodes_Exploded_Control = episodes_Exploded_Control.rename(columns={'control':'State'})
    episodes_Exploded_Control = episodes_Exploded_Control[~pd.isnull(episodes_Exploded_Control['State'])]
    control_frames.append(episodes_Exploded_Control)

# Treated rows first, then the controls in rank order
episodes_Exploded = pd.concat([episodes_Exploded_Treated]+control_frames)

# Export data
# Inner merge: state X calendar-year rows without finance data are dropped
episodes_Exploded = episodes_Exploded.merge(StateGovFinData,on=['State','calendar_year'])
episodes_Exploded.to_csv('../CleanData/MAEvent/GovFin_State_from_CBSA.csv')


There are a total of 76 episodes.
Match cannot be found for 0 episodes.
