In [1]:
import os
import numpy as np
import pandas as pd
import addfips
from src.utils.paths import get_parent_dir
from linearmodels.panel import PooledOLS, PanelOLS
import statsmodels.api as sm

In [2]:
pdir = get_parent_dir(2)

### 1) Prepare data

In [41]:
def read_csse(path):
    df = pd.read_csv(path)
    df = df.set_index("Unnamed: 0")
    df.index = pd.to_datetime(df.index)
    return df

def read_sahie(path, granularity='county'):
    df = (pd.read_csv(path, header=68, sep=',')
          .drop(columns=['Unnamed: 25', 'year', 'version',
                          'statefips', 'countyfips', 'geocat'])
          .infer_objects()  
          .apply(lambda s : s.str.strip() if s.dtype is np.object else s)
          )
    # deal with whitespace
    
    # split data: county/state
    if granularity == 'county':
        df = df.query("county_name != ''")
    elif granularity == 'state':
        df = df.query("county_name == ''")
    else: 
        return df
    return df

Read health data

In [44]:
data_dir = os.path.join(pdir, 'data')
path_sahie_raw = os.path.join(data_dir, 'raw', 'health', 'SAHIE_2017.csv')
sahie = read_sahie(path=path_sahie_raw, granularity='all')
sahie.to_csv(os.path.join(data_dir, 'processed', 'health',
                          'SAHIE_2017_cleaned.csv'))


  if (await self.run_code(code, result,  async_=asy)):


In [43]:
numeric_cols = ['agecat', 'racecat', 'sexcat', 'iprcat', 'NIPR', 'nipr_moe', 'NUI',
'nui_moe', 'NIC', 'nic_moe', 'PCTUI', 'pctui_moe', 'PCTIC', 'pctic_moe',
'PCTELIG', 'pctelig_moe', 'PCTLIIC', 'pctliic_moe']

sahie[numeric_cols] = sahie[numeric_cols].apply(pd.to_numeric)

ValueError: Unable to parse string "       ." at position 57000

In [None]:
cols = sahie.columns.to_list()
cols = cols[-2:] + cols[:-2]
sahie = sahie[cols]

In [7]:
# query for county data only
sahie_county_data_only = sahie.query("county_name != ''")
sahie_county_data_only.reset_index(drop=True, inplace=True)

In [8]:
sahie_cleaned = pd.get_dummies(
    sahie_county_data_only,
    columns=['sexcat', 'iprcat', 'agecat', 'racecat'],
    drop_first=True) # only store n-1 dummies to avoid "dummy variable trap"

In [None]:
numeric_cols = ['NIPR', 'nipr_moe', 'NUI', 'nui_moe',
       'NIC', 'nic_moe', 'PCTUI', 'pctui_moe', 'PCTIC', 'pctic_moe', 'PCTELIG',
       'pctelig_moe', 'PCTLIIC', 'pctliic_moe']
for col in numeric_cols:
    sahie_cleaned[col] = sahie_cleaned[col].replace('.', np.nan).astype(float)
sahie_cleaned.info()

In [None]:
sahie_cleaned = sahie_cleaned.groupby(['county_name'], as_index=False).first()
sahie_cleaned.rename(columns={'county_name': 'county',
                              'state_name': 'state'},
                     inplace=True)


In [None]:
# add fips
af = addfips.AddFIPS()

sahie_county_fips_codes = []
for i, row in sahie_cleaned.iterrows():
    county_fips_code = af.get_county_fips(county=row.county, state=row.state)
    sahie_county_fips_codes.append(county_fips_code)
sahie_county_fips_codes

sahie_cleaned['FIPS'] = sahie_county_fips_codes

In [None]:
#sahie_cleaned[sahie_cleaned['county'] == "Anchorage Borough"]['FIPS'] = '02020'
#print(sahie_cleaned['FIPS'].head(40))

In [None]:
# drop Anchorage Borough with missing FIPS, deal with that later

In [None]:
sahie_cleaned.info()

In [None]:
csse_dir = os.path.join(pdir, 'data', 'processed', 'csse', 'US')

fname_confirmed = "time_series_covid19_confirmed_US_timeseries.csv" 
fname_deaths = "time_series_covid19_deaths_US_timeseries.csv"

path_confirmed = os.path.join(csse_dir, fname_confirmed)
path_deaths = os.path.join(csse_dir, fname_deaths)

In [None]:
ts_confirmed = read_csse(path_confirmed)
ts_deaths = read_csse(path_deaths)

ts_confirmed.index.name = 'time'
ts_deaths.index.name = 'time'

In [None]:
demographic_dir = os.path.join(pdir, 'data', 'raw', 'demography')
popdata = pd.read_csv(os.path.join(demographic_dir, 
                                   "POPEST_2019.csv"),
                      encoding = "ISO-8859-1")

# POPESTIMATE2019: 7/1/2019 resident total population estimate
df_pop = popdata[['STNAME', 'CTYNAME', 'POPESTIMATE2019']]
df_pop_counties = df_pop.query("STNAME != CTYNAME")
df_pop_counties = df_pop_counties.rename(columns={'STNAME': 'state',
                                                  'CTYNAME': 'county',
                                                  'POPESTIMATE2019': 'pop2019_county'})

df_pop_states = df_pop.query("STNAME == CTYNAME")
df_pop_states = df_pop_states.reset_index(drop=True)
df_pop_states = df_pop_states.rename(columns={'STNAME': 'state',
                                              'CTYNAME': 'county',
                                              'POPESTIMATE2019': 'pop2019_state'})
df_pop_states.drop(columns='county', inplace=True)

af = addfips.AddFIPS()

county_fips_codes = []
for i, row in df_pop_counties.iterrows():
    county_fips_code = af.get_county_fips(county=row.county, state=row.state)
    county_fips_codes.append(county_fips_code)
    
state_fips_codes = []
for i, row in df_pop_states.iterrows():
    state_fips_code = af.get_state_fips(state=row.state)
    state_fips_codes.append(state_fips_code)
    
df_pop_counties['FIPS'] = county_fips_codes
df_pop_states['FIPS_state'] = state_fips_codes
print(df_pop_states.head())
print(df_pop_counties.head())

In [None]:
df_pop_counties

In [None]:
ts_confirmedT = ts_confirmed.transpose()
ts_confirmedT.index.name = "FIPS"
tsconfm = ts_confirmedT.stack()

In [None]:
tsconfm.name = "confirmed_cases" 
tsconfm = tsconfm.reset_index()

### 2) Merge independent and dependent variables 

In [None]:
df_merged = pd.merge(left=tsconfm.reset_index(),
                     right=df_pop_counties,
                     on='FIPS')
df_merged = pd.merge(left=df_merged,
                     right=df_pop_states,
                     on='state')

In [None]:
# merge sahie
df_merged = pd.merge(left=df_merged,
                     right=sahie_cleaned,
                     on='FIPS')

In [None]:
df_merged.drop(columns='index', inplace=True)


In [None]:
df_merged

### 3) Construct panel using pandas Multi-index 

In [None]:
# fips => entity FE, time => time FE
panel = df_merged.set_index(['FIPS', 'time'])

In [None]:
# county pop share with respect to state pop
# TODO: meaningful? 
panel['county_pop_share_2019'] = \
    panel['pop2019_county'].divide(panel['pop2019_state'])

In [None]:
panel

In [None]:
# select vars
panel_subset = panel[
    ['confirmed_cases', 'pop2019_county', 'county_pop_share_2019', 'NIPR', 
     'nipr_moe', 'NUI', 'nui_moe', 'NIC', 'nic_moe', 'PCTUI', 'pctui_moe',
     'PCTIC', 'pctic_moe', 'PCTELIG', 'pctelig_moe', 'PCTLIIC',
     'pctliic_moe', 'sexcat_0', 'sexcat_1', 'sexcat_2', 'iprcat_0',
     'iprcat_1', 'iprcat_2', 'iprcat_3', 'iprcat_4', 'iprcat_5', 'agecat_0',
     'agecat_1', 'agecat_2', 'agecat_3', 'agecat_4', 'agecat_5', 'racecat_0']]

### 4) Run pooled and panel regression 
entity fixed effects don't work. this makes sense as the population
shares per county add up to 100% per county.
time fixed effects on the other hand yield the same results as 
the simple pooled regression above, which makes sense because there
is no variation over time in our current data. looking good!

A) Merged with SAHIE data (significantly less data!)

In [None]:
exog_vars = \
    ['pop2019_county', # County population
     'county_pop_share_2019', # Share of county population relative to state population
     'NIPR', # Number in demographic group for <income category>
     'NUI', # Number uninsured
     'PCTELIG'] # Percent uninsured in demographic group for all income levels
exog = sm.add_constant(panel_subset[exog_vars])

# pooled regression
mod_pooled = PooledOLS(dependent=panel_subset.confirmed_cases, 
                       exog=exog)
pooled_res = mod_pooled.fit()
print(pooled_res)

# panel regression
mod_panel_entity = PanelOLS(dependent=panel_subset.confirmed_cases, 
                            exog=exog,
                            time_effects=True)
panel_entity_res = mod_panel_entity.fit()
print(panel_entity_res)

A) Population data only

In [None]:
exog_vars = \
    ['pop2019_county', # County population
     'county_pop_share_2019'] # Percent uninsured in demographic group for all income levels
exog = sm.add_constant(panel_subset[exog_vars])

# pooled regression
mod_pooled = PooledOLS(dependent=panel_subset.confirmed_cases, 
                       exog=exog)
pooled_res = mod_pooled.fit()
print(pooled_res)

# panel regression
mod_panel_entity = PanelOLS(dependent=panel_subset.confirmed_cases, 
                            exog=exog,
                            time_effects=True)
panel_entity_res = mod_panel_entity.fit()
print(panel_entity_res)