In [48]:
import os
import pandas as pd
import addfips
from src.utils.paths import get_parent_dir

from linearmodels.panel import PooledOLS, PanelOLS
import statsmodels.api as sm

### 1) Prepare data

In [49]:
def read_csse(path):
    df = pd.read_csv(path)
    df = df.set_index("Unnamed: 0")
    df.index = pd.to_datetime(df.index)
    return df

In [50]:
pdir = get_parent_dir(2)
csse_dir = os.path.join(pdir, 'data', 'processed', 'csse', 'US')

fname_confirmed = "time_series_covid19_confirmed_US_timeseries.csv" 
fname_deaths = "time_series_covid19_deaths_US_timeseries.csv"

path_confirmed = os.path.join(csse_dir, fname_confirmed)
path_deaths = os.path.join(csse_dir, fname_deaths)

In [51]:
ts_confirmed = read_csse(path_confirmed)
ts_deaths = read_csse(path_deaths)

ts_confirmed.index.name = 'time'
ts_deaths.index.name = 'time'

In [52]:
demographic_dir = os.path.join(pdir, 'data', 'raw', 'demography')
popdata = pd.read_csv(os.path.join(demographic_dir, 
                                   "POPEST_2019.csv"),
                      encoding = "ISO-8859-1")

# POPESTIMATE2019: 7/1/2019 resident total population estimate
df_pop = popdata[['STNAME', 'CTYNAME', 'POPESTIMATE2019']]
df_pop_counties = df_pop.query("STNAME != CTYNAME")
df_pop_counties = df_pop_counties.rename(columns={'STNAME': 'state',
                                                  'CTYNAME': 'county',
                                                  'POPESTIMATE2019': 'pop2019_county'})

df_pop_states = df_pop.query("STNAME == CTYNAME")
df_pop_states = df_pop_states.reset_index(drop=True)
df_pop_states = df_pop_states.rename(columns={'STNAME': 'state',
                                              'CTYNAME': 'county',
                                              'POPESTIMATE2019': 'pop2019_state'})
df_pop_states.drop(columns='county', inplace=True)

af = addfips.AddFIPS()

county_fips_codes = []
for i, row in df_pop_counties.iterrows():
    county_fips_code = af.get_county_fips(county=row.county, state=row.state)
    county_fips_codes.append(county_fips_code)
    
state_fips_codes = []
for i, row in df_pop_states.iterrows():
    state_fips_code = af.get_state_fips(state=row.state)
    state_fips_codes.append(state_fips_code)
    
df_pop_counties['FIPS'] = county_fips_codes
df_pop_states['FIPS_state'] = state_fips_codes
print(df_pop_states.head())
print(df_pop_counties.head())


        state  pop2019_state FIPS_state
0     Alabama        4903185         01
1      Alaska         731545         02
2     Arizona        7278717         04
3    Arkansas        3017804         05
4  California       39512223         06
     state          county  pop2019_county   FIPS
1  Alabama  Autauga County           55869  01001
2  Alabama  Baldwin County          223234  01003
3  Alabama  Barbour County           24686  01005
4  Alabama     Bibb County           22394  01007
5  Alabama   Blount County           57826  01009


In [53]:
ts_confirmedT = ts_confirmed.transpose()
ts_confirmedT.index.name = "FIPS"
tsconfm = ts_confirmedT.stack()

In [54]:
tsconfm.name = "confirmed_cases" 
tsconfm = tsconfm.reset_index()

### 2) Merge independent and dependent variables 

In [55]:
df_merged = pd.merge(left=tsconfm.reset_index(),
                     right=df_pop_counties,
                     on='FIPS')
df_merged = pd.merge(left=df_merged,
                     right=df_pop_states,
                     on='state')


In [56]:
df_merged.drop(columns='index', inplace=True)


### 3) Construct panel using pandas Multi-index 

In [57]:
# fips => entity FE, time => time FE
panel = df_merged.set_index(['FIPS', 'time'])

In [58]:
# county pop share with respect to state pop
# TODO: meaningful? 
panel['county_pop_share_2019'] = \
    panel['pop2019_county'].divide(panel['pop2019_state'])
panel

Unnamed: 0_level_0,Unnamed: 1_level_0,confirmed_cases,state,county,pop2019_county,pop2019_state,FIPS_state,county_pop_share_2019
FIPS,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10001,2020-01-22,0,Delaware,Kent County,180786,973764,10,0.185657
10001,2020-01-23,0,Delaware,Kent County,180786,973764,10,0.185657
10001,2020-01-24,0,Delaware,Kent County,180786,973764,10,0.185657
10001,2020-01-25,0,Delaware,Kent County,180786,973764,10,0.185657
10001,2020-01-26,0,Delaware,Kent County,180786,973764,10,0.185657
...,...,...,...,...,...,...,...,...
56045,2020-03-30,0,Wyoming,Weston County,6927,578759,56,0.011969
56045,2020-03-31,0,Wyoming,Weston County,6927,578759,56,0.011969
56045,2020-04-01,0,Wyoming,Weston County,6927,578759,56,0.011969
56045,2020-04-02,0,Wyoming,Weston County,6927,578759,56,0.011969


In [59]:
panel_subset = panel[['confirmed_cases', 'county_pop_share_2019']]

### 4) Run pooled and panel regression 
entity fixed effects don't work. this makes sense as the population
shares per county add up to 100% per county.
time fixed effects on the other hand yield the same results as 
the simple pooled regression above, which makes sense because there
is no variation over time in our current data. looking good!

In [60]:
exog_vars = ['county_pop_share_2019']
exog = sm.add_constant(panel_subset[exog_vars])

# pooled regression
mod_pooled = PooledOLS(dependent=panel_subset.confirmed_cases, 
                       exog=exog)
pooled_res = mod_pooled.fit()
print(pooled_res)

# panel regression
mod_panel_entity = PanelOLS(dependent=panel_subset.confirmed_cases, 
                            exog=exog,
                            time_effects=True)
panel_entity_res = mod_panel_entity.fit()
print(panel_entity_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:        confirmed_cases   R-squared:                        0.0028
Estimator:                  PooledOLS   R-squared (Between):              0.0177
No. Observations:              206152   R-squared (Within):               0.0000
Date:                Sun, Apr 05 2020   R-squared (Overall):              0.0028
Time:                        11:51:49   Log-likelihood                -1.467e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      571.29
Entities:                        2824   P-value                           0.0000
Avg Obs:                       73.000   Distribution:                F(1,206150)
Min Obs:                       73.000                                           
Max Obs:                       73.000   F-statistic (robust):             571.29
                            