In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the two raw CSV inputs; both are decoded as cp1252.
acs_dt = pd.read_csv(
    filepath_or_buffer = 'data/raw/ACS Series 2.csv',
    encoding = 'cp1252',
)
covid19_dt = pd.read_csv(
    filepath_or_buffer = 'data/raw/Covid US County Tracker.csv',
    encoding = 'cp1252',
)

In [3]:
# Shape of the per-state table holding the earliest `date` among rows
# that report more than one confirmed case.
(
    covid19_dt.loc[covid19_dt['confirmed_cases'] > 1]
    .groupby('state', as_index = False)
    .agg({'date': 'min'})
    .sort_values('date', ascending = True)
    .shape
)

(51, 2)

In [4]:


# No filtering is applied here: covid19_subset_dt is a second name for the
# same DataFrame object as covid19_dt (an alias, not a copy).
covid19_subset_dt = covid19_dt

### population

In [5]:
# Show the first distinct standardized_date value found in the ACS table.
acs_first_date = acs_dt['standardized_date'].unique().tolist()[0]
print("Date : " + str(acs_first_date))

Date : 2016-01-01


In [6]:
# State population: sum `population` over all ACS rows sharing a state code.
#
# The STUSAB code is upper-cased *before* grouping (vectorised `.str.upper()`
# rather than a per-row lambda) so that codes differing only in case collapse
# into a single row per state; `state` is therefore a unique key in pop_dt.
state_code = acs_dt['STUSAB'].str.upper().rename('state')

pop_dt = acs_dt['population'].groupby(state_code).sum().reset_index()

# Persist the state-level totals (columns: state, population).
pop_dt.to_csv('data/cln/all_state_population_2016_01_01.csv', index = False)

### first days of infection curve

- The starting point is defined as the first day on which a state has at least 10 confirmed cases.

- CA took a long time to increase from 1 to 10 cases.

#### We can set the second wave start date instead

In [7]:
# Aggregate the tracker to one row per (state, standardized_date),
# summing confirmed cases and deaths.
covid19_state_dt = covid19_subset_dt.groupby(['state', 'standardized_date'], as_index = False).\
agg({
    'confirmed_cases':'sum',
    'deaths':'sum',
})

# Date -> day-index crosswalk: day 0 is the earliest date in the data.
# The distinct dates are taken from the column once, with missing values
# dropped, so the date list and the day range always have the same length
# (`.unique()` keeps NaN while `.nunique()` does not) and the full frame
# does not have to be sorted just to order one column.
observed_dates = (
    covid19_subset_dt['standardized_date']
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop = True)
)
date_day_crosswalk = pd.DataFrame({
    'standardized_date': observed_dates,
    'day': range(len(observed_dates)),
})

# Attach the day index to every state/date row.
covid19_state_dt = covid19_state_dt.merge(date_day_crosswalk, on = 'standardized_date')

covid19_state_dt.tail(3)

Unnamed: 0,state,standardized_date,confirmed_cases,deaths,day
4230,WI,2020-04-13,3428.0,144.0,82
4231,WV,2020-04-13,633.0,9.0,82
4232,WY,2020-04-13,275.0,1.0,82


In [8]:
# For each state, find the first and last day index on which the state-level
# confirmed_cases value is at least 10. The threshold is 10 rather than 1
# because CA took a long time to increase from 1 to 10 cases.
day_by_state = (
    covid19_state_dt
    .loc[covid19_state_dt['confirmed_cases'] >= 10]
    .groupby('state')['day']
)

first_day_inf = day_by_state.min().rename('first_day').reset_index()
last_day_inf = day_by_state.max().rename('last_day').reset_index()

first_last_day_inf = first_day_inf.merge(last_day_inf, on = "state")
# Difference between the two day indices (last_day minus first_day).
first_last_day_inf['total_number_of_days'] = (
    first_last_day_inf['last_day'].sub(first_last_day_inf['first_day'])
)
first_last_day_inf

Unnamed: 0,state,first_day,last_day,total_number_of_days
0,AK,57,82,25
1,AL,52,82,30
2,AR,52,82,30
3,AZ,49,82,33
4,CA,30,82,52
5,CO,47,82,35
6,CT,52,82,30
7,DC,49,82,33
8,DE,55,82,27
9,FL,47,82,35


In [9]:
# Keep, for every state, only the rows on or after that state's first_day.
# No upper bound on the number of days is applied: every row from first_day
# through the end of the data is retained.
First_N_Day = pd.merge(covid19_state_dt, first_last_day_inf, on = 'state')

# Filter and drop the helper columns in one non-mutating chain; calling
# drop(..., inplace=True) on a boolean-filtered slice is the pattern pandas
# flags with SettingWithCopyWarning.
First_N_Day = First_N_Day[First_N_Day['day'] >= First_N_Day['first_day']].\
drop(columns = ['day', 'first_day', 'last_day'])

In [10]:
# Attach each state's population and switch to the SIR-style column labels.
sir_column_names = {'confirmed_cases': '# Infected', 'deaths': '# Deaths'}
First_N_Day = pd.merge(First_N_Day, pop_dt, on = "state").rename(columns = sir_column_names)

# Susceptible count is computed as population minus the infected count.
First_N_Day['# Susceptible'] = First_N_Day['population'].sub(First_N_Day['# Infected'])

In [11]:
# Sanity check: largest '# Infected' value observed for each state.
First_N_Day.groupby('state', as_index = False)['# Infected'].max()

Unnamed: 0,state,# Infected
0,AK,278.0
1,AL,3803.0
2,AR,1475.0
3,AZ,3702.0
4,CA,24326.0
5,CO,7128.0
6,CT,13381.0
7,DC,1875.0
8,DE,1761.0
9,FL,21019.0


In [12]:
# Preview the first five rows of the final table.
First_N_Day.head()

Unnamed: 0,state,standardized_date,# Infected,# Deaths,total_number_of_days,population,# Susceptible
0,AK,2020-03-19,10.0,1.0,25,710231,710221.0
1,AK,2020-03-20,16.0,4.0,25,710231,710215.0
2,AK,2020-03-21,20.0,6.0,25,710231,710211.0
3,AK,2020-03-22,23.0,0.0,25,710231,710208.0
4,AK,2020-03-23,37.0,0.0,25,710231,710194.0


In [13]:
# Preview the last five rows of the final table.
First_N_Day.tail()

Unnamed: 0,state,standardized_date,# Infected,# Deaths,total_number_of_days,population,# Susceptible
1665,WY,2020-04-09,239.0,0.0,28,563626,563387.0
1666,WY,2020-04-10,252.0,0.0,28,563626,563374.0
1667,WY,2020-04-11,261.0,0.0,28,563626,563365.0
1668,WY,2020-04-12,270.0,0.0,28,563626,563356.0
1669,WY,2020-04-13,275.0,1.0,28,563626,563351.0


In [14]:
# Save the final state-level table as CSV, without the row index.
First_N_Day.to_csv(
    path_or_buf = 'data/cln/all_states_real_sir.csv',
    index = False,
)

In [15]:
# List the distinct state codes present in the final table.
First_N_Day['state'].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)