#### Import modules

In [1]:
import os
import pandas as pd
import pickle

Read state codes mapping file:

In [2]:
# Load the pickled mapping from full state name to state code
# (e.g. 'Massachusetts' -> 'MA'). The `with` block closes the file on exit,
# so no explicit close() call is needed.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open(os.getcwd() + '/Data/name_to_code.pickle', 'rb') as pckl:
    name_to_code = pickle.load(pckl)

# Spot-check one entry to confirm the mapping loaded as expected.
name_to_code['Massachusetts']

'MA'

### States

#### Read data file

In [3]:
# Read the state-level data file from the Data folder under the current
# working directory, then report its (rows, columns) dimensions.
states_csv_path = os.getcwd() + '/Data/us-states.csv'
states_cases = pd.read_csv(states_csv_path)
states_cases.shape

(33494, 5)

In [4]:
# Preview the last five rows of the raw data.
states_cases.tail(5)

Unnamed: 0,date,state,fips,cases,deaths
33489,2021-10-31,Virginia,51,924771,13907
33490,2021-10-31,Washington,53,727820,8686
33491,2021-10-31,West Virginia,54,272532,4426
33492,2021-10-31,Wisconsin,55,882618,9416
33493,2021-10-31,Wyoming,56,102926,1174


In [5]:
# Count the distinct values in the 'state' column.
distinct_state_names = states_cases['state'].unique()
distinct_state_names.shape

(56,)

#### Preprocess data

Flag AK, HI, and the territories for removal (state code `EXCLUDE`), label DC, then drop the flagged rows:

In [6]:
def _to_state_code(state_name):
    """Return the code for `state_name`, or 'EXCLUDE' if it is not in name_to_code."""
    if state_name in name_to_code:
        return name_to_code[state_name]
    return 'EXCLUDE'


# Translate every state name to its code; unknown names get the 'EXCLUDE' sentinel.
states_cases['state_code'] = states_cases['state'].apply(_to_state_code)

# District of Columbia is assigned its code explicitly.
is_dc = states_cases['state'] == 'District of Columbia'
states_cases.loc[is_dc, 'state_code'] = 'DC'

# Hawaii and Alaska are flagged for exclusion as well.
is_hi_or_ak = states_cases['state_code'].isin(['HI', 'AK'])
states_cases.loc[is_hi_or_ak, 'state_code'] = 'EXCLUDE'

states_cases.tail()

Unnamed: 0,date,state,fips,cases,deaths,state_code
33489,2021-10-31,Virginia,51,924771,13907,VA
33490,2021-10-31,Washington,53,727820,8686,WA
33491,2021-10-31,West Virginia,54,272532,4426,WV
33492,2021-10-31,Wisconsin,55,882618,9416,WI
33493,2021-10-31,Wyoming,56,102926,1174,WY


In [7]:
# List the distinct state names whose rows are flagged 'EXCLUDE'.
excluded_mask = states_cases['state_code'] == 'EXCLUDE'
states_cases.loc[excluded_mask, 'state'].unique()

array(['Hawaii', 'Alaska', 'Puerto Rico', 'Virgin Islands', 'Guam',
       'Northern Mariana Islands', 'American Samoa'], dtype=object)

In [8]:
# Drop every row flagged 'EXCLUDE'. The explicit .copy() makes the result an
# independent DataFrame rather than a slice of the original, so later column
# assignments on states_cases do not raise SettingWithCopyWarning.
states_cases = states_cases.loc[states_cases['state_code'] != 'EXCLUDE'].copy()

# Number of distinct state names remaining after the filter.
states_cases['state'].unique().shape

(49,)

Convert date from string to date type:

In [9]:
# Parse the 'date' strings into pandas datetime values and store them back
# in the same column.
parsed_dates = pd.to_datetime(states_cases['date'])
states_cases['date'] = parsed_dates

Sample pipeline to get daily cases:

(**TODO:** verify that this pipeline also works on the full dataset, not just this sample.)

In [24]:
# Take an independent copy of the rows dated after 2021-10-20 to use as a
# small sample for prototyping.
is_recent = states_cases['date'] > '2021-10-20'
small_df = states_cases.loc[is_recent].copy()
small_df

Unnamed: 0,date,state,fips,cases,deaths,state_code
32878,2021-10-21,Alabama,1,817054,15320,AL
32881,2021-10-21,Arizona,4,1142122,20776,AZ
32882,2021-10-21,Arkansas,5,508937,8237,AR
32883,2021-10-21,California,6,4860797,71369,CA
32884,2021-10-21,Colorado,8,721073,8174,CO
...,...,...,...,...,...,...
33489,2021-10-31,Virginia,51,924771,13907,VA
33490,2021-10-31,Washington,53,727820,8686,WA
33491,2021-10-31,West Virginia,54,272532,4426,WV
33492,2021-10-31,Wisconsin,55,882618,9416,WI


In [25]:
# Order rows by state and then by date, so that within each state the rows
# are in chronological order before differencing.
small_df = small_df.sort_values(['state_code', 'date'])

# Difference 'cases' within each state: every row's new_cases is the change
# from the previous row of the same state. The first row of each state has no
# predecessor, so its new_cases is NaN.
# NOTE(review): presumably 'cases' is a cumulative count; if totals are ever
# revised downward this produces negative values — confirm before relying on it.
small_df['new_cases'] = small_df.groupby('state_code')['cases'].diff()

small_df

Unnamed: 0,date,state,fips,cases,deaths,state_code,new_cases
32878,2021-10-21,Alabama,1,817054,15320,AL,
32934,2021-10-22,Alabama,1,818652,15378,AL,1598.0
32990,2021-10-23,Alabama,1,819597,15406,AL,945.0
33046,2021-10-24,Alabama,1,820011,15407,AL,414.0
33102,2021-10-25,Alabama,1,820312,15407,AL,301.0
...,...,...,...,...,...,...,...
33269,2021-10-27,Wyoming,56,101912,1174,WY,488.0
33325,2021-10-28,Wyoming,56,102403,1174,WY,491.0
33381,2021-10-29,Wyoming,56,102926,1174,WY,523.0
33437,2021-10-30,Wyoming,56,102926,1174,WY,0.0


In [26]:
# Inspect the rows for Massachusetts (state code 'MA').
is_massachusetts = small_df['state_code'] == 'MA'
small_df.loc[is_massachusetts]

Unnamed: 0,date,state,fips,cases,deaths,state_code,new_cases
32901,2021-10-21,Massachusetts,25,841198,18896,MA,
32957,2021-10-22,Massachusetts,25,842652,18911,MA,1454.0
33013,2021-10-23,Massachusetts,25,842652,18911,MA,0.0
33069,2021-10-24,Massachusetts,25,842652,18911,MA,0.0
33125,2021-10-25,Massachusetts,25,846044,18931,MA,3392.0
33181,2021-10-26,Massachusetts,25,847512,18956,MA,1468.0
33237,2021-10-27,Massachusetts,25,848947,18970,MA,1435.0
33293,2021-10-28,Massachusetts,25,850531,18985,MA,1584.0
33349,2021-10-29,Massachusetts,25,851922,18996,MA,1391.0
33405,2021-10-30,Massachusetts,25,851922,18996,MA,0.0
