In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

# COVID-19
Ultimately, what we would like to know and predict is the true infection rate in the population. This rate can only be estimated reliably by testing a sufficiently large share of the population. How much is sufficient? We will investigate how states are doing in terms of testing.

# Key takeaways from this notebook
* Only 17 states have currently tested enough people to get a good estimate (within 1%) of the actual infection rate in the population.
* Washington has done a great job testing early, far outpacing all other states.
* Michigan currently has the highest infection rate among states with a large sample size of tests, followed closely by New York.

Please upvote if you find this useful!!

In [None]:
# Source files: the week-1 California forecasting training set, the daily
# per-state COVID-19 table, and the ACS 2017 census-tract demographics.
# (An alternative source, novel-corona-virus-2019-dataset/covid_19_data.csv,
# was tried earlier and is not used.)
ca_train_path = '/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_train.csv'
daily_states_path = "/kaggle/input/covid19-in-usa/us_states_covid19_daily.csv"
census_tracts_path = "/kaggle/input/us-census-demographic-data/acs2017_census_tract_data.csv"

train_csv = pd.read_csv(ca_train_path)
df = pd.read_csv(daily_states_path)
pop = pd.read_csv(census_tracts_path)

# Preview the first rows of the daily per-state table.
df.head()

In [None]:
# Full name -> two-letter postal code, in alphabetical order of the name.
# Covers the 50 states, the District of Columbia, and the entries for
# Northern Mariana Islands, Palau, Puerto Rico and Virgin Islands.
us_state_abbrev = dict([
    ('Alabama', 'AL'), ('Alaska', 'AK'), ('Arizona', 'AZ'), ('Arkansas', 'AR'),
    ('California', 'CA'), ('Colorado', 'CO'), ('Connecticut', 'CT'),
    ('Delaware', 'DE'), ('District of Columbia', 'DC'),
    ('Florida', 'FL'),
    ('Georgia', 'GA'),
    ('Hawaii', 'HI'),
    ('Idaho', 'ID'), ('Illinois', 'IL'), ('Indiana', 'IN'), ('Iowa', 'IA'),
    ('Kansas', 'KS'), ('Kentucky', 'KY'),
    ('Louisiana', 'LA'),
    ('Maine', 'ME'), ('Maryland', 'MD'), ('Massachusetts', 'MA'), ('Michigan', 'MI'),
    ('Minnesota', 'MN'), ('Mississippi', 'MS'), ('Missouri', 'MO'), ('Montana', 'MT'),
    ('Nebraska', 'NE'), ('Nevada', 'NV'), ('New Hampshire', 'NH'), ('New Jersey', 'NJ'),
    ('New Mexico', 'NM'), ('New York', 'NY'), ('North Carolina', 'NC'),
    ('North Dakota', 'ND'), ('Northern Mariana Islands', 'MP'),
    ('Ohio', 'OH'), ('Oklahoma', 'OK'), ('Oregon', 'OR'),
    ('Palau', 'PW'), ('Pennsylvania', 'PA'), ('Puerto Rico', 'PR'),
    ('Rhode Island', 'RI'),
    ('South Carolina', 'SC'), ('South Dakota', 'SD'),
    ('Tennessee', 'TN'), ('Texas', 'TX'),
    ('Utah', 'UT'),
    ('Vermont', 'VT'), ('Virgin Islands', 'VI'), ('Virginia', 'VA'),
    ('Washington', 'WA'), ('West Virginia', 'WV'), ('Wisconsin', 'WI'), ('Wyoming', 'WY'),
])

In [None]:
# Attach the two-letter code to every census row; names that are not keys of
# us_state_abbrev become NaN.
pop['state'] = pop['State'].map(us_state_abbrev)

# Keep only the daily rows that carry a state code.
df = df[df['state'].notna()]

# Total population per state code. The column is selected explicitly instead
# of going through agg('TotalPop'), which only worked because the string was
# resolved as an attribute of the groupby object rather than as an aggregation.
df_popbystate = pop.groupby('state')['TotalPop'].sum().to_frame()

In [None]:
# Attach each state's population to its daily rows. Guarded so that re-running
# this cell does not merge a second time and leave TotalPop_x / TotalPop_y
# columns behind instead of TotalPop.
if 'TotalPop' not in df.columns:
    df = df.merge(df_popbystate, on='state')

# Per-state positives and tests. The rest of this notebook reads `positive`
# and `total` as running totals (the first row's positive/total ratio is
# printed as the "latest" rate), so adding the daily rows together would count
# the same cases many times. The per-state maximum is used instead.
# NOTE(review): confirm against the dataset documentation that both columns
# are cumulative.
df_bystate = df.groupby('state')['positive'].max().to_frame()
df_testsbystate = df.groupby('state')['total'].max().to_frame()

In [None]:
# Bring population and test counts alongside the positive counts, one row per
# state, by chaining the two merges on the shared 'state' key.
df_bystate = (
    df_bystate
    .merge(df_popbystate, on='state')
    .merge(df_testsbystate, on='state')
)

In [None]:
# Express positives and tests per 100 residents: divide each count by the
# state's population measured in hundreds.
pop_in_hundreds = df_bystate['TotalPop'] / 100
df_bystate['cases_per_100'] = df_bystate['positive'].div(pop_in_hundreds)
df_bystate['tests_per_100'] = df_bystate['total'].div(pop_in_hundreds)

In [None]:
# One dot per state: log of tests per 100 against log of cases per 100.
fig, ax = plt.subplots()
log_tests = np.log(df_bystate['tests_per_100'])
log_cases = np.log(df_bystate['cases_per_100'])
ax.plot(log_tests, log_cases, '.')
ax.set_xlabel('log(tests_per_100)')
ax.set_ylabel('log(cases_per_100)')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

lr = LinearRegression()

# Fit a straight line in log-log space and print its R^2, first for the
# per-100 rates and then for the raw counts. The same estimator object is
# refitted for the second pair.
for predictor, response in (('tests_per_100', 'cases_per_100'), ('total', 'positive')):
    X = np.log(df_bystate[predictor].values)
    Y = np.log(df_bystate[response].values)
    features = X.reshape(-1, 1)
    targets = Y.reshape(-1, 1)
    lr.fit(features, targets)
    print(r2_score(targets, lr.predict(features)))

A linear fit between log(tests per 100) and log(cases per 100) does not fit well, as measured by R². There is a better fit between log(number of tests) and log(number of cases), which makes sense: the more people a state tests, the more cases it finds.

In [None]:
fig, ax = plt.subplots(figsize=(10, 15))
bar_positions = np.arange(len(df_bystate))

# Both series share the same bar positions; cases are drawn second, so they
# are painted on top of the tests bars.
for column, legend_label in (('tests_per_100', 'tests per 100'),
                             ('cases_per_100', 'cases per 100')):
    ax.barh(bar_positions, df_bystate[column], label=legend_label)

# One tick per bar, labelled with the state code from the index.
ax.set_yticks(bar_positions)
ax.set_yticklabels(df_bystate.index)
ax.legend()
plt.show()

Some states are testing at a much higher rate than others (WA is highest by far). 

In [None]:
# Daily per-row rates relative to state population.
# NOTE(review): unlike the df_bystate columns of the same name, these divide
# by TotalPop directly (no /100), so the values are per-person fractions
# despite the "_per_100" suffix. The 0.001 cut-off used later for "high rate
# of testing" is on this scale, so rescaling here would also need that
# threshold revisited.
per_person = df[['positive', 'total']].div(df['TotalPop'], axis=0)
df['cases_per_100'] = per_person['positive']
df['tests_per_100'] = per_person['total']

In [None]:
# Daily rows for the three states examined individually below.
df_ca, df_wa, df_ny = (df[df['state'] == code] for code in ('CA', 'WA', 'NY'))

In [None]:
# California over time: the two population-relative series on the left axis,
# and the share of tests that were positive (in %) on a second y-axis.
fig, ax_rates = plt.subplots(figsize=(8, 6))
ca_dates = df_ca['date']
ax_rates.plot(ca_dates, df_ca['cases_per_100'], color='b', label='cases per 100')
ax_rates.plot(ca_dates, df_ca['tests_per_100'], color='r', label='tests per 100')
ax_rates.legend()

ax_share = ax_rates.twinx()
positive_share_pct = 100 * (df_ca['positive'] / df_ca['total'])
ax_share.plot(ca_dates, positive_share_pct, color='green', label='infection rate (%)')
ax_share.legend()

At this point, around 10% of the people tested in CA test positive. Interestingly, on the day CA ramped up testing, the ratio of positive tests to total tests went down a lot.

In [None]:
# Testing over time for CA, WA and NY on one set of axes. Each line is
# labelled and a legend is drawn so the states can be told apart; colours
# match the original (CA blue, WA red, NY green).
fig, ax = plt.subplots()
for state_code, state_rows, line_color in (('CA', df_ca, 'b'),
                                           ('WA', df_wa, 'r'),
                                           ('NY', df_ny, 'g')):
    ax.plot(state_rows['date'], state_rows['tests_per_100'], line_color, label=state_code)
ax.set_xlabel('date')
ax.set_ylabel('tests_per_100')
ax.legend()
plt.show()

Washington started aggressively testing early, whereas NY and California lagged behind. 

In [None]:
high_test_rate = []
plt.figure(figsize=(8,6))
print('states with high rate of testing:\n')

# Draw log(tests_per_100) over time for every state. A state whose first row
# exceeds the 0.001 cut-off is collected, printed and drawn in red; all other
# states are drawn in black with the label 'nan'.
for state_code in np.unique(df.state):
    state_rows = df[df.state == state_code]
    is_high = state_rows['tests_per_100'].iloc[0] > 0.001
    if is_high:
        high_test_rate.append(state_code)
        print('{}'.format(state_code))
    line_label = '{}'.format(state_code if is_high else np.nan)
    plt.plot(
        state_rows['date'],
        np.log(state_rows['tests_per_100']),
        label=line_label,
        color='red' if is_high else 'black',
    )

Some states started testing earlier and continue to test at a higher rate than others. Does this affect infection rate?

In [None]:
# Share of tests that were positive over time for MO, AK and ME. Each state is
# filtered once and its line labelled so the legend identifies it; colours
# match the original (MO blue, AK red, ME green).
fig, ax = plt.subplots()
for state_code, line_color in (('MO', 'b'), ('AK', 'r'), ('ME', 'g')):
    state_rows = df[df.state == state_code]
    ax.plot(state_rows['date'], state_rows['positive'] / state_rows['total'],
            line_color, label=state_code)
ax.set_xlabel('date')
ax.set_ylabel('positive / total')
ax.legend()
plt.show()

In [None]:
print('latest estimated infection rates\n')

# Percentage of tests that were positive, read from the first row of each
# state's frame. The printed name for WA was misspelled ("Washingtion") and is
# corrected here.
# NOTE(review): the first row is treated as the most recent day, which assumes
# the data is ordered newest-first - confirm.
for state_name, state_rows in (('Washington', df_wa),
                               ('California', df_ca),
                               ('New York', df_ny)):
    latest_share = state_rows['positive'].iloc[0] / state_rows['total'].iloc[0]
    print('{0}: {1:0.2f}%'.format(state_name, 100 * latest_share))

In the past week or so, we can begin to trust that a sufficient number of the population is being tested to estimate the infection rate of the population. Looks like Washington's infection rate is stabilizing around 6.6%.

How big of a sample do we need? 

In [None]:
# California's population: the smallest distinct TotalPop value among the CA rows.
ca_pop_values = np.unique(df_ca['TotalPop'])
ca_pop = ca_pop_values[0]

In [None]:
def sample_size_needed(Z,sd,e,N):
    """Return the sample size needed to estimate a proportion in a finite population.

    Computes n0 / (1 + n0 / N) with n0 = Z**2 * sd * (1 - sd) / e**2.

    Parameters
    ----------
    Z : z-score for the desired confidence level (the notebook passes 1.96).
    sd : value used in the sd * (1 - sd) term (the notebook passes 0.5).
    e : margin of error as a fraction (the notebook passes 0.01).
    N : population size.

    Returns
    -------
    The required sample size as a float (not rounded).
    """
    spread = (Z**2)*sd*(1-sd)
    unbounded_size = spread/(e**2)
    finite_correction = 1+(spread/((e**2)*N))
    return unbounded_size/finite_correction

# Required sample size for California's population with Z=1.96, sd=0.5 and a
# 0.01 margin of error; shown as the cell output.
sample_size_needed(Z=1.96, sd=0.5, e=0.01, N=ca_pop)

To be within +/- 1% on our estimate (using Z = 1.96, i.e. roughly 95% confidence, and the most conservative proportion of 0.5), we need a sample size of at least 9,602.

In [None]:
# States whose test count exceeds the 9602 sample-size threshold, as a
# one-column frame built from the matching index labels.
has_enough_tests = df_bystate['total'] > 9602
ss_states = pd.DataFrame(df_bystate.loc[has_enough_tests].index)

In [None]:
# Preview the first five qualifying states.
ss_states.head(n=5)

So we have a large enough sample size in 17 states to say that the number of positives/ number of tests is a good estimate of the rate of infection.

In [None]:
# Daily rows restricted to the states with a sufficient number of tests.
df_ss = pd.merge(ss_states, df, on='state')

In [None]:
%matplotlib inline
plt.figure(figsize=(8,6))
plt.subplot(2,1,1)
for s in np.unique(df_ss.state):
    x = df_ss[df_ss['state']==s]
    #x = x[x['total']>9602]
    plt.plot(x['date'],x['positive']/x['total'],label='{}'.format(s))
plt.xlim(np.max(x['date'])-5,np.max(x['date']))
plt.ylim(0,0.4)

plt.subplot(2,1,2)
for s in np.unique(df_ss.state):
    x = df_ss[df_ss['state']==s]
    #x = x[x['total']>9602]
    plt.plot(x['date'],x['total'],label='{}'.format(s))
plt.xlim(np.max(x['date'])-5,np.max(x['date']))
plt.legend()
plt.show()

Of the states which have a high enough sample size to trust, MI has the highest rate of infection.

In [None]:
# Share of tests that were positive for every row, then sliced per state.
# Despite the df_ prefix these three names hold Series of ratios.
positive_share = df_ss['positive'] / df_ss['total']
df_MI = positive_share[df_ss.state == 'MI']
df_NY = positive_share[df_ss.state == 'NY']
df_IL = positive_share[df_ss.state == 'IL']

# Report the first row of each state as a percentage.
mi_pct = 100 * df_MI.iloc[0]
ny_pct = 100 * df_NY.iloc[0]
il_pct = 100 * df_IL.iloc[0]
print('MI infection rate: {0:2.2f}%'.format(mi_pct))
print('NY infection rate: {0:2.2f}%'.format(ny_pct))
print('IL infection rate: {0:2.2f}%'.format(il_pct))

It should be noted that this assumes none of these cases have recovered or died; of course, recoveries and deaths will change the percentage of infected people.