In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load and prepare data

Load the data

In [None]:
# Pull the state-level daily CSV published at covidtracking.com into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='https://covidtracking.com/api/v1/states/daily.csv',
)

Reorder the data so that the oldest dates are at the top of the dataframe; some of the plots work better that way.

In [None]:
# Put the oldest rows first by sorting on the date column instead of flipping the frame.
# Sorting yields the same layout however many times this cell is executed, whereas
# df.iloc[::-1] toggles the order on every re-run. mergesort is a stable sort, so rows
# that share a date keep their existing relative order.
df = df.sort_values('date', kind='mergesort')

Also, stripping out the year and turning the date into a string will simplify plotting.

In [None]:
# 'date' holds values in YYYYMMDD form (see the format string below).
# Drop exactly the four year digits so 'Date' keeps the zero-padded month and day,
# e.g. 20200421 -> '0421'. Slicing from position 5 would also cut the first month digit,
# making January 15 and November 15 indistinguishable ('115').
df['Date'] = df['date'].astype(str).str[4:]
# A real datetime column, used as the x-axis for the time-series plots.
df['DateTime'] = pd.to_datetime(df['date'], format='%Y%m%d')

# Question 1:  How many states?

How many different "states" are there?

In [None]:
# Sorted array of the distinct values in the 'state' column, computed once and reused.
state_codes = np.unique(df['state'])
print(state_codes)
print('There are %d unique state entries, must be including territories.' % len(state_codes))

# Question 2: Which 5 states have highest positive testing rate?

First we will make a new dataframe that has just the states, positives, negatives, and the positive fraction.

In [None]:
# One row per state holding the largest 'positive' and 'negative' values seen for it.
# The string 'max' selects pandas' built-in groupby reduction; handing np.max to
# aggregate() is deprecated in recent pandas releases and emits a FutureWarning.
posneg = (
    df.groupby(['state'])
    .aggregate({'positive': 'max', 'negative': 'max'})
    .reset_index()
)
# Share of positive results among all results (positive + negative).
posneg['PositiveRatio'] = posneg['positive'] / (posneg['positive'] + posneg['negative'])
posneg.head()

Now find the 5 highest ratios.

In [None]:
# Rank states from highest to lowest positive ratio and show the first five.
ranked_by_ratio = posneg.sort_values(by='PositiveRatio', ascending=False)
ranked_by_ratio.head(5)[['state', 'PositiveRatio']]

# Question 3:  Plot daily death toll for whole country

Let's plot daily death toll for the whole country.

In [None]:
# Add up every state's 'deathIncrease' for each day, then plot the national series.
# 'sum' (a string) is used instead of np.sum, which pandas has deprecated as an
# aggregate() argument.
us_daily_deaths = df.groupby('DateTime').aggregate({'deathIncrease': 'sum'})
ax = us_daily_deaths.plot(figsize=[15, 8])
ax.set_title('US Daily Death Toll')

# Question 4:  Plot daily death toll for just Kansas and Missouri

Now let's focus on Missouri and Kansas. First we will graph their combined totals.

In [None]:
# Create the figure and axes explicitly and hand the axes to DataFrame.plot.
# Calling plt.figure() and then DataFrame.plot(figsize=...) without `ax` makes pandas
# open a second figure, leaving the first one behind as an empty figure in the output.
fig, ax = plt.subplots(figsize=[15, 8])
mo_ks = df[df['state'].isin(['MO', 'KS'])]
# Combined daily deaths of the two states; 'sum' replaces the deprecated np.sum callable.
mo_ks.groupby('DateTime').aggregate({'deathIncrease': 'sum'}).plot(ax=ax)
ax.set_title('Missouri + Kansas Daily Death Toll')

Now let's separate them out.

In [None]:
# One line per state: daily deaths for Missouri and Kansas drawn on a shared axes.
mo_ks_rows = df[df['state'].isin(['MO', 'KS'])]
fig, ax = plt.subplots(figsize=[15, 8])
sns.lineplot(x='DateTime', y='deathIncrease', hue='state', data=mo_ks_rows, ax=ax)
ax.set_title('Missouri and Kansas Daily Death Toll')

# Question 5:  Plot the daily testing rate for Kansas and Missouri

What does the daily testing rate look like for Kansas and Missouri?

In [None]:
# Daily increase in total test results for Missouri and Kansas, one line per state.
is_mo_or_ks = df['state'].isin(['MO', 'KS'])
fig, ax = plt.subplots(figsize=[15, 8])
sns.lineplot(
    data=df.loc[is_mo_or_ks],
    x='DateTime',
    y='totalTestResultsIncrease',
    hue='state',
    ax=ax,
)
ax.set_title('Missouri + Kansas Daily Testings')

# Question 6:  Make a table of peak testing day for each state

In [None]:
# For each state, find the row with the largest 'totalTestResultsIncrease' and list the
# states by the date of that peak.
# idxmax returns the index label of the first maximum in each group, and .loc looks rows
# up by label, so the lookup is consistent. The previous form relied on groupby.apply
# (deprecated when the applied frame includes the grouping column) and on np.argmax of a
# Series, whose result has been a label or a position depending on the pandas version.
# Relies on df's index labels being unique, which holds for the default index that
# read_csv assigns.
peak_rows = df.groupby('state')['totalTestResultsIncrease'].idxmax()
(
    df.loc[peak_rows, ['state', 'totalTestResultsIncrease', 'DateTime']]
    .set_index('state')
    .sort_values(by='DateTime')
)



# Question 8:  What is the peak death day for the whole country?

In [None]:
# National deaths per day: sum 'deathIncrease' over all states for each date.
# 'sum' replaces np.sum, which pandas has deprecated as an aggregate() argument.
agged_df = df.groupby(['DateTime']).aggregate({'deathIncrease': 'sum'}).reset_index()
# idxmax yields the index *label* of the largest value, which is what .loc expects;
# np.argmax yields a *position* and only worked because reset_index made the two coincide.
agged_df.loc[agged_df['deathIncrease'].idxmax()]

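At the time this notebook was last run, the peak was April 21, when 2674 deaths were reported. The figure will change as new data is published.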

# Challenge:  Plot the per capita testing fraction in descending order

First let's make a new dataframe that simply aggregates the total tests per state.

In [None]:
# Total tests per state: add up the daily 'totalTestResultsIncrease' values.
# The string 'sum' is used because passing np.sum to agg() is deprecated in recent pandas.
tests_agged = (
    df.groupby(['state'])
    .agg({'totalTestResultsIncrease': 'sum'})
    .reset_index()
)

Now let's read in the population data, and keep only the parts we want.

In [None]:
# Load only the two columns needed from the Census Bureau file: the area name and
# CENSUS2010POP.
# NOTE(review): CENSUS2010POP is presumably the 2010 census count, so the per-capita
# figures computed later would use 2010 populations - confirm this is intended.
pop = pd.read_csv(
    'http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv',
    usecols=['NAME', 'CENSUS2010POP'],
)

Now the problem is that our tests_agged has state abbreviations and our pop dataframe has full state names.  We need to convert one to the other before we can merge.

First let's load a file that has both.

In [None]:
# Lookup table of state names and abbreviations; the 'Abbreviation' and 'State'
# columns are the ones read from it later.
abb = pd.read_csv(
    filepath_or_buffer='http://www.fonz.net/blog/wp-content/uploads/2008/04/states.csv',
)

Then we will make a dictionary that takes abbreviations and turns them into the full state names.

In [None]:
# abbreviation -> full state name; if an abbreviation appears twice, the last row wins.
mapper = dict(zip(abb['Abbreviation'], abb['State']))

Now that we have a dictionary, we can create a new column in tests_agged and use .map to generate it.

In [None]:
# Translate each abbreviation to its full name; abbreviations absent from mapper become NaN.
tests_agged = tests_agged.assign(StateNames=tests_agged['state'].map(mapper))

Now we can merge the tests_agged and pop dataframes because they both have a column with the full state name in them.

In [None]:
# Attach each state's population by matching on the full state name.
# Population columns left over from an earlier run of this cell are dropped first, so
# re-executing the cell does not create suffixed duplicates (NAME_x, NAME_y, ...) that
# would break the later 'CENSUS2010POP' lookup.
# This is an inner join (the pandas default): rows whose StateNames value has no matching
# NAME in pop are discarded.
tests_agged = (
    tests_agged
    .drop(columns=list(pop.columns), errors='ignore')
    .merge(pop, how='inner', left_on='StateNames', right_on='NAME')
)

Calculate the number of tests per 1,000 people and store it in a new column.

In [None]:
# Tests per 1,000 people: total tests divided by population, then scaled by 1000.
tests_per_person = tests_agged['totalTestResultsIncrease'].div(tests_agged['CENSUS2010POP'])
tests_agged['PerThousand'] = tests_per_person.mul(1000)

Sort, show the top 5.

In [None]:
# Five states with the most tests per 1,000 people, highest first.
top_tested = tests_agged.sort_values(by='PerThousand', ascending=False).head(5)
top_tested[['NAME', 'PerThousand']]

In [None]:
# Horizontal bars, one per state, ordered from most to fewest tests per 1,000 people.
by_rate = tests_agged.sort_values(by='PerThousand', ascending=False)
fig, ax = plt.subplots(figsize=[15, 15])
sns.barplot(x='PerThousand', y='NAME', data=by_rate, ax=ax)