In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.figure as fg # figure control
import matplotlib.pyplot as plt # plotting
import matplotlib.ticker as plticker  #ticker control
import numpy as np
import os
import pandas as pd

# Print the full path of every file found beneath the Kaggle input directory,
# so the available datasets are visible in the notebook output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Introduction
"There is variation in the rate of testing and positive results among states, but most need to administer more tests to get to the level the researchers suggest — a minimum of about 152 tests per 100,000 people each day."

In most states, experts agree [not enough testing is being done at the level needed to safely reopen](https://www.nytimes.com/interactive/2020/04/17/us/coronavirus-testing-states.html).

This notebook examines trends in coronavirus testing among states, including overall numbers of tests, common trends or issues in reporting the number of tests, and test-positivity, using [the most recent COVID19 Worldwide Testing Data](https://www.kaggle.com/lin0li/covid19testing) from The Atlantic's [COVID Tracking Project](https://covidtracking.com/api).

Finally, we will extend graphs showing number of tests per capita conducted in each state from [the article cited above](https://www.nytimes.com/interactive/2020/04/17/us/coronavirus-testing-states.html) using the latest data and the [US Census National Population Estimates](https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv).

In [None]:
# Load the worldwide testing dataset and keep only the United States rows.
df1 = pd.read_csv('/kaggle/input/covid19testing/tested_worldwide.csv', sep=',')
df1.dataframeName = 'tested_worldwide.csv'
us = df1.loc[df1['Country_Region'] == 'United States']

# Sorted unique Province_State values found in the US rows.
all_states = np.sort(us['Province_State'].unique())

# Drop six entries by their position in the sorted array.
# NOTE(review): presumably these positions are territories / aggregate rows;
# positional deletion silently breaks if the dataset gains or loses an entry -- confirm.
raw_states = np.delete(all_states, [2, 3, 13, 38, 43, 51])

# Show the remaining names as a single comma-separated line.
print("States included:")
states_list = ', '.join(raw_states)
print(states_list)

We'll start by examining the raw data -- number of tests being conducted in each state and the number of tests that are positive.

The first thing you'll notice are huge dips and spikes (and even negative numbers) that are mostly driven by changes in testing capacity or reporting methodology (or corrections). Later we'll use a 7-day rolling average to smooth things out, but bear these raw numbers in mind. The COVID Tracking Project's [data page](https://covidtracking.com/data) explains many of the specific events or data issues, by state. Some spikes indicate real changes in testing capacity, such as the spike in New Jersey's numbers on May 15th, which reflects [their limited initiative to test all residents](https://www.mycentraljersey.com/story/news/health/2020/05/15/coronavirus-nj-middlesex-county-begin-testing-asymptomatic/5197296002/) including those who show no symptoms of COVID.

Notice that some states have a very small gap between the number of total tests conducted and the number of positive tests. This means they're really only testing people who almost certainly have COVID, most likely because they don't have the capacity or ability to test anyone else -- more on this below.

Here's a snippet of the raw data, followed by the graphs:

In [None]:
# Select the Alabama rows and display the last five as a sample of the raw columns.
a = df1.loc[df1['Province_State'] == 'Alabama']
a.tail()

In [None]:
# Small-multiples grid: one panel per state with the raw daily number of tests
# and the raw daily number of positive tests.
fig, axs = plt.subplots(9, 6, gridspec_kw={'hspace': 0.4, 'wspace': 0.3}, figsize=(20, 20))

for i, ax in enumerate(axs.flat):
    # Hide the leftover panels; the bound follows the actual number of states
    # instead of a hard-coded count, so the loop cannot index past raw_states.
    if i >= len(raw_states):
        ax.set_visible(False)
        continue

    # Filter the US-only frame (raw_states was derived from it) so that a
    # same-named province in another country can never leak into a panel.
    state = us[us.Province_State == raw_states[i]]

    # Drop the year prefix to keep the x tick labels short.
    short_dates = state.Date.str.replace('2020-', '', regex=False).tolist()

    ax.plot(short_dates, state.daily_tested, '-')
    ax.plot(short_dates, state.daily_positive, '-')
    ax.set_title(raw_states[i])

    if i == 0:
        # Only the first panel keeps its x axis, thinned to one tick every 20 dates.
        ax.xaxis.set_major_locator(plticker.MultipleLocator(base=20.0))
    else:
        ax.get_xaxis().set_visible(False)

fig.suptitle('Daily Total Tests and Positive Tests By State (Raw)')

plt.show()

Let's dive deeper into what we can or can't tell from the data. Are we doing enough testing to give a reasonably accurate picture of how COVID is spreading? One way to answer this is to examine the "test-positivity rate".

The daily test-positivity rate is defined as:
\begin{equation}
\text{daily test-positivity rate} = \frac{\text{number of daily new positive tests}}{\text{number of daily new tests}} \times 100\%
\end{equation}

From [this article in the Atlantic on the utility of test-positivity rates](https://www.theatlantic.com/technology/archive/2020/04/us-coronavirus-outbreak-out-control-test-positivity-rate/610132/), "\[the\] high test-positivity rate almost certainly means that the U.S. is not testing everyone who has been infected with the pathogen, because it implies that doctors are testing only people with a very high probability of having the infection. People with milder symptoms, to say nothing of those with none at all, are going undercounted. Countries that test broadly should encounter far more people who are not infected than people who are, so their test-positivity rate should be lower." For example, South Korea is often cited as the example of effective pandemic control through mass testing. Their positive test rate was between 2-3% of all tests; Canada, Germany and Denmark have positivity rates of 6-8%; Australia and New Zealand have 2% positivity rates. As of May 17th, the US was down to a 5% test-positive rate overall.

The article also notes that comparing test-positivity rates between different regions may be fraught since not all populations may show equal [incidence](https://www.cdc.gov/csels/dsepd/ss1978/lesson3/section2.html) of COVID-19, and the test-positivity numbers themselves come with a lot of caveats (we don't know exactly who is getting tested and how reliable the tests are). However, "\[the test-positivity rate\] can still give a rough sense of how bad a particular outbreak is by distinguishing between places undergoing very different sizes of epidemics, Andrews said. A country with a 25 percent positivity rate and one with a 2 percent positivity rate are facing “vastly different epidemics,” he said, and the 2 percent country is better off."

Let's look at the test-positive rates in each state. We'll also apply a 7-day moving average to the number of daily tests. Generally, we should expect to see two lines with opposite slopes that cross somewhere in the middle -- as testing capacity increases, test-positivity rates should decrease if the virus is spreading only steadily or declining. Most states show this pattern, which is good news. States that might still be experiencing exponential growth might have an increase in testing with a corresponding increase in test-positivity rate -- for example, Ohio, Wyoming, Minnesota.

In [None]:
# Calculate test-positive rates per state: the percentage of each day's tests
# that came back positive.
#
# The clean-up is one chained expression so every step takes effect. Calling
# replace/fillna with inplace=True on an attribute-accessed column is not
# guaranteed to write back to the frame, and a bare clip() call has no effect
# because its result is discarded.
us = us.assign(
    daily_rate=(
        ((us.daily_positive / us.daily_tested) * 100)
        # A day with positives but zero reported tests divides to +/-inf; the
        # rate is undefined there, so treat it like the 0/0 (NaN) case below.
        .replace([np.inf, -np.inf], np.nan)
        # Negative rates come from negative daily counts; floor them at zero.
        .clip(lower=0)
        # Undefined rates are reported as 0.
        .fillna(0)
    )
)

In [None]:
# Small-multiples grid: per state, the 7-day rolling test-positivity rate
# (orange, left axis) against the 7-day rolling number of daily tests
# (blue, right axis).
fig, axs = plt.subplots(9, 6, sharey=True, gridspec_kw={'hspace': 0.4, 'wspace': 0.4}, figsize=(20, 25))

# Iterate over the grid created above rather than fig.axes, so the twin axes
# added inside the loop can never become part of the iteration.
for i, ax in enumerate(axs.flat):
    # Hide the leftover panels; the bound follows the actual number of states
    # instead of a hard-coded count.
    if i >= len(raw_states):
        ax.set_visible(False)
        continue

    state = us[us.Province_State == raw_states[i]]

    # Drop the year prefix to keep the x tick labels short.
    short_dates = state.Date.str.replace('2020-', '', regex=False).tolist()

    # 7-day moving averages smooth out reporting spikes and dips.
    daily_tested_rolling = state.daily_tested.rolling(window=7).mean()
    daily_rate_rolling = state.daily_rate.rolling(window=7).mean()
    ax.plot(short_dates, daily_rate_rolling, '-', color='tab:orange')

    # The number of tests gets its own y axis because its scale differs from the rate's.
    ax2 = ax.twinx()
    ax2.plot(short_dates, daily_tested_rolling, '-')
    ax2.yaxis.set_major_locator(plticker.MaxNLocator(nbins=5))
    ax.set_title(raw_states[i])

    if i == 0:
        # Only the first panel keeps its x axis, thinned to one tick every 20 dates.
        ax.xaxis.set_major_locator(plticker.MultipleLocator(base=20.0))
    else:
        ax.get_xaxis().set_visible(False)

fig.suptitle('Daily Test-Positivity Rate (orange) and Total Tests (blue) By State (Rolling)')

plt.show()

Let's look at tests per capita, using population estimates for each state from [the US Census Bureau National Population Totals](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-national-total.html).

The good news is, as noted on [this Johns Hopkins page using COVID Tracking Project data](https://coronavirus.jhu.edu/testing/testing-positivity), most states are below the WHO recommended 12% test-positivity threshold to reopen.

In [None]:
# Read the US Census population-estimates file; it supplies the per-state
# population used as the denominator for per-capita testing.
census_csv_path = '/kaggle/input/nstest2019alldata/nst-est2019-alldata.csv'
popdata = pd.read_csv(census_csv_path, sep=',')

In [None]:
# Calculate daily test-positive rates per state (percentage of each day's
# tests that came back positive). No rolling window is applied here; smoothing
# happens where the values are plotted.
#
# The clean-up is one chained expression so every step takes effect. Calling
# replace/fillna with inplace=True on an attribute-accessed column is not
# guaranteed to write back to the frame, and a bare clip() call has no effect
# because its result is discarded.
us = us.assign(
    daily_rate=(
        ((us.daily_positive / us.daily_tested) * 100)
        # A day with positives but zero reported tests divides to +/-inf; the
        # rate is undefined there, so treat it like the 0/0 (NaN) case below.
        .replace([np.inf, -np.inf], np.nan)
        # Negative rates come from negative daily counts; floor them at zero.
        .clip(lower=0)
        # Undefined rates are reported as 0.
        .fillna(0)
    )
)

In [None]:
# Small-multiples grid: per state, the 7-day rolling number of daily tests per
# 100,000 residents, with a dashed reference line at 152 tests per 100,000
# (the minimum daily level quoted in the introduction).
#
# Also builds `all_tests_per_capita`: one row per state, one column per date.
fig, axs = plt.subplots(9, 6, gridspec_kw={'hspace': 0.4, 'wspace': 0.3}, figsize=(20, 20), sharey=True)

# Collect one Series per state and build the frame once after the loop.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic.
per_capita_rows = []

for i, ax in enumerate(axs.flat):
    # Hide the leftover panels; the bound follows the actual number of states
    # instead of a hard-coded count.
    if i >= len(raw_states):
        ax.set_visible(False)
        continue

    state_name = raw_states[i]
    state = us[us.Province_State == state_name]

    # Drop the year prefix to keep the x tick labels short.
    short_dates = state.Date.str.replace('2020-', '', regex=False)
    daily_tested_rolling = state.daily_tested.rolling(window=7).mean()

    # 2019 population estimate for this state (raises IndexError if the state
    # name has no matching row in the census file).
    pop = popdata[popdata.NAME == state_name].POPESTIMATE2019.values[0]

    tests_per_capita = pd.Series(daily_tested_rolling.divide(pop) * 100000, name=state_name)
    tests_per_capita.index = state.Date  # Set index to dates; does not work in initialization
    per_capita_rows.append(tests_per_capita)

    ax.plot(short_dates, tests_per_capita, '-')
    ax.axhline(y=152, ls='--')
    ax.set_title(state_name)
    if i == 0:
        # Only the first panel keeps its x axis, thinned to one tick every 20 dates.
        ax.xaxis.set_major_locator(plticker.MultipleLocator(base=20.0))
    else:
        ax.get_xaxis().set_visible(False)

    # Label the last plotted point with its value. The x coordinate must be
    # the shortened date that was actually plotted; the full 'YYYY-...' string
    # is not a category on this axis and would place the label off the curve.
    latest = tests_per_capita.iloc[-1]
    if pd.notna(latest):  # int() cannot convert NaN (e.g. fewer than 7 rows)
        ax.annotate(str(int(latest)), xy=(short_dates.iloc[-1], latest), xytext=(-20, 10), textcoords='offset points')

# Rows are labelled by each Series' name (the state); columns are the dates.
all_tests_per_capita = pd.DataFrame(per_capita_rows)

fig.suptitle('Daily Number of Tests Per Capita (100,000) By State')

plt.show()

Finally, let's combine what we know about daily tests, test-positivity rate, and per capita testing into one graph, focusing on data from the last week.

States with larger bubbles are conducting a greater total number of tests. This helps us see that although states like California and Florida are testing relatively low numbers per capita, they are still running a lot of tests. They just have a lot of people, so securing adequate testing supplies or ensuring capacity may be a challenge.

States in the upper left are testing many of their residents and finding low rates of infection in the last week. This is good news for them.

States in the lower right are not testing many residents, so most tests are coming back positive. This means we don't have a complete picture of what's happening in that state.

States shown in brighter colors have had more deaths in the state in the last week. States with large absolute numbers of deaths and smaller numbers of tests per capita, such as Pennsylvania and California, may also be struggling with test capacity.

In [None]:
# Calculate mean daily rates and total tested for all states, for the last seven days

# Get only the last seven days. The unique dates are sorted explicitly so the
# selection does not depend on the row order of the input file.
# NOTE(review): assumes Date holds ISO 'YYYY-MM-DD' strings, for which
# lexicographic order equals chronological order -- confirm.
last_week = np.sort(us['Date'].unique())[-7:]
d = us[['Province_State', 'daily_rate', 'daily_tested', 'death', 'Date']]
last_week_data = d[d['Date'].isin(last_week)]

# Remove Maine because of low reporting
trimmed_states = np.delete(raw_states, np.where(raw_states == 'Maine'))

# Final mean arrays to plot: one-column DataFrames indexed by state, in
# trimmed_states order.
weekly_means = last_week_data.groupby('Province_State')[['daily_rate', 'daily_tested']].mean()
mean_daily_rate = weekly_means[['daily_rate']].loc[trimmed_states]
mean_daily_tested = weekly_means[['daily_tested']].loc[trimmed_states]

# Calculate total number of deaths from the last week: the change in the
# `death` column between the first and last of the seven dates.
# Rows are ordered by date within each state so that diff(periods=6) is
# chronological; also sorting by `death` would scramble the order whenever
# the reported count is revised downward.
d = last_week_data.sort_values(by=['Province_State', 'Date'])
d['last_week_deaths'] = d.groupby('Province_State')['death'].diff(periods=6)
total_deaths = (
    d[d['Date'] == last_week[-1]][['Province_State', 'last_week_deaths']]
    .set_index('Province_State')
    .loc[trimmed_states]
)

In [None]:
# Average tests per capita over the seven latest date columns, one value per
# state, with the Maine row excluded.
last_seven_dates = all_tests_per_capita.columns.sort_values()[-7:]
mean_daily_tpc = all_tests_per_capita[last_seven_dates].mean(axis=1)
mean_daily_tpc = mean_daily_tpc.loc[mean_daily_tpc.index != 'Maine']

In [None]:
# Bubble chart summarising the last week, one bubble per state:
#   x     = mean daily test-positivity rate
#   y     = mean daily tests per 100,000 residents
#   size  = mean daily number of tests (scaled down by 10)
#   color = deaths over the last week
fig, ax = plt.subplots(figsize=(30, 12))

# Keep the scatter's return value: it is the mappable the colorbar needs.
points = ax.scatter(
    mean_daily_rate['daily_rate'],
    mean_daily_tpc,
    s=mean_daily_tested['daily_tested'] / 10,
    c=total_deaths['last_week_deaths'],
    alpha=0.5,
)

# Label every bubble with its state name; scalar lookups keep xy a plain
# pair of numbers.
for state in mean_daily_rate.index:
    ax.annotate(state, xy=(mean_daily_rate.loc[state, 'daily_rate'], mean_daily_tpc.loc[state]))

ax.set_xlabel('Average Test-Positivity Rate')
ax.set_ylabel('Average Tests Per Capita (100,000)')
ax.set_title('Average Daily Test-Positivity Rate vs. Tests Per Capita By State, Colored by Total Deaths, Over The Past Week')

ax.grid(True)

# Build the colorbar from the scatter itself. Drawing an invisible pcolor mesh
# just to obtain a mappable would also push the mesh's extent into the axes'
# data limits.
fig.colorbar(points, ax=ax)
plt.show()