# Script to consolidate data from the [Covid Tracking Project](https://covidtracking.com/) and calculate simple ratios for display in Tableau

# Packages and data import

In [1]:
# packages
import requests, json
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# render any matplotlib plots inline in the notebook
%matplotlib inline

# show all columns
pd.set_option('display.max_columns', None)

In [2]:
# lookup table pairing each state name with its abbreviation
state_table_path = "../inputs/state_table.csv"
states_abbr = pd.read_csv(state_table_path)
states_abbr.head()

Unnamed: 0,state,state_abbr
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


### Load [US daily 4pm EST](https://covidtracking.com/data/us-daily) data

In [3]:
national_url = "https://covidtracking.com/api/us/daily"

In [4]:
# request from url
# time out instead of hanging on a stalled connection, and stop on an HTTP
# error status so an error page is never parsed as data
national_raw = requests.get(national_url, timeout = 30)
national_raw.raise_for_status()

In [5]:
# convert request to json
national_json = national_raw.json()

In [6]:
# show first record in json file
national_json[:1]

[{'date': 20200416,
  'states': 56,
  'positive': 663260,
  'negative': 2737804,
  'pending': 16927,
  'hospitalizedCurrently': 57494,
  'hospitalizedCumulative': 74705,
  'inIcuCurrently': 15150,
  'inIcuCumulative': 1834,
  'onVentilatorCurrently': 5940,
  'onVentilatorCumulative': 137,
  'recovered': 42032,
  'hash': 'e42dcecd56cc690f4b4c5c242e02b8eb8e3491b6',
  'dateChecked': '2020-04-16T20:00:00Z',
  'death': 30296,
  'hospitalized': 74705,
  'total': 3417991,
  'totalTestResults': 3401064,
  'posNeg': 3401064,
  'deathIncrease': 2136,
  'hospitalizedIncrease': 5102,
  'negativeIncrease': 127705,
  'positiveIncrease': 30604,
  'totalTestResultsIncrease': 158309}]

In [7]:
# convert json to dataframe
national = pd.DataFrame(national_json)

# show first five rows
national.head()

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,20200416,56,663260,2737804.0,16927.0,57494.0,74705.0,15150.0,1834.0,5940.0,137.0,42032.0,e42dcecd56cc690f4b4c5c242e02b8eb8e3491b6,2020-04-16T20:00:00Z,30296,74705.0,3417991,3401064,3401064,2136.0,5102.0,127705.0,30604.0,158309.0
1,20200415,56,632656,2610099.0,16901.0,58361.0,69603.0,14667.0,1783.0,6032.0,223.0,39405.0,ca6d549a02f56add12897c5a47d051c50f2e8c75,2020-04-15T20:00:00Z,28160,69603.0,3259656,3242755,3242755,2492.0,2056.0,130952.0,30183.0,161135.0
2,20200414,56,602473,2479147.0,16615.0,54215.0,67547.0,14039.0,1715.0,5975.0,221.0,37645.0,7ea156ba5cb34498d2798ce71d9470bdbb27201b,2020-04-14T20:00:00Z,25668,67547.0,3098235,3081620,3081620,2299.0,4874.0,120915.0,25699.0,146614.0
3,20200413,56,576774,2358232.0,17159.0,50968.0,62673.0,13632.0,1628.0,6168.0,210.0,35442.0,171a7aa78e00daf2ddb2e32baedcdf1127162a17,2020-04-13T20:00:00Z,23369,62673.0,2952165,2935006,2935006,1450.0,1472.0,104166.0,24948.0,129114.0
4,20200412,56,551826,2254066.0,16419.0,51413.0,61201.0,13917.0,1455.0,5986.0,160.0,34151.0,b66df37c6be1e91d8fb155d5612a9fb3202e8e52,2020-04-12T20:00:00Z,21919,61201.0,2822311,2805892,2805892,1564.0,2652.0,111243.0,28983.0,140226.0


### Load [states daily 4pm EST](https://docs.google.com/spreadsheets/u/2/d/e/2PACX-1vRwAqp96T9sYYq2-i7Tj0pvTf6XVHjDSMIKBdZHXiCGGdNC0ypEU9NbngS8mxea55JuCFuua1MUeOj5/pubhtml) data

In [8]:
states_url = "https://covidtracking.com/api/states/daily"

In [9]:
# request from url
# time out instead of hanging on a stalled connection, and stop on an HTTP
# error status so an error page is never parsed as data
states_raw = requests.get(states_url, timeout = 30)
states_raw.raise_for_status()

In [10]:
# convert request to json
states_json = states_raw.json()

In [11]:
# show first record in json file
states_json[:1]

[{'date': 20200416,
  'state': 'AK',
  'positive': 300,
  'negative': 8435,
  'pending': None,
  'hospitalizedCurrently': None,
  'hospitalizedCumulative': 35,
  'inIcuCurrently': None,
  'inIcuCumulative': None,
  'onVentilatorCurrently': None,
  'onVentilatorCumulative': None,
  'recovered': 110,
  'hash': '42353f78693987154276c7c482342a46ec11c180',
  'dateChecked': '2020-04-16T20:00:00Z',
  'death': 9,
  'hospitalized': 35,
  'total': 8735,
  'totalTestResults': 8735,
  'posNeg': 8735,
  'fips': '02',
  'deathIncrease': 0,
  'hospitalizedIncrease': 1,
  'negativeIncrease': 64,
  'positiveIncrease': 7,
  'totalTestResultsIncrease': 71}]

In [12]:
# convert json to dataframe
states = pd.DataFrame(states_json)

# show first five rows
states.head()

Unnamed: 0,date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,20200416,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16T20:00:00Z,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0
1,20200416,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16T20:00:00Z,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0
2,20200416,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16T20:00:00Z,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0
3,20200416,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16T20:00:00Z,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0
4,20200416,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16T20:00:00Z,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0


In [13]:
# data types by column
states.dtypes

date                          int64
state                        object
positive                    float64
negative                    float64
pending                     float64
hospitalizedCurrently       float64
hospitalizedCumulative      float64
inIcuCurrently              float64
inIcuCumulative             float64
onVentilatorCurrently       float64
onVentilatorCumulative      float64
recovered                   float64
hash                         object
dateChecked                  object
death                       float64
hospitalized                float64
total                       float64
totalTestResults            float64
posNeg                      float64
fips                         object
deathIncrease               float64
hospitalizedIncrease        float64
negativeIncrease            float64
positiveIncrease            float64
totalTestResultsIncrease    float64
dtype: object

In [14]:
# Rebuild the table from the raw JSON so this cell can be re-run safely:
# converting in place would, on a second run, try to parse the already
# formatted 'YYYY-MM-DD' strings with the '%Y%m%d' format and fail.
states = (
    pd.DataFrame(states_json)
    .assign(
        # 'date' arrives as an integer like 20200416; store it as a 'YYYY-MM-DD' string
        date = lambda df: pd.to_datetime(df['date'], format = "%Y%m%d").dt.strftime('%Y-%m-%d'),
        # 'dateChecked' arrives as an ISO string; store it as a timestamp
        dateChecked = lambda df: pd.to_datetime(df['dateChecked']),
    )
    # rename 'state' to 'state_abbr'
    .rename(columns = {'state': 'state_abbr'})
)

# show first five rows
states.head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,2020-04-16,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16 20:00:00+00:00,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0
1,2020-04-16,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16 20:00:00+00:00,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0
2,2020-04-16,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16 20:00:00+00:00,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0
3,2020-04-16,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16 20:00:00+00:00,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0
4,2020-04-16,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16 20:00:00+00:00,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0


### Load state populations from Census American Community Survey (ACS5)

In [15]:
# map url to var
states_acs5_url = "https://api.census.gov/data/2017/acs/acs5?get=NAME,B01001_001E&for=state:*"

In [16]:
# import data from url
# time out instead of hanging on a stalled connection, and stop on an HTTP
# error status so an error page is never parsed as data
states_acs5_raw = requests.get(states_acs5_url, timeout = 30)
states_acs5_raw.raise_for_status()

In [17]:
# data to json
states_acs5_json = states_acs5_raw.json()

In [18]:
# the first element of the payload holds the column names, the rest are the rows
acs5_header, *acs5_rows = states_acs5_json
acs5_renames = {"B01001_001E": "population", "NAME": "state", "state": "state_fips"}

states_pop = pd.DataFrame(acs5_rows, columns = acs5_header)
states_pop = states_pop.rename(columns = acs5_renames)

# preview the first five rows
states_pop.head()

Unnamed: 0,state,population,state_fips
0,Puerto Rico,3468963,72
1,Alabama,4850771,1
2,Alaska,738565,2
3,Arizona,6809946,4
4,Arkansas,2977944,5


Join the state abbreviation onto the population table; the abbreviation is the key used to join population onto the state case table below.

In [19]:
# left join state abbr
# Any 'state_abbr' column left over from an earlier run of this cell is dropped
# first; otherwise a re-run produces 'state_abbr_x' / 'state_abbr_y' and the
# later join on 'state_abbr' fails.
states_pop = pd.merge(states_pop.drop(columns = 'state_abbr', errors = 'ignore'),
                      states_abbr, on = "state", how = "left")

# show first five rows
states_pop.head()

Unnamed: 0,state,population,state_fips,state_abbr
0,Puerto Rico,3468963,72,PR
1,Alabama,4850771,1,AL
2,Alaska,738565,2,AK
3,Arizona,6809946,4,AZ
4,Arkansas,2977944,5,AR


In [20]:
states.head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,2020-04-16,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16 20:00:00+00:00,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0
1,2020-04-16,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16 20:00:00+00:00,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0
2,2020-04-16,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16 20:00:00+00:00,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0
3,2020-04-16,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16 20:00:00+00:00,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0
4,2020-04-16,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16 20:00:00+00:00,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0


### Load national population from Census American Community Survey (ACS5)

In [21]:
# map url to var
us_acs5_url = "https://api.census.gov/data/2017/acs/acs5?get=NAME,B01001_001E&for=us:*"

# import data from url; time out rather than hang, and stop on an HTTP error status
us_acs5_raw = requests.get(us_acs5_url, timeout = 30)
us_acs5_raw.raise_for_status()

# data to json
us_acs5_json = us_acs5_raw.json()

# json to dataframe: the first element of the payload is used as the header row
us_pop = (
    pd.DataFrame(us_acs5_json[1:], columns = us_acs5_json[0])
    .rename(columns = {"B01001_001E": "population", "NAME": "country"})
    .drop(columns = ['us'])
)

# show table
us_pop

Unnamed: 0,country,population
0,United States,321004407


# Join state population to state case table

In [22]:
# attach each state's full name and population to the daily test/case rows;
# a left join keeps every row of `states`, matched or not
population_lookup = states_pop[['state', 'state_abbr', 'population']]
combined_1 = states.merge(population_lookup, on = 'state_abbr', how = 'left')

# show first five rows
combined_1.head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population
0,2020-04-16,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16 20:00:00+00:00,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0,Alaska,738565.0
1,2020-04-16,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16 20:00:00+00:00,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0,Alabama,4850771.0
2,2020-04-16,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16 20:00:00+00:00,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0,Arkansas,2977944.0
3,2020-04-16,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16 20:00:00+00:00,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0,,
4,2020-04-16,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16 20:00:00+00:00,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0,Arizona,6809946.0


# Create ratios

In [23]:
# Day-over-day change in the cumulative total / positive / negative counts, per state.
#
# Group on 'state_abbr', not 'state': 'state' comes from the population join and is
# NaN for rows with no population match, and groupby drops NaN keys, so those rows
# would never get a difference.
#
# Rows are sorted by date within each state before differencing, so the result does
# not depend on the row order of the API response. The assignments below align back
# to combined_1 on its index.
daily_change = (
    combined_1
    .sort_values(['state_abbr', 'date'])
    .groupby('state_abbr')[['total', 'positive', 'negative']]
    .diff()
)

# by state, change in total from one day to the next
combined_1['tests_since_prev_day'] = daily_change['total']

# by state, change in positive from one day to the next
combined_1['positives_since_prev_day'] = daily_change['positive']

# by state, change in negative from one day to the next
combined_1['negatives_since_prev_day'] = daily_change['negative']

# show table
combined_1.head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day
0,2020-04-16,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16 20:00:00+00:00,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0,Alaska,738565.0,71.0,7.0,64.0
1,2020-04-16,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16 20:00:00+00:00,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0,Alabama,4850771.0,2314.0,232.0,2082.0
2,2020-04-16,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16 20:00:00+00:00,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0,Arkansas,2977944.0,841.0,51.0,790.0
3,2020-04-16,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16 20:00:00+00:00,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0,,,,,
4,2020-04-16,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16 20:00:00+00:00,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0,Arizona,6809946.0,2088.0,272.0,1816.0


In [24]:
# look at NY state
combined_1[combined_1['state'] == "New York"].head(10)

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day
37,2020-04-16,NY,222284.0,328295.0,,17735.0,50450.0,5091.0,,,,23887.0,bd386393a80d22e8b4638b53c9a00a316b7c209a,2020-04-16 20:00:00+00:00,12192.0,50450.0,550579.0,550579.0,550579.0,36,606.0,4249.0,16062.0,8505.0,24567.0,New York,19798228,24567.0,8505.0,16062.0
93,2020-04-15,NY,213779.0,312233.0,,18697.0,46201.0,5225.0,,,,23887.0,1345a6429c3718d790172d3c91f25184b1df3832,2020-04-15 20:00:00+00:00,11586.0,46201.0,526012.0,526012.0,526012.0,36,752.0,0.0,15298.0,11571.0,26869.0,New York,19798228,26869.0,11571.0,15298.0
149,2020-04-14,NY,202208.0,296935.0,,18697.0,46201.0,5225.0,,,,23887.0,af7e566432a9ff3be22212c6e53deadb87354a33,2020-04-14 20:00:00+00:00,10834.0,46201.0,499143.0,499143.0,499143.0,36,778.0,3489.0,13609.0,7177.0,20786.0,New York,19798228,20786.0,7177.0,13609.0
205,2020-04-13,NY,195031.0,283326.0,,18825.0,42712.0,5156.0,,,,23887.0,286aa877043319eac31dd16ed268926cee8f6d4a,2020-04-13 20:00:00+00:00,10056.0,42712.0,478357.0,478357.0,478357.0,36,671.0,118.0,10419.0,6337.0,16756.0,New York,19798228,16756.0,6337.0,10419.0
261,2020-04-12,NY,188694.0,272907.0,,18707.0,42594.0,5198.0,,,,23887.0,c34e542369716ac96cf4d5c56a0b44894e6a356f,2020-04-12 20:00:00+00:00,9385.0,42594.0,461601.0,461601.0,461601.0,36,758.0,1915.0,12385.0,8236.0,20621.0,New York,19798228,20621.0,8236.0,12385.0
317,2020-04-11,NY,180458.0,260522.0,,18654.0,40679.0,5009.0,,,,22025.0,361c16745f9be4bc650b0fa0ebaabc36edad3d47,2020-04-11 20:00:00+00:00,8627.0,40679.0,440980.0,440980.0,440980.0,36,783.0,1861.0,13149.0,9946.0,23095.0,New York,19798228,23095.0,9946.0,13149.0
373,2020-04-10,NY,170512.0,247373.0,,18569.0,38818.0,4908.0,,,,20249.0,b9f6747819f23a6d4d2caddb7dab2b4568f829f4,2020-04-10 20:00:00+00:00,7844.0,38818.0,417885.0,417885.0,417885.0,36,777.0,2242.0,15761.0,10575.0,26336.0,New York,19798228,26336.0,10575.0,15761.0
429,2020-04-09,NY,159937.0,231612.0,,18279.0,36576.0,4925.0,,,,18297.0,adcfe0b7d9f5faf78429b00c69af00833443c8ec,2020-04-09 20:00:00+00:00,7067.0,36576.0,391549.0,391549.0,391549.0,36,799.0,2144.0,15775.0,10621.0,26396.0,New York,19798228,26396.0,10621.0,15775.0
485,2020-04-08,NY,149316.0,215837.0,,18079.0,34432.0,4593.0,,,,16353.0,36db5397b3f156232e908feab224b44a19364028,2020-04-08 20:00:00+00:00,6268.0,34432.0,365153.0,365153.0,365153.0,36,779.0,2349.0,14642.0,10453.0,25095.0,New York,19798228,25095.0,10453.0,14642.0
541,2020-04-07,NY,138863.0,201195.0,,17493.0,32083.0,4593.0,,,,14590.0,15947b1c9ed54db71e224d5882c2e146b415b520,2020-04-07 20:00:00+00:00,5489.0,32083.0,340058.0,340058.0,340058.0,36,731.0,1880.0,11073.0,8174.0,19247.0,New York,19798228,19247.0,8174.0,11073.0


In [25]:
# show data types by column
combined_1.dtypes

date                                     object
state_abbr                               object
positive                                float64
negative                                float64
pending                                 float64
hospitalizedCurrently                   float64
hospitalizedCumulative                  float64
inIcuCurrently                          float64
inIcuCumulative                         float64
onVentilatorCurrently                   float64
onVentilatorCumulative                  float64
recovered                               float64
hash                                     object
dateChecked                 datetime64[ns, UTC]
death                                   float64
hospitalized                            float64
total                                   float64
totalTestResults                        float64
posNeg                                  float64
fips                                     object
deathIncrease                           

In [26]:
# change population from object to numeric
combined_1['population'] = pd.to_numeric(combined_1['population'])

In [27]:
# tests per capita
combined_1['total_tests_per_person'] = combined_1['total'] / combined_1['population']

In [28]:
# show first five rows
combined_1.head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day,total_tests_per_person
0,2020-04-16,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16 20:00:00+00:00,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0,Alaska,738565.0,71.0,7.0,64.0,0.011827
1,2020-04-16,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16 20:00:00+00:00,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0,Alabama,4850771.0,2314.0,232.0,2082.0,0.007502
2,2020-04-16,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16 20:00:00+00:00,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0,Arkansas,2977944.0,841.0,51.0,790.0,0.007614
3,2020-04-16,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16 20:00:00+00:00,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0,,,,,,
4,2020-04-16,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16 20:00:00+00:00,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0,Arizona,6809946.0,2088.0,272.0,1816.0,0.00696


In [29]:
# most recent reporting date present in the data
# The wall-clock date is deliberately not used: when the data has no rows for the
# current local date (figures not yet published, or the notebook is run on a later
# day), every filter on `today` comes back empty. 'date' holds 'YYYY-MM-DD'
# strings, so max() is the latest day.
today = combined_1['date'].max()
today

'2020-04-16'

In [30]:
# rows dated `today`, ranked from most to fewest tests per capita
is_today = combined_1['date'] == today
todays_rows = combined_1.loc[is_today]
todays_rows.sort_values(by = "total_tests_per_person", ascending = False).head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day,total_tests_per_person
37,2020-04-16,NY,222284.0,328295.0,,17735.0,50450.0,5091.0,,,,23887.0,bd386393a80d22e8b4638b53c9a00a316b7c209a,2020-04-16 20:00:00+00:00,12192.0,50450.0,550579.0,550579.0,550579.0,36,606.0,4249.0,16062.0,8505.0,24567.0,New York,19798228.0,24567.0,8505.0,16062.0,0.02781
20,2020-04-16,LA,22532.0,104054.0,,1914.0,,,,396.0,,,6ae3a5f7f0b1e1a5c00710d1a5571702dfbff14c,2020-04-16 20:00:00+00:00,1156.0,,126586.0,126586.0,126586.0,22,53.0,0.0,4077.0,581.0,4658.0,Louisiana,4663461.0,4658.0,581.0,4077.0,0.027144
43,2020-04-16,RI,3838.0,24226.0,,245.0,331.0,61.0,,43.0,,182.0,bb544990f7c3e3da791f26108024f6657594c9c3,2020-04-16 20:00:00+00:00,105.0,331.0,28064.0,28064.0,28064.0,44,18.0,0.0,1858.0,309.0,2167.0,Rhode Island,1056138.0,2167.0,309.0,1858.0,0.026572
21,2020-04-16,MA,32181.0,108592.0,,3454.0,2340.0,973.0,,,,,ec48a236ea400ddb66cf39987ce618642a9150bc,2020-04-16 20:00:00+00:00,1245.0,2340.0,140773.0,140773.0,140773.0,25,137.0,0.0,6487.0,2263.0,8750.0,Massachusetts,6789319.0,8750.0,2263.0,6487.0,0.020734
51,2020-04-16,VT,768.0,10739.0,,58.0,,,,,,15.0,1ea60000d2e949646925602b03e1fd49dd2bfeef,2020-04-16 20:00:00+00:00,35.0,,11507.0,11507.0,11507.0,50,5.0,0.0,417.0,9.0,426.0,Vermont,624636.0,426.0,9.0,417.0,0.018422


In [31]:
# determine positive rate per tests: share of the day's new tests that were positive
# A zero denominator (no change in 'total' since the previous day) is masked to NaN
# first; dividing by it would put inf into the column, and combined_1 is later
# written to Excel.
new_tests = combined_1['tests_since_prev_day']
combined_1['positive_per_test'] = combined_1['positives_since_prev_day'] / new_tests.where(new_tests != 0)
combined_1.head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day,total_tests_per_person,positive_per_test
0,2020-04-16,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16 20:00:00+00:00,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0,Alaska,738565.0,71.0,7.0,64.0,0.011827,0.098592
1,2020-04-16,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16 20:00:00+00:00,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0,Alabama,4850771.0,2314.0,232.0,2082.0,0.007502,0.100259
2,2020-04-16,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16 20:00:00+00:00,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0,Arkansas,2977944.0,841.0,51.0,790.0,0.007614,0.060642
3,2020-04-16,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16 20:00:00+00:00,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0,,,,,,,
4,2020-04-16,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16 20:00:00+00:00,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0,Arizona,6809946.0,2088.0,272.0,1816.0,0.00696,0.130268


# Display a few select states

In [32]:
# look at NY over time
combined_1[combined_1['state'] == 'New York'].head(10)

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day,total_tests_per_person,positive_per_test
37,2020-04-16,NY,222284.0,328295.0,,17735.0,50450.0,5091.0,,,,23887.0,bd386393a80d22e8b4638b53c9a00a316b7c209a,2020-04-16 20:00:00+00:00,12192.0,50450.0,550579.0,550579.0,550579.0,36,606.0,4249.0,16062.0,8505.0,24567.0,New York,19798228.0,24567.0,8505.0,16062.0,0.02781,0.346196
93,2020-04-15,NY,213779.0,312233.0,,18697.0,46201.0,5225.0,,,,23887.0,1345a6429c3718d790172d3c91f25184b1df3832,2020-04-15 20:00:00+00:00,11586.0,46201.0,526012.0,526012.0,526012.0,36,752.0,0.0,15298.0,11571.0,26869.0,New York,19798228.0,26869.0,11571.0,15298.0,0.026569,0.430645
149,2020-04-14,NY,202208.0,296935.0,,18697.0,46201.0,5225.0,,,,23887.0,af7e566432a9ff3be22212c6e53deadb87354a33,2020-04-14 20:00:00+00:00,10834.0,46201.0,499143.0,499143.0,499143.0,36,778.0,3489.0,13609.0,7177.0,20786.0,New York,19798228.0,20786.0,7177.0,13609.0,0.025211,0.34528
205,2020-04-13,NY,195031.0,283326.0,,18825.0,42712.0,5156.0,,,,23887.0,286aa877043319eac31dd16ed268926cee8f6d4a,2020-04-13 20:00:00+00:00,10056.0,42712.0,478357.0,478357.0,478357.0,36,671.0,118.0,10419.0,6337.0,16756.0,New York,19798228.0,16756.0,6337.0,10419.0,0.024162,0.378193
261,2020-04-12,NY,188694.0,272907.0,,18707.0,42594.0,5198.0,,,,23887.0,c34e542369716ac96cf4d5c56a0b44894e6a356f,2020-04-12 20:00:00+00:00,9385.0,42594.0,461601.0,461601.0,461601.0,36,758.0,1915.0,12385.0,8236.0,20621.0,New York,19798228.0,20621.0,8236.0,12385.0,0.023315,0.399399
317,2020-04-11,NY,180458.0,260522.0,,18654.0,40679.0,5009.0,,,,22025.0,361c16745f9be4bc650b0fa0ebaabc36edad3d47,2020-04-11 20:00:00+00:00,8627.0,40679.0,440980.0,440980.0,440980.0,36,783.0,1861.0,13149.0,9946.0,23095.0,New York,19798228.0,23095.0,9946.0,13149.0,0.022274,0.430656
373,2020-04-10,NY,170512.0,247373.0,,18569.0,38818.0,4908.0,,,,20249.0,b9f6747819f23a6d4d2caddb7dab2b4568f829f4,2020-04-10 20:00:00+00:00,7844.0,38818.0,417885.0,417885.0,417885.0,36,777.0,2242.0,15761.0,10575.0,26336.0,New York,19798228.0,26336.0,10575.0,15761.0,0.021107,0.401542
429,2020-04-09,NY,159937.0,231612.0,,18279.0,36576.0,4925.0,,,,18297.0,adcfe0b7d9f5faf78429b00c69af00833443c8ec,2020-04-09 20:00:00+00:00,7067.0,36576.0,391549.0,391549.0,391549.0,36,799.0,2144.0,15775.0,10621.0,26396.0,New York,19798228.0,26396.0,10621.0,15775.0,0.019777,0.402372
485,2020-04-08,NY,149316.0,215837.0,,18079.0,34432.0,4593.0,,,,16353.0,36db5397b3f156232e908feab224b44a19364028,2020-04-08 20:00:00+00:00,6268.0,34432.0,365153.0,365153.0,365153.0,36,779.0,2349.0,14642.0,10453.0,25095.0,New York,19798228.0,25095.0,10453.0,14642.0,0.018444,0.416537
541,2020-04-07,NY,138863.0,201195.0,,17493.0,32083.0,4593.0,,,,14590.0,15947b1c9ed54db71e224d5882c2e146b415b520,2020-04-07 20:00:00+00:00,5489.0,32083.0,340058.0,340058.0,340058.0,36,731.0,1880.0,11073.0,8174.0,19247.0,New York,19798228.0,19247.0,8174.0,11073.0,0.017176,0.42469


In [33]:
# look at NJ over time
#combined_1[combined_1['state'] == 'New Jersey']

In [34]:
# look at WA over time
#combined_1[combined_1['state'] == 'Washington']

In [35]:
# look at VA over time
#combined_1[combined_1['state'] == 'Virginia']

In [36]:
# look at MD over time
#combined_1[combined_1['state'] == 'Maryland']

In [37]:
# look at DC over time
#combined_1[combined_1['state'] == 'District of Columbia']

In [38]:
# look at LA over time
#combined_1[combined_1['state'] == 'Louisiana']

# Finalize US National Data

In [39]:
# display us pop
us_pop

Unnamed: 0,country,population
0,United States,321004407


In [40]:
# display first five rows of national dataset
national.head()

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,20200416,56,663260,2737804.0,16927.0,57494.0,74705.0,15150.0,1834.0,5940.0,137.0,42032.0,e42dcecd56cc690f4b4c5c242e02b8eb8e3491b6,2020-04-16T20:00:00Z,30296,74705.0,3417991,3401064,3401064,2136.0,5102.0,127705.0,30604.0,158309.0
1,20200415,56,632656,2610099.0,16901.0,58361.0,69603.0,14667.0,1783.0,6032.0,223.0,39405.0,ca6d549a02f56add12897c5a47d051c50f2e8c75,2020-04-15T20:00:00Z,28160,69603.0,3259656,3242755,3242755,2492.0,2056.0,130952.0,30183.0,161135.0
2,20200414,56,602473,2479147.0,16615.0,54215.0,67547.0,14039.0,1715.0,5975.0,221.0,37645.0,7ea156ba5cb34498d2798ce71d9470bdbb27201b,2020-04-14T20:00:00Z,25668,67547.0,3098235,3081620,3081620,2299.0,4874.0,120915.0,25699.0,146614.0
3,20200413,56,576774,2358232.0,17159.0,50968.0,62673.0,13632.0,1628.0,6168.0,210.0,35442.0,171a7aa78e00daf2ddb2e32baedcdf1127162a17,2020-04-13T20:00:00Z,23369,62673.0,2952165,2935006,2935006,1450.0,1472.0,104166.0,24948.0,129114.0
4,20200412,56,551826,2254066.0,16419.0,51413.0,61201.0,13917.0,1455.0,5986.0,160.0,34151.0,b66df37c6be1e91d8fb155d5612a9fb3202e8e52,2020-04-12T20:00:00Z,21919,61201.0,2822311,2805892,2805892,1564.0,2652.0,111243.0,28983.0,140226.0


In [41]:
# add the national population to every row of the national dataset
# us_pop is expected to hold a single row (the US total), so its value is broadcast
# as a scalar. This replaces building len(national) copies of the frame and
# concatenating them, which also raises when `national` is empty.
national_final = national.assign(population = us_pop['population'].iloc[0])

# show first five rows
national_final.head()

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,population
0,20200416,56,663260,2737804.0,16927.0,57494.0,74705.0,15150.0,1834.0,5940.0,137.0,42032.0,e42dcecd56cc690f4b4c5c242e02b8eb8e3491b6,2020-04-16T20:00:00Z,30296,74705.0,3417991,3401064,3401064,2136.0,5102.0,127705.0,30604.0,158309.0,321004407
1,20200415,56,632656,2610099.0,16901.0,58361.0,69603.0,14667.0,1783.0,6032.0,223.0,39405.0,ca6d549a02f56add12897c5a47d051c50f2e8c75,2020-04-15T20:00:00Z,28160,69603.0,3259656,3242755,3242755,2492.0,2056.0,130952.0,30183.0,161135.0,321004407
2,20200414,56,602473,2479147.0,16615.0,54215.0,67547.0,14039.0,1715.0,5975.0,221.0,37645.0,7ea156ba5cb34498d2798ce71d9470bdbb27201b,2020-04-14T20:00:00Z,25668,67547.0,3098235,3081620,3081620,2299.0,4874.0,120915.0,25699.0,146614.0,321004407
3,20200413,56,576774,2358232.0,17159.0,50968.0,62673.0,13632.0,1628.0,6168.0,210.0,35442.0,171a7aa78e00daf2ddb2e32baedcdf1127162a17,2020-04-13T20:00:00Z,23369,62673.0,2952165,2935006,2935006,1450.0,1472.0,104166.0,24948.0,129114.0,321004407
4,20200412,56,551826,2254066.0,16419.0,51413.0,61201.0,13917.0,1455.0,5986.0,160.0,34151.0,b66df37c6be1e91d8fb155d5612a9fb3202e8e52,2020-04-12T20:00:00Z,21919,61201.0,2822311,2805892,2805892,1564.0,2652.0,111243.0,28983.0,140226.0,321004407


In [42]:
national_final[:1].stack().reset_index()

Unnamed: 0,level_0,level_1,0
0,0,date,20200416
1,0,states,56
2,0,positive,663260
3,0,negative,2.7378e+06
4,0,pending,16927
5,0,hospitalizedCurrently,57494
6,0,hospitalizedCumulative,74705
7,0,inIcuCurrently,15150
8,0,inIcuCumulative,1834
9,0,onVentilatorCurrently,5940


# Write to Excel

In [43]:
# running code below since excel does not support datetimes with timezones
# Going through pd.to_datetime keeps the cell re-runnable: after a first run the
# column already holds the formatted strings, and calling .dt on them directly raises.
combined_1['dateChecked'] = pd.to_datetime(combined_1['dateChecked']).dt.strftime('%Y-%m-%d %H:%M:%S')

# show first five rows
combined_1.head()

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day,total_tests_per_person,positive_per_test
0,2020-04-16,AK,300.0,8435.0,,,35.0,,,,,110.0,42353f78693987154276c7c482342a46ec11c180,2020-04-16 20:00:00,9.0,35.0,8735.0,8735.0,8735.0,2,0.0,1.0,64.0,7.0,71.0,Alaska,738565.0,71.0,7.0,64.0,0.011827,0.098592
1,2020-04-16,AL,4345.0,32046.0,,,553.0,,227.0,,137.0,,bc8d87a6ade709e9ed0c8824e73dff454277a85f,2020-04-16 20:00:00,133.0,553.0,36391.0,36391.0,36391.0,1,12.0,28.0,2082.0,232.0,2314.0,Alabama,4850771.0,2314.0,232.0,2082.0,0.007502,0.100259
2,2020-04-16,AR,1620.0,21055.0,,85.0,,,,21.0,,548.0,95e15a426548163accca607e39e7c97a7e229f6e,2020-04-16 20:00:00,37.0,,22675.0,22675.0,22675.0,5,4.0,0.0,790.0,51.0,841.0,Arkansas,2977944.0,841.0,51.0,790.0,0.007614,0.060642
3,2020-04-16,AS,0.0,3.0,17.0,,,,,,,,614a57a963d84c55c40b863aaf08221f68f0387c,2020-04-16 20:00:00,,,20.0,3.0,3.0,60,0.0,0.0,0.0,0.0,0.0,,,,,,,
4,2020-04-16,AZ,4234.0,43164.0,,578.0,,278.0,,188.0,,460.0,1a168d4509dfc562564670fbcf91399c32b0981a,2020-04-16 20:00:00,150.0,,47398.0,47398.0,47398.0,4,8.0,0.0,1816.0,272.0,2088.0,Arizona,6809946.0,2088.0,272.0,1816.0,0.00696,0.130268


In [44]:
# keep only the rows dated `today`; they get their own tab in the workbook
most_recent_day = combined_1.loc[combined_1['date'] == today]

# spot-check a few states
spot_check = ['NY', 'NJ', 'DC']
most_recent_day.loc[most_recent_day['state_abbr'].isin(spot_check)]

Unnamed: 0,date,state_abbr,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,hash,dateChecked,death,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,state,population,tests_since_prev_day,positives_since_prev_day,negatives_since_prev_day,total_tests_per_person,positive_per_test
8,2020-04-16,DC,2350.0,9800.0,,313.0,,105.0,,79.0,,552.0,100abd89bf88d9f88b0684c7fd04cc5c47fdbb60,2020-04-16 20:00:00,81.0,,12150.0,12150.0,12150.0,11,9.0,0.0,472.0,153.0,625.0,District of Columbia,672391.0,625.0,153.0,472.0,0.01807,0.2448
34,2020-04-16,NJ,75317.0,76513.0,,8224.0,,2014.0,,1645.0,,,8f04a2a6fb4ff0d2e41da80b7bca3da6ae2279ff,2020-04-16 20:00:00,3518.0,,151830.0,151830.0,151830.0,34,362.0,0.0,3522.0,4287.0,7809.0,New Jersey,8960161.0,7809.0,4287.0,3522.0,0.016945,0.548982
37,2020-04-16,NY,222284.0,328295.0,,17735.0,50450.0,5091.0,,,,23887.0,bd386393a80d22e8b4638b53c9a00a316b7c209a,2020-04-16 20:00:00,12192.0,50450.0,550579.0,550579.0,550579.0,36,606.0,4249.0,16062.0,8505.0,24567.0,New York,19798228.0,24567.0,8505.0,16062.0,0.02781,0.346196


In [45]:
# write both tables to a single workbook, one sheet per table
sheets = {
    'States Daily 4PM': combined_1,
    'Most Recent Day': most_recent_day,
}
with pd.ExcelWriter('../outputs/States_Daily_4PM.xlsx') as writer:
    for sheet_name, table in sheets.items():
        table.to_excel(writer, sheet_name = sheet_name, index = False)