# Importing from the COVID Tracking Project

This script pulls data from the API provided by the [COVID Tracking Project](https://covidtracking.com/). The project collects data from the 50 U.S. states, the District of Columbia, and five U.S. territories to provide the most comprehensive testing data available. It attempts to include positive and negative results, pending tests, and the total number of people tested for each state or district currently reporting that data.

In [2]:
import datetime
import json
import os

import pandas as pd
import pycountry
import requests

In [3]:
# papermill parameters
# Destination folder for the CSV file exported at the end of this notebook.
output_folder = "../output/"

In [4]:
# Download the states/daily feed from the COVID Tracking Project API.
# The timeout keeps the run from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of trying to parse
# an error page as data.
response = requests.get("https://covidtracking.com/api/states/daily", timeout=30)
response.raise_for_status()
raw_response = response.text
# Parse the JSON payload and load it into a DataFrame.
raw_data = pd.DataFrame.from_dict(json.loads(raw_response))

### Data Quality
1. Replace empty values with zero
2. Convert the "date" int column to a "Date" datetime column
3. Rename columns to match the other sources
4. Drop unnecessary columns
5. Add the "Country/Region" and "ISO3166-1" columns; since the source only contains data from US states, they can be hardcoded

In [8]:
# Mapping from the API's column names to the names used in the exported data.
column_renames = {
    "state": "ISO3166-2",
    "positive": "Positive",
    "negative": "Negative",
    "pending": "Pending",
    "death": "Death",
    "total": "Total",
    "hospitalized": "Hospitalized",
}

# Single clean-up pipeline: fill gaps with 0, parse the YYYYMMDD "date" values
# into a datetime "Date" column, rename, drop the columns that are no longer
# needed, and append the constant country columns.
data = (
    raw_data
    .fillna(0)
    .assign(Date=lambda frame: pd.to_datetime(frame['date'].astype(str), format='%Y%m%d'))
    .rename(columns=column_renames)
    .drop(columns=['dateChecked', 'date'])
    .assign(**{'Country/Region': "United States", 'ISO3166-1': "US"})
)

In [9]:
# Show the cleaned frame as the cell output for a quick visual check.
data

Unnamed: 0,ISO3166-2,Positive,Negative,Pending,Hospitalized,Death,Total,hash,totalTestResults,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,Date,Country/Region,ISO3166-1
0,AK,114.0,3540.0,0.0,7.0,3.0,3654,01a1c96fd2ed214d8747ab778c2fec7203c8cd2f,3654,02,1.0,1.0,308.0,12.0,320.0,2020-03-30,United States,US
1,AL,859.0,5694.0,0.0,0.0,6.0,6553,1ced1dbd9879f8bbc4b1f7b7876b82611895d58e,6553,01,2.0,0.0,1510.0,53.0,1563.0,2020-03-30,United States,US
2,AR,473.0,5262.0,0.0,62.0,7.0,5735,7199b3f9984cc54342a3d0f5926bff36ef440b6c,5735,05,1.0,14.0,2235.0,47.0,2282.0,2020-03-30,United States,US
3,AS,0.0,0.0,0.0,0.0,0.0,0,955da7e53291581ad33f46d87bad7e4724848fea,0,60,0.0,0.0,0.0,0.0,0.0,2020-03-30,United States,US
4,AZ,1157.0,15602.0,0.0,78.0,20.0,16759,2f64421fc130d03c93a0fa1b89e44c0324ac15a3,16759,04,3.0,0.0,2649.0,238.0,2887.0,2020-03-30,United States,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,OR,3.0,29.0,18.0,0.0,0.0,50,5b37d7f2e6aa9fb39e09270626674b16ce9b5182,32,41,0.0,0.0,0.0,0.0,0.0,2020-03-04,United States,US
1369,SC,0.0,5.0,0.0,0.0,0.0,5,992c7ca6f8259ac7a323c0b92c27123c671f3773,5,45,0.0,0.0,0.0,0.0,0.0,2020-03-04,United States,US
1370,TX,1.0,0.0,0.0,0.0,0.0,1,7be175c20e3c9485c14f9764bd90385842df19c3,1,48,0.0,0.0,0.0,0.0,0.0,2020-03-04,United States,US
1371,WA,39.0,0.0,0.0,0.0,0.0,39,36ea36bcf282dcc480c83b6f2561810b009bc951,39,53,0.0,0.0,0.0,0.0,0.0,2020-03-04,United States,US


In [6]:
# Lookup table from a subdivision code with "US-" removed to the subdivision name.
states = {}
for subdivision in pycountry.subdivisions.get(country_code="US"):
    states[subdivision.code.replace("US-", "")] = subdivision.name

In [7]:
# Translate each ISO3166-2 code into its subdivision name; a code missing from
# `states` raises KeyError rather than being silently skipped.
data["Province/State"] = data["ISO3166-2"].map(lambda iso_code: states[iso_code])

## Sorting data by Province/State and Date before calculating the daily differences

In [None]:
# Order rows by state first and by date second, both ascending.
sort_keys = ['Province/State', 'Date']
sorted_data = data.sort_values(by=sort_keys, ascending=True)

In [None]:
# For every metric, subtract the value of the preceding row within the same
# Province/State group. The first row of each group has no predecessor, so it
# is compared against 0 (fill_value) and its difference equals its own value.
for metric in ('Positive', 'Total', 'Negative', 'Pending', 'Death', 'Hospitalized'):
    previous_value = sorted_data.groupby(['Province/State'])[metric].shift(1, fill_value=0)
    sorted_data[metric + '_Since_Previous_Day'] = sorted_data[metric] - previous_value

## Rearrange columns

In [None]:
# Build the output column order: identifying columns first, then each metric
# followed directly by its day-over-day difference, then the ISO codes.
output_columns = ['Country/Region', 'Province/State', 'Date']
for metric in ('Positive', 'Negative', 'Pending', 'Death', 'Hospitalized', 'Total'):
    output_columns += [metric, metric + '_Since_Previous_Day']
output_columns += ['ISO3166-1', 'ISO3166-2']

# filter() keeps only the listed columns, in the listed order.
rearranged_data = sorted_data.filter(items=output_columns)

## Add `Last_Update_Date`

In [None]:
# Stamp every row with the time of this run, expressed in UTC.
# datetime.utcnow() is deprecated since Python 3.12, so the time is taken with
# now(timezone.utc) instead. The tzinfo is then stripped so the stored value
# remains a naive UTC timestamp, exactly as before, and the exported CSV does
# not gain a "+00:00" suffix.
rearranged_data.loc[:, "Last_Update_Date"] = (
    datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
)

## Export to CSV

In [None]:
# Write the result without the index column. os.path.join builds a valid path
# whether or not output_folder ends with a path separator; plain string
# concatenation would produce e.g. "../outputCT_US_COVID_TESTS.csv".
rearranged_data.to_csv(os.path.join(output_folder, "CT_US_COVID_TESTS.csv"), index=False)