# NYCHealth Coronavirus (COVID-19) data
### Original datasource: https://github.com/nychealth/coronavirus-data

In [None]:
import pandas as pd
import numpy as np
import os
import requests
import json
import re
import csv
from datetime import datetime
import pycountry

In [None]:
# Coordinates of the GitHub repository that publishes the source data.
REPO_NAME = 'coronavirus-data'
REPO_OWNER = 'nychealth'
# Base URL of the GitHub REST API "repos" namespace (used to list commits).
API_ENDPOINT = 'https://api.github.com/repos'
# Base URL for fetching raw file contents at a given commit.
# NOTE: this value already ends with a trailing slash.
RAW_DATA_ENDPOINT = 'https://raw.githubusercontent.com/'

In [None]:
# papermill parameters
# (presumably overridable when the notebook is executed through papermill)
# Path prefix that the final cell prepends to the exported CSV file name.
output_folder = "../output/"

In [None]:
# Collect (author date, commit sha) for every commit that touched
# totals/data-by-modzcta.csv, walking the paginated GitHub commits API
# until a page comes back empty.
commits = []
page = 1
while True:
    response = requests.get(
        f'{API_ENDPOINT}/{REPO_OWNER}/{REPO_NAME}/commits?path=totals/data-by-modzcta.csv&page={page}',
        timeout=60,  # never hang the whole notebook on a stalled connection
    )
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would let an error payload reach the parser below.
    if response.status_code != 200:
        raise RuntimeError(
            f'GitHub commits API returned HTTP {response.status_code} for page {page}'
        )
    commit_shas = [
        (commit['commit']['author']['date'], commit['sha'])
        for commit in response.json()
    ]
    if not commit_shas:
        # An empty page means we are past the last page of results.
        break
    commits += commit_shas
    page += 1

In [None]:
# Download the CSV as it existed at each collected commit and accumulate its
# rows (one dict per CSV row) in `df`, tagging every row with the commit date.
df = []
for (date, commit) in commits:
    # RAW_DATA_ENDPOINT may carry a trailing slash; strip it so the URL does
    # not contain an empty path segment ("//").
    response = requests.get(
        f"{RAW_DATA_ENDPOINT.rstrip('/')}/{REPO_OWNER}/{REPO_NAME}/{commit}/totals/data-by-modzcta.csv",
        timeout=60,  # never hang the whole notebook on a stalled connection
    )
    if response.status_code != 200:
        # Best effort: commits whose file cannot be fetched are skipped.
        continue

    csv_dict = csv.DictReader(response.text.split('\n'))
    for row in csv_dict:
        # Map alternative header spellings onto the names used below.
        if "modzcta_cum_perc_pos" in row:
            row["zcta_cum.perc_pos"] = row.pop("modzcta_cum_perc_pos")
        if "modzcta" in row:
            row["MODZCTA"] = row.pop("modzcta")
        row['Date'] = date  # add commit_date as field: Date
        df.append(row)


In [None]:
# Turn the accumulated list of row dicts into a DataFrame (rebinding `df`).
df = pd.DataFrame(df)

In [None]:
# Cells that consist solely of "." become the "NA" marker (literal match, not
# a regex), then surrounding whitespace is trimmed from the test counts.
df = df.replace(to_replace=".", value="NA", regex=False).assign(
    TOTAL_COVID_TESTS=lambda frame: frame["TOTAL_COVID_TESTS"].str.strip()
)

In [None]:
# Parse the commit timestamps (e.g. "2020-05-18T17:00:00Z") into datetimes.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%dT%H:%M:%SZ")  # parse date
# Use '99999' as a temporary sentinel for rows without a ZCTA.
df['MODIFIED_ZCTA'] = df['MODIFIED_ZCTA'].replace(['NA'], '99999')  # parse NA
# Drop any literal dots from the ZCTA strings. The pattern is a raw string:
# "\." in a normal string is an invalid escape sequence that newer Python
# versions warn about.
df['MODIFIED_ZCTA'] = df["MODIFIED_ZCTA"].replace(r"\.", value="", regex=True)

In [None]:
# Census ZCTA-to-county relationship table, indexed by ZCTA5. A ZCTA can appear
# on several rows; only the first row per ZCTA is kept.
zcta_to_fips = pd.read_csv('https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt').set_index('ZCTA5')
zcta_to_fips = zcta_to_fips[~zcta_to_fips.index.duplicated(keep='first')]
df['FIPS'] = ''

# Rows that carry a real ZCTA (neither missing nor the '99999' sentinel).
has_zcta = df['MODIFIED_ZCTA'].fillna("99999") != '99999'
# Assign through a single df.loc[...] call: the chained form
# df['FIPS'].loc[mask] = ... may write to a temporary copy and leave df untouched.
# GEOID is presumably the county FIPS code -- verify against the Census layout.
df.loc[has_zcta, 'FIPS'] = zcta_to_fips.loc[
    [int(zcta) for zcta in df.loc[has_zcta, 'MODIFIED_ZCTA'].tolist()], 'GEOID'
].tolist()
# Strip literal dots from any string values (raw string avoids the invalid "\." escape).
df['FIPS'] = df["FIPS"].replace(r"\.", value="", regex=True)


In [None]:
# Constant geography columns: every row belongs to the United States.
df['Country_Region'] = "United States"
df['ISO3166_1'] = "US"
# County FIPS master table, indexed by FIPS code, used to look up state abbreviations.
fips_to_state = pd.read_csv('https://raw.githubusercontent.com/kjhealy/fips-codes/master/county_fips_master.csv', encoding ="ISO-8859-1").set_index('fips')
df['ISO3166_2'] = ''

# Rows for which a FIPS code was resolved.
has_fips = df['FIPS'] != ''
# Assign through a single df.loc[...] call: the chained form
# df['ISO3166_2'].loc[mask] = ... may write to a temporary copy and leave df untouched.
df.loc[has_fips, 'ISO3166_2'] = fips_to_state.loc[
    df.loc[has_fips, 'FIPS'].tolist(), 'state_abbr'
].tolist()

In [None]:
# Undo the temporary '99999' ZCTA sentinel now that the lookups are done.
df['MODIFIED_ZCTA'] = df['MODIFIED_ZCTA'].replace(to_replace='99999', value='')
# Copy of PERCENT_POSITIVE in which the 'NA' marker is a real missing value.
df['zcta_cum.perc_pos'] = df['PERCENT_POSITIVE'].replace(to_replace='NA', value=np.nan)

In [None]:
# Cast the three numeric measures to float32 and keep FIPS as a generic object column.
df = df.astype({
    **dict.fromkeys(
        ('COVID_CASE_COUNT', 'TOTAL_COVID_TESTS', 'PERCENT_POSITIVE'),
        'float32',
    ),
    'FIPS': 'object',
})

In [None]:
# Display the resulting column dtypes as a sanity check after the casts above.
df.dtypes

In [None]:
from datetime import timezone

# Timestamp of this run as a naive UTC datetime. datetime.utcnow() is
# deprecated since Python 3.12; taking an aware "now" and dropping tzinfo
# yields the same naive UTC value.
df["Last_Updated_Date"] = datetime.now(timezone.utc).replace(tzinfo=None)
# Boolean flag: True for rows that belong to the most recent commit date.
df['Last_Reported_Date'] = df['Date'] == df['Date'].max()

In [None]:
# Write the selected columns, in this order, to NYC_HEALTH_TESTS.csv inside
# output_folder. os.path.join is used so the path is correct whether or not
# output_folder ends with a path separator.
df.to_csv(os.path.join(output_folder, "NYC_HEALTH_TESTS.csv"), index=False, columns=[
    "MODIFIED_ZCTA",
    "COVID_CASE_COUNT",
    "TOTAL_COVID_TESTS",
    "PERCENT_POSITIVE",
    "Date",
    "FIPS",
    "Country_Region",
    "ISO3166_1",
    "ISO3166_2",
    "Last_Updated_Date",
    "Last_Reported_Date",
])