In [2]:
import pandas as pd
import numpy as np
from datetime import date, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None
# IPython magic: run common.py and make its definitions available here.
# cleanup() is defined there (per the comment where it is called); verify()
# is not defined in this cell either, so presumably it comes from there too.
%run common.py

# Build one 'MM-DD-YYYY' string per calendar day, from the first report date
# through today (both inclusive); these strings are substituted into the
# report URL further down.
sdate = date(2020, 1, 22)
today = date.today()
edate = today  # date objects are immutable, so the same value can be reused
days = [
    date.fromordinal(ordinal).strftime('%m-%d-%Y')
    for ordinal in range(sdate.toordinal(), edate.toordinal() + 1)
]

# Merge all daily reports
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/%s.csv'

# Download every report concurrently. Leaving the with-block waits for all
# submitted downloads, so every future below has already finished or failed.
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(lambda day: (day, pd.read_csv(url % day)), day) for day in days]

# Empty template that fixes the set and order of columns of the merged result.
merged = pd.DataFrame(columns = ['Country', 'State', 'County', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Confirmed_New', 'Deaths_New', 'Recovered_New'])

# Because all futures are done, walking them in submission order costs no
# extra waiting and makes the merge order chronological and reproducible
# (completion order differs from run to run).
frames = []
for future in futures:
    try:
        day, df = future.result()
    except IOError as e:
        # A report that cannot be fetched (e.g. HTTP 404 for a day without a
        # published file) is reported and skipped instead of aborting the run.
        print(str(e))
        continue
    # Fix changes in column names if they exist
    df = df.rename(columns={'Admin2': 'County', 'Province/State':'State', 'Country/Region':'Country', 'Province_State':'State', 'Country_Region':'Country'})
    # Keep only the columns the merged frame knows about.
    df = df[[x for x in df.columns if x in merged.columns]]
    # Tag every row with its report date; the scalar is broadcast to all rows.
    df = df.assign(Date=day)
    frames.append(df)

# Concatenate once at the end: concatenating inside the loop copies the
# ever-growing frame again for every report, which is quadratic in total size.
merged = pd.concat([merged] + frames)

# Clean up the data with the helper defined in common.py. Its return value is
# not used, so whatever it changes must be changed on `merged` itself.
cleanup(merged)

# Fix bad data: these rows carry a State value that should read 'n/a'.
is_french_polynesia = merged['State'] == 'French Polynesia'
is_march_23 = merged['Date'] == '03-23-2020'
merged.loc[is_french_polynesia & is_march_23, 'State'] = 'n/a'

# Evaluated after the patch above, exactly as in the original sequence.
france_listed_as_state = (merged['Country'] == 'France') & (merged['State'] == 'France')
merged.loc[france_listed_as_state, 'State'] = 'n/a'

# Some datasets contain duplicate rows; collapse them by summing everything
# that shares the same location and date.
location_and_date = ['Country', 'State', 'County', 'Date']
merged = merged.groupby(location_and_date, as_index=False).sum()

# Calculate deltas for each date
# Each *_New column holds the change in the corresponding cumulative count
# since the previous report of the same (Country, State, County). The first
# report of a location has no predecessor and therefore gets NaN.
location_keys = ['Country', 'State', 'County']

# Difference the rows in chronological order. 'MM-DD-YYYY' text sorts by
# month before year, so text order stops being chronological once the data
# spans more than one year. Values that do not parse in that format become
# NaT; since the sort is stable they simply keep their existing relative
# order, which is the order the previous implementation relied on.
report_date = pd.to_datetime(merged['Date'], format='%m-%d-%Y', errors='coerce')
# Selecting by label relies on unique row labels, which the preceding
# groupby(..., as_index=False) produces.
chronological = merged.loc[report_date.sort_values(kind='mergesort').index]

# One grouped diff per metric replaces the nested per-location loops, which
# rebuilt full-frame boolean masks several times for every single location.
by_location = chronological.groupby(location_keys, sort=False)
for column in ('Confirmed', 'Deaths', 'Recovered'):
    # diff() keeps the original row labels, so the assignment aligns the
    # values back onto `merged` without changing its row order.
    merged[column + '_New'] = by_location[column].diff()

# Persist the merged table without the row index, then hand the frame to
# verify() (not defined in this cell; presumably provided by common.py).
output_csv = 'jhu-daily-reports.csv'
merged.to_csv(output_csv, index=False)
verify(merged)

HTTP Error 404: Not Found


KeyboardInterrupt: 