In [1]:
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

pd.options.mode.chained_assignment = None
%run common.py

# Read the previously merged data set. Only the empty string is treated as
# missing (keep_default_na=False), so literal text such as 'n/a' is kept as-is.
url = 'https://raw.githubusercontent.com/ratreya/jupyter-notebooks/master/jhu-daily-reports.csv'
merged = pd.read_csv(url, na_values='', keep_default_na=False)
# Dates are 'MM-DD-YYYY' strings, which do not sort chronologically as text
# across a year boundary ('12-31-2020' sorts after '01-01-2021'), so pick the
# latest day by comparing the parsed dates instead of the raw strings.
last_day = max(merged['Date'].unique(), key=lambda d: datetime.strptime(d, '%m-%d-%Y'))

# Build the list of days after last_day, up to and including today, in the
# 'MM-DD-YYYY' format used by the daily report file names.
sdate = datetime.strptime(last_day, '%m-%d-%Y').date()
today = date.today()
edate = today
days = [(sdate + timedelta(days=i)).strftime('%m-%d-%Y') for i in range(1, (edate - sdate).days + 1)]

# Download every incremental daily report concurrently, then fold each one
# into `merged`.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/%s.csv'

# Header spellings seen across report layouts, mapped onto the names used here.
column_aliases = {
    'Admin2': 'County',
    'Province/State': 'State',
    'Country/Region': 'Country',
    'Province_State': 'State',
    'Country_Region': 'Country',
}


def _fetch_report(day):
    """Return (day, DataFrame) for the daily report published under `day`."""
    return day, pd.read_csv(url % day)


with ThreadPoolExecutor() as executor:
    futures = [executor.submit(_fetch_report, day) for day in days]

for finished in as_completed(futures):
    try:
        # result() re-raises whatever the download raised in the worker thread
        day, df = finished.result()
        # Fix changes in column names if they exist
        df = df.rename(columns=column_aliases)
        # Keep only the columns the merged data set already has
        unknown = [name for name in df.columns.values if name not in merged.columns.values]
        df.drop(columns=unknown, inplace=True)
        df.insert(2, 'Date', [day] * len(df))
        # cleanup comes from common.py; its return value is not used here,
        # so it presumably normalises df in place (TODO confirm)
        cleanup(df)
        # Some datasets contain duplicate rows; collapse them by summing
        df = df.groupby(['Country', 'State', 'County', 'Date'], as_index=False).sum()
        merged = pd.concat([merged, df])
    except IOError as err:
        # Presumably a report that could not be downloaded (e.g. not yet
        # published); report it and carry on with the remaining days
        print(err)

# Compute the day-over-day change (*_New columns) for every newly merged day.
variables = ['Confirmed', 'Deaths', 'Recovered']
location_keys = ['Country', 'State', 'County']
# Start the chain at last_day (the latest day already on file) so that the
# first incremental day is diffed against it too; iterating over `days` alone
# would leave that day, and every single-day update, without deltas.
delta_days = [last_day] + days
for prev_day, day in zip(delta_days, delta_days[1:]):
    is_day = merged['Date'] == day
    if not is_day.any():
        # No rows for this day (its report was not merged); nothing to update.
        continue
    # One row per location on the previous day, so the left merge below
    # returns exactly one row per current-day row, in the same order.
    previous = merged[merged['Date'] == prev_day].drop_duplicates(subset=location_keys)
    zipped = merged[is_day].merge(previous, how='left', on=location_keys, suffixes=('_t', '_y'))
    for variable in variables:
        # Locations with no row on the previous day produce NaN deltas.
        # NOTE(review): confirm whether such rows should instead count their
        # full total as new.
        delta = zipped[variable + '_t'] - zipped[variable + '_y']
        # Assign by position: zipped carries a fresh index after the merge,
        # so label-based alignment with merged's index is not reliable.
        merged.loc[is_day, variable + '_New'] = delta.to_numpy()

# Collapse rows sharing the same location and date by summing them, save the
# result as CSV, then hand it to verify (defined in common.py).
row_keys = ['Country', 'State', 'County', 'Date']
merged = merged.groupby(by=row_keys, as_index=False).sum()
merged.to_csv('jhu-daily-reports.csv', index=False)
verify(merged)

Some deltas are hugely negative!
      Country      State      County        Date  Confirmed  Deaths  \
1744   Canada  Recovered         n/a  04-01-2020        0.0     0.0   
36211      US         NJ  Unassigned  04-01-2020     4512.0     0.0   

       Recovered  Confirmed_New  Deaths_New  Recovered_New  
1744      1324.0            0.0         0.0         -268.0  
36211        0.0          826.0      -247.0            0.0  
