In [23]:
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

pd.options.mode.chained_assignment = None

def cleanup(merged):
    # Standerdize Country Names
    merged['Country'].replace('United Kingdom', 'UK', inplace=True)
    merged['Country'].replace('Mainland China', 'China', inplace=True)
    merged['Country'].replace(['Korea, South', 'Republic of Korea'], 'South Korea', inplace=True)
    merged['Country'].replace('Iran (Islamic Republic of)', 'Iran',inplace=True)

    # Standerdize US State Names
    merged['State'] = merged['State'].str.strip()
    merged['State'].replace(regex={'^.*Virgin Islands.*$': 'Virgin Islands'}, inplace=True)
    merged['State'].replace(regex={'^(.+) \(From Diamond Princess\)$': r'\1'}, inplace=True)
    merged['State'].replace(regex={'^.*Princess.*$': 'Cruise Ship'}, inplace=True)
    merged['State'].replace(regex={'^.+, (.+)$': r'\1'}, inplace=True)
    merged['State'].replace(['District of Columbia', 'D.C.'], 'DC', inplace=True)
    merged['State'].replace('Chicago', 'IL', inplace=True)
    us_state_abbrev = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'American Samoa': 'AS',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'District of Columbia': 'DC',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Guam': 'GU',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Northern Mariana Islands':'MP',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Puerto Rico': 'PR',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virgin Islands': 'VI',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY'
    }
    merged['State'].replace(us_state_abbrev, inplace=True)
    
def fillin(merged):
    # Fill NaNs otherwise some operations such as gorupby will not work
    merged['Confirmed'].fillna(0, inplace=True)
    merged['Deaths'].fillna(0, inplace=True)
    merged['Recovered'].fillna(0, inplace=True)
    merged['State'].fillna('n/a', inplace=True)
    merged['County'].fillna('n/a', inplace=True)
    return merged

def verify(merged):
    # Run verifications - ignore small deviations
    df_neg = merged[(merged['County'] != 'Unassigned') & (merged['Confirmed_New'] < -100) | (merged['Deaths_New'] < -50) | (merged['Recovered_New'] < -50)]
    if df_neg.shape[0] > 0:
        print('Some deltas are hugely negative!')
        print(df_neg.sort_values('Confirmed_New'))

    mismatch = merged[(merged['State'] != 'US') & (merged['State'] != 'Recovered') & (merged['County'] != 'Unassigned') & (merged['Confirmed'] - (merged['Deaths'] + merged['Recovered']) < -10)]
    if mismatch.shape[0] > 0:
        print('Confirmed is much smaller than Deaths + Recovered!')
        print(mismatch)

def jhu():
    # Get list of days in expected format
    sdate = date(2020, 1, 22)
    today = date.today()
    edate = date(today.year, today.month, today.day)
    days = [(sdate + timedelta(days=i)).strftime('%m-%d-%Y') for i in range((edate - sdate).days + 1)]

    # Merge all daily reports
    url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/%s.csv'
    with ThreadPoolExecutor(max_workers=100) as executor:
        futures = [executor.submit(lambda day: (day, pd.read_csv(url % day)), day) for day in days]

    merged = pd.DataFrame(columns = ['Country', 'State', 'County', 'Date', 'Confirmed', 'Deaths', 'Recovered', 'Confirmed_New', 'Deaths_New', 'Recovered_New'])
    for future in as_completed(futures):
        try:
            day, df = future.result()
            # Fix changes in column names if they exits
            df = df.rename({'Admin2': 'County', 'Province/State':'State', 'Country/Region':'Country', 'Province_State':'State', 'Country_Region':'Country'}, axis=1)
            df.drop([x for x in df.columns.values if x not in merged.columns.values], axis=1, inplace=True)
            df['Date'] = day
            merged = pd.concat([merged, df])
        except IOError as e:
            print(str(e))

    # Clean up the data
    cleanup(merged)
    fillin(merged)

    # Fix bad data
    merged.loc[(merged['State'] == 'French Polynesia') & (merged['Date'] == '03-23-2020'), 'State'] = 'n/a'
    merged.loc[(merged['Country'] == 'France') & (merged['State'] == 'France'), 'State'] = 'n/a'

    # Do this because there are duplicate rows in some datasets
    return merged.groupby(['Country', 'State', 'County', 'Date'], as_index=False).sum()

def india(merged):
    df = pd.read_csv('covid_19_india.csv')
    df.Date = [datetime.strptime(x, '%d/%m/%y').strftime('%m-%d-%Y') for x in df.Date]
    df = df.rename({'State/UnionTerritory': 'State', 'Cured': 'Recovered'}, axis=1)
    df.drop([x for x in df.columns.values if x not in merged.columns.values], axis=1, inplace=True)
    df['Country'] = 'India'
    df['County'] = 'n/a'
    return fillin(pd.concat([merged, df]))

def deltas(merged):
    def deltas(df):
        for state in df['State'].unique():
            for county in df[df['State'] == state]['County'].unique():
                confirmed = df[(df['State'] == state) & (df['County'] == county)]['Confirmed'].values.tolist()
                confirmed_deltas = [np.nan] + [confirmed[i] - confirmed[i-1] for i in range(1, len(confirmed))]
                df.loc[(df['State'] == state) & (df['County'] == county), 'Confirmed_New'] = confirmed_deltas
                deaths = df[(df['State'] == state) & (df['County'] == county)]['Deaths'].values.tolist()
                deaths_deltas = [np.nan] + [deaths[i] - deaths[i-1] for i in range(1, len(deaths))]
                df.loc[(df['State'] == state) & (df['County'] == county), 'Deaths_New'] = deaths_deltas
                recovered = df[(df['State'] == state) & (df['County'] == county)]['Recovered'].values.tolist()
                recovered_deltas = [np.nan] + [recovered[i] - recovered[i-1] for i in range(1, len(recovered))]
                df.loc[(df['State'] == state) & (df['County'] == county), 'Recovered_New'] = recovered_deltas
        return df

    # Calculate deltas for each date
    with ThreadPoolExecutor(max_workers=100) as executor:
        futures = [executor.submit(deltas, merged[(merged['Country'] == 'US') & (merged['State'] == state)].copy()) for state in merged[merged['Country'] == 'US'].State.unique()]
        futures += [executor.submit(deltas, merged[merged['Country'] == country].copy()) for country in merged.Country.unique() if country != 'US']

    final = pd.DataFrame(columns=merged.columns)
    for future in as_completed(futures):
        df = future.result()
        final = pd.concat([final, df])

    return final

In [24]:
merged = jhu()
merged = india(merged[merged['Country'] != 'India'])
merged = deltas(merged)
merged = merged.groupby(['Country', 'State', 'County', 'Date'], as_index=False).sum()

# Write merged to CSV and verify
merged.to_csv('jhu-daily-reports.csv', index=False)
verify(merged)
print('Done!')

Some deltas are hugely negative!
      Country      State        County        Date  Confirmed   Deaths  \
6531   France        n/a           n/a  04-14-2020   130253.0  15729.0   
67876      US         TN      Davidson  04-13-2020     1207.0     16.0   
16951      US         AZ        Navajo  04-09-2020      286.0      1.0   
55288      US         NH  Hillsborough  04-14-2020      271.0      0.0   
2244   Canada  Recovered           n/a  04-01-2020        0.0      0.0   
58402      US         NY    Unassigned  04-06-2020        0.0    309.0   
58403      US         NY    Unassigned  04-07-2020        0.0    116.0   
58405      US         NY    Unassigned  04-09-2020        0.0     73.0   
58406      US         NY    Unassigned  04-10-2020        0.0      0.0   
40030      US         MA    Unassigned  04-15-2020      572.0     14.0   
40026      US         MA    Unassigned  04-11-2020      225.0     13.0   
55917      US         NJ    Unassigned  04-01-2020     4512.0      0.0   
12847