In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

from scipy.optimize import curve_fit
from scipy.optimize import fsolve

import matplotlib.pyplot as plt

import re

In [2]:
# Widen the console representation so wide frames are not wrapped.
pd.options.display.width = 1000

In [3]:
url_cases = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
url_deaths = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'


def _load_jh_timeseries(url, value_name):
    """Download one Johns Hopkins time-series CSV and return it in long format.

    Parameters
    ----------
    url : str
        Location of the wide CSV (one column per reporting date).
    value_name : str
        Name given to the melted value column (e.g. 'cum_cases').

    Returns
    -------
    pandas.DataFrame
        Columns 'countriesAndTerritories', 'dateRep' (the original column
        header, still in the source's m/d/yy text form) and `value_name`,
        with one row per country and date; provinces are summed per country.
    """
    return pd\
        .read_csv(url, sep=',', engine='python')\
        .drop(['Province/State', 'Lat', 'Long'], axis='columns', errors='ignore')\
        .groupby("Country/Region")\
        .sum()\
        .reset_index(level=[0])\
        .rename({"Country/Region": 'countriesAndTerritories'}, axis='columns')\
        .melt('countriesAndTerritories', var_name='dateRep', value_name=value_name)
    # The non-numeric/geographic columns are dropped *before* summing: newer
    # pandas no longer discards text columns silently in groupby().sum(), and
    # a surviving 'Province/State' column would be melted into 'dateRep'.


df_load_cases = _load_jh_timeseries(url_cases, 'cum_cases')
df_load_deaths = _load_jh_timeseries(url_deaths, 'cum_deaths')

df_load_jh = pd\
    .merge(df_load_cases, df_load_deaths,  how='inner', on=['countriesAndTerritories', 'dateRep'])

# Parse the m/d/yy headers once, then derive both the dd/mm/YYYY text form
# (the format the ECDC cell parses) and the day offset from 2020-01-01.
_jh_dates = pd.to_datetime(df_load_jh['dateRep'], format='%m/%d/%y')
df_load_jh['dateRep'] = _jh_dates.dt.strftime('%d/%m/%Y')
df_load_jh['days'] = (_jh_dates - pd.Timestamp(2020, 1, 1)).dt.days

df_1_jh = df_load_jh.sort_values(['countriesAndTerritories', 'days'], ascending=[True, True])
df_1_jh = df_1_jh[df_1_jh['days'] >= 0]

df_2_jh = df_1_jh\
    .loc[:,['countriesAndTerritories', 'days', 'dateRep', 'cum_cases', 'cum_deaths']]\
    .assign(source='jh')

Remarks: ECDC data does not match RKI data for every date!

Unfortunately, at least for Germany (Robert Koch Institute (RKI) data), I have found some potential data issues. On some days, the number of cases/deaths reported by ECDC differs from the number reported by RKI.

Some observations (ECDC v RKI):
* 1,042 incremental/8,198 cumulative (2020-03-19) vs 1,042/8,198 (2020-03-19) (not okay, a shift in the day)
* 2020-03-20: 5,940/14,138 vs 2,958/13,957 (not okay)
* 2020-03-24: 4,438/29,212 vs 4,764/27,436 (not okay)
* 2020-03-25: 2,342/31,554 vs 4,118/31,554 (not okay, wrong increments)
* 2020-03-28: 6,294/48,582 vs 6,294/48,582 (okay)

I have used the following ECDC data:
[Link](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)

RKI data:
[Situationsbericht 2020-03-18](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Situationsberichte/2020-03-18-de.pdf?__blob=publicationFile),
[Situationsbericht 2020-03-24](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Situationsberichte/2020-03-24-de.pdf?__blob=publicationFile),
[Situationsbericht 2020-03-25](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Situationsberichte/2020-03-25-de.pdf?__blob=publicationFile) and 
[Situationsbericht 2020-03-28](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Situationsberichte/2020-03-28-de.pdf?__blob=publicationFile)


In [4]:
url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/"

# Download the ECDC case distribution and strip every non-letter character
# from the column names.
df_load_ecdc = pd\
    .read_csv(url, sep=',', engine='python')\
    .rename(columns=lambda name: re.sub("[^A-Za-z]", "", name))

# Keep the reporting columns and add the day offset relative to 2020-01-01.
df_1_ecdc = df_load_ecdc[['dateRep', 'cases', 'deaths', 'countriesAndTerritories']]\
    .assign(
        days=lambda frame: frame['dateRep'].map(
            lambda text: (
                datetime.strptime(text, '%d/%m/%Y') - datetime.strptime("01/01/2020", '%d/%m/%Y')
            ).days
        )
    )\
    .sort_values(['countriesAndTerritories', 'days'], ascending=[True, True])\
    .loc[lambda frame: frame['days'] >= 0]

# Sum the reported counts per country and day, then accumulate them per
# country (index level 0) to obtain cumulative figures.
df_2_ecdc = (
    df_1_ecdc
    .groupby(['countriesAndTerritories', 'days', 'dateRep'])
    .agg(cum_cases=('cases', 'sum'), cum_deaths=('deaths', 'sum'))
    .groupby(level=[0])
    .cumsum()
    .reset_index(level=[0, 1, 2])
    .sort_values(['countriesAndTerritories', 'days'], ascending=[True, True])
    .assign(source='ecdc')
)

In [5]:
# Stack the Johns Hopkins and ECDC frames row-wise; each keeps its own index labels.
df_2 = pd.concat(objs=(df_2_jh, df_2_ecdc), axis=0, ignore_index=False)

In [6]:
df_same_1 = df_2

# First day (per country and source) on which at least 1,000 cumulative
# cases were reported.
df_same_2 = (
    df_same_1
    .loc[lambda frame: frame['cum_cases'] >= 1000]
    .groupby(['countriesAndTerritories', 'source'])
    .agg(min_days=('days', 'min'))
)

# Attach that day to every row and express 'days' relative to it; countries
# that never reached the threshold drop out through the inner join.
df_3 = (
    pd.merge(df_2, df_same_2, how='inner', on=['countriesAndTerritories', 'source'])
    .assign(same_day=lambda frame: frame['days'] - frame['min_days'])
    .drop(['min_days'], axis='columns')
)

In [7]:
df_3 = df_3.sort_values(['countriesAndTerritories', 'days','dateRep', 'source'], ascending=[True, True, True, True])

# Daily increments are the day-over-day difference of the cumulative figures.
# The difference must be taken within each (source, country) series: grouping
# by source alone would subtract the last value of the previous country from
# the first value of the next one.
df_3_inc = df_3\
    .groupby(['source', 'countriesAndTerritories', 'days','dateRep'])\
    .agg(cases=('cum_cases', 'sum'), deaths=('cum_deaths', 'sum'))\
    .groupby(level=['source', 'countriesAndTerritories'])\
    .diff(periods=1)\
    .reset_index(level=[0, 1, 2, 3])

df_4 = pd.merge(df_3, df_3_inc,  how='inner', on=['countriesAndTerritories', 'days','dateRep', 'source'])

# The first row of every series has no predecessor, so its increment is NaN;
# use the cumulative value itself for that row.
df_4['cases'] = df_4['cases'].fillna(df_4['cum_cases'])
df_4['deaths'] = df_4['deaths'].fillna(df_4['cum_deaths'])

In [310]:
# `df_final` is a second name for the same object as `df_4` (no copy is made);
# it is the frame the later cells read from.
df_final = df_4

In [311]:
#df_final[(df_final['countriesAndTerritories'] == "Germany") & (df_final['days'] > 70) & (df_final['source'] == 'jh')]

In [27]:
# Rank countries by their highest cumulative case count in the Johns Hopkins data.
df_countries = (
    df_final
    .loc[df_final['source'] == 'jh']
    .groupby(['countriesAndTerritories'])
    .agg(cum_cases=('cum_cases', 'max'))
    .sort_values(['cum_cases'], ascending=[False])
    .reset_index(level=[0])
)

# Names of the nine countries at the top of that ranking.
list_countries = df_countries['countriesAndTerritories'].head(9).tolist()

In [28]:
def logistic_model(x, a, b, c):
    """Evaluate the logistic curve c / (1 + exp(-(x - b) / a)).

    `b` is the x position of the midpoint (where the value is c / 2), `a`
    scales the x axis around it and `c` is the upper limit of the curve.
    Works element-wise on NumPy arrays as well as on scalars.
    """
    scaled_offset = (x - b) / a
    return c / (1 + np.exp(-scaled_offset))

def exponential_2p_model(x, a, b):
    """Evaluate the two-parameter exponential a * exp(b * x).

    `a` is the value at x = 0 and `b` the growth rate. Works element-wise on
    NumPy arrays as well as on scalars.
    """
    growth = np.exp(b * x)
    return a * growth

def fitting(model, x, y):
    """Fit the named growth model to the points (x, y) with `curve_fit`.

    Parameters
    ----------
    model : str
        'logistic' or 'exponential'.
    x, y : array-like
        Independent and dependent values to fit.

    Returns
    -------
    The value returned by `scipy.optimize.curve_fit`; element 0 holds the
    fitted parameters in the order of the model function's signature.

    Raises
    ------
    ValueError
        If `model` is not a supported name. (Previously the function fell
        through and returned None, which only failed later at `fit[0]`.)
    """
    # Model function plus the initial parameter guess handed to the optimiser.
    if model == 'logistic':
        func, initial_guess = logistic_model, [2, 100, 20000]
    elif model == 'exponential':
        func, initial_guess = exponential_2p_model, [10000, 0.1]
    else:
        raise ValueError(
            "unsupported model {!r}; expected 'logistic' or 'exponential'".format(model)
        )

    return curve_fit(func, x, y, p0=initial_guess, maxfev=50000)

def model_selection(model):
    """Return the model function registered under the name `model`.

    Parameters
    ----------
    model : str
        'logistic' or 'exponential'.

    Returns
    -------
    callable
        `logistic_model` or `exponential_2p_model`.

    Raises
    ------
    ValueError
        If `model` is not a supported name. (Previously the function fell
        through and returned None, which only failed later when called.)
    """
    if model == 'logistic':
        return logistic_model
    if model == 'exponential':
        return exponential_2p_model
    raise ValueError(
        "unsupported model {!r}; expected 'logistic' or 'exponential'".format(model)
    )
    
def additional_info(model, fit):
    """Print a one-line summary derived from a fitted model.

    Parameters
    ----------
    model : str
        'logistic' prints the x value at which the fitted curve reaches its
        upper limit rounded down to an integer; 'exponential' prints the
        doubling time ln(2) / b. Any other name prints nothing.
    fit : sequence
        Result of `fitting`; only `fit[0]` (the fitted parameters) is read.
    """
    if model == 'logistic':
        limit = int(fit[0][2])
        # The curve only approaches its limit c asymptotically, so solve for
        # the integer part of c instead, starting from the midpoint b.
        root = fsolve(lambda x : logistic_model(x, *fit[0]) - limit, fit[0][1])
        # fsolve returns an array; take its element explicitly because
        # converting an array with ndim > 0 straight to int is deprecated.
        days_to_limit = int(np.ravel(root)[0])
        print('days to reach limit of ', "{:,}".format(limit) , ': ', "{:,}".format(days_to_limit))
    if model == 'exponential':
        print('time to double:', np.round(np.log(2)/fit[0][1], 1))

In [429]:
def extrapolate(df_in, what, model, days_fitting, days_extropolate, shift):
    """Fit a growth model to the recent cumulative series and append its estimates.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Needs the columns 'date', 'days' and 'cum_' + what. Rows whose
        'cum_' + what is null (e.g. rows added by an earlier call) are
        ignored when selecting the fitting window.
    what : str
        Quantity to model; selects the column 'cum_' + what.
    model : str
        Model name understood by `model_selection` and `fitting`.
    days_fitting : int
        Number of observations in the fitting window.
    days_extropolate : int
        Number of days to predict beyond the last observation.
    shift : int
        Number of most recent observations to hold back. The window then ends
        `shift` rows earlier, the prediction is extended by `shift` days, and
        the output columns get the suffix '_prior'.

    Returns
    -------
    pandas.DataFrame
        `df_in` outer-merged on ['date', 'days'] with two new columns:
        'cum_<what>_<model>[_prior]' (model value truncated to int) and
        '<what>_<model>[_prior]' (its day-over-day difference).

    Raises
    ------
    ValueError
        If the fitting window contains no rows.
    """
    df_out = df_in
    df_obs = df_in[df_in['cum_' + what].notnull()]

    shift_str = "_prior" if shift != 0 else ""
    cum_col = "cum_" + what + "_" + model + shift_str
    inc_col = what + "_" + model + shift_str

    # `-shift or None` turns a zero shift into an open-ended slice.
    window = slice(-days_fitting - shift, -shift or None)
    x_date = df_obs['date'].iloc[window]
    x_days = df_obs['days'].iloc[window]
    y_fit = df_obs['cum_' + what].iloc[window]

    if x_days.empty:
        raise ValueError("no observations of 'cum_" + what + "' available for fitting")

    selected_model = model_selection(model)
    fit = fitting(model, x_days, y_fit)
    params = fit[0]

    # Scalars are taken with .iloc[-1]; calling int() on a one-element Series
    # is deprecated in recent pandas.
    last_date = x_date.iloc[-1]
    last_day = int(x_days.iloc[-1])
    n_pred = days_extropolate + shift
    days_pred = np.arange(last_day + 1, last_day + 1 + n_pred)

    # Model values inside the fitting window ...
    df_fit_actuals = pd.DataFrame({
        'date': x_date.to_numpy(),
        'days': x_days.to_numpy(),
        cum_col: [int(selected_model(day, *params)) for day in x_days],
    })
    # ... and for the days after it.
    df_fit_pred = pd.DataFrame({
        'date': pd.date_range(start=last_date + timedelta(days=1), periods=n_pred),
        'days': days_pred,
        cum_col: [int(selected_model(day, *params)) for day in days_pred],
    })

    df_fit = pd.concat([df_fit_actuals, df_fit_pred], ignore_index=True)

    # Estimated increments: difference between consecutive days of the whole
    # estimated series. Grouping by date here (as done previously) puts every
    # row into its own group and yields NaN for every increment.
    df_fit[inc_col] = df_fit.sort_values(['date', 'days'])[cum_col].diff()

    df_predictions = pd.merge(df_out, df_fit, how='outer', on=['date', 'days'])

    return df_predictions

In [430]:
import pandas as pd
from bokeh.palettes import Spectral11
from bokeh.plotting import figure
from bokeh.io import output_notebook, push_notebook, show , output_file
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models import NumeralTickFormatter, DatetimeTickFormatter

# Send Bokeh output to the notebook itself so `show()` renders inline;
# `hide_banner=True` is passed to keep the load message out of the cell output.
output_notebook(hide_banner=True)

In [439]:
def single_country_new(country, source, what, df_input, days_fitting=14, days_extropolate=3):
    """Plot reported and model-estimated figures for one country with Bokeh.

    The rows of `df_input` for `country` and `source` with more than 100
    cumulative cases are extended with logistic and exponential estimates
    (each fitted on the latest data and on data held back by 7 days, the
    "prior" variants) and drawn as lines and markers in a single figure.

    Parameters
    ----------
    country : str
        Value of 'countriesAndTerritories' to select.
    source : str
        Value of the 'source' column to select (e.g. 'jh' or 'ecdc').
    what : str
        Quantity to plot; the columns `what` and 'cum_' + what are used.
    df_input : pandas.DataFrame
        Needs the columns 'countriesAndTerritories', 'source', 'dateRep'
        (dd/mm/YYYY text), 'days', 'same_day', 'cum_cases', `what` and
        'cum_' + what.
    days_fitting : int, default 14
        Number of observations used for each fit.
    days_extropolate : int, default 3
        Number of days predicted beyond the last observation.

    Raises
    ------
    ValueError
        If no row of `df_input` matches the selection.
    """
    prior_shift = 7  # observations held back for the "_prior" fits

    selected = (
        (df_input['countriesAndTerritories'] == country)
        & (df_input['cum_cases'] > 100)
        & (df_input['source'] == source)
    )
    df_country = df_input[selected]

    if df_country.empty:
        raise ValueError(
            "no rows with more than 100 cumulative cases for country "
            + repr(country) + " and source " + repr(source)
        )

    # assign() builds a new frame instead of writing into a filtered slice.
    df_country = df_country\
        .assign(date=pd.to_datetime(df_country['dateRep'], format='%d/%m/%Y'))\
        .drop(['countriesAndTerritories', 'dateRep', 'source', 'same_day'], axis='columns')

    # Each call adds the columns of one model/shift combination.
    df_plot = df_country
    for model_name in ('logistic', 'exponential'):
        for shift in (0, prior_shift):
            df_plot = extrapolate(df_plot, what, model_name, days_fitting, days_extropolate, shift=shift)

    # The figure shows both models, so the title names the plotted quantity
    # (the former title referenced a name that is not defined in this function).
    p = figure(
        title=country + " - " + what.capitalize(),
        width=800,
        height=600,
        x_axis_type="datetime",
        toolbar_location="below"
    )

    p.xaxis.axis_label = 'date'
    p.yaxis.axis_label = what

    p.yaxis.formatter=NumeralTickFormatter(format="0,0")
    p.xaxis.formatter=DatetimeTickFormatter(
        days=['%Y-%m-%d'],
        months=['%Y-%m-%d'],
        years=['%Y-%m-%d']
    )

    def column_source(x, y, n_tail=None):
        """Data source with x, the incremental column y and its cumulative twin."""
        frame = df_plot if n_tail is None else df_plot.tail(n_tail)
        return ColumnDataSource(
            data={
                'x': frame[x],
                y: frame[y],
                'cum_' + y: frame['cum_' + y],
            }
        )

    def plot_line(p, x, y, color, text, line_dash):
        """Draw the incremental and the cumulative series of `y` as lines."""
        source_actuals = column_source(x, y)

        # `legend_label` replaces the deprecated `legend` keyword.
        for column, label in ((y, text + " inc. " + y), ('cum_' + y, text + " cum. " + y)):
            p.line(
                'x',
                column,
                source=source_actuals,
                line_width=2,
                legend_label=label,
                color=color,
                alpha=0.5,
                hover_line_alpha=0.5,
                line_dash=line_dash
            )

        return p

    p = plot_line(p, x='date', y=what, color="steelblue", text='reported', line_dash='solid')
    p = plot_line(p, x='date', y=what + "_logistic", color="green", text='estimation', line_dash='dashed')
    p = plot_line(p, x='date', y=what + "_logistic_prior", color="green", text='estimation', line_dash='dotted')
    p = plot_line(p, x='date', y=what + "_exponential", color="orange", text='estimation', line_dash='dashed')
    p = plot_line(p, x='date', y=what + "_exponential_prior", color="orange", text='estimation', line_dash='dotted')

    def plot_scatter(p, x, y, color, text, estimation, prior, shift):
        """Mark the trailing rows of `y` (incremental and cumulative) with points."""
        # Number of trailing rows to mark: the fitting window (extended by the
        # shift for prior fits); for estimations the window length is replaced
        # by the extrapolation length.
        n_rows = days_fitting + (shift if prior else 0)
        if estimation:
            n_rows = n_rows + days_extropolate - days_fitting

        source_scatter = column_source(x, y, n_tail=n_rows)

        for column, label in ((y, text + " inc. " + y), ('cum_' + y, text + " cum. " + y)):
            p.scatter(
                'x',
                column,
                source=source_scatter,
                line_width=2,
                legend_label=label,
                color=color,
                alpha=0.5,
                hover_line_alpha=0.5
            )

        return p

    p = plot_scatter(p, x='date', y=what, color="red", text='reported', estimation=False, prior=False, shift=0)
    p = plot_scatter(p, x='date', y=what, color="navy", text='reported', estimation=True, prior=True, shift=prior_shift)
    p = plot_scatter(p, x='date', y=what + "_logistic", color="green", text='estimation', estimation=True, prior=False, shift=0)
    p = plot_scatter(p, x='date', y=what + "_logistic_prior", color="green", text='estimation', estimation=True, prior=True, shift=prior_shift)
    p = plot_scatter(p, x='date', y=what + "_exponential", color="orange", text='estimation', estimation=True, prior=False, shift=0)
    p = plot_scatter(p, x='date', y=what + "_exponential_prior", color="orange", text='estimation', estimation=True, prior=True, shift=prior_shift)

    # One tooltip row per plotted column (incremental and cumulative).
    tooltips = [("date", "$x{%F}")]
    for suffix in ("", "_logistic", "_exponential", "_logistic_prior", "_exponential_prior"):
        tooltips.append(('inc. ' + what + suffix, "@" + what + suffix + "{0,0}"))
        tooltips.append(('cum. ' + what + suffix, "@cum_" + what + suffix + "{0,0}"))

    p.add_tools(
        HoverTool(
            show_arrow=False,
            line_policy='nearest',
            tooltips=tooltips,
            formatters={
                '$x': 'datetime',
            },
            mode='mouse'
        )
    )

    p.legend.location = "top_left"

    show(p)

In [440]:
# Scenario to plot; the commented-out assignments are the alternative settings.
what = 'cases'
# what = 'deaths'
source = 'jh'
# source = 'ecdc'
country = 'Germany'
days_fitting = 14
days_extropolate = 3

single_country_new(
    df_input=df_final,
    country=country,
    source=source,
    what=what,
    days_fitting=days_fitting,
    days_extropolate=days_extropolate,
)