In [3]:
import io
import requests
import pandas as pd
import numpy as np
from scipy.stats import lognorm
from tqdm import tqdm_notebook
from statsmodels.stats.proportion import proportion_confint
from datetime import date, datetime

## Functions

In [4]:
def muTransform(zMedian):
    return np.log(zMedian)

In [5]:
def sigmaTransform(zMean, mu):
    return np.sqrt(2*(np.log(zMean)-mu))

In [6]:
def plnorm(x, mu, sigma):
    shape  = sigma
    loc    = 0
    scale  = np.exp(mu)
    return lognorm.cdf(x, shape, loc, scale)

In [7]:
def hospitalisation_to_death_truncated(x,mu,sigma):
    return plnorm(x + 1, mu, sigma) - plnorm(x, mu, sigma)

def hospitalisation_to_death_truncated_low(x):
    return hospitalisation_to_death_truncated(x,muLow, sigmaLow)

def hospitalisation_to_death_truncated_mid(x):
    return hospitalisation_to_death_truncated(x,muMid, sigmaMid)

def hospitalisation_to_death_truncated_high(x):
    return hospitalisation_to_death_truncated(x,muHigh, sigmaHigh)

## Parameters

In [8]:
# setting the baseline CFR
cCFRBaseline = 1.4
cCFREstimateRange = (1.2, 1.7)
# lower end of the range
zmeanLow = 8.7
zmedianLow = 6.7
# middle of the range
zmeanMid = 13
zmedianMid = 9.1
# upper end of the range
zmeanHigh = 20.9
zmedianHigh = 13.7

In [9]:
muLow=muTransform(zmedianLow)
sigmaLow = sigmaTransform(zmeanLow, muLow)

In [10]:
muMid = muTransform(zmedianMid)
sigmaMid = sigmaTransform(zmeanMid, muMid)

In [11]:
muHigh = muTransform(zmedianHigh)
sigmaHigh = sigmaTransform(zmeanHigh, muHigh)

## Clean Dataset

In [12]:
url="https://opendata.ecdc.europa.eu/covid19/casedistribution/csv"
s=requests.get(url).content
dataset=pd.read_csv(io.StringIO(s.decode('utf-8')))
dataset.tail()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
13824,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0,Africa
13825,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0,Africa
13826,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0,Africa
13827,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0,Africa
13828,21/03/2020,21,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0,Africa


In [13]:
dataset.rename(columns = {
    "dateRep": "date",
    "cases": "new_cases",
    "deaths": "new_deaths",
    "countriesAndTerritories": "country"
},inplace=True)
allTogetherClean = dataset[['date', 'country', 'new_cases', 'new_deaths']]

In [14]:
## Exclude some countries
exclude_coutries = ['Canada','Cases_on_an_international_conveyance_Japan']
allTogetherClean = allTogetherClean[~allTogetherClean.country.isin(exclude_coutries)]
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
13824,25/03/2020,Zimbabwe,0,0
13825,24/03/2020,Zimbabwe,0,1
13826,23/03/2020,Zimbabwe,0,0
13827,22/03/2020,Zimbabwe,1,0
13828,21/03/2020,Zimbabwe,1,0


In [15]:
## Remove lower data points
threshold = 10
list_filtered_countried = allTogetherClean.groupby('country').filter(lambda x: x['new_deaths'].sum()>threshold)['country'].unique()

In [16]:
allTogetherClean = allTogetherClean[allTogetherClean.country.isin(list_filtered_countried)].reset_index(drop=True)
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
8702,19/03/2020,Uruguay,29,0
8703,18/03/2020,Uruguay,21,0
8704,17/03/2020,Uruguay,21,0
8705,16/03/2020,Uruguay,2,0
8706,15/03/2020,Uruguay,6,0


## Calculate UnderReporting

$$u_{t}=\frac{\sum_{j=0}^{t}c_{t-j}f_{j}}{c_{t}}$$

where:  
$u_{t}$ = underestimation of the proportion of cases with known outcomes  
$c_{t}$ = daily case incidence at time t  
$f_{t}$ = proportion of cases with delay of t between confirmation and death

In [23]:
def calculate_underestimate(country, delay_func):
    df = allTogetherClean[allTogetherClean.country==country].iloc[::-1].reset_index(drop=True)
    cumulative_known_t = 0
    for ii in range(0,len(df)):
        known_i = 0
        for jj in range(0,ii+1):
            known_jj = df['new_cases'].loc[ii-jj]*delay_func(jj)
            known_i = known_i + known_jj
        cumulative_known_t = cumulative_known_t + known_i
    cum_known_t = round(cumulative_known_t)
    nCFR = df['new_deaths'].sum()/df['new_cases'].sum()
    cCFR = df['new_deaths'].sum()/cum_known_t
    total_deaths = df['new_deaths'].sum()
    total_cases = df['new_cases'].sum()
    nCFR_UQ, nCFR_LQ =  proportion_confint(total_deaths, total_cases)
    cCFR_UQ, cCFR_LQ =  proportion_confint(total_deaths, cum_known_t)
    quantile25, quantile75 = proportion_confint(total_deaths, cum_known_t, alpha = 0.5)
    row = {
        'country': country,
        'nCFR': nCFR, 
        'cCFR':cCFR, 
        'total_deaths':total_deaths, 
        'cum_known_t':cum_known_t, 
        'total_cases':total_cases,
        'nCFR_UQ':round(nCFR_UQ,8),
        'nCFR_LQ':round(nCFR_LQ,8),
        'cCFR_UQ':round(cCFR_UQ,8),
        'cCFR_LQ':round(cCFR_LQ,8),
        'underreporting_estimate':cCFRBaseline / (100*cCFR),
        'lower':cCFREstimateRange[0]/ (100 * cCFR_UQ),
        'upper':cCFREstimateRange[1]/ (100 * cCFR_LQ),
        'quantile25':quantile25,
        'quantile75':quantile75
    }
    return row

In [24]:
calculate_underestimate("Afghanistan", hospitalisation_to_death_truncated_low)

{'country': 'Afghanistan',
 'nCFR': 0.03284072249589491,
 'cCFR': 0.05410279531109107,
 'total_deaths': 60,
 'cum_known_t': 1109.0,
 'total_cases': 1827,
 'nCFR_UQ': 0.02466861,
 'nCFR_LQ': 0.04101283,
 'cCFR_UQ': 0.04078862,
 'cCFR_LQ': 0.06741697,
 'underreporting_estimate': 0.25876666666666664,
 'lower': 0.2941996769490299,
 'upper': 0.25216204241058154,
 'quantile25': 0.049520939643496686,
 'quantile75': 0.05868465097868546}

In [25]:
calculate_underestimate("Argentina", hospitalisation_to_death_truncated_low)

{'country': 'Argentina',
 'nCFR': 0.05031599416626155,
 'cCFR': 0.06742671009771987,
 'total_deaths': 207,
 'cum_known_t': 3070.0,
 'total_cases': 4114,
 'nCFR_UQ': 0.04363627,
 'nCFR_LQ': 0.05699572,
 'cCFR_UQ': 0.05855645,
 'cCFR_LQ': 0.07629697,
 'underreporting_estimate': 0.20763285024154587,
 'lower': 0.20493045742182062,
 'upper': 0.22281356475452455,
 'quantile25': 0.06437415411225512,
 'quantile75': 0.07047926608318462}

In [26]:
calculate_underestimate("Brazil", hospitalisation_to_death_truncated_low)

{'country': 'Brazil',
 'nCFR': 0.06979105806415714,
 'cCFR': 0.11548731642189586,
 'total_deaths': 5017,
 'cum_known_t': 43442.0,
 'total_cases': 71886,
 'nCFR_UQ': 0.06792847,
 'nCFR_LQ': 0.07165364,
 'cCFR_UQ': 0.11248185,
 'cCFR_LQ': 0.11849279,
 'underreporting_estimate': 0.12122543352601156,
 'lower': 0.10668388149920366,
 'upper': 0.1434686494229578,
 'quantile25': 0.11445303285127988,
 'quantile75': 0.11652159999251184}

In [27]:
def return_complete_df(delay_func):
    all_countries = allTogetherClean['country'].unique()
    rows_countries = [calculate_underestimate(c,delay_func) for c in tqdm_notebook(all_countries)]
    new_df = pd.DataFrame(
        data =rows_countries,
        columns = [
        'country',
        'nCFR', 
        'cCFR', 
        'total_deaths', 
        'cum_known_t', 
        'total_cases',
        'nCFR_UQ',
        'nCFR_LQ',
        'cCFR_UQ',
        'cCFR_LQ',
        'underreporting_estimate',
        'lower',
        'upper',
        'quantile25',
        'quantile75'
    ])
    return new_df

In [28]:
allTogetherLow = return_complete_df(hospitalisation_to_death_truncated_low)
allTogetherMid = return_complete_df(hospitalisation_to_death_truncated_mid)
allTogetherHigh = return_complete_df(hospitalisation_to_death_truncated_high)

HBox(children=(IntProgress(value=0, max=103), HTML(value='')))




HBox(children=(IntProgress(value=0, max=103), HTML(value='')))




HBox(children=(IntProgress(value=0, max=103), HTML(value='')))




In [29]:
allTogetherLow.tail()

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases,nCFR_UQ,nCFR_LQ,cCFR_UQ,cCFR_LQ,underreporting_estimate,lower,upper,quantile25,quantile75
98,Ukraine,0.025399,0.03721,239,6423.0,9410,0.02222,0.028577,0.032581,0.041839,0.376243,0.368311,0.40632,0.035617,0.038803
99,United_Arab_Emirates,0.007821,0.011769,89,7562.0,11380,0.006202,0.009439,0.009339,0.0142,1.189528,1.284982,1.197175,0.010933,0.012606
100,United_Kingdom,0.134525,0.175839,21678,123283.0,161145,0.132859,0.136191,0.173714,0.177964,0.079618,0.069079,0.095525,0.175108,0.176571
101,United_States_of_America,0.05763,0.074387,58355,784474.0,1012583,0.057176,0.058084,0.073807,0.074968,0.188204,0.162587,0.226763,0.074188,0.074587
102,Uruguay,0.024,0.02809,15,534.0,625,0.012001,0.035999,0.014076,0.042104,0.4984,0.852528,0.403762,0.023267,0.032913


In [32]:
allTogetherMid[allTogetherMid.country == 'Brazil']

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases,nCFR_UQ,nCFR_LQ,cCFR_UQ,cCFR_LQ,underreporting_estimate,lower,upper,quantile25,quantile75
15,Brazil,0.069791,0.137709,5017,36432.0,71886,0.067928,0.071654,0.13417,0.141247,0.101664,0.089439,0.120356,0.136491,0.138926


In [33]:
allTogetherMid[allTogetherMid.country == 'United_States_of_America']

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases,nCFR_UQ,nCFR_LQ,cCFR_UQ,cCFR_LQ,underreporting_estimate,lower,upper,quantile25,quantile75
101,United_States_of_America,0.05763,0.085227,58355,684704.0,1012583,0.057176,0.058084,0.084565,0.085888,0.164268,0.141902,0.197932,0.084999,0.085454


In [31]:
allTogetherHigh.tail()

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases,nCFR_UQ,nCFR_LQ,cCFR_UQ,cCFR_LQ,underreporting_estimate,lower,upper,quantile25,quantile75
98,Ukraine,0.025399,0.060065,239,3979.0,9410,0.02222,0.028577,0.052683,0.067448,0.233079,0.227779,0.252045,0.057525,0.062606
99,United_Arab_Emirates,0.007821,0.018396,89,4838.0,11380,0.006202,0.009439,0.014609,0.022183,0.761034,0.821385,0.766367,0.017093,0.019699
100,United_Kingdom,0.134525,0.25619,21678,84617.0,161145,0.132859,0.136191,0.253248,0.259131,0.054647,0.047384,0.065604,0.255177,0.257202
101,United_States_of_America,0.05763,0.106586,58355,547493.0,1012583,0.057176,0.058084,0.105768,0.107403,0.13135,0.113455,0.158282,0.106305,0.106867
102,Uruguay,0.024,0.0358,15,419.0,625,0.012001,0.035999,0.01801,0.053589,0.391067,0.666295,0.317229,0.029678,0.041921


## Saving Output

In [None]:
!ls -lah ../output/

In [None]:
date_tag = datetime.now().strftime("%Y-%m-%d-%Hh")

In [None]:
allTogetherLow.to_csv("../output/low_{}.csv".format(date_tag))
allTogetherMid.to_csv("../output/mid_{}.csv".format(date_tag))
allTogetherHigh.to_csv("../output/high_{}.csv".format(date_tag))

In [None]:
!ls -lah ../output/

## Function for Individual Country

In [None]:
def underreporting_estimate_country(country):
    nCFR_low, cCFR_low, _, _, _, _, _,_, _ = calculate_underestimate(country,hospitalisation_to_death_truncated_low)
    nCFR_mid, cCFR_mid, _, _, _, _, _,_, _ = calculate_underestimate(country,hospitalisation_to_death_truncated_mid)
    nCFR_high, cCFR_high, total_deaths, cum_known_t, total_cases, _, _,_, _ = calculate_underestimate(country,hospitalisation_to_death_truncated_high)
    
    underreporting_estimate_low = cCFRBaseline / (100*cCFR_low)
    underreporting_estimate_mid = cCFRBaseline / (100*cCFR_mid)
    underreporting_estimate_high =  cCFRBaseline / (100*cCFR_high)
    print(nCFR_low, nCFR_mid, nCFR_high)
    return { 'naive_CFR_': "{:.2f}%".format(nCFR_mid*100),
             'underreporting_low':   "{:.2f}%".format(underreporting_estimate_low*100),
             'underreporting_mid':   "{:.2f}%".format(underreporting_estimate_mid*100),
             'underreporting_high':  "{:.2f}%".format(underreporting_estimate_high*100)
           }

In [None]:
underreporting_estimate_country("Brazil")

In [None]:
underreporting_estimate_country("United_States_of_America")

In [None]:
underreporting_estimate_country("Lebanon")

In [None]:
underreporting_estimate_country("Germany")

In [None]:
underreporting_estimate_country("Switzerland")

# Sources
1. https://cmmid.github.io/topics/covid19/severity/global_cfr_estimates.html