In [1]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
from tqdm import tqdm_notebook
from statsmodels.stats.proportion import proportion_confint

In [2]:
!ls ../data

case_distribution.csv   case_distribution_2.csv case_distribution_3.csv


## Functions

In [3]:
def muTransform(zMedian):
    return np.log(zMedian)

In [4]:
def sigmaTransform(zMean, mu):
    return np.sqrt(2*(np.log(zMean)-mu))

In [5]:
def plnorm(x, mu, sigma):
    shape  = sigma
    loc    = 0
    scale  = np.exp(mu)
    return lognorm.cdf(x, shape, loc, scale)

In [6]:
def hospitalisation_to_death_truncated(x,mu,sigma):
    return plnorm(x + 1, mu, sigma) - plnorm(x, mu, sigma)

def hospitalisation_to_death_truncated_low(x):
    return hospitalisation_to_death_truncated(x,muLow, sigmaLow)

def hospitalisation_to_death_truncated_mid(x):
    return hospitalisation_to_death_truncated(x,muMid, sigmaMid)

def hospitalisation_to_death_truncated_high(x):
    return hospitalisation_to_death_truncated(x,muHigh, sigmaHigh)

## Parameters

In [7]:
# setting the baseline CFR
cCFRBaseline = 1.4
cCFREstimateRange = (1.2, 1.7)
# lower end of the range
zmeanLow = 8.7
zmedianLow = 6.7
# middle of the range
zmeanMid = 13
zmedianMid = 9.1
# upper end of the range
zmeanHigh = 20.9
zmedianHigh = 13.7

1.2

In [8]:
muLow=muTransform(zmedianLow)
sigmaLow = sigmaTransform(zmeanLow, muLow)

In [9]:
muMid = muTransform(zmedianMid)
sigmaMid = sigmaTransform(zmeanMid, muMid)

In [10]:
muHigh = muTransform(zmedianHigh)
sigmaHigh = sigmaTransform(zmeanHigh, muHigh)

In [11]:
muLow, muMid, muHigh

(1.9021075263969205, 2.2082744135228043, 2.617395832834079)

In [12]:
sigmaLow, sigmaMid, sigmaHigh

(0.7227938838474179, 0.8446004309005916, 0.9190792415637358)

## Clean Dataset

In [13]:
dataset = pd.read_csv('../data/case_distribution_3.csv')
dataset.tail()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
10942,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10943,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0
10944,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10945,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0
10946,21/03/2020,21,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0


In [14]:
dataset.rename(columns = {
    "dateRep": "date",
    "cases": "new_cases",
    "deaths": "new_deaths",
    "countriesAndTerritories": "country"
},inplace=True)
allTogetherClean = dataset[['date', 'country', 'new_cases', 'new_deaths']]

In [15]:
## Exclude some countries
exclude_coutries = ['Canada','Cases_on_an_international_conveyance_Japan']
allTogetherClean = allTogetherClean[~allTogetherClean.country.isin(exclude_coutries)]
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
10942,25/03/2020,Zimbabwe,0,0
10943,24/03/2020,Zimbabwe,0,1
10944,23/03/2020,Zimbabwe,0,0
10945,22/03/2020,Zimbabwe,1,0
10946,21/03/2020,Zimbabwe,1,0


In [16]:
## Remove lower data points
threshold = 10
list_filtered_countried = allTogetherClean.groupby('country').filter(lambda x: x['new_deaths'].sum()>threshold)['country'].unique()

In [17]:
allTogetherClean = allTogetherClean[allTogetherClean.country.isin(list_filtered_countried)].reset_index(drop=True)
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
6447,04/01/2020,United_States_of_America,0,0
6448,03/01/2020,United_States_of_America,0,0
6449,02/01/2020,United_States_of_America,0,0
6450,01/01/2020,United_States_of_America,0,0
6451,31/12/2019,United_States_of_America,0,0


## Calculate UnderReporting

$$u_{t}=\frac{\sum_{j=0}^{t}c_{t-j}f_{j}}{c_{t}}$$

where:  
$u_{t}$ = underestimation of the proportion of cases with known outcomes  
$c_{t}$ = daily case incidence at time t  
$f_{t}$ = proportion of cases with delay of t between confirmation and death

In [23]:
nCFR_UQ, nCFR_LQ =  proportion_confint(23, 714)
nCFR_UQ, nCFR_LQ

(0.019261872877561325, 0.045163897430561914)

In [25]:
round(nCFR_UQ,8)

0.01926187

In [30]:
def calculate_underestimate(country, delay_func):
    df = allTogetherClean[allTogetherClean.country==country].iloc[::-1].reset_index(drop=True)
    cumulative_known_t = 0
    for ii in range(0,len(df)):
        #print("ii",ii)
        known_i = 0
        for jj in range(0,ii+1):
            #print("jj",jj)
            known_jj = df['new_cases'].loc[ii-jj]*delay_func(jj)
            known_i = known_i + known_jj
        cumulative_known_t = cumulative_known_t + known_i
        #print("-"*30)
    cum_known_t = round(cumulative_known_t)
    # naive CFR value
    nCFR = df['new_deaths'].sum()/df['new_cases'].sum()
    # corrected CFR estimator
    cCFR = df['new_deaths'].sum()/cum_known_t
    total_deaths = df['new_deaths'].sum()
    total_cases = df['new_cases'].sum()
    nCFR_UQ, nCFR_LQ =  proportion_confint(total_deaths, total_cases)
    cCFR_UQ, cCFR_LQ =  proportion_confint(total_deaths, cum_known_t)
    return nCFR, cCFR, total_deaths, cum_known_t, total_cases, round(nCFR_UQ,8), round(nCFR_LQ,8), round(cCFR_UQ,8), round(cCFR_LQ,8)

In [31]:
calculate_underestimate("Albania", hospitalisation_to_death_truncated_low)

(0.05052631578947368,
 0.06722689075630252,
 24,
 357.0,
 475,
 0.03082925,
 0.07022338,
 0.04125082,
 0.09320296)

In [None]:
calculate_underestimate("Afghanistan", hospitalisation_to_death_truncated_low)

In [None]:
calculate_underestimate("Argentina", hospitalisation_to_death_truncated_low)

In [None]:
calculate_underestimate("Brazil", hospitalisation_to_death_truncated_low)

In [55]:
def return_complete_df(dataframe, delay_func):
    all_countries = dataframe['country'].unique()
    new_df = pd.DataFrame(columns = [
        'country',
        'nCFR', 
        'cCFR', 
        'total_deaths', 
        'cum_known_t', 
        'total_cases',
        'nCFR_UQ',
        'nCFR_LQ',
        'cCFR_UQ',
        'cCFR_LQ',
        'underreporting_estimate',
        'lower',
        'upper',
        'quantile25',
        'quantile75'
    ])
    
    for c in tqdm_notebook(all_countries):
        nCFR, cCFR, total_deaths, cum_known_t, total_cases, nCFR_UQ, nCFR_LQ,cCFR_UQ, cCFR_LQ = calculate_underestimate(c,delay_func)
        quantile25, quantile75 = proportion_confint(total_deaths, cum_known_t, alpha = 0.5)
        new_df = new_df.append({'country':c,
                       'nCFR':nCFR,
                       'cCFR': cCFR,
                       'total_deaths': total_deaths,
                       'cum_known_t': int(cum_known_t),
                       'total_cases': total_cases,
                       'nCFR_UQ': nCFR_UQ,
                       'nCFR_LQ': nCFR_LQ,
                       'cCFR_UQ': cCFR_UQ,
                       'cCFR_LQ': cCFR_LQ,
                       'underreporting_estimate': cCFRBaseline / (100*cCFR),
                       'lower': cCFREstimateRange[0]/ (100 * cCFR_UQ),
                       'upper': cCFREstimateRange[1]/ (100 * cCFR_LQ),
                       'quantile25': quantile25,
                       'quantile75': quantile75            
                      }, ignore_index=True)
    return new_df

In [56]:
allTogetherLow = return_complete_df(allTogetherClean, hospitalisation_to_death_truncated_low)
allTogetherMid = return_complete_df(allTogetherClean, hospitalisation_to_death_truncated_mid)
allTogetherHigh = return_complete_df(allTogetherClean, hospitalisation_to_death_truncated_high)

HBox(children=(IntProgress(value=0, max=84), HTML(value='')))




In [59]:
allTogetherLow.tail()

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases,nCFR_UQ,nCFR_LQ,cCFR_UQ,cCFR_LQ,underreporting_estimate,lower,upper,quantile25,quantile75
79,Turkey,0.021548,0.040209,1403,34893,65111,0.020433,0.022663,0.038147,0.04227,0.348184,0.314569,0.402178,0.039499,0.040918
80,Ukraine,0.029063,0.060382,98,1623,3372,0.023393,0.034733,0.048794,0.07197,0.231857,0.245933,0.236209,0.056394,0.06437
81,United_Arab_Emirates,0.005676,0.011537,28,2427,4933,0.00358,0.007772,0.007288,0.015785,1.2135,1.646461,1.076945,0.010075,0.012999
82,United_Kingdom,0.128972,0.219314,12107,55204,93873,0.126828,0.131116,0.215862,0.222766,0.063835,0.055591,0.076313,0.218126,0.220502
83,United_States_of_America,0.04275,0.067546,26057,385768,609516,0.042242,0.043258,0.066754,0.068338,0.207267,0.179765,0.248764,0.067273,0.067818


In [89]:
def underreporting_estimate_country(country):
    nCFR_low, cCFR_low, _, _, _, _, _,_, _ = calculate_underestimate(country,hospitalisation_to_death_truncated_low)
    nCFR_mid, cCFR_mid, _, _, _, _, _,_, _ = calculate_underestimate(country,hospitalisation_to_death_truncated_mid)
    nCFR_high, cCFR_high, total_deaths, cum_known_t, total_cases, _, _,_, _ = calculate_underestimate(country,hospitalisation_to_death_truncated_high)
    
    underreporting_estimate_low = cCFRBaseline / (100*cCFR_low)
    underreporting_estimate_mid = cCFRBaseline / (100*cCFR_mid)
    underreporting_estimate_high =  cCFRBaseline / (100*cCFR_high)
    print(nCFR_low, nCFR_mid, nCFR_high)
    return { 'naive': "{:.2f}%".format(nCFR_mid*100),
             'low':   "{:.2f}%".format(underreporting_estimate_low*100),
             'mid':   "{:.2f}%".format(underreporting_estimate_mid*100),
             'high':  "{:.2f}%".format(underreporting_estimate_high*100)
           }

In [90]:
underreporting_estimate_country("Brazil")

0.06064444620378434 0.06064444620378434 0.06064444620378434


{'naive': '6.06%', 'low': '13.06%', 'mid': '10.49%', 'high': '7.51%'}

In [91]:
underreporting_estimate_country("United_States_of_America")

0.04275031336338997 0.04275031336338997 0.04275031336338997


{'naive': '4.28%', 'low': '20.73%', 'mid': '16.90%', 'high': '12.32%'}

In [92]:
underreporting_estimate_country("Argentina")

0.04317434210526316 0.04317434210526316 0.04317434210526316


{'naive': '4.32%', 'low': '21.71%', 'mid': '18.13%', 'high': '13.65%'}

# Sources
1. https://cmmid.github.io/topics/covid19/severity/global_cfr_estimates.html