In [1]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
from tqdm import tqdm_notebook

In [2]:
!ls ../data

case_distribution.csv   case_distribution_2.csv case_distribution_3.csv


## Functions

In [3]:
def muTransform(zMedian):
    """Return the log-scale location parameter ``mu`` for a given median.

    Computes the natural logarithm of ``zMedian`` (for a log-normal
    distribution the median equals ``exp(mu)``).
    """
    mu = np.log(zMedian)
    return mu

In [4]:
def sigmaTransform(zMean, mu):
    """Return the log-scale shape parameter ``sigma`` from a mean and ``mu``.

    Evaluates ``sqrt(2 * (ln(zMean) - mu))``, i.e. the inverse of the
    log-normal mean relation ``mean = exp(mu + sigma**2 / 2)``.
    """
    log_mean_minus_mu = np.log(zMean) - mu
    return np.sqrt(2 * log_mean_minus_mu)

In [5]:
def plnorm(x, mu, sigma):
    """Cumulative distribution function of a log-normal evaluated at ``x``.

    ``mu`` and ``sigma`` are the log-scale parameters; they are mapped onto
    scipy's parametrisation as shape ``s = sigma``, ``loc = 0`` and
    ``scale = exp(mu)``.
    """
    return lognorm.cdf(x, s=sigma, loc=0, scale=np.exp(mu))

In [6]:
def hospitalisation_to_death_truncated(x, mu, sigma):
    """Probability mass of the log-normal delay falling between ``x`` and ``x + 1``.

    Computed as the difference of the log-normal CDF (``plnorm``) at the two
    ends of the unit-wide interval starting at ``x``.
    """
    cdf_at_start = plnorm(x, mu, sigma)
    cdf_at_end = plnorm(x + 1, mu, sigma)
    return cdf_at_end - cdf_at_start

def hospitalisation_to_death_truncated_low(x):
    """Delay probability for ``x`` using the low-scenario parameters ``muLow`` / ``sigmaLow``."""
    return hospitalisation_to_death_truncated(x, mu=muLow, sigma=sigmaLow)

def hospitalisation_to_death_truncated_mid(x):
    """Delay probability for ``x`` using the mid-scenario parameters ``muMid`` / ``sigmaMid``."""
    return hospitalisation_to_death_truncated(x, mu=muMid, sigma=sigmaMid)

def hospitalisation_to_death_truncated_high(x):
    """Delay probability for ``x`` using the high-scenario parameters ``muHigh`` / ``sigmaHigh``."""
    return hospitalisation_to_death_truncated(x, mu=muHigh, sigma=sigmaHigh)

## Parameters

In [7]:
# Baseline corrected CFR and its estimate range
# (presumably percentages -- TODO confirm against the source report).
cCFRBaseline = 1.4
cCFREstimateRange = (1.2, 1.7)

# Mean and median of the delay distribution for three scenarios; each pair is
# converted to log-normal (mu, sigma) parameters in the cells that follow.
# NOTE(review): units look like days -- confirm.
zmeanLow, zmedianLow = 8.7, 6.7       # lower end of the range
zmeanMid, zmedianMid = 13, 9.1        # middle of the range
zmeanHigh, zmedianHigh = 20.9, 13.7   # upper end of the range

In [8]:
# Log-normal parameters (mu, sigma) for the low-delay scenario.
muLow = muTransform(zmedianLow)
sigmaLow = sigmaTransform(zmeanLow, mu=muLow)

In [9]:
# Log-normal parameters (mu, sigma) for the mid-delay scenario.
muMid = muTransform(zmedianMid)
sigmaMid = sigmaTransform(zmeanMid, muMid)

In [10]:
# Log-normal parameters (mu, sigma) for the high-delay scenario.
muHigh = muTransform(zmedianHigh)
sigmaHigh = sigmaTransform(zmeanHigh, muHigh)

In [11]:
muLow, muMid, muHigh

(1.9021075263969205, 2.2082744135228043, 2.617395832834079)

In [12]:
sigmaLow, sigmaMid, sigmaHigh

(0.7227938838474179, 0.8446004309005916, 0.9190792415637358)

## Clean Dataset

In [13]:
# Load the daily case/death counts (one row per country and reporting date).
# NOTE(review): in the previews each country's rows appear newest-first;
# calculate_underestimate reverses the rows and so relies on that order -- confirm
# it holds for the whole file.
dataset = pd.read_csv('../data/case_distribution_3.csv')
dataset.tail()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
10942,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10943,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0
10944,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10945,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0
10946,21/03/2020,21,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0


In [14]:
# Map the raw column names onto the shorter names used in the rest of the notebook.
# The result is re-bound instead of renamed in place, so the frame object that the
# loading cell displayed is never mutated behind the reader's back.
dataset = dataset.rename(columns={
    "dateRep": "date",
    "cases": "new_cases",
    "deaths": "new_deaths",
    "countriesAndTerritories": "country",
})
# Keep only the columns the analysis needs; take a copy so that later edits to
# allTogetherClean can never write through to (or warn about) `dataset`.
allTogetherClean = dataset[['date', 'country', 'new_cases', 'new_deaths']].copy()

In [15]:
## Exclude some countries
# Rows whose country is listed here are dropped from the analysis.
exclude_coutries = [
    'Canada',
    'Cases_on_an_international_conveyance_Japan',
]
allTogetherClean = allTogetherClean.loc[
    ~allTogetherClean['country'].isin(exclude_coutries)
]
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
10942,25/03/2020,Zimbabwe,0,0
10943,24/03/2020,Zimbabwe,0,1
10944,23/03/2020,Zimbabwe,0,0
10945,22/03/2020,Zimbabwe,1,0
10946,21/03/2020,Zimbabwe,1,0


In [16]:
## Remove lower data points
# Keep only countries whose total reported deaths are strictly above the threshold.
threshold = 10
list_filtered_countried = allTogetherClean.loc[
    # per-row total of new_deaths for that row's country
    allTogetherClean.groupby('country')['new_deaths'].transform('sum') > threshold,
    'country',
].unique()

In [17]:
# Keep only the rows of countries that passed the death-count threshold,
# then renumber the rows from 0.
allTogetherClean = allTogetherClean[allTogetherClean.country.isin(list_filtered_countried)].reset_index(drop=True)
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
9488,25/03/2020,Zimbabwe,0,0
9489,24/03/2020,Zimbabwe,0,1
9490,23/03/2020,Zimbabwe,0,0
9491,22/03/2020,Zimbabwe,1,0
9492,21/03/2020,Zimbabwe,1,0


## Calculate UnderReporting

In [18]:
allTogetherClean.head(3)

Unnamed: 0,date,country,new_cases,new_deaths
0,15/04/2020,Afghanistan,49,2
1,14/04/2020,Afghanistan,58,3
2,13/04/2020,Afghanistan,52,0


$$u_{t}=\frac{\sum_{j=0}^{t}c_{t-j}f_{j}}{c_{t}}$$

where:  
$u_{t}$ = underestimation of the proportion of cases with known outcomes  
$c_{t}$ = daily case incidence at time t  
$f_{j}$ = proportion of cases with a delay of $j$ days between confirmation and death

In [125]:
def calculate_underestimate(country, delay_func):
    """Compute the naive and delay-corrected case fatality ratios for one country.

    For every row ``i`` of the country's series the number of cases with a
    known outcome is ``sum_{j=0..i} new_cases[i-j] * delay_func(j)``; these
    values are summed over all rows and rounded to give ``cum_known_t``.

    Parameters
    ----------
    country : str
        Value matched against the ``country`` column of the module-level
        ``allTogetherClean`` frame.
    delay_func : callable
        ``delay_func(j)`` returns the weight applied to cases reported ``j``
        rows earlier. It is called once per delay ``j``.

    Returns
    -------
    tuple
        ``(nCFR, cCFR, total_deaths, cum_known_t, total_cases)`` where
        ``nCFR = total_deaths / total_cases`` and
        ``cCFR = total_deaths / cum_known_t``. A zero denominator yields
        ``nan``/``inf`` (numpy division semantics) rather than an exception,
        including when ``country`` has no rows.
    """
    # The rows are reversed before use. NOTE(review): this assumes the frame
    # stores each country newest-first, so the reversed series is chronological.
    country_rows = allTogetherClean[allTogetherClean.country == country].iloc[::-1]
    daily_cases = country_rows['new_cases'].to_numpy()
    n_days = len(daily_cases)

    if n_days:
        # Evaluate the delay distribution once per delay instead of once per
        # (row, delay) pair; the weights do not depend on the row.
        delay_weights = np.array([delay_func(jj) for jj in range(n_days)])
        # Entry i of the full convolution is sum_j cases[i-j] * weights[j];
        # only the first n_days entries correspond to rows of the series.
        known_per_day = np.convolve(daily_cases, delay_weights)[:n_days]
        cumulative_known_t = known_per_day.sum()
    else:
        # np.convolve rejects empty input; an empty series has nothing known.
        cumulative_known_t = 0
    cum_known_t = round(cumulative_known_t)

    total_deaths = country_rows['new_deaths'].sum()
    total_cases = country_rows['new_cases'].sum()
    # Zero denominators give nan/inf; silence the warnings for those cases.
    with np.errstate(divide='ignore', invalid='ignore'):
        # naive CFR value
        nCFR = np.divide(total_deaths, total_cases)
        # corrected CFR estimator
        cCFR = np.divide(total_deaths, cum_known_t)
    return nCFR, cCFR, total_deaths, cum_known_t, total_cases

In [126]:
calculate_underestimate("Albania", hospitalisation_to_death_truncated_low)

(0.05052631578947368, 0.06722689075630252, 24, 357.0, 475)

In [127]:
calculate_underestimate("Afghanistan", hospitalisation_to_death_truncated_low)

(0.03221288515406162, 0.059431524547803614, 23, 387.0, 714)

In [128]:
calculate_underestimate("Argentina", hospitalisation_to_death_truncated_low)

(0.04317434210526316, 0.0644963144963145, 105, 1628.0, 2432)

In [129]:
calculate_underestimate("Brazil", hospitalisation_to_death_truncated_low)

(0.06064444620378434, 0.1072078376487054, 1532, 14290.0, 25262)

In [130]:
def return_complete_df(dataframe, delay_func):
    """Build a per-country summary table of naive and corrected CFR values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Accepted for interface compatibility but not read: the country list
        comes from the module-level ``allTogetherClean`` frame, which is also
        what ``calculate_underestimate`` reads.
    delay_func : callable
        Delay weighting function forwarded to ``calculate_underestimate``.

    Returns
    -------
    pandas.DataFrame
        One row per country with the columns ``country``, ``nCFR``, ``cCFR``,
        ``total_deaths``, ``cum_known_t`` and ``total_cases``.
    """
    columns = ['country', 'nCFR', 'cCFR', 'total_deaths', 'cum_known_t', 'total_cases']
    all_countries = allTogetherClean['country'].unique()

    # Collect plain records and build the frame once: growing a DataFrame row by
    # row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for c in tqdm_notebook(all_countries):
        nCFR, cCFR, total_deaths, cum_known_t, total_cases = calculate_underestimate(c, delay_func)
        records.append({
            'country': c,
            'nCFR': nCFR,
            'cCFR': cCFR,
            'total_deaths': total_deaths,
            'cum_known_t': int(cum_known_t),
            'total_cases': total_cases,
        })
    # Passing `columns` keeps the column set and order even when there are no records.
    return pd.DataFrame(records, columns=columns)

In [131]:
# Build one per-country summary table for each delay scenario (low / mid / high parameters).
allTogetherLow = return_complete_df(allTogetherClean, hospitalisation_to_death_truncated_low)
allTogetherMid = return_complete_df(allTogetherClean, hospitalisation_to_death_truncated_mid)
allTogetherHigh = return_complete_df(allTogetherClean, hospitalisation_to_death_truncated_high)

HBox(children=(IntProgress(value=0, max=161), HTML(value='')))




HBox(children=(IntProgress(value=0, max=161), HTML(value='')))




HBox(children=(IntProgress(value=0, max=161), HTML(value='')))




In [132]:
allTogetherLow

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases
0,Afghanistan,0.032213,0.059432,23,387,714
1,Albania,0.050526,0.067227,24,357,475
2,Algeria,0.157488,0.236403,326,1379,2070
3,Andorra,0.047041,0.060904,31,509,659
4,Angola,0.105263,0.133333,2,15,19
5,Antigua_and_Barbuda,0.086957,0.125000,2,16,23
6,Argentina,0.043174,0.064496,105,1628,2432
7,Armenia,0.015302,0.021277,17,799,1111
8,Australia,0.009507,0.011029,61,5531,6416
9,Austria,0.026978,0.032220,384,11918,14234


In [133]:
allTogetherMid

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases
0,Afghanistan,0.032213,0.073718,23,312,714
1,Albania,0.050526,0.078947,24,304,475
2,Algeria,0.157488,0.289007,326,1128,2070
3,Andorra,0.047041,0.071429,31,434,659
4,Angola,0.105263,0.166667,2,12,19
5,Antigua_and_Barbuda,0.086957,0.153846,2,13,23
6,Argentina,0.043174,0.077206,105,1360,2432
7,Armenia,0.015302,0.025074,17,678,1111
8,Australia,0.009507,0.012663,61,4817,6416
9,Austria,0.026978,0.036998,384,10379,14234


In [134]:
allTogetherHigh

Unnamed: 0,country,nCFR,cCFR,total_deaths,cum_known_t,total_cases
0,Afghanistan,0.032213,0.103139,23,223,714
1,Albania,0.050526,0.101695,24,236,475
2,Algeria,0.157488,0.394196,326,827,2070
3,Andorra,0.047041,0.092537,31,335,659
4,Angola,0.105263,0.222222,2,9,19
5,Antigua_and_Barbuda,0.086957,0.222222,2,9,23
6,Argentina,0.043174,0.102539,105,1024,2432
7,Armenia,0.015302,0.032567,17,522,1111
8,Australia,0.009507,0.015960,61,3822,6416
9,Austria,0.026978,0.046602,384,8240,14234


# Sources
1. https://cmmid.github.io/topics/covid19/severity/global_cfr_estimates.html