In [1]:
import pandas as pd
import numpy as np

## Functions

In [2]:
def muTransform(zMedian):
    return np.log(zMedian)

In [3]:
def sigmaTransform(zMean, mu):
    return np.sqrt(2*(np.log(zMean)-mu))

## Parameters

In [4]:
# setting the baseline CFR
cCFRBaseline = 1.4
cCFREstimateRange = (1.2, 1.7)
# lower end of the range
zmeanLow = 8.7
zmedianLow = 6.7
# middle of the range
zmeanMid = 13
zmedianMid = 9.1
# upper end of the range
zmeanHigh = 20.9
zmedianHigh = 13.7

In [5]:
muLow=muTransform(zmedianLow)
sigmaLow = sigmaTransform(zmeanLow, muLow)

In [6]:
muMid = muTransform(zmedianMid)
sigmaMid = sigmaTransform(zmeanMid, muMid)

In [7]:
muHigh = muTransform(zmedianHigh)
sigmaHigh = sigmaTransform(zmeanHigh, muHigh)

In [8]:
muLow, muMid, muHigh

(1.9021075263969205, 2.2082744135228043, 2.617395832834079)

In [9]:
sigmaLow, sigmaMid, sigmaHigh

(0.7227938838474179, 0.8446004309005916, 0.9190792415637358)

## Clean Dataset

# munge data, pad data and select only those with greater than 10 deaths
allTogetherClean <- allDat %>% 
  dplyr::arrange(countriesAndTerritories, dateRep) %>% 
  dplyr::mutate(dateRep = lubridate::dmy(dateRep))%>% 
  dplyr::rename(date = dateRep, new_cases = cases, new_deaths = deaths, country = countriesAndTerritories) %>%
  dplyr::select(date, country, new_cases, new_deaths) %>%
  dplyr::filter(country != "CANADA", 
                country != "Cases_on_an_international_conveyance_Japan") %>%
  dplyr::group_by(country) %>%
  padr::pad() %>%
  dplyr::mutate(new_cases = tidyr::replace_na(new_cases, 0),
                new_deaths = tidyr::replace_na(new_deaths, 0)) %>%
  dplyr::group_by(country) %>%
  dplyr::mutate(cum_deaths = sum(new_deaths)) %>%
  dplyr::filter(cum_deaths > 0) %>%
  dplyr::select(-cum_deaths)

In [10]:
dataset = pd.read_csv('../data/case_distribution_2.csv')
dataset.tail()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
10327,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10328,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0
10329,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10330,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0
10331,21/03/2020,21,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0


In [11]:
dataset.rename(columns = {
    "dateRep": "date",
    "cases": "new_cases",
    "deaths": "new_deaths",
    "countriesAndTerritories": "country"
},inplace=True)
allTogetherClean = dataset[['date', 'country', 'new_cases', 'new_deaths']]

In [12]:
## Exclude some countries
exclude_coutries = ['Canada','Cases_on_an_international_conveyance_Japan']
allTogetherClean = allTogetherClean[~allTogetherClean.country.isin(exclude_coutries)]
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
10327,25/03/2020,Zimbabwe,0,0
10328,24/03/2020,Zimbabwe,0,1
10329,23/03/2020,Zimbabwe,0,0
10330,22/03/2020,Zimbabwe,1,0
10331,21/03/2020,Zimbabwe,1,0


In [13]:
## Remove lower data points
threshold = 0
list_filtered_countried = allTogetherClean.groupby('country').filter(lambda x: x['new_deaths'].sum()>threshold)['country'].unique()

In [14]:
allTogetherClean = allTogetherClean[allTogetherClean.country.isin(list_filtered_countried)].reset_index(drop=True)
allTogetherClean.tail()

Unnamed: 0,date,country,new_cases,new_deaths
8969,25/03/2020,Zimbabwe,0,0
8970,24/03/2020,Zimbabwe,0,1
8971,23/03/2020,Zimbabwe,0,0
8972,22/03/2020,Zimbabwe,1,0
8973,21/03/2020,Zimbabwe,1,0


## Calculate UnderReporting

In [15]:
from scipy.stats import lognorm
def plnorm(x, mu, sigma):
    shape  = sigma
    loc    = 0
    scale  = np.exp(mu)
    return lognorm.cdf(x, shape, loc, scale)

In [16]:
plnorm(2,2.61,0.91), plnorm(10,2.61,0.91), plnorm(100,2.61,0.91)

(0.01758343637329408, 0.3677499540156789, 0.9858280976698108)

In [17]:
def hospitalisation_to_death_truncated(x,mu,sigma):
    return plnorm(x + 1, mu, sigma) - plnorm(x, mu, sigma)

def hospitalisation_to_death_truncated_low(x):
    return hospitalisation_to_death_truncated(x,muLow, sigmaLow)

def hospitalisation_to_death_truncated_mid(x):
    return hospitalisation_to_death_truncated(x,muMid, sigmaMid)

def hospitalisation_to_death_truncated_high(x):
    return hospitalisation_to_death_truncated(x,muHigh, sigmaHigh)

In [18]:
hospitalisation_to_death_truncated(2,2.61,0.91), hospitalisation_to_death_truncated(10,2.61,0.91)

(0.030786783175074942, 0.04009886620969777)

In [19]:
hospitalisation_to_death_truncated_low(10), hospitalisation_to_death_truncated_mid(10), hospitalisation_to_death_truncated_high(10)

(0.0433880005995787, 0.04436485993033279, 0.0396433875212705)

scale_cfr <- function(data_1_in, delay_fun){
  case_incidence <- data_1_in$new_cases
  death_incidence <- data_1_in$new_deaths
  cumulative_known_t <- 0 # cumulative cases with known outcome at time tt
  # Sum over cases up to time tt
  for(ii in 1:nrow(data_1_in)){
    known_i <- 0 # number of cases with known outcome at time ii
    for(jj in 0:(ii - 1)){
      known_jj <- (case_incidence[ii - jj]*delay_fun(jj))
      known_i <- known_i + known_jj
    }
    cumulative_known_t <- cumulative_known_t + known_i # Tally cumulative known
  }
  # naive CFR value
  b_tt <- sum(death_incidence)/sum(case_incidence) 
  # corrected CFR estimator
  p_tt <- sum(death_incidence)/cumulative_known_t
  data.frame(nCFR = b_tt, cCFR = p_tt, total_deaths = sum(death_incidence), 
             cum_known_t = round(cumulative_known_t), total_cases = sum(case_incidence))

# Sources
1. https://cmmid.github.io/topics/covid19/severity/global_cfr_estimates.html