In [1]:
from collections import namedtuple
from itertools import product

In [2]:
import holoviews as hv
from holoviews.operation import gridmatrix
import networkx as nx
import nevergrad as ng
import numpy as np
import pandas as pd
import pymc3 as pm
from scipy.integrate import odeint
from scipy.interpolate import interp1d
import theano
import theano.tensor as tt

In [3]:
# Load the Bokeh plotting backend for HoloViews with the logo display turned off,
# then set notebook-wide default plot options for Overlay objects
# (5:3 aspect ratio, responsive sizing) via the %opts line magic.
hv.notebook_extension('bokeh', logo=False)
%opts Overlay [aspect=5/3, responsive=True]

In [31]:
def plot_trace(trace, varnames=None, combine=False, force=False):
    """Plot the distribution and trace for each latent variable in a pymc trace object.

    trace: the trace output from pymc.sample
    varnames: Optional specification of variables to include in the trace plot. If None, use all.
    combine: If True, convert the whole trace to one frame and draw it as a single
        chain (numbered 0). If False, convert and draw every chain separately.
    force: If False (default), refuse to build the plot when the frame holds more than
        11 variable columns. Pass True to plot regardless.

    Returns an hv.Layout with two columns: for every variable column, the per-chain
    distributions on the left and the per-chain sample traces on the right.

    Raises ValueError when there are too many variables and force is False.
    """
    if combine:
        df = append_chain_num(pm.trace_to_dataframe(trace, varnames=varnames), 0)
    else:
        df = pd.concat([append_chain_num(pm.trace_to_dataframe(trace, varnames=varnames, chains=i), i)
                        for i in range(trace.nchains)])

    # The last column is the 'chain' label, so the variable count is len(columns) - 1.
    if not force and len(df.columns) > 12:
        raise ValueError(f'There are {len(df.columns)-1} variables. Use force=True if you really want to plot them all.')

    # Split by chain once (in a stable, sorted order) instead of re-filtering the
    # full frame for every variable and every plot.
    chain_frames = [df.loc[df['chain'] == chain] for chain in sorted(set(df['chain']))]

    plots = []
    for var in df.columns[:-1]:
        plots.append(
            hv.Overlay([hv.Distribution(frame, [var], [f'p({var})']) for frame in chain_frames], group=var)
            .options(aspect=3, responsive=True)
        )
        plots.append(
            hv.Overlay([hv.Curve(frame, 'index', var).options(alpha=0.6) for frame in chain_frames])
            .options(aspect=3, responsive=True)
        )
    return hv.Layout(plots).cols(2)

In [5]:
def append_chain_num(df, n):
    """Return a copy of `df` with a 'chain' column set to `n` on every row.

    df: DataFrame of samples (e.g. the output of pm.trace_to_dataframe).
    n: chain label to store in the 'chain' column.

    The input frame is not modified; a new DataFrame is returned. If `df` has no
    'chain' column it is appended as the last column, otherwise the existing
    column is overwritten in the copy.
    """
    return df.assign(chain=n)

def trace_grid(trace, varnames=None, combine=False):
    """Build a grid of pairwise scatter plots with per-variable distributions on the diagonal.

    trace: the trace output from pymc.sample
    varnames: Optional column names of the trace DataFrame to include (a single
        name or any sequence of names). The selection is applied to the columns
        produced by pm.trace_to_dataframe. If None, use all columns.
    combine: If True, convert the whole trace to one frame and draw it as a single
        chain (numbered 0). If False, convert and draw every chain separately.

    Returns an hv.GridMatrix keyed by (x, y) column pairs.
    """
    if combine:
        df = append_chain_num(pm.trace_to_dataframe(trace), 0)
    else:
        df = pd.concat([append_chain_num(pm.trace_to_dataframe(trace, chains=i), i)
                        for i in range(trace.nchains)])

    if varnames is not None:
        # Accept a bare name, tuple, Index, ... and not only a list.
        if isinstance(varnames, str):
            varnames = [varnames]
        df = df[list(varnames) + ['chain']]

    # Split by chain once (in a stable, sorted order) rather than in every grid cell.
    chain_frames = [df.loc[df['chain'] == chain] for chain in sorted(set(df['chain']))]
    # Every column except the trailing 'chain' label is a plotted variable.
    variables = df.columns[:-1]

    def make_scatter(x, y):
        """Overlay one scatter of (x, y) per chain."""
        return hv.Overlay([
            hv.Points(frame, [x, y]).options(size=2, alpha=0.2, tools=['box_select'])
            for frame in chain_frames
        ]).options(show_legend=False, aspect=None, responsive=False)

    def make_dist(x):
        """Overlay one distribution of x per chain."""
        return hv.Overlay([
            hv.Distribution(frame, [x], [f'p({x})'])
            for frame in chain_frames
        ]).options(show_legend=False, ylabel=x, aspect=None, responsive=False)

    return hv.GridMatrix({
        (x, y): make_dist(x) if x == y else make_scatter(x, y)
        for x, y in product(variables, variables)
    })

In [40]:
# Download the per-region daily report and parse the 'data' column as timestamps.
regioni = pd.read_csv(
    'https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv',
    parse_dates=['data'],
)

# Keep only the Lombardia rows, truncate each timestamp to its calendar date
# (dropping the time of day), and order the rows chronologically.
lombardia = (
    regioni.loc[regioni['denominazione_regione'] == 'Lombardia']
    .assign(data=lambda rows: pd.to_datetime(rows['data'].dt.date))
    .sort_values('data')
)

We will divide the population by the state they are in and into age groups. We will track the population in each bin over time and create many plausible samples. Additionally, all groups can be either detected or undetected, recovered or not, alive or deceased. (Some groups will always be zero but that should not be a problem.) This means flow through the states is always one step at a time and in one direction. That makes it easy to divide the progress into stages, for example by symptom level. So our data will have seven dimensions:

samples $\times$ state $\times$ age $\times$ confirmed $\times$ recovered $\times$ deceased $\times$ time

Placing time last lets us have one dimensional vectors for scalar time-series (like the time vector itself). Some parameters will vary over age groups, others will not. Most parameters will be constant in time. We will use broadcasting to use these parameters over all values in dimensions in which they are constant.

Much of the motivation for this decomposition comes from the following document: https://www.epicentro.iss.it/coronavirus/bollettino/Infografica_17aprile%20ITA.pdf

It contains some aggregate statistics by age and symptom level as of the end of April 16.

Ignoring the samples dimension, we define the shorthand sacrdt for the remaining dimensions.

The state dimension follows S, E, I0...IN.

Individuals progress from S to E according to parameter beta, which describes both the contagiousness and the level of mixing: for example, staying home reduces mixing and wearing a mask reduces the chance of spreading the virus, so both reduce beta. Only infectious persons can pass the virus and therefore have a nonzero beta, and beta does not depend on age. (This brings along some assumptions, for example that all ages are treated equally when exhibiting similar symptoms. We could expand this to vary by age also but won't yet.) Additionally, beta can vary over time, though we plan to parameterize it to be piecewise constant. So beta has dimensions s1cdt and is zero for S and E and for all recovered and deceased individuals. (This, too, is an assumption.)

Progress through other levels of state occur as an exponential process following rate parameter sigma. Sigma depends on the current state, age, and whether or not the case has been detected (which governs whether or not it receives professional treatment). Recovered and deceased individuals do not change state. Sigma has dimensions sac111.

Individuals start not-recovered but can recover from any state according to rate gamma. Because uninfected persons cannot recover, gamma for state S is always zero. This parameter can also depend on age and detection, so gamma has dimensionality sac111. (Note: the model cell below currently gives gamma a singleton age dimension, i.e. s1c111, so it does not yet vary by age.) It will always be zero for recovered and deceased groups.

Cases are detected (marked confirmed) when tested, according to parameter theta. Theta includes both the rate of testing and the true positive rate. False positives are not considered here (yet). As such, state S is associated with theta=0. The decision to allow testing of recovered and deceased groups should follow actual practices, which I do not know at the moment. For now, we will model no testing of recovered and deceased groups. We will assume for now that testing does not depend on age. Therefore theta has dimensions s11111.

Death is modelled in the same fashion as recovery, governed by the parameter mu. Only death due to COVID-19 is modelled, so mu for state S and for recovered individuals is always zero. To model asymptomatic spread, both the exposed and first infectious levels will have zero lethality.

In [None]:
# TODO: Should death just be a state above critical?

# Susceptible, exposed, plus 5 levels of infectious, the first of which is asymptomatic
N_INERT_STATES = 2
N_HIDDEN_STATES = 1
N_LETHAL_STATES = 4  # any state with symptoms is considered lethal (even if lethality is very low)
N_STATES = N_INERT_STATES + N_HIDDEN_STATES + N_LETHAL_STATES
N_AGES = 2  # Number of age groups
SD = 1  # Default sigma for weakly informative Lognormal priors

# NOTE(review): presumably the resident population of Lombardia — confirm the source.
POPULATION = 10_060_574

# Daily time grid of the model; T0/T1 are the start/end points of each one-day step.
T = pd.date_range('1 Feb 2020', '1 June 2020', freq='1d')
N_T = len(T)
T0 = T[:-1]
T1 = T[1:]
# TODO: Both DT and SQRT_DT are all ones for now.
# If this resolution is good enough we should remove them.
# If not, but constant spacing is fine, we should make them scalars.
DT = 1.0  # (T1 - T0).days
SQRT_DT = np.sqrt(DT)

# Must stay sorted ascending: the era lookup below relies on it.
ERA_STARTS = pd.to_datetime([
    '1 Jan 1900',  # before start of T for easier indexing
    '8 Mar 2020',
    '21 Mar 2020',
])
N_ERAS = len(ERA_STARTS)  # Number of different time periods (characterized by different values of beta)
# For every day in T, the index of the latest era that starts on or before that day.
# side='right' counts the era starts <= day; subtracting one turns the count into an index.
# This works directly on the datetime64 values (no integer cast, no float interpolation).
ERA_INDICES = np.searchsorted(ERA_STARTS.values, T.values, side='right') - 1

# Position in T of the day the published age/symptom statistics refer to.
T_STATS = pd.to_datetime('16 April 2020')
I_STATS = np.where(T == T_STATS)[0][0]
# Positions in T of the first and last observed day. The `[0][0]` lookup raises
# IndexError if the date is not present in T, so the observation dates in
# `lombardia` must all fall inside the modelled window.
I_FIRST = np.where(T == lombardia['data'].min())[0][0]
I_LAST = np.where(T == lombardia['data'].max())[0][0]

# Build and sample the compartment model: latent populations `y` for every
# (state, age, confirmed, recovered, deceased) bin on every day of T, rate
# parameters defined as reciprocals of Lognormal variables, a one-step
# (y0 + DT * dy) transition term added as a Potential, Normal observation
# models on the Lombardia columns, and soft constraints expressed as
# Normal "observations".
with pm.Model() as model:
    
    #  state  ×  age  ×  confirmed  ×  recovered  ×  deceased  ×  time
    
    # All the states
    # One positive latent value per bin and per day.
    y = pm.Lognormal('y', mu=np.log(2e4), sd=2, shape=(N_STATES, N_AGES, 2, 2, 2, N_T))
    
    # All living, non-recovered individuals in states above exposed can pass the virus
    # beta does not depend on age
    # 'ibeta' is the sampled variable; beta is its reciprocal. One value per
    # infectious state, confirmed flag and era.
    beta = 1 / pm.Lognormal('ibeta', mu=np.log(5), sd=SD,
                            shape=(N_HIDDEN_STATES + N_LETHAL_STATES, 1, 2, 1, 1, N_ERAS))
    # Expand the era axis to one entry per time step (all days but the last).
    beta = beta[..., ERA_INDICES[:-1]]
    
    # All states except susceptible and critical can progress
    # Progression depends on age and detection status
    sigma = 1 / pm.Lognormal('isigma', mu=np.log(5), sd=SD, shape=(N_STATES - 2, N_AGES, 2, 1, 1, 1))
    # Pad the recovered and deceased axes to length 2 with zero rates.
    sigma = tt.concatenate([sigma, np.zeros((N_STATES - 2, N_AGES, 2, 1, 1, 1))], axis=3)  # recovered can't progress
    sigma = tt.concatenate([sigma, np.zeros((N_STATES - 2, N_AGES, 2, 2, 1, 1))], axis=4)  # dead can't progress
    
    # Testing: assumptions here should be verified. Are recovered or deceased individuals tested?
    # How is their data incorporated? Is it back-dated or listed at the test date?
    # Only the N_LETHAL_STATES (symptomatic) states get a nonzero detection rate;
    # the leading inert and hidden states are filled with zeros.
    theta = 1 / pm.Lognormal('itheta', mu=np.log(5), sd=SD, shape=(N_LETHAL_STATES, 1, 1, 1, 1, 1))
    theta = tt.concatenate([np.zeros((N_INERT_STATES + N_HIDDEN_STATES, 1, 1, 1, 1, 1)), theta], axis=0)
    theta = tt.concatenate([theta, np.zeros((N_STATES, 1, 1, 1, 1, 1))], axis=3)  # recovered aren't tested
    theta = tt.concatenate([theta, np.zeros((N_STATES, 1, 1, 2, 1, 1))], axis=4)  # dead aren't tested
    
    # Recovery
    # NOTE(review): the age axis has length 1 here, so gamma does not vary by age,
    # although the accompanying text describes gamma as age-dependent — confirm intent.
    gamma = 1 / pm.Lognormal('igamma', mu=np.log(5), sd=SD, shape=(N_STATES - 1, 1, 2, 1, 1, 1))
    # State 0 (susceptible) gets a zero recovery rate.
    gamma = tt.concatenate([np.zeros((1, 1, 2, 1, 1, 1)), gamma], axis=0)
    gamma = tt.concatenate([gamma, np.zeros((N_STATES, 1, 2, 1, 1, 1))], axis=4)  # dead can't recover
    
    # Lethality
    # NOTE(review): the name `mu` is rebound further down to the mean of the
    # transition distribution; `deaths` is computed before that happens.
    mu = 1 / pm.Lognormal('imu', mu=np.log(50), sd=SD, shape=(N_LETHAL_STATES, N_AGES, 2, 1, 1, 1))
    mu = tt.concatenate([np.zeros((N_INERT_STATES + N_HIDDEN_STATES, N_AGES, 2, 1, 1, 1)), mu], axis=0)
    mu = tt.concatenate([mu, np.zeros((N_STATES, N_AGES, 2, 1, 1, 1))], axis=3)  # recovered can't die
    
    # Compute the likelihood of each state based on the SDE and the prior state
    # y0: every day except the last, i.e. the state at the start of each step.
    y0 = y[..., :-1]
    
    # Susceptibles (state 0) times the beta-weighted sum, over every non-time axis,
    # of the non-recovered, non-deceased population in states N_INERT_STATES and up,
    # divided by POPULATION.
    newly_exposed = y0[:1] * tt.sum(y0[N_INERT_STATES:, :, :, :1, :1] * beta, axis=[0, 1, 2, 3, 4], keepdims=True) / POPULATION
    # Outflow of every state except the first and the last.
    disease_progressed = y0[1:-1] * sigma
    # Each flow below is computed from the "0" slice of its own axis
    # (unconfirmed / not recovered / not deceased) ...
    detections = y0[:, :, :1, :, :, :] * theta
    recoveries = y0[:, :, :, :1, :, :] * gamma
    deaths = y0[:, :, :, :, :1, :] * mu
    
    # ... and is subtracted from that slice and added to the "1" slice of the same axis.
    # Along the state axis: state 0 loses newly_exposed, state 1 gains it, and each
    # later state gains what progressed out of the state before it.
    dy = tt.concatenate([-newly_exposed, newly_exposed, disease_progressed], axis=0)
    z = np.zeros((1, N_AGES, 2, 2, 2, N_T-1))
    # Remove the progressed amounts from their source states (zero for first and last state).
    dy += tt.concatenate([z, -disease_progressed, z], axis=0)
    dy += tt.concatenate([-detections, detections], axis=2)
    dy += tt.concatenate([-recoveries, recoveries], axis=3)
    dy += tt.concatenate([-deaths, deaths], axis=4)
    
    # Mean and standard deviation of the next-day state given the current one.
    # The sd combines a fixed term (100) with a term proportional to y0 (1%).
    mu = y0 + DT * dy
    sd = np.sqrt(DT * ((100**2) + (0.01**2 * y0 * y0)))
    # Sum the Normal log-density of every day after the first and add it to the model.
    logp = tt.sum(pm.Normal.dist(mu=mu, sigma=sd).logp(y[..., 1:]))
    pm.Potential('sde', logp)
    
    # Add simple observation models for the data
    def simple_obs_model(name, mu):
        """Observe the Lombardia column `name` as a Normal around the summed bins `mu`.

        `mu` is summed over every axis except time; the sd combines a fixed
        term (100) with a term proportional to the mean (10%).
        """
        mu = tt.sum(mu, axis=(0, 1, 2, 3, 4))
        sd = np.sqrt(100 ** 2 + 0.10 ** 2 * mu * mu)
        # NOTE(review): assumes lombardia has exactly one row per day from I_FIRST
        # to I_LAST so the observed series lines up with the time slice — TODO confirm.
        return pm.Normal(name, mu=mu, sd=sd, observed=lombardia[name])

    # Restrict the latent states to the days covered by the data.
    y1 = y[..., I_FIRST:I_LAST+1]
    # Total confirmed cases: we assume this includes all detected cases.
    # By including recovered and deceased, we ensure this includes all past cases in addition to current ones.
    simple_obs_model('totale_casi', y1[:, :, 1:, :, :])
    # Deceased: just from detected cases
    simple_obs_model('deceduti', y1[:, :, 1:, :, 1:])
    # Home isolation: includes only detected cases not admitted to the hospital
    # Should it include presymptomatic cases? Is it current or total? We treat it as total here.
    # (all states except the last two)
    simple_obs_model('isolamento_domiciliare', y1[:-2, :, 1:, :, :])
    # Admitted with symptoms: corresponds to severe symptoms
    # (second-to-last state)
    simple_obs_model('ricoverati_con_sintomi', y1[-2:-1, :, 1:, :, :])
    # Intensive care: corresponds to critical symptoms
    # (last state)
    simple_obs_model('terapia_intensiva', y1[-1:, :, 1:, :, :])
    # Recovered: all detected, recovered cases that at one point required hospitalization
    # Is this "recovered" as translated on the GitHub page or "discharged, healed" as I would translate it?
    # If the former, this should also include other known cases once recovered.
    # (last two states, confirmed, recovered, not deceased)
    simple_obs_model('dimessi_guariti', y1[-2:, :, 1:, 1:, :1])
    
    # Observe to restrict initial conditions
    # Most categories should be zero at the start
    # NOTE(review): `y0` is rebound here from "all days but the last" to "the first day only".
    y0 = y[..., 0]
    pm.Normal('initial_detected', mu=y0[:, :, 1], sd=10, observed=0)
    pm.Normal('initial_recovered', mu=y0[:, :, 0, 1], sd=10, observed=0)
    pm.Normal('initial_deceased', mu=y0[:, :, 0, 0, 1], sd=10, observed=0)
    pm.Normal('initial_symptomatic', mu=y0[N_INERT_STATES+N_HIDDEN_STATES:, :, 0, 0, 0], sd=10, observed=0)
    # Assume initial exposed and asymptomatic infectious numbers are small with a wide prior
    pm.Normal('initial_asymptomatic', mu=y0[1:N_INERT_STATES+N_HIDDEN_STATES, :, 0, 0, 0], sd=1_000, observed=500)
    
    # Observe to enforce categories that should always be zero
    pm.Normal('recovered_dead', mu=y[:, :, :, 1, 1], sd=10, observed=0)
    pm.Normal('susceptible_recovered', mu=y[0, :, :, 1, :], sd=10, observed=0)
    # The literal 3 equals N_INERT_STATES + N_HIDDEN_STATES with the current constants.
    pm.Normal('nonlethal_dead', mu=y[:3, :, :, :, 1], sd=10, observed=0)
    pm.Normal('confirmed_susceptible', mu=y[0, :, 1, :, :], sd=10, observed=0)
    
    # Observe to enforce total population
    # Sum over every non-time axis gives one total per day; sd=1 makes this a tight constraint.
    total = y.sum(axis=(0, 1, 2, 3, 4))
    pm.Normal('total', mu=total, sd=1, observed=POPULATION)
    
#     # Observe to enforce recorded stats
#     y1 = y[..., I_STATS]
#     # Antibody tests showing past infections could reveal how many people recovered without being detected
#     n_hidden_recovered = tt.sum(y1[:, :, 0, 1])
#     n_confirmed_recovered = tt.sum(y1[:, :, 1, 1])
#     pm.Normal('frac_hidden_recovered', mu=n_hidden_recovered / (n_hidden_recovered + n_confirmed_recovered),
#               sd=0.05, observed=0.50)
#     # Random testing could estimate how many active cases go undetected
#     n_hidden_cases = tt.sum(y1[N_INERT_STATES:, :, 0])
#     n_confirmed_cases = tt.sum(y1[N_INERT_STATES:, :, 1])
#     pm.Normal('frac_hidden_cases', mu=n_hidden_cases / (n_hidden_cases + n_confirmed_cases),
#               sd=0.05, observed=0.80)
    
#     trace = pm.sample(80, tune=20, target_accept=0.99, compute_convergence_checks=False, cores=8, chains=6)
    # 200 draws after 200 tuning steps for each of 6 chains, convergence checks skipped.
    trace = pm.sample(200, tune=200, target_accept=0.99, compute_convergence_checks=False, cores=8, chains=6)

Only 200 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (6 chains in 8 jobs)
NUTS: [imu, igamma, itheta, isigma, ibeta, y]
Sampling 6 chains, 0 divergences: 100%|█████████▉| 2394/2400 [21:19<00:05,  1.09draws/s]

In [None]:
# Render the model's variable dependency graph as the cell output.
pm.model_to_graphviz(model)

In [None]:
# Sanity check: does the summed population of every sampled trajectory stay at POPULATION?
# Every 10th posterior sample is drawn as a faint curve; the dashed grey line is the target.
hv.Overlay([
    hv.Curve(
        (T, sample.sum(axis=(0, 1, 2, 3, 4))),
        'Date',
        'Total Population',
    ).options(alpha=0.2)
    for sample in trace['y'][::10]
]) * hv.HLine(POPULATION).options(color='grey', line_dash='dashed', alpha=0.4)

In [None]:
# Did these categories stay close to zero?
# Each overlay sums, for every 10th posterior sample, the bins that the model
# softly constrains to zero (the pm.Normal(..., observed=0) terms with the same names).
(
    hv.Overlay([
        hv.Curve((T, np.sum(yi[:, :, :, 1:, 1:], axis=(0, 1, 2, 3, 4))), 'Date', '#', label='recovered_dead')
        .options(alpha=0.2, color='red')
        for yi in trace['y'][::10]
    ]) *
    hv.Overlay([
        hv.Curve((T, np.sum(yi[:1, :, :, 1:, :], axis=(0, 1, 2, 3, 4))), 'Date', '#', label='susceptible_recovered')
        .options(alpha=0.2, color='green')
        for yi in trace['y'][::10]
    ]) *
    hv.Overlay([
        hv.Curve((T, np.sum(yi[:3, :, :, :, 1:], axis=(0, 1, 2, 3, 4))), 'Date', '#', label='nonlethal_dead')
        .options(alpha=0.2, color='yellow')
        for yi in trace['y'][::10]
    ]) *
    hv.Overlay([
        # Index 1 on the confirmed axis selects the *confirmed* susceptibles, matching the
        # model constraint y[0, :, 1, :, :]; a `:1` slice would plot the unconfirmed ones.
        hv.Curve((T, np.sum(yi[:1, :, 1:, :, :], axis=(0, 1, 2, 3, 4))), 'Date', '#', label='confirmed_susceptible')
        .options(alpha=0.2, color='blue')
        for yi in trace['y'][::10]
    ]) *
    # NOTE(review): a reference line at POPULATION dominates the y-range of a
    # "close to zero" check — consider whether a line at 0 was intended.
    hv.HLine(POPULATION).options(color='grey', line_dash='dashed', alpha=0.4)
).options(show_legend=True)

In [60]:
# Confirmed cases over time: sum of every bin with confirmed index 1
# (any state, age, recovery or death status), one faint curve per 10th posterior sample.
hv.Overlay([
    hv.Curve(
        (T, sample[:, :, 1:, :, :].sum(axis=(0, 1, 2, 3, 4))),
        'Date',
        'Confirmed Cases',
    ).options(alpha=0.2)
    for sample in trace['y'][::10]
])

In [61]:
# Currently active confirmed cases: confirmed index 1, restricted to the
# not-recovered and not-deceased slices, one faint curve per 10th posterior sample.
hv.Overlay([
    hv.Curve(
        (T, sample[:, :, 1:, :1, :1].sum(axis=(0, 1, 2, 3, 4))),
        'Date',
        'Current Discovered Cases',
    ).options(alpha=0.2)
    for sample in trace['y'][::10]
])

In [62]:
# Deaths over time: sum of every bin with deceased index 1 (confirmed or not),
# one faint curve per 10th posterior sample.
hv.Overlay([
    hv.Curve(
        (T, sample[:, :, :, :, 1:].sum(axis=(0, 1, 2, 3, 4))),
        'Date',
        'Deaths',
    ).options(alpha=0.2)
    for sample in trace['y'][::10]
])

In [63]:
# Distribution and trace plots for the 'isigma' variable only; force=True bypasses
# plot_trace's limit on the number of variable columns.
plot_trace(trace, varnames=['isigma'], force=True)