In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.integrate import odeint
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt  

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

def getConfirmedCases(country, startingAt=5,
                      path="/kaggle/input/covid19-global-forecasting-week-2/train.csv"):
    """Return one country's confirmed-case counts that exceed a threshold.

    Parameters
    ----------
    country : str
        Value matched exactly against the ``Country_Region`` column.
    startingAt : float, default 5
        Only rows whose ``ConfirmedCases`` is strictly greater than this
        value are returned.
    path : str
        CSV file to read. It must contain the ``Country_Region`` and
        ``ConfirmedCases`` columns. Defaults to the Kaggle week-2 training
        file used throughout this notebook.

    Returns
    -------
    pandas.Series
        The selected ``ConfirmedCases`` values, in file order and keeping the
        row index of the CSV. Empty when no row matches.
    """
    data = pd.read_csv(path)
    casesOfCountry = data.loc[data["Country_Region"] == country, "ConfirmedCases"]
    # NaN compares False against the threshold, so missing counts are dropped here too.
    return casesOfCountry[casesOfCountry > startingAt]



# Any results you write to the current directory are saved as output.

* So I guess the SIR model is known to people who visit this kind of website by now. Just in case: https://en.wikipedia.org/wiki/Compartmental_models_in_epidemiology#The_SIR_model.

In this notebook I will use it in a slightly different context than that of the Wikipedia article, though. In the data supplied by Johns Hopkins we obviously don't have access to the number of people walking around downtown carrying the virus, the variable I (infected), but we do have access to the confirmed cases and recovered cases.

Let's assume that people, upon being diagnosed with Covid, will be successfully quarantined/isolated and unable to spread the disease, so that only the unconfirmed cases (which we don't have data for) can spread the disease. Then we can use the same differential equations as in SIR, but with the variable R (recovered) reinterpreted as R for quarantined (or Q if you'd prefer that, I guess). The parameter gamma of the SIR model will then indicate the chance that an infected, not yet quarantined person will test positive and subsequently be quarantined.

I will keep the variables of the O.G. SIR model in my notebook because they somehow stuck to my brain as it were..


In [None]:
def SIR(X, t, beta, gamma, N):
    """
    Right-hand side of the SIR equations in the form odeint expects.

    The state is S = X[0], I = X[1], R = X[2]; t is accepted but not used.
    Returns the derivatives [dS/dt, dI/dt, dR/dt] as a list.
    """
    susceptible, infected = X[0], X[1]
    newInfections = beta*susceptible*infected/N  # flow from S to I
    removals = gamma*infected                    # flow from I to R
    return [-newInfections, newInfections - removals, removals]


We forward-integrate the SIR model for a population the size of Italy's to get the following disastrous graph for what could happen. 

In [None]:
    # Demo scenario: 60 million people, five initial carriers, 100 daily time points.
    N = 6*10**7
    beta, gamma = 0.9, 0.2
    I0 = 5
    X0 = [N-I0, I0, 0]
    ts = np.linspace(0, 100 - 1, 100)
    Xs = odeint(SIR, X0, ts, args=(beta, gamma, N))

    f = plt.figure(figsize=(10,5))
    # One curve per compartment; the columns of Xs follow the S, I, R order of X0.
    for column, label in enumerate(('susceptible',
                                    'infected, not yet quarantined',
                                    'confirmed infections')):
        plt.plot(ts, Xs[:,column], label=label);
    plt.xlabel("time [days]", fontsize=10);
    plt.ylabel("number of people", fontsize=10);
    plt.legend()
    plt.show()

1. Ok but what if we fit a SIR model to data?

In [None]:

def confirmedSIR(t, beta, gamma):
    """Confirmed-cases curve (the R compartment) of the SIR model at times t.

    This is the model function handed to curve_fit. The signature must stay
    exactly (t, beta, gamma): curve_fit is called without p0 in this notebook,
    so it infers the number of fitted parameters from this signature.

    Parameters
    ----------
    t : array-like
        Time points; the initial state applies at t[0].
    beta, gamma : float
        Infection and quarantine rates passed on to the SIR equations.

    Returns
    -------
    numpy.ndarray
        One value per time point: column 2 (R) of the integrated state.
    """
    # Delegate to allSIR so the population size and initial state live in one place.
    return allSIR(t, beta, gamma)[:,2]

def allSIR(t, beta, gamma):
    """Integrate the SIR model over the times t and return every compartment.

    The population is fixed at 6*10**7 with a single initially infected
    person and nobody confirmed yet. The result has one row per entry of t
    and the columns S, I, R.
    """
    population = 6*10**7
    seedInfected = 1
    initialState = [population - seedInfected, seedInfected, 0]
    trajectory = odeint(SIR, initialState, t, args=(beta, gamma, population))
    return trajectory


In [None]:
    # Fit beta and gamma of the SIR model to Spain's confirmed cases, then
    # extrapolate the fitted model over a longer horizon.
    observed = getConfirmedCases("Spain")

    # One time point per observation: days since the first observation kept.
    ts = np.linspace(0, len(observed) - 1, len(observed))

    popt, pcov = curve_fit(confirmedSIR, ts, observed)

    longerTime = np.linspace(0, 100, 1000)
    fitted = allSIR(longerTime, *popt)

    f = plt.figure(figsize=(10,5))
    plt.title("Best fitting SIR model for Spain")
    plt.plot(ts, observed, "ro", label='Observed confirmed cases')
    plt.plot(longerTime, fitted[:,1], label='Infected, not quarantined yet')
    plt.plot(longerTime, fitted[:,2], label='Confirmed cases')
    plt.xlabel("time [days]", fontsize=10)
    plt.ylabel("number of people", fontsize=10)
    plt.legend()
    plt.show()
              

OK, 400 000 infected in the end, huh. The result is a lot more optimistic than I had thought from such a model. Remember that this model can't change its parameters beta and gamma, meaning it can't take 'actions' such as improved hygiene or social distancing into account.

The way it appears to achieve its optimism is by setting a very small number of people to be infected on the street at a time. We can print its parameters to confirm this:

In [None]:
# popt comes from the curve_fit call on confirmedSIR, so it holds (beta, gamma) in that order.
fittedBeta, fittedGamma = popt[0], popt[1]
print("fitted infectiousness", fittedBeta)
print("fitted chance of being quarantined when sick: ", fittedGamma)

They're almost the same value! 

If you look at the SIR differential equations you will see that when these parameters are similar and S is almost N (the population size), approximately as many people fall sick as are quarantined, so the number of unquarantined people stays small. As S decreases, the balance shifts so that fewer people fall sick than are isolated. In the end only a small part of the population will catch the disease.

In [None]:
def plotSIR(country, usingLogScale=False, I0=5):
    """Fit the SIR model to one country's confirmed cases and plot the result.

    Two fits are made with curve_fit on confirmedSIR: one on all observations
    and one on all but the last ``shortening`` (10) observations. Three
    stacked panels are drawn and shown:

    1. observed confirmed cases with both fitted curves,
    2. the fitted number of infected, not yet quarantined people,
    3. the residuals (fitted minus observed) of both fits.

    Parameters
    ----------
    country : str
        Country name passed to getConfirmedCases.
    usingLogScale : bool, default False
        If true, the first panel shows the natural logarithm of the counts.
    I0 : float, default 5
        Passed to getConfirmedCases as the threshold: only counts strictly
        above it are used. The model's own initial state is fixed inside
        confirmedSIR/allSIR and is not affected by this value.

    Raises
    ------
    ValueError
        If fewer than ``shortening + 2`` observations remain, because the
        partial-data fit of two parameters needs at least two points.
    """
    # Work on a plain array so slicing and arithmetic are purely positional.
    observed = getConfirmedCases(country, I0).to_numpy()
    observed = observed[~np.isnan(observed)]

    shortening = 10
    nPartial = len(observed) - shortening
    if nPartial < 2:
        raise ValueError(
            "plotSIR needs at least %d observations for %r, got %d"
            % (shortening + 2, country, len(observed)))

    ts = np.linspace(0, len(observed) - 1, len(observed))
    longerTime = np.linspace(0, 100, 1000)

    # Fit on all observations.
    popt, pcov = curve_fit(confirmedSIR, ts, observed)
    fitted = allSIR(longerTime, *popt)
    residuals = confirmedSIR(ts, *popt) - observed

    # Fit on the data with the last `shortening` observations held back.
    tsPartial, observedPartial = ts[:nPartial], observed[:nPartial]
    popt_partialData, pcov_partialData = curve_fit(confirmedSIR,
                                                   tsPartial,
                                                   observedPartial)
    fitted_partialData = allSIR(longerTime, *popt_partialData)
    residuals_partialData = confirmedSIR(tsPartial, *popt_partialData) - observedPartial

    if usingLogScale:
        casesLabel = "Logarithm of confirmed infections"
        # The model starts with zero confirmed cases, so log(0) = -inf at t = 0
        # is expected; silence the warning instead of letting it clutter the output.
        with np.errstate(divide='ignore', invalid='ignore'):
            shownObserved = np.log(observed)
            shownFitted = np.log(fitted[:,2])
            shownPartial = np.log(fitted_partialData[:,2])
    else:
        casesLabel = "Confirmed infections"
        shownObserved = observed
        shownFitted = fitted[:,2]
        shownPartial = fitted_partialData[:,2]

    fig, (axCases, axInfected, axResiduals) = plt.subplots(3, 1, figsize=(20,10))

    axCases.set_title("SIR model for " + country)
    axCases.plot(ts, shownObserved, 'ro', label='observed')
    axCases.plot(longerTime, shownFitted, label='fitted')
    axCases.plot(longerTime, shownPartial, label='fitted on partial data')
    axCases.set_xlabel("time [days]", fontsize=10)
    axCases.set_ylabel(casesLabel, fontsize=10)
    # The legend is created only after the curves exist, so it has entries to show.
    axCases.legend()

    axInfected.set_ylabel("Infected, not quarantined")
    axInfected.plot(longerTime, fitted[:,1], label='fitted')
    axInfected.plot(longerTime, fitted_partialData[:,1], label='fitted on partial data')
    axInfected.legend()

    axResiduals.set_ylabel("Residuals")
    axResiduals.plot(ts, residuals, 'o', label='fitted')
    axResiduals.plot(tsPartial, residuals_partialData, 'o', label='fitted on partial data')
    # Share the time range of the panels above so the three plots line up.
    axResiduals.set_xlim(0, longerTime[-1])
    axResiduals.set_xlabel("time [days]", fontsize=10)
    axResiduals.legend()

    plt.show()


Let's plot fitted models for some more countries. I don't believe the fit is very good, so I will plot the residuals and also fit models with a reduced amount of observed data. As you can see, the residuals aren't wonderful (ideally they should be spread around zero with no time correlation, I believe).

In [None]:
# Same log-scale diagnostic plot for each country of interest.
for country in ("Spain", "Italy"):
    plotSIR(country, usingLogScale=True)

OK but now I predicted what will happen in Spain before checking whether the model can describe what already has happened in South Korea. Lets have a go at that then.

We can see that the scipy curve fitter does manage to find a fit to the Korean data. Note that I had to cut off the first part of the Korean epidemic because the SIR model has no chance of describing an epidemic that comes in waves. The residuals, however, indicate once more that the fit is not good.

In [None]:
# A higher case threshold (e^4.5, roughly 90) cuts off the first part of the Korean series.
koreaThreshold = np.exp(4.5)
plotSIR("Korea, South", usingLogScale=True, I0=koreaThreshold)

As we are interested in predicting the future behavior of the epidemic we are more interested in evaluating our models in terms of accuracy on the observed samples outside the training set than in terms of accuracy inside the training set, i.e. residuals.

Let's split the fitting/training data into multiple chunks and see whether we would trust their individual predictions. We can then choose some metric, such as the 1-norm, 2-norm, etc., to evaluate the prediction performance. To help visualize the process as well as the goodness of prediction, I will also plot each prediction on its individual test data chunk.

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluateModel(countries=("Spain",), usingPlotting=False):
    """Score out-of-sample SIR predictions with a 5-fold time-series split.

    For every country the confirmed-case series is split with
    TimeSeriesSplit(n_splits=5). In each fold the model is fitted on the
    training indices and its prediction is compared with the observations
    at the test indices using the mean absolute error.

    Parameters
    ----------
    countries : iterable of str, or a single str
        Countries to evaluate; a bare string is treated as one country.
    usingPlotting : bool, default False
        If true, show one figure per country with the logarithm of the
        observations and of each fold's prediction on its test chunk.

    Returns
    -------
    (parameters, scores) : tuple of lists
        ``parameters`` holds the fitted (beta, gamma) array of each fold and
        ``scores`` the matching mean absolute errors, both in fold order and
        concatenated over the countries.
    """
    if isinstance(countries, str):
        # Avoid iterating over the characters of a single country name.
        countries = [countries]

    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    parameters = []

    for country in countries:
        observed = getConfirmedCases(country).to_numpy()
        observed = observed[~np.isnan(observed)]

        # Predict on every observed day so the test indices are always valid,
        # whatever the length of the series.
        days = np.arange(len(observed), dtype=float)

        for train, test in tscv.split(observed):
            # The split yields integer positions, which double as day numbers.
            popt, pcov = curve_fit(confirmedSIR, train, observed[train])
            predicted = confirmedSIR(days, *popt)

            scores.append(mean_absolute_error(observed[test], predicted[test]))
            parameters.append(popt)
            if usingPlotting:
                plt.plot(test, np.log(predicted[test]))
        if usingPlotting:
            plt.plot(np.log(observed), 'o')
            plt.title("Out-of-sample SIR predictions for " + country)
            plt.xlabel("time [days]", fontsize=10)
            plt.ylabel("Logarithm of confirmed infections", fontsize=10)
            plt.show()
    return parameters, scores


In [None]:
params, scores = evaluateModel(usingPlotting=True)
print("Prediction error: ", *scores)

Turns out the last three predictions were more aggressive than the observed data. Due to the log scale of the plot they look OK, but looking at the prediction errors (using the mean absolute error for simplicity) we see that they are quite a way off.

It is interesting to think about how one should evaluate the prediction error. One way of looking at it is that the mean absolute error, and even more so the mean squared error, are less useful for evaluating this model on this type of problem, since they will be very punishing for the last few data points. Glancing at the documentation for scipy's curve fitting function, https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html, we find that it uses least squares, i.e. it minimizes the (mean) squared error, to find good parameters. This sounds like it could be a problem when fitting in linear scale.