In [None]:
import datetime

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from scipy.integrate import odeint
from scipy.optimize import differential_evolution

from sklearn.metrics import r2_score

# The Dataset

In [None]:
# Load the Turkey COVID-19 table. Later cells read its "Active Cases",
# "Total Recovered" and "Total Deaths" columns and use its row count as the
# number of simulated days.
covid19_df = pd.read_csv(filepath_or_buffer="../input/covid19turkey/Covid19-Turkey.csv")

# Show the last five rows as the cell output.
covid19_df.tail(n=5)

# The SIR Model (Kermack-McKendrick)

The model consists of a system of three coupled nonlinear ordinary differential equations,

- Susceptibles: $ \frac{dS}{dt} = -\frac{\beta S I}{N} $,
- Infectives: $ \frac{dI}{dt} = \frac{\beta S I}{N} - \gamma I $,
- Removed: $ \frac{dR}{dt} = \gamma I $,

where $N=S + I + R$, $t$ is time, $S(t)$ is the number of susceptible people, $I(t)$ is the number of people infected, $R(t)$ is the number of people who have recovered and developed immunity to the infection, $\beta$ is the infection rate, and $\gamma$ is the recovery rate. 

In [None]:
def SIR(x, t, BETA, GAMMA):
    """Right-hand side of the SIR ordinary differential equations.

    Written in the ``func(y, t, *args)`` form that ``odeint`` is called with
    elsewhere in this notebook.

    Parameters
    ----------
    x : sequence of 3 numbers
        Current state ``(S, I, R)``.
    t : float
        Current time. Unused: the system does not depend on time explicitly.
    BETA : float
        Infection rate.
    GAMMA : float
        Recovery (removal) rate.

    Returns
    -------
    tuple
        ``(dS/dt, dI/dt, dR/dt)`` evaluated at ``x``.
    """
    susceptible, infected, removed = x
    total = susceptible + infected + removed

    # Flow S -> I (new infections) and flow I -> R (removals).
    new_infections = (BETA * susceptible * infected) / total
    removals = GAMMA * infected

    return -new_infections, new_infections - removals, removals

# The SIRD Model

The Susceptible Infectious Recovered Deceased (SIRD) Model differentiates between Recovered (meaning specifically individuals having survived the disease and now immune) and Deceased. This model uses the following system of differential equations:

- Susceptibles: $ \frac{dS}{dt} = -\frac{\beta S I}{N} $,
- Infectives: $ \frac{dI}{dt} = \frac{\beta S I}{N} - \gamma I - \mu I$,
- Recovered: $ \frac{dR}{dt} = \gamma I $,
- Deceased: $ \frac{dD}{dt} = \mu I $,

where $\beta$, $\gamma$, and $\mu$ are the rates of infection, recovery, and mortality, respectively.

In [None]:
def SIRD(x, t, BETA, GAMMA, MU):
    """Right-hand side of the SIRD ordinary differential equations.

    Written in the ``func(y, t, *args)`` form that ``odeint`` is called with
    elsewhere in this notebook.

    Parameters
    ----------
    x : sequence of 4 numbers
        Current state ``(S, I, R, D)``.
    t : float
        Current time. Unused: the system does not depend on time explicitly.
    BETA : float
        Infection rate.
    GAMMA : float
        Recovery rate.
    MU : float
        Mortality rate.

    Returns
    -------
    tuple
        ``(dS/dt, dI/dt, dR/dt, dD/dt)`` evaluated at ``x``.
    """
    susceptible, infected, recovered, deceased = x
    total = susceptible + infected + recovered + deceased

    # Flow S -> I, flow I -> R and flow I -> D.
    new_infections = (BETA * susceptible * infected) / total
    recoveries = GAMMA * infected
    deaths = MU * infected

    return -new_infections, new_infections - recoveries - deaths, recoveries, deaths

# Simulation Parameters

In [None]:
# Time grid handed to the ODE solver: 10 samples per day over the period
# covered by the dataset (one dataset row per day).
T = len(covid19_df) * 10
day_count = len(covid19_df)

# endpoint=False makes the spacing exactly day_count / T = 0.1 day, so every
# 10th sample lands on an integer day (up to floating-point rounding). With the
# default endpoint=True the spacing is day_count / (T - 1), and the samples
# that later cells pick with a stride of 10 drift away from the observation
# days as the index grows.
t = np.linspace(0, day_count, T, endpoint=False)

In [None]:
population = 83_154_997 # Based on December 2019

# Initial states are expressed as FRACTIONS of the population (not head counts):
# one infected individual, nobody removed yet, everybody else susceptible.
# The susceptible share is 1 - 1/population so that the compartments sum to 1.
sir_y0 = (1.0 - 1 / population, 1 / population, 0) # fractions: susceptible, infected, removed
sird_y0 = (1.0 - 1 / population, 1 / population, 0, 0) # fractions: susceptible, infected, recovered, deceased

# Optimization

## The SIR Model

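The choice of loss function is based on three alternative assumptions about which published figures are reliable. Each loss function is defined as the *Mean Absolute Error (MAE)* of a different quantity. (Note: the code cell below currently minimizes the negative $R^2$ score of the same quantities instead of the MAE.)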

1. The first assumption is that the number of infected people is published correctly and the number of removed people is published incorrectly. In this case, the loss function is as follows:

$$
Loss_1 = \frac{1}{N} \sum_{k=1}^{N} \lvert y_k^I - \hat{y}_k^I \rvert
$$

2. The second assumption is that the number of infected people is published incorrectly and the number of removed people is published correctly. In this case, the loss function is as follows:

$$
Loss_2 = \frac{1}{N} \sum_{k=1}^{N} \lvert y_k^R - \hat{y}_k^R \rvert
$$

3. The third assumption is that both the infected and the removed counts are published correctly. In this case, the loss function is the sum of the loss functions of assumptions one and two:

$$
Loss_3 = Loss_1 + Loss_2
$$

Here $N$ is the number of data points, which in this case is the number of days because the simulation is sampled at one-day intervals (it is not the population size $N$ used in the model equations). Superscripts $I$ and $R$ indicate the values for *infectives* and *removed*, respectively.

In [None]:
def _sir_daily_fractions(BETA, GAMMA):
    """Integrate the SIR model and return its I and R curves sampled once per day.

    The model is solved on the module-level grid ``t`` starting from ``sir_y0``;
    every ``step``-th sample is kept so that the result has one value per row
    of ``covid19_df``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(I, R)``, each of length ``len(covid19_df)``.
    """
    y = odeint(SIR, sir_y0, t, args=(BETA, GAMMA))

    # Number of solver samples per day.
    step = np.round((1 / day_count) * T).astype(int)
    stop = step * len(covid19_df)

    return y[0:stop:step, 1], y[0:stop:step, 2]


def sir_loss_1(x):
    """Objective for fitting the SIR model to the active-case data only.

    Parameters
    ----------
    x : sequence of 2 floats
        Candidate ``(BETA, GAMMA)``.

    Returns
    -------
    float
        Negative R^2 between the observed active-case fraction and the
        simulated I curve (lower is better), or ``np.inf`` if either rate is
        negative.
    """
    BETA, GAMMA = x

    if BETA < 0 or GAMMA < 0:
        return np.inf

    I, _ = _sir_daily_fractions(BETA, GAMMA)
    observed_I = covid19_df["Active Cases"].values / population

    # r2_score(y_true, y_pred): the observations go first, the model second.
    # Swapping them would normalise by the variance of the model output.
    return -r2_score(observed_I, I)


def sir_loss_2(x):
    """Objective for fitting the SIR model to the removed data only.

    "Removed" in the data is ``Total Recovered + Total Deaths``.

    Parameters
    ----------
    x : sequence of 2 floats
        Candidate ``(BETA, GAMMA)``.

    Returns
    -------
    float
        Negative R^2 between the observed removed fraction and the simulated
        R curve (lower is better), or ``np.inf`` if either rate is negative.
    """
    BETA, GAMMA = x

    if BETA < 0 or GAMMA < 0:
        return np.inf

    _, R = _sir_daily_fractions(BETA, GAMMA)
    observed_R = (covid19_df["Total Recovered"].values + covid19_df["Total Deaths"].values) / population

    # r2_score(y_true, y_pred): the observations go first, the model second.
    return -r2_score(observed_R, R)


def sir_loss_3(x):
    """Objective for fitting the SIR model to active-case and removed data jointly.

    Parameters
    ----------
    x : sequence of 2 floats
        Candidate ``(BETA, GAMMA)``.

    Returns
    -------
    float
        Negative sum of the two R^2 scores used by ``sir_loss_1`` and
        ``sir_loss_2`` (lower is better), or ``np.inf`` if either rate is
        negative.
    """
    BETA, GAMMA = x

    if BETA < 0 or GAMMA < 0:
        return np.inf

    I, R = _sir_daily_fractions(BETA, GAMMA)
    observed_I = covid19_df["Active Cases"].values / population
    observed_R = (covid19_df["Total Recovered"].values + covid19_df["Total Deaths"].values) / population

    # r2_score(y_true, y_pred): the observations go first, the model second.
    return -(r2_score(observed_I, I) + r2_score(observed_R, R))

## The SIRD Model

In the fifth version of the notebook, the loss function for the SIRD model is very similar to the Loss 3 of the SIR model. It is defined as follows:

$$
\begin{aligned}
MAE_I &= \frac{1}{N} \sum_{k=1}^{N} \lvert y_k^I - \hat{y}_k^I \rvert \\
MAE_R &= \frac{1}{N} \sum_{k=1}^{N} \lvert y_k^R - \hat{y}_k^R \rvert \\
MAE_D &= \frac{1}{N} \sum_{k=1}^{N} \lvert y_k^D - \hat{y}_k^D \rvert \\
Loss &= MAE_I + MAE_R + MAE_D
\end{aligned}
$$

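The only difference is that the $MAE_D$ term is added for the error computation of the deceased data. Note that the code cell below currently minimizes the negative sum of the three $R^2$ scores instead of the sum of the MAEs.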

In [None]:
def sird_loss(x):
    """Objective for fitting the SIRD model to active, recovered and death data.

    The model is integrated on the module-level grid ``t`` starting from
    ``sird_y0`` and sampled once per row of ``covid19_df``.

    Parameters
    ----------
    x : sequence of 3 floats
        Candidate ``(BETA, GAMMA, MU)``.

    Returns
    -------
    float
        Negative sum of the R^2 scores of the I, R and D curves against the
        "Active Cases", "Total Recovered" and "Total Deaths" columns (each
        divided by ``population``). Lower is better. ``np.inf`` is returned
        if any rate is negative.
    """
    BETA, GAMMA, MU = x

    if BETA < 0 or GAMMA < 0 or MU < 0:
        return np.inf

    y = odeint(SIRD, sird_y0, t, args=(BETA, GAMMA, MU))

    # Number of solver samples per day; keep one sample per dataset row.
    step = np.round((1 / day_count) * T).astype(int)
    stop = step * len(covid19_df)

    I = y[0:stop:step, 1]
    R = y[0:stop:step, 2]
    D = y[0:stop:step, 3]

    observed_I = covid19_df["Active Cases"].values / population
    observed_R = covid19_df["Total Recovered"].values / population
    observed_D = covid19_df["Total Deaths"].values / population

    # r2_score(y_true, y_pred): the observations go first, the model second.
    # Swapping them would normalise by the variance of the model output.
    return -(
        r2_score(observed_I, I)
        + r2_score(observed_R, R)
        + r2_score(observed_D, D)
    )

I used the `differential_evolution` global optimizer from *SciPy*. You can easily import it from the `scipy.optimize` package. The parameters are constrained to the range $[0, 50]$ for Loss 1 of the SIR model and to $[0, 100]$ for all the other fits.

In [None]:
# differential_evolution is a stochastic optimizer. A fixed seed makes the
# fitted parameters (and every number and figure derived from them)
# reproducible across "Restart & Run All".
DE_SEED = 42

# SIR fitted to the active cases only; (BETA, GAMMA) searched in [0, 50]^2.
sir_res_loss_1 = differential_evolution(
    sir_loss_1,
    ((0, 50), (0, 50)),
    tol=0.00001,
    seed=DE_SEED
)

# SIR fitted to the removed only; (BETA, GAMMA) searched in [0, 100]^2.
sir_res_loss_2 = differential_evolution(
    sir_loss_2,
    ((0, 100), (0, 100)),
    tol=0.00001,
    seed=DE_SEED
)

# SIR fitted to both series; (BETA, GAMMA) searched in [0, 100]^2.
sir_res_loss_3 = differential_evolution(
    sir_loss_3,
    ((0, 100), (0, 100)),
    tol=0.00001,
    seed=DE_SEED
)

# SIRD fitted to active, recovered and deceased; (BETA, GAMMA, MU) in [0, 100]^3.
sird_res_loss = differential_evolution(
    sird_loss,
    ((0, 100), (0, 100), (0, 100)),
    tol=0.00001,
    seed=DE_SEED
)

$R_0$ is the basic reproduction number, which estimates how quickly a disease is capable of spreading in a population. Although the currently computed $R_0$ value of 5.66 is extraordinarily high, it does indicate an outbreak ($R_0 > 1$).

In [None]:
# SIR fits: x = (BETA, GAMMA), so R0 = BETA / GAMMA.
for label, res in (
    ("SIR Loss 1:", sir_res_loss_1),
    ("SIR Loss 2:", sir_res_loss_2),
    ("SIR Loss 3:", sir_res_loss_3),
):
    print(label)
    print(res)
    print("R0:", res.x[0] / res.x[1])
    print()

print(50 * "#")
print()

# SIRD fit: x = (BETA, GAMMA, MU). Infectives leave the I compartment at the
# combined rate GAMMA + MU (recovery plus death), so R0 = BETA / (GAMMA + MU),
# not BETA / GAMMA.
print("SIRD Loss:")
print(sird_res_loss)
print("R0:", sird_res_loss.x[0] / (sird_res_loss.x[1] + sird_res_loss.x[2]))

# Results

After running the simulation, I pick out the simulated values that correspond to each of the 147 days from the simulation time. The step is calculated as follows:

$$
\text{STEP}_k = \lfloor \frac{k}{D} \times T \rceil
$$

where $k=1$ for a step of one day, $D=147$ is the total simulation time in days, and $T = 10 \times D = 1470$ is the number of linearly spaced points over the duration of 147 days. This gives a step of $\lfloor \frac{1}{147} \times 1470 \rceil = 10$, so, for example, the 50th day in real life corresponds to the $50 \times 10 = 500$th data point in the simulation.

Using the above formula, I extracted the $S$, $I$, and $R$ values from the simulation for each of the 147 days.

## The SIR Model Results

In [None]:
# Number of solver samples per day; used to pick the samples of `t` that
# correspond to the rows of the dataset.
step = np.round(T * (1 / day_count)).astype(int)

# Re-run the SIR model with the best (BETA, GAMMA) found for each loss.
sir_y_loss_1 = odeint(SIR, sir_y0, t, args=tuple(sir_res_loss_1.x[:2]))
sir_y_loss_2 = odeint(SIR, sir_y0, t, args=tuple(sir_res_loss_2.x[:2]))
sir_y_loss_3 = odeint(SIR, sir_y0, t, args=tuple(sir_res_loss_3.x[:2]))

# Each solution has one column per compartment, ordered (S, I, R);
# transposing lets the three columns be unpacked in one statement.
sir_S_loss_1, sir_I_loss_1, sir_R_loss_1 = sir_y_loss_1.T
sir_S_loss_2, sir_I_loss_2, sir_R_loss_2 = sir_y_loss_2.T
sir_S_loss_3, sir_I_loss_3, sir_R_loss_3 = sir_y_loss_3.T

In the figure below, you can see the 147 days of simulation results together with the real data. The susceptibles are left out of the figures because, in the beginning, their value is so high compared with the other curves that it would make the plot visually unreadable.

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(20, 10), constrained_layout=True)

# Published data as fractions of the population, plotted at the solver samples
# that correspond to the dataset rows (every `step`-th entry of `t`).
observation_days = t[0:step * len(covid19_df):step]
real_infectives = covid19_df["Active Cases"].values / population
real_removed = (covid19_df["Total Recovered"] + covid19_df["Total Deaths"]) / population

# One panel per loss function: (simulated I, simulated R, name used in the title).
sir_panels = (
    (sir_I_loss_1, sir_R_loss_1, "Loss 1"),
    (sir_I_loss_2, sir_R_loss_2, "Loss 2"),
    (sir_I_loss_3, sir_R_loss_3, "Loss 3"),
)

for panel_ax, (sim_I, sim_R, loss_name) in zip(ax, sir_panels):
    panel_ax.plot(t, sim_I, label="# of infectives")
    panel_ax.plot(t, sim_R, label="# of removed")
    panel_ax.plot(observation_days, real_infectives, label="# of real infectives")
    panel_ax.plot(observation_days, real_removed, label="# of real removed")
    panel_ax.set_xlabel("Days")
    panel_ax.set_ylabel("Normalized Population")
    # Take the day count from the data instead of hard-coding it, so the title
    # stays correct when the dataset grows.
    panel_ax.set_title(f"{day_count} Days of Simulation w/ {loss_name}")
    panel_ax.legend()

plt.show()

## The SIRD Model Results

In [None]:
# Number of solver samples per day; used to pick the samples of `t` that
# correspond to the rows of the dataset.
step = np.round(T * (1 / day_count)).astype(int)

# Re-run the SIRD model with the best (BETA, GAMMA, MU) found by the optimizer.
sird_y_loss = odeint(SIRD, sird_y0, t, args=tuple(sird_res_loss.x[:3]))

# The solution has one column per compartment, ordered (S, I, R, D).
sird_S_loss, sird_I_loss, sird_R_loss, sird_D_loss = sird_y_loss.T

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(20, 6), constrained_layout=True)

# Solver samples that correspond to the dataset rows (every `step`-th entry of `t`).
observation_days = t[0:step * len(covid19_df):step]

# Simulated curves (fractions of the population).
ax.plot(t, sird_I_loss, label="# of infectives")
ax.plot(t, sird_R_loss, label="# of recovered")
ax.plot(t, sird_D_loss, label="# of deceased")

# Published data, normalised by the population.
ax.plot(observation_days, covid19_df["Active Cases"].values / population, label="# of real infectives")
ax.plot(observation_days, covid19_df["Total Recovered"] / population, label="# of real recovered")
ax.plot(observation_days, covid19_df["Total Deaths"] / population, label="# of real deceased")

ax.set_xlabel("Days")
ax.set_ylabel("Normalized Population")
# Take the day count from the data instead of hard-coding it, so the title
# stays correct when the dataset grows.
ax.set_title(f"{day_count} Days of Simulation w/ Loss")
ax.legend()

plt.show()

# Conclusion

Which loss function do you think fits the data better than the others?

I think the model with *Loss 1* fits the data better than the others. This suggests that, if the published number of infectives is correct, then the other published numbers are inconsistent with this specific simple SIR model.

In the next versions, I will add more sophisticated models to the notebook. See you.