In [None]:
# Libraries
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import datetime
from time import time
from scipy import stats

from sklearn.model_selection import GroupKFold
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import os
import glob
import copy

import numpy as np
from scipy.integrate import odeint

# Load Data

In [None]:
# Read the three competition CSVs (train, test, sample submission) from the Kaggle input directory.
ca_train, ca_test, ca_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_train.csv',
        '/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_test.csv',
        '/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_submission.csv',
    )
)

# Generic aliases used by later cells; each refers to the same DataFrame object (no copy is made).
train_df, test_df, submission_df = ca_train, ca_test, ca_submission

In [None]:
# Preview the first five rows of the training data.
train_df.head(n=5)

# Simple EDA

In [None]:
# Plot the ConfirmedCases and Fatalities columns against Date on one shared axis.
dates = train_df['Date']
series_to_plot = [
    ('ConfirmedCases', 'mediumaquamarine'),
    ('Fatalities', 'darkgreen'),
]

fig = make_subplots(rows=1, cols=1)

for column, color in series_to_plot:
    fig.add_trace(
        go.Scatter(
            x=dates,
            y=train_df[column],
            mode='lines+markers',
            marker=dict(color=color),
            # Name each trace after the column it shows (both were previously
            # labelled "Original signal") and show the legend so the two
            # series can be told apart without relying on the title.
            name=column,
            showlegend=True,
        ),
        row=1, col=1,
    )

fig.update_layout(height=400, width=800, title_text="ConfirmedCases (pale) vs. Fatalities (dark) ")
fig.show()

What happened on Mar 10?  
https://apnews.com/96e87b81f05f7ec54fc3e0ad152bd25c  
AP news says that:  
>In Oakland, California, thousands of restless passengers who have been stuck aboard a cruise ship hit by the coronavirus waited their turn to get off the vessel and go to U.S. military bases or back to their home countries for two weeks of quarantine.

Some of these passengers might be included in the number of ConfirmedCases.

These features in the training data set don't seem reliable.

## Let's get started with the basic SI model (Susceptible Infected)

First of all let's divide the population into two groups:

* The susceptibles, who are healthy people, the number of susceptibles is denoted as S.
* The infected, who have been infected by the virus, the number of infected is denoted as I.

The number of the total population is denoted as N.

So N = S + I

Let's assume that on day idx there are I[idx] infected people, that each of them meets r people per day, and that the probability of a contacted person becoming infected is B. Only the fraction S[idx]/N of those contacts is still susceptible, so we have:

* S[idx+1] = S[idx] - r*B*I[idx]*S[idx]/N
* I[idx+1] = I[idx] + r*B*I[idx]*S[idx]/N

Then we can start programming it:

In [None]:
# SI model: day-by-day update of the susceptible (S) and infected (I) counts.
N = 1000000          # Total population
r = 10               # This value defines how quickly the disease spreads
B = 0.01             # Probability of being infected

# Day-indexed state arrays; entry 0 holds the initial condition.
S = np.zeros(200)    # Susceptible
I = np.zeros(200)    # Infected
I[0] = 1             # On day 0 there is only one infected person
S[0] = N - I[0]      # Everyone else starts out susceptible

for idx in range(len(I) - 1):
    # The same number of people leaves S and enters I each day.
    new_infections = r*B*I[idx]*S[idx]/N
    S[idx+1] = S[idx] - new_infections
    I[idx+1] = I[idx] + new_infections


In [None]:
# Draw both SI curves against the day index on the current axes.
day_axis = np.arange(200)
sns.lineplot(x=day_axis, y=S, label='Susceptible')
sns.lineplot(x=day_axis, y=I, label='Infected')

What the SI model suggests is that once a single person is infected, eventually the whole population will be infected. But isn't that too simple? How can we improve it?

## Introducing SEIR model (Susceptible, Exposed, Infected and Recovered)

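First of all let's divide the population into four groups:

* The susceptible (S), who are healthy but can still catch the virus.
* The exposed (E), who have caught the virus but have not yet moved to the infected group; in the model below they can already pass it on.
* The infected (I), who can pass the virus on to the susceptible.
* The recovered (R), who have left the infected group and no longer take part in the spread.

The total population stays constant, so N = S + E + I + R.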

In [None]:
# Discrete (one step per day) SEIR simulation without any intervention.
N = 36000000        # Total population
days = 200          # Number of simulated days

# Day-indexed state arrays; entry 0 holds the initial condition.
E = np.zeros(days)  # Exposed
E[0] = 0            # Day 0 exposed
I = np.zeros(days)  # Infected
I[0] = 144          # Day 0 infected
S = np.zeros(days)  # Susceptible
S[0] = N - I[0]     # Day 0 susceptible
R = np.zeros(days)  # Recovered
R[0] = 0

r = 20              # Number of susceptible people contacted by an infected person
B = 0.03            # Probability of spread for infected
a = 0.1             # Probability of converting from exposed to infected
r2 = 20             # Number of susceptible people contacted by an exposed person
B2 = 0.03           # Probability of spread for exposed
y = 0.1             # Probability of recovery


for idx in range(days-1):
    # People newly exposed today, split by who passed the virus on.
    from_infected = r*B*S[idx]*I[idx]/N
    from_exposed = r2*B2*S[idx]*E[idx]/N
    S[idx+1] = S[idx] - from_infected - from_exposed
    E[idx+1] = E[idx] + from_infected - a*E[idx] + from_exposed
    I[idx+1] = I[idx] + a*E[idx] - y*I[idx]
    R[idx+1] = R[idx] + y*I[idx]


# Use `days` for the x-axis so the plot keeps working when the period is changed.
day_axis = np.arange(days)
plt.figure(figsize=(16,9))
sns.lineplot(x=day_axis, y=S, label='Susceptible')
sns.lineplot(x=day_axis, y=I, label='Infected')
sns.lineplot(x=day_axis, y=E, label='Exposed')
sns.lineplot(x=day_axis, y=R, label='Recovered')

# Keep a copy of the infected curve; the next cell reuses the name I.
I_origin = copy.copy(I)

## What if we implement a social-distancing policy?


In [None]:
# Discrete (one step per day) SEIR simulation in which contacts drop after day 10.
N = 36000000        # Total population
days = 200          # Number of simulated days

# Day-indexed state arrays; entry 0 holds the initial condition.
E = np.zeros(days)  # Exposed
E[0] = 0            # Day 0 exposed
I = np.zeros(days)  # Infected
I[0] = 144          # Day 0 infected
S = np.zeros(days)  # Susceptible
S[0] = N - I[0]     # Day 0 susceptible
R = np.zeros(days)  # Recovered
R[0] = 0

r = 20              # Number of susceptible people contacted by an infected person
B = 0.03            # Probability of spread for infected
a = 0.1             # Probability of converting from exposed to infected
r2 = 20             # Number of susceptible people contacted by an exposed person
B2 = 0.03           # Probability of spread for exposed
y = 0.1             # Probability of recovery


for idx in range(days-1):
    if idx>10:
        # Social distancing: both contact counts fall from 20 to 5.
        r = 5
        r2 = 5
    # People newly exposed today, split by who passed the virus on.
    from_infected = r*B*S[idx]*I[idx]/N
    from_exposed = r2*B2*S[idx]*E[idx]/N
    S[idx+1] = S[idx] - from_infected - from_exposed
    E[idx+1] = E[idx] + from_infected - a*E[idx] + from_exposed
    I[idx+1] = I[idx] + a*E[idx] - y*I[idx]
    R[idx+1] = R[idx] + y*I[idx]

# Use `days` for the x-axis so the plot keeps working when the period is changed.
day_axis = np.arange(days)
plt.figure(figsize=(16,9))
sns.lineplot(x=day_axis, y=S, label='Susceptible')  # label previously misspelled 'Secestible'
sns.lineplot(x=day_axis, y=I, label='Infected')
sns.lineplot(x=day_axis, y=E, label='Exposed')
sns.lineplot(x=day_axis, y=R, label='Recovered')

# Keep a copy of the infected curve for the comparison plot.
I_sd = copy.copy(I)

In [None]:
# Overlay the two infected curves (without / with social distancing) on one figure.
plt.figure(figsize=(16,9))
day_axis = np.arange(200)
sns.lineplot(x=day_axis, y=I_origin, label='Infected w/o social distancing')
sns.lineplot(x=day_axis, y=I_sd, label='Infected w/ social distancing')

It's clear that the social-distancing policy can effectively flatten the peak of infections. So during this challenging time, the most helpful thing each of us can do is apparently to stay at home!


## A more formal SEIR model
https://towardsdatascience.com/social-distancing-to-slow-the-coronavirus-768292f04296  
https://scipython.com/book/chapter-8-scipy/additional-examples/the-sir-epidemic-model/  
We owe to:  
https://towardsdatascience.com/modelling-the-coronavirus-epidemic-spreading-in-a-city-with-python-babd14d82fa2  
https://qiita.com/kotai2003/items/ed28fb723a335a873061 (Japanese)  
https://arxiv.org/abs/2002.06563  

 



The SEIR model is a compartmental model for modeling how a disease spreads through a population. It’s an acronym for Susceptible, Exposed, Infected, Recovered. When a disease is introduced to a population, the people move from one of these classes (or compartments) to the next. When they reach the R state, they’re no longer able to be infected, depending on your interpretation, they either survived the disease and are now immune or succumbed to the illness and are out of the population.
This is an extension of the classic SIR model and simply adds one more equation to show those who are exposed. The full model is given below:

![alt text](https://miro.medium.com/max/1320/1*dXCHv_pSYiMG90efXiFNPQ.png "Logo Title Text 1")

We have four ODE’s in the time domain, with three parameters: α, β, and γ.
* α is the inverse of the incubation period (1/t_incubation)
* β is the average contact rate in the population
* γ is the inverse of the mean infectious period (1/t_infectious)

Equation (1) is the change in people susceptible to the disease and is moderated by the number of infected people and their contact with the infected. Equation (2) gives the people who have been exposed to the disease. It grows based on the contact rate and decreases based on the incubation period whereby people then become infected.
Equation (3) gives us the change in infected people based on the exposed population and the incubation period. It decreases based on the infectious period, so the higher γ is, the more quickly people die/recover and move on to the final stage in Equation (4). The final equation, number (5), is a constraint that indicates there are no birth/migration effects in the model; we have a fixed population from beginning to end.
There’s one more parameter we should discuss, the infamous R0 value.


![alt text](https://miro.medium.com/max/1620/1*kc4-Bv2nzIvb9xG6ELHuzA.png "Logo Title Text 1")

This value defines how quickly the disease spreads and can be related to our parameters through the relationship given in Equation (6)

![alt text](https://miro.medium.com/max/1153/1*K0qnrBZup_ToQeODajV-aw.png "Logo Title Text 1")

Modeling Coronavirus
With these equations, we can build our model for the coronavirus itself to try to better understand how it might spread. The key is determining our values for α, β, and γ so we can see how it might spread.
A recent study of COVID-19 estimates some of these values for us (Hellewell et al. 2020), so we can use some of their parameter estimates to get our model off the ground.
Incubation period = 5 days -> α = 0.2
R0 = 3.5
Unfortunately, this paper doesn’t provide a value for γ, but we can get an estimate from another paper (which uses a more complex compartmental model) to get our 1/γ value of 2 days, so γ = 0.5.
Plugging the R0 and γ values into Equation (6), we get an estimate of β = 1.75.


In [None]:
def base_seir_model(init_vals, params, t):
    """Integrate the SEIR equations with a fixed-step forward-Euler scheme.

    Parameters
    ----------
    init_vals : sequence of 4 values
        Initial (S, E, I, R) state.
    params : sequence of 3 values
        (alpha, beta, gamma): rate of leaving E for I, transmission rate,
        and rate of leaving I for R.
    t : sequence of time points
        The step size is ``t[1] - t[0]`` and is reused for every step, so
        the grid is assumed to be evenly spaced.

    Returns
    -------
    numpy.ndarray
        One row per time point with columns S, E, I, R. When ``t`` has
        fewer than two points no step is taken and the result is a single
        row holding the initial state.
    """
    S_0, E_0, I_0, R_0 = init_vals
    S, E, I, R = [S_0], [E_0], [I_0], [R_0]
    alpha, beta, gamma = params
    # Guard so t[1] is never read when there is no step to take
    # (previously an IndexError for a one-point grid).
    n_steps = max(len(t) - 1, 0)
    dt = t[1] - t[0] if n_steps else 0.0
    for _ in range(n_steps):
        s_now, e_now, i_now, r_now = S[-1], E[-1], I[-1], R[-1]
        newly_exposed = beta*s_now*i_now
        S.append(s_now - (newly_exposed)*dt)
        E.append(e_now + (newly_exposed - alpha*e_now)*dt)
        I.append(i_now + (alpha*e_now - gamma*i_now)*dt)
        R.append(r_now + (gamma*i_now)*dt)
    return np.stack([S, E, I, R]).T

In [None]:
# Define parameters
t_max = 100                      # Simulation horizon, in days
dt = .1                          # Integration step, in days
t = np.linspace(0, t_max, int(t_max/dt) + 1)
N = 36000000                     # Total population
init_infected = 144
init_exposed = 200
# Initial state as fractions of the population: (S, E, I, R)
init_vals = 1 - (init_infected + init_exposed)/N,  init_exposed/N, init_infected/N, 0
alpha = 0.2
beta = 1.75
gamma = 0.5
params = alpha, beta, gamma
# Run simulation
results = base_seir_model(init_vals, params, t)
# Scale the fractions back to head counts; one row per integration step.
results_df = pd.DataFrame(results*N, columns=['susceptible', 'exposed',
                               'infected', 'recovered'])

# Plot against simulated time in days (t) rather than the row index, which
# counts integration steps of dt days each.
fig, ax = plt.subplots(figsize=(16, 9))
for compartment in ['infected', 'susceptible', 'exposed', 'recovered']:
    sns.lineplot(x=t, y=results_df[compartment].to_numpy(), label=compartment, ax=ax)
ax.set(xlabel='Days since start of simulation', ylabel='People')
plt.show()


## Adding social-distancing

In [None]:
def seir_model_with_soc_dist(init_vals, params, t):
    """Integrate the SEIR equations with a social-distancing factor (forward Euler).

    Same scheme as the basic SEIR model, except that the transmission term
    ``beta*S*I`` is multiplied by ``rho``.

    Parameters
    ----------
    init_vals : sequence of 4 values
        Initial (S, E, I, R) state.
    params : sequence of 4 values
        (alpha, beta, gamma, rho): rate of leaving E for I, transmission
        rate, rate of leaving I for R, and the multiplier applied to the
        transmission term (``rho = 1`` leaves it unchanged, ``rho = 0``
        removes it).
    t : sequence of time points
        The step size is ``t[1] - t[0]`` and is reused for every step, so
        the grid is assumed to be evenly spaced.

    Returns
    -------
    numpy.ndarray
        One row per time point with columns S, E, I, R. When ``t`` has
        fewer than two points no step is taken and the result is a single
        row holding the initial state.
    """
    S_0, E_0, I_0, R_0 = init_vals
    S, E, I, R = [S_0], [E_0], [I_0], [R_0]
    alpha, beta, gamma, rho = params
    # Guard so t[1] is never read when there is no step to take
    # (previously an IndexError for a one-point grid).
    n_steps = max(len(t) - 1, 0)
    dt = t[1] - t[0] if n_steps else 0.0
    for _ in range(n_steps):
        s_now, e_now, i_now, r_now = S[-1], E[-1], I[-1], R[-1]
        newly_exposed = rho*beta*s_now*i_now
        S.append(s_now - (newly_exposed)*dt)
        E.append(e_now + (newly_exposed - alpha*e_now)*dt)
        I.append(i_now + (alpha*e_now - gamma*i_now)*dt)
        R.append(r_now + (gamma*i_now)*dt)
    return np.stack([S, E, I, R]).T

In [None]:
# Define parameters
t_max = 100                      # Simulation horizon, in days
dt = .1                          # Integration step, in days
t = np.linspace(0, t_max, int(t_max/dt) + 1)
N = 36000000                     # Total population
init_infected = 144
init_exposed = 5000
# Initial state as fractions of the population: (S, E, I, R)
init_vals = 1 - (init_infected + init_exposed)/N,  init_exposed/N, init_infected/N, 0
alpha = 0.2
beta = 1.75
gamma = 0.5
rho = 0.8                        # Multiplier on the transmission term (social distancing)
params = alpha, beta, gamma, rho
# Run simulation
results = seir_model_with_soc_dist(init_vals, params, t)
# Scale the fractions back to head counts; one row per integration step.
results_df = pd.DataFrame(results*N, columns=['susceptible', 'exposed',
                               'infected', 'recovered'])

# Plot against simulated time in days (t) rather than the row index, which
# counts integration steps of dt days each.
fig, ax = plt.subplots(figsize=(16, 9))
for compartment in ['infected', 'susceptible', 'exposed', 'recovered']:
    sns.lineplot(x=t, y=results_df[compartment].to_numpy(), label=compartment, ax=ax)
ax.set(xlabel='Days since start of simulation', ylabel='People')
plt.show()


In [None]:
# # Define parameters
# t_max = 100
# dt = .1
# t = np.linspace(0, t_max, int(t_max/dt) + 1)
# N = 36000000
# init_infected = 144
# init_exposed = 300
# init_vals = 1 - (init_infected + init_exposed)/N,  init_exposed/N, init_infected/N, 0
# alpha = 0.2
# beta = 1.75
# gamma = 0.5
# rho = 1
# params = alpha, beta, gamma, rho
# # Run simulation
# results = seir_model_with_soc_dist(init_vals, params, t)
# results_df = pd.DataFrame(results*N, columns=['susceptible', 'exposed',
#                                'infected', 'recovered'])
# results_df.head()
# plt.figure(figsize=(16,9))
# sns.lineplot(x=results_df.index, y=results_df['infected'], label='infected')
# sns.lineplot(x=results_df.index, y=results_df['susceptible'], label='susceptible')
# sns.lineplot(x=results_df.index, y=results_df['exposed'], label='exposed')
# sns.lineplot(x=results_df.index, y=results_df['recovered'], label='recovered')


In [None]:
# Observed California rows from 2020-03-10 onwards.
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of ca_train (chained assignment).
ca_covid_df = ca_train[(ca_train['Province/State']=='California') & (ca_train['Date']>='2020-03-10')].copy()

# Pair observed row k with simulation row k.
# NOTE(review): results_df has one row per integration step (dt = 0.1 in the
# simulation cell), while ca_covid_df presumably has one row per calendar day,
# so this pairing treats each step as a full day. The simulation parameters
# look tuned against this pairing; confirm it is intended before changing it.
ca_covid_df['pred_confirmed'] = results_df['infected'].to_numpy()[:len(ca_covid_df)]
ca_covid_df

In [None]:
# Attach a calendar date to every simulation row, starting on 2020-03-10.
# NOTE(review): results_df has one row per integration step (dt = 0.1 in the
# simulation cell), so consecutive *rows* are mapped to consecutive *days*
# here. The simulation parameters look tuned against this mapping; confirm it
# is intended before changing it.
results_df['Date'] = pd.date_range('2020-03-10', periods=results_df.shape[0]).values

death_rate = 0.012  # Fatalities are taken as a fixed fraction of predicted cases

# Build the submission from a fresh copy of the sample submission so that
# re-running this cell does not merge onto an already-merged frame (which
# would produce suffixed duplicate columns and a KeyError on 'infected').
submission_df = ca_submission.copy()
submission_df['Date'] = pd.date_range(start=ca_test['Date'].min(), periods=len(submission_df))
submission_df = pd.merge(submission_df, results_df, how='left', on='Date')
submission_df['ConfirmedCases'] = submission_df['infected']
submission_df['Fatalities'] = submission_df['ConfirmedCases'] * death_rate
submission_df[['ForecastId', 'ConfirmedCases', 'Fatalities']].to_csv('submission.csv', index=False)

In [None]:
# Display the merged submission frame for a visual check.
submission_df