In [None]:
# Libraries
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import datetime
from time import time
from scipy import stats

from sklearn.model_selection import GroupKFold
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import os
import glob
import copy

import numpy as np
from scipy.integrate import odeint

## Let's get started with the basic SI model (Susceptible Infected)

First of all let's divide the population into two groups:

* The susceptibles, who are healthy people, the number of susceptibles is denoted as S.
* The infected, who have been infected by the virus, the number of infected is denoted as I.

The number of the total population is denoted as N.

So N = S + I

Let's assume that on each day the I[idx] infected people (idx stands for the idx-th day) go out and each of them meets r people, and that the probability for a contacted susceptible person to get infected is B. Only the fraction S[idx]/N of those contacts is with susceptible people, so we have:

* S[idx+1] = S[idx] - r*B*I[idx]*S[idx]/N
* I[idx+1] = I[idx] + r*B*I[idx]*S[idx]/N

Then we can start programming it:

In [None]:
# SI model: daily update of the susceptible / infected head-counts
N = 2200000          # Total population
r = 10               # Contacts per infected person per day (defines how quickly the disease spreads)
B = 0.01             # Probability that a contact leads to an infection

S = np.zeros(200)    # Susceptible count for every simulated day
I = np.zeros(200)    # Infected count for every simulated day

I[0] = 1             # On day 0 there is only one infected person
S[0] = N - I[0]      # ... so everybody else is still susceptible

for day in range(len(I) - 1):
    # Number of people moving from S to I during this day
    new_infections = r*B*I[day]*S[day]/N
    S[day+1] = S[day] - new_infections
    I[day+1] = I[day] + new_infections


In [None]:
# Plot both SI compartments against the day index
day_axis = np.arange(200)
sns.lineplot(x=day_axis, y=S, label='Susceptible')
sns.lineplot(x=day_axis, y=I, label='Infected')

What the SI model suggests is that once a single person gets infected, eventually the whole population will be infected. But doesn't that sound too simple? How can we improve it?

## Introducing SEIR model (Susceptible, Exposed, Infected and Recovered)

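First of all let's divide the population into four groups:

* The susceptibles, who are healthy people that can still catch the virus; their number is denoted as S.
* The exposed, who have caught the virus but have not yet moved on to the infected group; their number is denoted as E. In this model the exposed can already pass the virus on.
* The infected, whose number is denoted as I.
* The recovered, whose number is denoted as R.

The number of the total population is still denoted as N, so N = S + E + I + R.

On top of r and B from the SI model, let r2 and B2 be the number of contacts and the probability of spread for an exposed person, a the daily probability that an exposed person becomes infected, and y the daily probability that an infected person recovers. Then we have:

* S[idx+1] = S[idx] - r*B*S[idx]*I[idx]/N - r2*B2*S[idx]*E[idx]/N
* E[idx+1] = E[idx] + r*B*S[idx]*I[idx]/N - a*E[idx] + r2*B2*S[idx]*E[idx]/N
* I[idx+1] = I[idx] + a*E[idx] - y*I[idx]
* R[idx+1] = R[idx] + y*I[idx]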

In [None]:
# --- Baseline SEIR simulation (no intervention) ---
N = 2200000         # Total population
days = 200          # Number of simulated days

# Transmission / progression parameters
r = 20              # Number of susceptible people contacted per day by one infected person
B = 0.03            # Probability of spread per contact with an infected person
a = 0.1             # Daily probability that an exposed person becomes infected
r2 = r              # Number of susceptible people contacted per day by one exposed person
B2 = B              # Probability of spread per contact with an exposed person
y = 0.1             # Daily probability that an infected person recovers

# One array per compartment, one entry per day
S, E, I, R = (np.zeros(days) for _ in range(4))
E[0] = 0            # Day 0 exposed
I[0] = 1            # Day 0 infected
R[0] = 0            # Day 0 recovered
S[0] = N - I[0]     # Day 0 susceptible

for day in range(days - 1):
    infected_by_I = r*B*S[day]*I[day]/N      # new exposures caused by the infected
    infected_by_E = r2*B2*S[day]*E[day]/N    # new exposures caused by the exposed
    onset = a*E[day]                         # exposed -> infected
    recoveries = y*I[day]                    # infected -> recovered

    S[day+1] = S[day] - infected_by_I - infected_by_E
    E[day+1] = E[day] + infected_by_I - onset + infected_by_E
    I[day+1] = I[day] + onset - recoveries
    R[day+1] = R[day] + recoveries
    
# Draw the four SEIR compartments on one large figure
plt.figure(figsize=(16, 9))
day_axis = np.arange(200)
for series, series_label in ((S, 'Susceptible'), (I, 'Infected'), (E, 'Exposed'), (R, 'Recovered')):
    sns.lineplot(x=day_axis, y=series, label=series_label)

# Keep a copy of this scenario's infected curve so it can be compared with other scenarios later
I_origin = copy.copy(I)

## What if we implement a social-distancing policy?


In [None]:
# --- SEIR simulation where the contact rate drops once idx exceeds 10 ---
N = 2200000         # Total population
days = 200          # Number of simulated days

# Transmission / progression parameters (before social distancing)
r = 20              # Number of susceptible people contacted per day by one infected person
B = 0.03            # Probability of spread per contact with an infected person
a = 0.1             # Daily probability that an exposed person becomes infected
r2 = r              # Number of susceptible people contacted per day by one exposed person
B2 = B              # Probability of spread per contact with an exposed person
y = 0.1             # Daily probability that an infected person recovers

# One array per compartment, one entry per day
S, E, I, R = (np.zeros(days) for _ in range(4))
E[0] = 0            # Day 0 exposed
I[0] = 1            # Day 0 infected
R[0] = 0            # Day 0 recovered
S[0] = N - I[0]     # Day 0 susceptible

for day in range(days - 1):
    if day > 10:
        # Social distancing: fewer daily contacts for infected and exposed alike
        r = 5
        r2 = r

    infected_by_I = r*B*S[day]*I[day]/N      # new exposures caused by the infected
    infected_by_E = r2*B2*S[day]*E[day]/N    # new exposures caused by the exposed
    onset = a*E[day]                         # exposed -> infected
    recoveries = y*I[day]                    # infected -> recovered

    S[day+1] = S[day] - infected_by_I - infected_by_E
    E[day+1] = E[day] + infected_by_I - onset + infected_by_E
    I[day+1] = I[day] + onset - recoveries
    R[day+1] = R[day] + recoveries

# Plot the four compartments of the social-distancing scenario
plt.figure(figsize=(16,9))
day_axis = np.arange(days)  # follow the simulated period instead of a hard-coded 200
sns.lineplot(x=day_axis, y=S, label='Susceptible')  # legend label previously misspelled
sns.lineplot(x=day_axis, y=I, label='Infected')
sns.lineplot(x=day_axis, y=E, label='Exposed')
sns.lineplot(x=day_axis, y=R, label='Recovered')

# Keep a copy of this scenario's infected curve for the side-by-side comparison
I_sd = copy.copy(I)

### Let's plot them together

In [None]:
# Overlay the infected curves of the two scenarios
plt.figure(figsize=(16, 9))
day_axis = np.arange(200)
sns.lineplot(x=day_axis, y=I_origin, label='Infected w/o social distancing')
sns.lineplot(x=day_axis, y=I_sd, label='Infected w/ social distancing')

# Questions:

1. Can you tune the parameters used by the model so that it will better fit current status in Alberta?

Tips: 

* COVID-19 stats for AB can be found on this website: https://covid19stats.alberta.ca/

* You may want to tweak parameters such as the initial number of exposed people.


2. As you can see, social distancing is an effective way of "flattening" the infected curve. Can you think of any other methods that can be used for the same purpose? How can we adjust the model to reflect the effects of the methods you come up with? What reasonable assumptions can we make? Can you also plot the result along with the "do nothing" and "social distancing" curves so we can compare them?

3. What are your takeaways from doing these exercises? As Albertans, what can we do to help our community during this challenging time?


In [None]:
## Construct a dataframe with one row per simulated day
init_date = '2020-03-06'  # Initial date
covid_df = pd.DataFrame({'Susceptible-pred':S,   # susceptible series (S), not the infected one
                         'Exposed-pred':E, 
                         'Infected-pred':I, 
                         'Recovered-pred':R,
                         'Date':pd.date_range(start=init_date, periods=len(I)) ## Report dates: start at init_date and increase by one day, len(I) dates in total
                        })
covid_df

In [None]:
# Plot every predicted compartment against the calendar date
plt.figure(figsize=(16, 9))
date_plot_args = dict(x='Date', data=covid_df)
sns.lineplot(y='Susceptible-pred', label='Susceptible-pred', **date_plot_args)
sns.lineplot(y='Exposed-pred', label='Exposed-pred', **date_plot_args)
sns.lineplot(y='Infected-pred', label='Infected-pred', **date_plot_args)
sns.lineplot(y='Recovered-pred', label='Recovered-pred', **date_plot_args)

Let's load the actual confirmed cases, which can be found in the train.csv file of the competition https://www.kaggle.com/c/covid19-global-forecasting-week-2


In [None]:
# Competition training data; the 'Date' column is parsed into datetimes while reading
train_csv_path = '/kaggle/input/covid19-global-forecasting-week-2/train.csv'
actual_df = pd.read_csv(train_csv_path, parse_dates=['Date'])
actual_df.head()

Let's keep the rows for Alberta only. Note that we also only want to keep the data from the initial date onwards.

In [None]:
# Keep only Alberta rows dated on or after the simulation start
is_alberta = actual_df['Province_State'] == 'Alberta'
from_init_date = actual_df['Date'] >= init_date
actual_df = actual_df[is_alberta & from_init_date]
actual_df

Now we can combine the actual and predicted data

In [None]:
# Left join on 'Date': every simulated day is kept, actuals are attached where a matching date exists
covid_df = covid_df.merge(actual_df, how='left', on='Date')

In order to tell how well the model works we need to compare the predicted cases against the confirmed cases. To do that, we need to make sure we only compare the dates for which the confirmed numbers are already known.

In [None]:
## train_df is the dataset used to "train" the model: only the rows that have a confirmed-case value.
has_actuals = covid_df['ConfirmedCases'].notna()
train_df = covid_df[has_actuals]
train_df

In [None]:
# Predicted infected vs. reported confirmed cases on the dates where both are available
plt.figure(figsize=(16, 9))
comparison_args = dict(x='Date', data=train_df)
sns.lineplot(y='Infected-pred', label='Predictions', **comparison_args)
sns.lineplot(y='ConfirmedCases', label='Actuals', **comparison_args)

Apparently our current model underestimates the situation, but by how much? Typically, in order to tell how good a model is, we compare the predictions to the actuals and measure the difference using some metric. For instance, Mean Absolute Error (MAE) (https://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-error) is a commonly used metric. In our case, to keep it consistent with the competition rules, we will be using RMSLE (Root Mean Squared Logarithmic Error): $\mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log(1+p_i) - \log(1+a_i)\right)^2}$, where $p_i$ are the predictions and $a_i$ the actuals.


In [None]:
def RMSLE(actuals, predictions):
    '''
    Root Mean Squared Logarithmic Error between actuals and predictions.

    Both inputs are transformed with log(1 + x) before the mean squared
    error is computed, and the square root of that error is returned.
    '''
    log_actuals = np.log1p(actuals)
    log_predictions = np.log1p(predictions)
    return np.sqrt(mean_squared_error(log_actuals, log_predictions))

# Score the baseline predictions on the dates with known confirmed cases
baseline_rmsle = RMSLE(train_df['ConfirmedCases'], train_df['Infected-pred'])
print("Baseline model's RMSLE is %f" % baseline_rmsle)

We've already seen the baseline model underestimated the COVID-19 in Alberta. The question is: how can we tune the model so it will fit the actuals better?

Keep in mind, a model is defined by two factors: 

1. The form of the model (functions, equations etc.)
2. The parameters

Once we have selected a model, its form is fixed, and what we need to do is tune the parameters so that the model fits the training data. In our case, we'll need to tune a few parameters to **MINIMIZE** the RMSLE between the confirmed cases (actuals) and the predicted number of infected (predictions). This process of tweaking the parameter values of a model to optimize a metric with respect to a given dataset is called model training.

Let's do a few things to formalize the process. Firstly, let's define a function of the model.

In [None]:
def SEIR(N, days, init_date, init_exposed, init_infected, r, B, a, y, r_sd, sd_days):
    '''
    Run the discrete (one step per day) SEIR model and return the daily compartments.

    Parameters:
    N: total population
    days: number of days to simulate (must be at least 1)
    init_date: initial date of the model (date of day 0)
    init_exposed: initial number of exposed
    init_infected: initial number of infected
    r: Number of susceptible that could be contacted by an infected (also used for an exposed)
    B: Probability of spread for infected (also used for an exposed)
    a: Probability of being converted from exposed to infected
    y: Probability of recovering
    r_sd: Number of susceptible that could be contacted by an infected/exposed with social distancing
    sd_days: social distancing applies to every step whose index is greater than sd_days

    Returns:
    A DataFrame with one row per day and the columns 'Susceptible-pred',
    'Exposed-pred', 'Infected-pred', 'Recovered-pred' and 'Date'.

    Raises:
    ValueError: if days is smaller than 1.
    '''
    if days < 1:
        raise ValueError("days must be at least 1")

    E = np.zeros(days)  # Exposed
    E[0] = init_exposed            # Day 0 exposed
    I = np.zeros(days)  # Infected
    I[0] = init_infected            # Day 0 infected
    S = np.zeros(days)  # Susceptible
    # Everyone who is neither infected nor exposed starts as susceptible, so that
    # S + E + I + R adds up to N from day 0 onwards.
    S[0] = N - I[0] - E[0]     # Day 0 susceptible
    R = np.zeros(days)  # Recovered
    R[0] = 0

    r2 = r             # Number of susceptible that could be contacted by an exposed
    B2 = B           # Probability of spread for exposed

    for idx in range(days-1):
        if idx>sd_days:
            # Social distancing is active: use the reduced contact number
            r = r_sd
            r2 = r_sd
        S[idx+1] = S[idx] - r*B*S[idx]*I[idx]/N - r2*B2*S[idx]*E[idx]/N
        E[idx+1] = E[idx] + r*B*S[idx]*I[idx]/N -a*E[idx] + r2*B2*S[idx]*E[idx]/N
        I[idx+1] = I[idx] + a*E[idx] - y*I[idx]
        R[idx+1] = R[idx] + y*I[idx]

    df = pd.DataFrame({'Susceptible-pred':S,   # susceptible series (S), not the infected one
                         'Exposed-pred':E, 
                         'Infected-pred':I, 
                         'Recovered-pred':R,
                         'Date':pd.date_range(start=init_date, periods=len(I))
                        })
    return df


# Re-create the scenario simulated step by step earlier in the notebook.
# N is 2200000, the population used there (the previous 220000 was missing a zero).
pred_df = SEIR(N=2200000, days=200, init_date="2020-03-06", init_exposed=0, 
     init_infected=1, r=20, B=0.03, a=0.1, y=0.1, r_sd=5, sd_days=10)   

pred_df

Let's define another function which validates the performance of an SEIR model with RMSLE

In [None]:

def eval_model(N, days, init_date, init_exposed, init_infected, r, B, a, y, r_sd, sd_days, actual_df):
    '''
    Score one SEIR parameter set against the actual confirmed cases.

    The model parameters are passed straight to SEIR(). actual_df must
    contain the columns 'Date' and 'ConfirmedCases'. The simulation is
    left-joined with actual_df on 'Date', rows without a confirmed-case
    value are dropped, and the RMSLE between 'ConfirmedCases' and
    'Infected-pred' is returned.
    '''
    simulated = SEIR(N=N, days=days, init_date=init_date, init_exposed=init_exposed,
                     init_infected=init_infected, r=r, B=B, a=a, y=y, r_sd=r_sd, sd_days=sd_days)
    joined = pd.merge(simulated, actual_df, how='left', on='Date')
    known = joined[joined['ConfirmedCases'].notnull()]
    return RMSLE(known['ConfirmedCases'], known['Infected-pred'])
    


Let's run the validation function with the same parameter values used for the baseline model and see if the results are consistent:

In [None]:
# Same parameter values as the baseline run, including N = 2200000 (the previous 220000 was missing a zero)
eval_model(N=2200000, days=200, init_date="2020-03-06", init_exposed=0, 
     init_infected=1, r=20, B=0.03, a=0.1, y=0.1, r_sd=5, sd_days=10, actual_df=actual_df)

Let's change the initial exposed from 0 to 10 and see how that works:

In [None]:
# Only init_exposed changes compared with the baseline; N stays at 2200000 (the previous 220000 was missing a zero)
eval_model(N=2200000, days=200, init_date="2020-03-06", init_exposed=10, 
     init_infected=1, r=20, B=0.03, a=0.1, y=0.1, r_sd=5, sd_days=10, actual_df=actual_df)

In [None]:
# Exhaustive grid search: evaluate every parameter combination and record its RMSLE.
# NOTE: np.arange with a float step may include or drop the end value because of
# floating-point rounding; switch to np.linspace if the exact grid matters.
scores = []
i = 0 
for init_exposed in np.arange(0, 50, 5):
    for r in np.arange(15, 25, 2):
        for B in np.arange(0.01, 0.08, 0.01):
            for a in np.arange(0.05, 0.2, 0.05):
                for r_sd_ratio in np.arange(0.1, 0.5, 0.1):
                    r_sd = r*r_sd_ratio  # contacts under social distancing, as a fraction of r
                    for y in np.arange(0.05,0.2,0.05):
                        # N matches the population used everywhere else in the notebook
                        # (the previous 220000 was missing a zero).
                        score = eval_model(N=2200000, days=200, init_date="2020-03-06", 
                                          init_exposed=init_exposed, init_infected=1, r=r, B=B, a=a, y=y, r_sd=r_sd, sd_days=10, 
                                          actual_df=actual_df)
                        i = i + 1
                        if i % 500 == 0: # Print out the progress once every 500 runs
                            print("Score: %f, init_exposed: %f, r:%f, B:%f, a:%f, y:%f, r_sd:%f" % (score, init_exposed, r, B, a, y, r_sd))
                        scores.append([score, init_exposed, r, B, a, y, r_sd]) # append validation score as well as parameters used to scores

In [None]:
# Rank every evaluated parameter combination by score, lowest RMSLE first
score_columns = ['score', 'init_exposed', 'r', 'B', 'a', 'y', 'r_sd']
scores_pd = pd.DataFrame(scores, columns=score_columns)
scores_pd = scores_pd.sort_values('score')
scores_pd.head()

In [None]:
# scores_pd is sorted by score, so its first row holds the best parameter set found
best_params = scores_pd.iloc[0]

# Re-run the simulation with those parameters
pred_df = SEIR(N=2200000, days=200, init_date="2020-03-06", 
     init_exposed=best_params['init_exposed'], 
     init_infected=1, 
     r=best_params['r'], 
     B=best_params['B'], 
     a=best_params['a'], 
     y=best_params['y'], 
     r_sd=best_params['r_sd'], 
     sd_days=10) 

# Attach the actuals and keep only the dates with a known confirmed-case value
pred_df = pd.merge(pred_df, actual_df, how='left', on='Date')
pred_df = pred_df[pred_df['ConfirmedCases'].notnull()]

# This is the tuned model, not the baseline, so the message says so
print("Tuned model's RMSLE is %f" % (RMSLE(pred_df['ConfirmedCases'], pred_df['Infected-pred'])))

plt.figure(figsize=(16,9))
sns.lineplot(x='Date', y='Infected-pred', data=pred_df, label='Predictions')
sns.lineplot(x='Date', y='ConfirmedCases', data=pred_df, label='Actuals')


pred_df