# TABLE OF CONTENTS

* [1. INTRODUCTION](#section-one)
* [2. SETUP](#section-two)
    - [2.1 Draw Packages](#subsection-two-one)
    - [2.2 Import and Wrangle Data](#subsection-two-two)
* [3. STORY](#section-three)
    - [3.1 Question 1: How do we model the evolution of the Covid19 pandemic?](#subsection-three-one)
    - [3.2 Question 2: How do we flatten the pandemic curve?](#subsection-three-two)
    - [3.3 Question 3: How do we estimate the inflection and plateau points?](#subsection-three-three)
* [4. CONCLUSION](#section-four)
* [5. REFERENCES](#section-five)

<a id="subsection-two-one"></a>
## 2.1 Draw Packages

In [None]:
# data wrangling
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# offline interactive visualization
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

# regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.graphics.api as smg

import warnings
warnings.filterwarnings("ignore")

<a id="subsection-two-two"></a>
## 2.2 Import and Wrangle Data

In [None]:
# Worldometer data
# ================

worldometer_data = pd.read_csv('../input/corona-virus-report/worldometer_data.csv')

# Treat empty strings as missing, then zero-fill every missing value
worldometer_data = worldometer_data.replace('', np.nan).fillna(0)

# Correct country names (presumably so they match the spelling used in
# full_grouped.csv for the merge below -- TODO confirm against the data).
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection operates on an intermediate object and is not guaranteed
# to update the frame (it is a no-op under pandas Copy-on-Write).
worldometer_data['Country/Region'] = worldometer_data['Country/Region'].replace({
    'USA': 'US',
    'UAE': 'United Arab Emirates',
    'S. Korea': 'South Korea',
    'UK': 'United Kingdom',
})

# Grouped by day, country
# =======================

full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')

# Merge in population data; how='left' keeps every row of full_grouped, so
# countries without a match end up with a missing Population
full_grouped = full_grouped.merge(
    worldometer_data[['Country/Region', 'Population']],
    how='left',
    on='Country/Region',
)

# Parse the ISO formatted date strings into datetime64 values
full_grouped['Date'] = pd.to_datetime(full_grouped['Date'], format='%Y-%m-%d')

In [None]:
def _simulate_sir(I0, beta, gamma, days):
    """Iterate the daily SIR recursion on a population normalised to 1.

    Returns three equally long lists: the day index, the infectious share at
    the start of each day, and the cumulative infected share at the start of
    each day.
    """
    infectious = I0
    susceptible = 1 - I0
    cumulative = I0

    day, inf, conf = [], [], []
    for i in range(days):
        day.append(i)
        inf.append(infectious)
        conf.append(cumulative)

        new_inf = infectious * susceptible * beta
        new_rec = infectious * gamma
        infectious = infectious + new_inf - new_rec
        susceptible = susceptible - new_inf
        cumulative = cumulative + new_inf

    return day, inf, conf


def sir_model_betalist(I0=0.01, betalist=(0.5, 0.8), gammalist=(0.15, 0.25, 0.5), days=365):
    """
    Plot discrete-time SIR projections for every beta/gamma combination.

    For each recovery rate in gammalist two figures are shown, each holding
    one curve per transmission rate in betalist:
      A. the infectious share of the population over time, and
      B. the cumulative share of the population that has been infected.

    Parameters
    ----------
    I0 : float
        Initial infectious share of a population normalised to 1.
    betalist : iterable of float
        Transmission rates (beta) to compare.
    gammalist : iterable of float
        Recovery rates (gamma) to compare.
    days : int
        Number of daily steps to project.

    Returns
    -------
    None
        The figures are displayed with plt.show().
    """
    # Materialise once so a one-shot iterable can be reused for every gamma
    betas = list(betalist)

    for gamma in gammalist:
        # Run each simulation once and reuse it for both figures
        runs = []
        for beta in betas:
            # gamma == 0 means nobody ever recovers: report an infinite R0
            # instead of failing on the division
            r0 = beta / gamma if gamma else float("inf")
            runs.append((beta, r0, _simulate_sir(I0, beta, gamma, days)))

        # A. Plot Infectious Population
        plt.figure(figsize=(10, 6))
        sns.set(style="darkgrid")
        plt.title("SIR Model: Infectious Population", fontsize=18)

        for beta, r0, (day, inf, _) in runs:
            inf_max = round(np.array(inf).max() * 100, 1)
            # x/y are passed by keyword: seaborn no longer accepts them positionally
            sns.lineplot(x=day, y=inf,
                         label=f"Beta: {beta} Gamma: {gamma} R0: {round(r0,2)} Peak: {inf_max}%")

        plt.legend()
        plt.show()

        # B. Plot Total Infected Population
        plt.figure(figsize=(10, 6))
        plt.title("SIR Model: Total Confirmed Cases", fontsize=18)

        for beta, r0, (day, _, conf) in runs:
            conf_max = round(np.array(conf).max() * 100, 1)
            sns.lineplot(x=day, y=conf,
                         label=f"Beta: {beta} Gamma: {gamma} R0: {round(r0,2)} Total :{conf_max}%")

        plt.legend()
        plt.show()

<a id="subsection-three-three"></a>
## 3.3 Question 3: How do we estimate the inflection and plateau points?

In [None]:
def plot_country(country, date):
    """
    Plot confirmed and recovered counts for one country, split by wave.

    Rows of the module-level full_grouped frame for `country` are flagged with
    recent_wave = 1 when their Date is on or after `date` and 0 otherwise, and
    that flag colours the two plotly line charts.

    Parameters
    ----------
    country : str
        Value to match in the 'Country/Region' column of full_grouped.
    date : str
        Start date of the most recent subwave of covid19 to project into the
        future, formatted yyyy-mm-dd, e.g. '2020-07-04'.

    Returns
    -------
    tuple
        (country, date), unchanged, so the selection can be reused by later cells.
    """
    # Take an explicit copy: adding a column to a filtered slice is a chained
    # assignment and must never write into (or depend on) full_grouped itself
    temp = full_grouped[full_grouped['Country/Region'] == country].copy()
    temp['recent_wave'] = np.where(temp['Date'] >= date, 1, 0)

    fig = px.line(temp, x='Date', y='Confirmed', color='recent_wave',
                  title='Infections for ' + str(country), height=600)
    fig.show()

    fig = px.line(temp, x='Date', y='Recovered', color='recent_wave',
                  title='Recovered Patients ' + str(country), height=600)
    fig.show()

    return country, date

In [None]:
# Plot Germany with observations from 2020-07-01 onwards flagged as the recent wave.
# NOTE: this rebinds the module-level names `country` and `date`; `date` previously
# referred to the datetime.date class imported in the setup cell.
country, date = plot_country('Germany', '2020-07-01')

In [None]:
# Daily new cases / deaths / recoveries, summed over all rows sharing a date
daily_counts = (
    pd.read_csv('../input/covid19-tracking-germany/covid_de.csv')
    .groupby('date')[['cases', 'deaths', 'recovered']]
    .sum()
)

# Running totals become 'Total cases', 'Total deaths' and 'Total recovered'
running_totals = daily_counts.cumsum().add_prefix('Total ')
germ_tracking = pd.concat([daily_counts, running_totals], axis=1)

# Active = everyone confirmed so far who has neither died nor recovered
germ_tracking['Total active'] = (
    germ_tracking['Total cases']
    - germ_tracking['Total deaths']
    - germ_tracking['Total recovered']
)

# Turn the 'date' index back into an ordinary column
germ_tracking = germ_tracking.reset_index()

# Largest running total of the 'population' column of the demographics file
demographics = pd.read_csv('../input/covid19-tracking-germany/demographics_de.csv')
population = demographics['population'].cumsum().max()

# (Not used yet: an initial infected share could be taken as the 'Total cases'
#  value on 2020-10-05 divided by population.)

# Observations dated before 2020-05-30
germ_at_may = germ_tracking.loc[germ_tracking['date'].lt('2020-05-30')]

In [None]:
def plot_germany(df, date):
    """
    Plot cumulative cases and recoveries from a tracking frame, split by wave.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'date', 'Total cases' and 'Total recovered'.
    date : str
        Rows whose 'date' is greater than or equal to this value are flagged
        with recent_wave = 1, all others with 0; the flag colours the lines.

    Returns
    -------
    None
        Two plotly figures are shown.
    """
    # assign() returns a new frame, so the caller's DataFrame is not given an
    # extra 'recent_wave' column as a side effect of plotting
    plot_df = df.assign(recent_wave=np.where(df['date'] >= date, 1, 0))

    fig = px.line(plot_df, x='date', y='Total cases', color='recent_wave',
                  title='Infections', height=600)
    fig.show()

    fig = px.line(plot_df, x='date', y='Total recovered', color='recent_wave',
                  title='Recovered Patients ', height=600)
    fig.show()
    
# Plot the German tracking data, flagging rows dated 2020-10-05 or later as the recent wave
plot_germany(germ_tracking,'2020-10-05')

In [None]:
def estimate_sir_param_germany(df, date, population=None):
    """
    Estimate the SIR transmission and removal rates for Germany by OLS.

    Everyone is assumed to be at risk, so S = N - I - (R + D). Using the
    observations dated on or after `date`, two regressions through the origin
    are fitted:

      dS/dt     = -beta * S * I / N   (the fitted slope is therefore -beta)
      d(R+D)/dt =  gamma * I

    Both regression summaries and the resulting beta, gamma and beta/gamma
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'date', 'Total active', 'Total recovered'
        and 'Total deaths'. The frame is not modified.
    date : str
        First day of the wave to calibrate on, formatted yyyy-mm-dd.
    population : float, optional
        Total population N. When omitted it is computed as the sum of the
        'population' column of demographics_de.csv.

    Returns
    -------
    tuple
        (beta, gamma, start) where beta and gamma are float arrays holding
        the single fitted coefficient each (beta with its sign corrected to
        be the transmission rate) and start is `date` as a datetime.date.

    Raises
    ------
    ValueError
        If the population is not positive or fewer than two observations fall
        on or after `date`.
    """
    wave_start = datetime.strptime(date, '%Y-%m-%d')

    if population is None:
        demographics = pd.read_csv('../input/covid19-tracking-germany/demographics_de.csv')
        population = demographics['population'].sum()
    if not population > 0:
        raise ValueError(f"population must be positive, got {population!r}")

    # Work on a derived frame so the caller's DataFrame is left untouched
    wave = df.assign(_obs_date=pd.to_datetime(df['date']))
    wave = wave[wave['_obs_date'] >= wave_start].sort_values('_obs_date')
    if len(wave) < 2:
        raise ValueError(f"need at least two observations on or after {date}, got {len(wave)}")

    # Apply the condition N = S+I+(R+D); deaths are folded into the removed group
    I = wave['Total active'].to_numpy(dtype=float)
    removed = (wave['Total recovered'] + wave['Total deaths']).to_numpy(dtype=float)
    S = population - I - removed

    # Days between consecutive observations. Taking them from the data (rather
    # than assuming one row per calendar day) keeps every array the same
    # length even when dates are missing from the series.
    dt = np.diff(wave['_obs_date'].to_numpy()) / np.timedelta64(1, 'D')

    ## 1. Estimate beta

    # Regressor uses the state at the start of each step, hence the last
    # element is dropped to line up with the first differences
    x = (S * I / population)[:-1]
    y = np.diff(S) / dt

    results = sm.OLS(y, x, missing='drop').fit()
    # S shrinks as people get infected, so the slope is -beta
    beta = -results.params
    print(results.summary())
    print('\n')
    print('*'*80)
    print(f"Transmission rate or Beta is: {beta}")
    print('*'*80)

    ## 2. Estimate gamma

    x = I[:-1]
    y = np.diff(removed) / dt

    results = sm.OLS(endog=y, exog=x, missing='drop').fit()
    gamma = results.params
    print (results.summary())
    print('\n')
    print('*'*80)
    print(f"Recovery (and Mortality) rate or Gamma is: {gamma}")
    print('*'*80)

    #3. Calculate R

    print('\n')
    print('*'*80)
    print(f"Reproduction number or R is: {beta/gamma}")
    print('*'*80)

    return beta.astype('float'), gamma.astype('float'), wave_start.date()

In [None]:
# Calibrate beta and gamma on the German tracking data from 2020-10-05 onwards.
# The returned tuple is not assigned to a name here.
estimate_sir_param_germany(germ_tracking,'2020-10-05')

> When we do this, beta comes out as a negative value. Does that make sense? The USA also seems to have a negative beta, according to your previous notes. The beta is also way too high...

In [None]:
# Calibrate model

def estimate_sir_param(country, date):
    """
    Estimate the SIR transmission and removal rates for one country by OLS.

    Everyone is assumed to be at risk, so S = N - I - (R + D), with N the
    largest 'Population' value recorded for the country in the module-level
    full_grouped frame. Using the observations dated on or after `date`, two
    regressions through the origin are fitted:

      dS/dt     = -beta * S * I / N   (the fitted slope is therefore -beta)
      d(R+D)/dt =  gamma * I

    Both regression summaries and the resulting beta, gamma and beta/gamma
    are printed.

    Parameters
    ----------
    country : str
        Value to match in the 'Country/Region' column of full_grouped.
    date : str
        First day of the wave to calibrate on, formatted yyyy-mm-dd.

    Returns
    -------
    tuple
        (beta, gamma, start) where beta and gamma are float arrays holding
        the single fitted coefficient each (beta with its sign corrected to
        be the transmission rate) and start is `date` as a datetime.date.

    Raises
    ------
    ValueError
        If no positive population is available for the country or fewer than
        two observations fall on or after `date`.
    """
    wave_start = datetime.strptime(date, '%Y-%m-%d')

    country_rows = full_grouped[full_grouped['Country/Region'] == country]

    # A missing or zero population would turn every regressor into NaN/inf,
    # so fail early with a readable message instead
    population = country_rows['Population'].max()
    if not population > 0:
        raise ValueError(f"no positive population available for {country!r}")

    # Derived frame: nothing is written back into full_grouped
    wave = country_rows.assign(_obs_date=pd.to_datetime(country_rows['Date']))
    wave = wave[wave['_obs_date'] >= wave_start].sort_values('_obs_date')
    if len(wave) < 2:
        raise ValueError(f"need at least two observations on or after {date}, got {len(wave)}")

    # Apply the condition N = S+I+(R+D); deaths are folded into the removed group
    I = wave['Active'].to_numpy(dtype=float)
    removed = (wave['Recovered'] + wave['Deaths']).to_numpy(dtype=float)
    S = population - I - removed

    # Days between consecutive observations. Taking them from the data (rather
    # than assuming one row per calendar day) keeps every array the same
    # length even when dates are missing from the series.
    dt = np.diff(wave['_obs_date'].to_numpy()) / np.timedelta64(1, 'D')

    ## 1. Estimate beta

    # Regressor uses the state at the start of each step, hence the last
    # element is dropped to line up with the first differences
    x = (S * I / population)[:-1]
    y = np.diff(S) / dt

    results = sm.OLS(y, x, missing='drop').fit()
    # S shrinks as people get infected, so the slope is -beta
    beta = -results.params
    print(results.summary())
    print('\n')
    print('*'*80)
    print(f"Transmission rate or Beta is: {beta}")
    print('*'*80)

    ## 2. Estimate gamma

    x = I[:-1]
    y = np.diff(removed) / dt

    results = sm.OLS(endog=y, exog=x, missing='drop').fit()
    gamma = results.params
    print (results.summary())
    print('\n')
    print('*'*80)
    print(f"Recovery (and Mortality) rate or Gamma is: {gamma}")
    print('*'*80)

    #3. Calculate R

    print('\n')
    print('*'*80)
    print(f"Reproduction number or R is: {beta/gamma}")
    print('*'*80)

    return beta.astype('float'), gamma.astype('float'), wave_start.date()


In [None]:
# Calibrate on the country and start date currently bound to `country` and `date`.
# NOTE: `date` is rebound to the datetime.date object returned by the function,
# so it is no longer a 'yyyy-mm-dd' string after this cell has run.
beta, gamma, date = estimate_sir_param(country, date)

In [None]:
# Project the SIR model for every combination of the two betas and two gammas below.
# The rates are hard-coded; presumably they were copied from earlier regression
# output -- TODO confirm which run each value came from.
sir_model_betalist(I0=0.01,betalist=[0.1428493,0.06502869], gammalist=[0.01069916,0.06634262])

> The total confirmed cases reach 100% after 50-100 days. The peak is 73.9%, which is very high as well. Using your dataset, the total cases also reach 100%, but only after around 150-200 days.