# Analyzing Effect of Government Preparedness on COVID-19

#### COVID-19 has now reached almost every place on Earth, and the governments of different nations are taking steps to control it.
Measures such as "Movement Restrictions" have been imposed by governments to maintain social distancing.
Let's see whether they have had any effect on controlling the spread of COVID-19.

In [None]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import matplotlib.colors as mcolors

import matplotlib.cm as cm

import matplotlib.dates as mdates

import seaborn as sns

%matplotlib inline

import os

import matplotlib as mpl

from pathlib import Path
from tqdm.notebook import tqdm
from scipy.integrate import solve_ivp
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_log_error, mean_squared_error

from  IPython.display import Image

In [None]:
# obs_df = pd.read_csv('/kaggle/input/WHO_obesityByCountry_2016.csv')

In [None]:
# Government response stringency index per country ("Entity") and date.
strngcy_df = pd.read_csv('/kaggle/input/covid-stringency-index-by-country/covid-stringency-index.csv')

# Lockdown dates per country / province.
lockdown_df = pd.read_csv('/kaggle/input/covid19-lockdown-dates-by-country/countryLockdowndates.csv')

# Cumulative number of COVID tests performed per country ("Entity") and date.
tests_perf_df = pd.read_csv('/kaggle/input/daily-covid-tests-performed/Tests_Performed_latest.csv')

# Week-4 forecasting data: confirmed cases and fatalities per region and date.
train_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/train.csv')

# Week-4 forecasting test split (not used in the analysis cells shown below).
test_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/test.csv')

In [None]:
# Submission file for the week-4 forecasting data, indexed by ForecastId.
submission = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/submission.csv', index_col=['ForecastId'])

In [None]:
"""Process the dataframe: convert data-type to 'Date' and rename columns"""
def process_state_date(df):
    """Parse the ``Date`` column and shorten the location column names.

    The frame is modified in place and the same object is returned, so
    callers that need the original untouched should pass a copy.

    :param df: DataFrame with ``Date``, ``Country_Region`` and
        ``Province_State`` columns.
    :return: ``df`` with ``Date`` converted to datetime and the location
        columns renamed to ``country`` and ``state``.
    """
    # One vectorised conversion of the whole column instead of calling
    # pd.to_datetime once per row through Series.apply.
    df["Date"] = pd.to_datetime(df["Date"])
    df.rename(columns={"Country_Region": "country", "Province_State": "state"}, inplace=True)
    return df

In [None]:
# train_df2 = process_state_date(train_df.copy())

In [None]:
train_df2 = train_df.copy()

In [None]:
train_df2.Date = pd.to_datetime(train_df2.Date)

In [None]:
train_df2 = train_df2.loc[train_df2.ConfirmedCases > 0] # Take only those days where 'ConfirmedCases > 1'

In [None]:
#Groupby countries to ignore provinces
train_df3 = train_df2.groupby(['Country_Region', 'Date']).agg({'ConfirmedCases':sum, 'Fatalities':sum}).reset_index()

In [None]:
train_df4 = train_df3.sort_values(by=['ConfirmedCases', 'Date'], ascending=False)

In [None]:
#Take 20 most affected countries
top_cntry_df = train_df4.loc[train_df4.Country_Region.isin(train_df4.Country_Region.unique()[:20])]

#### Growth of COVID-19 in most affected (most number of Confirmed Cases) countries

In [None]:
# Cumulative confirmed cases over time, one line per top-20 country.
plt.figure(figsize=(10, 8))
growth_ax = sns.lineplot(
    data=top_cntry_df,
    x='Date',
    y='ConfirmedCases',
    hue='Country_Region',
)
growth_ax.legend(loc='best')
plt.show()

#### It is clear from the plot that COVID-19 is spreading at an extreme rate in the US, and it is likely to take more lives in the coming days. The curves for Spain and Italy are now tending to flatten.

## Analysing effect of Lockdown Stringency

Oxford University has developed a ["Government Response Tracker"](http://www.bsg.ox.ac.uk/research/research-projects/coronavirus-government-response-tracker) which rates the measures taken by governments by a new index called "Stringency Index".
Let's see how strictness imposed by governments affects the spread of COVID

In [None]:
strngcy_df.head()

In [None]:
#Rename the dataframe
strngcy_df.rename({'Entity':'Country_Region', 'Government Response Stringency Index ((0 to 100, 100 = strictest))':'strngcy_ind'}, axis=1, inplace=True)

In [None]:
#### Repairing countries

strngcy_df.loc[strngcy_df.Country_Region == 'United States', 'Country_Region'] = 'US'

strngcy_df.loc[strngcy_df.Country_Region == 'Myanmar', 'Country_Region'] = 'Burma'

strngcy_df.loc[strngcy_df.Country_Region == 'Taiwan', 'Country_Region'] = 'Taiwan*'

strngcy_df.loc[strngcy_df.Country_Region == 'South Korea', 'Country_Region'] = 'Korea, South'

In [None]:
strngcy_df.Date = strngcy_df.Date.apply(pd.to_datetime) #change to datetime

In [None]:
#### Merge main data and stringency DF

strngt_tr_df = train_df3.merge(strngcy_df, on=['Country_Region', 'Date'], how='left')

# Forward-fill missing index values *within each country*. A plain ffill over
# the whole frame would carry the last value of one country into the first
# rows of the next one.
strngt_tr_df['strngcy_ind'] = strngt_tr_df.groupby('Country_Region')['strngcy_ind'].ffill()

# Track the day-to-day change in the stringency index, again per country so the
# first row of a country is not compared with the previous country's last row.
# The first observation of a country has no predecessor and counts as "no change".
strngt_tr_df['ind_diff'] = strngt_tr_df.groupby('Country_Region')['strngcy_ind'].diff().fillna(0)

# Drop rows that still contain a missing value in any column (this includes
# rows that found no match in the stringency data).
strngt_tr_df.dropna(inplace=True)

Lockdown dataset contains the dates on which governments of different countries imposed "Lockdown".

In [None]:
lockdown_df.head()

In [None]:
lockdown_df2 = lockdown_df.rename({'Country/Region': 'Country_Region', 'Date':'lockdown_date'}, axis=1).drop(['Province', 'Reference'], axis=1)

lockdown_df2.lockdown_date = pd.to_datetime(lockdown_df2.lockdown_date, dayfirst=True) #change to datetime

### Analysing for India

In [None]:
ind_df = strngt_tr_df.query("Country_Region=='India'")

ind_df2 = strngt_tr_df.query("Country_Region=='India' & ind_diff > 0")

date_list = ind_df2.Date.to_list()

ind_stngcy_list = ind_df2.strngcy_ind.to_list()

ind_df3 = ind_df.set_index("Date")

In [None]:
# setup the normalization and the colormap
normalize = mcolors.Normalize(vmin=min(ind_stngcy_list), vmax=max(ind_stngcy_list))
colormap = cm.Reds

ind_lockdown_date = lockdown_df2.loc[lockdown_df2.Country_Region == 'India'].lockdown_date.iloc[0].date()

#### Plot 'Confirmed Cases' against Date

In [None]:
# Cumulative confirmed cases and fatalities over time (legend suppressed).
ind_ax = ind_df3.plot(y=['ConfirmedCases', 'Fatalities'], figsize=(8, 5), legend=False)

# One dashed vertical line per day on which the stringency index rose,
# coloured by the index value.
for ind, date in zip(ind_stngcy_list, date_list):
    ind_ax.axvline(date, color=colormap(normalize(ind)), linestyle="--", lw=1)

# Horizontal guides and labels at the maximum cumulative counts.
ind_total_cases = ind_df3.ConfirmedCases.max()
ind_total_deaths = ind_df3.Fatalities.max()
ind_ax.axhline(ind_total_cases, color='lightgrey', linestyle="--", lw=1)
ind_ax.axhline(ind_total_deaths, color='lightgrey', linestyle="--", lw=1)
ind_ax.annotate(f'Total Confirmed Cases:\n{ind_total_cases}', (ind_df3.index.min(), ind_total_cases),
                xytext=(35, -50), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
ind_ax.annotate(f'Total Fatalities:\n{ind_total_deaths}', (ind_df3.index.min(), ind_total_deaths),
                xytext=(35, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

ind_ax.annotate(f'Last Lockdown:\n{ind_lockdown_date}', (ind_lockdown_date, 0),
                xytext=(-80, -60), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

# setup the colorbar
scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
scalarmappaple.set_array(ind_stngcy_list)
# The mappable is not attached to any Axes, so name the Axes the colorbar
# should take space from; recent Matplotlib versions refuse to guess.
cbar = plt.colorbar(scalarmappaple, ax=ind_ax)
cbar.set_label('Lockdown Intensity', rotation=270)
ind_ax.set_ylabel('Cumulative Count of Cases')
ind_ax.set_title('Progression of COVID in India')
plt.show()

In the plot, the vertical red lines represent the intensity with which the lockdown was imposed; this intensity is the value of the "Stringency Index". We can see that the lockdown intensity was gradually increased to the maximum of 100, when everything was closed on 24 March. According to the plot, this lockdown came while COVID was just starting and India had about 500 cases. Because of the lockdown, the rate of increase in cases was slow.

### Analysing for China

In [None]:
# All China rows, and the subset of days on which the stringency index rose.
chn_df = strngt_tr_df[strngt_tr_df.Country_Region == 'China']
chn_df2 = chn_df[chn_df.ind_diff > 0]

chn_date_list = chn_df2['Date'].tolist()
chn_stngcy_list = chn_df2['strngcy_ind'].tolist()

chn_df3 = chn_df.set_index("Date")

# setup the normalization and the colormap
# (vmin sits 5 below the smallest plotted index value)
chn_normalize = mcolors.Normalize(
    vmin=min(chn_stngcy_list) - 5,
    vmax=max(chn_stngcy_list),
)
colormap = cm.Reds

# Most recent lockdown date recorded for China.
chn_lockdown_date = lockdown_df2.query("Country_Region == 'China'")['lockdown_date'].max().date()

In [None]:
# Cumulative confirmed cases and fatalities over time (legend suppressed).
chn_ax = chn_df3.plot(y=['ConfirmedCases', 'Fatalities'], figsize=(8, 5), legend=False)

# One dashed vertical line per day on which the stringency index rose,
# coloured by the index value.
for ind, date in zip(chn_stngcy_list, chn_date_list):
    chn_ax.axvline(date, color=colormap(chn_normalize(ind)), linestyle="--", lw=1)

# Horizontal guides and labels at the maximum cumulative counts.
chn_total_cases = chn_df3.ConfirmedCases.max()
chn_total_deaths = chn_df3.Fatalities.max()
chn_ax.axhline(chn_total_cases, color='lightgrey', linestyle="--", lw=1)
chn_ax.axhline(chn_total_deaths, color='lightgrey', linestyle="--", lw=1)
chn_ax.annotate(f'Total Confirmed Cases:\n{chn_total_cases}', (chn_df3.index.min(), chn_total_cases),
                xytext=(35, -50), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
chn_ax.annotate(f'Total Fatalities:\n{chn_total_deaths}', (chn_df3.index.min(), chn_total_deaths),
                xytext=(35, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
chn_ax.annotate(f'Last Lockdown:\n{chn_lockdown_date}', (chn_lockdown_date, 0),
                xytext=(-80, -60), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

# setup the colorbar
scalarmappaple = cm.ScalarMappable(norm=chn_normalize, cmap=colormap)
scalarmappaple.set_array(chn_stngcy_list)
# The mappable is not attached to any Axes, so name the Axes the colorbar
# should take space from; recent Matplotlib versions refuse to guess.
cbar = plt.colorbar(scalarmappaple, ax=chn_ax)
cbar.set_label('Lockdown Intensity', rotation=270)
chn_ax.set_ylabel('Cumulative Count of Cases')
chn_ax.set_title('Progression of COVID in China')
plt.show()

China introduced its first strict measures in January, when it had about 3k cases. After the restrictions were imposed, cases kept increasing for a while and then the curve flattened.

### Analysing for US

In [None]:
# US rows that have stringency data; "Code" is dropped without mutating the
# query result in place (which raises a SettingWithCopyWarning).
us_df = strngt_tr_df.query("Country_Region=='US'").drop(columns="Code")

# Case counts after 2020-03-30 are taken straight from the aggregated case data
# (presumably because the merged stringency rows stop there - verify against the data).
us_df_n = train_df3.query("Country_Region=='US' and Date>'2020-03-30'")

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
us_df = pd.concat([us_df, us_df_n])

# Carry the last known stringency values forward into the appended rows.
us_df = us_df.ffill()

# Days on which the stringency index rose.
us_df2 = strngt_tr_df.query("Country_Region=='US' & ind_diff > 0")

us_date_list = us_df2.Date.to_list()

us_stngcy_list = us_df2.strngcy_ind.to_list()

us_df3 = us_df.set_index("Date")

# setup the normalization and the colormap
us_normalize = mcolors.Normalize(vmin=min(us_stngcy_list)-5, vmax=max(us_stngcy_list))
colormap = cm.Reds

# Most recent lockdown date recorded for the US.
us_lockdown_date = lockdown_df2.loc[lockdown_df2.Country_Region == 'US'].lockdown_date.max().date()

In [None]:
# Cumulative confirmed cases and fatalities over time.
us_ax = us_df3.plot(y=['ConfirmedCases', 'Fatalities'], figsize=(8, 5), legend=False)

# One dashed vertical line per day on which the stringency index rose,
# coloured by the index value.
for ind, date in zip(us_stngcy_list, us_date_list):
    us_ax.axvline(date, color=colormap(us_normalize(ind)), linestyle="--", lw=1)

# Horizontal guides and labels at the maximum cumulative counts.
us_total_cases = us_df3.ConfirmedCases.max()
us_total_deaths = us_df3.Fatalities.max()
us_ax.axhline(us_total_cases, color='lightgrey', linestyle="--", lw=1)
us_ax.axhline(us_total_deaths, color='lightgrey', linestyle="--", lw=1)
us_ax.annotate(f'Total Confirmed Cases:\n{us_total_cases}', (us_df3.index.min(), us_total_cases),
               xytext=(35, -50), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
us_ax.annotate(f'Total Fatalities:\n{us_total_deaths}', (us_df3.index.min(), us_total_deaths),
               xytext=(35, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
us_ax.annotate(f'Last Lockdown:\n{us_lockdown_date}', (us_lockdown_date, 0),
               xytext=(-60, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

# setup the colorbar
scalarmappaple = cm.ScalarMappable(norm=us_normalize, cmap=colormap)
scalarmappaple.set_array(us_stngcy_list)
# The mappable is not attached to any Axes, so name the Axes the colorbar
# should take space from; recent Matplotlib versions refuse to guess.
cbar = plt.colorbar(scalarmappaple, ax=us_ax)
cbar.set_label('Lockdown Intensity', rotation=270)
us_ax.set_ylabel('Cumulative Count of Cases')
us_ax.set_title('Progression of COVID in US')
plt.show()

From the plot, the US introduced some restrictions from March, but it is still only about 65% stringent and has not imposed a "total lockdown" on the country, which is why we can see that cases have increased to 700k!

### Analysing for Italy

In [None]:
# Italy rows that have stringency data.
ita_df = strngt_tr_df.query("Country_Region=='Italy'")

# Case counts after 2020-03-30 are taken straight from the aggregated case data
# (presumably because the merged stringency rows stop there - verify against the data).
ita_df_n = train_df3.query("Country_Region=='Italy' and Date>'2020-03-30'")

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
ita_df = pd.concat([ita_df, ita_df_n])

# Carry the last known stringency values forward into the appended rows.
ita_df = ita_df.ffill()

# Days on which the stringency index rose.
ita_df2 = strngt_tr_df.query("Country_Region=='Italy' & ind_diff > 0")

ita_date_list = ita_df2.Date.to_list()

ita_stngcy_list = ita_df2.strngcy_ind.to_list()

ita_df3 = ita_df.set_index("Date")

# setup the normalization and the colormap
ita_normalize = mcolors.Normalize(vmin=min(ita_stngcy_list)-5, vmax=max(ita_stngcy_list))
colormap = cm.Reds

# Most recent lockdown date recorded for Italy.
ita_lockdown_date = lockdown_df2.loc[lockdown_df2.Country_Region == 'Italy'].lockdown_date.max().date()

In [None]:
# Cumulative confirmed cases and fatalities over time.
ita_ax = ita_df3.plot(y=['ConfirmedCases', 'Fatalities'], figsize=(8, 5), legend=False)

# One dashed vertical line per day on which the stringency index rose,
# coloured by the index value.
for ind, date in zip(ita_stngcy_list, ita_date_list):
    ita_ax.axvline(date, color=colormap(ita_normalize(ind)), linestyle="--", lw=1)

# Horizontal guides and labels at the maximum cumulative counts.
ita_total_cases = ita_df3.ConfirmedCases.max()
ita_total_deaths = ita_df3.Fatalities.max()
ita_ax.axhline(ita_total_cases, color='lightgrey', linestyle="--", lw=1)
ita_ax.axhline(ita_total_deaths, color='lightgrey', linestyle="--", lw=1)
ita_ax.annotate(f'Total Confirmed Cases:\n{ita_total_cases}', (ita_df3.index.min(), ita_total_cases),
                xytext=(35, -50), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
ita_ax.annotate(f'Total Fatalities:\n{ita_total_deaths}', (ita_df3.index.min(), ita_total_deaths),
                xytext=(35, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
ita_ax.annotate(f'Last Lockdown:\n{ita_lockdown_date}', (ita_lockdown_date, 0),
                xytext=(-60, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

# setup the colorbar
scalarmappaple = cm.ScalarMappable(norm=ita_normalize, cmap=colormap)
scalarmappaple.set_array(ita_stngcy_list)
# The mappable is not attached to any Axes, so name the Axes the colorbar
# should take space from; recent Matplotlib versions refuse to guess.
cbar = plt.colorbar(scalarmappaple, ax=ita_ax)
cbar.set_label('Lockdown Intensity', rotation=270)
ita_ax.set_ylabel('Cumulative Count of Cases')
ita_ax.set_title('Progression of COVID in Italy')
plt.show()

After imposing multiple lockdowns of different intensities, Italy raised its stringency to 90% on 11 March 2020, when cases were about 18k. The effect is visible now, as the curve is starting to flatten.

### Analyse for Spain

In [None]:
# Spain rows that have stringency data.
esp_df = strngt_tr_df.query("Country_Region=='Spain'")

# Case counts after 2020-03-30 are taken straight from the aggregated case data
# (presumably because the merged stringency rows stop there - verify against the data).
esp_df_n = train_df3.query("Country_Region=='Spain' and Date>'2020-03-30'")

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
esp_df = pd.concat([esp_df, esp_df_n])

# Carry the last known stringency values forward into the appended rows.
esp_df = esp_df.ffill()

# Days on which the stringency index rose.
esp_df2 = strngt_tr_df.query("Country_Region=='Spain' & ind_diff > 0")

esp_date_list = esp_df2.Date.to_list()

esp_stngcy_list = esp_df2.strngcy_ind.to_list()

esp_df3 = esp_df.set_index("Date")

# setup the normalization and the colormap
esp_normalize = mcolors.Normalize(vmin=min(esp_stngcy_list)-5, vmax=max(esp_stngcy_list))
colormap = cm.Reds

# Most recent lockdown date recorded for Spain.
esp_lockdown_date = lockdown_df2.loc[lockdown_df2.Country_Region == 'Spain'].lockdown_date.max().date()

In [None]:
# Cumulative confirmed cases and fatalities over time.
esp_ax = esp_df3.plot(y=['ConfirmedCases', 'Fatalities'], figsize=(8, 5), legend=False)

# One dashed vertical line per day on which the stringency index rose,
# coloured by the index value.
for ind, date in zip(esp_stngcy_list, esp_date_list):
    esp_ax.axvline(date, color=colormap(esp_normalize(ind)), linestyle="--", lw=1)

# Horizontal guides and labels at the maximum cumulative counts.
esp_total_cases = esp_df3.ConfirmedCases.max()
esp_total_deaths = esp_df3.Fatalities.max()
esp_ax.axhline(esp_total_cases, color='lightgrey', linestyle="--", lw=1)
esp_ax.axhline(esp_total_deaths, color='lightgrey', linestyle="--", lw=1)
esp_ax.annotate(f'Total Confirmed Cases:\n{esp_total_cases}', (esp_df3.index.min(), esp_total_cases),
                xytext=(35, -50), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
esp_ax.annotate(f'Total Fatalities:\n{esp_total_deaths}', (esp_df3.index.min(), esp_total_deaths),
                xytext=(35, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
esp_ax.annotate(f'Last Lockdown:\n{esp_lockdown_date}', (esp_lockdown_date, 0),
                xytext=(-60, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

# setup the colorbar
scalarmappaple = cm.ScalarMappable(norm=esp_normalize, cmap=colormap)
scalarmappaple.set_array(esp_stngcy_list)
# The mappable is not attached to any Axes, so name the Axes the colorbar
# should take space from; recent Matplotlib versions refuse to guess.
cbar = plt.colorbar(scalarmappaple, ax=esp_ax)
cbar.set_label('Lockdown Intensity', rotation=270)
esp_ax.set_ylabel('Cumulative Count of Cases')
esp_ax.set_title('Progression of COVID in Spain')
plt.show()

The timeline of nearby Spain is almost the same as Italy's: Spain also raised its stringency to about 90% at around the same time, on 14 March 2020. Its curve is also flattening, just like Italy's.

### Analyze for Germany

In [None]:
# Germany rows that have stringency data.
ger_df = strngt_tr_df.query("Country_Region=='Germany'")

# Case counts after 2020-03-30 are taken straight from the aggregated case data
# (presumably because the merged stringency rows stop there - verify against the data).
ger_df_n = train_df3.query("Country_Region=='Germany' and Date>'2020-03-30'")

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
ger_df = pd.concat([ger_df, ger_df_n])

# Carry the last known stringency values forward into the appended rows.
ger_df = ger_df.ffill()

# Days on which the stringency index rose.
ger_df2 = strngt_tr_df.query("Country_Region=='Germany' & ind_diff > 0")

ger_date_list = ger_df2.Date.to_list()

ger_stngcy_list = ger_df2.strngcy_ind.to_list()

ger_df3 = ger_df.set_index("Date")

# setup the normalization and the colormap
ger_normalize = mcolors.Normalize(vmin=min(ger_stngcy_list)-5, vmax=max(ger_stngcy_list))
colormap = cm.Reds

# Most recent lockdown date recorded for Germany.
ger_lockdown_date = lockdown_df2.loc[lockdown_df2.Country_Region == 'Germany'].lockdown_date.max().date()

In [None]:
# Cumulative confirmed cases and fatalities over time.
ger_ax = ger_df3.plot(y=['ConfirmedCases', 'Fatalities'], figsize=(8, 5), legend=False)

# One dashed vertical line per day on which the stringency index rose,
# coloured by the index value.
for ind, date in zip(ger_stngcy_list, ger_date_list):
    ger_ax.axvline(date, color=colormap(ger_normalize(ind)), linestyle="--", lw=1)

# Horizontal guides and labels at the maximum cumulative counts.
ger_total_cases = ger_df3.ConfirmedCases.max()
ger_total_deaths = ger_df3.Fatalities.max()
ger_ax.axhline(ger_total_cases, color='lightgrey', linestyle="--", lw=1)
ger_ax.axhline(ger_total_deaths, color='lightgrey', linestyle="--", lw=1)
ger_ax.annotate(f'Total Confirmed Cases:\n{ger_total_cases}', (ger_df3.index.min(), ger_total_cases),
                xytext=(35, -50), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
ger_ax.annotate(f'Total Fatalities:\n{ger_total_deaths}', (ger_df3.index.min(), ger_total_deaths),
                xytext=(35, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
ger_ax.annotate(f'Last Lockdown:\n{ger_lockdown_date}', (ger_lockdown_date, 0),
                xytext=(-60, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

# setup the colorbar
scalarmappaple = cm.ScalarMappable(norm=ger_normalize, cmap=colormap)
scalarmappaple.set_array(ger_stngcy_list)
# The mappable is not attached to any Axes, so name the Axes the colorbar
# should take space from; recent Matplotlib versions refuse to guess.
cbar = plt.colorbar(scalarmappaple, ax=ger_ax)
cbar.set_label('Lockdown Intensity', rotation=270)
ger_ax.set_ylabel('Cumulative Count of Cases')
ger_ax.set_title('Progression of COVID in Germany')
plt.show()

Germany imposed a lockdown on 20 March, when it had about 20k cases. Its number of cases has not increased like Italy's or Spain's. The reason can't be inferred from the plot, but it may have something to do with Germany's medical prowess.

### Analyze for South Korea

In [None]:
# South Korea rows that have stringency data.
kor_df = strngt_tr_df.query("Country_Region=='Korea, South'")

# Case counts after 2020-03-30 are taken straight from the aggregated case data
# (presumably because the merged stringency rows stop there - verify against the data).
kor_df_n = train_df3.query("Country_Region=='Korea, South' and Date>'2020-03-30'")

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
kor_df = pd.concat([kor_df, kor_df_n])

# Carry the last known stringency values forward into the appended rows.
kor_df = kor_df.ffill()

# Days on which the stringency index rose.
kor_df2 = strngt_tr_df.query("Country_Region=='Korea, South' & ind_diff > 0")

kor_date_list = kor_df2.Date.to_list()

kor_stngcy_list = kor_df2.strngcy_ind.to_list()

kor_df3 = kor_df.set_index("Date")

# setup the normalization and the colormap
kor_normalize = mcolors.Normalize(vmin=min(kor_stngcy_list)-5, vmax=max(kor_stngcy_list))
colormap = cm.Reds

# Most recent lockdown date recorded for South Korea.
kor_lockdown_date = lockdown_df2.loc[lockdown_df2.Country_Region == 'Korea, South'].lockdown_date.max().date()

In [None]:
# Cumulative confirmed cases and fatalities over time.
kor_ax = kor_df3.plot(y=['ConfirmedCases', 'Fatalities'], figsize=(8, 5), legend=False)

# One dashed vertical line per day on which the stringency index rose,
# coloured by the index value.
for ind, date in zip(kor_stngcy_list, kor_date_list):
    kor_ax.axvline(date, color=colormap(kor_normalize(ind)), linestyle="--", lw=1)

# Horizontal guides and labels at the maximum cumulative counts.
kor_total_cases = kor_df3.ConfirmedCases.max()
kor_total_deaths = kor_df3.Fatalities.max()
kor_ax.axhline(kor_total_cases, color='lightgrey', linestyle="--", lw=1)
kor_ax.axhline(kor_total_deaths, color='lightgrey', linestyle="--", lw=1)
kor_ax.annotate(f'Total Confirmed Cases:\n{kor_total_cases}', (kor_df3.index.min(), kor_total_cases),
                xytext=(35, -50), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
kor_ax.annotate(f'Total Fatalities:\n{kor_total_deaths}', (kor_df3.index.min(), kor_total_deaths),
                xytext=(35, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))
kor_ax.annotate(f'Last Lockdown:\n{kor_lockdown_date}', (kor_lockdown_date, 0),
                xytext=(40, 20), textcoords='offset points', arrowprops=dict(arrowstyle='->'))

# setup the colorbar
scalarmappaple = cm.ScalarMappable(norm=kor_normalize, cmap=colormap)
scalarmappaple.set_array(kor_stngcy_list)
# The mappable is not attached to any Axes, so name the Axes the colorbar
# should take space from; recent Matplotlib versions refuse to guess.
cbar = plt.colorbar(scalarmappaple, ax=kor_ax)
cbar.set_label('Lockdown Intensity', rotation=270)
kor_ax.set_ylabel('Cumulative Count of Cases')
kor_ax.set_title('Progression of COVID in South Korea')
plt.show()

#### South Korea imposed its lockdown quite early, on 23 Feb 2020, when the number of cases was about 500. After that there was an exponential rise in cases, but the curve then flattened. The reason may be the effectiveness of South Korea's medical response.

## Analysing <font color="turquoise">Number of COVID Tests performed</font> vs <font color="yellow">Number of Confirmed Cases</font>

Rapid testing is an important step in identifying infected people. Let's see how the number of confirmed cases has grown as the number of tests has increased.

In [None]:
tests_perf_df.head()

In [None]:
tests_perf_df.rename({"Entity":'Country_Region'}, axis=1, inplace=True) #rename country

tests_perf_df2 = tests_perf_df.loc[~tests_perf_df.Country_Region.str.contains('specimens')] #repair for US 

tests_perf_df2 = tests_perf_df2.loc[~tests_perf_df2.Country_Region.str.contains('India - samples')] #repair for India

tests_perf_df2.Country_Region = tests_perf_df2.Country_Region.apply(lambda x: x.split(' - ')[0]) # select country only

### Analyse for India

In [None]:
# Work on an independent copy of the India rows: dropping and assigning columns
# on the raw query result mutates a slice and raises SettingWithCopyWarning.
ind_test_df = tests_perf_df2.query("Country_Region == 'India'").copy()

ind_test_df = ind_test_df.drop(columns=['Source URL', 'Source label', 'Notes'])

ind_test_df['Date'] = pd.to_datetime(ind_test_df['Date'])

# Left merge: every test-count row is kept, with case counts where available.
ind_test_df2 = ind_test_df.merge(train_df3, on=['Country_Region', 'Date'], how='left')  # merge with main data

In [None]:
# Daily figures are the row-to-row differences of the cumulative columns.
# Where the difference is undefined (e.g. the first row), fall back to the
# cumulative value of that same row.
ind_cum_tests = ind_test_df2['Cumulative total']
ind_cum_cases = ind_test_df2['ConfirmedCases']

ind_test_df2['daily_tests'] = ind_cum_tests.diff().fillna(ind_cum_tests)
ind_test_df2['daily_cases'] = ind_cum_cases.diff().fillna(ind_cum_cases)

# Share of the day's tests that came back as confirmed cases, in percent.
ind_test_df2['percent_cases_conf_to_tested'] = (
    ind_test_df2['daily_cases'].div(ind_test_df2['daily_tests']).mul(100)
)

In [None]:
# 2x2 grid: left column = daily figures, right column = cumulative figures;
# top row = confirmed cases, bottom row = tests performed.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
ax[0, 0].bar(ind_test_df2.Date, ind_test_df2.daily_cases)
ax[0, 0].plot(ind_test_df2.Date, ind_test_df2.daily_cases, marker='o')
ax[1, 0].bar(ind_test_df2.Date, ind_test_df2.daily_tests, color='orange')
ax[1, 0].plot(ind_test_df2.Date, ind_test_df2.daily_tests, color='orange', marker='o')
fig.autofmt_xdate(rotation=30)
# Use a real timestamp for the x coordinate: a bare string on a date axis only
# works while pandas' matplotlib converters happen to be registered.
ax[0, 0].annotate('No data available here', (pd.Timestamp('2020-04-03'), 0), xytext=(-60, -25),
                  textcoords='offset points',
                  arrowprops=dict(arrowstyle='->')
                  )
ax[0, 0].set_ylabel('Daily Confirmed Cases')
ax[1, 0].set_ylabel('Daily Tests Performed')
ax[1, 0].set_xlabel('Date')
ax[0, 0].set_title('Daily trend of Confirmed Cases vs Tests Performed in India')

ax[0, 1].plot(ind_test_df2.Date, ind_test_df2.ConfirmedCases, marker='o')
ax[1, 1].plot(ind_test_df2.Date, ind_test_df2['Cumulative total'], marker='o', color='orange')
ax[0, 1].set_title('Cumulative trend of Confirmed Cases vs Tests Performed in India')
ax[1, 1].set_xlabel('Date')

plt.subplots_adjust()

plt.show()

We can see that as more people are tested for COVID, its presence is confirmed in more people, which is one reason for the increase in confirmed cases.

### Analyse for US

In [None]:
# Work on an independent copy of the US rows: dropping and assigning columns
# on the raw query result mutates a slice and raises SettingWithCopyWarning.
us_test_df = tests_perf_df2.query("Country_Region == 'United States'").copy()

us_test_df = us_test_df.drop(columns=['Source URL', 'Source label', 'Notes'])

us_test_df['Date'] = pd.to_datetime(us_test_df['Date'])

# Every remaining row is 'United States'; use the spelling of the case data.
us_test_df['Country_Region'] = 'US'

us_test_df2 = us_test_df.merge(train_df3, on=['Country_Region', 'Date'], how='left')  # merge with main data

# Keep only rows without any missing value.
us_test_df2 = us_test_df2.dropna()

us_test_df2['Date'] = us_test_df2['Date'].dt.date  # take only date

# Find daily cases and daily test counts as differences of the cumulative
# columns; where the difference is undefined (first row) use the cumulative
# value of that row.
us_test_df2['daily_tests'] = us_test_df2['Cumulative total'].diff().fillna(us_test_df2['Cumulative total'])

us_test_df2['daily_cases'] = us_test_df2['ConfirmedCases'].diff().fillna(us_test_df2['ConfirmedCases'])

In [None]:
# 2x2 grid: left column = daily figures, right column = cumulative figures;
# top row = confirmed cases, bottom row = tests performed.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
(us_daily_cases_ax, us_cum_cases_ax), (us_daily_tests_ax, us_cum_tests_ax) = ax
us_dates = us_test_df2.Date

us_daily_cases_ax.bar(us_dates, us_test_df2.daily_cases)
us_daily_cases_ax.plot(us_dates, us_test_df2.daily_cases, marker='o')
us_daily_tests_ax.bar(us_dates, us_test_df2.daily_tests, color='orange')
us_daily_tests_ax.plot(us_dates, us_test_df2.daily_tests, color='orange', marker='o')
fig.autofmt_xdate(rotation=30)
us_daily_cases_ax.set_ylabel('Confirmed Cases')
us_daily_tests_ax.set_ylabel('Tests Performed')
us_daily_tests_ax.set_xlabel('Date')
us_daily_cases_ax.set_title('Daily trend of Confirmed Cases vs Tests Performed in US')

us_cum_cases_ax.plot(us_dates, us_test_df2.ConfirmedCases, marker='o')
us_cum_tests_ax.plot(us_dates, us_test_df2['Cumulative total'], marker='o', color='orange')
us_cum_cases_ax.set_title('Cumulative trend of Confirmed Cases vs Tests Performed in US')
us_cum_tests_ax.set_xlabel('Date')
plt.subplots_adjust()
plt.show()

In the US too, more tests go together with more confirmed cases. Here we can see that out of about 2.5 million tests, about 500k people were found to be infected, i.e. a confirmation rate of roughly 20% (about 20 people found positive for every 100 people tested). This is a very high infection rate.

### Analyse for Italy

In [None]:
# Work on an independent copy of the Italy rows: dropping and assigning columns
# on the raw query result mutates a slice and raises SettingWithCopyWarning.
ita_test_df = tests_perf_df2.query("Country_Region == 'Italy'").copy()

ita_test_df = ita_test_df.drop(columns=['Source URL', 'Source label', 'Notes'])

ita_test_df['Date'] = pd.to_datetime(ita_test_df['Date'])

ita_test_df2 = ita_test_df.merge(train_df3, on=['Country_Region', 'Date'], how='left')  # merge with main data

# Keep only rows without any missing value.
ita_test_df2 = ita_test_df2.dropna()

ita_test_df2['Date'] = ita_test_df2['Date'].dt.date  # take only date

# Find daily cases and daily test counts as differences of the cumulative
# columns; where the difference is undefined (first row) use the cumulative
# value of that row.
ita_test_df2['daily_tests'] = ita_test_df2['Cumulative total'].diff().fillna(ita_test_df2['Cumulative total'])

ita_test_df2['daily_cases'] = ita_test_df2['ConfirmedCases'].diff().fillna(ita_test_df2['ConfirmedCases'])

In [None]:
# 2x2 grid: left column = daily figures, right column = cumulative figures;
# top row = confirmed cases, bottom row = tests performed.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
(ita_daily_cases_ax, ita_cum_cases_ax), (ita_daily_tests_ax, ita_cum_tests_ax) = ax
ita_dates = ita_test_df2.Date

ita_daily_cases_ax.bar(ita_dates, ita_test_df2.daily_cases)
ita_daily_cases_ax.plot(ita_dates, ita_test_df2.daily_cases, marker='o')
ita_daily_tests_ax.bar(ita_dates, ita_test_df2.daily_tests, color='orange')
ita_daily_tests_ax.plot(ita_dates, ita_test_df2.daily_tests, color='orange', marker='o')
fig.autofmt_xdate(rotation=30)
ita_daily_cases_ax.set_ylabel('Daily Confirmed Cases')
ita_daily_tests_ax.set_ylabel('Daily Tests Performed')
ita_daily_tests_ax.set_xlabel('Date')
ita_daily_cases_ax.set_title('Daily trend of Confirmed Cases vs Tests Performed in Italy')

ita_cum_cases_ax.plot(ita_dates, ita_test_df2.ConfirmedCases, marker='o')
ita_cum_tests_ax.plot(ita_dates, ita_test_df2['Cumulative total'], marker='o', color='orange')
ita_cum_cases_ax.set_title('Cumulative trend of Confirmed Cases vs Tests Performed in Italy')
ita_cum_tests_ax.set_xlabel('Date')
plt.subplots_adjust()
plt.show()

In Italy, looking at the center of the plot, we can infer that COVID was spreading fastest during the week of March 22. The ratio of confirmed cases to tests is approximately 30:100.

### Analyse for South Korea

In [None]:
# Work on an independent copy of the South Korea rows: dropping and assigning
# columns on the raw query result mutates a slice and raises SettingWithCopyWarning.
kor_test_df = tests_perf_df2.query("Country_Region == 'South Korea'").copy()

kor_test_df = kor_test_df.drop(columns=['Source URL', 'Source label', 'Notes'])

kor_test_df['Date'] = pd.to_datetime(kor_test_df['Date'])

# Every remaining row is 'South Korea'; use the spelling of the case data.
kor_test_df['Country_Region'] = 'Korea, South'

kor_test_df2 = kor_test_df.merge(train_df3, on=['Country_Region', 'Date'], how='left')  # merge with main data

# Keep only rows without any missing value.
kor_test_df2 = kor_test_df2.dropna()

kor_test_df2['Date'] = kor_test_df2['Date'].dt.date  # take only date

# Find daily cases and daily test counts as differences of the cumulative
# columns; where the difference is undefined (first row) use the cumulative
# value of that row.
kor_test_df2['daily_tests'] = kor_test_df2['Cumulative total'].diff().fillna(kor_test_df2['Cumulative total'])

kor_test_df2['daily_cases'] = kor_test_df2['ConfirmedCases'].diff().fillna(kor_test_df2['ConfirmedCases'])

In [None]:
# 2x2 grid: left column = daily figures, right column = cumulative figures;
# top row = confirmed cases, bottom row = tests performed.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
(kor_daily_cases_ax, kor_cum_cases_ax), (kor_daily_tests_ax, kor_cum_tests_ax) = ax
kor_dates = kor_test_df2.Date

kor_daily_cases_ax.bar(kor_dates, kor_test_df2.daily_cases)
kor_daily_cases_ax.plot(kor_dates, kor_test_df2.daily_cases, marker='o')
kor_daily_tests_ax.bar(kor_dates, kor_test_df2.daily_tests, color='orange')
kor_daily_tests_ax.plot(kor_dates, kor_test_df2.daily_tests, color='orange', marker='o')
fig.autofmt_xdate(rotation=30)
kor_daily_cases_ax.set_ylabel('Daily Confirmed Cases')
kor_daily_tests_ax.set_ylabel('Daily Tests Performed')
kor_daily_tests_ax.set_xlabel('Date')
kor_daily_cases_ax.set_title('Daily trend of Confirmed Cases vs Tests Performed in South Korea')

kor_cum_cases_ax.plot(kor_dates, kor_test_df2.ConfirmedCases, marker='o')
kor_cum_tests_ax.plot(kor_dates, kor_test_df2['Cumulative total'], marker='o', color='orange')
kor_cum_cases_ax.set_title('Cumulative trend of Confirmed Cases vs Tests Performed in South Korea')
kor_cum_tests_ax.set_xlabel('Date')
plt.subplots_adjust()
plt.show()

At the start of the COVID outbreak in South Korea, people were tested on a large scale, which is why both plots peak at the same place. After medical and other strict precautions were taken, the number of cases fell even though the number of tests stayed about the same.

# SEIR-HCD Model
This is a working example of a [SEIR](https://en.wikipedia.org/wiki/Compartmental_models_in_epidemiology#The_SEIR_model) model with added compartments for HCD. The letters stand for:
* Susceptible
* Exposed
* Infected
* Recovered
* Hospitalized
* Critical
* Death

This kernel is forked from @datasaurus' great kernel on SEIR-HCD. 
The equations are adapted from these web apps:
* http://gabgoh.github.io/COVID/index.html
* https://neherlab.org/covid19

## Parameters used in the model
`R_t` = reproduction number at time t. Typical 3.6* at t=0

**Transition times**
* `T_inc` = average incubation period. Typical 5.6* days
* `T_inf` = average infectious period. Typical 2.9 days
* `T_hosp` = average time a patient is in hospital before either recovering or becoming critical. Typical 4 days
* `T_crit` = average time a patient is in a critical state (either recover or die). Typical 14 days

**Fractions**
These constants are likely to be age specific (hence the subscript a):
* `m_a` = fraction of infections that are asymptomatic or mild. Assumed 80% (i.e. 20% severe)
* `c_a` = fraction of severe cases that turn critical. Assumed 10%
* `f_a` = fraction of critical cases that are fatal. Assumed 30%

*Averages taken from https://www.kaggle.com/covid-19-contributions

In [None]:
# Susceptible equation
def dS_dt(S, I, R_t, t_inf):
    """Rate of change of the susceptible compartment.

    Susceptibles are lost at a rate proportional to the product of the
    susceptible and infected compartments, scaled by ``R_t / t_inf``.
    """
    transmission_rate = R_t / t_inf
    return -transmission_rate * I * S


# Exposed compartment
def dE_dt(S, E, I, R_t, t_inf, t_inc):
    """Rate of change of the exposed compartment.

    :param S: Current susceptible value
    :param E: Current exposed value
    :param I: Current infected value
    :param R_t: Reproduction number at this time
    :param t_inf: Average infectious period
    :param t_inc: Average incubation period
    :return: New exposures ``(R_t / t_inf) * I * S`` minus the outflow ``E / t_inc``
    """
    newly_exposed = (R_t / t_inf) * I * S
    leaving_incubation = E / t_inc
    return newly_exposed - leaving_incubation


# Infected compartment
def dI_dt(I, E, t_inc, t_inf):
    """Rate of change of the infected compartment.

    :param I: Current infected value
    :param E: Current exposed value
    :param t_inc: Average incubation period
    :param t_inf: Average infectious period
    :return: Inflow from the exposed compartment ``E / t_inc`` minus the outflow ``I / t_inf``
    """
    becoming_infected = E / t_inc
    leaving_infected = I / t_inf
    return becoming_infected - leaving_infected


# Hospitalized compartment
def dH_dt(I, C, H, t_inf, t_hosp, t_crit, m_a, f_a):
    """Rate of change of the hospitalized compartment.

    :param I: Current infected value
    :param C: Current critical value
    :param H: Current hospitalized value
    :param t_inf: Average infectious period
    :param t_hosp: Average time in hospital
    :param t_crit: Average time in a critical state
    :param m_a: Fraction of infections that are asymptomatic or mild
    :param f_a: Fraction of critical cases that are fatal
    :return: Severe infections entering, plus non-fatal critical cases returning,
        minus the outflow ``H / t_hosp``
    """
    severe_inflow = (1 - m_a) * (I / t_inf)
    surviving_critical = (1 - f_a) * C / t_crit
    hospital_outflow = H / t_hosp
    return severe_inflow + surviving_critical - hospital_outflow


# Critical compartment
def dC_dt(H, C, t_hosp, t_crit, c_a):
    """Rate of change of the critical compartment.

    :param H: Current hospitalized value
    :param C: Current critical value
    :param t_hosp: Average time in hospital
    :param t_crit: Average time in a critical state
    :param c_a: Fraction of severe cases that turn critical
    :return: Inflow ``c_a * H / t_hosp`` minus the outflow ``C / t_crit``
    """
    turning_critical = c_a * H / t_hosp
    leaving_critical = C / t_crit
    return turning_critical - leaving_critical


# Recovered compartment
def dR_dt(I, H, t_inf, t_hosp, m_a, c_a):
    """Rate of change of the recovered compartment.

    :param I: Current infected value
    :param H: Current hospitalized value
    :param t_inf: Average infectious period
    :param t_hosp: Average time in hospital
    :param m_a: Fraction of infections that are asymptomatic or mild
    :param c_a: Fraction of severe cases that turn critical
    :return: Mild infections recovering ``m_a * I / t_inf`` plus hospitalized
        cases that do not turn critical ``(1 - c_a) * (H / t_hosp)``
    """
    mild_recoveries = m_a * I / t_inf
    hospital_recoveries = (1 - c_a) * (H / t_hosp)
    return mild_recoveries + hospital_recoveries


# Deaths compartment
def dD_dt(C, t_crit, f_a):
    """Rate of change of the deaths compartment.

    :param C: Current critical value
    :param t_crit: Average time in a critical state
    :param f_a: Fraction of critical cases that are fatal
    :return: ``f_a * C / t_crit``
    """
    fatal_critical = f_a * C
    return fatal_critical / t_crit


def SEIR_HCD_model(t, y, R_t, t_inc=2.9, t_inf=5.2, t_hosp=4, t_crit=14, m_a=0.8, c_a=0.1, f_a=0.3):
    """
    Right-hand side of the SEIR-HCD system, in the ``fun(t, y, *args)`` form used with solve_ivp.

    NOTE(review): the defaults for ``t_inc`` (2.9) and ``t_inf`` (5.2) look swapped
    relative to the typical incubation / infectious periods quoted in this notebook.
    Every call in this notebook passes both values explicitly, so the defaults are
    left untouched here - confirm before relying on them.

    :param t: Time step for solve_ivp
    :param y: Previous solution or initial values, ordered S, E, I, R, H, C, D
    :param R_t: Reproduction number; either a constant or a callable evaluated as ``R_t(t)``
    :param t_inc: Average incubation period. Default 2.9 days
    :param t_inf: Average infectious period. Default 5.2 days
    :param t_hosp: Average time a patient is in hospital before either recovering or becoming critical. Default 4 days
    :param t_crit: Average time a patient is in a critical state (either recover or die). Default 14 days
    :param m_a: Fraction of infections that are asymptomatic or mild. Default 0.8
    :param c_a: Fraction of severe cases that turn critical. Default 0.1
    :param f_a: Fraction of critical cases that are fatal. Default 0.3
    :return: List of derivatives in the same order as ``y``: [dS, dE, dI, dR, dH, dC, dD]
    """
    # A callable R_t gives a time-varying reproduction number; anything else is used as-is
    reprod = R_t(t) if callable(R_t) else R_t

    S, E, I, R, H, C, D = y

    return [
        dS_dt(S, I, reprod, t_inf),
        dE_dt(S, E, I, reprod, t_inf, t_inc),
        dI_dt(I, E, t_inc, t_inf),
        dR_dt(I, H, t_inf, t_hosp, m_a, c_a),
        dH_dt(I, C, H, t_inf, t_hosp, t_crit, m_a, f_a),
        dC_dt(H, C, t_hosp, t_crit, c_a),
        dD_dt(C, t_crit, f_a),
    ]

In [None]:
def plot_model(solution, title='SEIR+HCD model'):
    """Plot a solved SEIR-HCD model on a two-panel figure.

    The left panel shows every compartment; the right panel shows the total
    cases (infected + recovered + hospitalised + critical + deceased) with the
    deceased curve on a twin y-axis.

    :param solution: Object whose ``y`` attribute unpacks into the seven
        compartment series in the order S, E, I, R, H, C, D
    :param title: Figure title
    """
    sus, exp, inf, rec, hosp, crit, death = solution.y
    cases = inf + rec + hosp + crit + death

    fig, (compartment_ax, cases_ax) = plt.subplots(1, 2, figsize=(16,5))
    fig.suptitle(title)

    # (series, line format, legend label), drawn in this order
    compartment_curves = (
        (sus, 'tab:blue', 'Susceptible'),
        (exp, 'tab:orange', 'Exposed'),
        (inf, 'tab:red', 'Infected'),
        (rec, 'tab:green', 'Recovered'),
        (hosp, 'tab:purple', 'Hospitalised'),
        (crit, 'tab:brown', 'Critical'),
        (death, 'tab:cyan', 'Deceased'),
    )
    for series, line_format, legend_label in compartment_curves:
        compartment_ax.plot(series, line_format, label=legend_label)

    compartment_ax.set_xlabel("Days", fontsize=10)
    compartment_ax.set_ylabel("Fraction of population", fontsize=10)
    compartment_ax.legend(loc='best')

    cases_ax.plot(cases, 'tab:red', label='Cases')
    cases_ax.set_xlabel("Days", fontsize=10)
    cases_ax.set_ylabel("Fraction of population (Cases)", fontsize=10, color='tab:red')

    # Fatalities get their own y-axis scale on the right-hand panel
    fatalities_ax = cases_ax.twinx()
    fatalities_ax.plot(death, 'tab:cyan', label='Deceased')
    fatalities_ax.set_xlabel("Days", fontsize=10)
    fatalities_ax.set_ylabel("Fraction of population (Fatalities)", fontsize=10, color='tab:cyan')


# Model without intervention
Let's see what the model looks like without any intervention, i.e. `R_0` is a constant value

In [None]:
N = 100000  # Population size
n_infected = 1  # Number of infected people at time = 0
max_days = 100  # Length of the simulation in days

# State at time = 0 for SEIR_HCD model
# The values are the fraction of the population in each of the SEIRHCD compartments
# (order: S, E, I, R, H, C, D), so everyone starts susceptible except the initial infected
initial_state = [(N - n_infected)/ N, 0, n_infected / N, 0, 0, 0, 0]

# Model parameters (see "Parameters used in the model" above)
R_0 = 3.6
t_inc = 5.6
t_inf = 2.9
t_hosp = 4
t_crit = 14
m_a = 0.8
c_a = 0.1
f_a = 0.3

# Extra positional arguments forwarded to SEIR_HCD_model after (t, y)
args = (R_0, t_inc, t_inf, t_hosp, t_crit, m_a, c_a, f_a)

# Integrate over [0, max_days] and sample the solution once per day
sol = solve_ivp(SEIR_HCD_model, [0, max_days], initial_state, args=args, t_eval=np.arange(max_days))

plot_model(sol, 'SEIR-HCD Model (without intervention)')

# Model with intervention
Let's assume that there is some intervention that causes the reproduction number (`R_0`) to fall to a lower value (`R_t`) at a certain time (e.g. physical distancing). Note that the actual drop will occur some time after the intervention measures are implemented.

This could be modified to take any function of `R_t(t)` values to model the reproduction number as a time varying variable

In [None]:
R_0 = 3.6  # reproduction number without intervention
R_t = 0.7  # reproduction number after intervention
intervention_day = 45  # day after which the lower reproduction number applies


def time_varying_reproduction(t):
    """Step function: R_0 up to and including intervention_day, R_t afterwards."""
    return R_t if t > intervention_day else R_0


# Same parameters as the run without intervention, but with a callable reproduction number
args = (time_varying_reproduction, t_inc, t_inf, t_hosp, t_crit, m_a, c_a, f_a)

sol2 = solve_ivp(
    SEIR_HCD_model,
    [0, max_days],
    initial_state,
    args=args,
    t_eval=np.arange(max_days),
)

plot_model(sol2, f'SEIR-HCD Model (with intervention on day {intervention_day})')

Let's compare the infection rate between the two cases

In [None]:
# Unpack the compartment series of both runs: `sol` (no intervention) and `sol2` (with intervention)
sus, exp, inf, rec, hosp, crit, deaths = sol.y
sus2, exp2, inf2, rec2, hosp2, crit2, deaths2 = sol2.y

f = plt.figure(figsize=(8,5)) 
# Dotted lines: run without intervention
# plt.plot(exp, 'tab:orange', label='Exposed', linestyle=':');
plt.plot(inf, 'r', label='Infected', linestyle=':');
plt.plot(deaths, 'b', label='Deceased', linestyle=':');
plt.plot(hosp, 'tab:purple', label='Hospitalised', linestyle=':');
# Solid lines (same colours): run with intervention
# plt.plot(exp2, 'tab:orange', label='Exposed with intervention');
plt.plot(inf2, 'r', label='Infected with intervention');
plt.plot(deaths2, 'b', label='Deceased with intervention');
plt.plot(hosp2, 'tab:purple', label='Hospitalised with intervention');

plt.title(f'Comparison of the effect of the intervention on day {intervention_day}')
plt.xlabel("Days", fontsize=10);
plt.ylabel("Fraction of population", fontsize=10);
plt.legend(loc='best');

You can see that with the intervention on day 45, the peak number of infections is lower than without any intervention, and there are fewer than half as many deaths. This chart shows how powerful self-isolation is.

# Fitting the model to data
There are certain variables that we can play with to fit the model to real data:
* Average incubation period, `t_inc`
* Average infection period, `t_inf`
* Average hospitalization period, `t_hosp`
* Average critical period, `t_crit`
* The fraction of mild/asymptomatic cases, `m_a`
* The fraction of severe cases that turn critical, `c_a`
* The fraction of critical cases that result in a fatality, `f_a`
* Reproduction number, `R_0` or `R_t`

Some of these are likely to be constants specific to the virus, while others are likely to be time-dependent variables that depend on factors such as:
* When a government intervened
* People's behaviours (do people actively self-isolate, not visit religious shrines etc.)
* Population demographic of a country (is a significant proportion of the population old?). This is the `a` subscript
* Healthcare system capacity (hospital beds per capita)
* Number of testing kits available

We have already used two different reproduction numbers above. Let's see if we can derive a time-dependent `R_t` from the data. We will also try and optimize a handful of the parameters above to match the data.

We will also compare this to just using a single reproduction number. This might actually be more suitable in countries where the outbreak has just started or where they are struggling to limit the spread.

There are lots of ways to decay a parameter in epidemiology. I'm going to use a Hill decay, which has 2 parameters, `k` and `L` (the half decay constant):

In [None]:
# Display the reference chart of decay functions (fetched from the OpenMalaria wiki)
Image(url= "https://raw.githubusercontent.com/wiki/SwissTPH/openmalaria/img/graphs/decay-functions.png")

In [None]:
# Cut-off date used to split the test set: rows on or before it go to `test_public`,
# rows after it go to `test_private`
DATE_BORDER = '2020-04-14'

In [None]:
# train3 = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/train.csv', parse_dates=['Date'])

# test3 = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-3/test.csv', parse_dates=['Date'])

In [None]:
# data_path = Path('/kaggle/input/covid19-global-forecasting-week-4/')

# train = pd.read_csv(data_path / 'train.csv', parse_dates=['Date'])
# test = pd.read_csv(data_path /'test.csv', parse_dates=['Date'])
# submission = pd.read_csv(data_path /'submission.csv', index_col=['ForecastId'])

In [None]:
# Load the population data; it is turned into country / province lookup dicts further down.
# The columns used later in this notebook are Name, Population and Type.
pop_info = pd.read_csv('/kaggle/input/covid19-population-data/population_data.csv')

In [None]:
# Work on copies so the frames loaded at the top of the notebook are not modified
train = train_df.copy()
test = test_df.copy()

In [None]:
# Convert the Date columns to datetimes so rows can be compared and split by date
train.Date = pd.to_datetime(train.Date)
test.Date = pd.to_datetime(test.Date)

In [None]:
# 'Area' is the province/state when one is given, otherwise the country/region
train['Area'] = train['Province_State'].fillna(train['Country_Region'])
test['Area'] = test['Province_State'].fillna(test['Country_Region'])

In [None]:
# https://www.kaggle.com/c/covid19-global-forecasting-week-1/discussion/139172
# Replace each count with the running maximum within its Area, so the cumulative
# series never decreases from one row to the next
train['ConfirmedCases'] = train.groupby('Area')['ConfirmedCases'].cummax()
train['Fatalities'] = train.groupby('Area')['Fatalities'].cummax()

In [None]:
# Remove the leaking data: train rows dated on or after the first test date overlap the
# test period, so they are moved into `valid`. `train_full` keeps everything.
train_full = train.copy()
valid = train[train['Date'] >= test['Date'].min()]
train = train[train['Date'] < test['Date'].min()]

# Split the test into public & private (DATE_BORDER itself belongs to the public part)
test_public = test[test['Date'] <= DATE_BORDER]
test_private = test[test['Date'] > DATE_BORDER]

Some of the "Area" values are not present in the population dataset, so they have to be added manually.

In [None]:
# Areas in the public test set that have no matching Name in the population table
set(test_public.Area.unique()).difference(set(pop_info.Name))

In [None]:
# Add population figures for the areas that are missing from the population dataset.
# DataFrame.append was removed in pandas 2.0, so the extra rows are collected in one
# frame and attached with a single pd.concat (row order matches the original appends).
pop_info = pd.concat(
    [
        pop_info,
        pd.DataFrame([
            {'Name': 'Bonaire, Sint Eustatius and Saba', 'Population': 25157, 'Type': 'Province/State'},
            {'Name': 'Falkland Islands (Malvinas)', 'Population': 3398, 'Type': 'Province/State'},
            {'Name': 'Malawi', 'Population': 18143217, 'Type': 'Country/Region'},
            {'Name': 'Saint Pierre and Miquelon', 'Population': 6008, 'Type': 'Province/State'},
            {'Name': 'Sao Tome and Principe', 'Population': 211028, 'Type': 'Province/State'},
            {'Name': 'Western Sahara', 'Population': 567402, 'Type': 'Province/State'},
            {'Name': 'South Sudan', 'Population': 10975927, 'Type': 'Country/Region'},
        ]),
    ],
    ignore_index=True,
)

In [None]:
# Split the population table by Type and build Name -> Population lookup dicts
country_pop = pop_info.query('Type == "Country/Region"')
province_pop = pop_info.query('Type == "Province/State"')
country_lookup = dict(zip(country_pop['Name'], country_pop['Population']))
province_lookup = dict(zip(province_pop['Name'], province_pop['Population']))

In [None]:
# Fix the Georgia State/Country confusion - probably a better way of doing this :)
# NOTE(review): 'Area' was derived from Province_State in an earlier cell, so this rename
# does not change the 'Area' values that are later used as the index and lookup key -
# confirm whether 'Area' should be rebuilt after the rename.
train['Province_State'] = train['Province_State'].replace('Georgia', 'Georgia (State)')
test['Province_State'] = test['Province_State'].replace('Georgia', 'Georgia (State)')
# Make the state's population available under the renamed key as well
province_lookup['Georgia (State)'] = province_lookup['Georgia']

In [None]:
# Use a multi-index for easier slicing: frame.loc[area_name] then returns that
# area's rows indexed by Date
train_full.set_index(['Area', 'Date'], inplace=True)
train.set_index(['Area', 'Date'], inplace=True)
valid.set_index(['Area', 'Date'], inplace=True)
test_public.set_index(['Area', 'Date'], inplace=True)
test_private.set_index(['Area', 'Date'], inplace=True)

# Start every forecast at 0; the fitting functions overwrite the rows they predict
submission['ConfirmedCases'] = 0
submission['Fatalities'] = 0

# Display the shapes as a quick sanity check
train_full.shape, train.shape, valid.shape, test_public.shape, test_private.shape, submission.shape

The function below evaluates a model with a constant `R` number as well as `t_hosp`, `t_crit`, `m`, `c`, `f`

In [None]:
# Number of most recent days of data scored by the optimisation objective
# (fewer are used when an area has less data than this)
OPTIM_DAYS = 21  # Number of days to use for the optimisation evaluation

In [None]:
# Use a constant reproduction number
def eval_model_const(params, data, population, return_solution=False, forecast_days=0):
    """Score a SEIR-HCD model with a constant reproduction number against observed data.

    The model is started from the first row of ``data`` and integrated for
    ``len(data) + forecast_days`` days with fixed ``t_inc=5.6`` and ``t_inf=2.9``.
    The score is the mean of the weighted MSLE for confirmed cases and for
    fatalities over the last ``min(OPTIM_DAYS, len(data))`` observed days.

    :param params: Sequence ``(R_0, t_hosp, t_crit, m, c, f)``
    :param data: Frame with 'ConfirmedCases' and 'Fatalities' columns, one row per day
    :param population: Population of the area; scales model fractions to head counts
    :param return_solution: If True, also return the solve_ivp solution
    :param forecast_days: Extra days to simulate beyond the observed data
    :return: The score, or ``(score, solution)`` when ``return_solution`` is True
    :raises IndexError: If ``data`` has no rows
    """
    R_0, t_hosp, t_crit, m, c, f = params
    n_days = len(data)
    n_infected = data['ConfirmedCases'].iloc[0]
    max_days = n_days + forecast_days
    initial_state = [(population - n_infected) / population, 0, n_infected / population, 0, 0, 0, 0]
    args = (R_0, 5.6, 2.9, t_hosp, t_crit, m, c, f)

    sol = solve_ivp(SEIR_HCD_model, [0, max_days], initial_state, args=args, t_eval=np.arange(0, max_days))

    sus, exp, inf, rec, hosp, crit, deaths = sol.y

    # Keep only the modelled days that overlap the observed data. Without this, a
    # non-zero forecast_days would make the tail slices below compare forecast days
    # against the last observed days.
    y_pred_cases = (np.clip(inf + rec + hosp + crit + deaths, 0, np.inf) * population)[:n_days]
    y_true_cases = data['ConfirmedCases'].values
    y_pred_fat = (np.clip(deaths, 0, np.inf) * population)[:n_days]
    y_true_fat = data['Fatalities'].values

    optim_days = min(OPTIM_DAYS, n_days)  # Days to optimise for
    weights = 1 / np.arange(1, optim_days + 1)[::-1]  # Recent data is more heavily weighted

    # sample_weight is keyword-only in current scikit-learn releases
    msle_cases = mean_squared_log_error(y_true_cases[-optim_days:], y_pred_cases[-optim_days:],
                                        sample_weight=weights)
    msle_fat = mean_squared_log_error(y_true_fat[-optim_days:], y_pred_fat[-optim_days:],
                                      sample_weight=weights)

    msle_final = np.mean([msle_cases, msle_fat])

    if return_solution:
        return msle_final, sol
    return msle_final

The function below is essentially the same as above, but R is decayed using a Hill decay function. This model requires 2 additional parameters to be optimized, `k` & `L`

In [None]:
# Use a Hill decayed reproduction number
def eval_model_decay(params, data, population, return_solution=False, forecast_days=0):
    """Score a SEIR-HCD model with a Hill-decayed reproduction number against observed data.

    Same procedure as the constant-R evaluation, except that the reproduction
    number at time ``t`` is ``R_0 / (1 + (t / L) ** k)``. The model is integrated
    for ``len(data) + forecast_days`` days with fixed ``t_inc=5.6`` and
    ``t_inf=2.9``; the score is the mean of the weighted MSLE for confirmed cases
    and for fatalities over the last ``min(OPTIM_DAYS, len(data))`` observed days.

    :param params: Sequence ``(R_0, t_hosp, t_crit, m, c, f, k, L)``
    :param data: Frame with 'ConfirmedCases' and 'Fatalities' columns, one row per day
    :param population: Population of the area; scales model fractions to head counts
    :param return_solution: If True, also return the solve_ivp solution
    :param forecast_days: Extra days to simulate beyond the observed data
    :return: The score, or ``(score, solution)`` when ``return_solution`` is True
    :raises IndexError: If ``data`` has no rows
    """
    R_0, t_hosp, t_crit, m, c, f, k, L = params
    n_days = len(data)
    n_infected = data['ConfirmedCases'].iloc[0]
    max_days = n_days + forecast_days

    # https://github.com/SwissTPH/openmalaria/wiki/ModelDecayFunctions
    # Hill decay: R(t) = R_0 / (1 + (t / L) ** k), i.e. R has halved at t = L
    def time_varying_reproduction(t):
        return R_0 / (1 + (t/L)**k)

    initial_state = [(population - n_infected) / population, 0, n_infected / population, 0, 0, 0, 0]
    args = (time_varying_reproduction, 5.6, 2.9, t_hosp, t_crit, m, c, f)

    sol = solve_ivp(SEIR_HCD_model, [0, max_days], initial_state, args=args, t_eval=np.arange(0, max_days))

    sus, exp, inf, rec, hosp, crit, deaths = sol.y

    # Keep only the modelled days that overlap the observed data. Without this, a
    # non-zero forecast_days would make the tail slices below compare forecast days
    # against the last observed days.
    y_pred_cases = (np.clip(inf + rec + hosp + crit + deaths, 0, np.inf) * population)[:n_days]
    y_true_cases = data['ConfirmedCases'].values
    y_pred_fat = (np.clip(deaths, 0, np.inf) * population)[:n_days]
    y_true_fat = data['Fatalities'].values

    optim_days = min(OPTIM_DAYS, n_days)  # Days to optimise for
    weights = 1 / np.arange(1, optim_days + 1)[::-1]  # Recent data is more heavily weighted

    # sample_weight is keyword-only in current scikit-learn releases
    msle_cases = mean_squared_log_error(y_true_cases[-optim_days:], y_pred_cases[-optim_days:],
                                        sample_weight=weights)
    msle_fat = mean_squared_log_error(y_true_fat[-optim_days:], y_pred_fat[-optim_days:],
                                      sample_weight=weights)
    msle_final = np.mean([msle_cases, msle_fat])

    if return_solution:
        return msle_final, sol
    return msle_final

In [None]:
def use_last_value(train_data, valid_data, test_data):
    """Forecast by carrying the last training values forward.

    Writes the last 'ConfirmedCases' / 'Fatalities' row of ``train_data`` into
    the global ``submission`` frame for every 'ForecastId' in ``test_data``.

    :param train_data: Training rows for one area
    :param valid_data: Validation rows for the same area, or None to skip scoring
    :param test_data: Test rows for the same area (must have a 'ForecastId' column)
    :return: Mean of the cases MSLE and fatalities MSLE on ``valid_data``, or
        None when ``valid_data`` is None
    """
    target_cols = ['ConfirmedCases', 'Fatalities']
    lv = train_data[target_cols].iloc[-1].values

    submission.loc[test_data['ForecastId'], target_cols] = lv

    if valid_data is None:
        return None

    # Constant prediction: one copy of the last values per validation row
    y_pred_valid = np.ones((len(valid_data), 2)) * lv.reshape(1, 2)
    y_true_valid = valid_data[target_cols]

    msle_cases = mean_squared_log_error(y_true_valid['ConfirmedCases'], y_pred_valid[:, 0])
    msle_fat = mean_squared_log_error(y_true_valid['Fatalities'], y_pred_valid[:, 1])
    return np.mean([msle_cases, msle_fat])

In [None]:
def plot_model_results(y_pred, train_data, valid_data=None):
    """Plot observed against modelled confirmed cases and fatalities.

    The left panel shows confirmed cases plus the reproduction number on a
    secondary y-axis; the right panel shows fatalities.

    :param y_pred: Frame with 'ConfirmedCases', 'Fatalities' and 'R' columns
    :param train_data: Observed training rows; its index selects the fitted part of ``y_pred``
    :param valid_data: Optional observed validation rows. When given, the forecast is
        drawn only over its index; otherwise the full ``y_pred`` series is drawn
    """
    cases_col = 'ConfirmedCases'
    deaths_col = 'Fatalities'

    fig, (cases_ax, deaths_ax) = plt.subplots(1, 2, figsize=(16,5))
    cases_ax.set_title('Confirmed Cases')
    deaths_ax.set_title('Fatalities')

    fitted_idx = train_data.index

    train_data[cases_col].plot(label='Confirmed Cases (train)', color='g', ax=cases_ax)
    y_pred.loc[fitted_idx, cases_col].plot(label='Modeled Cases', color='r', ax=cases_ax)

    # The reproduction number shares the cases panel but uses the right-hand axis
    r_ax = y_pred['R'].plot(label='Reproduction number', color='c', linestyle='-', secondary_y=True, ax=cases_ax)
    r_ax.set_ylabel("Reproduction number", fontsize=10, color='c')

    train_data[deaths_col].plot(label='Fatalities (train)', color='g', ax=deaths_ax)
    y_pred.loc[fitted_idx, deaths_col].plot(label='Modeled Fatalities', color='r', ax=deaths_ax)

    if valid_data is None:
        y_pred.loc[:, cases_col].plot(label='Modeled Cases (forecast)', color='r', linestyle=':', ax=cases_ax)
        y_pred.loc[:, deaths_col].plot(label='Modeled Fatalities (forecast)', color='r', linestyle=':', ax=deaths_ax)
    else:
        forecast_idx = valid_data.index
        valid_data[cases_col].plot(label='Confirmed Cases (valid)', color='g', linestyle=':', ax=cases_ax)
        valid_data[deaths_col].plot(label='Fatalities (valid)', color='g', linestyle=':', ax=deaths_ax)
        y_pred.loc[forecast_idx, cases_col].plot(label='Modeled Cases (forecast)', color='r', linestyle=':', ax=cases_ax)
        y_pred.loc[forecast_idx, deaths_col].plot(label='Modeled Fatalities (forecast)', color='r', linestyle=':', ax=deaths_ax)

    cases_ax.legend(loc='best')
    

The function below fits a SEIR-HCD model for each area, either using a constant R or a decayed R, whichever is better. If the total cases/1M pop is below 1, then the last value is used.

In [None]:
def fit_model_public(area_name, 
                     initial_guess=[3.6, 4, 14, 0.8, 0.1, 0.3, 2, 50],
                     bounds=((1, 20), # R bounds
                             (0.5, 10), (2, 20), # transition time param bounds (t_hosp, t_crit)
                             (0.5, 1), (0, 1), (0, 1), # fraction param bounds (m, c, f)
                             (1, 5), (1, 100)), # Hill decay param bounds (k, L)
                     make_plot=True):
    """Fit the SEIR-HCD model for one area, validate it and fill the public forecast.

    Both a constant-R model and a Hill-decayed-R model are optimised with
    L-BFGS-B on the training rows that have at least one confirmed case; the
    one with the lower objective is used. Areas with fewer than one case per
    million inhabitants fall back to ``use_last_value``. The forecast is written
    into the global ``submission`` frame for the area's public test rows.

    :param area_name: Key of the area in the (Area, Date)-indexed frames and in
        the population lookups
    :param initial_guess: Start values ``[R_0, t_hosp, t_crit, m, c, f, k, L]``;
        the constant-R model uses all but the last two
    :param bounds: One ``(low, high)`` pair per entry of ``initial_guess``
    :param make_plot: If True, print the fitted parameters and plot the result
    :return: Mean of the validation MSLE for confirmed cases and for fatalities
    :raises IndexError: If the area has no training rows with confirmed cases
    :raises KeyError: If the area is in neither population lookup
    """
    train_data = train.loc[area_name].query('ConfirmedCases > 0')
    valid_data = valid.loc[area_name]
    test_data = test_public.loc[area_name]

    # Provinces take precedence; fall back to the country table
    try:
        population = province_lookup[area_name]
    except KeyError:
        population = country_lookup[area_name]

    cases_per_million = train_data['ConfirmedCases'].max() * 10**6 / population

    # Callers treat IndexError as "no cases in train"; raise it explicitly instead of
    # relying on an otherwise unused .iloc[0] lookup to fail.
    if train_data.empty:
        raise IndexError(f'{area_name} has no confirmed cases in train')

    if cases_per_million < 1:
        return use_last_value(train_data, valid_data, test_data)

    res_const = minimize(eval_model_const, initial_guess[:-2], bounds=bounds[:-2],
                         args=(train_data, population, False),
                         method='L-BFGS-B')

    res_decay = minimize(eval_model_decay, initial_guess, bounds=bounds,
                         args=(train_data, population, False),
                         method='L-BFGS-B')

    dates_all = train_data.index.append(test_data.index)
    dates_val = train_data.index.append(valid_data.index)

    # If using a constant R number is better, use that model
    if res_const.fun < res_decay.fun:
        _, sol = eval_model_const(res_const.x, train_data, population, True, len(test_data))
        res = res_const
        R_t = pd.Series([res_const.x[0]] * len(dates_val), dates_val)
    else:
        _, sol = eval_model_decay(res_decay.x, train_data, population, True, len(test_data))
        res = res_decay

        # Calculate the R_t values from the fitted Hill decay (params 0, 6, 7 are R_0, k, L)
        t = np.arange(len(dates_val))
        R_0, k, L = res.x[0], res.x[6], res.x[7]
        R_t = pd.Series(R_0 / (1 + (t/L)**k), dates_val)

    sus, exp, inf, rec, hosp, crit, deaths = sol.y

    y_pred = pd.DataFrame({
        'ConfirmedCases': np.clip(inf + rec + hosp + crit + deaths, 0, np.inf) * population,
        'Fatalities': np.clip(deaths, 0, np.inf) * population,
        'R': R_t,
    }, index=dates_all)

    # Rows after the training period: first the validation overlap, then the whole test window
    y_pred_valid = y_pred.iloc[len(train_data): len(train_data)+len(valid_data)]
    y_pred_test = y_pred.iloc[len(train_data):]
    y_true_valid = valid_data[['ConfirmedCases', 'Fatalities']]

    valid_msle_cases = mean_squared_log_error(y_true_valid['ConfirmedCases'], y_pred_valid['ConfirmedCases'])
    valid_msle_fat = mean_squared_log_error(y_true_valid['Fatalities'], y_pred_valid['Fatalities'])
    valid_msle = np.mean([valid_msle_cases, valid_msle_fat])

    if make_plot:
        print(f'Validation MSLE: {valid_msle:0.5f}')
        print(f'R: {res.x[0]:0.3f}, t_hosp: {res.x[1]:0.3f}, t_crit: {res.x[2]:0.3f}, '
              f'm: {res.x[3]:0.3f}, c: {res.x[4]:0.3f}, f: {res.x[5]:0.3f}')
        plot_model_results(y_pred, train_data, valid_data)

    # Put the forecast in the submission
    forecast_ids = test_data['ForecastId']
    submission.loc[forecast_ids, ['ConfirmedCases', 'Fatalities']] = y_pred_test[['ConfirmedCases', 'Fatalities']].values

    return valid_msle
            

In [None]:
# Fit a model on the full dataset (i.e. no validation)
def fit_model_private(area_name, 
                      initial_guess=[3.6, 4, 14, 0.8, 0.1, 0.3, 2, 50],
                      bounds=((1, 20), # R bounds
                              (0.5, 10), (2, 20), # transition time param bounds (t_hosp, t_crit)
                              (0.5, 1), (0, 1), (0, 1), # fraction param bounds (m, c, f)
                              (1, 5), (1, 100)), # Hill decay param bounds (k, L)
                      make_plot=True):
    """Fit the SEIR-HCD model for one area on all training data and fill the private forecast.

    Both a constant-R model and a Hill-decayed-R model are optimised with
    L-BFGS-B on the ``train_full`` rows that have at least one confirmed case;
    the one with the lower objective is used. Areas with fewer than one case
    per million inhabitants fall back to ``use_last_value``. The forecast is
    written into the global ``submission`` frame for the area's private test rows.

    :param area_name: Key of the area in the (Area, Date)-indexed frames and in
        the population lookups
    :param initial_guess: Start values ``[R_0, t_hosp, t_crit, m, c, f, k, L]``;
        the constant-R model uses all but the last two
    :param bounds: One ``(low, high)`` pair per entry of ``initial_guess``
    :param make_plot: If True, print the fitted parameters and plot the result
    :return: None
    :raises IndexError: If the area has no training rows with confirmed cases
    :raises KeyError: If the area is in neither population lookup
    """
    train_data = train_full.loc[area_name].query('ConfirmedCases > 0')
    test_data = test_private.loc[area_name]

    # Provinces take precedence; fall back to the country table
    try:
        population = province_lookup[area_name]
    except KeyError:
        population = country_lookup[area_name]

    cases_per_million = train_data['ConfirmedCases'].max() * 10**6 / population

    # Callers treat IndexError as "no cases in train"; raise it explicitly instead of
    # relying on an otherwise unused .iloc[0] lookup to fail.
    if train_data.empty:
        raise IndexError(f'{area_name} has no confirmed cases in train')

    if cases_per_million < 1:
        return use_last_value(train_data, None, test_data)

    res_const = minimize(eval_model_const, initial_guess[:-2], bounds=bounds[:-2],
                         args=(train_data, population, False),
                         method='L-BFGS-B')

    res_decay = minimize(eval_model_decay, initial_guess, bounds=bounds,
                         args=(train_data, population, False),
                         method='L-BFGS-B')

    dates_all = train_data.index.append(test_data.index)

    # If using a constant R number is better, use that model
    if res_const.fun < res_decay.fun:
        _, sol = eval_model_const(res_const.x, train_data, population, True, len(test_data))
        res = res_const
        R_t = pd.Series([res_const.x[0]] * len(dates_all), dates_all)
    else:
        _, sol = eval_model_decay(res_decay.x, train_data, population, True, len(test_data))
        res = res_decay

        # Calculate the R_t values from the fitted Hill decay (params 0, 6, 7 are R_0, k, L)
        t = np.arange(len(dates_all))
        R_0, k, L = res.x[0], res.x[6], res.x[7]
        R_t = pd.Series(R_0 / (1 + (t/L)**k), dates_all)

    sus, exp, inf, rec, hosp, crit, deaths = sol.y

    y_pred = pd.DataFrame({
        'ConfirmedCases': np.clip(inf + rec + hosp + crit + deaths, 0, np.inf) * population,
        'Fatalities': np.clip(deaths, 0, np.inf) * population,
        'R': R_t,
    }, index=dates_all)

    # Everything after the training period is the forecast
    y_pred_test = y_pred.iloc[len(train_data):]

    if make_plot:
        print(f'R: {res.x[0]:0.3f}, t_hosp: {res.x[1]:0.3f}, t_crit: {res.x[2]:0.3f}, '
              f'm: {res.x[3]:0.3f}, c: {res.x[4]:0.3f}, f: {res.x[5]:0.3f}')
        plot_model_results(y_pred, train_data)

    # Put the forecast in the submission
    forecast_ids = test_data['ForecastId']
    submission.loc[forecast_ids, ['ConfirmedCases', 'Fatalities']] = y_pred_test[['ConfirmedCases', 'Fatalities']].values
            

In [None]:
# Fit, validate and plot the model for Italy (make_plot defaults to True)
fit_model_public('Italy')
# fit_model_private('Italy')

Above you can see the model optimized on the last `OPTIM_DAYS` days of data from Italy which has been weighted to put more importance on recent data

The numbers show that `R_0` was around 3.5 and a decayed value is a better fit to the data. This is good news for Italy as it shows its measures are working.

Let's try Iran, South Korea and Japan.

In [None]:
# Fit, validate and plot the model for Iran
fit_model_public('Iran')
# fit_model_private('Iran')

An unusual feature of the Iranian data is how high the R value needs to be for the model to fit. Iran was criticised for not closing religious shrines and for not locking down earlier, so the virus was extremely contagious. The data also appears to have multiple inflection points.

In [None]:
# Fit, validate and plot the model for South Korea (named 'Korea, South' in the data)
fit_model_public('Korea, South')
# fit_model_private('Korea, South')

South Korea is unusual due to the incredible efforts to stop the spread

In [None]:
# Fit, validate and plot the model for Japan
fit_model_public('Japan')
# fit_model_private('Japan')

In [None]:
# Fit, validate and plot the model for Hubei (a province, so it is looked up by its state name)
fit_model_public('Hubei')
# fit_model_private('Hubei')

The plot for Hubei looks really good. In fact the data for most of the Chinese provinces fit well to compartment models

In [None]:
# Fit, validate and plot the model for the United Kingdom
fit_model_public('United Kingdom')
# fit_model_private('United Kingdom')

In [None]:
# Fit, validate and plot the model for France
fit_model_public('France')
# fit_model_private('France')

Things are not looking good for France either. Let's hope the lockdown starts bringing the R number down soon.

# Calculate for all countries

In [None]:
# Public Leaderboard: fit every area in the public test set and collect its validation MSLE
validation_scores = []

for c in tqdm(test_public.index.levels[0].values):
    try:
        score = fit_model_public(c, make_plot=False)
        validation_scores.append({'Country': c, 'MSLE': score})
        print(f'{score:0.5f} {c}')
    except IndexError as e:
        # e.g. the area has no training rows with ConfirmedCases > 0
        print(c, 'has no cases in train')
    except ValueError as e:
        # Report the problem and carry on with the next area
        print(c, e)

validation_scores = pd.DataFrame(validation_scores)
# The reported score is the square root of the mean per-area MSLE
print(f'Mean validation score: {np.sqrt(validation_scores["MSLE"].mean()):0.5f}')

In [None]:
# Find which areas are not being predicted well: the 20 highest validation MSLEs, worst first
validation_scores.sort_values(by=['MSLE'], ascending=False).head(20)

In [None]:
# Private Leaderboard
# for c in tqdm(test_private.index.levels[0].values):
#     try:
#         score = fit_model_private(c, make_plot=False)
#     except IndexError as e:
#         print(c, 'has no cases in train')

In [None]:
# Round the forecasts and write the submission file (ForecastId is the index)
submission.round().to_csv('submission.csv')

In [None]:
# submission.join(test.set_index('ForecastId')).query(f'Date > "{DATE_BORDER}"').round().to_csv('forecast.csv')

# Todo/ideas

* Mix in other sources of data (e.g. number of hospital beds, age demographics)
* Global optimisation of virus specific parameters
* Use as features into a different model