# SEIR & HCD with age distribution Model for COVID19 Global forecast
SEIR MODEL Reference:

Many thanks to @datasaurus for his great kernel: https://www.kaggle.com/anjum48/seir-model-with-intervention

Based on his kernel, we add age distribution to the model for more accurate estimation. Also, we add stringency score to reflect how the reproduction rate will change.

In [None]:
import json
from datetime import timedelta

from time import time 

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#from scipy.integrate import solve_ivp # Solve an initial value problem for a system of ODEs.
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_log_error, mean_absolute_error #MSLE for kaggle, MAE for ZIND

from IPython.display import Image
from IPython.core.display import HTML

from pprint import pprint

from tqdm import notebook
# Register tqdm with pandas (enables `progress_apply` and friends).
# `pandas` is a hook on the tqdm class and has to be *called*; the previous
# `notebook.tqdm().pandas` only created an empty progress bar and looked the
# attribute up without invoking it.
notebook.tqdm.pandas()

idx = pd.IndexSlice

In [None]:
# Folder holding the week-4 competition files.
path = '/kaggle/input/covid19-global-forecasting-week-4/'

# Read the three competition tables with identical reader options.
train_df, test_df, sub_df = (
    pd.read_csv(path + file_name, parse_dates=True)
    for file_name in ("train.csv", "test.csv", "submission.csv")
)

In [None]:
train_df.Date = pd.to_datetime(train_df.Date)
test_df.Date = pd.to_datetime(test_df.Date)

In [None]:
train_df.head(2)

In [None]:
test_df.head(2)

In [None]:
sub_df.head(2)

# Transform and Load Data

In [None]:
countries = train_df.Country_Region.unique()
len(countries)

In [None]:
provinces = train_df.Province_State.unique()
len(provinces)

In [None]:
# Number of distinct, non-missing provinces reported for each country;
# the second line shows only the countries that are split into provinces.
num_province_on_country = (
    train_df
    .groupby("Country_Region")["Province_State"]
    .apply(lambda provinces: len(set(provinces.dropna())))
)
num_province_on_country[num_province_on_country > 0]

In [None]:
train_begin_date = train_df.Date.min()
train_end_date = train_df.Date.max()
test_begin_date = test_df.Date.min()
test_end_date = test_df.Date.max()

In [None]:
train_begin_date

In [None]:
# https://www.kaggle.com/anjum48/seir-hcd-model#Fitting-the-model-to-data
# Last date that belongs to the "public" part of the test set
# (used further down to split test_df into public and private parts).
DATE_BORDER = '2020-04-15'

# Load the population data into lookup dicts
pop_info = pd.read_csv('/kaggle/input/covid19-population-data/population_data.csv')
# The 'Type' column distinguishes country-level rows from province-level rows.
country_pop = pop_info.query('Type == "Country/Region"')
province_pop = pop_info.query('Type == "Province/State"')
# name -> population, one dict per level
country_lookup = dict(zip(country_pop['Name'], country_pop['Population']))
province_lookup = dict(zip(province_pop['Name'], province_pop['Population']))

# Fix the Georgia state/country name clash: the US state is renamed to
# 'Georgia (State)' so that the plain name 'Georgia' only refers to the country.
train_df['Province_State'] = train_df['Province_State'].replace('Georgia', 'Georgia (State)')
test_df['Province_State'] = test_df['Province_State'].replace('Georgia', 'Georgia (State)')
# Move (rather than copy) the state's population to the new key. If the old
# 'Georgia' key stayed in province_lookup, any lookup that checks provinces
# before countries would return the US state's population for the country.
province_lookup['Georgia (State)'] = province_lookup.pop('Georgia')

# Every row gets an 'Area' label: the province when present, otherwise the country.
for frame in (train_df, test_df):
    frame['Area'] = frame['Province_State'].fillna(frame['Country_Region'])

# https://www.kaggle.com/c/covid19-global-forecasting-week-1/discussion/139172
# Make both cumulative targets non-decreasing within each area.
for target in ('ConfirmedCases', 'Fatalities'):
    train_df[target] = train_df.groupby('Area')[target].cummax()

# Remove the leaking data: training rows dated on or after the first test
# date are moved to a validation frame; train_full_df keeps everything.
train_full_df = train_df.copy()
first_test_date = test_df['Date'].min()
valid_df = train_df[train_df['Date'] >= first_test_date].copy()
train_df = train_df[train_df['Date'] < first_test_date].copy()

# Split the test into public & private
is_public = test_df['Date'] <= DATE_BORDER
is_private = test_df['Date'] > DATE_BORDER
test_public_df = test_df[is_public].copy()
test_private_df = test_df[is_private].copy()

# use area country dictionary for later match statistics
area_country_dict = dict(zip(train_full_df["Area"], train_full_df["Country_Region"]))

# Use a multi-index for easier slicing
split_frames = (train_full_df, train_df, valid_df, test_public_df, test_private_df)
for frame in split_frames:
    frame.set_index(['Area', 'Date'], inplace=True)

# Reset both submission targets to zero.
sub_df['ConfirmedCases'] = 0
sub_df['Fatalities'] = 0

# Shapes of all frames, in the order: full train, train, valid, public test, private test, submission.
tuple(frame.shape for frame in split_frames) + (sub_df.shape,)

In [None]:
# Sort every split in place by both index levels (Area, then Date).
for frame in (train_full_df, train_df, valid_df, test_public_df, test_private_df):
    frame.sort_index(level=[0, 1], inplace=True)
train_full_df.head(2)

In [None]:
len(area_country_dict)

# Check Area with non-zero case in the beginning

In [None]:
# Rows of the very first training date, restricted to the areas whose
# confirmed-case count is already non-zero on that date.
first_day_df = train_full_df.loc[idx[:, train_begin_date], :]
non_zero_regions_df = first_day_df[first_day_df["ConfirmedCases"] != 0].copy()
non_zero_regions_df

In [None]:
# we pick the area of China out and assume the ConfirmedCases and Death will not change a lot in the future
# province -> country (a province name seen in several rows keeps its last country)
province_country_dict = dict(zip(train_full_df["Province_State"], train_full_df["Country_Region"]))
china_area = [
    province
    for province, country in province_country_dict.items()
    if country == "China"
]
len(china_area)

In [None]:
china_area = china_area + ["Taiwan*"]
len(china_area)

In [None]:
# check plots
#fig,axs = plt.subplots(33,1,figsize = (8,33*6))
#count = 0
#for area in china_area:
#    train_full_df.loc[idx[area,:],"ConfirmedCases"].plot(ax=axs[count])
#    axs[count].set_title(area)
#    count += 1

# Country Specific Feature
- Population
- Age Distribution
- Stringency Score
- World Development Index Data

## Create area_feature_df

In [None]:
!pip install pycountry_convert

In [None]:
import pycountry
import pycountry_convert as pc
from pycountry_convert import country_name_to_country_alpha2 as name_to_code2
from pycountry_convert import country_alpha2_to_continent_code as code2_to_cont
from pycountry_convert import country_alpha2_to_country_name as code2_to_country
from pycountry_convert import country_name_to_country_alpha3 as country_to_code3

In [None]:
all_countries = list(set(area_country_dict.values()))
len(all_countries)

In [None]:
# Official country names known to pycountry, and the alpha-2 code of each.
all_country_official = [country.name for country in pycountry.countries]
all_code_official = [name_to_code2(name) for name in all_country_official]
# Kaggle country names without an identically spelled official name
# (mapped to themselves for now).
no_match_dict = {
    name: name for name in all_countries if name not in all_country_official
}
len(no_match_dict)

In [None]:
no_match_l = list(no_match_dict.keys())
no_match_l

In [None]:
# mannually fix some
mannual_dict= {
    "Cote d'Ivoire":"Côte d'Ivoire",
    "Venezuela":"Venezuela, Bolivarian Republic of",
    'Congo (Brazzaville)': 'Congo',
    'West Bank and Gaza':"Palestine, State of", # no perfect match
    'Taiwan*':"Taiwan, Province of China",
    'Russia':'Russian Federation',
    'Iran':'Iran, Islamic Republic of',
    'Korea, South':"Korea, Republic of",
    'Syria':'Syrian Arab Republic',
    'Kosovo':"Kosovo", # Not Sure
    'Diamond Princess':"NoNeedToModel_1", # a ship
    'Burma': 'Myanmar',
    'US':"United States",
    'Holy See': 'Holy See (Vatican City State)',
    'Moldova':"Moldova, Republic of",
    'Vietnam':"Viet Nam",
    'Congo (Kinshasa)': 'Congo, The Democratic Republic of the',
     'Laos':"Lao People's Democratic Republic",
     'Brunei':"Brunei Darussalam",
     'Tanzania':"Tanzania, United Republic of",
     'MS Zaandam':"NoNeedToModel_2",#a ship
    "Bolivia":"Bolivia, Plurinational State of",
}
len(mannual_dict)

In [None]:
# `in` applied to a Series tests the index labels, not the stored values,
# so the membership test has to run against the underlying values.
"Palestine, State of" in train_full_df["Country_Region"].values

In [None]:
train_full_df[train_full_df["Country_Region"]=="Kosovo"]

In [None]:
# check tool
keyword  = 'Moro'
for c in all_country_official:
    if keyword in c:
        print(c)

In [None]:
kname_to_oname_dict = {}
kname_to_oname_dict.update(mannual_dict)
len(kname_to_oname_dict)

In [None]:
kname_to_ocode_dict = {}
# Countries without a manual override keep their own name as official name.
for country in all_countries:
    kname_to_oname_dict.setdefault(country, country)

# Kaggle name -> alpha-2 code. Names unknown to pycountry keep the
# (placeholder) official name as their code, except Kosovo which gets "XK".
for kaggle_name, official_name in kname_to_oname_dict.items():
    if official_name in all_country_official:
        kname_to_ocode_dict[kaggle_name] = name_to_code2(official_name)
    elif official_name == "Kosovo":
        kname_to_ocode_dict[kaggle_name] = "XK"
    else:
        kname_to_ocode_dict[kaggle_name] = official_name
kname_to_ocode_dict

In [None]:
name_transfer_back_dict = {v:k for k,v in kname_to_oname_dict.items()}
code_transfer_back_dict = {v:k for k,v in kname_to_ocode_dict.items()}

In [None]:
new_countries_l = [kname_to_oname_dict[c] for c in kname_to_oname_dict]

In [None]:
# One row per area with its Kaggle country name, official name and ISO codes.
area_feature_df = pd.DataFrame(
    list(area_country_dict.items()),
    columns=["area", "country_region"],
)
area_feature_df["official_name"] = area_feature_df["country_region"].apply(
    lambda kaggle_name: kname_to_oname_dict[kaggle_name]
)
area_feature_df["code2"] = area_feature_df["country_region"].apply(
    lambda kaggle_name: kname_to_ocode_dict[kaggle_name]
)
# Names unknown to pycountry (ships, Kosovo) keep the name itself as code3.
area_feature_df["code3"] = area_feature_df["official_name"].apply(
    lambda official: country_to_code3(official) if official in all_country_official else official
)
area_feature_df.set_index("official_name", drop=False, inplace=True)
area_feature_df.head(3)

In [None]:
def code2_to_cont_specific(code2):
    """Return the continent code for an alpha-2 country code.

    Codes that are not in ``all_code_official`` yield "EU" for Kosovo ("XK")
    and "NotKnow" otherwise. A few official codes are answered from a manual
    table; every other code is delegated to ``code2_to_cont``.
    """
    if code2 not in all_code_official:
        # Kosovo has no official code but is placed in Europe.
        return "EU" if code2 == "XK" else "NotKnow"

    manual_continent = {
        "VA": "EU",  # Holy See (Vatican City State)
        "TL": "AS",  # Timor-Leste
        "EH": "AF",  # Western Sahara
    }
    if code2 in manual_continent:
        return manual_continent[code2]
    return code2_to_cont(code2)
area_feature_df["continent"] = area_feature_df.code2.apply(lambda x : code2_to_cont_specific(x))

In [None]:
"XK" in all_code_official

In [None]:
code2_to_cont("XK")

In [None]:
area_feature_df[area_feature_df["continent"] == "NotKnow"]

In [None]:
area_feature_df[area_feature_df.code2 =="VA"]

In [None]:
area_feature_df.shape

In [None]:
len(area_feature_df.code3.unique())

## add demographic data

In [None]:
def find_latest(row):
    """Return the most recent non-missing value among the ``rep_cols`` of ``row``.

    ``rep_cols`` is a module-level list of year columns, oldest first; it is
    scanned from the newest column backwards.

    :param row: one row of the demographic table (a pandas Series)
    :return: the first non-missing value found, or NaN when every column is missing
    """
    for col in reversed(rep_cols):
        value = row.loc[col]
        # `value is not np.nan` is an identity test and is True for NaN floats
        # that are not the np.nan singleton, so use a real missing-value check.
        if pd.notna(value):
            return value
    return np.nan
# Build pop_df: one row per country code with the latest available share
# (as a fraction) of each age group.
demo = pd.read_csv("/kaggle/input/global-population-estimates/data.csv")
demo.dropna(thresh=93,inplace=True) # keep only rows with at least 93 non-missing values (drops the almost-empty rows)
# NOTE(review): str.contains treats "TO.ZS" as a regular expression, so "." matches any character.
demo_pct_df = demo[demo["Series Code"].str.contains("TO.ZS")].copy() # keep the series whose code contains "TO.ZS" (presumably the percentage series)
basic_cols = ['Country Name', 'Country Code', 'Series Name', 'Series Code'] + ['2015 [YR2015]',
               '2016 [YR2016]', '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]',
               '2020 [YR2020]']
demo_pct_df = demo_pct_df[basic_cols].copy() # identifying columns plus the 2015-2020 year columns
# Series code -> short age-group name used as column name after the pivot.
age_map_dict= {"SP.POP.0014.TO.ZS": "young",
 "SP.POP.1564.TO.ZS":"middle",
 "SP.POP.65UP.TO.ZS":"old"}
demo_pct_df["Series Code"] = demo_pct_df["Series Code"].apply(lambda x: age_map_dict[x] if x in age_map_dict else x) # rename known codes, leave the others untouched
# Year columns scanned by find_latest, oldest first (find_latest reads this global).
rep_cols = ['2015 [YR2015]','2016 [YR2016]', '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]','2020 [YR2020]']
demo_pct_df['latest_pct'] = demo_pct_df.apply(axis = 1, func = lambda row : find_latest(row)) # latest available estimate per row
pop_df = demo_pct_df.pivot_table(index = "Country Code",columns = "Series Code",values = "latest_pct") /100  # country x series table, divided by 100 to turn percentages into fractions
pop_df.head()

In [None]:
area_feature_df = area_feature_df.merge(pop_df,how = "left",left_on = "code3",right_index = True)

In [None]:
area_feature_df.isna().sum(axis = 1).sort_values(ascending=False)[:17]
# 16 region without age distribution data

In [None]:
# more accurate population for country level
pop_worldometer_df = pd.read_csv("/kaggle/input/population-by-country-2020/population_by_country_2020.csv")
country_s = pop_worldometer_df["Country (or dependency)"]
pop_worldometer_df.head()

In [None]:
# Worldometer names that are not spelled like an official pycountry name,
# initially mapped to themselves.
c_dicts = {c: c for c in country_s if c not in all_country_official}

# Try to resolve each of them by substring search in the official names.
for c1 in c_dicts:
    # Collect the *official* names containing c1. (The previous version
    # collected c1 itself, so a unique match never changed the mapping.)
    l_find = [c2 for c2 in all_country_official if c1 in c2]
    if len(l_find) == 1:
        c_dicts[c1] = l_find[0]
    else:
        # No match or an ambiguous match: print the name for manual fixing.
        print(c1)

In [None]:
# mannually fix some
mannual_dict= {
    "Vietnam":"Viet Nam",
    "DR Congo":'Congo, The Democratic Republic of the',
    "South Korea":"Korea, Republic of",
    "North Korea":"Korea, Democratic People's Republic of",
    "Czech Republic (Czechia)":"Czechia",
    "Laos":"Lao People's Democratic Republic",
    "State of Palestine":"Palestine, State of",
    "Sao Tome & Principe":"Sao Tome and Principe",
    "St. Vincent & Grenadines":"Saint Vincent and the Grenadines",
    "U.S. Virgin Islands":"Virgin Islands, U.S.",
    "Saint Kitts & Nevis":"Saint Kitts and Nevis",
    "Faeroe Islands":"Faroe Islands",
    "British Virgin Islands":"Virgin Islands, British",
    "Wallis & Futuna":"Wallis and Futuna",
    "Saint Barthelemy":"Saint Barthélemy",
    "Saint Pierre & Miquelon":"Saint Pierre and Miquelon",
}
c_dicts.update(mannual_dict)

In [None]:
# check tool
keyword  = 'Saint'
for c in all_country_official:
    if keyword in c:
        print(c)

In [None]:
# Translate the worldometer country names to official names, attach the
# alpha-3 code (NaN when the name is still unknown) and merge the selected
# worldometer columns into area_feature_df under wm_* names.
pop_worldometer_df["Country (or dependency)"] = country_s.replace(c_dicts)
pop_worldometer_df["code3"] = pop_worldometer_df["Country (or dependency)"].apply(
    lambda name: country_to_code3(name) if name in all_country_official else np.nan
)
wm_source_cols = ["code3", "Population (2020)", "Density (P/Km²)", "Med. Age", "Urban Pop %"]
wm_target_cols = ["code3", "wm_pop", "wm_density", "wm_med", "wm_urban_pct"]
pop_w_combine = pop_worldometer_df[wm_source_cols].copy()
pop_w_combine.columns = wm_target_cols
area_feature_df = area_feature_df.merge(pop_w_combine, on="code3", how="left")

In [None]:
area_feature_df[area_feature_df["wm_pop"] == ".N.A"]

In [None]:
def getAreaPop(area):
    """Return the population of ``area``.

    Sources are tried in this order: ``province_lookup``, the ``wm_pop``
    column of ``area_feature_df`` (row label ``area``), ``country_lookup``.
    NaN is returned when none of them has a value.
    """
    if area in province_lookup:
        return province_lookup[area]

    if not np.isnan(area_feature_df.loc[area, "wm_pop"]):
        return area_feature_df.loc[area, "wm_pop"]

    if area in country_lookup:
        return country_lookup[area]
    return np.nan
area_feature_df.set_index("area",drop=False,inplace=True)
area_feature_df["pop"] = area_feature_df.area.apply(getAreaPop)
area_feature_df[area_feature_df["pop"].isna()] # all data is good

## Add Stringency Score
- Government response and lockdown data
- Government Response Data from Oxford Data: 
    - https://www.bsg.ox.ac.uk/research/research-projects/oxford-covid-19-government-response-tracker
    - white paper: https://www.bsg.ox.ac.uk/sites/default/files/2020-03/BSG-WP-2020-031-v3.0.pdf

In [None]:
train_begin_date_str = train_begin_date.strftime("%Y-%m-%d")
train_end_date_str = train_end_date.strftime("%Y-%m-%d")
train_begin_date_str

In [None]:
import json
import urllib.request

# use the stringency score as the government response feature
link = f"https://covidtrackerapi.bsg.ox.ac.uk/api/stringency/date-range/{train_begin_date_str}/{train_end_date_str}"
with urllib.request.urlopen(link) as url:
    data = json.loads(url.read().decode())
data_Oxford_df = pd.DataFrame(data["data"])

# Each populated cell is a dict of measurements; replace it by its
# "stringency_actual" entry. Dicts that do not have exactly 6 entries are
# printed for inspection before being replaced.
n_rows, n_cols = data_Oxford_df.shape
for i in range(n_rows):
    for j in range(n_cols):
        cell = data_Oxford_df.iloc[i, j]
        if not isinstance(cell, dict):
            continue
        if len(cell) != 6:
            print(cell)
        data_Oxford_df.iloc[i, j] = cell["stringency_actual"]

In [None]:
data_Oxford_df.columns = ["Stringency_" + col for col in data_Oxford_df.columns]
# A missing score is replaced by the score of the previous column of the same
# row. `fillna(method="ffill")` is deprecated in recent pandas releases;
# `ffill` is the equivalent supported call.
data_Oxford_df.ffill(axis=1, inplace=True)
# Whatever is still missing (no earlier score in the row) is treated as 0.
data_Oxford_df.fillna(0, inplace=True)

In [None]:
data_Oxford_df

In [None]:
len(set(area_feature_df.code3) -  set(data_Oxford_df.index)) # missing 64 coutries

In [None]:
code3_no_stringency = set(area_feature_df.code3) -  set(data_Oxford_df.index)
code3_no_stringency

In [None]:
area_feature_df = area_feature_df.merge(data_Oxford_df,right_index = True,left_on = "code3",how = "left")

## Add World Development Index Data

In [None]:
WDICountry_df = pd.read_csv("/kaggle/input/world-development-indicators/wdi-csv-zip-57-mb-/WDICountry.csv")
WDICountry_df.columns

In [None]:
WDICountry_cols = ['Country Code', 'Region', 'Income Group']
WDICountry_selected_df = WDICountry_df[WDICountry_cols].copy()
WDICountry_selected_df.head(3)

In [None]:
# add  World Bank Open Data: https://www.kaggle.com/theworldbank/world-development-indicators#WDISeries.csv
WDI = pd.read_csv("/kaggle/input/world-development-indicators/wdi-csv-zip-57-mb-/WDIData.csv")
WDI.head(2)

In [None]:
WDI.shape

In [None]:
selected_features_dict = {
    'Access to electricity (% of population)': 'EG.ELC.ACCS.ZS',
    'Adjusted net national income per capita (current US$)': 'NY.ADJ.NNTY.PC.CD',
    'Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)': 'SH.DTH.COMM.ZS',
    'Consumer price index (2010 = 100)': 'FP.CPI.TOTL',
    'Coverage of social insurance programs (% of population)': 'per_si_allsi.cov_pop_tot',
    'Current health expenditure (% of GDP)': 'SH.XPD.CHEX.GD.ZS',
    'Current health expenditure per capita (current US$)': 'SH.XPD.CHEX.PC.CD',
    'Current health expenditure per capita, PPP (current international $)': 'SH.XPD.CHEX.PP.CD',
    'Domestic general government health expenditure (% of current health expenditure)': 'SH.XPD.GHED.CH.ZS',
    'Domestic general government health expenditure (% of GDP)': 'SH.XPD.GHED.GD.ZS',
    'Domestic general government health expenditure (% of general government expenditure)': 'SH.XPD.GHED.GE.ZS',
    'Domestic general government health expenditure per capita (current US$)': 'SH.XPD.GHED.PC.CD',
    'Domestic general government health expenditure per capita, PPP (current international $)': 'SH.XPD.GHED.PP.CD',
    'Domestic private health expenditure (% of current health expenditure)': 'SH.XPD.PVTD.CH.ZS',
    'Domestic private health expenditure per capita (current US$)': 'SH.XPD.PVTD.PC.CD',
    'Domestic private health expenditure per capita, PPP (current international $)': 'SH.XPD.PVTD.PP.CD',
    'GDP growth (annual %)': 'NY.GDP.MKTP.KD.ZG',
    'GDP per capita (current US$)': 'NY.GDP.PCAP.CD',
    'GDP per capita growth (annual %)': 'NY.GDP.PCAP.KD.ZG',
    'GNI (current US$)': 'NY.GNP.MKTP.CD',
    'GNI growth (annual %)': 'NY.GNP.MKTP.KD.ZG',
    'GNI per capita (current LCU)': 'NY.GNP.PCAP.CN',
    'GNI per capita growth (annual %)': 'NY.GNP.PCAP.KD.ZG',
    'Life expectancy at birth, total (years)': 'SP.DYN.LE00.IN',
    'Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (per 100,000 population)': 'SH.STA.WASH.P5',
    'People using at least basic drinking water services (% of population)': 'SH.H2O.BASW.ZS',
    'People using at least basic sanitation services (% of population)': 'SH.STA.BASS.ZS',
    'People using safely managed drinking water services (% of population)': 'SH.H2O.SMDW.ZS',
    'People using safely managed sanitation services (% of population)': 'SH.STA.SMSS.ZS',
    'People with basic handwashing facilities including soap and water (% of population)': 'SH.STA.HYGN.ZS',
    'People with basic handwashing facilities including soap and water, rural (% of rural population)': 'SH.STA.HYGN.RU.ZS',
    'People with basic handwashing facilities including soap and water, urban (% of urban population)': 'SH.STA.HYGN.UR.ZS',
    'Population density (people per sq. km of land area)': 'EN.POP.DNST',
    'Population, total': 'SP.POP.TOTL',
    'Prevalence of undernourishment (% of population)': 'SN.ITK.DEFC.ZS',
}
len(selected_features_dict)

In [None]:
# delete some highly correlated variables (from later correlation plot)
del_hi_corr_var_l = [
    # --- 
    'SH.XPD.GHED.GD.ZS', # highly correlated with SH.XPD.PVTD.PP.CD	
    'SH.XPD.GHED.PC.CD',
    'SH.XPD.PVTD.PC.CD',
    'SH.XPD.CHEX.PC.CD',
    'SH.XPD.CHEX.GD.ZS',
    'SH.XPD.GHED.PP.CD',
    'SH.XPD.CHEX.PP.CD',
    'SH.XPD.PVTD.PP.CD',
    #----
    'NY.GNP.MKTP.CD',   # highly correlated with NY.GDP.MKTP.KD.ZG
    'NY.GNP.MKTP.KD.ZG', # highly correlated with NY.GDP.PCAP.CD
    'NY.GNP.PCAP.CN', 
    'NY.GNP.PCAP.KD.ZG',
    # ---- 
    "SP.DYN.LE00.IN", # highly correlated with SH.STA.BASS.ZS
    "SH.H2O.BASW.ZS",
    
    # -----
    'NY.GDP.MKTP.KD.ZG', # highly correlated with  'NY.GNP.MKTP.KD.ZG': 'GNI growth (annual %)',
    'NY.GNP.PCAP.KD.ZG',
    'NY.GDP.PCAP.KD.ZG',
    
    # -----
    'NY.GDP.PCAP.CD', # highly correlated with 'NY.ADJ.NNTY.PC.CD': 'Adjusted net national income per capita (current US$)',
    'NY.GNP.MKTP.CD',
    'NY.GNP.PCAP.CN',
]

In [None]:
needed_WDI_feature = list(set(selected_features_dict.values()) - set(del_hi_corr_var_l))
len(needed_WDI_feature)

In [None]:
code3_needed = list(area_feature_df["code3"].values)
WDI_selected_df = WDI[(WDI["Country Code"].isin(code3_needed))&(WDI["Indicator Code"].isin(needed_WDI_feature))].copy()
WDI_selected_df.columns

In [None]:
WDI_selected_df.head(2)

In [None]:
# Year columns of the WDI table as strings, oldest first: '1960' ... '2018'.
cols_to_fill = [str(year) for year in range(1960, 2019)]


def find_latest(row):
    """Return the newest non-NaN value among the ``cols_to_fill`` entries of ``row``.

    The year columns are scanned from the most recent one backwards; NaN is
    returned when every year is missing.
    """
    for year in reversed(cols_to_fill):
        value = row.loc[year]
        if not np.isnan(value):
            return value
    return np.nan
WDI_selected_df["val"] = WDI_selected_df.apply(axis = 1,func = lambda row: find_latest(row))

In [None]:
WDI_selected_pivot_df = WDI_selected_df.pivot_table(index= "Country Code",columns = "Indicator Code",values = "val")
WDI_selected_pivot_df.head()

In [None]:
WDI_selected_pivot_df.shape

In [None]:
'''
    Drop the following cols:
    'People with basic handwashing facilities including soap and water (% of population)': 'SH.STA.HYGN.ZS',
    'People with basic handwashing facilities including soap and water, rural (% of rural population)': 'SH.STA.HYGN.RU.ZS',
    'People with basic handwashing facilities including soap and water, urban (% of urban population)': 'SH.STA.HYGN.UR.ZS',
    'People using safely managed sanitation services (% of population)': 'SH.STA.SMSS.ZS',
    'People using safely managed drinking water services (% of population)': 'SH.H2O.SMDW.ZS',
    'Coverage of social insurance programs (% of population)': 'per_si_allsi.cov_pop_tot', 80
'''
WDI_selected_pivot_df.isna().sum().sort_values(ascending =False)

In [None]:
WDI_selected_pivot_df.dropna(axis = 1, thresh = 100 ,inplace=True)
WDI_selected_pivot_df.isna().sum().sort_values(ascending =False)

In [None]:
WDI_selected_pivot_df

In [None]:
WDI_data_selected = WDICountry_selected_df.merge(WDI_selected_pivot_df,how="right",left_on="Country Code",right_index = True)
WDI_data_selected.head(2)

In [None]:
len(set(code3_needed) - set(WDI_data_selected["Country Code"])) # nice only 6 we don't have, which actually is 3!

In [None]:
# combine with country_feature_df
area_feature_df = area_feature_df.merge(WDI_data_selected,how="left",left_on="code3",right_on = "Country Code")
area_feature_df.head(2)

In [None]:
area_feature_df.shape

In [None]:
area_feature_df.set_index("area",inplace=True, drop = False)

In [None]:
area_feature_df.isna().sum(axis = 1).sort_values(ascending = False)[:20]

In [None]:
area_feature_df.isna().sum()

## Fill Some Missing Value of area_feature_df
1. Income Group
2. Demographic ( no need to)
3. Stringency Score

In [None]:
# check first
area_feature_df.shape

In [None]:
area_feature_df[area_feature_df["continent"] == "NotKnow"]

In [None]:
area_feature_df[area_feature_df.continent.apply(lambda x: True if len(x)>2 else False)]

In [None]:
area_feature_df[area_feature_df["Income Group"].isna()]

In [None]:
# fill missing value of Income Group
area_feature_df["Income Group"].unique()

In [None]:
# Manually assigned income groups
# https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)
manual_income_group = {
    "Holy See": "Lower middle income",
    "Taiwan*": "Upper middle income",
    "Western Sahara": "Low income",
    "Kosovo": "Low income",
}
for area_name, group_label in manual_income_group.items():
    area_feature_df.loc[area_name, "Income Group"] = group_label

In [None]:
area_feature_df[area_feature_df["Income Group"].isna()]

In [None]:
# No missing value of continent.
area_feature_df["continent"].unique()

In [None]:
# fill missing value of population
area_feature_df[area_feature_df["pop"].isna()].index

In [None]:
'''
pop_fill_dict = {
    'Saint Pierre and Miquelon' : 5800, 
    'Malawi':19021137,
    'Bonaire, Sint Eustatius and Saba':25160, 
    'Sao Tome and Principe':218271,
    'South Sudan':11165316, 
    'Falkland Islands (Malvinas)':3548,
}
for k,v in pop_fill_dict.items():
    area_feature_df.loc[k,"pop"] = v
area_feature_df[area_feature_df["pop"].isna()].index
'''

In [None]:
# Share of the total population per continent: the area populations are
# averaged within each country, summed per continent and normalised.
continent_pop = (
    area_feature_df
    .groupby(["continent", "country_region"])["pop"].mean()
    .reset_index()
    .groupby("continent")["pop"].sum()
)
continent_pop / continent_pop.sum()

In [None]:
# Manually supplied age-group shares for Kosovo.
for age_group, share in (("young", 0.258), ("middle", 0.672), ("old", 0.07)):
    area_feature_df.loc["Kosovo", age_group] = share

In [None]:
# treat these special areas specially
special_areas = ["Diamond Princess","MS Zaandam"] + china_area
special_areas_feature = area_feature_df.loc[special_areas,:].copy()
area_feature_df.drop(index = special_areas, axis = 0,inplace=True)

In [None]:
age_dist_group_df = area_feature_df.drop_duplicates(subset="country_region").groupby(["continent","Income Group"])[["middle","old","young"]].mean()
age_dist_group_df

In [None]:
# Areas without an age distribution receive the mean distribution of their
# (continent, income group) cell from age_dist_group_df.
for area_name in area_feature_df.index:
    if not np.isnan(area_feature_df.loc[area_name, "middle"]):
        continue
    group_key = idx[
        area_feature_df.loc[area_name, "continent"],
        area_feature_df.loc[area_name, "Income Group"],
    ]
    for age in ["young", "middle", "old"]:
        area_feature_df.loc[area_name, age] = age_dist_group_df.loc[group_key, age]
area_feature_df[area_feature_df["old"].isna()] # only those 4 special are nan, which will be taken out treat specially

In [None]:
pop_df

In [None]:
stringency_cols = data_Oxford_df.columns 

In [None]:
stringency_score_dist_group_df = area_feature_df.drop_duplicates(subset="country_region").groupby(["continent","Income Group"])[stringency_cols].mean()
stringency_score_dist_group_df.head()

In [None]:
# stringency score fill
# For every area whose first stringency column is missing, copy the scores of
# its (continent, income group) cell; when that cell has no score either, fall
# back to the column means of area_feature_df.
for i in range(area_feature_df.shape[0]):
    c = area_feature_df.index[i]
    # Only the first stringency column is inspected to decide whether the row is missing.
    if np.isnan(area_feature_df.loc[c,stringency_cols[0]]):
        cont = area_feature_df.loc[c,"continent"]
        income_group = area_feature_df.loc[c,"Income Group"]
        if np.isnan(stringency_score_dist_group_df.loc[idx[cont,income_group],stringency_cols[0]]):
            # NOTE(review): the mean is recomputed on every pass and therefore
            # includes rows filled earlier in this loop, so the result depends on row order.
            area_feature_df.loc[c,stringency_cols] = area_feature_df[stringency_cols].mean()
        else:
            for str_col in stringency_cols:
                area_feature_df.loc[c,str_col] = stringency_score_dist_group_df.loc[idx[cont,income_group],str_col]
# Rows still lacking a score after the fill (shown for inspection).
area_feature_df[area_feature_df[stringency_cols[0]].isna()]

# SEIR-HCD Model
This is a working example of a [SEIR](https://en.wikipedia.org/wiki/Compartmental_models_in_epidemiology#The_SEIR_model) model with added compartments for HCD. The letters stand for:
* S: Susceptible
* E: Exposed
* I: Infected
* R: Recovered
* H: Hospitalized (Severe)
* C: Critical (in ICU). O is "critical, not in ICU/overflow"
* D: Deaths (fatalities)

In [None]:
Image(url= "https://covid19-scenarios.org/assets/model_sketch.741fd99.svg")

Following equations are from these great web apps: 
* http://gabgoh.github.io/COVID/index.html
* https://covid19-scenarios.org/

## Parameters used in the model
`R_t` = reproduction number at time t, i.e. the number of secondary infections each infected individual produces. Typical 3.6* at t=0

**Transition times**
* `T_inc` = average incubation period. Exposed individuals progress to a symptomatic/infectious state after an average latency. Typical 5.6* days
* `T_inf` = average infectious period. Duration patient is infectious. Typical 2.9 days
* `T_hosp` = average time a patient is in hospital before either recovering or becoming critical. Typical 4 days
* `T_crit` = average time a patient is in a critical state (either recover or die). Typical 14 days

**Fractions**
These constants are likely to be age specific (hence the subscript a):
* `m_a` = fraction of infections that are asymptomatic or mild. Assumed 80% (i.e. 20% severe)
* `c_a` = fraction of severe cases that turn critical. Assumed 10%
* `f_a` = fraction of critical cases that are fatal. Assumed 30%

*Averages taken from https://www.kaggle.com/covid-19-contributions

In [None]:
oneday = timedelta(days=1)

# Lookup helper shared by the compartment equations below
def make_a_s(df,age,t):
    """Return the entry of column ``age`` at index label ``t`` of ``df``."""
    age_column = df.loc[:, age]
    return age_column.loc[t]

# Susceptible equation
# Susceptible individuals are exposed to the
# virus by contact with an infected individual.
def dS_dt(t, S, I, R_t, t_inf):
    """Advance the susceptible compartment by one day, in place.

    For each age group the value at ``t + oneday`` is the value at ``t`` plus
    ``-(R_t / t_inf) * I_total * S``, where ``I_total`` is the sum of row ``t``
    of ``I`` over all of its columns.

    :param t: label of the current row
    :param S: table of susceptibles, addressed as ``S.loc[t, age]``
    :param I: table of infected, addressed as ``I.loc[t, :]``
    :param R_t: reproduction number, addressed as ``R_t.loc[t]``
    :param t_inf: mapping age group -> infectious period
    :return: ``S`` with the row for ``t + oneday`` written
    """
    next_day = t + oneday
    infected_total = I.loc[t, :].sum()
    reproduction = R_t.loc[t]

    for group in ("young", "middle", "old"):
        susceptible = make_a_s(S, group, t)
        change = -(reproduction / t_inf[group]) * infected_total * susceptible
        S.loc[next_day, group] = S.loc[t, group] + change
    return S

# Exposed equation
def dE_dt(t, S, E, I, R_t, t_inf, t_inc):
    """Advance the exposed compartment by one day, in place.

    Per age group: ``E[t + oneday] = E[t] + (R_t / t_inf) * I_total * S - E / t_inc``,
    with ``I_total`` the sum of row ``t`` of ``I`` over all of its columns.

    :param t: label of the current row
    :param S: table of susceptibles (read at ``t``)
    :param E: table of exposed (read at ``t``, written at ``t + oneday``)
    :param I: table of infected (read at ``t``)
    :param R_t: reproduction number, addressed as ``R_t.loc[t]``
    :param t_inf: mapping age group -> infectious period
    :param t_inc: mapping age group -> incubation period
    :return: ``E`` with the row for ``t + oneday`` written
    """
    next_day = t + oneday
    infected_total = I.loc[t, :].sum()
    reproduction = R_t.loc[t]

    for group in ("young", "middle", "old"):
        susceptible = make_a_s(S, group, t)
        exposed = make_a_s(E, group, t)
        # Exposed individuals progress towards a symptomatic state
        # on average time t_inc.
        newly_exposed = (reproduction / t_inf[group]) * infected_total * susceptible
        leaving_incubation = exposed / t_inc[group]
        E.loc[next_day, group] = E.loc[t, group] + (newly_exposed - leaving_incubation)

    return E

# Infected equation
def dI_dt(t, I, E, t_inc, t_inf):
    """Advance the infected compartment by one day, in place.

    Per age group: ``I[t + oneday] = I[t] + E / t_inc - I / t_inf``.

    :param t: label of the current row
    :param I: table of infected (read at ``t``, written at ``t + oneday``)
    :param E: table of exposed (read at ``t``)
    :param t_inc: mapping age group -> incubation period
    :param t_inf: mapping age group -> infectious period
    :return: ``I`` with the row for ``t + oneday`` written
    """
    next_day = t + oneday

    for group in ("young", "middle", "old"):
        infected = make_a_s(I, group, t)
        exposed = make_a_s(E, group, t)
        # Inflow from the exposed compartment; on a time-scale of t_inf
        # infected individuals either recover or progress towards
        # hospitalization.
        inflow = exposed / t_inc[group]
        outflow = infected / t_inf[group]
        I.loc[next_day, group] = I.loc[t, group] + (inflow - outflow)
    return I


# Hospitalized equation
def dH_dt(t,I,C,H,t_inf,t_hosp,t_crit,m,f):
    """Advance the hospitalized compartment by one day, in place.

    Per age group:
    ``H[t + oneday] = H[t] + (1 - m) * I / t_inf + (1 - f) * C / t_crit - H / t_hosp``.

    :param t: label of the current row
    :param I: table of infected (read at ``t``)
    :param C: table of critical cases (read at ``t``)
    :param H: table of hospitalized (read at ``t``, written at ``t + oneday``)
    :param t_inf: mapping age group -> infectious period
    :param t_hosp: mapping age group -> time in hospital
    :param t_crit: mapping age group -> time in critical state
    :param m: mapping age group -> fraction of mild/asymptomatic infections
    :param f: mapping age group -> fraction of critical cases that are fatal
    :return: ``H`` with the row for ``t + oneday`` written
    """
    next_day = t + oneday

    for group in ("young", "middle", "old"):
        infected = make_a_s(I, group, t)
        critical = make_a_s(C, group, t)
        hospitalized = make_a_s(H, group, t)
        # Hospitalized individuals either recover or worsen towards a
        # critical state on a time-scale of t_hosp.
        from_infected = (1 - m[group]) * (infected / t_inf[group])
        from_critical = (1 - f[group]) * critical / t_crit[group]
        leaving = hospitalized / t_hosp[group]
        H.loc[next_day, group] = H.loc[t, group] + (from_infected + from_critical - leaving)
    return H

# Critical equation
def dC_dt(t,H,C,t_hosp,t_crit,c):
    """Advance the critical compartment by one day, in place.

    Per age group: ``C[t + oneday] = C[t] + c * H / t_hosp - C / t_crit``.

    :param t: label of the current row
    :param H: table of hospitalized (read at ``t``)
    :param C: table of critical cases (read at ``t``, written at ``t + oneday``)
    :param t_hosp: mapping age group -> time in hospital
    :param t_crit: mapping age group -> time in critical state
    :param c: mapping age group -> fraction of severe cases that turn critical
    :return: ``C`` with the row for ``t + oneday`` written
    """
    next_day = t + oneday

    for group in ("young", "middle", "old"):
        hospitalized = make_a_s(H, group, t)
        critical = make_a_s(C, group, t)
        # Critical individuals model ICU usage. They either return to the
        # hospital state or die on a time-scale of t_crit.
        becoming_critical = c[group] * hospitalized / t_hosp[group]
        leaving_critical = critical / t_crit[group]
        C.loc[next_day, group] = C.loc[t, group] + (becoming_critical - leaving_critical)

    return C

# Recovered equation
# Recovered individuals can not be infected again.
def dR_dt(t, R, I, H, t_inf, t_hosp, m, c):
    """Advance the recovered compartment by one day, in place.

    Per age group: ``R[t + oneday] = R[t] + m * I / t_inf + (1 - c) * H / t_hosp``.

    :param t: label of the current row
    :param R: table of recovered (read at ``t``, written at ``t + oneday``)
    :param I: table of infected (read at ``t``)
    :param H: table of hospitalized (read at ``t``)
    :param t_inf: mapping age group -> infectious period
    :param t_hosp: mapping age group -> time in hospital
    :param m: mapping age group -> fraction of mild/asymptomatic infections
    :param c: mapping age group -> fraction of severe cases that turn critical
    :return: ``R`` with the row for ``t + oneday`` written
    """
    next_day = t + oneday

    for group in ("young", "middle", "old"):
        infected = make_a_s(I, group, t)
        hospitalized = make_a_s(H, group, t)
        recovered_mild = m[group] * infected / t_inf[group]
        recovered_hospital = (1 - c[group]) * (hospitalized / t_hosp[group])
        R.loc[next_day, group] = R.loc[t, group] + (recovered_mild + recovered_hospital)

    return R

# Deaths equation
def dD_dt(t,D, C,t_crit,f):
    """Advance the cumulative deaths compartment by one day for every age group.

    Daily change per age group: ``f * C / t_crit``.

    :param t: Label of the current step (used as a row label of ``D``).
    :param D: Deaths table, indexed as ``D.loc[t, age]``; written in place.
    :param C: Critical table, read through ``make_a_s``.
    :param t_crit: Mapping from age group to time spent in a critical state.
    :param f: Mapping from age group to the fatal fraction of critical cases.
    :return: The same ``D`` object with the next day's row filled in.
    """
    next_day = t + oneday
    for group in ("young", "middle", "old"):
        critical = make_a_s(C, group, t)
        new_deaths = f[group] * critical / t_crit[group]
        D.loc[next_day, group] = D.loc[t, group] + new_deaths

    return D


#def SEIR_HCD_model(t, y, R_t, t_inc=2.9, t_inf=5.2, t_hosp=4, t_crit=14, m_a=0.8, c_a=0.1, f_a=0.3):
def SEIR_HCD_model_age(t, y, R_t, param_dict):
    """Run one step of the age-structured SEIR-HCD model.

    Each compartment table is handed to its update function in the order
    S, E, I, R, H, C, D; every function receives the tables as they stand at
    that point and returns its own table.

    :param t: Label of the current step, forwarded to every update function.
    :param y: Sequence of seven compartment tables, unpacked positionally as
        ``S, E, I, R, H, C, D``.
    :param R_t: Reproduction number. If callable it is evaluated as ``R_t(t)``;
        otherwise it is forwarded unchanged.
    :param param_dict: Model parameters keyed by ``'t_inc'``, ``'t_inf'``,
        ``'t_hosp'``, ``'t_crit'``, ``'m'``, ``'c'`` and ``'f'``. The update
        functions index each entry by age group.
    :return: ``[S, E, I, R, H, C, D]`` after the step (same positional order
        as the input is unpacked in).
    """
    reprod = R_t(t) if callable(R_t) else R_t

    t_inc = param_dict['t_inc']
    t_inf = param_dict['t_inf']
    t_hosp = param_dict['t_hosp']
    t_crit = param_dict['t_crit']
    mild_frac = param_dict['m']
    crit_frac = param_dict['c']
    fatal_frac = param_dict['f']

    S, E, I, R, H, C, D = y
    # The call order below is kept exactly as written: later calls receive the
    # tables already returned by the earlier ones.
    S = dS_dt(t, S, I, reprod, t_inf = t_inf)
    E = dE_dt(t, S, E, I, reprod, t_inf = t_inf, t_inc = t_inc)
    I = dI_dt(t, I, E, t_inc = t_inc, t_inf = t_inf)
    R = dR_dt(t, R, I, H, t_inf = t_inf, t_hosp = t_hosp, m = mild_frac, c = crit_frac)
    H = dH_dt(t, I, C, H, t_inf = t_inf, t_hosp = t_hosp, t_crit = t_crit,
              m = mild_frac, f = fatal_frac)
    C = dC_dt(t, H, C, t_hosp = t_hosp, t_crit = t_crit, c = crit_frac)
    D = dD_dt(t, D, C, t_crit = t_crit, f = fatal_frac)
    return [S, E, I, R, H, C, D]

## Data Initialization (Change Data into the format we need)

In [None]:
# Preview the per-area feature table that the cells below slice from.
area_feature_df.head()

In [None]:
# Per-area age columns and population, looked up by area name.
pop_columns = ["area", "young", "middle", "old", "pop"]
pop_df = area_feature_df.loc[:, pop_columns].copy()
pop_df = pop_df.set_index("area", drop=True)
pop_df.head()

In [None]:
# generate
# 1. pop_type_df: stores how the population of each compartment changes over time
# 2. R_s: stores the time-varying reproduction number R (it changes with the
#    restrictions posted by governments)
# 3. time_fixed_param_df: stores the time-fixed model parameters (currently disabled)

#code2_s = list(code_pop_dict.keys())
dates = train_full_df.index.levels[1]
# Compartment codes; this order defines the column order of pop_age_cols.
pop_type = [ptype for ptype in "SEIHCRD"]
age_type = ["young","middle","old"]
R = ["R_t"]
time_fixed_param = ["t_inc","t_inf","t_hosp","t_crit","m","c","f"]
pop_age_cols = pd.MultiIndex.from_product([pop_type,age_type])
#code_dates_index = pd.MultiIndex.from_product(code2_s,dates)
time_fixed_param_age_cols = pd.MultiIndex.from_product([time_fixed_param,age_type])

pop_type_df = pd.DataFrame(index = dates, columns = pop_age_cols)
# Give the empty series an explicit float dtype: without it pandas warns about
# the default dtype of an empty Series and newer versions fall back to object.
R_s = pd.Series(index = dates, dtype = float)
#time_fixed_param_df = pd.DataFrame(index = dates, columns = time_fixed_param_age_cols)

In [None]:
# Scalar defaults applied to every age group for the time-scale parameters.
default_param_dict = {
    "t_inc": 4,
    "t_inf": 3,
    "t_hosp": 4,
    "t_crit": 14,
    "m": 0.8,
    "c": 0.1,
    "f": 0.3,
}
# mcf_dict, reference to https://covid19-scenarios.org/
mcf_dict = {
    'young': {'m': 0.98, 'c': 0.075, 'f': 0.3},
    'middle': {'m': 0.96, 'c': 0.15, 'f': 0.3},
    'old': {'m': 0.7, 'c': 0.4, 'f': 0.45},
}
# param_dict[parameter][age]: m, c and f come from the age-specific table,
# every other parameter repeats its scalar default for each age group.
param_dict = {}
for param in time_fixed_param:
    param_dict[param] = (
        {age: fractions[param] for age, fractions in mcf_dict.items()}
        if param in ("m", "c", "f")
        else {age: default_param_dict[param] for age in ("young", "middle", "old")}
    )
param_dict

In [None]:
# Preview the empty compartment table (dates x (compartment, age group)).
pop_type_df.head(2)

In [None]:
# Preview the empty, date-indexed reproduction-number series.
R_s.head(2)

In [None]:
# Reproduction number used by init_R / eval_model when the caller gives none.
default_R = 3.6

In [None]:
# stringency feature: one row per (code2, date) holding the stringency score
stringency_features = ["code2", *stringency_cols]
stringency_df = (
    area_feature_df[stringency_features]
    .copy()
    .groupby("code2")
    .mean()
    .reset_index()
)
# Relabel the score columns positionally with the training dates
# (assumes stringency_cols and dates have the same length - TODO confirm).
stringency_df.columns = ["code2", *dates]
stringency_df = stringency_df.melt(id_vars = "code2", value_vars = dates)
stringency_df.columns = ["code2", "date", "stringency"]
stringency_df = stringency_df.sort_values(["code2", "date"]).set_index(["code2", "date"])
stringency_df.head()

In [None]:
def pop_lookup(area):
    """Return the entry for ``area``, preferring ``province_lookup``.

    Falls back to ``country_lookup`` when the area is not a key of
    ``province_lookup``; a missing key there raises ``KeyError``.
    """
    source = province_lookup if area in province_lookup else country_lookup
    return source[area]
def show_l(a_l):
    """Print how many areas ``a_l`` holds, then one line per area with its
    country (from ``area_country_dict``) and its ``pop_lookup`` value."""
    print(len(a_l)," Areas:")
    for area_name in a_l:
        country_part = ":\t country: " + area_country_dict[area_name]
        print(area_name, country_part, ",\t pop: ", pop_lookup(area_name))

# Areas that still report zero confirmed cases on the last training date.
zero_case_df = train_full_df.loc[idx[:,train_end_date],"ConfirmedCases"] == 0
zero_case_l = [
    area_name
    for area_name in area_country_dict
    if zero_case_df.loc[idx[area_name, train_end_date]]
]
show_l(zero_case_l)

In [None]:
# Areas that already had more than 10 confirmed cases on the first training date.
not_zero_case_df = train_full_df.loc[idx[:,train_begin_date],"ConfirmedCases"] >10
not_zero_case_l = [
    area_name
    for area_name in area_country_dict
    if not_zero_case_df.loc[idx[area_name, train_begin_date]]
]
show_l(not_zero_case_l)

# Naive Example: Model without Intervention
- $R_t$ is the same regardless of stringency of country
- All parameters are the same for each age group
- No interaction between each step

In [None]:
def init_pop(area = "Afghanistan",show = True):
    """Build the initial compartment table for one area.

    The table starts on the first date with a positive confirmed-case count.
    On that date the confirmed cases (as a fraction of the population) are
    placed in I and the rest in S, each split by the area's age columns from
    ``pop_df``; every other compartment starts at 0.

    :param area: Area name (first index level of ``train_full_df`` and index
        of ``pop_df``).
    :param show: Print the first case date and count when true.
    :return: ``(pop_type_df_eg, cases_series_eg, first_case_date, all_pop)``.
        For an area in ``special_areas`` the result is
        ``(0, cases_series_eg, 0, 0)``; for an area without any positive case
        count it is ``(0, 0, 0, 0)``.
    """
    area_rows = train_full_df.loc[idx[area,:],:].copy()
    cases_series_eg = area_rows[area_rows['ConfirmedCases']>0].copy()
    if area in special_areas:
        return 0,cases_series_eg, 0, 0
    if cases_series_eg.shape[0] == 0:
        return 0, 0, 0, 0

    first_case_date = cases_series_eg.index[0][1]
    first_case_num = cases_series_eg.loc[idx[:,first_case_date],"ConfirmedCases"].values[0]

    # Keep only the dates from the first case onwards.
    pop_type_df_eg = pop_type_df.loc[first_case_date:,:].copy()
    n_cols = pop_type_df.shape[1]

    age_shares = [pop_df.loc[area, group] for group in ("young", "middle", "old")]
    all_pop = pop_df.loc[area,"pop"]
    I_ratio = first_case_num/all_pop
    S_ratio = 1 - I_ratio

    # Column layout is S (3 ages), E (3 ages), I (3 ages), then the rest.
    first_row = (
        [S_ratio * share for share in age_shares]
        + [0.] * 3
        + [I_ratio * share for share in age_shares]
        + [0.] * (n_cols - 9)
    )
    pop_type_df_eg.loc[first_case_date,:] = np.array(first_row)

    if show:
        print(f"Inititialize population for area {area}...")
        print("First Case Date: ", first_case_date)
        print("First Case Number: ", first_case_num )
    return pop_type_df_eg,cases_series_eg, first_case_date,all_pop

def init_R(naive=True, c2="AF", stringency_df = 0, show = True, default_R = default_R, sensitivity = 1, b = 1):
    """Build the date-indexed reproduction-number series.

    :param naive: When true, every date gets ``default_R``. Otherwise the
        series is scaled by ``1 - b * (stringency / 100) ** sensitivity``.
    :param c2: Key of the first index level of ``stringency_df`` to use.
    :param stringency_df: Table with a ``"stringency"`` column indexed by
        (code, date); must be supplied when ``naive`` is false.
    :param show: Print which initialisation is used.
    :param default_R: Value used to fill the series.
    :param sensitivity: Exponent applied to the scaled stringency score
        (how people tend to react to the policy, 0.8~1.2; lower means more
        sensitive).
    :param b: Multiplier on the stringency term (how soon the policy gets
        implemented, 0.5~1; lower means slower).
    :return: Copy of ``R_s`` filled with ``default_R``, scaled when not naive.
    """
    if show:
        print(
            f"Inititialize Reproduction Rate as {default_R}, Naively"
            if naive
            else "Initialize Reproduction Rate according to Stringency Score"
        )

    R_s_eg = R_s.copy()
    R_s_eg.fillna(default_R,inplace=True)
    if naive:
        return R_s_eg

    assert not isinstance(stringency_df,int)
    score = stringency_df.loc[idx[c2,:],"stringency"]
    # Drop the code level so the scores align with R_s_eg on date alone.
    score.index = score.index.droplevel(0)
    reduction = 1 - b * (score/100) ** sensitivity
    return R_s_eg * reduction
'''
def init_param(naive = True, mcf_dict = 0, default_param_dict = default_param_dict, show = True):
    if show:
        if naive:
            print(f"Inititialize Parameters as default_para_dict, Naively")
        else:
            print("Initialize Parameters according to Stringency Score")
        
    # initialize time_fixed_param_df
    time_fixed_param_df_eg = time_fixed_param_df.copy()
    if naive:
        for k,v in default_param_dict.items():
            time_fixed_param_df_eg.loc[:,idx[k,:]] = v
    else:        
        assert not isinstance(mcf_dict, int)
        for k,v in default_param_dict.items():
            if k not in ["m","c","f"]:
                time_fixed_param_df_eg.loc[:,idx[k,:]] = v
            else:
                for age in ["young","middle","old"]:
                    time_fixed_param_df_eg.loc[:,idx[k,age]] = mcf_dict[age][k]
    return time_fixed_param_df_eg
'''

def init_input(pop_type_df_eg , show = True):
    """Split the compartment table into one age-indexed table per compartment.

    :param pop_type_df_eg: Table whose columns are a two-level MultiIndex of
        (compartment code, age group).
    :param show: Print a progress message when true.
    :return: List of independent copies in the order
        ``[S, E, I, H, C, R, D]``; each has one column per age group, in the
        same order as the source columns.
    """
    if show:
        print("Inititialize Input...")

    # Selecting a first-level key drops that level and keeps the remaining
    # labels attached to their own data. Assigning ``columns.levels[1]``
    # instead would use the *sorted* level values ('middle', 'old', 'young'),
    # which do not follow the column order and mislabel every age group.
    y_eg = [pop_type_df_eg[ptype].copy() for ptype in "SEIHCRD"]

    return y_eg

In [None]:
def run_model(first_case_date, y_eg, R_s_eg, param_dict, end_date = train_end_date):
    """Step the model day by day and return all compartments in one table.

    :param first_case_date: Date of the first step.
    :param y_eg: Compartment tables in ``init_input`` order
        ``[S, E, I, H, C, R, D]``.
    :param R_s_eg: Reproduction number forwarded to ``SEIR_HCD_model_age``.
    :param param_dict: Model parameters forwarded to ``SEIR_HCD_model_age``.
    :param end_date: Stepping stops once the current date reaches this date.
    :return: Table with ``pop_age_cols`` as columns.
    """
    date = first_case_date
    stepped = False
    while date < end_date:
        y_eg = SEIR_HCD_model_age(date, y_eg, R_s_eg, param_dict)
        date += oneday
        stepped = True

    # SEIR_HCD_model_age returns [S, E, I, R, H, C, D], whereas pop_age_cols
    # is laid out as S, E, I, H, C, R, D. Reorder before labelling so the
    # R, H and C columns carry their own data instead of each other's.
    if stepped:
        S, E, I, R, H, C, D = y_eg
    else:
        S, E, I, H, C, R, D = y_eg
    pop_type_df_eg_end = pd.concat([S, E, I, H, C, R, D], axis = 1)
    pop_type_df_eg_end.columns = pop_age_cols
    return pop_type_df_eg_end

In [None]:
OPTIM_DAYS = 21
def evaluate_model(pop_type_df_end, cases_series, all_pop, show = True, est_begin_date = train_begin_date, est_end_date = train_end_date):
    """Score predicted cases and fatalities against the reported numbers.

    Only the last ``OPTIM_DAYS`` days of the evaluation window are scored
    (fewer if the window is shorter), with larger weights on recent days.

    :param pop_type_df_end: Model output with (compartment, age) columns,
        expressed as fractions of the population.
    :param cases_series: Reported data with ``"ConfirmedCases"`` and
        ``"Fatalities"`` columns and a two-level index whose second level is
        the date.
    :param all_pop: Population used to scale the fractions to head counts.
    :param show: Print the scores when true.
    :param est_begin_date: First date of the evaluation window.
    :param est_end_date: Last date of the evaluation window.
    :return: ``(MAE, RMSLE, target_MAE, target_RMSLE)`` where ``MAE`` and
        ``RMSLE`` are dicts keyed by ``"case"`` and ``"death"``,
        ``target_MAE`` is the death MAE and ``target_RMSLE`` is the mean of
        the case and death RMSLE.
    :raises ValueError: If the evaluation window contains no data.
    """
    pop_type_df_end_pop = pop_type_df_end.sum(axis =1 ,level = 0) * all_pop
    cases_pop = cases_series.reset_index(level=0,drop = True)
    cases_pop = cases_pop.loc[est_begin_date:est_end_date].copy()
    pop_type_df_end_pop = pop_type_df_end_pop.loc[est_begin_date:est_end_date].copy()

    # Days to optimise for: bounded by what is actually available on both
    # sides (the original referenced an undefined name here).
    optim_days = min(OPTIM_DAYS, len(cases_pop), len(pop_type_df_end_pop))
    if optim_days == 0:
        raise ValueError("no data between est_begin_date and est_end_date to evaluate")
    weights = 1 / np.arange(1, optim_days+1)[::-1]  # Recent data is more heavily weighted

    inf = pop_type_df_end_pop["I"]
    rec = pop_type_df_end_pop["R"]
    hosp = pop_type_df_end_pop["H"]
    crit = pop_type_df_end_pop["C"]
    death  = pop_type_df_end_pop["D"]
    # Everyone who has left S and E counts as a (cumulative) case.
    cases = inf + rec + hosp + crit + death
    cases_real = cases_pop.loc[:,"ConfirmedCases"]
    death_real = cases_pop.loc[:,"Fatalities"]

    recent_cases_real = cases_real.iloc[-optim_days:]
    recent_cases = cases.iloc[-optim_days:]
    recent_death_real = death_real.iloc[-optim_days:]
    recent_death = death.iloc[-optim_days:]

    # Mean Absolute Error.
    case_score = mean_absolute_error(recent_cases_real, recent_cases, sample_weight = weights)
    death_score = mean_absolute_error(recent_death_real, recent_death, sample_weight = weights)

    # Root mean squared logarithmic error.
    case_log_score = np.sqrt(mean_squared_log_error(recent_cases_real, recent_cases, sample_weight = weights))
    death_log_score = np.sqrt(mean_squared_log_error(recent_death_real, recent_death, sample_weight = weights))

    MAE = {
        "case": case_score,
        "death": death_score,
    }
    target_MAE = MAE["death"]

    RMSLE = {
        "case":case_log_score,
        "death": death_log_score,
    }
    target_RMSLE = np.mean([RMSLE["case"],RMSLE["death"]])
    if show:
        print("MAE: {:.2f}, RMSLE: {:.3f}".format(target_MAE,target_RMSLE))
        print("MAE Detail\n")
        pprint(MAE)
        print("RMSLE Detail\n")
        pprint(RMSLE)
    return MAE, RMSLE, target_MAE, target_RMSLE

In [None]:
from pandas.plotting import register_matplotlib_converters
from matplotlib.dates import DateFormatter
# Register pandas' date converters with matplotlib before any date-indexed plot.
register_matplotlib_converters()
# Month-day tick format applied to the x axes by the plotting helpers below.
date_form = DateFormatter("%m-%d")

def plot_model(pop_type_df_end, cases_series, all_pop, R_s, title='SEIR+HCD model'):
    """Plot predicted vs. reported cases and fatalities, with Rt overlaid.

    Two panels are drawn side by side (cases, fatalities); each carries a
    secondary y axis showing the reproduction number.

    :param pop_type_df_end: Model output with (compartment, age) columns,
        expressed as fractions of the population.
    :param cases_series: Reported data with ``"ConfirmedCases"`` and
        ``"Fatalities"`` columns and a two-level index whose second level is
        the date.
    :param all_pop: Population used to scale the fractions to head counts.
    :param R_s: Date-indexed reproduction-number series. It is not modified.
    :param title: Figure title.
    """
    pop_type_df_end_pop = pop_type_df_end.sum(axis =1 ,level = 0) * all_pop
    cases_pop = cases_series.reset_index(level=0,drop = True)

    pop_type_df_end_pop.index = pd.to_datetime(pop_type_df_end_pop.index)
    cases_pop.index = pd.to_datetime(cases_pop.index)
    # Work on a copy: assigning to .index would otherwise change the series
    # object owned by the caller.
    R_s = R_s.copy()
    R_s.index = pd.to_datetime(R_s.index)
    R_s = R_s.loc[cases_pop.index]

    inf = pop_type_df_end_pop["I"]
    rec = pop_type_df_end_pop["R"]
    hosp = pop_type_df_end_pop["H"]
    crit = pop_type_df_end_pop["C"]
    death  = pop_type_df_end_pop["D"]
    # Everyone who has left S and E counts as a (cumulative) case.
    cases = inf + rec + hosp + crit + death
    cases_real = cases_pop["ConfirmedCases"]
    death_real = cases_pop["Fatalities"]

    def _draw_panel(ax, predicted, observed, label, color):
        # One panel: prediction (solid), reported (dashed), Rt on a twin axis.
        ax.plot(predicted, color = color, label = label)
        ax.plot(observed, color = color, linestyle = '--', label = label + '(Real)')
        ax.set_xlabel("Date", fontsize=10)
        ax.set_ylabel(label, fontsize=10, color=color)
        ax.legend(loc="upper left")
        rt_ax = ax.twinx()
        rt_ax.plot(R_s, color = "black", label = "Rt")
        rt_ax.set_ylabel("Rt", fontsize=10, color='black')
        rt_ax.legend(loc="upper right")
        ax.xaxis.set_major_formatter(date_form)

    fig,(ax1,ax2) = plt.subplots(1,2,figsize=(16,6))
    fig.suptitle(title)
    _draw_panel(ax1, cases, cases_real, 'Cases', "b")
    _draw_panel(ax2, death, death_real, 'Fatalities', 'r')
    plt.show()

In [None]:
# Map each area name to its code2 value.
area_code2_dict = dict(zip(area_feature_df.area,area_feature_df.code2))
area = "Afghanistan"
c2 = area_code2_dict[area]
pop_type_df_eg, cases_series_eg, first_case_date,all_pop = init_pop(area = area)
# init_pop signals "nothing to model" by returning the integer 0.
if isinstance(pop_type_df_eg, int):
    print(area," no cases until now")
else:
    R_s_eg = init_R(naive=True)
    # init_input's second parameter is the `show` flag, not the parameter
    # dict; pass the flag explicitly (the progress message is still printed).
    y_eg = init_input(pop_type_df_eg, show = True)
    pop_type_df_eg_end = run_model(first_case_date, y_eg, R_s_eg, param_dict)
    plot_model(pop_type_df_eg_end, cases_series_eg, all_pop,R_s = R_s_eg, title='SEIR+HCD model without intervention')
    MAE_1,RMSLE_1, target_MAE_1,target_RMSLE_1 = evaluate_model(pop_type_df_eg_end, cases_series_eg, all_pop)

The model's predicted values are much larger than the real values. The most direct remedy is to decrease the __reproduction rate__.

# Experiment

In [None]:
def plot_exp(y_exp, title='SEIR+HCD model'):
    """Plot the result of a simulation experiment.

    The left panel shows every compartment summed over age groups; the right
    panel shows cumulative cases with deaths on a secondary y axis.

    :param y_exp: Seven compartment tables, unpacked positionally as
        susceptible, exposed, infected, recovered, hospitalised, critical,
        deceased.
    :param title: Figure title.
    """
    # Collapse the age groups of every compartment.
    sus, exp, inf, rec, hosp, crit, death = (frame.sum(axis = 1) for frame in y_exp)

    cases = inf + rec + hosp + crit + death

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,5))
    fig.suptitle(title)

    compartment_lines = (
        (sus, 'tab:blue', 'Susceptible'),
        (exp, 'tab:orange', 'Exposed'),
        (inf, 'tab:red', 'Infected'),
        (rec, 'tab:green', 'Recovered'),
        (hosp, 'tab:purple', 'Hospitalised'),
        (crit, 'tab:brown', 'Critical'),
        (death, 'tab:cyan', 'Deceased'),
    )
    for series, colour, name in compartment_lines:
        ax1.plot(series, colour, label=name)

    ax1.set_xlabel("Days", fontsize=10)
    ax1.set_ylabel("Fraction of population", fontsize=10)
    ax1.legend(loc='best')
    ax1.xaxis.set_major_formatter(date_form)

    ax2.plot(cases, 'tab:red', label='Cases')
    ax2.set_xlabel("Days", fontsize=10)
    ax2.set_ylabel("Fraction of population (Cases)", fontsize=10, color='tab:red')

    # Deaths share the x axis with cases but get their own y scale.
    ax3 = ax2.twinx()
    ax3.plot(death, 'tab:cyan', label='Deceased')
    ax3.set_xlabel("Days", fontsize=10)
    ax3.set_ylabel("Fraction of population (Fatalities)", fontsize=10, color='tab:cyan')
    ax2.xaxis.set_major_formatter(date_form)
    
def fill_data(data, pred_begin_date = None, pred_end_date = None):
    """Extend ``data`` over a prediction window by repeating its last row.

    :param data: ``pandas.Series`` or ``pandas.DataFrame`` to extend.
    :param pred_begin_date: First appended date. ``None`` (the default) means
        ``train_end_date + oneday``, looked up when the function is called.
    :param pred_end_date: Last appended date. ``None`` (the default) means
        ``test_end_date``, looked up when the function is called.
    :return: ``data`` followed by one entry per day of the window, each equal
        to the last entry of ``data``. If the window is empty, ``data`` is
        returned unchanged.
    :raises TypeError: If ``data`` is neither a Series nor a DataFrame.
    """
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        # The original raised an undefined name here, i.e. a NameError.
        raise TypeError(
            "data must be a pandas Series or DataFrame, got " + type(data).__name__
        )

    if pred_begin_date is None:
        pred_begin_date = train_end_date + oneday
    if pred_end_date is None:
        pred_end_date = test_end_date

    new_dates = pd.date_range(pred_begin_date,pred_end_date)
    if len(new_dates) == 0:
        # Nothing to append; avoids indexing the first row of an empty frame.
        return data

    if isinstance(data, pd.Series):
        new_s = pd.Series(data.iloc[-1],index = new_dates)
        return pd.concat([data,new_s],axis = 0)

    new_df = pd.DataFrame(np.nan,index = new_dates,columns= data.columns)
    # Seed the first new row with the last known values, then carry forward.
    new_df.iloc[0,:] = data.iloc[-1,:]
    new_df = new_df.ffill()
    return pd.concat([data,new_df],axis = 0)
from datetime import date

def run_sim(R0, n_inf, pop_exp, pct_young,pct_middle, title = 'SEIR+HCD model'):
    """Simulate and plot an outbreak in a synthetic population.

    The simulation runs from the module-level ``start_date`` to ``end_date``
    with a constant reproduction number and the module-level ``param_dict``.

    :param R0: Reproduction number used for every date.
    :param n_inf: Number of people infected on ``start_date``.
    :param pop_exp: Size of the synthetic population.
    :param pct_young: Fraction of the population in the young group.
    :param pct_middle: Fraction of the population in the middle group.
    :param title: Figure title passed to ``plot_exp``.
    """
    # Derive the remaining share from the arguments instead of reading a
    # module-level pct_old, which may not match the fractions passed in.
    pct_old = 1 - pct_young - pct_middle
    age_shares = [pct_young, pct_middle, pct_old]

    I_ratio = n_inf/pop_exp
    S_ratio = 1- I_ratio
    pop_type_exp = pd.DataFrame(index = pd.date_range(start_date,end_date), columns = pop_age_cols)
    pop_type_exp.loc[start_date,:] = 0
    col_num = pop_type_exp.shape[1]
    # Column layout is S (3 ages), E (3 ages), I (3 ages), then the rest.
    pop_type_exp.loc[start_date,:] = np.array(
        [S_ratio * share for share in age_shares]
        + [0.] * 3
        + [I_ratio * share for share in age_shares]
        + [0.] * (col_num - 9)
    )
    y_exp = init_input(pop_type_exp,show=False)
    R_exp = init_R(naive=True,default_R=R0,show = False,)
    # R_s only covers the training dates; extend it to the simulation end.
    R_exp = fill_data(R_exp, pred_begin_date = train_end_date+oneday, pred_end_date = end_date)
    date = start_date
    while date < end_date:
        y_exp = SEIR_HCD_model_age(date, y_exp, R_exp, param_dict)
        date += oneday
    plot_exp(y_exp, title=title)
# Synthetic population: 100 days, 100,000 people, one initial infection.
start_date = date(2020,1,22)
end_date = start_date + timedelta(days=100)
pop_exp = 1e5
n_inf  = 1
pct_young = 0.2
pct_middle = 0.6
pct_old = 1-pct_young-pct_middle
# Run the same scenario with two reproduction numbers for comparison.
for R0 in (3.6, 7.2):
    run_sim(R0, n_inf,pop_exp, pct_young, pct_middle, title = "SEIR+HCD model, R0 = " + str(R0))

# Upgrade: Model with Intervention
- __Assumption 1__ - about the Reproduction Rate:
    - $R_0$ depends on the population density of the country and on how much attention people usually pay to health; we proxy these with World Development Indicators (WDI).
    - $R_t$ can be modeled as a __Transmission Reduction__: several studies attempt to estimate the effect of different aspects of social distancing and infection control on the rate of transmission. A report by Wang et al. estimates a step-wise reduction of $R_0$ from above three to around 1 and then to around 0.3 due to successive measures implemented in Wuhan. Another study investigates the effect of school closures on influenza transmission.
    - We can model the reduction using the Stringency Score from Oxford.
- We first implement the __Transmission Reduction__.

In [None]:
# Re-run the example area with the reproduction number scaled by stringency.
# init_pop signals "nothing to model" by returning the integer 0.
if isinstance(pop_type_df_eg, int):
    print(area," no cases until now")
else:
    R_s_eg = init_R(naive=False,c2 = c2,stringency_df = stringency_df,sensitivity = 0.8)
    # init_input's second parameter is the `show` flag, not the parameter
    # dict; pass the flag explicitly (the progress message is still printed).
    y_eg = init_input(pop_type_df_eg, show = True)
    pop_type_df_eg_end = run_model(first_case_date, y_eg, R_s_eg, param_dict)
    plot_model(pop_type_df_eg_end, cases_series_eg, all_pop,R_s = R_s_eg, title='SEIR+HCD model with Reproduction Rate Modified')
    MAE_2,RMSLE_2, target_MAE_2,target_RMSLE_2 = evaluate_model(pop_type_df_eg_end, cases_series_eg, all_pop)

In [None]:
# Relative change of model 1's error over model 2's error, in percent
# (positive means model 2 has the smaller error).
print("Compared with model 1, model 2 performance:")
print("MAE Improved: {:.2f} %".format((target_MAE_1/target_MAE_2-1)*100))
# Label fixed: the metric is RMSLE (it was printed as "RMLSE").
print("RMSLE Improved: {:.2f} %".format((target_RMSLE_1/target_RMSLE_2-1)*100))

# Find the reason why predictions fail
- Use the RMSLE score of each area's prediction and correlate it with other features.

In [None]:
def eval_model(area = "Afghanistan", 
               naive_R = True,stringency_df = 0,default_R = default_R,sensitivity = 1, b = 1,
               param_dict = param_dict,
               title="No title provided", plot=True, show = True):
    """Initialise, run, optionally plot and score the model for one area.

    :param area: Area name (key of ``area_code2_dict``).
    :param naive_R: Forwarded to ``init_R`` as ``naive``.
    :param stringency_df: Forwarded to ``init_R``.
    :param default_R: Forwarded to ``init_R``.
    :param sensitivity: Forwarded to ``init_R``.
    :param b: Forwarded to ``init_R``.
    :param param_dict: Model parameters forwarded to ``run_model``.
    :param title: Plot title.
    :param plot: Draw the comparison plot when true.
    :param show: Forwarded to the helpers to control their printing.
    :return: ``(MAE, RMSLE, target_MAE, target_RMSLE)``. When ``init_pop``
        yields no population table, both dicts are the same all-zero dict and
        both targets are 0.
    """
    c2 = area_code2_dict[area]
    pop_type_df_eg, cases_series_eg, first_case_date,all_pop = init_pop(area = area,show = show)

    # init_pop returns the integer 0 when there is nothing to model.
    if isinstance(pop_type_df_eg, int):
        print(area," no cases until now")
        default_loss ={'case':0, 'death':0,'rec': 0}
        return default_loss, default_loss, 0, 0

    R_s_eg = init_R(naive=naive_R,c2 = c2,stringency_df = stringency_df,show = show,
                    default_R = default_R, sensitivity = sensitivity, b = b)
    y_eg = init_input(pop_type_df_eg, show = show)
    pop_type_df_eg_end = run_model(first_case_date, y_eg, R_s_eg, param_dict)
    if plot:
        plot_model(pop_type_df_eg_end, cases_series_eg,all_pop, R_s_eg, title=title)
    return evaluate_model(pop_type_df_eg_end, cases_series_eg, all_pop, show = show)

In [None]:
# Invert selected_features_dict (value -> key), then keep only the entries
# whose value is a column of area_feature_df.
selected_features_dict_vk = {value: key for key, value in selected_features_dict.items()}
WDI_features_dict = {
    col: selected_features_dict_vk[col]
    for col in area_feature_df.columns
    if col in selected_features_dict_vk
}

In [None]:
def get_area_basic_info(area):
    """Print the basic profile of one area.

    Prints country, continent, income group, population, density, the three
    age columns from ``pop_df`` and every feature listed in
    ``WDI_features_dict``.

    :param area: Area name; must be a key of ``area_code2_dict`` and
        ``area_country_dict`` and a row label of ``area_feature_df``.
    """
    c2 = area_code2_dict[area]
    country = area_country_dict[area]

    # Filter the feature table once and reuse the result for every lookup.
    area_rows = area_feature_df[area_feature_df.area==area]
    continent = area_rows.loc[area,"continent"]
    income_group = area_rows.loc[area,"Income Group"]
    density = area_rows.loc[area,"EN.POP.DNST"]

    young, middle, old, pop = pop_df.loc[area,:]
    print(f"Area: {area}\nCountry: {country}\nContinent: {continent}\nIncome Group: {income_group}")
    print(f"Population: {pop}, Density: {density}\nyoung:{np.round(young,2)}, middle:{np.round(middle,2)}, old:{np.round(old,2)}\n")
    for feature_code, feature_name in WDI_features_dict.items():
        print(feature_name,": ",area_rows.loc[area,feature_code])

In [None]:
area = "Turkey"
get_area_basic_info(area)

show = False
# Model 1: constant reproduction number.
MAE_1, RMSLE_1, target_MAE_1, target_RMSLE_1 = eval_model(
    area = area,
    naive_R = True,
    title = "naive",
    show = show,
)
# Model 2: reproduction number scaled by the stringency score.
# NOTE(review): default_R=50 is far above the default_R constant (3.6) used
# elsewhere in this file - confirm it is intentional.
MAE_2, RMSLE_2, target_MAE_2, target_RMSLE_2 = eval_model(
    area = area,
    naive_R = False,
    stringency_df = stringency_df,
    default_R = 50,
    title = "+ Stringency Score",
    show = show,
)

In [None]:
print(area)
# Relative change of model 1's error over model 2's error, in percent
# (positive means model 2 has the smaller error).
print("Compared with model 1, model 2 performance:")
print("MAE Improved: {:.2f} %".format((target_MAE_1/target_MAE_2-1)*100))
# Label fixed: the metric is RMSLE (it was printed as "RMLSE").
print("RMSLE Improved: {:.2f} %".format((target_RMSLE_1/target_RMSLE_2-1)*100))

## Analyze the correlation of RMSLE and other features

# Score the stringency-scaled model for every area and store the two RMSLE
# values next to the area's features.
for rmsle_col in ("case_RMSLE", "death_RMSLE"):
    area_feature_df.loc[:, rmsle_col] = np.nan
count = 0
for count, area in enumerate(area_feature_df.index, start=1):
    print(area)
    _, RMSLE_eg,_,_ = eval_model(area = area,
                                 naive_R=False, stringency_df = stringency_df, default_R=3.6,
                                 title = "+ Stringency Score", show = False,plot = False)
    area_feature_df.loc[area,"case_RMSLE"] = RMSLE_eg["case"]
    area_feature_df.loc[area,"death_RMSLE"] = RMSLE_eg["death"]
    print(RMSLE_eg)
    # Progress report every 20 areas.
    if count % 20 == 0:
        print(np.round(count/area_feature_df.shape[0] * 100,1), "% finished")

# Summary statistics of the per-area RMSLE scores.
area_feature_df[["case_RMSLE","death_RMSLE"]].describe()

# Display the areas ordered by case RMSLE (the result is not stored).
area_feature_df.sort_values("case_RMSLE")

# Every column except the per-date stringency columns takes part in the
# correlation analysis.
corr_cols = [col for col in area_feature_df if col not in stringency_cols]
corr_df = area_feature_df[corr_cols].copy()
corr_df.shape

# Order the variables by their correlation with case_RMSLE, then recompute
# the matrix in that order. The sorted labels are the *row* index of the
# sorted matrix; taking .index after a transpose (as before) returned the
# original, unsorted order, so the sort had no effect.
corr = corr_df.corr().sort_values("case_RMSLE")
cols = corr.index
corr = corr_df[cols].corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

RMSLE_l = ["case_RMSLE","death_RMSLE"]
# Hand-picked variables with the strongest negative / positive correlation.
neg_corr_4 = ["SN.ITK.DEFC.ZS","SH.DTH.COMM.ZS","young","SH.STA.WASH.P5"]
pos_corr_4 = ["SH.XPD.GHED.GE.ZS",'NY.ADJ.NNTY.PC.CD',"SH.STA.BASS.ZS","EG.ELC.ACCS.ZS"]

# Print each code, followed by its readable name when one is known.
print("Negatively Correlated with RMSLE")
for neg in neg_corr_4:
    print(*((neg, ":", WDI_features_dict[neg]) if neg in WDI_features_dict else (neg,)))
print()
print("Positively Correlated with RMSLE")
for pos in pos_corr_4:
    print(*((pos, ":", WDI_features_dict[pos]) if pos in WDI_features_dict else (pos,)))

# Report which of the hand-picked variables are also in del_hi_corr_var_l.
for col in neg_corr_4 + pos_corr_4:
    if col not in del_hi_corr_var_l:
        continue
    print(col)

# Correlation columns that are not in del_hi_corr_var_l.
corr_cols_2 = list(filter(lambda name: name not in del_hi_corr_var_l, corr_cols))

def corr(x, y, **kwargs):
    """Annotate the current axes with the correlation coefficient of x and y.

    Extra keyword arguments are accepted and ignored.
    """
    rho = np.corrcoef(x, y)[0][1]
    text = r'$\rho$ = ' + str(round(rho, 2))
    # Position is given in axes coordinates of the current axes.
    current_ax = plt.gca()
    current_ax.annotate(text, xy = (0.2, 0.95), xycoords = current_ax.transAxes)
    
# Pair-grid visualisations of the RMSLE scores against selected variables.
# They are disabled by default; set plot = True to draw them.
plot = False
if plot:
    # RMSLE scores vs. the negatively correlated variables.
    #Create a pair grid instance
    grid = sns.PairGrid(data = area_feature_df[RMSLE_l + neg_corr_4].copy(),height = 4)

    #Map the plots to the locations
    grid = grid.map_upper(plt.scatter, color = 'darkred')
    grid = grid.map_upper(corr)
    grid = grid.map_lower(sns.kdeplot, cmap = 'Reds')
    grid = grid.map_diag(plt.hist, bins = 10, edgecolor =  'k', color = 'darkred');

if plot:
    # RMSLE scores vs. the positively correlated variables.
    #Create a pair grid instance
    grid = sns.PairGrid(data = area_feature_df[RMSLE_l + pos_corr_4].copy(),height = 4)

    #Map the plots to the locations
    grid = grid.map_upper(plt.scatter, color = 'darkred')
    grid = grid.map_upper(corr)
    grid = grid.map_lower(sns.kdeplot, cmap = 'Reds')
    grid = grid.map_diag(plt.hist, bins = 10, edgecolor =  'k', color = 'darkred');
    plt.title("Visualization of variable positively correlated with RMSLE")

if plot:
    #plot the remaining variables
    
    # The filter reads as (A and B) or C because `and` binds tighter than `or`:
    # keep WDI features that are not hand-picked above, plus the RMSLE columns.
    #Create a pair grid instance
    grid = sns.PairGrid(data = area_feature_df[[ col for col in corr_cols_2 if \
                                                (col not in (pos_corr_4+neg_corr_4) and (col in WDI_features_dict) or col in RMSLE_l)]].copy(),height = 4)

    #Map the plots to the locations
    grid = grid.map_upper(plt.scatter, color = 'darkred')
    grid = grid.map_upper(corr)
    grid = grid.map_lower(sns.kdeplot, cmap = 'Reds')
    grid = grid.map_diag(plt.hist, bins = 10, edgecolor =  'k', color = 'darkred')

## Finding 1
1.   From the correlation matrix, we can see that some variables are highly correlated, like the GDP and GNP data, so we can delete the redundant ones. The health expenditure variables are also highly correlated with each other; in this case we keep the one with the highest correlation with RMSLE, especially death RMSLE (0.4): SH.XPD.GHED.GE.ZS, which is Domestic general government health expenditure (% of general government expenditure). We apply the same method to the variables that are negatively correlated with the score.
2. From the first (negative correlation) pair plot we can see that the model tends to __perform well in countries with a high percentage of young people, and in countries without enough access to sanitation infrastructure__.
3. From the second (positive correlation) pair plot we can see that the model still tends to perform well in poor countries.

# Find the right parameters
1. Adjust $R_0$ using the stringency score, fixing m, c, f. t_inc (5.2), t_inf (2.9), t_hosp (4), t_crit (14).
2. Get the $R_0$ score of each area and try to find the pattern.

In [None]:
def pick_best_param(x,
                    area = "Turkey", 
                    naive_R = False,stringency_df = stringency_df,
                    param_dict = param_dict,
                    title="No title provided", plot=False, show = False):
    """Objective function for the fit: score one candidate parameter vector.

    Only R0, b and the four time-scale parameters are varied; m, c and f are
    taken from ``param_dict`` unchanged and ``sensitivity`` is fixed at 1.

    :param x: Sequence ``(R0, b, t_inc, t_inf, t_hosp, t_crit)``.
    :param area: Area to evaluate.
    :param naive_R: Forwarded to ``eval_model``.
    :param stringency_df: Forwarded to ``eval_model``.
    :param param_dict: Base parameters (``param_dict[name][age]``). It is not
        modified; the candidate values are written to a private copy.
    :param title: Plot title forwarded to ``eval_model``.
    :param plot: Forwarded to ``eval_model``.
    :param show: Forwarded to ``eval_model``.
    :return: ``target_RMSLE`` reported by ``eval_model`` for this candidate.
    """
    import copy

    R0,b,t_inc,t_inf, t_hosp, t_crit = x
    sensitivity = 1

    # Copy before writing: param_dict is the shared default object, and an
    # objective function that is called many times by the optimiser should
    # not leave its candidate values behind in it.
    param_dict_picked = copy.deepcopy(param_dict)
    candidate_times = {"t_inc": t_inc, "t_inf": t_inf, "t_hosp": t_hosp, "t_crit": t_crit}
    for name, value in candidate_times.items():
        for age in ["young","middle","old"]:
            param_dict_picked[name][age] = value

    MAE, RMSLE, target_MAE, target_RMSLE = eval_model(area = area, 
                                                naive_R = naive_R,stringency_df = stringency_df, default_R= R0 ,sensitivity = sensitivity,b=b,
                                                param_dict = param_dict_picked,
                                                title=title, plot=plot, show = show)
    # choose the optimization target as target_RMSLE 
    return target_RMSLE

def fit_model(initial_guess = [3.6, 1, 3, 3, 5, 10], 
              bounds = [
                  (2, 5),# R0
                  (0.5,1), # b
                  (1, 7), # t_inc
                  (1, 5), # t_inf
                  (3, 7),# t_hosp
                  (7, 14),# t_crit
              ],
                area = "Turkey", 
                naive_R = False,
                stringency_df = stringency_df,
                param_dict = param_dict,
                title="No title provided",
                plot=False,
                show = False,
):
    """Fit R0, b and the time-scale parameters for one area.

    Minimises ``pick_best_param`` with bounded L-BFGS-B, then re-evaluates
    the model once with the fitted values and plots that final run.

    :param initial_guess: Start point ``(R0, b, t_inc, t_inf, t_hosp, t_crit)``.
    :param bounds: ``(low, high)`` pair for each entry of ``initial_guess``.
    :param area: Area to fit.
    :param naive_R: Forwarded to the objective and to ``eval_model``.
    :param stringency_df: Forwarded to the objective and to ``eval_model``.
    :param param_dict: Parameters (``param_dict[name][age]``). The fitted
        time-scale values are written into this object in place.
    :param title: Title forwarded to the objective only; the final plot uses
        ``"Fitted Model for " + area``.
    :param plot: Forwarded to the objective only.
    :param show: Forwarded to the objective only.
    :return: ``(R0, b, sensitivity, t_inc, t_inf, t_hosp, t_crit, MAE, RMSLE)``
        with ``sensitivity`` fixed at 1.
    """
    def objective(x):
        return pick_best_param(x, area, naive_R, stringency_df, param_dict, title, plot, show)

    print("Start Fitting "+ area + "....")
    res = minimize(objective, x0 = initial_guess, bounds = bounds, method = 'L-BFGS-B')

    # Re-run once with the fitted values.
    R0,b,t_inc,t_inf, t_hosp, t_crit = res.x
    sensitivity = 1
    fitted_times = {"t_inc": t_inc, "t_inf": t_inf, "t_hosp": t_hosp, "t_crit": t_crit}
    for name, value in fitted_times.items():
        for age in ("young", "middle", "old"):
            param_dict[name][age] = value

    MAE, RMSLE, target_MAE, target_RMSLE = eval_model(
        area = area,
        naive_R = naive_R, stringency_df = stringency_df,
        default_R = R0, sensitivity = sensitivity, b = b,
        param_dict = param_dict,
        title = "Fitted Model for " + area, plot = True, show = False)
    return R0,b,sensitivity,t_inc, t_inf, t_hosp, t_crit, MAE, RMSLE

In [None]:
def show_param(area, R0, b,sensitivity, t_inc, t_inf, t_hosp, t_crit, RMSLE):
    """Print the fitted parameters and error scores for one area.

    ``RMSLE`` must be a mapping with the keys "case" and "death". The last
    line printed is the mean of those two values.
    """
    case_err = RMSLE["case"]
    death_err = RMSLE["death"]
    print(area)
    print(
        f"R0 {R0:.2f}, b {b:.2f}, sensitivity {sensitivity:.2f}, "
        f"t_inc {t_inc:.2f}, t_inf {t_inf:.2f}, "
        f"t_hosp {t_hosp:.2f}, t_crit {t_crit:.2f}"
    )
    print(f"Case RMSLE {case_err:.3f}, Death RMSLE {death_err:.3f}")
    print(f"Reach score {(case_err + death_err) / 2:.3f}")

In [None]:
# Fit the model for every area in area_feature_df and store the results.
# NOTE: top_worst_10 is defined here but not referenced again in this cell.
top_worst_10 =['Nepal', 'Thailand', 'Turkey', 'United Arab Emirates',
       'British Columbia', 'Michigan', 'Louisiana', 'Cambodia', 'Panama',
       'Finland']
# Create the result columns, initialised to NaN for every area.
area_feature_df.loc[:,"fitted_R0"] = np.nan
area_feature_df.loc[:,"fitted_b"] = np.nan
area_feature_df.loc[:,"fitted_sensitivity"] = np.nan
area_feature_df.loc[:,"fitted_t_inc"] = np.nan
area_feature_df.loc[:,"fitted_t_inf"] = np.nan
area_feature_df.loc[:,"fitted_t_crit"] = np.nan
area_feature_df.loc[:,"fitted_t_hosp"] = np.nan
area_feature_df.loc[:,"fitted_case_RMSLE"] = np.nan
area_feature_df.loc[:,"fitted_death_RMSLE"] = np.nan
sensitivity = 1
count = 0
# Resume support: with start = False, every area up to and including
# before_area_has_finished is skipped. With start = True (as set here) the
# skip branch never runs and all areas are fitted.
before_area_has_finished  = "Prince Edward Island"
start = True
for area in  area_feature_df.index:# remember to change
    if not start:
        if area == before_area_has_finished:
            start = True
        count += 1
        continue
    t0 = time()
    # fit_model is called with its default guess, bounds and data arguments.
    R0, b,sensitivity, t_inc,t_inf,t_hosp,t_crit,MAE, RMSLE = fit_model(area = area)
    area_feature_df.loc[area,"fitted_R0"] = R0
    area_feature_df.loc[area,"fitted_b"] = b
    area_feature_df.loc[area,"fitted_sensitivity"] = sensitivity
    area_feature_df.loc[area,"fitted_t_inc"] = t_inc
    area_feature_df.loc[area,"fitted_t_inf"] = t_inf
    area_feature_df.loc[area,"fitted_t_hosp"] = t_hosp
    area_feature_df.loc[area,"fitted_t_crit"] = t_crit
    area_feature_df.loc[area,"fitted_case_RMSLE"] = RMSLE["case"]
    area_feature_df.loc[area,"fitted_death_RMSLE"] = RMSLE["death"]
    show_param(area, R0,b,sensitivity, t_inc, t_inf, t_hosp, t_crit, RMSLE)
    t1 = time()
    # Elapsed wall-clock time for this area, in minutes.
    print(np.round((t1-t0)/60,2)," minutes")
    count += 1
    # Progress report every 20 areas (skipped areas are counted too).
    if count % 20 == 0:
        print(np.round(count/area_feature_df.shape[0] * 100,1), "% finished")

In [None]:
# test
# Evaluate the model for one area with hand-picked parameters (no fitting).
area = "Nepal"
R0 = 2
b = 1
sensitivity = 1
t_inc = 10
t_inf = 3
t_hosp = 7
t_crit = 14
# NOTE: this is an alias, not a copy - the assignments below also change
# the shared param_dict.
param_dict_picked = param_dict

# Apply the same durations to every age group.
for age in ["young","middle","old"]:
    param_dict_picked["t_inc"][age] = t_inc
    param_dict_picked["t_inf"][age] = t_inf
    param_dict_picked["t_hosp"][age] = t_hosp
    param_dict_picked["t_crit"][age] = t_crit

MAE, RMSLE, target_MAE, target_RMSLE = eval_model(area = area, 
           naive_R = False, stringency_df = stringency_df, default_R = R0, sensitivity = sensitivity, b = b,
           param_dict = param_dict_picked,
           title= "Fitted Model for " + area, plot = True, show = False)
show_param(area, R0, b,sensitivity, t_inc, t_inf, t_hosp, t_crit, RMSLE)

In [None]:
# Persist the feature table (including the fitted_* columns) to feature.csv.
area_feature_df.to_csv("feature.csv")

In [None]:
# Total number of rows (areas) in the feature table.
area_feature_df.shape[0]

In [None]:
# Number of areas whose fitted_R0 is not NaN.
area_feature_df.shape[0] - area_feature_df["fitted_R0"].isna().sum()

In [None]:
# Summary statistics of all fitted parameters and their error scores.
fitted_cols = ["fitted_R0","fitted_b","fitted_sensitivity","fitted_t_inc","fitted_t_inf","fitted_t_hosp","fitted_t_crit","fitted_case_RMSLE","fitted_death_RMSLE"]
area_feature_df.loc[:,fitted_cols].describe()

In [None]:
# Correlation heat-map between the area features and the fitted values.
# Stringency columns are dropped from the correlation, except those whose
# name contains "01-22" or "04-10", which are kept.
stringency_cols_refined = [col for col in stringency_cols if ("01-22" not in col and "04-10" not in col)]
corr_cols = []
for col in area_feature_df:
    if col in stringency_cols_refined:
        continue
    corr_cols.append(col)
corr_df = area_feature_df[corr_cols].copy()
# NOTE: bare expression in the middle of a cell - it has no visible effect.
corr_df.shape
# First pass: order the columns by their correlation with fitted_case_RMSLE.
corr = corr_df.corr().sort_values("fitted_case_RMSLE").T
# Second pass: recompute the correlation matrix with columns in that order.
corr_df = area_feature_df[corr.columns].copy()
corr = corr_df.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)
# 'RdBu_r' and 'BrBG' are other good diverging colormaps.
# NOTE(review): the original comment ended with the stray token
# 'SH.XPD.CHEX.GD.ZS'; its purpose here is unclear.

### Finding 2
1. The evaluation metric makes the model tend to fit the beginning of the curve well but perform badly in the long run.
2. This is especially visible when COVID-19 is already under control in a country, such as China: the predicted curve still tends to go up and shows no sign of slowing down.
3. The death curve usually performs badly.

## Submission
1. Use all the training data to fit the model and pick the best parameters.
2. Extend the end date to test_end_date.

In [None]:
# NOTE: train_end_date+oneday != test_begin_date
def fill_data(data, pred_begin_date = train_end_date+oneday, pred_end_date = test_end_date):
    """Extend ``data`` over the prediction window by repeating its last row.

    Parameters
    ----------
    data : pd.Series or pd.DataFrame
        Observed values; the last entry is carried forward.
    pred_begin_date, pred_end_date :
        Inclusive bounds passed to ``pd.date_range`` to build the new index.

    Returns
    -------
    pd.Series or pd.DataFrame
        A new object: ``data`` followed by one entry per new date, each equal
        to the last observed entry. If the date range is empty, an unchanged
        copy of ``data`` is returned.

    Raises
    ------
    TypeError
        If ``data`` is neither a Series nor a DataFrame. (The previous code
        used ``raise Error`` with an undefined name, which surfaced as a
        confusing NameError.)
    """
    new_dates = pd.date_range(pred_begin_date, pred_end_date)
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError(
            "fill_data expects a pandas Series or DataFrame, got "
            + type(data).__name__
        )
    if len(new_dates) == 0:
        # Nothing to append; the DataFrame branch below would fail on iloc[0].
        return data.copy()

    if isinstance(data, pd.Series):
        extension = pd.Series(data.iloc[-1], index=new_dates)
    else:
        extension = pd.DataFrame(np.nan, index=new_dates, columns=data.columns)
        # Seed the first new row with the last observation, then carry forward.
        extension.iloc[0, :] = data.iloc[-1, :]
        extension = extension.ffill()
    return pd.concat([data, extension], axis=0)

def init_R_pred(naive=True, c2="AF", stringency_df = 0, show = True, default_R = default_R, sensitivity = 1,b = 1,
                pred_begin_date = train_end_date+oneday, pred_end_date = test_end_date):
    """Build the reproduction-rate series, extended over the prediction window.

    The module-level ``R_s`` is copied, its missing values are replaced by
    ``default_R``, and the result is extended with ``fill_data``.

    When ``naive`` is false the series is additionally multiplied by
    ``1 - stringency ** sensitivity * b``, where ``stringency`` is the
    "stringency" column of ``stringency_df`` for ``c2`` divided by 100. Over
    the prediction window each day's score is recomputed from the previous
    day as ``1 - (1 - previous) * exp(-ln(2) / 30)``.

    Parameters
    ----------
    naive : if true, return the filled series without any stringency scaling.
    c2 : first-level key used to select rows of ``stringency_df``.
    stringency_df : must not be an int when ``naive`` is false (asserted).
    show : print which initialisation mode is used.
    default_R : value used to fill missing entries of ``R_s``.
    sensitivity, b : exponent and scale of the stringency multiplier.
    pred_begin_date, pred_end_date : inclusive bounds of the prediction window.
    """
    if show:
        if naive:
            message = f"Inititialize Reproduction Rate as {default_R}, Naively"
        else:
            message = "Initialize Reproduction Rate according to Stringency Score"
        print(message)

    base_R = fill_data(R_s.copy().fillna(default_R),
                       pred_begin_date=pred_begin_date,
                       pred_end_date=pred_end_date)
    if naive:
        return base_R

    assert not isinstance(stringency_df,int)
    # account for the shape of Stringency Score
    decay = np.exp(-np.log(2)/30)

    score = stringency_df.loc[idx[c2,:],"stringency"]
    score.index = score.index.droplevel(0)
    score = fill_data(score, pred_begin_date=pred_begin_date,
                      pred_end_date=pred_end_date)
    score /= 100
    # Each predicted day depends on the previous one, so this stays a loop.
    for day in pd.date_range(pred_begin_date, pred_end_date):
        score.loc[day] = 1 - (1 - score.loc[day - oneday]) * decay

    # add sensitivity to measure how people tend to react to the policy (0~3)
    multiplier = 1 - score ** sensitivity * b
    return base_R * multiplier
    
def get_predict_pop_type(area,all_pop, pop_type_df_eg_end, adjust = True):
    """Turn simulated compartment shares into case/fatality forecasts.

    Parameters
    ----------
    area : area name; written to the "Area" column and used to look up the
        last training observation when ``adjust`` is true.
    all_pop : multiplier applied to the compartment values.
    pop_type_df_eg_end : DataFrame whose first column level names the
        compartment; the columns "I", "R", "H", "C" and "D" are required.
        Columns sharing the same first-level label are summed.
    adjust : if true, rescale cases and deaths so that their value on
        ``train_end_date`` equals the last training observation (skipped for
        a series when either value is zero).

    Returns
    -------
    pd.DataFrame
        Columns "Date", "ConfirmedCases", "Fatalities", "Area", restricted to
        ``test_begin_date`` .. ``test_end_date``.
    """
    # Sum over the remaining column level(s). Written with groupby because
    # DataFrame.sum(level=...) is no longer available in pandas >= 2.0.
    pop_by_type = pop_type_df_eg_end.T.groupby(level=0).sum().T * all_pop

    death = pop_by_type["D"]
    cases = (pop_by_type["I"] + pop_by_type["R"] + pop_by_type["H"]
             + pop_by_type["C"] + death)

    if adjust:
        # adjust cases and death according to the last day value of the training data
        last_case = train_full_df.loc[idx[area,train_end_date],"ConfirmedCases"]
        last_death = train_full_df.loc[idx[area,train_end_date],"Fatalities"]
        # The scale factor is read from the full simulation (before slicing to
        # the test window) so it also works when train_end_date lies outside
        # that window.
        simulated_case = cases.loc[train_end_date]
        simulated_death = death.loc[train_end_date]
        if last_case != 0 and simulated_case != 0:
            cases = cases * (last_case / simulated_case)
        if last_death != 0 and simulated_death != 0:
            death = death * (last_death / simulated_death)

    cases = cases.loc[test_begin_date:test_end_date]
    death = death.loc[test_begin_date:test_end_date]

    output = pd.concat([cases, death], axis=1).reset_index()
    output.columns = ["Date","ConfirmedCases","Fatalities"]
    output["Area"] = area
    return output

def predict(area = "Afghanistan", pred_begin_date = train_end_date+oneday, pred_end_date = test_end_date,
            naive_R = False, stringency_df = stringency_df, default_R = default_R,
            param_dict = param_dict,
            show = False,
            adjust = False
            ):
    """Forecast confirmed cases and fatalities for one area.

    The fitted parameters are read from ``area_feature_df``; when
    ``fitted_R0`` is NaN a fixed fallback parameter set is used instead.

    Parameters
    ----------
    area : area name (key of ``area_code2_dict`` and ``area_feature_df``).
    pred_begin_date, pred_end_date : window over which inputs are extended.
    naive_R, stringency_df : forwarded to ``init_R_pred``.
    default_R : accepted for interface compatibility; the fitted (or
        fallback) R0 is what is actually passed on.
    param_dict : model parameters; MODIFIED IN PLACE with the durations used.
    show : verbosity flag forwarded to the helpers.
    adjust : rescale the output to match the last training observation
        (see ``get_predict_pop_type``).

    Returns
    -------
    pd.DataFrame
        Columns "Date", "ConfirmedCases", "Fatalities", "Area". The frame is
        empty when the area has no cases yet.
    """
    c2 = area_code2_dict[area]
    pop_type_df_eg, _cases_series_eg, first_case_date, all_pop = init_pop(area = area,show = show)

    # An int in place of the population frame is treated as "no cases yet"
    # (this is what the original check tested for - confirm against init_pop).
    # Return an empty, correctly-shaped frame instead of failing later:
    # previously `output` was never assigned on this path, and fill_data had
    # already been called on the int before the check was reached.
    if isinstance(pop_type_df_eg, int):
        print(area," no cases until now")
        return pd.DataFrame(columns=["Date","ConfirmedCases","Fatalities","Area"])

    pop_type_df_eg = fill_data(pop_type_df_eg, pred_begin_date, pred_end_date)

    fitted_R0 = area_feature_df.loc[area,"fitted_R0"]
    fitted_b = area_feature_df.loc[area,"fitted_b"]
    fitted_sensitivity = area_feature_df.loc[area,"fitted_sensitivity"]
    fitted_t_inc =  area_feature_df.loc[area,"fitted_t_inc"]
    fitted_t_inf =  area_feature_df.loc[area,"fitted_t_inf"]
    fitted_t_hosp =  area_feature_df.loc[area,"fitted_t_hosp"]
    fitted_t_crit =  area_feature_df.loc[area,"fitted_t_crit"]
    if np.isnan(fitted_R0):
        # No fit stored for this area: fall back to a fixed parameter set.
        print(f"Area {area}'s fitted value is nan")
        fitted_R0 = 3.6
        fitted_b = 1
        fitted_sensitivity = 1
        fitted_t_inc = 3
        fitted_t_inf = 2
        fitted_t_hosp = 4
        fitted_t_crit = 12

    # Same durations for every age group (writes into the caller's dict).
    for age in ["young","middle","old"]:
        param_dict["t_inc"][age] = fitted_t_inc
        param_dict["t_inf"][age] = fitted_t_inf
        param_dict["t_hosp"][age] = fitted_t_hosp
        param_dict["t_crit"][age] = fitted_t_crit

    R_s_eg = init_R_pred(naive=naive_R, c2=c2, stringency_df=stringency_df, show = show,
                         default_R = fitted_R0, sensitivity= fitted_sensitivity, b = fitted_b,
                         pred_begin_date = pred_begin_date, pred_end_date = pred_end_date)
    y_eg = init_input(pop_type_df_eg, show = show)
    # NOTE(review): the simulation always runs to test_end_date, not to
    # pred_end_date - confirm this is intended.
    pop_type_df_eg_end = run_model(first_case_date, y_eg, R_s_eg, param_dict, end_date = test_end_date)
    return get_predict_pop_type(area ,all_pop, pop_type_df_eg_end,adjust = adjust)

In [None]:
# Empty accumulator for the per-area forecasts built in a later cell.
outputs = pd.DataFrame(columns = ["Date","Area","ConfirmedCases","Fatalities"])

# NOTE: this prints the value `area` had BEFORE the reassignment below.
print(area)
# Exploratory check of the model inputs for a single area.
area = "Andorra"
# NOTE(review): `show` is not defined in this cell; it must already exist as
# a notebook-level variable - confirm.
pop_type_df_eg_exp,_,_,_ = init_pop(area = area,show = show)
# Bare expression in the middle of a cell - it has no visible effect.
area_feature_df.loc[area,"fitted_R0"]
c2 = area_code2_dict[area]
# `b` is not passed, so init_R_pred uses its default (b = 1).
R_s_exp = init_R_pred(naive=False, c2=c2, stringency_df=stringency_df, show = show, 
                            default_R = 3.6, sensitivity= 1,
                            pred_begin_date = train_end_date+oneday, pred_end_date = test_end_date)
y_eg_exp = init_input(pop_type_df_eg_exp, show = show)
y_eg_exp

In [None]:
# Run predict() for every area and append the result to `outputs`.
# Resume support: with start = False, every area up to and including
# before_area_has_finished is skipped. With start = True (as set here) the
# skip branch never runs.
before_area_has_finished  = "Botswana"
start = True
count = 0
for area in  area_feature_df.index:
    if not start:
        if area == before_area_has_finished:
            start = True
        count += 1
        continue
    print(area)
    out = predict(area = area, show = False,adjust = False)
    outputs = pd.concat([outputs, out],axis = 0, sort =True)        
    count += 1
    # Progress report every 20 areas (skipped areas are counted too).
    if count % 20 == 0:
        print(np.round(count/area_feature_df.shape[0] * 100,1), "% finished")

In [None]:
def naive_fill(area, pred_begin_date = train_end_date+oneday, pred_end_date = test_end_date,
                          show = False):
    """Forecast an area by carrying its last observed values forward.

    Parameters
    ----------
    area : area name passed to ``init_pop`` and written to the "Area" column.
    pred_begin_date, pred_end_date : window over which the last observation
        is repeated (see ``fill_data``).
    show : verbosity flag forwarded to ``init_pop``.

    Returns
    -------
    pd.DataFrame
        Columns "Date", "ConfirmedCases", "Fatalities", "Area", covering
        ``test_begin_date`` .. ``pred_end_date``.
    """
    _, observed, _, _ = init_pop(area = area,show = show)

    # Work on new objects rather than sorting / re-indexing in place, so the
    # object handed out by init_pop is never modified.
    # Assumes a two-level index whose first level is dropped here - TODO
    # confirm against init_pop.
    observed = observed.sort_index(level=[0,1]).reset_index(level=0, drop=True)

    filled = fill_data(observed, pred_begin_date, pred_end_date)
    window = filled.loc[test_begin_date:pred_end_date]

    # Name the index explicitly so the date column does not depend on
    # reset_index() falling back to the implicit column name "index".
    result = window.rename_axis("Date").reset_index()
    result = result.loc[:, ["Date","ConfirmedCases","Fatalities"]].copy()
    result["Area"] = area
    return result

def predict_special_areas(area, pred_begin_date = train_end_date+oneday, pred_end_date = test_end_date,
                          show = False):
    """Forecast one of the ``special_areas`` with ``naive_fill``.

    Parameters
    ----------
    area : must be a member of ``special_areas``.
    pred_begin_date, pred_end_date : forwarded to ``naive_fill``.
    show : verbosity flag, forwarded to ``naive_fill`` (it used to be
        accepted but silently replaced by ``False``).

    Returns
    -------
    The DataFrame produced by ``naive_fill``.

    Raises
    ------
    KeyError
        If ``area`` is not in ``special_areas``.
    """
    if area not in special_areas:
        raise KeyError(area)
    return naive_fill(area, pred_begin_date = pred_begin_date,
                      pred_end_date = pred_end_date, show = show)

In [None]:
# Empty accumulator for the forecasts of the special areas.
outputs_special = pd.DataFrame(columns = ["Date","Area","ConfirmedCases","Fatalities"])

In [None]:
# Forecast every special area and collect the results.
for area in special_areas_feature.index:
    print(area)
    out = predict_special_areas(area = area)
    outputs_special = pd.concat([outputs_special, out],axis = 0,sort=False)        
outputs_special

In [None]:
# Stack the model forecasts and the special-area forecasts.
outputs_all = pd.concat([outputs,outputs_special],axis = 0,sort=False)
outputs_all

In [None]:
# Attach the forecasts to the test rows; the left join keeps every test row,
# leaving NaN where no forecast matches.
# NOTE(review): requires test_df to already have an "Area" column - confirm
# it is added earlier in the notebook.
test_df = test_df.merge(outputs_all,on=["Date","Area"],how = "left")
test_df

In [None]:
# Keep only the submission columns; missing forecasts are replaced by 0.
sub_df2 = test_df.loc[:,["ForecastId","ConfirmedCases","Fatalities"]].copy()
sub_df2.fillna(0,inplace=True)

In [None]:
# Sanity check: the new submission has the same shape as the sample one.
sub_df.shape == sub_df2.shape

In [None]:
# Sanity check: count of remaining missing values per column.
sub_df2.isna().sum()

In [None]:
# Write the submission file (without the DataFrame index).
sub_df2.to_csv("submission.csv",index=False)

## Special Fit
- Just some experimental fits, used for testing.

In [None]:
def pick_best_param_exp(x,
                    area = "Turkey",
                    naive_R = False,stringency_df = stringency_df,
                    param_dict = param_dict,
                    title="No title provided", plot=False, show = False):
    """Objective function for the experimental fit.

    ``x`` is unpacked as (R0, sensitivity, t_inc, t_inf, t_hosp, t_crit).
    The four durations are written into ``param_dict`` (in place) for every
    age group, the model is evaluated with ``eval_model`` and its fourth
    return value, ``target_RMSLE``, is returned as the value to minimise.

    The remaining arguments are forwarded unchanged to ``eval_model``.
    """
    # Only R0, sensitivity and the four durations are tuned here.
    R0, sensitivity, t_inc, t_inf, t_hosp, t_crit = x

    candidate_durations = (
        ("t_inc", t_inc),
        ("t_inf", t_inf),
        ("t_hosp", t_hosp),
        ("t_crit", t_crit),
    )
    for age in ("young", "middle", "old"):
        for key, value in candidate_durations:
            param_dict[key][age] = value

    _MAE, _RMSLE, _target_MAE, objective_value = eval_model(
        area=area,
        naive_R=naive_R,
        stringency_df=stringency_df,
        default_R=R0,
        sensitivity=sensitivity,
        param_dict=param_dict,
        title=title,
        plot=plot,
        show=show,
    )
    # choose the optimization target as target_RMSLE
    return objective_value

def fit_model_exp(initial_guess = [3.6, 1, 3, 3, 4, 12],
              bounds = [
                  (2, 5),      # R0
                  (0.8, 1.2),  # sensitivity
                  (1, 7),      # t_inc
                  (1, 5),      # t_inf
                  (3, 10),     # t_hosp
                  (7, 14),     # t_crit
              ],
              area = "Turkey",
              naive_R = False,
              stringency_df = stringency_df,
              param_dict = param_dict,
              title = "No title provided",
              plot = False,
              show = False,
):
    """Experimental fit of R0, sensitivity and the four stage durations.

    The objective is ``pick_best_param_exp``, minimised with bounded
    L-BFGS-B starting from ``initial_guess``.

    Parameters
    ----------
    initial_guess : starting point, ordered
        (R0, sensitivity, t_inc, t_inf, t_hosp, t_crit).
    bounds : one (low, high) pair per entry of ``initial_guess``.
    area : name of the area to fit.
    naive_R, stringency_df, param_dict, title, plot, show :
        forwarded unchanged to ``pick_best_param_exp`` during the search.

    Returns
    -------
    tuple
        (R0, sensitivity, t_inc, t_inf, t_hosp, t_crit, MAE, RMSLE), where
        MAE and RMSLE come from a final ``eval_model`` run with the optimum.

    Notes
    -----
    * ``param_dict`` is modified in place with the fitted durations.
    * The final evaluation always uses ``plot=True``, ``show=False`` and the
      title "Fitted Model for <area>", regardless of the arguments given.
    """
    # R_0....  T_inc, T_inf, T_hosp, T_critical

    def objective(x):
        # Positional order matches pick_best_param_exp's remaining parameters.
        return pick_best_param_exp(x, area, naive_R, stringency_df,
                                   param_dict, title, plot, show)

    print("Start Fitting "+ area + "....")
    result = minimize(objective, x0=initial_guess, bounds=bounds,
                      method='L-BFGS-B')

    # Re-run the model once with the optimum found by the optimiser.
    R0, sensitivity, t_inc, t_inf, t_hosp, t_crit = result.x

    fitted_durations = (
        ("t_inc", t_inc),
        ("t_inf", t_inf),
        ("t_hosp", t_hosp),
        ("t_crit", t_crit),
    )
    for age in ("young", "middle", "old"):
        for key, value in fitted_durations:
            param_dict[key][age] = value

    MAE, RMSLE, _target_MAE, _target_RMSLE = eval_model(
        area=area,
        naive_R=naive_R,
        stringency_df=stringency_df,
        default_R=R0,
        sensitivity=sensitivity,
        param_dict=param_dict,
        title="Fitted Model for " + area,
        plot=True,
        show=False,
    )
    return R0, sensitivity, t_inc, t_inf, t_hosp, t_crit, MAE, RMSLE

In [None]:
# Experimental fit for a single area, using fit_model_exp's defaults.
area = "Finland"
R0,sensitivity, t_inc, t_inf, t_hosp, t_crit,MAE, RMSLE  = fit_model_exp(area = area)

In [None]:
# Experimental fit for a single area, using fit_model_exp's defaults.
area = "Thailand"
R0,sensitivity, t_inc, t_inf, t_hosp, t_crit,MAE, RMSLE  = fit_model_exp(area = area)
# show_param takes `b` as its third positional argument, but fit_model_exp
# does not fit or return it. Pass NaN explicitly so the call no longer
# fails with a TypeError for a missing argument.
show_param(area, R0, np.nan, sensitivity, t_inc, t_inf, t_hosp, t_crit, RMSLE)

In [None]:
# Experimental fit for a single area, with wall-clock timing.
area = "Nepal"
t0 = time()
R0,sensitivity, t_inc, t_inf, t_hosp, t_crit,MAE, RMSLE  = fit_model_exp(area = area)
t1 = time()
# Elapsed time in minutes.
print((t1-t0)/60," minutes")

In [None]:
# Experimental fit for a single area, with wall-clock timing.
area = "Turkey"
t0 = time()
R0,sensitivity, t_inc, t_inf, t_hosp, t_crit,MAE, RMSLE  = fit_model_exp(area = area)
t1 = time()
# Elapsed time in minutes.
print((t1-t0)/60," minutes")
# show_param takes `b` as its third positional argument, but fit_model_exp
# does not fit or return it. Pass NaN explicitly so the call no longer
# fails with a TypeError for a missing argument.
show_param(area, R0, np.nan, sensitivity, t_inc, t_inf, t_hosp, t_crit, RMSLE)

In [None]:
# Experimental fit for a single area, with wall-clock timing.
area = "Michigan"
t0 = time()
R0,sensitivity, t_inc, t_inf, t_hosp, t_crit,MAE, RMSLE  = fit_model_exp(area = area)
t1 = time()
# Elapsed time in minutes.
print((t1-t0)/60," minutes")
# show_param takes `b` as its third positional argument, but fit_model_exp
# does not fit or return it. Pass NaN explicitly so the call no longer
# fails with a TypeError for a missing argument.
show_param(area, R0, np.nan, sensitivity, t_inc, t_inf, t_hosp, t_crit, RMSLE)

In [None]:
# Experimental fit for a single area, with wall-clock timing.
area = "Panama"
t0 = time()
R0,sensitivity, t_inc, t_inf, t_hosp, t_crit,MAE, RMSLE  = fit_model_exp(area = area)
t1 = time()
# Elapsed time in minutes.
print((t1-t0)/60," minutes")
# show_param takes `b` as its third positional argument, but fit_model_exp
# does not fit or return it. Pass NaN explicitly so the call no longer
# fails with a TypeError for a missing argument.
show_param(area, R0, np.nan, sensitivity, t_inc, t_inf, t_hosp, t_crit, RMSLE)

In [None]:
# Manual evaluation for Panama with hand-picked parameters; this cell uses
# sensitivity = 1.2.
area = "Panama"
R0 = 5
sensitivity = 1.2
t_inc = 1
t_inf = 1
t_hosp = 3
t_crit = 7
# NOTE: this is an alias, not a copy - the assignments below also change
# the shared param_dict.
param_dict_picked = param_dict

# Apply the same durations to every age group.
for age in ["young","middle","old"]:
    param_dict_picked["t_inc"][age] = t_inc
    param_dict_picked["t_inf"][age] = t_inf
    param_dict_picked["t_hosp"][age] = t_hosp
    param_dict_picked["t_crit"][age] = t_crit

# `b` is not passed here, so eval_model's own default applies.
MAE, RMSLE, target_MAE, target_RMSLE = eval_model(area = area, 
           naive_R = False, stringency_df = stringency_df, default_R = R0, sensitivity = sensitivity,
           param_dict = param_dict_picked,
           title= "Fitted Model for " + area, plot = True, show = False)

In [None]:
# Manual evaluation for Panama with hand-picked parameters; this cell uses
# sensitivity = 0.8.
area = "Panama"
R0 = 5
sensitivity = 0.8
t_inc = 1
t_inf = 1
t_hosp = 3
t_crit = 7
# NOTE: this is an alias, not a copy - the assignments below also change
# the shared param_dict.
param_dict_picked = param_dict

# Apply the same durations to every age group.
for age in ["young","middle","old"]:
    param_dict_picked["t_inc"][age] = t_inc
    param_dict_picked["t_inf"][age] = t_inf
    param_dict_picked["t_hosp"][age] = t_hosp
    param_dict_picked["t_crit"][age] = t_crit

# `b` is not passed here, so eval_model's own default applies.
MAE, RMSLE, target_MAE, target_RMSLE = eval_model(area = area, 
           naive_R = False, stringency_df = stringency_df, default_R = R0, sensitivity = sensitivity,
           param_dict = param_dict_picked,
           title= "Fitted Model for " + area, plot = True, show = False)

In [None]:
# Inspect the stringency scores of the current area.
# NOTE(review): the original cell was `stringency_df.loc[]`, an empty
# subscript and therefore a SyntaxError. This completion reuses the indexing
# pattern of init_R_pred (country code on the first index level) - confirm
# it matches what was intended.
stringency_df.loc[idx[area_code2_dict[area], :], "stringency"]