In this notebook I am interested in using Holt's algorithm to model the trend of the virus in each location. The problem with Holt's algorithm is that it only accounts for trend and then predicts accordingly. I eventually want to add information about lock-downs and testing to my model to make more accurate predictions.

# Set up Python Environment
Load the necessary libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.api import ExponentialSmoothing,SimpleExpSmoothing, Holt
from matplotlib.dates import (
        MonthLocator,
        num2date,
        AutoDateLocator,
        AutoDateFormatter,
)
import gc # garbage collector

# stats models
import statsmodels.api as sm
from fbprophet import Prophet

# time libraries
import datetime

# warning libraries for debugging
import warnings

# deal with date in x-axis of plots
from pandas.plotting import register_matplotlib_converters

In [None]:
from pykalman import KalmanFilter

Code to create a progress bar for tracking long-running loops.

In [None]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    Args:
        progress: fraction completed. Ints are converted to float; any
            value that is neither int nor float is treated as 0. The
            value is clamped to the range [0, 1] before drawing.
    """
    bar_length = 20

    # Normalise the input type first, then clamp it to [0, 1].
    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0

    if progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    filled = int(round(bar_length * progress))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True postpones clearing until the new text is ready to be shown
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, progress * 100))

# EDA
Let's take a quick look at our training and testing data.

In [None]:
# Read the competition's training and testing tables.
train_df = pd.read_csv('../input/covid19-global-forecasting-week-4/train.csv')
test_df = pd.read_csv('../input/covid19-global-forecasting-week-4/test.csv')

# Convert the "Date" column of both tables from text to datetimes.
for frame in (train_df, test_df):
    frame["Date"] = pd.to_datetime(frame["Date"],
                                   infer_datetime_format=True)

## Training Data
Let's look at the training set.

In [None]:
# look at the top of the training data frame
# (head() returns its first rows)
train_df.head()

In [None]:
# Names of the two columns that are modelled and forecast.
target_values = [
    "ConfirmedCases",
    "Fatalities",
]

In [None]:
# breadth of the target values: summary statistics of the training columns
train_df.describe()

In [None]:
# Put the total row count next to the number of missing values per column
# so the two can be compared.
total_rows = len(train_df)
nulls_per_column = train_df.isnull().sum()

print("number of rows in training set:")
print(total_rows)
print("null values in each column:")
print(nulls_per_column)

### Countries and Provinces

In [None]:
# Count the distinct values of Country_Region in the training data.
n_countries = train_df['Country_Region'].nunique()
print("number of unique countries in training data: %i" % n_countries)

In [None]:
print("Of the countries with provinces, how many provinces do they have?")
# Keep only rows that name a province, then count distinct provinces
# per country.
rows_with_province = train_df.loc[train_df['Province_State'].notnull(), :]
provinces_per_country = (
    rows_with_province
    .groupby('Country_Region')['Province_State']
    .nunique()
)
print(provinces_per_country)

In [None]:
print("Of the countries with provinces, do they have any rows with no province listed?")
print("How many rows do they have?")
# Country of every row that names a province (one entry per row, so
# countries repeat in this list).
province_is_set = train_df['Province_State'].notnull()
countriesWithProvinces = list(train_df.loc[province_is_set, "Country_Region"])

# Rows of those same countries where the province is missing, counted
# per country.
in_listed_country = train_df["Country_Region"].isin(countriesWithProvinces)
province_is_missing = train_df["Province_State"].isnull()
train_df.loc[in_listed_country & province_is_missing,
             "Country_Region"].value_counts()

So basically we can count the "N/A" province as a province in these 4 countries since they have a data point for each day. I want to create a column that contains both the province and country in one string.

In [None]:
def location(country, province):
    """Build a single location label from a country and a province.

    Args:
        country: country name.
        province: province name, or NaN when no province is given.

    Returns:
        "<province>, <country>" when a province is present, otherwise
        just ``country``.
    """
    # NaN never compares equal to itself, so a failed self-comparison
    # identifies a missing province.
    if not (province == province):
        return country
    return "%s, %s" % (province, country)
    
# Add the combined "location" label to both the training and testing
# frames, one row at a time.
for frame in (train_df, test_df):
    frame['location'] = frame.apply(
        lambda row: location(row["Country_Region"], row["Province_State"]),
        axis=1,
    )

### Dates

In [None]:
# First and last date present in the training data.
first_train_date = train_df['Date'].min()
last_train_date = train_df['Date'].max()

print("Start Date:")
print(first_train_date)
print("End Date:")
print(last_train_date)

In [None]:
# Number of distinct dates recorded for each country/province.
#
# The count is grouped on the combined "location" column instead of on
# ['Country_Region', 'Province_State']: groupby leaves out rows whose
# group key is NaN, so grouping on Province_State skipped every country
# that has no province and the check only covered a few countries.
dates_per_location = train_df.groupby('location')['Date'].nunique()

# A single value in the printed array means every location has the same
# number of dates.
print(dates_per_location.unique())

Each country has a count for each of the 65 days.

### Plot of the training data

In [None]:
# One figure per target; every location contributes one line to it.
uniq_location = list(train_df["location"].unique())
for target in target_values:
    plt.figure(figsize=(15, 8))
    plt.title(target)
    for l_id in uniq_location:
        is_this_location = train_df["location"] == l_id
        location_rows = train_df.loc[is_this_location, :].copy()
        plt.plot(location_rows["Date"], location_rows[target], label=l_id)
    # the legend call is left disabled
    #plt.legend(loc = 'best')

## Testing Data

In [None]:
# look at the top of the testing data frame
test_df.head()

In [None]:
# First and last date present in the testing data.
first_test_date = test_df['Date'].min()
last_test_date = test_df['Date'].max()

print("Start Date:")
print(first_test_date)
print("End Date:")
print(last_test_date)

# Evaluation Metric
The evaluation metric for this competition is the root mean squared logarithmic error. Below I created a method that can calculate this value.

In [None]:
def rmsle(pred_series,true_series):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(pred + 1) - log(true + 1)) ** 2))``.

    Args:
        pred_series: predicted values (e.g. a pandas Series or NumPy
            array).
        true_series: observed values, with the same length as
            ``pred_series``.

    Returns:
        The RMSLE over all elements.
    """
    squared_log_error = (np.log1p(pred_series) - np.log1p(true_series)) ** 2
    # Take the mean, not the sum: summing makes the score grow with the
    # number of rows, so it is no longer the root *mean* squared log error.
    return np.sqrt(np.mean(squared_log_error))

# Splitting the Training Dataset
We need to split the original training data into a training set and a validation set. I decided to use the rows dated before March 19th for training and the rows dated March 19th or later for validation, since that is the earliest date in the testing set.

In [None]:
# Rows dated before `start_validation` form the training split; rows on
# or after it form the validation split.
start_validation = '2020-03-19'
before_cutoff = train_df["Date"] < start_validation
from_cutoff = train_df["Date"] >= start_validation
train = train_df.loc[before_cutoff, :]
valid = train_df.loc[from_cutoff, :]

The Holt algorithm only works if the data frames have the date as the index, so let's reformat our data frames.

In [None]:
# For the training data, reformat the data frame so that the timestamp
# becomes the index. The message below announces that this step has begun.
print("reformat training data frame...")
def trainDF2timeDF(training_df):
    """Return a copy of ``training_df`` indexed by its "Date" column.

    Args:
        training_df: data frame that contains a "Date" column.

    Returns:
        A new data frame whose index holds the "Date" values converted
        to datetimes. The "Date" column is moved into the index and the
        input frame is left unmodified.
    """
    # Copy the frame that was passed in. The previous version copied the
    # global ``train`` here, so every call returned the training split
    # regardless of the argument.
    timeValue_df = training_df.copy()
    timeValue_df = timeValue_df.set_index("Date")
    # NOTE: this silences warnings for the rest of the session, not only
    # for this call; kept as-is because later cells may rely on it.
    warnings.simplefilter("ignore")
    timeValue_df.index = pd.to_datetime(timeValue_df.index.values)
    return timeValue_df

# Time-indexed frame built from the pre-validation split (`train`).
timeIndexed_train = trainDF2timeDF(train)
# Time-indexed frame built from the complete training data (`train_df`).
# NOTE(review): this only differs from timeIndexed_train if
# trainDF2timeDF actually uses the frame it is given - confirm.
timeIndexed_train_df = trainDF2timeDF(train_df)

To separate the true targets in the validation set, we create a new data frame in which the target columns are renamed.

In [None]:
# Copy the validation split and prefix the observed targets with "true_",
# so the forecast columns added later can sit next to them.
true_column_names = {
    "timestamp": "now",
    "ConfirmedCases": "true_ConfirmedCases",
    "Fatalities": "true_Fatalities",
}
valid_holt = valid.copy().rename(columns=true_column_names)

# Holt model
This model accounts for trend but not seasonality. Below we test this model without any filter. We also try out different parameters.

We can test the damped parameter:

In [None]:
# Damping settings to compare. Each key is the prefix used for the result
# columns; each value is a one-element list holding the `damped` flag.
holt_params = {
    "damped_False": [False],
    "damped_True": [True],
}

In [None]:
# Fit one Holt model per location, target and damping setting on the
# training split, and store each forecast in valid_holt.
uniq_location = list(valid["location"].unique())
nlocations = len(uniq_location)
print("number of locations: " + str(nlocations))
for n_done, l_id in enumerate(uniq_location):
    update_progress(n_done / nlocations)
    # Row selectors for this location. Only new columns are added inside
    # the loop, so the selectors stay valid throughout.
    in_train = timeIndexed_train["location"] == l_id
    in_valid = valid_holt["location"] == l_id
    # forecast one step per validation row of this location
    numValid = int(in_valid.sum())
    for target in target_values:
        sub_timeTrain_df = timeIndexed_train.loc[in_train, target].copy()
        for param, settings in holt_params.items():
            fit_holt = Holt(sub_timeTrain_df,
                            damped=settings[0]).fit(optimized=True)
            # forecast column, e.g. "damped_False_ConfirmedCases"
            target_col = "%s_%s" % (param, target)
            valid_holt.loc[in_valid, target_col] = \
                fit_holt.forecast(numValid).values
            # the fitted smoothing level is stored per damping setting
            alpha_col = ("%s_alpha") % param
            valid_holt.loc[in_valid, alpha_col] = \
                fit_holt.model.params['smoothing_level']
update_progress(1)

## Visualizing Validation Set

In [None]:
# Ignore this code.
# Quick single-location version of the plots in the next cell. The guard
# below is always false, so nothing is drawn unless it is edited by hand.
l_id = "France"
if False:
    for target in target_values:
        history = train.loc[train["location"] == l_id, :].copy()
        truth = valid.loc[valid["location"] == l_id, :].copy()
        forecasts = valid_holt.loc[valid_holt["location"] == l_id, :].copy()

        # (frame, column, legend label) for each line on the figure
        curves = [
            (history, target, 'Train'),
            (truth, target, 'Validation'),
            (forecasts, "damped_False_" + target, 'Holt Model (damped=False)'),
            (forecasts, "damped_True_" + target, 'Holt Model (damped=True)'),
        ]

        plt.figure(figsize=(15, 8))
        plt.title(l_id + " " + target)
        for frame, column, curve_label in curves:
            plt.plot(frame["Date"], frame[column], label=curve_label)
        plt.legend(loc='best')

In [None]:
# For every target and location, draw the training history, the
# validation truth and both Holt forecasts on one figure.
holt_curves = (
    ("damped_False_", 'Holt Model (damped=False)'),
    ("damped_True_", 'Holt Model (damped=True)'),
)
for target in target_values:
    for l_id in uniq_location:
        history = train.loc[train["location"] == l_id, :].copy()
        truth = valid.loc[valid["location"] == l_id, :].copy()
        forecasts = valid_holt.loc[valid_holt["location"] == l_id, :].copy()

        plt.figure(figsize=(15, 8))
        plt.title(l_id + " " + target)
        plt.plot(history["Date"], history[target], label='Train')
        plt.plot(truth["Date"], truth[target], label='Validation')
        for column_prefix, curve_label in holt_curves:
            plt.plot(forecasts["Date"],
                     forecasts[column_prefix + target],
                     label=curve_label)
        plt.legend(loc='best')

It may seem obvious, but the model works best once there is a linear trend to begin with. In other words, Holt's model cannot predict when the virus is going to hit a particular location, but it can predict the exponential trend once it has started.

## Calculate Error Rate for Holt's Model

In [None]:
# Print the validation RMSLE of both damping settings for every target.
# Each pair is (headline template, prefix of the forecast column).
reports = (
    ("Holt (damped=False) RMSLE value for %s:", "damped_False_"),
    ("Holt (damped=True) RMSLE value for %s:", "damped_True_"),
)
for target in target_values:
    observed = valid_holt["true_" + target]
    for headline, column_prefix in reports:
        print(headline % target)
        print(rmsle(valid_holt[column_prefix + target], observed))

## Submission for Holt's Model

In [None]:
# Fit an undamped Holt model per location and target on the time-indexed
# training history, then fill the target columns of test_df.
#
# Predictions are matched to test rows by date. The previous version
# assigned forecast(n) to the n test rows by position, which is only
# correct when the first test date is the day right after the last date
# of the history; otherwise every prediction lands on the wrong date.
# Test dates that fall inside the history get the observed value, and
# later dates get the forecast for the matching number of days ahead.
# NOTE(review): assumes one row per calendar day and location in the
# history (forecast step k == k days after the last date) - confirm.
nlocations = len(uniq_location)
print("number of locations: " + str(nlocations))
for n_done, l_id in enumerate(uniq_location):
    update_progress(n_done / nlocations)
    in_test = test_df["location"] == l_id
    test_dates = pd.DatetimeIndex(test_df.loc[in_test, "Date"])
    if len(test_dates) == 0:
        # nothing to predict for this location
        continue
    # fit the model to the target_values of this location
    for target in target_values:
        sub_timeTrain_df = timeIndexed_train_df.loc[(
            timeIndexed_train_df["location"] == l_id), target].copy()
        last_observed = sub_timeTrain_df.index.max()
        # days between the end of the history and each test date;
        # values <= 0 are dates that the history already covers
        days_ahead = np.asarray((test_dates - last_observed).days)
        horizon = max(int(days_ahead.max()), 1)
        fit_holt = Holt(
            sub_timeTrain_df,
            damped=False).fit(optimized=True)
        forecast = np.asarray(fit_holt.forecast(horizon), dtype=float)
        # observed values on the test dates (NaN where not in the history)
        observed = sub_timeTrain_df.reindex(test_dates).to_numpy(dtype=float)
        # forecast step k is stored at position k - 1; clip keeps the
        # index valid for the rows that take the observed value instead
        predicted = forecast[np.clip(days_ahead, 1, horizon) - 1]
        test_df.loc[in_test, target] = np.where(
            days_ahead >= 1, predicted, observed)
update_progress(1)

In [None]:
# Keep the forecast id and the two predicted columns, then write them to
# a CSV file without the index.
submission_columns = ["ForecastId", "ConfirmedCases", "Fatalities"]
submission = test_df[submission_columns]
submission.to_csv("submission_holt_dampedFalse.csv", sep=",", index=False)