# Time Series Forecasting

In [1]:
from datetime import timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import ipywidgets as widgets
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot
from tqdm.notebook import tqdm

from statsmodels.tsa.arima.model import ARIMA

# Default size (width, height in inches) applied to every matplotlib figure below.
plt.rcParams["figure.figsize"] = (20, 10)

## Load all data

Here we load every CSV file in the working directory into a dictionary that maps each filename to its corresponding `pandas.DataFrame`.

In [2]:
# Map each CSV filename in the working directory to its parsed DataFrame.
# The glob is materialised into a sorted list so that tqdm knows the total
# (a bare generator renders as "0/?") and the dict order is deterministic.
csv_paths = sorted(Path('.').glob('*.csv'))
datasets = {csv_path.name: pd.read_csv(csv_path) for csv_path in tqdm(csv_paths)}

|          | 0/? [00:00<?, ?it/s]

## Clean the dataframes

In [3]:
# Pick the global confirmed-cases table out of the loaded datasets and preview it.
confirmed_df = datasets['time_series_covid19_confirmed_global.csv']
confirmed_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/27/20,12/28/20,12/29/20,12/30/20,12/31/20,1/1/21,1/2/21,1/3/21,1/4/21,1/5/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,51039,51280,51350,51405,51526,51526,51526,51526,53011,53105
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,56254,56572,57146,57727,58316,58316,58991,59438,59623,60283
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,98249,98631,98988,99311,99610,99897,100159,100408,100645,100873
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,7821,7875,7919,7983,8049,8117,8166,8192,8249,8308
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,17240,17296,17371,17433,17553,17568,17608,17642,17684,17756


### Drop unused columns and aggregate by country

In [4]:
# Rebuild from the raw table instead of from `confirmed_df` itself, so that
# re-running this cell does not raise a KeyError on already-dropped columns.
confirmed_df = (
    datasets['time_series_covid19_confirmed_global.csv']
    .drop(columns=['Province/State', 'Lat', 'Long'])
)
# Rows sharing a Country/Region are summed into a single row per country.
confirmed_agg = confirmed_df.groupby('Country/Region').sum()
confirmed_agg.head()

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,12/27/20,12/28/20,12/29/20,12/30/20,12/31/20,1/1/21,1/2/21,1/3/21,1/4/21,1/5/21
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,51039,51280,51350,51405,51526,51526,51526,51526,53011,53105
Albania,0,0,0,0,0,0,0,0,0,0,...,56254,56572,57146,57727,58316,58316,58991,59438,59623,60283
Algeria,0,0,0,0,0,0,0,0,0,0,...,98249,98631,98988,99311,99610,99897,100159,100408,100645,100873
Andorra,0,0,0,0,0,0,0,0,0,0,...,7821,7875,7919,7983,8049,8117,8166,8192,8249,8308
Angola,0,0,0,0,0,0,0,0,0,0,...,17240,17296,17371,17433,17553,17568,17608,17642,17684,17756


## Forecasting

In [5]:
def get_data(country: str = None) -> pd.Series:
    """Return the confirmed-case time series for one country or for all countries.

    Parameters
    ----------
    country : str, optional
        Row label of ``confirmed_agg``. When omitted (or otherwise falsy) the
        rows of all countries are summed instead.

    Returns
    -------
    pandas.Series
        One value per date column of ``confirmed_agg``, indexed by a daily
        ``PeriodIndex``.

    Raises
    ------
    KeyError
        If ``country`` is not a row label of ``confirmed_agg``.
    """
    if country:
        # Copy so that re-indexing below can never touch the aggregated frame.
        series = confirmed_agg.loc[country, :].copy()
    else:
        series = confirmed_agg.sum()
    # Column labels look like '1/22/20' (month/day/two-digit year); stating the
    # format explicitly avoids relying on pandas' format inference.
    series.index = pd.to_datetime(series.index, format='%m/%d/%y').to_period('D')
    return series

In [6]:
def train_test_split(data: pd.Series, train_ratio: float = 0.66):
    """Split a series chronologically into a leading train part and a trailing test part.

    Parameters
    ----------
    data : pandas.Series
        Series to split; the order of its elements is preserved.
    train_ratio : float
        Fraction of the elements assigned to the train part; must lie strictly
        between 0 and 1. The split position is ``int(len(data) * train_ratio)``.

    Returns
    -------
    tuple of pandas.Series
        ``(train, test)``, named ``'train'`` and ``'test'``. ``data`` itself
        keeps its own name.

    Raises
    ------
    AssertionError
        If ``train_ratio`` is outside the open interval (0, 1).
    """
    assert 0. < train_ratio < 1., 'train_ratio must be in (0.0, 1.0)'
    bound = int(len(data) * train_ratio)
    # .iloc makes the split explicitly positional, so it behaves the same for
    # any index type (including numeric labels).
    train = data.iloc[:bound].rename('train')
    test = data.iloc[bound:].rename('test')
    return train, test

Here we define two metrics to evaluate our model predictions against actual values, namely:
  * Mean absolute error (MAE)
  * Mean absolute percentage error (MAPE)

In [7]:
def MAE(y, y_hat):
    """Mean absolute error between actual values ``y`` and predictions ``y_hat``."""
    absolute_errors = np.abs(y - y_hat)
    return np.mean(absolute_errors)

def MAPE(y, y_hat, eps = 1e-10):
    """Mean absolute percentage error of ``y_hat`` relative to ``y``.

    The result is a fraction (0.1 means 10 %), not multiplied by 100.

    Parameters
    ----------
    y : array-like
        Actual values.
    y_hat : array-like
        Predicted values, aligned with ``y``.
    eps : float
        Small constant added to the denominator to avoid division by zero.
    """
    absolute_errors = np.abs(y - y_hat)
    # Guard the magnitude of y: adding eps to y itself would move negative
    # values towards zero and hit exactly zero at y == -eps.
    return np.mean(absolute_errors / (np.abs(y) + eps))

In [8]:
@widgets.interact(country=['World'] + confirmed_agg.index.values.tolist(),
                  train_ratio=widgets.FloatSlider(min=0.1, max=0.9, value=0.8, step=0.1, continuous_update=False),
                  p=widgets.BoundedIntText(min=0, max=confirmed_agg.shape[1], value=1, continuous_update=False),
                  d=widgets.BoundedIntText(min=0, max=2, value=1, continuous_update=False),
                  q=widgets.BoundedIntText(min=0, max=confirmed_agg.shape[1], value=1, continuous_update=False),
                  forecast_days=widgets.BoundedIntText(min=0, max=365, value=30, continuous_update=False),
                  model_details=False)
def forecast(country, train_ratio, p, d, q, forecast_days, model_details):
    """Fit an ARIMA(p, d, q) model on the train split and plot its forecast.

    Parameters
    ----------
    country : str
        'World' for the sum over all countries, otherwise a row label of
        ``confirmed_agg``.
    train_ratio : float
        Fraction of the series used for fitting; the remainder is the test part.
    p, d, q : int
        ARIMA order passed as ``order=(p, d, q)``.
    forecast_days : int
        Number of days to predict beyond the last test date.
    model_details : bool
        When true, also show the autocorrelation plot of the full series and
        print the fitted model summary.

    Prints the MAE and MAPE of the prediction over the test period and plots
    the train, test and predicted series. Returns nothing.
    """
    data = get_data(None if country == 'World' else country)
    train, test = train_test_split(data, train_ratio)

    model = ARIMA(train, order=(p, d, q)).fit()
    if model_details:
        autocorrelation_plot(data)
        plt.show()
        print(model.summary())

    # Predict from the first test date through `forecast_days` past the last one.
    prediction_start = test.index[0]
    prediction_end = test.index[-1] + timedelta(days=forecast_days)
    predicted = model.predict(start=prediction_start, end=prediction_end)

    # Only the leading len(test) predictions have actual values to compare against.
    predicted_on_test = predicted.iloc[:len(test)]
    print(f'test MAE: {MAE(test, predicted_on_test):.3f}')
    print(f'test MAPE: {MAPE(test, predicted_on_test):.3f}')

    # Build the frame from the actual train split (not the full series), so
    # the 'train' line in the plot stops where the test period begins.
    result = pd.concat([train, test, predicted], axis=1, keys=['train', 'test', 'predicted'])
    ax = result.plot(title=f'Confirmed cases - {country} (ARIMA({p},{d},{q}))')
    ax.set_xlabel('date')
    ax.set_ylabel('confirmed cases')
    plt.show()

interactive(children=(Dropdown(description='country', options=('World', 'Afghanistan', 'Albania', 'Algeria', '…

## Conclusion

