## COVID-19 forecast with ARIMA, LSTM
This study assesses the performance of commonly used time-series forecasting techniques (ARIMA, LSTM) on each country's COVID-19 case data.

In [None]:
import pandas as pd
import numpy as np
import itertools
from statsmodels.tsa.arima_model import ARIMA
import warnings
warnings.simplefilter('ignore')

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM
print(tf.__version__)
tf.random.set_seed(1234)

import math
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
seed(1)
import os
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta

In [None]:
# Load the confirmed-cases time series. Later cells filter this frame by its
# 'Country/Region' column and slice its date-named columns ('1/22/20', ...).
df_confirmed = pd.read_csv("../input/novel-corona-virus-2019-dataset/time_series_covid_19_confirmed.csv")

In [None]:
def create_dataset(df, previous=1):
    """Build sliding-window samples for one-step-ahead supervised learning.

    Args:
        df: 2-D array indexed as ``df[row, 0]``; only column 0 is read.
        previous: Number of consecutive past values in each input window.

    Returns:
        Tuple ``(dataX, dataY)`` of NumPy arrays. ``dataX[i]`` is
        ``df[i:i + previous, 0]`` and ``dataY[i]`` is the value that
        immediately follows that window, ``df[i + previous, 0]``. Both are
        empty when ``df`` has ``previous`` rows or fewer.
    """
    # Every start index i with i + previous < len(df) gives a valid sample,
    # i.e. len(df) - previous windows. Subtracting one more would silently
    # discard the most recent (window, target) pair.
    n_samples = max(len(df) - previous, 0)
    dataX = [df[i:i + previous, 0] for i in range(n_samples)]
    dataY = [df[i + previous, 0] for i in range(n_samples)]
    return np.array(dataX), np.array(dataY)

def RNN_out_of_sample(model, last_real_seq, n_prediction, window):
    """Roll a one-step model forward to produce a multi-step forecast.

    Each prediction is appended to the input window and the oldest value is
    dropped, so later steps are predicted from earlier predictions.

    Args:
        model: Object with a ``predict(array)`` method that returns a single
            value for an input of shape ``(1, 1, len(last_real_seq))``.
        last_real_seq: Sequence of the most recent observed values, used as
            the initial input window.
        n_prediction: Number of future steps to generate.
        window: Unused; kept so existing callers keep working.

    Returns:
        1-D float NumPy array holding the ``n_prediction`` predicted values.
    """
    # Force a float buffer: with an integer seed the array would be integer
    # typed and fractional predictions would be truncated when written back.
    predictor = np.array([[last_real_seq]], dtype=float)
    prediction = np.empty(0, dtype=float)
    for _ in range(n_prediction):
        next_prediction = model.predict(predictor)
        prediction = np.append(prediction, next_prediction)
        # Slide the window: append the new value, drop the oldest one.
        predictor[0, 0] = np.append(predictor[0, 0], next_prediction)[1:]
    return prediction

#Mean absolute percentage error
def mape(y1, y_pred):
    """Return the mean absolute percentage error, in percent.

    Args:
        y1: Reference values; each error is divided by the matching entry
            here, so pass the actual observations in this position.
        y_pred: Values compared against ``y1``.

    Returns:
        ``mean(|y1 - y_pred| / |y1|) * 100``. Zero entries in ``y1`` make
        the result ``inf`` or ``nan``.
    """
    # Flatten both inputs so a column vector (n, 1) paired with a flat (n,)
    # array is compared element-wise instead of broadcasting to (n, n).
    y1 = np.asarray(y1, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y1 - y_pred) / y1)) * 100

In [None]:
def get_best_forecast(train_test, verify):
    """Pick the better of LSTM and ARIMA on a hold-out window and re-forecast.

    Both models are fitted on ``train_test`` and scored by MAPE against the
    first 10 ``daily`` values of ``verify``. The model with the lower error
    is then refitted on ``train_test`` plus ``verify`` to produce the final
    forecast.

    Args:
        train_test: DataFrame with a ``daily`` column, used for fitting.
        verify: DataFrame with a ``daily`` column holding at least 10 rows
            that follow ``train_test`` in time.

    Returns:
        Tuple ``(forecast, LSTM_forecast, arima_forecast, naive)``:
        the final forecast from the winning model, the two hold-out
        forecasts as returned by their helper functions, and a 10-value
        linear baseline that starts at the last observed value and keeps
        adding the last observed day-over-day difference.
    """
    horizon = 10
    # Explicit positional slice; converted to a flat float array for scoring.
    compare = np.asarray(verify["daily"].iloc[:horizon], dtype=float)

    # Naive baseline: extend the last observed step linearly.
    last_2 = list(train_test[-2:]["daily"])
    diff = last_2[-1] - last_2[-2]
    naive = float(last_2[-1]) + float(diff) * np.arange(horizon)

    # MAPE is taken relative to the actual values, so `compare` goes first.
    # Forecasts are flattened because the LSTM helper returns a column
    # vector, which would otherwise broadcast against `compare`.
    LSTM_model, LSTM_forecast = get_LSTM_forecast(train_test)
    LSTM_error = mape(compare, np.ravel(LSTM_forecast))

    arima_model, arima_forecast = get_arima_forecast(train_test)
    arima_error = mape(compare, np.ravel(arima_forecast))

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    train_test = pd.concat([train_test, verify])

    # Refit the winning model on all available data for the final forecast.
    if LSTM_error <= arima_error:
        _, forecast = get_LSTM_forecast(train_test)
    else:
        _, forecast = get_arima_forecast(train_test)

    return forecast, LSTM_forecast, arima_forecast, naive

In [None]:
def get_arima_forecast(train):
    """Grid-search ARIMA orders by AIC and forecast 10 steps ahead.

    Every ``(p, d, q)`` with each component in ``0..3`` is tried; orders
    whose fit raises are skipped and the fit with the lowest AIC is kept.

    Args:
        train: DataFrame with a ``daily`` column to fit on.

    Returns:
        Tuple ``(model, arima_forecast)``: the fitted result with the lowest
        AIC, and element ``[0]`` of its ``forecast(steps=10)`` output.

    Raises:
        RuntimeError: If no candidate order could be fitted.
    """
    series = train['daily']
    best_aic = float('inf')
    best_result = None

    # Determine the best parameters.
    for order in itertools.product(range(4), repeat=3):
        try:
            result = ARIMA(series, order=order).fit()
            aic = result.aic
        except Exception:
            # Some orders are invalid or fail to converge for this series.
            continue
        # "<=" keeps the later order on ties.
        if aic <= best_aic:
            best_aic = aic
            best_result = result

    if best_result is None:
        raise RuntimeError("no ARIMA order in the search grid could be fitted")

    # Reuse the best fit rather than fitting the same order a second time.
    # NOTE(review): the [0] index presumes forecast() returns a tuple whose
    # first element is the array of point forecasts, as with the
    # statsmodels.tsa.arima_model.ARIMA class this file imports — confirm
    # if that import ever changes.
    arima_forecast = best_result.forecast(steps=10)[0]

    return best_result, arima_forecast

In [None]:
def get_LSTM_forecast(train):
    """Train a small LSTM on the ``daily`` series and forecast 10 steps.

    The first 80% of the series is scaled to [0, 1] and used to train a
    one-step-ahead model on windows of 20 values. The forecast is then
    rolled forward from the last 20 observed values and converted back to
    the original scale.

    Args:
        train: DataFrame with a ``daily`` column.

    Returns:
        Tuple ``(model, lstm_result)``: the trained Keras model and a
        ``(10, 1)`` array of forecast values in the original units.

    Raises:
        ValueError: If the series is too short to build a training window.
    """
    series = np.asarray(train['daily'], dtype=float).reshape(-1, 1)

    train_size = int(len(series) * 0.8)

    # Fit the scaler on the training part only and reuse it for the whole
    # series. Re-fitting on the later part would put the two segments on
    # different scales and make inverse_transform inconsistent with what
    # the model was trained on.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(series[:train_size])
    scaled = scaler.transform(series)

    # Lookback period
    lookback = 20
    X_train, Y_train = create_dataset(scaled[:train_size], lookback)
    if len(X_train) == 0:
        raise ValueError("series too short for a lookback window of %d" % lookback)

    # Keras LSTM input layout used here: (samples, 1 time step, lookback).
    X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))

    # Generate LSTM network
    model = tf.keras.Sequential()
    model.add(LSTM(4, input_shape=(1, lookback)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X_train, Y_train, validation_split=0.2, epochs=10, batch_size=1, verbose=2)

    # Seed the roll-forward with the most recent observations so the
    # forecast begins right after the end of the data.
    last_window = scaled[-lookback:, 0].tolist()
    predict = RNN_out_of_sample(model, last_window, 10, lookback)
    lstm_result = scaler.inverse_transform(predict.reshape(-1, 1))

    return model, lstm_result

In [None]:
countries = ["Canada", "Malaysia", "India", "US"]

# For each country: build the daily-case series, hold out the data from
# 4/29/21 onward, run all forecasters, then print and plot the comparison.
for country in countries:

    # A country can match several rows of the file, so add them up to get
    # the national cumulative series instead of keeping only the first row.
    # NOTE(review): presumably one row per province/state (e.g. Canada) —
    # confirm against the dataset.
    country_rows = df_confirmed.loc[df_confirmed['Country/Region'] == country,
                                    '1/22/20':'5/29/21']
    dates = [datetime.strptime(x, '%m/%d/%y') for x in country_rows.columns]
    time_series = country_rows.sum(axis=0).tolist()
    df_data = pd.DataFrame({'ds': dates, 'y': time_series})

    # Daily new cases = first difference of the cumulative count; the first
    # day keeps its cumulative value.
    daily_count = [time_series[0]] + [y - x for x, y in zip(time_series, time_series[1:])]
    df_data['daily'] = daily_count

    split_date = pd.to_datetime("4/29/21")
    data_train_test = df_data.loc[df_data.ds < split_date]
    data_verify = df_data.loc[df_data.ds >= split_date]

    forecast, LSTM_forecast, arima_forecast, naive = get_best_forecast(data_train_test, data_verify)

    actual = data_verify["daily"].iloc[:10]
    df_comparison_plot = pd.DataFrame({"Date" : data_verify["ds"].iloc[:10],
                                       "actual" : actual,
                                       "LSTM_forecast" : LSTM_forecast.flatten(),
                                       "arima_forecast" : arima_forecast,
                                       "naive" : naive})

    # mape divides by its first argument, so the actual values go first.
    print("LSTM error: ", mape(actual, LSTM_forecast.flatten()))
    print("ARIMA error: ", mape(actual, arima_forecast))
    print("Naive error: ", mape(actual, naive))

    df_comparison_plot.plot(x = "Date")