# 1. Importing Libraries and Dataset

In [None]:
import pandas as pd
from pandas import Grouper
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from math import sqrt
from math import log
from math import exp

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from scipy.stats import boxcox

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the Brazil COVID-19 macro CSV into a DataFrame.
# NOTE(review): the path is hard-coded to the Kaggle input directory.
dados = pd.read_csv("/kaggle/input/corona-virus-brazil/brazil_covid19_macro.csv")
# Show the first rows as a quick sanity check of what was loaded.
dados.head()

# 2. Problem Description

### The problem is to predict the number of deaths by covid-19 in Brazil.

In [None]:
# Single-column frame holding the `deaths` values, indexed by the `date` column.
covid = dados.set_index("date")[["deaths"]]
covid

# 3. Test Harness

In [None]:
# Hold back the last 56 observations as a validation set; every earlier
# row forms the dataset used for model development.
n_holdout = 56
split_point = len(covid) - n_holdout
dataset = covid.iloc[:split_point]
validation = covid.iloc[split_point:]

print('Dataset %d, Validation %d' % (len(dataset), len(validation)))

In [None]:
# prepare data: the development set as an int64 array, with the first half
# used for training and the second half for walk-forward testing
X = dataset.values.astype('int64')

train_size = int(len(X) * 0.5)
train = X[:train_size]
test = X[train_size:]

# 4. Persistence

Before getting bogged down in data analysis and modeling, the first step is to establish a baseline of performance. This will provide both a template for evaluating models using the proposed
test harness and a performance measure by which all more elaborate predictive models can be compared. The baseline prediction for time series forecasting is called the naive forecast, or persistence. This is where the observation from the previous time step is used as the prediction for the observation at the next time step.

In [None]:
# walk-forward validation of the persistence (naive) forecast
history = list(train)
predictions = []
for obs in test:
    # the forecast is simply the most recent value seen so far
    yhat = history[-1]
    predictions.append(yhat)

    # reveal the true observation so the next step can use it
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))

# report performance
rmse = sqrt(mean_squared_error(test, predictions))
print('RMSE: %.3f' % rmse)

In [None]:
# Restate the baseline score; `rmse` was computed by the persistence
# walk-forward loop in the previous cell.
print(f"The persistence model achieved an RMSE of {rmse}")

# 5. Data Analysis

## 5.1 Summary Statistics

In [None]:
# Summary statistics of the development portion of the series
# (the held-back validation rows are not included).
dataset.describe()

## 5.2 Line Plot

In [None]:
# line plot of the time series
dataset.plot()
# Rotate the tick labels BEFORE calling tight_layout(): the layout is computed
# from the labels as they are at call time, so rotating afterwards can leave
# the (now taller) labels clipped.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

* There is an increasing trend of deaths over time.

* The trend means the dataset is almost certainly non-stationary and the apparent change in fluctuation may also contribute.

## 5.3 Density Plot

In [None]:
# Histogram (top) and kernel density estimate (bottom) of the series.
# The target axes are passed explicitly: without `ax`, DataFrame.hist() and
# DataFrame.plot() each open a figure of their own, so a bare plt.figure()
# would be displayed as an extra, empty figure.
fig, (ax_hist, ax_kde) = plt.subplots(2, 1)
dataset.hist(edgecolor='k', ax=ax_hist)
dataset.plot(kind='kde', ax=ax_kde)
plt.tight_layout()
plt.show()

* The distribution is not Gaussian, but is pretty close.

* The distribution may be exponential or a double Gaussian.

# 6. Arima Models

## 6.1 Manually Configured ARIMA

In [None]:
# create a differenced time series
def difference(dataset):
    """Return the lag-1 differences of ``dataset``.

    Args:
        dataset: Sized, integer-indexable sequence whose items support
            subtraction.

    Returns:
        A ``pandas.Series`` containing ``dataset[i] - dataset[i-1]`` for
        ``i`` from 1 to ``len(dataset) - 1``. It is one element shorter than
        the input, and empty when the input has fewer than two items.
    """
    deltas = [dataset[pos] - dataset[pos - 1] for pos in range(1, len(dataset))]
    return pd.Series(deltas)

# difference data
# X comes from a single-column frame and therefore has shape (n, 1). Flatten
# it so each difference is a plain scalar: differencing the 2-D array directly
# produces a Series of one-element arrays (object dtype), which only works
# through NumPy's deprecated size-1-array-to-scalar conversion.
stationary = difference(X.flatten())
# the first observation has no predecessor, so the index starts at row 1
stationary.index = dataset.index[1:]

# check if stationary with the Augmented Dickey-Fuller test
result = adfuller(stationary)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

The results show that the test statistic of 0.747134 is larger than the 5% critical value of -2.869, so we cannot reject the null hypothesis. Failing to reject the null hypothesis means that the differenced time series is still non-stationary, i.e. it has time-dependent structure.

In [None]:
# ACF (top) and PACF (bottom) plots of the time series, up to 50 lags
fig, (acf_ax, pacf_ax) = plt.subplots(2, 1)
plot_acf(dataset, lags=50, ax=acf_ax)
plot_pacf(dataset, lags=50, ax=pacf_ax)
plt.tight_layout()
plt.show()

* The ACF shows a significant lag for 30-32 days.

* The PACF shows a significant lag for perhaps 2 days.

* A good starting point for the p and q values is 31 and 2.

Some experimentation shows that the model does not appear to be stable, with non-zero AR and MA orders defined at the same time. The model can be simplified to ARIMA(0,2,2).

In [None]:
# evaluate manually configured ARIMA model

X = dataset.values.astype('float32')
train_size = int(len(X) * 0.50)
train, test = X[:train_size], X[train_size:]
# walk-forward validation: refit on everything seen so far, forecast one
# step ahead, then reveal the true observation
history = list(train)
predictions = []
for obs in test:
    model = ARIMA(history, order=(0,2,2))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    # observation
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))
# report performance
rmse = sqrt(mean_squared_error(test, predictions))
print('RMSE: %.3f' % rmse)

## 6.2 Grid Search ARIMA 

We will search all combinations of the following parameters:

* p: 0 to 4.
* d: 0 to 1.
* q: 0 to 4.

In [None]:
# evaluate an ARIMA model for a given order (p,d,q) and return RMSE
def evaluate_arima_model(X, arima_order):
    """Score one ARIMA order with one-step walk-forward validation.

    The first half of ``X`` seeds the history. For every remaining
    observation a new model is fitted on the history, its one-step forecast
    is recorded, and the true observation is appended to the history.

    Args:
        X: Array of observations; must support ``astype`` and slicing.
        arima_order: ``(p, d, q)`` tuple passed to ``ARIMA`` as ``order``.

    Returns:
        Root mean squared error between the second half of ``X`` and the
        recorded forecasts.
    """
    series = X.astype('float32')
    cut = int(len(series) * 0.50)
    holdout = series[cut:]
    history = list(series[:cut])
    forecasts = []
    for actual in holdout:
        fitted = ARIMA(history, order=arima_order).fit(disp=0)
        forecasts.append(fitted.forecast()[0])
        history.append(actual)
    # out-of-sample error over the whole hold-out half
    return sqrt(mean_squared_error(holdout, forecasts))

In [None]:
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    """Grid-search ARIMA orders and print the RMSE of each one that fits.

    Every ``(p, d, q)`` combination is scored with ``evaluate_arima_model``.
    Orders whose evaluation raises are skipped silently, and the best
    (lowest-RMSE) order is printed at the end.

    Args:
        dataset: Array of observations; must support ``astype``.
        p_values: Iterable of AR orders to try.
        d_values: Iterable of differencing orders to try.
        q_values: Iterable of MA orders to try.

    Returns:
        None. Results are reported via ``print``.
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    rmse = evaluate_arima_model(dataset, order)
                except Exception:
                    # Skip orders that cannot be fitted or evaluated.
                    # Catching Exception rather than using a bare `except`
                    # lets KeyboardInterrupt/SystemExit propagate, so a long
                    # search can still be interrupted.
                    continue
                if rmse < best_score:
                    best_score, best_cfg = rmse, order
                print('ARIMA%s RMSE=%.3f' % (order,rmse))
    print('Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))

In [None]:
# evaluate parameters
# range() excludes its upper bound: p and q run over 0..4, d over 0..1
p_values = range(5)
d_values = range(2)
q_values = range(5)

evaluate_models(dataset.values, p_values, d_values, q_values)

The results show that the best configuration discovered was ARIMA(4,1,2)

## 6.3 Review Residual Errors

In [None]:
# walk-forward validation with ARIMA(4,1,2), keeping the forecasts so that
# the residual errors can be inspected afterwards
history = list(train)
predictions = []
for obs in test:
    # predict
    model = ARIMA(history, order=(4,1,2))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    # observation
    history.append(obs)

In [None]:
# errors: observed value minus forecast for every test step
residuals = pd.DataFrame(
    [actual - forecast for actual, forecast in zip(test, predictions)]
)
# histogram on top, kernel density estimate below
fig, (hist_ax, kde_ax) = plt.subplots(2, 1)
residuals.hist(ax=hist_ax)
residuals.plot(kind='kde', ax=kde_ax)
plt.show()

The distribution of residual errors is Gaussian with a zero mean, which is the ideal outcome. It is also a good idea to check the time series of the residual errors for any type of autocorrelation. If present, it would suggest that the model has more opportunity to model the temporal structure in the data.

In [None]:
# ACF (top) and PACF (bottom) of the residual errors, up to 50 lags
fig, (acf_ax, pacf_ax) = plt.subplots(2, 1)
plot_acf(residuals, lags=50, ax=acf_ax)
plot_pacf(residuals, lags=50, ax=pacf_ax)
plt.show()

The results suggest that what little autocorrelation is present in the time series has been captured by the model.

## 6.4 Box-Cox Transformed Dataset

The Box-Cox transform is a method that is able to evaluate a suite of power transforms, including, but not limited to, log, square root, and reciprocal transforms of the data.

In [None]:
# Explore the Box-Cox transform of the development series.
X = dataset.values
# keep only the rows that contain a non-zero value
# (scipy's boxcox requires strictly positive input)
X = X[(X!=0).any(axis=1)]
# called without an explicit lambda, boxcox returns the transformed data
# together with the lambda it selected
transformed, lam = boxcox(X.flatten())
print('Lambda: %f' % lam)
# three stacked panels on figure 1
plt.figure(1)
# line plot
plt.subplot(311)
plt.plot(transformed)
# histogram
plt.subplot(312)
plt.hist(transformed)
# q-q plot
plt.subplot(313)
qqplot(transformed, line='r', ax=plt.gca())
plt.show()

Evaluate ARIMA models with box-cox transformed time series

In [None]:
# invert box-cox transform
def boxcox_inverse(value, lam):
    """Map a Box-Cox transformed value back to the original scale.

    Args:
        value: Transformed value to invert.
        lam: Box-Cox lambda that was used for the forward transform.

    Returns:
        ``exp(value)`` when ``lam`` is 0, otherwise
        ``exp(log(lam * value + 1) / lam)``.
    """
    # lam == 0 is the log-transform case, so the exponent is the value itself
    exponent = value if lam == 0 else log(lam * value + 1) / lam
    return exp(exponent)

# Walk-forward evaluation of ARIMA(4,1,2) on Box-Cox transformed data.
X = dataset.values
# keep only rows containing a non-zero value (boxcox needs positive input)
X = X[(X!=0).any(axis=1)]
X = X.flatten()
train_size = int(len(X) * 0.50)
train, test = X[0:train_size], X[train_size:]
# walk-forward validation
history = list(train)
predictions = []
for obs in test:
    # transform
    transformed, lam = boxcox(history)
    if lam < -5:
        # Extreme lambda: fall back to lambda = 1. The Box-Cox transform with
        # lambda = 1 is x - 1, so apply that shift explicitly. Feeding the
        # raw history while inverting with lambda = 1 would add 1 to every
        # forecast made on this branch.
        lam = 1
        transformed = np.asarray(history, dtype='float64') - 1
    # predict
    model = ARIMA(transformed, order=(4,1,2))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    # invert transformed prediction
    yhat = boxcox_inverse(yhat, lam)
    predictions.append(yhat)
    # observation
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))
# report performance
rmse = sqrt(mean_squared_error(test, predictions))
print('RMSE: %.3f' % rmse)

# 7. Model Validation

## 7.1 Finalize Model

In [None]:
# prepare data: keep rows containing a non-zero value, then flatten to 1-D
X = dataset.values
nonzero_rows = (X != 0).any(axis=1)
X = X[nonzero_rows].flatten()
# transform data (boxcox also returns the lambda it selected)
transformed, lam = boxcox(X)
# fit model
model = ARIMA(transformed, order=(4,1,2))
model_fit = model.fit(disp=0)
# save model together with the lambda needed to invert its forecasts
model_fit.save('model.pkl')
np.save('model_lambda.npy', [lam])

## 7.2 Validate Model

In [None]:
# invert box-cox transform
def boxcox_inverse(value, lam):
    """Undo a Box-Cox transform for a single value.

    Args:
        value: Value on the transformed scale.
        lam: Lambda of the forward Box-Cox transform.

    Returns:
        The value on the original scale: ``exp(log(lam * value + 1) / lam)``
        for a non-zero ``lam``, and ``exp(value)`` when ``lam`` is 0.
    """
    if lam != 0:
        return exp(log(lam * value + 1) / lam)
    # lam == 0 corresponds to the plain log transform
    return exp(value)

# load and prepare datasets
# (in both arrays only the rows containing a non-zero value are kept,
# because boxcox needs positive input)
X = dataset.values
X = X[(X!=0).any(axis=1)]
X = X.flatten()
history = list(X)
y = validation.values
y = y[(y!=0).any(axis=1)]
y = y.flatten()
# load model
model_fit = ARIMAResults.load('model.pkl')
# The lambda was saved as a one-element list, so np.load returns a
# one-element array. Unwrap it to a plain float instead of relying on
# implicit size-1-array-to-scalar conversion.
lam = float(np.load('model_lambda.npy')[0])
# make first prediction with the saved model
predictions = list()
yhat = model_fit.forecast()[0]
yhat = boxcox_inverse(yhat, lam)
predictions.append(yhat)
history.append(y[0])
print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[0]))
# rolling forecasts: refit on the growing history for every later step
for obs in y[1:]:
    # transform
    transformed, lam = boxcox(history)
    if lam < -5:
        # Extreme lambda: fall back to lambda = 1. The Box-Cox transform with
        # lambda = 1 is x - 1, so apply that shift explicitly. Feeding the
        # raw history while inverting with lambda = 1 would add 1 to every
        # forecast made on this branch.
        lam = 1
        transformed = np.asarray(history, dtype='float64') - 1
    # predict
    model = ARIMA(transformed, order=(4,1,2))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    # invert transformed prediction
    yhat = boxcox_inverse(yhat, lam)
    predictions.append(yhat)
    # observation
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))
# report performance
rmse = sqrt(mean_squared_error(y, predictions))
print('RMSE: %.3f' % rmse)
plt.plot(y)
plt.plot(predictions, color='red')
plt.show()