- This notebook is adapted from the original implementation presented in this [article](https://medium.com/analytics-vidhya/arima-model-from-scratch-in-python-489e961603ce).
- For a better understanding of the ARIMA model, I recommend this [article](https://towardsdatascience.com/understanding-arima-time-series-modeling-d99cd11be3f8) by Tony Yiu.

In [None]:
# Location of the CSV file that is read with pd.read_csv further down.
DATA_PATH = '../input/novel-corona-virus-2019-dataset/time_series_covid_19_deaths_US.csv'

# NOTE(review): P and Q are not referenced by any other visible cell; the
# fits below pass their orders explicitly -- confirm whether these are still needed.
P = 2
Q = 2
# Fraction of the rows used for training; passed as train_size to ARIMA.fit.
TRAIN_SIZE = 0.8

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import autocorrelation_plot
from sklearn.linear_model import LinearRegression

from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error

In [None]:
# Load the CSV at DATA_PATH into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Inspect the labels of the columns at positions 112..234, the same positional
# slice that is summed to build raw_data.
# NOTE(review): presumably these are the per-date columns -- confirm against the CSV header.
df.columns[112:235].values

In [None]:
# Collapse the columns at positions 112..234 into one total per column
# (summing over all rows), then expose the result as a two-column frame:
# 'date' holds the former column label and 'value' holds the column total.
selected_columns = df.iloc[:, 112:235]
column_totals = selected_columns.sum()
raw_data = column_totals.reset_index()
raw_data = raw_data.rename(columns={'index': 'date', 0: 'value'})

In [None]:
# Line plot of the aggregated series, one point per 'date' entry.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=raw_data, x='date', y='value', ax=ax)

# Keep only every 10th x tick label visible so the axis stays readable.
for position, tick_label in enumerate(ax.get_xticklabels()):
    tick_label.set_visible(position % 10 == 0)

plt.xticks(rotation=15)
plt.show()

In [None]:
# Use the 'date' column as the index and discard rows containing missing values.
raw_data = raw_data.set_index('date').dropna()

In [None]:
class ARIMA:
    """Regression-based ARIMA-style pipeline assembled from static steps.

    * ``I``  applies an optional caller-supplied transform (e.g. differencing)
      to the value column.
    * ``AR`` fits a linear regression of the value on its ``p`` lagged values.
    * ``MA`` fits a linear regression of the AR residual
      (``value - AR prediction``) on its ``q`` lagged residuals.
    * ``fit`` chains the three steps and adds the AR and MA predictions.

    The class keeps no state; every step returns its results to the caller.
    """

    @staticmethod
    def _lagged_frame(df, n, val_col):
        """Return ``val_col`` together with its ``n`` lagged copies.

        Only ``val_col`` is taken from ``df``, so unrelated columns can never
        leak into the regression features. Rows that lack a complete set of
        lags (or a value) are dropped.

        Raises:
            ValueError: if ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f'number of lags must be >= 1, got {n}')

        lagged = df[[val_col]].copy()
        for lag in range(1, n + 1):
            lagged[f'shifted_values_{lag}'] = lagged[val_col].shift(lag)
        return lagged.dropna()

    @staticmethod
    def _regressor(df, n, val_col, out_pred_col, train_size=0.8):
        """Fit a linear regression of ``val_col`` on its ``n`` lagged values.

        Args:
            df: frame containing ``val_col``; other columns are ignored.
            n: number of lagged terms used as features.
            val_col: name of the column to model.
            out_pred_col: name of the column that receives the predictions.
            train_size: fraction of the lagged rows (taken from the start)
                used for fitting; the remainder forms the test set.

        Returns:
            A 5-tuple ``(new_df, theta, intercept, rmse, mean_pred)``:
            ``new_df`` holds ``val_col`` and ``out_pred_col`` for train and
            test rows, ``theta`` is the transposed ``coef_`` of the fitted
            model, ``intercept`` its ``intercept_``, ``rmse`` the root mean
            squared error on the test rows (NaN when there are none) and
            ``mean_pred`` the absolute value of the mean test prediction.

        Raises:
            ValueError: if ``n < 1`` or the training split has no rows.
        """
        lagged = ARIMA._lagged_frame(df, n, val_col)
        # Address the features by name instead of by position so the column
        # layout of the input frame does not matter.
        lag_cols = [f'shifted_values_{lag}' for lag in range(1, n + 1)]

        split_at = int(train_size * lagged.shape[0])
        train = lagged.iloc[:split_at].copy()
        test = lagged.iloc[split_at:].copy()
        if len(train) == 0:
            raise ValueError(
                f'no training rows left after creating {n} lags with train_size={train_size}'
            )

        X_train = train[lag_cols].to_numpy()
        y_train = train[[val_col]].to_numpy()
        X_test = test[lag_cols].to_numpy()
        y_test = test[[val_col]].to_numpy()

        # Linear regression yields the coefficients of the lagged terms.
        lr = LinearRegression()
        lr.fit(X_train, y_train)

        theta = lr.coef_.T
        intercept = lr.intercept_

        train[out_pred_col] = (X_train.dot(theta) + intercept).ravel()
        test[out_pred_col] = (X_test.dot(theta) + intercept).ravel()

        if len(test) == 0:
            # Nothing to score against; report NaN rather than failing.
            rmse = float('nan')
        else:
            rmse = np.sqrt(mean_squared_error(y_test, test[out_pred_col]))

        new_df = pd.concat([train, test])[[val_col, out_pred_col]]
        return new_df, theta, intercept, rmse, np.abs(test[out_pred_col].mean())

    @staticmethod
    def AR(df, p, val_col='value', out_pred_col='predicted_value', train_size=0.8):
        """Autoregressive step: regress ``val_col`` on its ``p`` lagged values.

        Returns the same 5-tuple as ``_regressor``.
        """
        return ARIMA._regressor(df, p, val_col, out_pred_col, train_size)

    @staticmethod
    def I(df, val_col='value', fn=None):
        """Integration step: apply ``fn`` to ``val_col`` if one is given.

        With ``fn`` set, the result is a new frame built from
        ``fn(df[val_col])`` with rows containing NaN removed. Without ``fn`` a
        copy of ``df`` is returned unchanged. The input frame is never
        modified by this method itself.
        """
        if fn is None:
            return df.copy()
        return pd.DataFrame(fn(df[val_col].copy())).dropna()

    @staticmethod
    def MA(df, q, val_col='value', in_pred_col='predicted_value', out_pred_col='ma_predicted_value', train_size=0.8):
        """Moving-average step: regress the residual on its ``q`` lagged values.

        The residual is ``df[val_col] - df[in_pred_col]``. Returns the same
        5-tuple as ``_regressor``, where the modelled column is ``'residual'``.
        """
        residuals = (df[val_col] - df[in_pred_col]).to_frame('residual')
        return ARIMA._regressor(residuals, q, 'residual', out_pred_col, train_size)

    def fit(self, df, p, q, val_col, train_size, stationary_fn=None):
        """Run the I, AR and MA steps and combine their predictions.

        Args:
            df: input frame containing ``val_col``.
            p: number of lags for the AR step.
            q: number of residual lags for the MA step.
            val_col: name of the column to model.
            train_size: training fraction forwarded to both regressions.
            stationary_fn: optional transform applied to ``val_col`` first.

        Returns:
            A 9-tuple ``(result_df, ar_theta, ar_intercept, ma_theta,
            ma_intercept, ar_rmse, ma_rmse, ar_mean_pred, ma_mean_pred)``.
            ``result_df`` is the transformed frame with an added
            ``'prediction'`` column (AR prediction + MA prediction), limited
            to rows without NaN.
        """
        stationary_df = ARIMA.I(df, val_col, stationary_fn)

        ar_out_df, ar_theta, ar_intercept, ar_rmse, ar_mean_pred = ARIMA.AR(
            stationary_df, p, val_col, 'ar_predicted_value', train_size
        )
        ma_out_df, ma_theta, ma_intercept, ma_rmse, ma_mean_pred = ARIMA.MA(
            ar_out_df, q, val_col, 'ar_predicted_value', 'ma_predicted_value', train_size
        )

        # Rows that lost their lags in either step have no combined prediction
        # and are removed by the dropna() below.
        stationary_df['prediction'] = ar_out_df['ar_predicted_value'] + ma_out_df['ma_predicted_value']
        return (
            stationary_df.dropna(),
            ar_theta, ar_intercept,
            ma_theta, ma_intercept,
            ar_rmse, ma_rmse,
            ar_mean_pred, ma_mean_pred,
        )

In [None]:
# Instance used for the fit() calls in the cells below.
arima_model = ARIMA()

In [None]:
# --- Alternative transforms, kept for experimentation -----------------------
# First-order differencing:
# stationary_fn = lambda x: x.diff()
# def reverse_stationary_fn(ori_x, new_x):
#     x = new_x.copy()
#     x += ori_x.shift(1)
#     return x
#
# First-order differencing of the log series:
# stationary_fn = lambda x: np.log(x).diff()
# def reverse_stationary_fn(ori_x, new_x):
#     x = new_x.copy()
#     x += np.log(ori_x).shift(1)
#     return np.exp(x)
#
# Second-order differencing of the log series:
# stationary_fn = lambda x: np.log(x).diff().diff()
# def reverse_stationary_fn(ori_x, new_x):
#     x = new_x.copy()
#     x += np.log(ori_x).shift(1)
#     x += np.log(ori_x).diff().shift(1)
#     return np.exp(x)
# -----------------------------------------------------------------------------


def stationary_fn(x):
    """Active transform: difference the series twice."""
    return x.diff().diff()


def reverse_stationary_fn(ori_x, new_x):
    """Undo the double differencing using the original series ``ori_x``.

    Adds the previous original value and the previous first difference to
    every entry of ``new_x``.
    """
    restored = new_x.copy()
    # NOTE(review): the in-place additions are kept on purpose -- pandas appears
    # to align the result back onto new_x's index for `+=`; confirm before
    # rewriting this as a plain sum.
    restored += ori_x.shift(1)
    restored += ori_x.diff().shift(1)
    return restored


# Best scores and orders found so far.
min_ar_rmse, min_ma_rmse = float('inf'), float('inf')
best_ar_mean_pred, best_ma_mean_pred = None, None
best_p, best_q = 1, 1

# Pass 1: vary the AR order (q stays at its initial value) and keep the
# order with the lowest AR test RMSE.
for p_candidate in range(1, 21):
    (final_predictions,
     ar_theta, ar_intercept,
     ma_theta, ma_intercept,
     ar_rmse, ma_rmse,
     ar_mean_pred, ma_mean_pred) = arima_model.fit(raw_data, p_candidate, best_q, 'value', TRAIN_SIZE, stationary_fn)

    if ar_rmse < min_ar_rmse:
        min_ar_rmse, best_ar_mean_pred, best_p = ar_rmse, ar_mean_pred, p_candidate

# Pass 2: with the AR order fixed at best_p, vary the MA order and keep the
# order with the lowest MA test RMSE.
for q_candidate in range(1, 21):
    (final_predictions,
     ar_theta, ar_intercept,
     ma_theta, ma_intercept,
     ar_rmse, ma_rmse,
     ar_mean_pred, ma_mean_pred) = arima_model.fit(raw_data, best_p, q_candidate, 'value', TRAIN_SIZE, stationary_fn)

    if ma_rmse < min_ma_rmse:
        min_ma_rmse, best_ma_mean_pred, best_q = ma_rmse, ma_mean_pred, q_candidate

In [None]:
# Report the best test RMSE and order for each step. "relative err" divides the
# RMSE by the absolute mean of the test-set predictions returned alongside it.
print(f'Min RMSE of AR model: {min_ar_rmse} (relative err: {min_ar_rmse/best_ar_mean_pred}). Best P: {best_p}')
print(f'Min RMSE of MA model: {min_ma_rmse} (relative err: {min_ma_rmse/best_ma_mean_pred}). Best Q: {best_q}')

In [None]:
# Final fit with fixed orders p=14 and q=12.
# NOTE(review): these orders are hard-coded rather than taken from
# best_p / best_q -- confirm that this is intentional.
(final_predictions,
 ar_theta, ar_intercept,
 ma_theta, ma_intercept,
 ar_rmse, ma_rmse,
 ar_mean_pred, ma_mean_pred) = arima_model.fit(
    df=raw_data,
    p=14,
    q=12,
    val_col='value',
    train_size=TRAIN_SIZE,
    stationary_fn=stationary_fn,
)

In [None]:
# Plot every column of the fit result (still in the transformed scale).
final_predictions.plot()

In [None]:
# Map both the transformed values and the predictions back through
# reverse_stationary_fn and draw them on shared axes.
fig, ax = plt.subplots(figsize=(12, 8))

original_values = raw_data['value']
actual_frame = pd.DataFrame(
    reverse_stationary_fn(original_values, final_predictions['value'])
).reset_index()
predicted_frame = pd.DataFrame(
    reverse_stationary_fn(original_values, final_predictions['prediction'])
).reset_index()

sns.lineplot(data=actual_frame, x='date', y='value', ax=ax)
sns.lineplot(data=predicted_frame, x='date', y='prediction', ax=ax)

# Keep only every 10th x tick label visible so the axis stays readable.
for position, tick_label in enumerate(ax.get_xticklabels()):
    tick_label.set_visible(position % 10 == 0)

plt.xticks(rotation=15)
plt.show()