# Time Series Forecasting - Autoregressive Models

Libraries used: pmdarima, statsmodels

## Articles 
AR,MA, ARMA, ARIMA, SARIMA

1. https://towardsdatascience.com/time-series-models-d9266f8ac7b0#:~:text=AR%2C%20MA%2C%20ARMA%2C%20and%20ARIMA%20models%20are%20used,over%20the%20historical%20data%20of%20observation%20overtime%20period.

2. https://puneet166.medium.com/time-series-forecasting-how-to-predict-future-data-using-arma-arima-and-sarima-model-8bd20597cc7b#:~:text=ARIMA%2C%20ARMA%20and%20SARIMA%20are%20used%20for%20predict,how%20to%20do%20forecasting%20using%20these%20three%20models.

Once the series is stationary, plot its ACF and PACF and use the table below to pick a suitable model:

| Model | ACF Pattern | PACF Pattern |
| --- | --- | --- |
| AR(p) | Exponential decay or damped sine wave pattern or both | Significant spikes through lag p, then cuts off |
| MA(q) | Significant spikes through lag q, then cuts off | Exponential decay |
| ARMA(1,1) | Exponential decay from lag 1 | Exponential decay from lag 1 |
| ARMA(p,q) | Exponential decay | Exponential decay |

In [None]:
#%pip install pmdarima

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import datetime
import math
import warnings


import pmdarima as pmd
from sklearn.metrics import mean_squared_error
from math import sqrt
import pmdarima as pmd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
from sklearn import metrics
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
import itertools

In [None]:
# To use this notebook for univariate time series analysis:
# 1) Neither time_column nor the time_dependent_variable column may contain missing values
#    or categorical (string) data.
# 2) Fill in file_name (.csv or .xlsx, the two formats the loader below reads),
#    time_dependent_variable, time_column, the datetime format (frmt) and the resample
#    grain (X). Once these are correct, run all the cells (Cell ---> Run All).
# 3) Example:
#   file_name               = "JetRail Avg Hourly Traffic Data - 2012-2013.csv"
#   time_dependent_variable = "Count"    (column name in your dataset)
#   time_column             = "Datetime" (column name in your dataset)
#   frmt                    = "%Y-%m-%d"
#   X                       = "D" 

file_name = "Prime.xlsx"                # input file read by data()
time_dependent_variable = "Liq Rate"    # column that is modelled and forecast
time_column = "Date"                    # column holding the timestamps
frmt =  '%b-%y'                         # strptime format of time_column, e.g. "Jan-20"
X = "M"                                 # pandas resample rule passed to DataFrame.resample

### Reading the data file (CSV or Excel)

In [None]:
def data(time_column, file_name, frmt='%Y-%m-%d %H:%M:%S', X="D", value_column=None):
    """Load a univariate series from disk and resample it to a regular grain.

    Args:
        time_column: Name of the column holding the timestamps.
        file_name: Path to the input file; it must contain ".csv" or ".xlsx".
        frmt: strptime-style format used to parse ``time_column``.
        X: pandas resample rule (for example "D" or "M"); values falling in
            the same period are averaged.
        value_column: Name of the column to keep. Defaults to the module-level
            ``time_dependent_variable`` so existing calls keep working.

    Returns:
        A DataFrame with exactly two columns, ``time_column`` and the value
        column, holding one row per resampled period.

    Raises:
        ValueError: If ``file_name`` is neither a CSV nor an XLSX file.
    """
    if ".csv" in file_name:
        df = pd.read_csv(file_name, parse_dates=True)
    elif ".xlsx" in file_name:
        df = pd.read_excel(file_name, parse_dates=True)
    else:
        # Without this branch `df` would be unbound and fail later with a
        # confusing UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {file_name!r}: expected a .csv or .xlsx file"
        )

    if value_column is None:
        value_column = time_dependent_variable

    # Copy the two-column slice so the assignment below does not write into a
    # view of the frame that was read from disk.
    df = df[[time_column, value_column]].copy()
    df[time_column] = pd.to_datetime(df[time_column], format=frmt)
    # set_index moves the timestamps out of the columns. Keeping them as both
    # index and column lets resample().mean() retain the datetime column, and
    # reset_index() then fails because the column name already exists.
    df = df.set_index(time_column).resample(X).mean()
    return df.reset_index()


# Load the configured file and resample it; df has one row per period of grain X.
df = data(time_column, file_name, frmt, X)

### Splitting the data into train and test

In [None]:
# Positional split: the leading `split` share of the rows is used for training
# (0.7 unless the caller overrides it) and the remaining rows for testing.
def train_test_split_perc(df, split=0.7):
    """Split ``df`` by row position into a trailing test part and a leading train part.

    Args:
        df: Frame to split; rows are taken in their existing order.
        split: Fraction of the rows assigned to the training part.

    Returns:
        ``(test, train)`` -- note the order: the test rows come first.
    """
    n_rows = len(df)
    n_train = math.floor(split * n_rows)
    n_test = len(df) - n_train
    return df.tail(n_test), df.head(n_train)
    
# Hold out the last 20% of the rows for testing (the function returns test first, then train).
test,train = train_test_split_perc(df, split= 0.8)
# y_hat starts as a copy of the test rows; each model function below adds its forecast as a new column.
y_hat = test.copy()

In [None]:
# Alternative split driven by a cut-off on the index instead of a row fraction.
def train_test_split_date(df, split_date):
    """Split ``df`` on its index around ``split_date``.

    Args:
        df: Frame whose index is compared with ``split_date``.
        split_date: Cut-off value; rows with index <= it go to the training part.

    Returns:
        ``(train, test)`` as independent copies; test holds rows with index > split_date.
    """
    up_to_cutoff = df.index <= split_date
    after_cutoff = df.index > split_date
    return df.loc[up_to_cutoff].copy(), df.loc[after_cutoff].copy()

### Metrics

Probabilistic Model Selection with AIC/BIC in Python

Article :https://medium.com/analytics-vidhya/probabilistic-model-selection-with-aic-bic-in-python-f8471d6add32#:~:text=AIC%20and%20BIC%20techniques%20can%20be%20implemented%20in,statsmodels.formula.api%20provides%20a%20direct%20approach%20to%20compute%20aic%2Fbic.
Video : https://campus.datacamp.com/courses/arima-models-in-python/chapter-3-the-best-of-the-best-models?ex=4

In [None]:
def timeseries_evaluation_metrics_func(y_true, y_pred):
    """Print MSE, MAE, RMSE, MAPE and R2 for a forecast.

    Args:
        y_true: Observed values.
        y_pred: Forecast values, in the same order as ``y_true``.
    """
    def mape():
        # Work on plain arrays so the ratio is taken element by element.
        actual, forecast = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((actual - forecast) / actual)) * 100

    # (label, lazy computation) pairs, evaluated in printing order.
    scores = (
        ('MSE', lambda: metrics.mean_squared_error(y_true, y_pred)),
        ('MAE', lambda: metrics.mean_absolute_error(y_true, y_pred)),
        ('RMSE', lambda: np.sqrt(metrics.mean_squared_error(y_true, y_pred))),
        ('MAPE', mape),
        ('R2', lambda: metrics.r2_score(y_true, y_pred)),
    )

    print('Evaluation metric results:-')
    final = len(scores) - 1
    for position, (label, compute) in enumerate(scores):
        # The last line is followed by a blank line.
        terminator = '\n\n' if position == final else '\n'
        print(f'{label} is : {compute()}', end=terminator)

In [None]:
def evaluation_metrics_func(y_true, y_pred):
    """Return the forecast accuracy metrics as a dictionary.

    Args:
        y_true: Observed values (list, array or Series).
        y_pred: Forecast values, in the same order as ``y_true``.

    Returns:
        dict with the keys "MAPE" (in percent), "MSE", "RMSE", "MAE" and "R2".
    """
    # Coerce to arrays so MAPE is computed position by position, like the
    # sklearn metrics below and like timeseries_evaluation_metrics_func.
    # On raw pandas Series the arithmetic would align on index labels instead,
    # and plain lists would not support the subtraction at all.
    actual, forecast = np.array(y_true), np.array(y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    return {"MAPE": np.mean(np.abs((actual - forecast) / actual)) * 100,
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "MAE": metrics.mean_absolute_error(y_true, y_pred),
            "R2": metrics.r2_score(y_true, y_pred)}

### Defining a plot function

In [None]:
 def plot(method):
    """Draw the train and test series together with one stored forecast.

    Args:
        method: Name of the forecast column in ``y_hat``; also used for the
            legend entry and the figure title.
    """
    plt.figure(figsize=(12,8))
    # Observed data first: training rows, then the held-out rows.
    for frame, label in ((train, 'Train'), (test, 'Test')):
        plt.plot(frame[time_column], frame[time_dependent_variable], label=label)
    forecast = y_hat[method]
    plt.plot(y_hat[time_column], forecast, label= method +' forecast')
    plt.legend(loc='best')
    plt.title(method + ' forecast')
    plt.show()
    

# AR

To use only a specific list of lags (for example lags 1 and 3) as the AR components, see the following references:

https://stackoverflow.com/questions/55882111/arima-model-for-certain-lags

https://towardsdatascience.com/advanced-time-series-analysis-with-arma-and-arima-a7d9b589ed6d

In [None]:
# With dynamic=True the prediction is recursive: each step is built from the
# previous forecasts rather than from observed values.

def ar(lags, trend="ct", dynamic=True, method="AR"):
    """Fit an AutoReg model on the training series and score its forecast.

    Args:
        lags: Lag specification passed to ``AutoReg``.
        trend: Trend specification passed to ``AutoReg``.
        dynamic: Passed through to ``predict``.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    fitted = AutoReg(train[time_dependent_variable], lags=lags, trend=trend).fit()
    # Forecast exactly the positions that follow the training rows.
    first_step = len(train)
    last_step = len(train) + len(test) - 1
    y_hat[method] = fitted.predict(start=first_step, end=last_step, dynamic=dynamic)
    observed = test[time_dependent_variable]
    return evaluation_metrics_func(observed, y_hat[method])

# Baseline: AR model with 7 lags; the forecast is stored in y_hat["AR"] and the metrics are shown.
ar(lags= 7)

### Autoregressive Model ( Seasonality = True)

Specify the number of periods as well...

In [None]:
def ar_seasonal(lags, period=7, trend="ct", method="ARS"):
    """Fit an AutoReg model with seasonal dummies and score its forecast.

    Args:
        lags: Lag specification passed to ``AutoReg``.
        period: Seasonal period passed to ``AutoReg``.
        trend: Trend specification passed to ``AutoReg``.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    seasonal_model = AutoReg(
        train[time_dependent_variable],
        lags=lags,
        trend=trend,
        seasonal=True,
        period=period,
    )
    fitted = seasonal_model.fit()
    # Forecast exactly the positions that follow the training rows.
    first_step = len(train)
    last_step = len(train) + len(test) - 1
    y_hat[method] = fitted.predict(start=first_step, end=last_step, dynamic=True)
    observed = test[time_dependent_variable]
    return evaluation_metrics_func(observed, y_hat[method])

# Seasonal AR with 7 lags and the default period of 7; the forecast is stored in y_hat["ARS"].
ar_seasonal(lags= 7)

In [None]:
def ar_best_params( p_values=range(40), basis = "MAPE", evaluate=None):
    """Grid-search the AR lag order that minimises one evaluation metric.

    Args:
        p_values: Candidate lag orders to try.
        basis: Key of the metric to minimise in the dictionary returned by
            the evaluator (for example "MAPE" or "MAE").
        evaluate: Callable taking the lag order and returning a metrics
            dictionary. Defaults to ``ar_seasonal``; pass ``ar`` to tune the
            non-seasonal model instead.

    Returns:
        The best order as a one-element tuple ``(p,)``, or ``None`` when no
        candidate could be fitted.
    """
    if evaluate is None:
        evaluate = ar_seasonal
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        order = (p,)
        try:
            metric = evaluate(*order)[basis]
        except Exception as err:
            # A failed fit only disqualifies this candidate. Catching
            # Exception (not a bare except) lets Ctrl-C stop the search.
            print('AR%s skipped: %s' % (order, err))
            continue
        if metric < best_score:
            best_score, best_cfg = metric, order
        print('AR%s metric=%.3f' % (order,metric))
    print('Best AR%s metric=%.3f' % (best_cfg, best_score))
    return best_cfg
    
# Search lag orders 0..39 and keep the one with the lowest MAPE on the test rows.
best_cfg = ar_best_params()

In [None]:
# Refit with the selected lag order so y_hat holds the tuned forecast, then plot it.
# ar_seasonal() stores its forecast under its default method name "ARS"; the "AR"
# column still holds the earlier ar(lags=7) forecast, so plotting "AR" would show
# the wrong model.
ar_seasonal(*best_cfg)
plot(method= 'ARS' )

# MA with pmdarima library

In [None]:
def auto_ma(max_q=40, trend="ct", summary=True, method="MA"):
    """Let pmdarima choose a pure MA(q) model and score its forecast.

    The AR order is pinned to zero (start_p=0, max_p=0) and differencing is
    disabled (d=0), so only q is searched, up to ``max_q``.

    Args:
        max_q: Largest MA order considered.
        trend: Trend specification passed to ``auto_arima``.
        summary: When true, print the summary of the selected model.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    search_options = {
        "start_p": 0,
        "start_q": 2,
        "d": 0,
        "max_p": 0,
        "max_q": max_q,
        "seasonal": False,
        "trend": trend,
        "trace": True,
        "error_action": 'ignore',
        "suppress_warnings": True,
        "stepwise": True,
    }
    selected = pmd.auto_arima(train[time_dependent_variable], **search_options)
    if summary:
        print(selected.summary())
    # One forecast step per held-out row.
    horizon = len(test)
    y_hat[method] = selected.predict(horizon)
    return evaluation_metrics_func(test[time_dependent_variable], y_hat[method])
    
# Fit the MA model with the default search limits; the forecast is stored in y_hat["MA"].
auto_ma()

In [None]:
# Compare the MA forecast with the train and test series.
plot(method = "MA")

# ARMA with pmdarima library

In [None]:
def auto_arma(max_p=7, max_q=7, summary=False, trend="ct", method="ARMA"):
    """Let pmdarima choose an ARMA(p, q) model and score its forecast.

    Differencing is disabled (d=0), so only p and q are searched.

    Args:
        max_p: Largest AR order considered.
        max_q: Largest MA order considered.
        summary: When true, print the summary of the selected model.
        trend: Trend specification passed to ``auto_arima``.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    search_options = {
        "start_p": 1,
        "start_q": 1,
        "d": 0,
        "max_p": max_p,
        "max_q": max_q,
        "seasonal": False,
        "trend": trend,
        "trace": True,
        "error_action": 'ignore',
        "suppress_warnings": True,
        "stepwise": True,
    }
    selected = pmd.auto_arima(train[time_dependent_variable], **search_options)
    if summary:
        print(selected.summary())
    # One forecast step per held-out row.
    horizon = len(test)
    y_hat[method] = selected.predict(horizon)
    return evaluation_metrics_func(test[time_dependent_variable], y_hat[method])
    
# Fit the ARMA model with the default search limits; the forecast is stored in y_hat["ARMA"].
auto_arma()

In [None]:
# Compare the ARMA forecast with the train and test series.
plot(method = "ARMA")

# ARIMA

In [None]:
def arima(p,d,q, summary = False, method = "ARIMA" ):
    """Fit ARIMA(p, d, q) on the training series and score its forecast.

    Args:
        p: Autoregressive order.
        d: Degree of differencing.
        q: Moving-average order.
        summary: When true, print the fitted model summary.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    model = ARIMA(train[time_dependent_variable], exog=None, order=(p,d,q))
    fit1 = model.fit()
    if summary:
        print(fit1.summary())
    # `typ='levels'` was a keyword of the legacy statsmodels.tsa.arima_model
    # API. The results class of statsmodels.tsa.arima.model.ARIMA does not
    # define it, so it is not passed here.
    # NOTE(review): confirm against the installed statsmodels version that
    # predictions are returned on the original (level) scale.
    y_hat[method] = fit1.predict(start=len(train), end=len(train)+len(test)-1, dynamic=True)
    return evaluation_metrics_func(test[time_dependent_variable], y_hat[method])

# Baseline ARIMA(1, 1, 1); the forecast is stored in y_hat["ARIMA"].
arima(1, 1, 1)

In [None]:
def arima_best_params(p_values=range(7), d_values=range(2), q_values=range(7),basis = "MAE", evaluate=None):
    """Grid-search the ARIMA (p, d, q) order that minimises one metric.

    Args:
        p_values: Candidate autoregressive orders.
        d_values: Candidate degrees of differencing.
        q_values: Candidate moving-average orders.
        basis: Key of the metric to minimise in the dictionary returned by
            the evaluator (for example "MAE" or "MAPE").
        evaluate: Callable taking ``(p, d, q)`` and returning a metrics
            dictionary. Defaults to ``arima``.

    Returns:
        The best ``(p, d, q)`` tuple, or ``None`` when no candidate could be
        fitted.
    """
    if evaluate is None:
        evaluate = arima
    best_score, best_cfg = float("inf"), None
    for order in itertools.product(p_values, d_values, q_values):
        try:
            metric = evaluate(*order)[basis]
        except Exception as err:
            # A failed fit only disqualifies this candidate. Catching
            # Exception (not a bare except) lets Ctrl-C stop the search.
            print('ARIMA%s skipped: %s' % (order, err))
            continue
        if metric < best_score:
            best_score, best_cfg = metric, order
        print('ARIMA%s metric=%.3f' % (order,metric))
    print('Best ARIMA%s metric=%.3f' % (best_cfg, best_score))
    return best_cfg
    
# Search the default (p, d, q) grid and keep the order with the lowest MAE on the test rows.
best_cfg = arima_best_params()

In [None]:
# Refit with the selected order so y_hat["ARIMA"] holds the tuned forecast
# (the grid search leaves the last candidate's forecast there).
arima(*best_cfg, summary = False)

In [None]:
# Compare the tuned ARIMA forecast with the train and test series.
plot(method= 'ARIMA' )

# ARIMA with pmdarima library

• pmdarima (for py + arima) is a statistical library designed to fill the void in Python’s time-series analysis capabilities; its `auto_arima` function is the equivalent of R’s `auto.arima`.

In [None]:
def auto_arima(max_p=7, max_q=7, summary=False, method="PMD_ARIMA"):
    """Let pmdarima choose a non-seasonal ARIMA model and score its forecast.

    ``d=None`` is passed, so the differencing order is left to pmdarima.

    Args:
        max_p: Largest AR order considered.
        max_q: Largest MA order considered.
        summary: When true, print the summary of the selected model.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    search_options = {
        "start_p": 1,
        "start_q": 1,
        "max_p": max_p,
        "max_q": max_q,
        "seasonal": False,
        "d": None,
        "trace": True,
        "error_action": 'ignore',
        "suppress_warnings": True,
        "stepwise": True,
    }
    selected = pmd.auto_arima(train[time_dependent_variable], **search_options)
    if summary:
        print(selected.summary())
    # One forecast step per held-out row.
    horizon = len(test)
    y_hat[method] = selected.predict(horizon)
    return evaluation_metrics_func(test[time_dependent_variable], y_hat[method])
    
# Fit the automatically selected ARIMA model; the forecast is stored in y_hat["PMD_ARIMA"].
auto_arima()

In [None]:
# Compare the pmdarima ARIMA forecast with the train and test series.
plot(method=  "PMD_ARIMA")

# SARIMA

m refers to the number of periods (observations) in each seasonal cycle. Typical values by data frequency:

• 7 → Daily

• 12 → Monthly

• 52 → Weekly

• 4 → Quarterly

• 1 → Annual (non-seasonal)

In [None]:
def sarima(p,d,q, P, D, Q, M, method = "SARIMA"):
    """Fit SARIMA(p, d, q)(P, D, Q, M) on the training series and score its forecast.

    Args:
        p: Non-seasonal autoregressive order.
        d: Non-seasonal degree of differencing.
        q: Non-seasonal moving-average order.
        P: Seasonal autoregressive order.
        D: Seasonal degree of differencing.
        Q: Seasonal moving-average order.
        M: Number of periods in one seasonal cycle.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    model = SARIMAX(train[time_dependent_variable], order=(p,d,q),  seasonal_order=(P,D,Q,M))
    fit1 = model.fit()
    # `typ='levels'` was a keyword of the legacy statsmodels.tsa.arima_model
    # API. SARIMAX results do not define it, so it is not passed here.
    # NOTE(review): confirm against the installed statsmodels version that
    # predictions are returned on the original (level) scale.
    y_hat[method] = fit1.predict(start=len(train), end=len(train)+len(test)-1, dynamic=True)
    return evaluation_metrics_func(test[time_dependent_variable], y_hat[method])

# Baseline SARIMA(1, 1, 1)(1, 1, 3, 7); the forecast is stored in y_hat["SARIMA"].
# NOTE(review): the seasonal period here is 7 while the configured grain X is "M" — confirm this is intended.
sarima(1, 1, 1, 1, 1, 3, 7)

### Hyperparameter Tuning : Best Parameters

In [None]:
def sarima_best_params(p_values=range(2), d_values=range(2), q_values=range(2), P_values=range(2), D_values=range(2), Q_values=range(2), basis="MAE", M_values=(7, 30), evaluate=None):
    """Grid-search the SARIMA order that minimises one evaluation metric.

    Args:
        p_values: Candidate non-seasonal autoregressive orders.
        d_values: Candidate non-seasonal degrees of differencing.
        q_values: Candidate non-seasonal moving-average orders.
        P_values: Candidate seasonal autoregressive orders.
        D_values: Candidate seasonal degrees of differencing.
        Q_values: Candidate seasonal moving-average orders.
        basis: Key of the metric to minimise in the dictionary returned by
            the evaluator (for example "MAE" or "MAPE").
        M_values: Candidate seasonal periods; the default keeps the two
            values that used to be hard-coded.
        evaluate: Callable taking ``(p, d, q, P, D, Q, M)`` and returning a
            metrics dictionary. Defaults to ``sarima``.

    Returns:
        The best ``(p, d, q, P, D, Q, M)`` tuple, or ``None`` when no
        candidate could be fitted.
    """
    if evaluate is None:
        evaluate = sarima
    best_score, best_cfg = float("inf"), None
    grid = itertools.product(p_values, d_values, q_values, P_values, D_values, Q_values, M_values)
    for order in grid:
        try:
            metric = evaluate(*order)[basis]
        except Exception as err:
            # A failed fit only disqualifies this candidate. Catching
            # Exception (not a bare except) lets Ctrl-C stop the search.
            print('SARIMA%s skipped: %s' % (order, err))
            continue
        if metric < best_score:
            best_score, best_cfg = metric, order
            # As before, only improvements are reported for this grid.
            print('SARIMA%s metric=%.3f' % (order,metric))
    print('Best SARIMA%s metric=%.3f' % (best_cfg, best_score))
    return best_cfg
    
# Search the default SARIMA grid and keep the order with the lowest MAE on the test rows.
best_cfg = sarima_best_params()

In [None]:
# Show the selected (p, d, q, P, D, Q, M) order, then refit with it so
# y_hat["SARIMA"] holds the tuned forecast rather than the last candidate's.
print(best_cfg)
sarima(*best_cfg)

In [None]:
# Compare the tuned SARIMA forecast with the train and test series.
plot(method = "SARIMA")

# SARIMA using pmdarima library

In [None]:
def auto_sarima(m, max_p=3, max_q=3, max_P=3, max_Q=3, summary=False, method="PMD_SARIMA"):
    """Let pmdarima choose a seasonal ARIMA model and score its forecast.

    ``d=None`` and ``D=None`` are passed, so both differencing orders are
    left to pmdarima.

    Args:
        m: Number of periods in one seasonal cycle.
        max_p: Largest non-seasonal AR order considered.
        max_q: Largest non-seasonal MA order considered.
        max_P: Largest seasonal AR order considered.
        max_Q: Largest seasonal MA order considered.
        summary: When true, print the summary of the selected model.
        method: Name of the ``y_hat`` column that receives the forecast.

    Returns:
        The metrics dictionary from ``evaluation_metrics_func``.
    """
    search_options = {
        "start_p": 1,
        "start_q": 1,
        "max_p": max_p,
        "max_q": max_q,
        "seasonal": True,
        "start_P": 1,
        "start_Q": 1,
        "max_P": max_P,
        "max_D": 7,
        "max_Q": max_Q,
        "m": m,
        "d": None,
        "D": None,
        "trace": True,
        "error_action": 'ignore',
        "suppress_warnings": True,
        "stepwise": True,
    }
    selected = pmd.auto_arima(train[time_dependent_variable], **search_options)
    if summary:
        print(selected.summary())
    # One forecast step per held-out row.
    horizon = len(test)
    y_hat[method] = selected.predict(horizon)
    return evaluation_metrics_func(test[time_dependent_variable], y_hat[method])

# Automatic seasonal search with a seasonal period of 7; the forecast is stored in y_hat["PMD_SARIMA"].
auto_sarima(m = 7)

In [None]:
def auto_sarima_best_seasonal_params(basis = "MAE", m_values=(1, 4, 7, 12, 52), evaluate=None):
    """Pick the seasonal period whose automatic SARIMA fit scores best.

    Args:
        basis: Key of the metric to minimise in the dictionary returned by
            the evaluator (for example "MAE" or "MAPE").
        m_values: Candidate seasonal periods; the default keeps the values
            that used to be hard-coded.
        evaluate: Callable accepting the keyword ``m`` and returning a metrics
            dictionary. Defaults to ``auto_sarima``.

    Returns:
        The best seasonal period, or ``None`` when no candidate could be
        fitted.
    """
    if evaluate is None:
        evaluate = auto_sarima
    best_score, best_cfg = float("inf"), None
    for m in m_values:
        print("="*100)
        print(f' Fitting SARIMA for Seasonal value m = {m}')
        try:
            metric = evaluate(m=m)[basis]
        except Exception as err:
            # A failed fit only disqualifies this candidate. Catching
            # Exception (not a bare except) lets Ctrl-C stop the search.
            print('SARIMA%s skipped: %s' % (m, err))
            continue
        if metric < best_score:
            best_score, best_cfg = metric, m
        print('SARIMA%s metric=%.3f' % (m,metric))
    print('Best SARIMA%s metric=%.3f' % (best_cfg, best_score))
    return best_cfg
    
# Try each default seasonal period and keep the one with the lowest MAE on the test rows.
best_cfg = auto_sarima_best_seasonal_params()

In [None]:
# Refit with the selected seasonal period and print the model summary, so
# y_hat["PMD_SARIMA"] holds the forecast of the best period.
auto_sarima(m = best_cfg, summary = True)

In [None]:
# Compare the pmdarima SARIMA forecast with the train and test series.
plot(method=  "PMD_SARIMA")

In [None]:
# Display the test rows together with every forecast column the model functions added.
y_hat