# ARIMA

In [1]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from tsa.dateconverter import DateConverter
from statsmodels.tsa.stattools import acf, pacf


### Functions in the `Arima` class

- fit model (uses auto-ARIMA if `order` is `None`)
- test stationarity
- ACF and PACF plots
- differencing + ACF/PACF/stationarity
- plot series
- plot forecast together with the test data
- residuals: plot, density, ACF, PACF
    

In [2]:
class Arima:
    """Convenience wrapper around statsmodels ``ARIMA`` and pmdarima ``auto_arima``.

    Typical flow: create an instance, call it with a DataFrame plus the names
    of the date column and the value column, call ``fit()``, then use the
    forecasting and diagnostic helpers. The ``*_plot`` / ``plot_forecast``
    helpers do not draw anything; they return plain dicts of lists that a
    charting front end can consume.
    """

    def __init__(self):
        # Tri-state flag: None = not fitted yet, True = fitted with auto_arima,
        # False = fitted with an explicitly supplied order.
        self._auto = None

    def _index_df(self, df: pd.DataFrame, x: str, y: str):
        """Return a two-column copy of ``df`` indexed by the converted ``x`` column.

        Every value of column ``x`` is passed through
        ``DateConverter.convert_date`` (presumably parsing it into a date-like
        value -- confirm against ``tsa.dateconverter``) and the result becomes
        the index, leaving ``y`` as the only data column.
        """
        indexed = pd.DataFrame(df[[x, y]])
        converter = DateConverter()
        indexed[x] = indexed[x].apply(lambda raw: converter.convert_date(raw))
        return indexed.set_index([x])

    def __call__(self, df: pd.DataFrame, x: str, y: str):
        """Attach the training data: ``x`` is the date column, ``y`` the series to model."""
        self._y = y
        self._df = self._index_df(df, x, y)

    def fit(self, order = None, show_summary=True):
        """Fit the model on the attached data.

        Args:
            order: ``(p, d, q)`` tuple passed to statsmodels ``ARIMA``. When
                ``None`` the order is searched with a non-seasonal,
                non-stepwise ``pmdarima.auto_arima``.
            show_summary: print the fitted model's summary table.
        """
        if order is None:
            self._auto_arima = pm.auto_arima(self._df, stepwise=False, seasonal=False)
            fitted = self._auto_arima
            self._auto = True
        else:
            self._auto = False
            self._arima = ARIMA(self._df, order=order).fit()
            fitted = self._arima
        if show_summary:
            print(fitted.summary())

    def predict(self, num_periods: int):
        """Forecast ``num_periods`` steps past the end of the training data.

        The result is cached on ``self._forecasts`` and returned. If the model
        has not been fitted, a hint is printed and ``None`` is returned.
        """
        if self._auto is None:
            print("run fit() first")
            return None
        if self._auto is True:
            self._forecasts = self._auto_arima.predict(n_periods=num_periods)
        else:
            self._forecasts = self._arima.forecast(num_periods)
        return self._forecasts

    def plot_forecast(self, test_df, y, exclude_time = False):
        """Build chart data overlaying training series, test values and forecasts.

        Args:
            test_df: held-out data; its length sets the forecast horizon.
            y: name of the value column in ``test_df``.
            exclude_time: when truthy, forecast x-labels contain only the date;
                otherwise they are ``"<date> <time>"``.

        Returns:
            A dict with ``title``, ``x`` and three y-series of equal length, in
            which positions that do not belong to a series hold the
            placeholder ``'-'``. Returns ``None`` (after printing a hint) when
            the model has not been fitted.
        """
        if self._auto is None:
            print("run fit() first")
            return None

        forecasts = self.predict(len(test_df))
        stamps = forecasts.index.tolist()
        if exclude_time:
            forecast_labels = [str(ts.date()) for ts in stamps]
        else:
            forecast_labels = [str(ts.date()) + ' ' + str(ts.time()) for ts in stamps]

        train_pad = ['-'] * len(self._df)
        test_pad = ['-'] * len(test_df)
        return {
            'title': f'Forecast of {self._y}:',
            'x': self._df.index.tolist() + forecast_labels,
            'y_series': self._df[self._df.columns[0]].to_list() + test_pad,
            'y_test': train_pad + test_df[y].to_list(),
            'y_forecast': train_pad + forecasts.values.tolist()
        }

    def test_stationarity(self, order_of_differencing = 0):
        """Run an Augmented Dickey-Fuller test and print the results.

        The training series is differenced ``order_of_differencing`` times
        (dropping the NaN rows each pass creates) before being tested. The
        verdict printed at the end compares the p-value against 0.05.
        """
        series_df = self._df
        for _ in range(order_of_differencing):
            series_df = series_df.diff().dropna()

        adf_stat, p_value, used_lags, n_obs, critical_values = adfuller(series_df[self._y], autolag='AIC')[:5]
        print("1. ADF : ", adf_stat)
        print("2. P-Value : ", p_value)
        print("3. Num Of Lags : ", used_lags)
        print("4. Num Of Observations Used For ADF Regression and Critical Values Calculation :", n_obs)
        print("5. Critical Values :")
        for key, val in critical_values.items():
            print("\t", key, ": ", val)
        if p_value > 0.05:
            print(f"\n\nAs p-value is outside the confidence interval of 95%, series is non-stationary.\nDifferenced: {order_of_differencing}")
        else:
            print(f"\n\nAs p-value is inside the confidence interval of 95%, series is stationary.\nDifferenced: {order_of_differencing}")

    @staticmethod
    def _correlogram(title, values, confint):
        """Package correlation values and their confidence bounds as chart data.

        ``upper`` / ``lower`` are the confidence bounds with the correlation
        value subtracted, i.e. offsets relative to each value.
        """
        return {
            'title': title,
            'y': values.tolist(),
            'x': list(range(len(values))),
            'upper': (confint[:, 1] - values).tolist(),
            'lower': (confint[:, 0] - values).tolist()
        }

    def acf_plot(self):
        """Return autocorrelation chart data (95% confidence bounds) for the training series."""
        acf_vals, confint = acf(self._df, alpha=0.05)
        return self._correlogram(f'Autocorrelation plot of {self._y}', acf_vals, confint)

    def pacf_plot(self):
        """Return partial-autocorrelation chart data (95% confidence bounds) for the training series."""
        pacf_vals, confint = pacf(self._df, alpha=0.05)
        return self._correlogram(f'Partial Autocorrelation plot of {self._y}', pacf_vals, confint)

    def error_metrics(self, test_df, y):
        """Score the model against held-out data.

        Forecasts are regenerated for exactly ``len(test_df)`` steps from the
        currently fitted model, so the result does not depend on whichever
        horizon ``predict()`` was last called with (or on forecasts left over
        from a previous fit).

        Args:
            test_df: held-out data immediately following the training data.
            y: name of the value column in ``test_df``.

        Returns:
            DataFrame with columns ``Metric`` / ``Value`` holding MAPE, MAE and
            MSE, or ``None`` (after printing a hint) when the model has not
            been fitted.
        """
        if self._auto is None:
            print("run fit() first")
            return None

        actual = test_df[y]
        forecasts = self.predict(len(test_df))
        return pd.DataFrame(
            [
                ["MAPE", mean_absolute_percentage_error(actual, forecasts)],
                ["MAE", mean_absolute_error(actual, forecasts)],
                ["MSE", mean_squared_error(actual, forecasts)],
            ],
            columns=["Metric", "Value"],
        )
    
        

In [3]:
# Create the (not yet fitted) Arima wrapper defined in the previous cell.
a = Arima()

In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/2/Electric_Production.csv')

In [5]:
# 80/20 split by row position: the leading 80% of rows become the training set,
# the remainder is held out (rows are assumed to already be in time order).
_split_at = int(0.8 * len(df))
train_df = pd.DataFrame(df.iloc[:_split_at])
test_df = pd.DataFrame(df.iloc[_split_at:])

In [6]:
# Attach the training data: 'DATE' becomes the index, 'IPG2211A2N' is the series to model.
a(df=train_df, x='DATE', y='IPG2211A2N')

In [7]:
# An explicit order bypasses auto_arima and fits statsmodels ARIMA directly; the summary is printed.
a.fit(order=(2, 1, 5), show_summary=True)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                               SARIMAX Results                                
Dep. Variable:             IPG2211A2N   No. Observations:                  317
Model:                 ARIMA(2, 1, 5)   Log Likelihood                -807.912
Date:                Fri, 30 Jun 2023   AIC                           1631.825
Time:                        16:42:06   BIC                           1661.871
Sample:                    01-01-1985   HQIC                          1643.828
                         - 05-01-2011                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9975      0.006    157.219      0.000       0.985       1.010
ar.L2         -0.9971      0.005   -203.002      0.000      -1.007      -0.987
ma.L1         -1.2442      0.061    -20.254      0.0



In [8]:
# Forecast the 10 periods that follow the end of the training data.
preds = a.predict(num_periods=10)

In [9]:
# Display the forecast values.
preds

2011-06-01    104.068954
2011-07-01    114.723465
2011-08-01    111.686225
2011-09-01     98.803980
2011-10-01     89.089704
2011-11-01     92.244333
2011-12-01    105.077473
2012-01-01    114.733615
2012-02-01    111.569958
2012-03-01     98.785777
Freq: MS, Name: predicted_mean, dtype: float64

In [10]:
# plot_forecast(test_df, y, exclude_time=False): `y` must be the value column.
# Passing 'DATE' there would overlay date strings as the test series, so the
# value column goes second and the date-only labelling is requested explicitly.
a.plot_forecast(test_df, 'IPG2211A2N', exclude_time=True)

{'title': 'Forecast of IPG2211A2N:',
 'x': ['1985-01-01',
  '1985-02-01',
  '1985-03-01',
  '1985-04-01',
  '1985-05-01',
  '1985-06-01',
  '1985-07-01',
  '1985-08-01',
  '1985-09-01',
  '1985-10-01',
  '1985-11-01',
  '1985-12-01',
  '1986-01-01',
  '1986-02-01',
  '1986-03-01',
  '1986-04-01',
  '1986-05-01',
  '1986-06-01',
  '1986-07-01',
  '1986-08-01',
  '1986-09-01',
  '1986-10-01',
  '1986-11-01',
  '1986-12-01',
  '1987-01-01',
  '1987-02-01',
  '1987-03-01',
  '1987-04-01',
  '1987-05-01',
  '1987-06-01',
  '1987-07-01',
  '1987-08-01',
  '1987-09-01',
  '1987-10-01',
  '1987-11-01',
  '1987-12-01',
  '1988-01-01',
  '1988-02-01',
  '1988-03-01',
  '1988-04-01',
  '1988-05-01',
  '1988-06-01',
  '1988-07-01',
  '1988-08-01',
  '1988-09-01',
  '1988-10-01',
  '1988-11-01',
  '1988-12-01',
  '1989-01-01',
  '1989-02-01',
  '1989-03-01',
  '1989-04-01',
  '1989-05-01',
  '1989-06-01',
  '1989-07-01',
  '1989-08-01',
  '1989-09-01',
  '1989-10-01',
  '1989-11-01',
  '1989-12-01'

In [11]:
# ADF test on the undifferenced training series.
a.test_stationarity(order_of_differencing=0)

1. ADF :  -1.7386950117875237
2. P-Value :  0.4112939344557697
3. Num Of Lags :  15
4. Num Of Observations Used For ADF Regression and Critical Values Calculation : 301
5. Critical Values :
	 1% :  -3.452263435801039
	 5% :  -2.871190526189069
	 10% :  -2.571911967527952


As p-value is outside the confidence interval of 95%, series is non-stationary.
Differenced: 0


In [12]:
# ADF test after differencing the training series once.
a.test_stationarity(order_of_differencing=1)

1. ADF :  -6.352997676450292
2. P-Value :  2.5832755038271256e-08
3. Num Of Lags :  14
4. Num Of Observations Used For ADF Regression and Critical Values Calculation : 301
5. Critical Values :
	 1% :  -3.452263435801039
	 5% :  -2.871190526189069
	 10% :  -2.571911967527952


As p-value is inside the confidence interval of 95%, series is stationary.
Differenced: 1


In [13]:
# Autocorrelation values plus 95% confidence offsets, returned as a dict of lists.
a.acf_plot()

{'title': 'Autocorrelation plot of IPG2211A2N',
 'y': [1.0,
  0.8750939644524365,
  0.6580769154084761,
  0.5454444386667688,
  0.6244456960102033,
  0.800267720079079,
  0.8809590602769177,
  0.7823666482463564,
  0.5919279404591529,
  0.49981925422416645,
  0.5945120151490753,
  0.7857164713686282,
  0.8852534219791408,
  0.7723671946719537,
  0.571461594261342,
  0.4653021240258178,
  0.5427902423346485,
  0.7094128905896229,
  0.7829484319966931,
  0.691300452496651,
  0.5126524683281335,
  0.4248525287888732,
  0.5154762395249518,
  0.6979134166134366,
  0.7901153365760363,
  0.6842729034271963],
 'x': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25],
 'upper': [0.0,
  0.11008254460537936,
  0.17515163389813615,
  0.20291379452725578,
  0.2199648851197904,
  0.24048925743025362,
  0.27084445654961664,
  0.3035889787716656,
  0.32711048405288756,
  0.33984285102535144,
  0.34863715648702

In [14]:
# Partial-autocorrelation values plus 95% confidence offsets, returned as a dict of lists.
a.pacf_plot()

{'title': 'Partial Autocorrelation plot of IPG2211A2N',
 'y': [1.0,
  0.8778632491500709,
  -0.47257821007359935,
  0.6141596343497127,
  0.544135252474487,
  0.3493818895605283,
  -0.07883520109115906,
  -0.11019543899465495,
  -0.15409457305770044,
  0.22379700723299323,
  0.27287604260814374,
  0.35304873729806185,
  0.11368490718252872,
  -0.534787979366814,
  0.015412877208869842,
  -0.10669022928379383,
  0.055340805837452156,
  -0.056542646886863446,
  -0.034329742302245406,
  0.18575983642741573,
  -0.013517985069724917,
  0.009183313210426257,
  0.1009986316760949,
  0.11516543052670751,
  -0.030742019446067283,
  -0.22605806429442385],
 'x': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25],
 'upper': [0.0,
  0.11008254460537936,
  0.11008254460537942,
  0.11008254460537936,
  0.11008254460537936,
  0.11008254460537942,
  0.1100825446053794,
  0.1100825446053794,
  0.110082544605379

In [15]:
# Score the forecasts against the held-out rows (MAPE, MAE, MSE).
a.error_metrics(test_df=test_df, y='IPG2211A2N')

Unnamed: 0,Metric,Value
0,MAPE,0.029315
1,MAE,3.098744
2,MSE,17.42027
