In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Advance retail sales time series: clothing and clothing accessory stores.
# The "date" column is parsed as datetimes and used as the index; the two
# realtime_* bookkeeping columns are dropped.
cloth = (
    pd.read_csv(
        "/kaggle/input/advance-retail-sales-time-series-collection/RSCCASN.csv",
        parse_dates=["date"],
        index_col="date",
    )
    .drop(columns=["realtime_start", "realtime_end"])
)
cloth.head()

In [None]:
# Show the last rows to see where the series ends.
cloth.tail()

We can see that the data run from January 1992 to October 2019. The values are retail sales in millions of dollars.

# Data Preprocessing

In [None]:
import matplotlib.pyplot as plt

# Line plot of the full series on an explicitly created figure/axes pair.
fig, ax = plt.subplots()
cloth.plot(ax=ax)
plt.show()

We can see from the graph that there is a seasonality in the data

In [None]:
# Check stationarity with the Augmented Dickey-Fuller (ADF) test.

from statsmodels.tsa.stattools import adfuller

# adfuller returns a tuple; it is printed whole because the notes that follow
# refer to its elements by position (test statistic first, p-value second,
# dictionary fifth).
adf_stat = adfuller(cloth['value'])
print(adf_stat)

The first element of the tuple is the test statistic, which is -0.844. The more negative it is, the stronger the evidence against the null hypothesis of non-stationarity.

Second element of the tuple is p-value of the test-statistic which is 0.8. Since it is very high, we cannot reject the null hypothesis (non-stationary)

The fifth element is a dictionary of critical values. To reject the null hypothesis at the 1% significance level, we need a test statistic below -3.45.

In [None]:
# Zoom in on two years of data, selected by label on the datetime index.
cloth.loc["2013":"2014"].plot()
plt.show()

We have to remove the non-stationarity. We can see from the plot that the data is seasonal every year and also increasing with time. One easy way to take care of both problems is to calculate the year-over-year (YoY) percent change.

In [None]:
# Remove the non-stationarity: percent change against the value 12 rows
# (one year of monthly data) earlier. The first 12 rows have no reference
# value and are dropped.
cloth_yoy = cloth.pct_change(periods=12).dropna()
cloth_yoy.head()

In [None]:
# Plot the year-over-year percent change. plt.show() is called explicitly, as
# in the other plotting cells, so the cell does not also echo the Axes repr.
cloth_yoy.plot()
plt.show()

We can see from the above plot that the data looks more stationary. Let's do an ADF test to confirm the stationarity.

In [None]:
# Re-run the ADF test on the year-over-year percent-change series.
print(adfuller(cloth_yoy['value']))

The p-value is down to 0.02 after transforming the data. Let's try differencing these values and check the p-value again.

In [None]:
# First difference of the YoY series; the first row has no predecessor and is
# dropped.
cloth_yoy2 = cloth_yoy.diff().dropna()
cloth_yoy2.plot()
# Explicit show, consistent with the other plotting cells; this also keeps the
# Axes repr out of the cell output.
plt.show()

In [None]:
# ADF test on the differenced year-over-year series.
print(adfuller(cloth_yoy2['value']))

The plot looks even more stationary. The p-value is close to 0 now and we can proceed with this data to build ARMA models

# Simple ARMA and ARMAX models

In [None]:
# Declare the index frequency as month-start ("MS"); the series is monthly, not daily.
cloth_yoy2 = cloth_yoy2.asfreq("MS")

In [None]:
# Chronological split: everything through the end of 2016 is used for
# training, 2017 onwards is held out for testing.
train, test = cloth_yoy2.loc[:"2016"], cloth_yoy2.loc["2017":]
test.head()

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit a simple ARMA(1,1) model (order=(1, 0, 1)) with a constant trend term on
# the training data.
model10 = SARIMAX(endog=train, order=(1, 0, 1), trend="c")
results = model10.fit()
print(results.summary())

We can see that AR1 coefficient is significant and the bias term or intercept is not at all significant

In [None]:
# One-step-ahead prediction over the last 24 observations of the training data.
forecast = results.get_prediction(start=-24)
mean_forecast = forecast.predicted_mean
conf_int = forecast.conf_int()

fig, ax = plt.subplots(figsize=(18, 6))
# Plot the "value" Series rather than the DataFrame: DataFrame.plot labels each
# line with its column name, so label="observed" never reached the legend.
train["value"].plot(ax=ax, label="observed")
mean_forecast.plot(ax=ax, color="r", label="predicted")
# Shade the interval between the lower and upper confidence bounds.
ax.fill_between(conf_int.index, conf_int["lower value"], conf_int["upper value"], color="pink")
ax.legend()
# plt.show() matches the other plotting cells; fig.show() expects a GUI backend.
plt.show()

In [None]:
# Dynamic prediction in the past: same 24-observation window as the
# one-step-ahead cell, but requested with dynamic=True.
forecast = results.get_prediction(start=-24, dynamic=True)
mean_forecast = forecast.predicted_mean
conf_int = forecast.conf_int()

fig, ax = plt.subplots(figsize=(18, 6))
# Plot the "value" Series rather than the DataFrame: DataFrame.plot labels each
# line with its column name, so label="observed" never reached the legend.
train["value"].plot(ax=ax, label="observed")
mean_forecast.plot(ax=ax, color="r", label="predicted")
# Shade the interval between the lower and upper confidence bounds.
ax.fill_between(conf_int.index, conf_int["lower value"], conf_int["upper value"], color="pink")
ax.legend()
# plt.show() matches the other plotting cells; fig.show() expects a GUI backend.
plt.show()

In [None]:
# Forecast into the future: 36 steps beyond the end of the training data.
forecast = results.get_forecast(steps=36)
mean_forecast = forecast.predicted_mean
conf_int = forecast.conf_int()

fig, ax = plt.subplots(figsize=(18, 6))
# Plot the "value" Series rather than the DataFrame: DataFrame.plot labels each
# line with its column name, so label="observed" never reached the legend.
train["value"].plot(ax=ax, label="observed")
mean_forecast.plot(ax=ax, color="r", label="predicted")
# Shade the interval between the lower and upper confidence bounds.
ax.fill_between(conf_int.index, conf_int["lower value"], conf_int["upper value"], color="pink")
ax.legend()
# plt.show() matches the other plotting cells; fig.show() expects a GUI backend.
plt.show()

In [None]:
# ARMAX model: build a copy of the training frame with the calendar month of
# each index entry added as an exogenous "month" column.
trainx = train.assign(month=train.index.month)
trainx.head()

In [None]:
# ARMA(1,1) on "value" with "month" supplied as the exogenous regressor.
modelx = SARIMAX(
    endog=trainx["value"],
    exog=trainx["month"],
    order=(1, 0, 1),
)
resultsx = modelx.fit()
print(resultsx.summary())

As expected, the month variable turned out to be insignificant

# ACF and PACF

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# ACF on the top axes, PACF on the bottom, for lags 1-10 (lag 0 is omitted).
fig, ax = plt.subplots(2, 1, figsize=(18, 10))
plot_acf(cloth_yoy2, lags=10, zero=False, ax=ax[0])
plot_pacf(cloth_yoy2, lags=10, zero=False, ax=ax[1])
# plot_acf/plot_pacf return the Figure. Ending the cell with plt.show() instead
# of that return value stops the notebook from rendering the figure twice.
plt.show()

From the above plots we can see that both the ACF and the PACF tail off gradually.

# Choosing the best model

In [None]:
# Grid search over ARMA(p, q) orders with p, q in 0..3. Each fit's AIC and BIC
# are printed and also stored in aic_dict keyed by (p, q); the dictionary was
# previously created but never filled.
aic_dict = {}
for p in range(0, 4):
    for q in range(0, 4):
        model = SARIMAX(train, order=(p, 0, q), trend="c")
        res = model.fit(maxiter=500)
        aic_dict[(p, q)] = (res.aic, res.bic)
        print(p, q, res.aic, res.bic)

# Report the order with the lowest AIC. Note that after the loop `model` and
# `res` still refer to the LAST fit, (p, q) = (3, 3), not to the best one.
best_order = min(aic_dict, key=lambda order: aic_dict[order][0])
print("Lowest AIC at (p, q) =", best_order)

From the above AIC values, we can see that ARMA(2,3) is the best fit

In [None]:
# ARMA(2,3) process: fit on the training data, then forecast 36 steps ahead and
# compare against the held-out test data.

mod23 = SARIMAX(train, order=(2, 0, 3), trend="c")
res23 = mod23.fit(maxiter=500)

forecast = res23.get_forecast(steps=36)
mean_forecast = forecast.predicted_mean
conf_int = forecast.conf_int()

fig, ax = plt.subplots(figsize=(18, 6))
# Plot the "value" Series rather than the DataFrames: DataFrame.plot labels each
# line with its column name, so the legend showed "value" twice and the
# "observed" label was ignored.
train["value"].plot(ax=ax, label="observed", color="blue")
test["value"].plot(ax=ax, label="observed (test)", color="blue")
mean_forecast.plot(ax=ax, color="r", label="predicted")
# Shade the interval between the lower and upper confidence bounds.
ax.fill_between(conf_int.index, conf_int["lower value"], conf_int["upper value"], color="pink")
ax.legend()
# plt.show() matches the other plotting cells; fig.show() expects a GUI backend.
plt.show()

We can see from the above graph that the prediction is a lot better than that of the ARMA(1,1) model fitted previously

# Diagnostics

In [None]:
# Mean absolute value of the ARMA(2,3) fit's residuals (res23.resid).
print("Mean absolute error: {}".format(np.mean(np.abs(res23.resid))))

In [None]:
# Diagnostics for the selected ARMA(2,3) fit. `res` is only the last model left
# over from the order-search loop (p=3, q=3), so use `res23` here.
res23.plot_diagnostics(figsize=(15,8))
plt.show()

We can see from the above plots that the model is good except some outliers

In [None]:
# Summary of the selected ARMA(2,3) fit. `res` is only the last model left over
# from the order-search loop (p=3, q=3), so use `res23` here.
print(res23.summary())

The residuals are completely uncorrelated. But they don't have a normal distribution because of the outliers

# Seasonal Time Series Analysis

We can use the original dataset instead of the YoY change data

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Seasonal decomposition of the original series with a 12-observation period.
decomp = seasonal_decompose(x=cloth["value"], period=12)
decomp.plot()
plt.show()

In [None]:
# Regular (lag-1) differencing followed by seasonal (lag-12) differencing; the
# leading rows that have no reference value are dropped.
first_diff = cloth.diff(periods=1)
cloth_diff = first_diff.diff(periods=12).dropna()
cloth_diff.plot()
plt.show()

In [None]:
# Seasonal ACF and PACF: evaluate only at multiples of the 12-observation
# seasonal period.
lags = [12, 24, 36, 48, 60]
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(18, 8))
plot_acf(cloth_diff, lags=lags, zero=False, ax=ax1)
plot_pacf(cloth_diff, lags=lags, zero=False, ax=ax2)
# plot_acf/plot_pacf return the Figure. Ending the cell with plt.show() instead
# of that return value stops the notebook from rendering the figure twice.
plt.show()

We can see from the seasonal ACF and PACF plots that no seasonal orders are significant

In [None]:
# SARIMAX on the original (undifferenced) series: split it chronologically the
# same way as before, then fit with regular order (2, 1, 3) and seasonal order
# (0, 1, 0) at a period of 12.
train_full, test_full = cloth.loc[:"2016"], cloth.loc["2017":]

sarima_mod = SARIMAX(
    train_full,
    order=(2, 1, 3),
    seasonal_order=(0, 1, 0, 12),
)
sarima_res = sarima_mod.fit(maxiter=500)
print(sarima_res.summary())

# Automation of ARIMA modelling

In [None]:
import pmdarima as pm

# Automated seasonal order search (period 12), ranked by AIC.
# NOTE(review): the search runs on the full `cloth` series, test period
# included, whereas the model fitted afterwards uses `train_full` only.
# Confirm this is intended. It also rebinds `results`, which earlier held the
# ARMA(1,1) fit.
results = pm.auto_arima(
    cloth,
    seasonal=True,
    m=12,
    information_criterion="aic",
    maxiter=500,
    trace=True,
    error_action="ignore",
)
print(results.summary())

In [None]:
# Residual diagnostics for the model selected by auto_arima. plt.show() is
# called explicitly, as in the other plotting cells, so the returned figure is
# not also echoed as the cell's output value.
results.plot_diagnostics()
plt.show()

In [None]:
# Refit on the training window with regular order (1, 0, 2) and seasonal order
# (2, 1, 2) at a period of 12. This replaces the earlier sarima_mod/sarima_res.
sarima_mod = SARIMAX(
    train_full,
    order=(1, 0, 2),
    seasonal_order=(2, 1, 2, 12),
)
sarima_res = sarima_mod.fit(maxiter=500)
print(sarima_res.summary())

In [None]:
# Forecast 60 steps beyond the end of the training data and compare against the
# held-out observations.
forecast = sarima_res.get_forecast(steps=60)
mean_forecast = forecast.predicted_mean
conf_int = forecast.conf_int()

fig, ax = plt.subplots(figsize=(18, 6))
# Plot the "value" Series rather than the DataFrames: DataFrame.plot labels each
# line with its column name, so the legend showed "value" twice and the
# "observed" label was ignored.
train_full["value"].plot(ax=ax, label="observed", color="blue")
test_full["value"].plot(ax=ax, label="observed (test)", color="blue")
mean_forecast.plot(ax=ax, color="r", label="predicted")
# Shade the interval between the lower and upper confidence bounds.
ax.fill_between(conf_int.index, conf_int["lower value"], conf_int["upper value"], color="pink")
ax.legend()
# plt.show() matches the other plotting cells; fig.show() expects a GUI backend.
plt.show()

In [None]:
# Final forecast step: the point forecast, then its confidence bounds.
print(mean_forecast.iloc[-1])
print(conf_int.iloc[-1])

Model forecasts that sales will be ~37 Billion USD in December 2021