# Import Libraries

In [None]:
!pip install arch

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt 
import statsmodels.tsa.api as tsa
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
import statsmodels.api as sm
import numpy as np

# Import Data

In [None]:
# Load the daily sales history used for training and preview the first rows.
train_csv_path = "../input/competitive-data-science-predict-future-sales/sales_train.csv"
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Load the test set and preview the first rows.
test_csv_path = "../input/competitive-data-science-predict-future-sales/test.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

# Data Cleaning

In [None]:
# Report the row count after each filtering step so the amount of data
# discarded is visible.
print("Original length", len(df_train))

# Keep only rows whose shop also appears in the test set.
shop_in_test = df_train['shop_id'].isin(df_test['shop_id'])
df_train = df_train.loc[shop_in_test]
print("Removed shop length", len(df_train))

# Keep only rows whose item also appears in the test set.
item_in_test = df_train['item_id'].isin(df_test['item_id'])
df_train = df_train.loc[item_in_test]
print("Removed items length", len(df_train))

# Exploratory Data Analysis

In [None]:
# Sum the daily item counts within each month block to obtain one
# aggregate monthly series.
mon_sales = (
    df_train
    .groupby(['date_block_num'])
    .agg(item_cnt_mon=('item_cnt_day', 'sum'))
)
mon_sales.head()

In [None]:
# Ljung-Box test for autocorrelation in the monthly aggregated counts.
monthly_counts = mon_sales['item_cnt_mon']
sm.stats.acorr_ljungbox(monthly_counts)

The Ljung–Box test p-value is less than 0.05 at all 10 lags, so we reject the null hypothesis of no autocorrelation and conclude that the monthly aggregated item-count series exhibits autocorrelation.

In [None]:
sns.set_theme()


def plot_ts(x, lags=10, alpha=0.05, title='original time series'):
    """Plot a series together with its ACF and PACF.

    The figure has the series itself on the top row (full width) and the
    autocorrelation / partial autocorrelation plots side by side below it.

    Parameters
    ----------
    x : array-like
        Values of the series; converted with ``pd.Series(x)``.
    lags : int, default 10
        Number of lags passed to ``plot_acf`` and ``plot_pacf``.
    alpha : float, default 0.05
        Significance level passed to ``plot_acf`` and ``plot_pacf`` for
        their confidence bands.
    title : str, default 'original time series'
        Title of the top panel. Pass a descriptive title when ``x`` is a
        transformed series (e.g. differenced or squared residuals) so the
        figure is not mislabelled.
    """
    ts = pd.Series(x)

    fig = plt.figure(figsize=(10, 7.5))
    layout = (2, 2)
    ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2, fig=fig)
    acf_ax = plt.subplot2grid(layout, (1, 0), fig=fig)
    pacf_ax = plt.subplot2grid(layout, (1, 1), fig=fig)

    ts_ax.set_title(title)
    ts.plot(ax=ts_ax)
    tsa.graphics.plot_acf(ts, ax=acf_ax, lags=lags, alpha=alpha)
    tsa.graphics.plot_pacf(ts, ax=pacf_ax, lags=lags, alpha=alpha)

    fig.tight_layout()
    plt.show(block=False)

In [None]:
# Augmented Dickey-Fuller test on the level series; element 1 of the
# returned tuple is the p-value.
adf_level = tsa.adfuller(mon_sales['item_cnt_mon'])
adf_level[1]

The ADF test p-value (0.11353) is greater than 0.05, so we fail to reject the null hypothesis that the time series contains a unit root; thus, the series is not stationary.

In [None]:
# ADF p-value after one difference; the first element of the differenced
# series is NaN and is skipped.
mon_sales_diff = mon_sales['item_cnt_mon'].diff().iloc[1:]
tsa.adfuller(mon_sales_diff)[1]

After a single difference, the ADF test p-value (4.0326e-19) is less than 0.05. Thus, we reject the null hypothesis of a unit root and conclude that the differenced time series is stationary.


In [None]:
# Series, ACF and PACF of the first-differenced monthly counts
# (the leading NaN from differencing is skipped).
first_diff = mon_sales['item_cnt_mon'].diff().iloc[1:]
plot_ts(first_diff)

From the autocorrelation plot, the autocorrelation at lags beyond 0 is not statistically different from 0.  
From the partial autocorrelation plot, the partial autocorrelation at lags 2 and 10 is statistically different from 0.

# Modeling

## ARIMA

In [None]:
# ARIMA(2, 1, 0): AR order 2 on the once-differenced monthly counts.
model_2 = ARIMA(endog=mon_sales['item_cnt_mon'], order=(2, 1, 0))
model_2_fit = model_2.fit()
model_2_fit.summary()

In [None]:
# ARIMA(10, 1, 0): AR order 10 on the once-differenced monthly counts.
model_10 = ARIMA(endog=mon_sales['item_cnt_mon'], order=(10, 1, 0))
model_10_fit = model_10.fit()
model_10_fit.summary()

ARIMA(2, 1, 0) has a lower AIC and BIC than ARIMA(10, 1, 0), so we select ARIMA(2, 1, 0) to model the time series. However, it is worth noting that the ar.L2 coefficient of ARIMA(2, 1, 0) is not significantly different from 0, since its p-value (0.08) is greater than 0.05.

In [None]:
# ARIMA(2, 1, 0) re-estimated with the ar.L2 coefficient held at zero.
model_21 = ARIMA(
    endog=mon_sales['item_cnt_mon'],
    order=(2, 1, 0),
    enforce_stationarity=False,
)
ar_l2_restriction = {'ar.L2': 0}
# The restriction only applies to fits performed inside this context.
with model_21.fix_params(ar_l2_restriction):
    model_21_fit = model_21.fit()
model_21_fit.summary()

Fixing ar.L2 of ARIMA(2, 1, 0) to 0 produces a lower AIC and BIC.

### Residual Test

In [None]:
# Ljung-Box test on the residuals of the restricted ARIMA fit.
arima_resid = model_21_fit.resid
sm.stats.acorr_ljungbox(arima_resid)

Since the Ljung–Box test p-value is greater than 0.05 at all 6 lags of the residuals, we fail to reject the null hypothesis of no autocorrelation; the residuals are consistent with white noise.

In [None]:
# Ljung-Box test on the squared residuals of the restricted ARIMA fit.
squared_resid = model_21_fit.resid ** 2
sm.stats.acorr_ljungbox(squared_resid)

Since the Ljung–Box test p-value is less than 0.05 at all 6 lags of the squared residuals, we reject the null hypothesis and conclude that the squared residuals are autocorrelated.

## GARCH

In [None]:
# Residuals of the restricted ARIMA fit; these are the input to the GARCH models.
# NOTE(review): with d=1 the first residual may mostly reflect the initial
# level of the series rather than a model error — confirm whether it should
# be dropped before the volatility analysis.
resid = model_21_fit.resid
resid_squared = resid ** 2
plot_ts(resid_squared)

From the autocorrelation plot, the autocorrelation at lags beyond 1 is not statistically different from 0.  
From the partial autocorrelation plot, the partial autocorrelation at lags 1 and 9 is statistically different from 0.

In [None]:
# GARCH(1, 1) on the ARIMA residuals, using arch_model's defaults for
# everything except the orders.
garch_11 = arch_model(y=resid, p=1, q=1)
garch_11_fit = garch_11.fit()
garch_11_fit.summary()

In [None]:
# Volatility model with p=9, q=1 on the ARIMA residuals.
garch_91 = arch_model(y=resid, p=9, q=1)
garch_91_fit = garch_91.fit()
garch_91_fit.summary()

GARCH(1, 1) produces the lowest AIC relative to the other GARCH models. However, it is worth noting that mu and beta1 are not statistically different from 0, since their p-values are greater than 0.05.

In [None]:
# GARCH(1, 1) with the insignificant parameters (mu, beta[1]) pinned at zero.
garch_11_fix = arch_model(resid, p=1, q=1)

# Estimate the unrestricted model first to obtain values for the remaining
# parameters.
garch_11_free_fit = garch_11_fix.fit()

# Work on an explicit copy: assigning into `result.params` directly mutates
# the fit result and only takes effect if that property happens to be cached.
fixed_params = garch_11_free_fit.params.copy()
fixed_params[['beta[1]', 'mu']] = 0

# `fix` builds a result from the supplied parameter values without
# re-estimating them.
garch_11_fix_fit = garch_11_fix.fix(fixed_params)
garch_11_fix_fit.summary()

Fixing the GARCH(1, 1) parameters beta1 and mu at 0 results in a higher AIC and BIC than the original model.

### GARCH with different distribution

In [None]:
# Baseline for the distribution comparison: GARCH(1, 1) with arch_model's
# default error distribution.
garch_11 = arch_model(y=resid, p=1, q=1)
garch_11_fit = garch_11.fit()
garch_11_fit.summary()

In [None]:
# GARCH(1, 1) with Student's t errors.
garch_11_std = arch_model(y=resid, p=1, q=1, dist='studentst')
garch_11_fit_std = garch_11_std.fit()
garch_11_fit_std.summary()

In [None]:
# GARCH(1, 1) with skewed Student's t errors.
garch_11_skew = arch_model(y=resid, p=1, q=1, dist='skewt')
garch_11_fit_skew = garch_11_skew.fit()
garch_11_fit_skew.summary()

GARCH(1, 1) model with normal distribution produces the lowest AIC (although similar to Student's t) and BIC relative to the other distributions. 

## Forecast
### ARIMA(2, 1, 0) ~ GARCH(1, 1) with normal distribution

In [None]:
# One-step-ahead forecast of the monthly total from the restricted ARIMA fit.
arima_forecast = model_21_fit.forecast().to_numpy()[0]

# One-step-ahead mean forecast of the ARIMA residuals from the GARCH(1, 1) fit;
# the last row of the forecast table is used.
garch_mean_table = garch_11_fit.forecast(horizon=1).mean
garch_forecast = garch_mean_table['h.1'].iloc[-1]

# Combined forecast: ARIMA forecast plus the forecast residual mean.
forecast = arima_forecast + garch_forecast

In [None]:
# Spread the forecast aggregate evenly over every row of the test set and
# write the result without the shop/item columns.
avg_id = forecast / len(df_test)
submission = df_test.drop(columns=['shop_id', 'item_id'])
submission['item_cnt_month'] = avg_id
submission.to_csv('submission.csv', index=False)

In [None]:
# Upload ./submission.csv to the competition through the Kaggle command-line tool.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f ./submission.csv -m "Message"