In [None]:
%matplotlib  inline

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from fbprophet import Prophet

import os
# Show the entries of the input directory the data is read from.
print(os.listdir("../input"))

In [None]:
# Load the avocado data, parsing the Date column as datetimes.
df = pd.read_csv("../input/avocado.csv", parse_dates=["Date"])

# Keep only the `TotalUS` / `conventional` rows and the two columns the
# models need; selecting the columns explicitly also discards the unnamed
# index column and the region/type columns in one step.
cols = ['Date', 'AveragePrice', 'type', 'region']
is_total_us_conventional = (df['region'] == 'TotalUS') & (df['type'] == 'conventional')
df = df.loc[is_total_us_conventional, ['Date', 'AveragePrice']]

# Chronological order, then the ds / y naming with the date as the index.
df = df.sort_values("Date")
df.columns = ['ds', 'y']
df = df.set_index('ds')

# Train test split: the last 12 rows are held out.
# .copy() makes both frames independent of `df`; later cells add columns to
# them, which on a bare slice triggers pandas' SettingWithCopy behaviour.
train = df[:-12].copy()
test = df[-12:].copy()

train.info()

# Only selecting `TotalUS` prices for `conventional` avocados

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Scatter the training and held-out prices on one shared axes.
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

for split in (train, test):
    ax = sns.scatterplot(x=split.index, y=split.y, ax=ax)

# Span the x axis from the first training date to the last held-out date.
first_date = train.index.min()
last_date = test.index.max()
ax.axes.set_xlim(first_date, last_date);

In [None]:
from pandas import DataFrame
from datetime import datetime, timedelta
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy import stats
import statsmodels.api as sm
from itertools import product
from math import sqrt
from sklearn.metrics import mean_squared_error 

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# These look like xkcd colour names; the list is not referenced in the visible
# cells -- presumably meant for a seaborn palette (TODO confirm or remove).
colors = ["windows blue", "amber", "faded green", "dusty purple"]
# Default figure size and title / label / tick font sizes for later plots.
sns.set(rc={"figure.figsize": (20,10), "axes.titlesize" : 18, "axes.labelsize" : 12, 
            "xtick.labelsize" : 14, "ytick.labelsize" : 14 })

# Stationarity check and Seasonal decomposition

In [None]:
# Additive seasonal decomposition of the raw training prices.
raw_decomposition = seasonal_decompose(train.y, model='additive')
raw_decomposition.plot()

# Second element of the adfuller result is the p-value.
raw_adf_pvalue = adfuller(train.y)[1]
print("Dickey–Fuller test: p=%f" % raw_adf_pvalue)

The p-value indicates that the series is not stationary.

# Box-Cox transformation

In [None]:
# Box-Cox transform the training prices; `lmbda` is the fitted exponent and is
# needed later to invert the transform.
boxcox_values, lmbda = stats.boxcox(train.y)
train['y_box'] = boxcox_values

# Repeat the decomposition and the ADF test on the transformed series.
boxcox_decomposition = seasonal_decompose(train.y_box, model='additive')
boxcox_decomposition.plot()
print("Dickey–Fuller test: p=%f" % adfuller(train.y_box)[1])

The p-value indicates that the series is stationary.

In [None]:
# Autocorrelation (top panel) and partial autocorrelation (bottom panel) of
# the Box-Cox series, used for a first guess at the model orders.
boxcox_array = train.y_box.values.squeeze()

for panel, draw in ((211, plot_acf), (212, plot_pacf)):
    ax = plt.subplot(panel)
    draw(boxcox_array, lags=16, ax=ax)

plt.tight_layout()

# Differencing 
## d=1

In [None]:
# First difference of the Box-Cox series (each value minus the previous one);
# the first row has no predecessor and is therefore NaN.
train['y_box_1d'] = train.y_box.diff()

train.head()

In [None]:
# Box-Cox series on top, its first difference underneath.
fig, ax_arr = plt.subplots(nrows=2, ncols=1)
level_ax, diff_ax = ax_arr

level_ax.plot(train.y_box)
diff_ax.plot(train.y_box_1d)

plt.tight_layout();

In [None]:
# Seasonal decomposition (statsmodels' moving-average `seasonal_decompose`) of
# the differenced series. The first row is skipped because differencing left
# it without a value.
differenced = train.y_box_1d[1:]

seasonal_decompose(differenced).plot()
print("Dickey–Fuller test: p=%f" % adfuller(differenced)[1])

In [None]:
# Autocorrelation (top panel) and partial autocorrelation (bottom panel) of
# the differenced Box-Cox series, skipping its empty first row.
differenced_array = train.y_box_1d[1:].values.squeeze()

for panel, draw in ((211, plot_acf), (212, plot_pacf)):
    ax = plt.subplot(panel)
    draw(differenced_array, lags=16, ax=ax)

plt.tight_layout()

The p-value indicates that the series is stationary, since the computed p-value is lower than the significance level $\alpha=0.01$.

There are not many spikes in the plots outside the insignificant zone (shaded) so there may not be enough information available in the residuals to be extracted by AR and MA models.

There may be a seasonal component available in the residuals at the lags of 4 weeks represented by spikes at these intervals. But probably not significant.

# ARIMA Model
AutoRegressive Integrated Moving Average

ARIMA models are denoted with the notation $ARIMA(p, d, q)$. These parameters account for seasonality, trend, and noise in datasets:

* $p$ - the number of lag observations to include in the model, or lag order. ($AR$)
* $d$ - the number of times that the raw observations are differenced, or the degree of differencing. ($I$)
* $q$ - the size of the moving average window, also called the order of moving average.($MA$)

A linear regression model is constructed including the specified number and type of terms, and the data is prepared by a degree of differencing in order to make it stationary, i.e. to remove trend and seasonal structures that negatively affect the regression model. A value of 0 for a parameter indicates to not use that element of the model.

### Parameter Selection
We will iteratively explore different combinations of parameters. For each combination we fit a new ARIMA model with $SARIMAX()$ and assess its overall quality.

We will use the **AIC** (Akaike Information Criterion) value, returned with ARIMA models fitted using `statsmodels`.

The AIC measures how well a model fits the data while taking into account the overall complexity of the model. A model that fits the data very well while using lots of features will be assigned a larger AIC score than a model that uses fewer features to achieve the same goodness-of-fit. Therefore, we are interested in finding the model that yields the **lowest AIC value**.



In [None]:
# Initial approximation of parameters
# Candidate AR orders p: 0 and 1.
ps = range(0, 2)
# Degree of differencing, fixed for every candidate.
d = 1
# Candidate MA orders q: 0 and 1.
qs = range(0, 2)

# Every (p, q) pairing; materialised as a list so it can be iterated repeatedly.
parameters = product(ps, qs)
parameters_list = list(parameters)
# Number of candidate models (displayed as the cell output).
len(parameters_list)

In [None]:
%%time 

# Model selection: fit one SARIMAX(p, d, q) per candidate on the Box-Cox
# series and keep the fit with the lowest AIC.
results = []
best_aic = float("inf")
# Start from a clean slate so that a search in which every fit fails cannot
# silently leave a model from an earlier run in `best_model`.
best_model = None
best_param = None
warnings.filterwarnings('ignore')
for param in parameters_list:
    try:
        model = SARIMAX(train.y_box, order=(param[0], d, param[1])).fit(disp=-1)
    except (ValueError, np.linalg.LinAlgError):
        # Some orders cannot be estimated on this series; report and skip.
        print('bad parameter combination:', param)
        continue
    aic = model.aic
    results.append([param, aic])
    if aic < best_aic:
        best_model, best_aic, best_param = model, aic, param

In [None]:
# Rank every fitted candidate by AIC (lowest first) and show the top rows.
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']

top_models = result_table.sort_values('aic').head()
print(top_models)

Note the AICs are negative but this is not a problem.

Usually, AIC is positive; however, it can be shifted by any additive constant, and some shifts can result in negative values of AIC. It is not the absolute size of the AIC value, it is the relative values over the set of models considered, and particularly the differences between AIC values, that are important.

In [None]:
# Print the statsmodels summary of the current `best_model`.
print(best_model.summary())

**Analysis of Results**

The coef column shows the weight (i.e. importance) of each feature and how each one impacts the time series. The P>|z| column informs us of the significance of each feature weight. Here, each weight has a p-value lower or close to 0.05, so it is reasonable to retain all of them in our model.

When fitting seasonal ARIMA models (and any other models for that matter), it is important to run model diagnostics to ensure that none of the assumptions made by the model have been violated. The `plot_diagnostics` method allows us to quickly generate model diagnostics and investigate any unusual behavior.

In [None]:
# ADF p-value of the model residuals, skipping the first 13 values.
# NOTE(review): the offset 13 is not explained anywhere in this notebook; the
# model fitted in the search cell has d=1 and no seasonal term -- confirm how
# many start-up residuals actually need to be dropped.
print("Dickey–Fuller test:: p=%f" % adfuller(best_model.resid[13:])[1])

In [None]:
# Draw the residual diagnostic plots of the current `best_model`.
best_model.plot_diagnostics(figsize=(15, 12))
plt.show()

Our primary concern is to ensure that the residuals of our model are uncorrelated and normally distributed with zero-mean. If the seasonal ARIMA model does not satisfy these properties, it is a good indication that it can be further improved.

In the histogram (top right), the $KDE$ line should follow the $N(0,1)$ line (normal distribution with mean 0, standard deviation 1) closely. This is an indication whether the residuals are normally distributed or not.

In the Q-Q-plot the ordered distribution of residuals (blue dots) should follow the linear trend of the samples taken from a standard normal distribution with $N(0, 1)$. Again, this is an indication whether the residuals are normally distributed.

The standardized residual plot doesn't display any obvious seasonality.

This is confirmed by the autocorrelation plot, which shows that the time series residuals have low correlation with lagged versions of itself.

## Prediction

In [None]:
# Inverse Box-Cox Transformation Function
def invboxcox(y,lmbda):
    """Map Box-Cox transformed values back to the original scale.

    Parameters
    ----------
    y : scalar or array-like
        Values on the Box-Cox scale.
    lmbda : float
        The Box-Cox exponent that was used for the forward transform.

    Returns
    -------
    Same shape as `y`: ``exp(y)`` when `lmbda` is 0, otherwise
    ``exp(log(lmbda * y + 1) / lmbda)``.
    """
    # Non-zero lambda: undo the power transform.
    if lmbda != 0:
        return np.exp(np.log(lmbda * y + 1) / lmbda)
    # Lambda of exactly zero means the forward transform was a plain log.
    return np.exp(y)

In [None]:
# Forecast one step per held-out row on the Box-Cox scale, map the result back
# to dollars with the stored lambda and keep two decimals. The horizon follows
# the size of `test` instead of repeating the split size as a literal.
test['yhat_ARIMA'] = np.round(invboxcox(best_model.forecast(len(test)), lmbda), 2)

test.tail()

In [None]:
# Held-out actual prices against the ARIMA forecast.
test.y.plot(linewidth=3)
# The plotted series is a price, so label it as one.
test.yhat_ARIMA.plot(color='r', ls='--', label='Predicted Price', linewidth=3)

plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state and
# so switches off a grid that the active seaborn style has already enabled.
plt.grid(True)
plt.title('Price - weekly forecast')
plt.ylabel('$');

In [None]:
# Forecast error in dollars (actual minus predicted), stored as a column.
test['e'] = test.y - test.yhat_ARIMA

# Root mean squared error and mean absolute percentage error over the hold-out.
squared_error = test.e ** 2
percent_error = 100 * test.e / test.y
rmse = np.sqrt(np.mean(squared_error)).round(2)
mape = np.round(np.mean(np.abs(percent_error)), 0)

print('RMSE = $', rmse)
print('MAPE =', mape, '%')

# ARIMA with constant and linear trend (`trend='ct'`)

In [None]:
%%time 

# Model selection: fit one SARIMAX(p, d, q) per candidate with trend='ct'
# (a constant plus a linear time trend) and keep the lowest-AIC fit.
results = []
best_aic = float("inf")
# Start from a clean slate so that a search in which every fit fails cannot
# silently reuse the model selected in the previous section.
best_model = None
best_param = None
warnings.filterwarnings('ignore')
for param in parameters_list:
    try:
        model = SARIMAX(train.y_box, order=(param[0], d, param[1]), trend='ct').fit(disp=-1)
    except (ValueError, np.linalg.LinAlgError):
        # Some orders cannot be estimated on this series; report and skip.
        print('bad parameter combination:', param)
        continue
    aic = model.aic
    results.append([param, aic])
    if aic < best_aic:
        best_model, best_aic, best_param = model, aic, param

In [None]:
# Rank every fitted candidate by AIC (lowest first) and show the top rows.
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']

top_models = result_table.sort_values('aic').head()
print(top_models)

In [None]:
# Print the statsmodels summary of the current `best_model`.
print(best_model.summary())

In [None]:
# ADF p-value of the model residuals, skipping the first 13 values.
# NOTE(review): the offset 13 is not explained anywhere in this notebook; the
# model fitted in the search cell has d=1 and no seasonal term -- confirm how
# many start-up residuals actually need to be dropped.
print("Dickey–Fuller test:: p=%f" % adfuller(best_model.resid[13:])[1])

In [None]:
# Draw the residual diagnostic plots of the current `best_model`.
best_model.plot_diagnostics(figsize=(15, 12))
plt.show()

In [None]:
# Forecast one step per held-out row on the Box-Cox scale, map the result back
# to dollars with the stored lambda and keep two decimals. The horizon follows
# the size of `test` instead of repeating the split size as a literal.
test['yhat_ARIMAct'] = np.round(invboxcox(best_model.forecast(len(test)), lmbda), 2)

test.tail()

In [None]:
# Held-out actual prices against the forecast of the ARIMA model with trend.
test.y.plot(linewidth=3)
# The plotted series is a price, so label it as one.
test.yhat_ARIMAct.plot(color='r', ls='--', label='Predicted Price', linewidth=3)

plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state and
# so switches off a grid that the active seaborn style has already enabled.
plt.grid(True)
plt.title('Price - weekly forecast')
plt.ylabel('$');

In [None]:
# Forecast error in dollars (actual minus predicted), stored as a column.
test['e'] = test.y - test.yhat_ARIMAct

# Root mean squared error and mean absolute percentage error over the hold-out.
squared_error = test.e ** 2
percent_error = 100 * test.e / test.y
rmse = np.sqrt(np.mean(squared_error)).round(2)
mape = np.round(np.mean(np.abs(percent_error)), 0)

print('RMSE = $', rmse)
print('MAPE =', mape, '%')

# SARIMAX
Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors model

When dealing with seasonal effects, we make use of the seasonal $ARIMA$, which is denoted as $ARIMA(p,d,q)(P,D,Q)_s$. Here, $(p, d, q)$ are the non-seasonal parameters described above, while $(P, D, Q)$ follow the same definition but are applied to the seasonal component of the time series. The term $s$ is the periodicity of the time series, i.e. the number of observations in one seasonal cycle (for a yearly cycle: 4 for quarterly data, 12 for monthly data, 52 for weekly data).

### Parameter Selection
We repeat the same parameter-selection process for seasonal ARIMA. For each combination we fit a new seasonal ARIMA model with $SARIMAX()$ and assess its overall quality.

In [None]:
%%time 

# Candidate orders for the seasonal search: p, q and P take 0..2, Q takes 0..1,
# with one regular (d) and one seasonal (D) difference.
Qs = range(0, 2)
qs = range(0, 3)
Ps = range(0, 3)
ps = range(0, 3)
D=1
d=1
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)

# Seasonal period passed to SARIMAX: 52 observations per cycle, i.e. a yearly
# cycle on weekly data.
i = 52

# Model selection: fit one seasonal model per candidate and keep the fit with
# the lowest AIC.
results = []
best_aic = float("inf")
# Start from a clean slate so that a search in which every fit fails cannot
# silently reuse the model selected in the previous section.
best_model = None
best_param = None
warnings.filterwarnings('ignore')
for param in parameters_list:
    try:
        model = SARIMAX(train.y_box, order=(param[0], d, param[1]), seasonal_order=(param[2], D, param[3], i)).fit(disp=-1)
    except (ValueError, np.linalg.LinAlgError):
        # Some orders cannot be estimated on this series; report and skip.
        print('bad parameter combination:', param)
        continue
    aic = model.aic
    results.append([param, aic])
    if aic < best_aic:
        best_model, best_aic, best_param = model, aic, param

In [None]:
# Rank every fitted candidate by AIC (lowest first), show the top rows, then
# print the summary of the selected model.
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']

top_models = result_table.sort_values('aic').head()
print(top_models)
print(best_model.summary())

In [None]:
# ADF p-value of the model residuals after skipping the first i+1 values,
# where i is the seasonal period set in the search cell.
print("Dickey–Fuller test:: p=%f" % adfuller(best_model.resid[i+1:])[1])

In [None]:
# Draw the residual diagnostic plots of the current `best_model`.
best_model.plot_diagnostics(figsize=(15, 12))
plt.show()

In [None]:
# Residual check for the selected model: residuals over time (top panel) and
# their autocorrelation (bottom panel). The first i+1 residuals are left out.
residual_tail = best_model.resid[i+1:]

plt.subplot(211)
residual_tail.plot()
plt.ylabel(u'Residuals')

ax = plt.subplot(212)
plot_acf(residual_tail.values.squeeze(), lags=i, ax=ax)

print("Dickey–Fuller test:: p=%f" % adfuller(residual_tail)[1])

plt.tight_layout()

In [None]:
# Forecast one step per held-out row on the Box-Cox scale, map the result back
# to dollars with the stored lambda and keep two decimals. The horizon follows
# the size of `test` instead of repeating the split size as a literal.
test['yhat_SARIMA'] = np.round(invboxcox(best_model.forecast(len(test)), lmbda), 2)

test.tail()

In [None]:
# Held-out actual prices against the seasonal ARIMA forecast.
test.y.plot(linewidth=3)
# The plotted series is a price, so label it as one.
test.yhat_SARIMA.plot(color='r', ls='--', label='Predicted Price', linewidth=3)

plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state and
# so switches off a grid that the active seaborn style has already enabled.
plt.grid(True)
plt.title('Price - weekly forecast')
plt.ylabel('$');

In [None]:
# Forecast error in dollars (actual minus predicted), stored as a column.
test['e'] = test.y - test.yhat_SARIMA

# Root mean squared error and mean absolute percentage error over the hold-out.
squared_error = test.e ** 2
percent_error = 100 * test.e / test.y
rmse = np.sqrt(np.mean(squared_error)).round(2)
mape = np.round(np.mean(np.abs(percent_error)), 0)

print('RMSE = $', rmse)
print('MAPE =', mape, '%')

# SARIMA with constant and linear trend (`trend='ct'`)

In [None]:
%%time 

# Candidate orders for the seasonal search: p, q and P take 0..2, Q takes 0..1,
# with one regular (d) and one seasonal (D) difference.
Qs = range(0, 2)
qs = range(0, 3)
Ps = range(0, 3)
ps = range(0, 3)
D=1
d=1
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)

# Seasonal period passed to SARIMAX: 52 observations per cycle, i.e. a yearly
# cycle on weekly data.
i = 52

# Model selection: fit one seasonal model per candidate with trend='ct'
# (a constant plus a linear time trend) and keep the lowest-AIC fit.
results = []
best_aic = float("inf")
# Start from a clean slate so that a search in which every fit fails cannot
# silently reuse the model selected in the previous section.
best_model = None
best_param = None
warnings.filterwarnings('ignore')
for param in parameters_list:
    try:
        model = SARIMAX(train.y_box, order=(param[0], d, param[1]), seasonal_order=(param[2], D, param[3], i), trend='ct').fit(disp=-1)
    except (ValueError, np.linalg.LinAlgError):
        # Some orders cannot be estimated on this series; report and skip.
        print('bad parameter combination:', param)
        continue
    aic = model.aic
    results.append([param, aic])
    if aic < best_aic:
        best_model, best_aic, best_param = model, aic, param

In [None]:
# Rank every fitted candidate by AIC (lowest first), show the top rows, then
# print the summary of the selected model.
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']

top_models = result_table.sort_values('aic').head()
print(top_models)
print(best_model.summary())

In [None]:
# ADF p-value of the model residuals after skipping the first i+1 values,
# where i is the seasonal period set in the search cell.
print("Dickey–Fuller test:: p=%f" % adfuller(best_model.resid[i+1:])[1])

In [None]:
# Draw the residual diagnostic plots of the current `best_model`.
best_model.plot_diagnostics(figsize=(15, 12))
plt.show()

In [None]:
# Residual check for the selected model: residuals over time (top panel) and
# their autocorrelation (bottom panel). The first i+1 residuals are left out.
residual_tail = best_model.resid[i+1:]

plt.subplot(211)
residual_tail.plot()
plt.ylabel(u'Residuals')

ax = plt.subplot(212)
plot_acf(residual_tail.values.squeeze(), lags=i, ax=ax)

print("Dickey–Fuller test:: p=%f" % adfuller(residual_tail)[1])

plt.tight_layout()

In [None]:
# Forecast one step per held-out row on the Box-Cox scale, map the result back
# to dollars with the stored lambda and keep two decimals. The horizon follows
# the size of `test` instead of repeating the split size as a literal.
test['yhat_SARIMAct'] = np.round(invboxcox(best_model.forecast(len(test)), lmbda), 2)

test.tail()

In [None]:
# Held-out actual prices against the forecast of the seasonal model with trend.
test.y.plot(linewidth=3)
# The plotted series is a price, so label it as one.
test.yhat_SARIMAct.plot(color='r', ls='--', label='Predicted Price', linewidth=3)

plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state and
# so switches off a grid that the active seaborn style has already enabled.
plt.grid(True)
plt.title('Price - weekly forecast')
plt.ylabel('$');

In [None]:
# Forecast error in dollars (actual minus predicted), stored as a column.
test['e'] = test.y - test.yhat_SARIMAct

# Root mean squared error and mean absolute percentage error over the hold-out.
squared_error = test.e ** 2
percent_error = 100 * test.e / test.y
rmse = np.sqrt(np.mean(squared_error)).round(2)
mape = np.round(np.mean(np.abs(percent_error)), 0)

print('RMSE = $', rmse)
print('MAPE =', mape, '%')

# Drop the scratch error column so `test` holds only actuals and forecasts.
del test['e']

# Results so far

In [None]:
# Display the held-out frame: actual prices plus the forecast columns added above.
test

In [None]:
# All four forecasts overlaid on the held-out actual prices.
test.y.plot(linewidth=3)

test.yhat_ARIMA.plot(color='r', ls='--', label='ARIMA forecast', linewidth=3)
test.yhat_ARIMAct.plot(color='r', ls=':', label='ARIMA constant trend', linewidth=3)
test.yhat_SARIMA.plot(color='grey', ls='--', label='SARIMA', linewidth=3)
test.yhat_SARIMAct.plot(color='grey', ls=':', label='SARIMA constant trend', linewidth=3)

plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state and
# so switches off a grid that the active seaborn style has already enabled.
plt.grid(True)
plt.title('Price - weekly forecast')
plt.ylabel('$');

# Upnext...
## Do avocados and bread go together? Let's find out!