In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style='background:lightblue; border:0; color:black'><center>Time series analysis for predicting daily sales</center></h1>

Throughout this notebook we focus on analysing the total number of items sold by a company. Instead of predicting the monthly sales of a particular product in a particular store, we try to predict the total number of items sold per day across all the stores.

In the first part we take a classical approach, using ARIMA models. In the second part we use a slightly different methodology based on Recurrent Neural Networks (RNN), which were widely used in COVID-19 analyses.

In [None]:
import math
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil.parser import parse
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import TimeseriesGenerator
from scipy import stats
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

In [None]:
# Silence every warning for the rest of the notebook so the output stays
# readable. NOTE: this also hides deprecation notices; comment it out while
# debugging.
import warnings
warnings.filterwarnings("ignore")

# Single place to change if the competition data is mounted elsewhere.
DATA_DIR = "../input/competitive-data-science-predict-future-sales"

# Load every table shipped with the competition.
sales = pd.read_csv(f"{DATA_DIR}/sales_train.csv")
item_cat = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
item = pd.read_csv(f"{DATA_DIR}/items.csv")
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
shops = pd.read_csv(f"{DATA_DIR}/shops.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

The data used in this analysis comes from the Kaggle competition Predict Future Sales.

In [None]:
# Preview the first rows of the raw sales table.
sales.head()

In [None]:
# Parse the 'date' column as day.month.year into datetime values.
sales['date'] = pd.to_datetime(sales['date'],format = '%d.%m.%Y')


As mentioned in the introduction, we focus only on the total number of items sold, so the next step is to sum over all the products, grouping by day.

In [None]:
# Total number of items sold per date, summed over every row of `sales`.
# The float cast is now part of the chain: the original `ts.astype('float')`
# statement discarded its result and therefore had no effect.
ts = (
    sales.groupby(["date"])["item_cnt_day"]
    .sum()
    .astype("float")
    .to_frame()
    .reset_index()
)
# Make sure the date column is datetime-typed.
ts["date"] = pd.to_datetime(ts["date"])



Our time series has the following form:

In [None]:
# Draw the daily totals on the original scale against the calendar date
# (the original call plotted them against the row position).
fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
ax.plot(ts['date'], ts['item_cnt_day'])

# Decoration
ax.set_ylim(50, 15000)
plt.xticks(rotation=0, horizontalalignment='center', alpha=.7)
plt.yticks(alpha=.7)
ax.set_title("Nº of sold items in a day from 2013 - 2016")
ax.set_xlabel("Date")
ax.set_ylabel("Items sold")
ax.grid(axis='both', alpha=.3)

# Fade the frame: hide the top/right borders, lighten the bottom/left ones.
for side, border_alpha in (("top", 0.0), ("bottom", 0.3), ("right", 0.0), ("left", 0.3)):
    ax.spines[side].set_alpha(border_alpha)
plt.show()

As we can see, our time series is very noisy. To reduce that noise we apply a Box-Cox transform.

<h2 style='background:lightblue; border:0; color:black'><center>Box-Cox transform</center></h2>


The Box-Cox transformation is defined as

$$ y_{i}^{(\lambda)} = \begin{cases}
\dfrac{y_{i}^{\lambda}-1}{\lambda}, & \lambda \neq 0 \\
\ln(y_{i}), & \lambda = 0
\end{cases} $$

We use the `boxcox` function from the `scipy.stats` module to compute both the transform and the $\lambda$ parameter.

In [None]:
# Display the raw Box-Cox output: the transformed values together with the
# fitted lambda (scipy requires strictly positive input data).
stats.boxcox(ts['item_cnt_day'])

In [None]:
# Store the transformed series and keep the fitted lambda as well: without it
# the transform cannot be inverted to map forecasts back to item counts.
ts['item_box'], boxcox_lambda = stats.boxcox(ts['item_cnt_day'])

In [None]:
# Draw the Box-Cox transformed series against the calendar date
# (the original call plotted it against the row position).
fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
ax.plot(ts['date'], ts['item_box'])

# Decoration
ax.set_ylim(2.525, 2.7)
plt.xticks(rotation=0, horizontalalignment='center', alpha=.7)
plt.yticks(alpha=.7)
# The original title was copied from the raw-series plot; say what is shown.
ax.set_title("Box-Cox transformed nº of sold items in a day from 2013 - 2016")
ax.set_xlabel("Date")
ax.set_ylabel("Box-Cox transformed items sold")
ax.grid(axis='both', alpha=.3)

# Fade the frame: hide the top/right borders, lighten the bottom/left ones.
for side, border_alpha in (("top", 0.0), ("bottom", 0.3), ("right", 0.0), ("left", 0.3)):
    ax.spines[side].set_alpha(border_alpha)
plt.show()

Our series still has some noise, but it is slightly better than before.
The next step is to study the autocorrelation and partial autocorrelation functions.

In [None]:
# ACF (left) and PACF (right) of the Box-Cox transformed series.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4), dpi=80)
box_values = ts.item_box.tolist()
plot_acf(box_values, ax=ax1, lags=100)
plot_pacf(box_values, ax=ax2, lags=80)

# Decorate both panels the same way: lighter borders, larger tick labels.
for panel in (ax1, ax2):
    for side in ("top", "bottom", "right", "left"):
        panel.spines[side].set_alpha(.3)
    panel.tick_params(axis='both', labelsize=12)
plt.show()

The autocorrelation and partial autocorrelation functions suggest that the time series has both a seasonal and a trend component. This can be seen more clearly in the next plot.

In [None]:
# Use the date column as the index of the frame (the column itself is kept).
ts.set_index(ts['date'], inplace=True)

# Split the transformed series into trend, seasonal and residual parts
# using a multiplicative model.
result = seasonal_decompose(ts['item_box'], model='multiplicative')

# Draw the decomposition on a square canvas.
plt.rcParams['figure.figsize'] = (10, 10)
decomposition_fig = result.plot()
decomposition_fig.suptitle('Time Series Decomposition of sold items')
plt.show()

<h2 style='background:lightblue; border:0; color:black'><center>The ARIMA model</center></h2>


We know that the time series has a trend component. The first thing we have to do is remove that trend. To do this we difference the series with lag 1.

In [None]:
# Lag-1 difference of every column; the first row has no predecessor
# (all NaN), so it is cut off.
diff = ts.diff().iloc[1:]

In [None]:
# Decompose the differenced series, this time with an additive model.
result = seasonal_decompose(diff['item_box'], model='additive')

# Draw the decomposition on a square canvas.
plt.rcParams['figure.figsize'] = (10, 10)
decomposition_fig = result.plot()
decomposition_fig.suptitle('Time Series Decomposition of sold items')
plt.show()

The trend is gone. Let us now see what happens with the autocorrelation and partial autocorrelation functions.

In [None]:
# ACF (left) and PACF (right) of the lag-1 differenced series.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4), dpi=80)
diff_values = diff.item_box.tolist()
plot_acf(diff_values, ax=ax1, lags=100)
plot_pacf(diff_values, ax=ax2, lags=80)

# Decorate both panels the same way: lighter borders, larger tick labels.
for panel in (ax1, ax2):
    for side in ("top", "bottom", "right", "left"):
        panel.spines[side].set_alpha(.3)
    panel.tick_params(axis='both', labelsize=12)
plt.show()

It is clear that our time series has a seasonality with lag 7, which makes sense given that this lag equals the number of days in a week. To remove it we apply a seasonal difference with lag 7.

In [None]:
# Lag-7 (weekly) difference of the already differenced data; the first seven
# rows have no value seven steps back, so they are cut off.
diff_s = diff.diff(7).iloc[7:]

In [None]:
# ACF (left) and PACF (right) after both the lag-1 and the lag-7 difference.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4), dpi=80)
diff_s_values = diff_s.item_box.tolist()
plot_acf(diff_s_values, ax=ax1, lags=100)
plot_pacf(diff_s_values, ax=ax2, lags=80)

# Decorate both panels the same way: lighter borders, larger tick labels.
for panel in (ax1, ax2):
    for side in ("top", "bottom", "right", "left"):
        panel.spines[side].set_alpha(.3)
    panel.tick_params(axis='both', labelsize=12)
plt.show()

Now we have a series with neither trend nor seasonality. Next we try to find the right parameters for the ARIMA model. To do this we write a simple function that ranks candidate parameters by their AIC score.

In [None]:
def SARIMA_PARAMETERS(parameters,time_serie):
    """
    Fit one SARIMA model per parameter combination and rank them by AIC.

    parameters -> iterable of SARIMA parameter tuples (p,d,q,P,D,Q,s)
    time_serie -> series to model; passed unchanged to SARIMAX

    Returns a DataFrame with the columns '(p,d,q)x(P,D,Q)s' and 'AIC', sorted
    by ascending AIC (lowest first) with a fresh 0..n-1 index. Combinations
    whose fit raises an exception are skipped; if no combination can be
    fitted, the returned frame is empty but still has both columns.
    """
    scored = []

    for params in tqdm(parameters):
        try:
            fitted = SARIMAX(
                time_serie,
                order=(params[0], params[1], params[2]),
                seasonal_order=(params[3], params[4], params[5], params[6]),
            ).fit(disp=-1)
        except Exception:
            # Skip combinations that cannot be fitted. `Exception` (rather
            # than a bare `except`) lets KeyboardInterrupt through, so a
            # long grid search can still be stopped from the keyboard.
            continue

        scored.append([params, fitted.aic])

    # Passing `columns=` at construction keeps this valid for an empty list,
    # where assigning `.columns` afterwards would raise a length mismatch.
    ranking = pd.DataFrame(scored, columns=['(p,d,q)x(P,D,Q)s', 'AIC'])
    ranking = ranking.sort_values(by='AIC', ascending=True).reset_index(drop=True)

    return ranking

In [None]:
# Candidate orders for the grid search: the AR/MA orders (seasonal and
# non-seasonal) run over 0, 1, 2; both differencing orders are fixed at 1
# and the seasonal lag at 7.
p, q = range(3), range(3)
P, Q = range(3), range(3)
d, D = range(1, 2), range(1, 2)
s = range(7, 8)

# Cartesian product in the order (p, d, q, P, D, Q, s), materialised as a list.
parameters = product(p, d, q, P, D, Q, s)
parameters_list = [*parameters]


As we can see, each model is described by 7 parameters. p and q are the orders of the associated ARMA model, and P and Q are the orders of the seasonal component. d is the number of differences in the model, while D and s are the number of seasonal differences and the lag of the seasonal component. The reader may wonder why we bothered to determine the number of differences and the seasonal lag in the previous section. The answer is simple: once these are known, there are fewer parameters left to determine. Right now we have a total of 81 possible combinations, but that number grows incredibly fast if we have to estimate three more parameters.

In [None]:
# Rank every candidate on the first 1004 observations only; the rest is the
# hold-out window (the last 30 days according to the notebook text, which
# assumes the series has 1034 rows - TODO confirm).
results_df=SARIMA_PARAMETERS(parameters_list,ts['item_box'][0:1004])

We use the whole time series except the last 30 days, which we hold out to test our model.

In [None]:
# Candidates sorted by ascending AIC: the best-scoring model is the first row.
results_df

The best model is the $(1, 1, 1)(1, 1, 1)_{7}$ with an AIC score of -6257.

In [None]:
# Refit the selected model on the training window (first 1004 observations).
model = SARIMAX(ts['item_box'][0:1004], order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
# `disp=-1` silences the optimizer output; the original passed the misspelled
# keyword `dis=-1`, which never reached the `disp` parameter.
res = model.fit(disp=-1)
# Forecast object for the 30 steps after the training window.
fcast = res.get_forecast(30)

In [None]:
# Draw the residual diagnostics of the fitted model; the figure is bound to
# `a` so the cell does not echo its repr.
a=res.plot_diagnostics(figsize=(15,12))

As we can see in the plot, our residuals do not follow a normal distribution, but at least they are uncorrelated.

In [None]:
# In-sample fitted values of the SARIMA model, stored next to the data.
ts['arima_model'] = res.fittedvalues
# Predictions from the end of the training window to the end of the series.
forecast = res.predict(start=ts['item_box'][:1004].shape[0], end=ts['item_box'].shape[0]-1)
# Fitted values followed by the out-of-sample predictions. `Series.append`
# was removed in pandas 2.0; `pd.concat` is the supported replacement.
forecast = pd.concat([ts['arima_model'][:1004], forecast])

# Compare model and data on the training window (first 10 points skipped).
plt.figure(figsize=(18, 7.5))
plt.plot(forecast[10:1004], color='r', label='model')
plt.plot(ts['item_box'][10:1004], label='actual')

plt.legend()
plt.show()

The model is shown in red and the real data in blue. As we can see, the fit is quite good over the whole series. The next step is to compare the test data with the prediction.

In [None]:
# Horizontal positions 1..30, one per forecast step.
x = list(range(1, 31))

# Interval bounds at alpha=0.01 (i.e. a 99% interval), computed once, and the
# point predictions they are measured against.
interval_bounds = fcast.conf_int(alpha=0.01)
point_forecast = forecast[1004:].values

# Matplotlib expects [distance below, distance above] for asymmetric bars.
below = abs(interval_bounds['lower item_box'].values - point_forecast)
above = interval_bounds['upper item_box'].values - point_forecast
y_error = [below, above]


In [None]:
# SARIMA predictions (with interval bars) against the held-out observations.
plt.figure(figsize=(15, 7.5))

plt.errorbar(x,forecast[1004:].values,yerr=y_error,fmt='o', label='SARIMA forecast')
plt.errorbar(x,ts['item_box'][1004:].values, label='actual', fmt='o',color='r')

# The original set a label but never drew the legend, so it was invisible.
plt.xlabel("Forecast step")
plt.ylabel("Box-Cox transformed items sold")
plt.legend()
plt.show()

The prediction is shown in blue with a 99% confidence band (`alpha=0.01`), and the real data is shown in red.

<h2 style='background:lightblue; border:0; color:black'><center>Advanced methods</center></h2>


In [None]:
# Training window of the Box-Cox series, selected by column name (the
# original picked it by position with `take([2])`, which breaks silently if
# the column order changes).
train_box = ts[['item_box']].iloc[0:1004]

# Fit the scaler on the training window only. The original fitted it on the
# full series, so the min/max of the hold-out period leaked into training.
scaler = MinMaxScaler()
scaler.fit(train_box)
scaled_train_data = scaler.transform(train_box)


In [None]:
n_input = 12     # length of each input window fed to the network
n_features = 1   # one value per time step

# Sliding windows over the scaled training data: each sample holds `n_input`
# consecutive values, and the target is taken from the same series.
generator = TimeseriesGenerator(
    scaled_train_data,
    scaled_train_data,
    length=n_input,
    batch_size=1,
)

# One LSTM layer followed by a single-unit dense output, trained with MSE.
# NOTE(review): no random seed is set, so results vary between runs.
lstm_model = Sequential([
    LSTM(200, activation='relu', input_shape=(n_input, n_features)),
    Dense(1),
])
lstm_model.compile(optimizer='adam', loss='mse')

lstm_model.summary()

In [None]:
# Train on the windowed generator. `fit_generator` is deprecated and has been
# removed from recent Keras releases; `fit` accepts generators directly.
lstm_model.fit(generator, epochs=20)


In [None]:
# Training loss per epoch, read from the history recorded by the last fit.
losses_lstm = lstm_model.history.history['loss']

# Only one figure is created: the original opened a second one first, which
# rendered as an empty plot.
plt.figure(figsize=(12,4))
plt.xlabel("Epochs")
plt.ylabel("Loss")
# Derive the ticks from the number of recorded epochs instead of assuming 20.
plt.xticks(np.arange(0, len(losses_lstm) + 1, 1))
plt.plot(range(len(losses_lstm)),losses_lstm);

In [None]:
lstm_predictions_scaled = []

# Start from the last `n_input` scaled training values, shaped as one sample.
batch = scaled_train_data[-n_input:]
current_batch = batch.reshape((1, n_input, n_features))

# Predict 30 steps recursively: each prediction is appended to the window and
# the oldest value is dropped, so later steps are fed earlier predictions.
for step in range(30):
    lstm_pred = lstm_model.predict(current_batch)[0]
    lstm_predictions_scaled.append(lstm_pred)
    shifted_window = current_batch[:, 1:, :]
    current_batch = np.concatenate((shifted_window, [[lstm_pred]]), axis=1)

In [None]:
# Undo the min-max scaling so the predictions are on the same scale as the
# data the scaler was fitted on.
lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)

In [None]:
# One-column frame with the 30 LSTM predictions, indexed like the rows of
# `ts` from position 1004 onwards.
test_data = pd.DataFrame(
    {'LSTM_Predictions': lstm_predictions.reshape(30)},
    index=ts.index[1004:],
)

In [None]:
# LSTM predictions against the held-out observations.
plt.figure(figsize=(15, 7.5))
plt.errorbar(x,test_data['LSTM_Predictions'],fmt='o', label='LSTM forecast')
plt.errorbar(x,ts['item_box'][1004:], label='actual', fmt='o', color='r')

# The original set a label but never drew the legend, so it was invisible.
plt.xlabel("Forecast step")
plt.ylabel("Box-Cox transformed items sold")
plt.legend()
plt.show()

<h2 style='background:lightblue; border:0; color:black'><center>Comparison</center></h2>

In [None]:
def rmse(y_true, y_pred):
    """Return the root of the mean squared error between the two inputs.

    y_true -> observed values
    y_pred -> predicted values, in the same order as `y_true`
    """
    mean_squared = mse(y_true, y_pred)
    return math.sqrt(mean_squared)

In [None]:
# RMSE of the LSTM predictions on the hold-out window (rows from position
# 1004 onwards), measured on the Box-Cox transformed values.
rmse(ts['item_box'][1004:],test_data['LSTM_Predictions'])

In [None]:
# RMSE of the SARIMA predictions on the same hold-out window, for comparison
# with the LSTM score.
rmse(ts['item_box'][1004:],forecast[1004:])