In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import warnings
warnings.filterwarnings('ignore')

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [None]:
# Load the raw orders and derive the calendar features used throughout the notebook.
df = pd.read_csv('../input/sales-forecasting/train.csv')
df['Postal Code'] = df['Postal Code'].fillna(5401)

# NOTE(review): dates are parsed with pandas' default inference; if the CSV stores
# day-first dates an explicit `format=` / `dayfirst=` is needed — confirm against the file.
for date_col in ('Order Date', 'Ship Date'):
    df[date_col] = pd.to_datetime(df[date_col])


def _week_label(ts):
    """Return '<calendar year>/<ISO week number>' for a timestamp."""
    # NOTE(review): the calendar year is paired with the ISO week number, so a date whose
    # ISO week belongs to the neighbouring year (e.g. 31 Dec 2018 -> ISO week 1) receives
    # the same label as a different week of its calendar year. Left unchanged on purpose:
    # later cells hard-code series lengths that depend on the current labels.
    return f'{ts.year}/{ts.isocalendar()[1]}'


def _month_label(ts):
    """Return the 'Mon-YYYY' label of the month containing the timestamp."""
    return ts.strftime('%b-%Y')


order_dates = df['Order Date']
df['Order Week'] = order_dates.map(_week_label)
df['Order Year'] = order_dates.map(lambda ts: ts.year)
df['Order Day number'] = order_dates.map(lambda ts: ts.isocalendar()[2])   # ISO weekday, 1 = Monday
df['Order Month'] = order_dates.map(_month_label)
df['Lead_Time'] = (df['Ship Date'] - df['Order Date']).map(lambda delta: delta.days)

# Chronological order matters: later group-bys use sort=False and rely on it.
df = df.sort_values(by=['Order Date'])

In [None]:
# Preview only the first rows instead of dumping the whole frame.
df.head()

# Introduction

The goal of this step is to predict next week's sales. There are different ways to proceed: we can either predict day-by-day sales over a window of seven days or directly predict weekly sales. If we work on weeks, we'll get less variability but also fewer observations, since we will be grouping sales by week. From a different perspective, we can work on the global sales time series or on its subseries, for example predicting sales for each category or even each product. Working on different subseries might be beneficial if, for example, each subseries demonstrates a significant pattern such as seasonality.


In order to evaluate our models before predicting future sales, the data will be split into a training set containing the 2015/2016/2017 sales and a test set containing the 2018 sales.

# Weekly predictions

## Global Sales :

### Week sales Data frame and time series plot

In [None]:
# Aggregate sales per 'Order Week' label (format '<year>/<week number>'), keeping the
# first-appearance order of the labels (sort=False) rather than sorting them as strings.
weekly_sales = df.groupby('Order Week', sort=False)['Sales'].sum().reset_index()
# The year is the part of the label before the slash; it is kept as a string.
weekly_sales['Order Year'] = weekly_sales['Order Week'].str.split('/').str[0]



In [None]:
# Time series plot of the global weekly sales.
# The former `text='ds'` argument only attached a placeholder hover label to every
# point; it is dropped, and the figure gets a title and axis labels so it stands alone.
fig = go.Figure()
fig.add_trace(go.Scatter(x=weekly_sales['Order Week'],
                         y=weekly_sales['Sales'],
                         name='Weekly sales'))
fig.update_layout(title_text='Global weekly sales', title_x=0.5,
                  xaxis_title='Date (Year/Week number)', yaxis_title='Sales value')

### Trend detection HP-Filter

The HP filter removes a smooth trend, T, from the data x by solving

$$\min_{T}\;\sum_{t}\Big[(x_t - T_t)^2 + \lambda\,\big((T_{t+1} - T_t) - (T_t - T_{t-1})\big)^2\Big]$$

Lambda can be defined using the rule of thumb : 

Lambda = 100*(number of periods in a year)^2

In this respect, for:
Annual data = 100*1^2 = 100
Quarterly data = 100*4^2 = 1,600
Monthly data = 100*12^2 = 14,400
Weekly data = 100*52^2 = 270,400

An alternative is to use power 4 instead of 2 (See Ravn and Uhlig (2002)).

In [None]:
import statsmodels.api as sm

In [None]:
# Training series: weekly sales with year 2018 held out ('Order Year' is stored as a string).
Y = np.array(weekly_sales[weekly_sales['Order Year']!='2018']['Sales'])

# Smoothing parameter for weekly data from the rule of thumb 100 * (periods per year)^2.
# The previous literal `10052**2` (about 1.0e8) was missing the `*` of `100*52**2` (= 270,400).
HP_LAMBDA_WEEKLY = 100 * 52**2
cycle, trend = sm.tsa.filters.hpfilter(Y, HP_LAMBDA_WEEKLY)

# Plot the original series together with its two components.
fig = plt.figure(figsize=(12,6))
ax = fig.add_axes([0,0,1,1])
weeks = range(len(Y))
ax.plot(weeks, trend, label='Trend', c='r')
ax.plot(weeks, cycle, label='Cycle', c='g')
ax.plot(weeks, Y, label='Time series')
ax.legend()

==> In order to detect the presence of seasonality, we are going to subtract the trend we computed from the sales time series.

### Stationarity test

For stationarity test, we consider only cycle component (Y - trend)

In [None]:
# Detrended series; this is the same quantity as the `cycle` array returned by the HP filter.
Y_transformed = np.subtract(Y, trend)

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Augmented Dickey-Fuller test on the detrended series (null hypothesis: a unit root is present).
adf_results = adfuller(Y_transformed, maxlag=52)
adf_statistic, adf_pvalue, critical_values = adf_results[0], adf_results[1], adf_results[4]

print('ADF test results are :')
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_pvalue)
print('Critical Values:')
for key, value in critical_values.items():
    print('\t%s: %.3f' % (key, value))

# Decide against the 5% critical value computed for this sample rather than a
# hard-coded approximation (-2.9), so the verdict always matches the printed table.
if adf_statistic <= critical_values['5%']:
    print('==> Non-stationarity can be rejected')
else:
    print('==> Non-stationarity cannot be rejected')

==> Y_transformed is stationary

### Seasonality

In [None]:
import statsmodels.graphics.tsaplots as tsaplots

In [None]:
# Seasonal subseries plot: weekly sales grouped by month (years 2015/2016/2017).
monthly_subseries = pd.DataFrame(
    df[df['Order Year']!=2018].groupby(['Order Month','Order Week'],sort=False)['Sales'].sum())

# sort=False keeps the months in first-appearance order, i.e. chronological order for a
# date-sorted frame. The default sort=True orders the 'Mon-YYYY' labels alphabetically
# (Apr-..., Aug-..., Dec-...), so the tick indices would not follow the calendar.
monthly_groups = monthly_subseries.groupby('Order Month', sort=False)['Sales']
# One tick per month actually present instead of a hard-coded 1..36.
month_ticks = list(range(1, monthly_groups.ngroups + 1))

fig = plt.figure(figsize=(30,20))
axes = fig.add_axes([0,0,1,1])
tsaplots.seasonal_plot(monthly_groups, month_ticks, ax=axes)

plt.show()

==> Observations :  

    1/ Sales vary strongly within each month.
    2/ Years 2015 & 2016 are similar in terms of how the monthly mean sales value varies: at the beginning of the year, the sales value is fairly stable and tends to fluctuate around a certain value; in September (Index 9 & 21) it spikes, then drops sharply before increasing again until the end of the year.
    3/ Year 2017 sales values behave differently. Apparently, the main difference is that the September spike is brought forward to May (Index 29); the sales value then regains stability and starts increasing from September until the end of the year.


In [None]:
# Fourier transform of the cycle component (cycle = Y - trend), computed with scipy.
# The transform maps the series from the time domain to the frequency domain, which
# makes it possible to spot the dominant (high-intensity) components of the series.
from scipy import fftpack

y_fft = fftpack.fft(cycle)

# Derive every length from the data instead of hard-coding 157/158: matplotlib raises
# when x and y lengths differ, so the literals break as soon as the number of
# training weeks changes. For the current data the values below equal the old literals.
n_obs = len(cycle)
n_bins = round((n_obs - 1) / 2)     # number of spectrum values shown (first half)
scale = n_bins + 1

fr = scale * np.linspace(0, 1, n_bins)
y_m = 2 / scale * np.abs(y_fft[:n_bins])**2

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
ax[0].plot(range(len(Y)), Y)    # time domain
ax[0].set(title='Weekly sales (training set)', xlabel='Week index', ylabel='Sales')
# NOTE(review): `fr` is a rescaled bin position, not a period in weeks — confirm how
# the horizontal axis is meant to be read before using its values as lags.
ax[1].stem(fr, y_m)             # frequency domain
ax[1].set(title='Spectrum of the cycle component', xlabel='Frequency bin (rescaled)', ylabel='Power')

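==> There is no dominant spike; still, the amplitude reaches its maximum at period 37 (weeks) and a second maximum at period 75, which is approximately equal to 37*2... (Note: the horizontal axis of the spectrum plot is built from frequency-bin positions, so these readings should be double-checked by converting bin k to a period of N/k weeks, N being the length of the series.)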

### Autocorrelation Plots (ACF & PACF)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Partial autocorrelation (left panel) and autocorrelation (right panel) of the cycle component.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for draw_correlogram, panel in zip((plot_pacf, plot_acf), axes):
    draw_correlogram(cycle, lags=76, ax=panel)
fig.suptitle('Cycle composant')
plt.show()

Since there are no apparent spikes, these plots give no insight into which model to use. However, the previous plots gave us some good ideas: for example, the seasonal subseries plot shows that monthly sales tend towards a yearly seasonality, and this characteristic can be transferred to our weekly model as a seasonality with a period of 52 weeks.
On the other hand, we can integrate lags 37 & 75 as AR terms.

In [None]:
# Partial autocorrelation (left panel) and autocorrelation (right panel) of the trend component.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for draw_correlogram, panel in zip((plot_pacf, plot_acf), axes):
    draw_correlogram(trend, lags=76, ax=panel)
fig.suptitle('Trend Composant')
plt.show()

==> In order to predict future trend terms, the plots above indicate an AR(1) model.

### Modeling and training :

In [None]:
# Cyclic component: AR terms at lags 37 and 75, a seasonal AR(1) term of period 52
# and a constant; stationarity/invertibility constraints are switched off.
cycle_model = sm.tsa.statespace.SARIMAX(
    cycle,
    order=([37, 75], 0, 0),
    seasonal_order=(1, 0, 0, 52),
    trend='c',
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model1_fit = cycle_model.fit()



In [None]:
# Trend component: plain AR(1) with a constant and no seasonal part.
trend_model = sm.tsa.statespace.SARIMAX(
    trend,
    order=(1, 0, 0),
    seasonal_order=(0, 0, 0, 0),
    trend='c',
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model2_fit = trend_model.fit()

In [None]:
# In-sample predictions: the fitted values of the two component models are summed
# to rebuild a prediction of the sales series.
cycle_training_predictions, trend_training_predictions = (
    model1_fit.fittedvalues,
    model2_fit.fittedvalues,
)
sales_training_predictions = np.add(cycle_training_predictions, trend_training_predictions)

In [None]:
# Plot training predictions against the true values.
# The x ranges are derived from the data: the former literal 159 did not match the
# length of Y, so each trace received one more x value than it had y values.
# The first 75 fitted values are skipped; 75 is the largest AR lag of the cycle model.
burn_in = 75
weeks = np.arange(len(Y))

fig = go.Figure()
fig.add_trace(go.Scatter(x=weeks, y=Y, name='True values'))
fig.add_trace(go.Scatter(x=weeks[burn_in:], y=sales_training_predictions[burn_in:], name='Predicted Values'))

In [None]:
#Metrics on the training set (the first 75 fitted values are skipped, as in the plot)
from sklearn.metrics import mean_absolute_error,mean_squared_error

RMSE = np.sqrt(mean_squared_error(Y[75:],sales_training_predictions[75:]))
MAE = mean_absolute_error(Y[75:],sales_training_predictions[75:])

# Both figures are training metrics; the MAE was previously printed as a "Test" error.
print(f'Train Root Mean Squared Error = {RMSE}')
print(f'Train Mean Absolute Error = {MAE}')

### Evaluating the model 

In [None]:
def week_prediction(X,Xtest_len=12,cycle_order=(0,0,0),cycle_seasonal_order=(2,0,1,12),
                    hp_lambda=100*52**2,trend=None):
    """Walk-forward, one-step-ahead evaluation of the HP-filter + SARIMAX approach.

    The last ``Xtest_len`` observations of ``X`` form the test set. For each test
    step, the model is refitted on every observation before that step (so the true
    values of earlier test weeks are fed back in), the training slice is split into
    cycle and trend with the HP filter, one SARIMAX model is fitted per component,
    and the two one-step forecasts are summed.

    Parameters
    ----------
    X : array-like
        Sales series in chronological order.
    Xtest_len : int
        Number of trailing observations to predict; must satisfy 0 < Xtest_len < len(X).
    cycle_order, cycle_seasonal_order : tuple
        ``order`` and ``seasonal_order`` of the SARIMAX model fitted on the cycle.
    hp_lambda : float
        HP-filter smoothing parameter. Defaults to the weekly rule of thumb
        100 * 52**2; the former hard-coded ``10052**2`` was missing the ``*``.
    trend : str or None
        ``trend`` argument forwarded to both SARIMAX models (None = no deterministic
        term, as before). Pass ``'c'`` to evaluate the same specification as models
        that are fitted with a constant.

    Returns
    -------
    predictions : numpy.ndarray of shape (Xtest_len,)
    RMSE : float
    MAE : float

    Raises
    ------
    ValueError
        If ``Xtest_len`` leaves no test step or no training data.
    """
    n_obs = len(X)
    if not 0 < Xtest_len < n_obs:
        raise ValueError(f'Xtest_len must be between 1 and len(X)-1 ({n_obs - 1}), got {Xtest_len}')

    first_test = n_obs - Xtest_len
    predictions =[]
    for j in range(Xtest_len):
        # Training data grows by one true observation at every step.
        X_train = X[0:first_test+j]

        cycle, trend_component = sm.tsa.filters.hpfilter(X_train, hp_lambda)

        model1_fit =  sm.tsa.statespace.SARIMAX(cycle,order=cycle_order,seasonal_order=cycle_seasonal_order,
                            trend=trend,enforce_invertibility=False,enforce_stationarity=False).fit()

        model2_fit = sm.tsa.statespace.SARIMAX(trend_component,order=(1,0,0),seasonal_order=(0,0,0,0),
                            trend=trend,enforce_invertibility=False,enforce_stationarity=False).fit()

        # start == end == len(X_train): the single step right after the training slice.
        cycle_prediction_step = model1_fit.predict(start=len(X_train),end=len(X_train))
        trend_prediction_step = model2_fit.predict(start=len(X_train),end=len(X_train))
        predictions.append(cycle_prediction_step + trend_prediction_step)

    predictions = np.reshape(predictions,((Xtest_len),))
    actual = X[first_test:]
    RMSE = np.sqrt(mean_squared_error(actual,predictions))
    MAE = mean_absolute_error(actual,predictions)

    return predictions , RMSE , MAE

In [None]:
# Walk-forward evaluation of the global model on the last 52 weeks of the series.
warnings.filterwarnings('ignore')

X = weekly_sales['Sales'].to_numpy(copy=True)
global_evaluation = week_prediction(X, 52,
                                    cycle_order=([37, 75], 0, 0),
                                    cycle_seasonal_order=(1, 0, 0, 52))
predictions, RMSE, MAE = global_evaluation


In [None]:
# Metrics and plot for the held-out test weeks.
# These figures come from the walk-forward evaluation, so both are test errors;
# the RMSE was previously printed as a "Train" error.
print(f'Test Root Mean Squared Error = {RMSE}')
print(f'Test Mean Absolute Error = {MAE}')

# Size the axis from the predictions instead of repeating the literal 52.
test_len = len(predictions)
test_weeks = np.arange(test_len)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_weeks, y=X[-test_len:], name='True Values'))
fig.add_trace(go.Scatter(x=test_weeks, y=np.array(predictions), name='Predicted Values'))

### Predicting next week sales

In [None]:
#Train the model on all data
warnings.filterwarnings('ignore')
X = np.array(weekly_sales['Sales'])

# Weekly rule of thumb 100 * 52**2; the former literal `10052**2` was missing the `*`.
hp_lambda = 100 * 52**2
cycle, trend = sm.tsa.filters.hpfilter(X, hp_lambda)

model1_fit = sm.tsa.statespace.SARIMAX(cycle,order=([37,75],0,0),seasonal_order=(1,0,0,52),trend='c',enforce_invertibility=False,
                                      enforce_stationarity=False).fit()
model2_fit = sm.tsa.statespace.SARIMAX(trend,order=(1,0,0),seasonal_order=(0,0,0,0),trend='c',enforce_invertibility=False,
                                      enforce_stationarity=False).fit()
fitted_values = model1_fit.fittedvalues + model2_fit.fittedvalues

#Sales forecast for the weeks following the last observation:
# a single horizon constant replaces the `+3` / `range(0,4)` literals that had to agree.
horizon = 4
next_week_prediction = (model1_fit.predict(start=len(cycle),end=len(cycle)+horizon-1)
                        + model2_fit.predict(start=len(trend),end=len(trend)+horizon-1))
for week_ahead, value in enumerate(next_week_prediction, start=1):
    print(f' Week +{week_ahead} sales value prediction is equal to : {round(value,2)} (cur)')

In [None]:
#Plot results: last observed weeks, their fitted values, and the forecast.
# - The unused `np.exp(fitted_values)` is removed (no log transform is applied anywhere,
#   and exponentiating raw sales values overflows).
# - Axis positions are derived from the data instead of the literals 200/212/209/214,
#   so each trace always has as many x values as y values.
n_obs = len(X)
n_shown = min(12, n_obs)
recent_weeks = np.arange(n_obs - n_shown, n_obs)

# The forecast line starts at the last fitted value so it connects to the fitted curve.
forecast_weeks = np.arange(n_obs - 1, n_obs + len(next_week_prediction))
forecast_line = np.concatenate((np.array(fitted_values[-1]).reshape(1,), next_week_prediction))

fig = go.Figure()
fig.add_trace(go.Scatter(x=recent_weeks, y=X[-n_shown:], name='True Values'))
fig.add_trace(go.Scatter(x=recent_weeks, y=fitted_values[-n_shown:], name='Predicted Values'))
fig.add_trace(go.Scatter(x=forecast_weeks, y=forecast_line,
                         line = dict(color='red', width=4, dash='dash'),
                         name='Next weeks prediction'))

==> These results are not convincing: the test data plot shows that the model doesn't really fit the data and fails to predict sales spikes.

This is because we failed to find a significant seasonality or a relation with close lags. Also, the signal behavior changes drastically between years.

There are several ways of improvement : 

    1/ Improve the signal decomposition: we used an HP filter (lambda) to detrend the signal, hence tuning the lambda parameter might help. We can also use other decomposition methods such as moving averages or differencing the time series.
    2/ Predict monthly sales, then split them across weeks; working on months reduces variability and helps in finding seasonal patterns.
    3/ Instead of working on global sales, work on sales per subcategory that demonstrate significant seasonal patterns, and aggregate these sub-predictions to obtain the global sales prediction.
    4/ Collect more data...

## Sales per Categories :

In this part we follow the third area of improvement mentioned in the previous chapter. Instead of predicting global sales, we are going to predict sales per category and then aggregate them to obtain the global sales prediction.

### Week sales Data frame and time series plot


In [None]:
#Time series Data frame: one row per (category, week) pair, in first-appearance order
sales_category_week = pd.DataFrame(df.groupby(['Category','Order Week'],sort=False)['Sales'].sum())
sales_category_week.reset_index(inplace=True)

#Time series plot
# Each category is drawn against its OWN weeks. Previously every trace reused the
# 'Office Supplies' weeks as x values, which misaligns a category as soon as it has
# no sales in some week. The axis is declared categorical with an explicit week order
# so that all traces share one chronologically ordered axis.
week_order = list(pd.unique(df['Order Week']))

fig = go.Figure()
for category in ['Furniture', 'Office Supplies', 'Technology']:
    category_sales = sales_category_week[sales_category_week['Category']==category]
    fig.add_trace(go.Scatter(x=category_sales['Order Week'],
                             y=category_sales['Sales'],name=category))
# The plotted quantity is 'Sales' (the y axis was previously titled 'Profit value'),
# and the week labels are formatted year/week.
fig.update_layout(autosize=False,width=1200,height=600,title_x=0.5,title_text='Weekly sales per Category',
                 xaxis_title='Date (Year/Week number)',yaxis_title='Sales value')
fig.update_xaxes(type='category',categoryorder='array',categoryarray=week_order)

### Furniture weekly sales 

In [None]:
# Furniture weekly sales with the last 52 entries (the 2018 test period) left out.
# NOTE(review): `[:-52]` presumes Furniture has sales in exactly 52 distinct 2018 weeks — confirm.
is_furniture = sales_category_week['Category'] == 'Furniture'
Y = sales_category_week.loc[is_furniture, 'Sales'].to_numpy(copy=True)[:-52]
# First-order difference of the series, used to remove the trend.
Y_detrend = Y[1:] - Y[:-1]

In [None]:
# Plot the differenced series; the x range follows the data instead of the literal 156,
# which makes matplotlib raise as soon as the number of training weeks changes.
plt.plot(range(len(Y_detrend)),Y_detrend)
plt.title('Furniture Sales first order difference')
plt.show()

### Stationarity test

In [None]:
# Augmented Dickey-Fuller test on the differenced Furniture series (null hypothesis: unit root).
adf_results = adfuller(Y_detrend, maxlag=52)
adf_statistic, adf_pvalue, critical_values = adf_results[0], adf_results[1], adf_results[4]

print('ADF test results are :')
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_pvalue)
print('Critical Values:')
for key, value in critical_values.items():
    print('\t%s: %.3f' % (key, value))

# Decide against the 5% critical value computed for this sample rather than a
# hard-coded approximation (-2.9), so the verdict always matches the printed table.
if adf_statistic <= critical_values['5%']:
    print('==> Non-stationarity can be rejected')
else:
    print('==> Non-stationarity cannot be rejected')

### Fourier Transform

In [None]:
# Fourier transform of the differenced Furniture series, computed with scipy.
from scipy import fftpack

y_fft = fftpack.fft(Y_detrend)

# Derive every length from the data instead of hard-coding 156/157: matplotlib raises
# when x and y lengths differ, so the literals break as soon as the number of
# training weeks changes. For the current data the values below equal the old literals.
n_obs = len(Y_detrend)
n_bins = round(n_obs / 2)       # number of spectrum values shown (first half)

fr = n_bins * np.linspace(0, 1, n_bins)
y_m = 2 / n_bins * np.abs(y_fft[:n_bins])**2

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
ax[0].plot(range(len(Y)), Y)    # time domain
ax[0].set(title='Furniture weekly sales (training set)', xlabel='Week index', ylabel='Sales')
# NOTE(review): `fr` is a rescaled bin position, not a period in weeks — confirm how
# the horizontal axis is meant to be read before using its values as lags.
ax[1].stem(fr, y_m)             # frequency domain
ax[1].set(title='Spectrum of the differenced series', xlabel='Frequency bin (rescaled)', ylabel='Power')

==> Two spikes: one at a period of 73 weeks and, to a lesser degree, one at a period of 62 weeks.

### PACF & ACF 

In [None]:
# PACF (left) and ACF (right) of the differenced Furniture series.
# Both panels now use Y_detrend: the ACF was previously computed on `cycle`, a
# leftover variable from the global-sales section, so it described another series.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(15,6))
plot_pacf(Y_detrend,lags=76,ax=axes[0])
plot_acf(Y_detrend,lags=76,ax=axes[1])
fig.suptitle('Furniture sales first order difference')
plt.show()

### Modeling and training :

In [None]:
# Furniture model fitted on the undifferenced series (d=1 performs the differencing):
# AR lags 1, 52 and 64, one MA term, a seasonal AR(1) term of period 73 and a constant.
# NOTE(review): the narrative above mentions a 62-week period while the AR lag used here is 64 — confirm.
furniture_model = sm.tsa.statespace.SARIMAX(
    Y,
    order=([1, 52, 64], 1, [1]),
    seasonal_order=(1, 0, 0, 73),
    trend='c',
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model1_fit = furniture_model.fit()

In [None]:
#Predict training set :
sales_training_predictions = model1_fit.fittedvalues

#Plot training predictions
# The x ranges are derived from the data: the former literal 159 was copied from the
# global-sales section and does not match the length of the Furniture series.
# The first 73 fitted values are skipped; 73 is the seasonal period of the model.
burn_in = 73
weeks = np.arange(len(Y))

fig = go.Figure()
fig.add_trace(go.Scatter(x=weeks,y=Y,name='True values'))
fig.add_trace(go.Scatter(x=weeks[burn_in:],y=sales_training_predictions[burn_in:],name='Predicted Values'))

In [None]:
#Metrics on the training set (the first 73 fitted values are skipped, as in the plot)
from sklearn.metrics import mean_absolute_error,mean_squared_error

RMSE = np.sqrt(mean_squared_error(Y[73:],sales_training_predictions[73:]))
MAE = mean_absolute_error(Y[73:],sales_training_predictions[73:])

# Both figures are training metrics; the MAE was previously printed as a "Test" error.
print(f'Train Root Mean Squared Error = {RMSE}')
print(f'Train Mean Absolute Error = {MAE}')

### Evaluating the model 

In [None]:
#Same prediction method.
def week_prediction(X,Xtest_len=12,order=(0,0,0),seasonal_order=(2,0,1,12),trend=None):
    """Walk-forward, one-step-ahead evaluation of a single SARIMAX model.

    The last ``Xtest_len`` observations of ``X`` form the test set. For each test
    step the model is refitted on every observation before that step (so the true
    values of earlier test weeks are fed back in) and the next value is predicted.

    NOTE(review): this redefines ``week_prediction`` declared earlier in the notebook
    with a different signature (``order`` / ``seasonal_order`` instead of
    ``cycle_order`` / ``cycle_seasonal_order``); re-running the earlier call after
    this cell fails. Consider giving the two functions distinct names.

    Parameters
    ----------
    X : array-like
        Sales series in chronological order.
    Xtest_len : int
        Number of trailing observations to predict; must satisfy 0 < Xtest_len < len(X).
    order, seasonal_order : tuple
        Forwarded to ``SARIMAX``.
    trend : str or None
        ``trend`` argument forwarded to ``SARIMAX`` (None = no deterministic term, as
        before). Pass ``'c'`` to evaluate the same specification as a model that is
        fitted with a constant.

    Returns
    -------
    predictions : numpy.ndarray of shape (Xtest_len,)
    RMSE : float
    MAE : float

    Raises
    ------
    ValueError
        If ``Xtest_len`` leaves no test step or no training data.
    """
    n_obs = len(X)
    if not 0 < Xtest_len < n_obs:
        raise ValueError(f'Xtest_len must be between 1 and len(X)-1 ({n_obs - 1}), got {Xtest_len}')

    first_test = n_obs - Xtest_len
    predictions =[]
    for j in range(Xtest_len):
        # Training data grows by one true observation at every step.
        X_train = X[0:first_test+j]

        model1_fit =  sm.tsa.statespace.SARIMAX(X_train,order=order,seasonal_order=seasonal_order,
                            trend=trend,enforce_invertibility=False,enforce_stationarity=False).fit()

        # start == end == len(X_train): the single step right after the training slice.
        prediction_step = model1_fit.predict(start=len(X_train),end=len(X_train))
        predictions.append(prediction_step)

    predictions = np.reshape(predictions,((Xtest_len),))
    actual = X[first_test:]
    RMSE = np.sqrt(mean_squared_error(actual,predictions))
    MAE = mean_absolute_error(actual,predictions)

    return predictions , RMSE , MAE

In [None]:
# Walk-forward evaluation of the Furniture model on the last 52 entries of its weekly series.
is_furniture = sales_category_week['Category'] == 'Furniture'
X = sales_category_week.loc[is_furniture, 'Sales'].to_numpy(copy=True)
furniture_evaluation = week_prediction(X, 52,
                                       order=([1, 52, 64], 1, [1]),
                                       seasonal_order=(1, 0, 0, 73))
predictions, RMSE, MAE = furniture_evaluation

In [None]:
# Report the walk-forward test metrics, then compare the 52 test weeks with their predictions.
print(f'Test Root Mean Squared Error = {RMSE}')
print(f'Test Mean Absolute Error = {MAE}')

test_weeks = np.arange(52)
observed_tail = X[-52:]
predicted_tail = np.array(predictions)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_weeks, y=observed_tail, name='True Values'))
fig.add_trace(go.Scatter(x=test_weeks, y=predicted_tail, name='Predicted Values'))

In [None]:
# Refit the Furniture model on the full series (test weeks included).
furniture_full_model = sm.tsa.statespace.SARIMAX(
    X,
    order=([1, 52, 64], 1, [1]),
    seasonal_order=(1, 0, 0, 73),
    trend='c',
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model_fit = furniture_full_model.fit()
fitted_values = model_fit.fittedvalues

# Forecast the four weeks that follow the last observation.
first_step = len(X)
next_week_prediction = model_fit.predict(start=first_step, end=first_step + 3)
for week_ahead, value in enumerate(next_week_prediction, start=1):
    print(f'week + {week_ahead} sales value prediction is equal to : {round(value,2)} (cur)')

In [None]:
#Plot results: last observed weeks, their fitted values, and the forecast.
# - The unused `np.exp(fitted_values)` is removed (no log transform is applied anywhere,
#   and exponentiating raw sales values overflows).
# - Axis positions are derived from the data instead of the literals 200/212/208/214;
#   the forecast trace previously received 6 x values for 5 y values.
n_obs = len(X)
n_shown = min(12, n_obs)
recent_weeks = np.arange(n_obs - n_shown, n_obs)

# The forecast line starts at the last fitted value so it connects to the fitted curve.
forecast_weeks = np.arange(n_obs - 1, n_obs + len(next_week_prediction))
forecast_line = np.concatenate((np.array(fitted_values[-1]).reshape(1,), next_week_prediction))

fig = go.Figure()
fig.add_trace(go.Scatter(x=recent_weeks, y=X[-n_shown:], name='True Values'))
fig.add_trace(go.Scatter(x=recent_weeks, y=fitted_values[-n_shown:], name='Predicted Values'))
fig.add_trace(go.Scatter(x=forecast_weeks, y=forecast_line,
                         line = dict(color='red', width=4, dash='dash'),
                         name='Next weeks prediction'))

### Next steps would be to repeat the same process for the other categories, predict sales for each one, then aggregate all sales to get the global sales prediction.

### Thanks for making it to the end of this notebook; please feel free to share your remarks and thoughts.