# Libraries and Data import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from urllib.request import urlopen
import json
import ipywidgets as widgets
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

#If needed other libraries will be called during the project 

In [None]:
# Load the raw order-level sales data (path is relative to the notebook's working directory).
df = pd.read_csv('../input/sales-forecasting/train.csv')

# Data exploration

In [None]:
# Preview the first rows to check the column layout.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Cities of the rows whose Postal Code is missing
df[df['Postal Code'].isnull()]['City'].unique()

All observations with a missing Postal Code are from the city of Burlington. We can either fill the value in or drop the whole column, since it won't be used in this analysis.

In [None]:
# Fill the missing Postal Code values with 5401
# (presumably the Burlington ZIP code 05401 stored as a number — TODO confirm).
df['Postal Code'] = df['Postal Code'].fillna(5401)

In [None]:
# Convert Order Date and Ship Date to datetime values.
# NOTE(review): no explicit format is passed, so ambiguous day/month strings are parsed
# month-first by default. Confirm the CSV's date layout and pass `format=` or
# `dayfirst=True` if it is day-first (this would also explain ship dates that appear
# to precede order dates).
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Ship Date'] = pd.to_datetime(df['Ship Date'])

# Calendar year of the order
df['Order Year'] = df['Order Date'].dt.year
# First day of the order's month, used as the monthly time index
df['Order Month'] = df['Order Date'].dt.to_period('M').dt.to_timestamp()

# ISO week label "<ISO year>/<ISO week>". The ISO year must be used together with the
# ISO week number: the first/last days of a calendar year can belong to a week of the
# neighbouring ISO year (e.g. 2018-12-31 is ISO week 1 of 2019), and pairing the ISO
# week with the calendar year would merge those days into the wrong week.
df['Order Week'] = df['Order Date'].apply(
    lambda x: f'{x.isocalendar()[0]}/{x.isocalendar()[1]}'
)

# ISO day of the week the order was made: 1 (Monday) .. 7 (Sunday)
df['Order Day number'] = df['Order Date'].dt.dayofweek + 1

# Sort chronologically; later cells group with sort=False and rely on this order
df = df.sort_values('Order Date')

In [None]:
# Number of distinct values per column: a quick hint at which sub-series could be analysed
for column_name, n_unique in df.nunique().items():
    print(f"Number of {column_name} unique values : {n_unique}")


==> We can analyze total sales, sales per category/sub-category, Ship Mode utilization, and so on.

# Data Visualization

### Ship Mode and Lead Time

In [None]:
# One pie chart per order year showing the share of each ship mode.
# Labels are taken from the value_counts() index so that every slice is labelled with
# the ship mode it actually counts (value_counts orders by frequency, which can differ
# from year to year, so a fixed label list could mislabel slices).
pie_years = sorted(df['Order Year'].unique())
n_cols = 2
n_rows = max(1, -(-len(pie_years) // n_cols))  # ceiling division
specs = [[{'type': 'domain'} for _ in range(n_cols)] for _ in range(n_rows)]
fig = make_subplots(rows=n_rows, cols=n_cols, specs=specs)
for position, year in enumerate(pie_years):
    mode_counts = df.loc[df['Order Year'] == year, 'Ship Mode'].value_counts()
    fig.add_trace(
        go.Pie(labels=mode_counts.index, values=mode_counts.values, title=str(year)),
        row=position // n_cols + 1, col=position % n_cols + 1,
    )

fig.update_layout(title_text='Ship mode rate per year', title_x=0.5)

In [None]:
# Lead time (whole days between order and shipment) and its distribution per Segment.
# Only strictly positive lead times are plotted.
df['Lead_Time'] = (df['Ship Date'] - df['Order Date']).dt.days
positive_lead_time = df[df['Lead_Time'] > 0]
fig = px.histogram(
    positive_lead_time,
    x="Lead_Time",
    color="Segment",
    labels={'Lead_Time': 'Lead Time in days'},
)
fig.show()

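There are some anomalies in the ship dates: some of them are earlier than the order date. Only orders with a positive lead time are plotted. The results are confusing, so the Ship Date column does not look reliable. (This may also be a side effect of ambiguous day/month parsing of the date strings, which should be checked.)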

### Sales mapping (Choropleth)

In [None]:
# Mapping from full state name to its two-letter code; the choropleth below uses
# these codes as map locations.
codes = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','District of Columbia':'DC','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}
# Slider widget to pick the year (2015 to 2018) whose map is shown.
# (According to the author, the widget does not work on Kaggle.)
# `c` holds the currently selected year; it starts at 2015.
c=2015
int_range = widgets.IntSlider(min=2015,max=2018,description="Plot's Year")
display(int_range)

def on_value_change(change):
    """Redraw the per-state sales choropleth for the year selected on the slider.

    Parameters
    ----------
    change : dict
        Widget change event. Only ``change['new']`` (the selected order year) is read,
        so a plain ``{'new': year}`` dict can be passed to draw a map manually.
    """
    year = change['new']
    # Replace the previous map but keep the slider displayed above the new one.
    clear_output(wait=True)
    display(int_range)

    # Total sales per state for the selected year
    sales_per_state = (
        df[df['Order Year'] == year]
        .groupby('State', as_index=False)['Sales']
        .sum()
    )
    # .map yields NaN (instead of raising KeyError) for a state missing from `codes`;
    # such rows cannot be placed on the map and are dropped.
    sales_per_state['state_code'] = sales_per_state['State'].map(codes)
    sales_per_state = sales_per_state.dropna(subset=['state_code'])

    data = dict(type='choropleth',
                locations=sales_per_state['state_code'],
                locationmode='USA-states',
                colorscale='Portland',
                text=sales_per_state['State'],
                z=sales_per_state['Sales'],
                colorbar={'title': 'Sales'})
    layout = dict(geo={'scope': 'usa'},
                  title=dict(text=f'Sales per State in {year}', x=0.5))
    choromap = go.Figure(data=[data], layout=layout)
    iplot(choromap)

# Redraw the map whenever the slider value changes ...
int_range.observe(on_value_change, names='value')
# ... and draw the initial map for 2015 by calling the handler with a minimal event dict.
on_value_change({'new':2015})

    1/ Across all years, the states of California and New York have the greatest total sales value.
    2/ Texas has a medium total sales value compared to California and New York.
    3/ In the last year, Washington's total sales value increased considerably.

### Sales per Year per Category

In [None]:
# Total sales for every (Category, Order Year) pair, ordered by category then year
sales_percategory = (
    df.groupby(['Category', 'Order Year'])['Sales']
    .sum()
    .reset_index()
)

fig = px.bar(
    sales_percategory,
    x='Order Year',
    y='Sales',
    color='Category',
    barmode='group',
    title='Sales per Year per Category',
    labels={'Order Year': 'Year', 'Sales': 'Sales (c)'},
)
fig.update_layout(xaxis_tickformat='d', autosize=False, width=1100, height=600, title_x=0.5)

fig.show()

==> Sales of all categories exhibit an increasing trend, except in 2016, when Technology and Office Supplies sales decreased.
This is easier to see in the following growth rate graph.

In [None]:
# Year-over-year growth rate of sales, computed separately for each category.
# The shift is done per category so that the first year of a category gets NaN
# instead of inheriting the last year of the previous category.
ordered_sales = sales_percategory.sort_values(['Category', 'Order Year'])
sales_percategory['Sales n-1'] = ordered_sales.groupby('Category')['Sales'].shift()
sales_percategory['Growth Rate'] = (
    (sales_percategory['Sales'] - sales_percategory['Sales n-1']) / sales_percategory['Sales n-1']
).round(4)

# Rows without a previous year have no growth rate and are not plotted.
growth_rates = sales_percategory.dropna(subset=['Growth Rate'])
fig2 = px.bar(growth_rates, x='Order Year', y='Growth Rate',
              title='Sales Growth Rate per Year per Category',
              color='Category', labels={'Order Year': 'Year', 'Growth Rate': 'GR'},
              text='Growth Rate', barmode='group')
# '.2%' is the d3-format specifier for a percentage with two decimals
fig2.update_traces(texttemplate='%{text:.2%}', textposition='outside')
fig2.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', title_x=0.5)
fig2.show()

### Monthly sales per Category

In [None]:
# Total sales per category and per month, ordered by category then month
monthly_totals = df.groupby(['Category', 'Order Month'], sort=False)['Sales'].sum()
sales_category_month = monthly_totals.reset_index().sort_values(['Category', 'Order Month'])

fig = px.line(
    sales_category_month,
    x='Order Month',
    y='Sales',
    color='Category',
    hover_data={"Order Month": "|%B  %Y"},
)
fig.update_layout(title_text='Monthly sales per Category', title_x=0.5,
                  autosize=False, width=1000, height=600)
# One tick per month
fig.update_xaxes(dtick="M1", tickformat="%b \n\n\n\n\n\n\n %Y", ticklabelmode="period")
fig.show()

### Weekly sales per category

In [None]:
# Weekly sales totals per category, in long format
sales_category_week = pd.DataFrame(df.groupby(['Category','Order Week'],sort=False)['Sales'].sum())
sales_category_week.reset_index(inplace=True)

# Week labels in chronological order (the labels are strings, so they cannot be sorted directly)
week_order = df.sort_values('Order Date')['Order Week'].unique()

# Wide table: one row per week, one column per category. Every category is aligned on
# the same list of weeks, and a week without sales for a category counts as 0, so each
# y value is plotted against its own week.
weekly_by_category = (
    sales_category_week
    .pivot(index='Order Week', columns='Category', values='Sales')
    .reindex(week_order)
    .fillna(0)
)

fig = go.Figure()
for category in weekly_by_category.columns:
    fig.add_trace(go.Scatter(x=weekly_by_category.index,
                             y=weekly_by_category[category],
                             name=category))
fig.update_layout(autosize=False,width=1200,height=600,title_x=0.5,title_text='Weekly sales per Category',
                 xaxis_title='Date (Week number/Year)',yaxis_title='Sales value')

# Furniture monthly sales analysis

### Preprocessing

In [None]:
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [None]:
# Monthly sales of the Furniture category, indexed by month
is_furniture = sales_category_month['Category'] == 'Furniture'
X_frame = sales_category_month[is_furniture][['Order Month', 'Sales']]
X_frame.set_index('Order Month', inplace=True)

# First difference of sales: sales(t) - sales(t-1)
X_frame['Sales diff 1'] = X_frame['Sales'].diff()
# Log-transformed sales and its first and second differences
X_frame['log Sales'] = np.log(X_frame['Sales'])
X_frame['log Sales diff 1'] = X_frame['log Sales'].diff()
X_frame['log Sales diff 2'] = X_frame['log Sales diff 1'].diff()

# The analysed series is the second difference of log sales.
# Author's rationale: raw sales show a curved trend; after the log transformation the
# series is non-stationary; differencing restores stationarity and attenuates the trend.
X = X_frame['log Sales diff 2'].dropna().to_numpy()

# Hold out the last 12 months (year 2018) as the test set; 2015-2017 form the training set.
X_train, X_test = X[:-12], X[-12:]

In [None]:
# Plot the training series against its time-step index.
# The x axis is derived from the series length so that x and y always match.
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(X_train)),y=X_train,name='True values'))
fig.update_layout(title_text='Time serie plot',title_x=0.5)

## Stationarity test :

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Augmented Dickey-Fuller test on the training series (null hypothesis: unit root,
# i.e. the series is non-stationary).
adf_resutls = adfuller(X_train,maxlag=10)
print('ADF test results are :')
print('ADF Statistic: %f' % adf_resutls[0])
print('p-value: %f' % adf_resutls[1])
print('Critical Values:')
for key, value in adf_resutls[4].items():
    print('\t%s: %.3f' % (key, value))
# Decide at the 5% level using the critical value returned by the test itself
# (it depends on the sample size) rather than a fixed approximate threshold.
if adf_resutls[0] <= adf_resutls[4]['5%']:
    print('==> Non-stationarity can be rejected')
else :
    print('==> Non-stationarity cannot be rejected')

## Autocorrelation and partial autocorrelation graphs :

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Partial autocorrelation (left) and autocorrelation (right) of the training series, 14 lags
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
pacf_ax, acf_ax = axes
plot_pacf(X_train, lags=14, ax=pacf_ax)
plot_acf(X_train, lags=14, ax=acf_ax)
plt.show()

According to the partial autocorrelation plot, ARIMA((2,2,1)*(0,0,0,0)) is a good candidate. Because the second spike in the partial autocorrelation graph is close to the significance threshold, let's remove this order from the model. The model becomes ARIMA((1,2,1)*(0,0,0,0)).

### Additive decomposition

In [None]:
# period=12: the monthly sales per category plots suggest a 12-month seasonality for the
# Furniture category, and this value has a relatively good effect on the trend component.
plt.rcParams['figure.figsize'] = (16, 12)
additive_decomposition = seasonal_decompose(X_train, model='additive', period=12)
decomposition_figure = additive_decomposition.plot()
decomposition_figure.suptitle('Additive Decomposition', fontsize=16)
plt.tight_layout()

## ARIMA Model

In [None]:
import statsmodels.api as sm

In [None]:
# Switch the series back to log sales: the differencing is now handled by the
# `d` term of the SARIMAX order instead of being applied beforehand.
X = X_frame['log Sales'].dropna().to_numpy()
# Last 12 months are held out for testing
X_train, X_test = X[:-12], X[-12:]

In [None]:
# SARIMAX settings for the monthly Furniture model.
# The seasonal term was added after comparing BIC values and following the remark made
# on the decomposition graph.
order= (1,2,1)
seasonal_order = (1,0,0,12)
trend='c'

In [None]:
# Fit the SARIMAX model on the training series and keep its in-sample fitted values
sarimax_monthly = sm.tsa.statespace.SARIMAX(
    X_train,
    order=order,
    seasonal_order=seasonal_order,
    trend=trend,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model1_fit = sarimax_monthly.fit()
fitted_values = model1_fit.fittedvalues

In [None]:
# In-sample errors and plot of true vs fitted values on the training set.
# The first 12 fitted values are excluded from the metrics and the plot
# (presumably a burn-in of one seasonal cycle — confirm).
RMSE = np.sqrt(mean_squared_error(X_train[12:],fitted_values[12:]))
MAE = mean_absolute_error(X_train[12:],fitted_values[12:])
# Both metrics are computed on the training set, so both are labelled "Train"
print(f'Train Root Mean Squared Error = {RMSE}')
print(f'Train Mean Absolute Error = {MAE}')
# x axis derived from the series length so that x and y always match
train_steps = np.arange(len(X_train))
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_steps,y=X_train,name='True values'))
fig.add_trace(go.Scatter(x=train_steps[12:],y=fitted_values[12:],name='Predicted Values'))
fig.update_layout(title_text='Time serie plot',title_x=0.5)

In [None]:
#Monthly prediction: this function gives month-by-month (one-step-ahead) predictions.
#If the next 2 months are to be predicted, the first month is predicted, then its TRUE value
#is added to the training data before the 2nd month is predicted.
def month_prediction(X,Xtest_len=12,order=(0,0,0),seasonal_order=(2,0,1,12),trend='c'):
    """Walk-forward one-step-ahead SARIMAX forecasts over the last ``Xtest_len`` points of ``X``.

    For each held-out step ``j`` a new SARIMAX model is fitted on all observations that
    precede that step (true values, not earlier predictions) and used to forecast it.

    Parameters
    ----------
    X : array-like
        Full series (training part followed by the held-out part).
    Xtest_len : int
        Number of trailing observations to predict; must satisfy ``0 < Xtest_len < len(X)``.
    order, seasonal_order, trend
        Passed unchanged to ``sm.tsa.statespace.SARIMAX``.

    Returns
    -------
    predictions1 : numpy.ndarray of shape (Xtest_len,)
        One-step-ahead predictions for the held-out observations.
    RMSE : float
        Root mean squared error between the held-out observations and the predictions.
    MAE : float
        Mean absolute error between the held-out observations and the predictions.

    Raises
    ------
    ValueError
        If ``Xtest_len`` leaves no held-out or no training observations.
    """
    n_obs = len(X)
    if not 0 < Xtest_len < n_obs:
        raise ValueError(f'Xtest_len must be between 1 and {n_obs - 1}, got {Xtest_len}')

    split = n_obs - Xtest_len
    predictions1 = np.empty(Xtest_len)
    for j in range(Xtest_len):
        # Training window grows by one true observation at every step
        history = X[:split + j]
        model1 = sm.tsa.statespace.SARIMAX(history,order=order,seasonal_order=seasonal_order,
                            enforce_invertibility=False,enforce_stationarity=False,trend=trend)
        # disp=False: do not print optimizer progress for each of the Xtest_len fits
        model1_fit = model1.fit(disp=False)
        # Forecast of the observation that immediately follows the training window
        predictions1[j] = np.asarray(model1_fit.forecast(steps=1))[0]

    actual = X[split:]
    RMSE = np.sqrt(mean_squared_error(actual,predictions1))
    MAE = mean_absolute_error(actual,predictions1)

    return predictions1 , RMSE , MAE

In [None]:
# One-step-ahead predictions for the 12 held-out months, using the model settings chosen above
predictions1,RMSE,MAE = month_prediction(X,Xtest_len=12,order=order,seasonal_order=seasonal_order,trend=trend)

In [None]:
# Test-set errors and plot of true vs predicted values (log scale)
RMSE = np.sqrt(mean_squared_error(X_test,predictions1))
MAE = mean_absolute_error(X_test,predictions1)
# Both metrics are computed on the test set, so both are labelled "Test"
print(f'Test Root Mean Squared Error = {RMSE}')
print(f'Test Mean Absolute Error = {MAE}')
test_steps = np.arange(len(X_test))
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_steps,y=X_test,name='True Values'))
fig.add_trace(go.Scatter(x=test_steps,y=np.array(predictions1),name='Predicted Values'))

These results are predictions of the logarithmic transformation of sales. Next, the predicted and true sales are plotted on the original scale.

In [None]:
# Back-transform log values to sales with exp, then report test-set errors and plot
transformed_predictions = np.exp(np.array(predictions1))
transformed_Xtest = np.exp(np.array(X_test))
RMSE = np.sqrt(mean_squared_error(transformed_Xtest,transformed_predictions))
MAE = mean_absolute_error(transformed_Xtest,transformed_predictions)
# Both metrics are computed on the test set, so both are labelled "Test"
print(f'Test Root Mean Squared Error = {RMSE}')
print(f'Test Mean Absolute Error = {MAE}')
test_steps = np.arange(len(transformed_Xtest))
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_steps,y=transformed_Xtest,name='True Values'))
fig.add_trace(go.Scatter(x=test_steps,y=transformed_predictions,name='Predicted Values'))

# Total weekly sales

### Preprocessing

In [None]:
# Total sales per week, with the same transformations as in the monthly analysis.
# sort=False keeps the weeks in their order of first appearance in df.
X_frame = df.groupby('Order Week', sort=False)['Sales'].sum().to_frame()
X_frame['Sales diff 1'] = X_frame['Sales'].diff()
X_frame['log Sales'] = np.log(X_frame['Sales'])
X_frame['log Sales diff 1'] = X_frame['log Sales'].diff()
X_frame['log Sales diff 2'] = X_frame['log Sales diff 1'].diff()

# Same split as before: the last 52 weeks (2018) are the test set, the rest is training.
X = X_frame['log Sales diff 2'].dropna().to_numpy()
X_train, X_test = X[:-52], X[-52:]

In [None]:
# Plot the weekly training series against its time-step index.
# The x axis is derived from the series length so that x and y always match.
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(X_train)),y=X_train,name='True values'))
fig.update_layout(title_text='Time series plot',title_x=0.5)

### Stationarity test

In [None]:
# Augmented Dickey-Fuller test on the weekly training series (null hypothesis: unit root,
# i.e. the series is non-stationary).
adf_resutls = adfuller(X_train,maxlag=10)
print('ADF test results are :')
print('ADF Statistic: %f' % adf_resutls[0])
print('p-value: %f' % adf_resutls[1])
print('Critical Values:')
for key, value in adf_resutls[4].items():
    print('\t%s: %.3f' % (key, value))
# Decide at the 5% level using the critical value returned by the test itself
# (it depends on the sample size) rather than a fixed approximate threshold.
if adf_resutls[0] <= adf_resutls[4]['5%']:
    print('==> Non-stationarity can be rejected')
else :
    print('==> Non-stationarity cannot be rejected')

### PACF and ACF

In [None]:
# Partial autocorrelation (left) and autocorrelation (right) of the weekly training series, 52 lags
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
pacf_ax, acf_ax = axes
plot_pacf(X_train, lags=52, ax=pacf_ax)
plot_acf(X_train, lags=52, ax=acf_ax)

plt.show()

### Additive decomposition

In [None]:
# Additive decomposition of the weekly training series with a 52-week period
plt.rcParams['figure.figsize'] = (16, 12)
additive_decomposition = seasonal_decompose(X_train, model='additive', period=52)
decomposition_figure = additive_decomposition.plot()
decomposition_figure.suptitle('Additive Decomposition', fontsize=16)
plt.tight_layout()

    1/ When using raw sales data, we get a curved trend. To attenuate it, a logarithmic transformation was applied, but the series became non-stationary.
    2/ The first-order difference brought the series back to stationarity.


## ARIMA Model 

In [None]:
# Model the log of weekly sales; the last 52 weeks are held out for testing
X = X_frame['log Sales'].dropna().to_numpy()
X_train, X_test = X[:-52], X[-52:]

In [None]:
# SARIMAX settings for the weekly model.
# The first element of `order` is a list of individual autoregressive lags rather than
# a single AR order; the remaining elements are d=2 and q=1.
order = ([1,2,3,4,18,32],2,1)
# NOTE(review): the seasonal period here is 12 although the series is weekly and the
# decomposition above used period=52 — confirm this value is intended.
seasonal_order=(1,0,0,12)
trend='c'

In [None]:
# Fit the weekly SARIMAX model on the training series and keep its in-sample fitted values
sarimax_weekly = sm.tsa.statespace.SARIMAX(
    X_train,
    order=order,
    seasonal_order=seasonal_order,
    trend=trend,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model2_fit = sarimax_weekly.fit()
fitted_values2 = model2_fit.fittedvalues

In [None]:
# In-sample errors and plot of true vs fitted values on the weekly training set.
# The first 52 fitted values are excluded from the metrics and the plot
# (presumably a burn-in of one year — confirm).
RMSE = np.sqrt(mean_squared_error(X_train[52:],fitted_values2[52:]))
MAE = mean_absolute_error(X_train[52:],fitted_values2[52:])
# Both metrics are computed on the training set, so both are labelled "Train"
print(f'Train Root Mean Squared Error = {RMSE}')
print(f'Train Mean Absolute Error = {MAE}')
# x axis derived from the series length so that x and y always match
train_steps = np.arange(len(X_train))
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_steps,y=X_train,name='True values'))
fig.add_trace(go.Scatter(x=train_steps[52:],y=fitted_values2[52:],name='Predicted Values'))
fig.update_layout(title_text='Time serie plot',title_x=0.5)

In [None]:
#Weekly prediction: plays the same role as month_prediction, with one week as the time step
def week_prediction(X,Xtest_len=12,order=(0,0,0),seasonal_order=(2,0,1,12),trend='c'):
    """Walk-forward one-step-ahead SARIMAX forecasts over the last ``Xtest_len`` weeks of ``X``.

    The procedure does not depend on the time unit of the series, so this function
    simply delegates to ``month_prediction`` instead of duplicating its body.

    Parameters
    ----------
    X : array-like
        Full weekly series (training part followed by the held-out part).
    Xtest_len : int
        Number of trailing observations to predict.
    order, seasonal_order, trend
        Passed unchanged to ``sm.tsa.statespace.SARIMAX`` through ``month_prediction``.

    Returns
    -------
    tuple
        ``(predictions, RMSE, MAE)`` exactly as returned by ``month_prediction``.
    """
    return month_prediction(X,Xtest_len=Xtest_len,order=order,
                            seasonal_order=seasonal_order,trend=trend)

In [None]:
# One-step-ahead predictions for the 52 held-out weeks, using the weekly helper defined above
predictions2,RMSE,MAE = week_prediction(X,Xtest_len=52,order=order,seasonal_order=seasonal_order,trend=trend)

In [None]:
# Test-set errors and plot of true vs predicted values (log scale)
RMSE = np.sqrt(mean_squared_error(X_test,predictions2))
MAE = mean_absolute_error(X_test,predictions2)
# Both metrics are computed on the test set, so both are labelled "Test"
print(f'Test Root Mean Squared Error = {RMSE}')
print(f'Test Mean Absolute Error = {MAE}')
test_steps = np.arange(len(X_test))
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_steps,y=X_test,name='True Values'))
fig.add_trace(go.Scatter(x=test_steps,y=np.array(predictions2),name='Predicted Values'))

In [None]:
# Back-transform log values to sales with exp, then report test-set errors and plot
transformed_predictions = np.exp(np.array(predictions2))
transformed_Xtest = np.exp(np.array(X_test))
RMSE = np.sqrt(mean_squared_error(transformed_Xtest,transformed_predictions))
MAE = mean_absolute_error(transformed_Xtest,transformed_predictions)
# Both metrics are computed on the test set, so both are labelled "Test"
print(f'Test Root Mean Squared Error = {RMSE}')
print(f'Test Mean Absolute Error = {MAE}')
test_steps = np.arange(len(transformed_Xtest))
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_steps,y=transformed_Xtest,name='True Values'))
fig.add_trace(go.Scatter(x=test_steps,y=transformed_predictions,name='Predicted Values'))

### Predicting next week sales :

In [None]:
# Refit the weekly model on the whole series (training and test weeks together).
# This replaces the previous contents of `model2_fit` and `fitted_values`.
sarimax_all_weeks = sm.tsa.statespace.SARIMAX(
    X,
    order=order,
    seasonal_order=seasonal_order,
    trend=trend,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model2_fit = sarimax_all_weeks.fit()
fitted_values = model2_fit.fittedvalues

In [None]:
# Forecast the first week after the end of the series (index len(X)) and report it
# on the original sales scale.
next_week_prediction = model2_fit.predict(start=len(X), end=len(X))
next_week_sales = round(np.exp(next_week_prediction[0]), 2)
print(f'Next week sales value prediction is equal to : {next_week_sales} (cur)')

In [None]:
# Plot the last weeks of true and fitted sales, plus the next-week forecast.
# All positions are derived from the series length instead of fixed indices, so the
# traces stay aligned whatever the number of weeks in the data.
n_weeks = len(X)
window_start = max(n_weeks - 12, 0)           # show the last 12 observed weeks
window_steps = np.arange(window_start, n_weeks)
transformed_fitted = np.exp(fitted_values)
fig = go.Figure()
fig.add_trace(go.Scatter(x=window_steps,y=X_frame['Sales'].iloc[window_start:n_weeks],name='True Values'))
fig.add_trace(go.Scatter(x=window_steps,y=transformed_fitted[window_start:],name='Predicted Values'))
# Dashed segment from the last fitted week (index n_weeks-1) to the forecast week (index n_weeks)
fig.add_trace(go.Scatter(x=np.array([n_weeks - 1, n_weeks]),
                         y=np.array([transformed_fitted[-1],np.exp(next_week_prediction[0])]),
                         line = dict(color='red', width=4, dash='dash'),
                         name='Next week prediction'))


### The monthly sales time series plot shows that each category behaves differently; in the next parts we will predict future weekly sales per category...