# Main Modelling for Time Series Store forecast
By Alex Dance https://www.linkedin.com/in/alex-dance/
This notebook is one of several notebooks in a project to improve store and product forecasts:
* EDA – Exploratory Data Analysis – includes working with annual forecasts
* Main Modelling (this notebook)
* XG Boost modelling by Month
* Weighted average
* ARIMA – Month and Other Modelling
* Deep Learning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
from sklearn.metrics import mean_squared_error , mean_absolute_error
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from xgboost import XGBClassifier
from catboost import CatBoostRegressor
from sklearn.metrics import accuracy_score

In [None]:
class color:
    """ANSI escape codes used to make printed headings a little more impressive."""

    # Escape sequence that switches subsequent printed text to bold.
    BOLD = '\033[1m'

In [None]:
df = pd.read_csv("../input/demand-forecasting-kernels-only/train.csv")
df.head()

In [None]:
split = "2017-01-01"

In [None]:
df['date'] =  pd.to_datetime(df['date'])

# Feature Engineering

In [None]:
split = "2017-01-01"

In [None]:
df['ItemStoreCombined'] = df['item'].map(str) + '-' + df['store'].map(str) 
# this is used in particular to ensure the rolling forecast data does not leak from 1 item / store combination to the next

In [None]:
df.head()

In [None]:

# Calendar features derived from the parsed `date` column.
df['dayofweek'] = df['date'].dt.dayofweek
df['quarter'] = df['date'].dt.quarter
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['dayofyear'] = df['date'].dt.dayofyear
df['dayofmonth'] = df['date'].dt.day
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is the replacement. Cast to a plain int so the column
# has an ordinary integer dtype like the other calendar features.
df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)

In [None]:
df_roll=df.copy() # for the rolling forecast

In [None]:
# Rolling-forecast helper columns: mean of sales over the last 1..7 rows,
# computed inside each item/store combination so that values never carry
# over from one combination to the next.
# NOTE(review): every window ends on the current row, so 'sales-1' is that
# row's own sales value - confirm this is intended.
_sales_by_combo = df_roll.groupby('ItemStoreCombined')['sales']
for _window in range(1, 8):
    df_roll[f'sales-{_window}'] = (
        _sales_by_combo.rolling(_window).mean().reset_index(0, drop=True)
    )

In [None]:
df_roll.head(10)

In [None]:
# Considered looking forward but chose not to

# To check rolling mean is working correctly

In [None]:
df_roll_1_1= df_roll[(df_roll.store==1) & (df_roll.item==1)]

In [None]:
df_roll_2_2 = df_roll[(df_roll.store==2) & (df_roll.item==2)]

In [None]:
df_roll_2_2.head() # to check rolling mean worked. As this is product 2 in store 2 and as Sales--3 has Nan then the rolling mean is not bleeding from earlier data

In [None]:
df_roll_1_1.head()  

In [None]:
df_roll = df_roll.dropna()  

# Set Date as Index

In [None]:
df = df.set_index('date')

In [None]:
df.head()

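# Functions to calculate multiple error metrics
* `calculate_error` returns the larger set of metrics (test and train MSE/MAE, plus MAPE and RMSE on the test data)
* `calculate_errorb` returns the smaller, test-only set of metrics (MSE, MAE, MAPE and RMSE)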

In [None]:
def calculate_error(test_sales, train_sales, test_prediction, train_prediction):
    """Return error metrics for a forecast on the test and training data.

    Metric definitions follow
    https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/

    Args:
        test_sales: Actual sales for the test period (1-D array-like).
        train_sales: Actual sales for the training period (1-D array-like).
        test_prediction: Forecast for the test period, same length as
            ``test_sales``.
        train_prediction: Forecast for the training period, same length as
            ``train_sales``.

    Returns:
        dict with keys ``MSE_test``, ``MSE_train``, ``MAE_test``,
        ``MAE_train``, ``MAPE`` and ``RMSE``. ``MAPE`` (in percent) and
        ``RMSE`` are computed on the test data only.

    Raises:
        ValueError: If an actual/prediction pair differs in shape.
    """
    def _residuals(actual, predicted):
        # Work on positional float arrays so pandas never tries to re-align
        # the two inputs on a (possibly duplicated) date index.
        actual = np.asarray(actual, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        if actual.shape != predicted.shape:
            raise ValueError(
                "actual and predicted values must have the same shape, got "
                f"{actual.shape} and {predicted.shape}"
            )
        return actual, predicted - actual

    test_actual, test_err = _residuals(test_sales, test_prediction)
    _, train_err = _residuals(train_sales, train_prediction)

    MSE_test = float(np.mean(test_err ** 2))        # Mean Squared Error (MSE)
    MSE_train = float(np.mean(train_err ** 2))
    MAE_test = float(np.mean(np.abs(test_err)))     # Mean Absolute Error (MAE)
    MAE_train = float(np.mean(np.abs(train_err)))

    # Mean Absolute Percentage Error (MAPE), in percent. Rows with zero actual
    # sales are left out because their percentage error is undefined.
    nonzero = test_actual != 0
    if nonzero.any():
        MAPE = float(np.mean(np.abs(test_err[nonzero] / test_actual[nonzero])) * 100)
    else:
        MAPE = float('nan')

    # Root Mean Squared Error (RMSE): the root is taken AFTER averaging the
    # squared errors; taking it per row first collapses to the MAE.
    RMSE = float(np.sqrt(MSE_test))

    return {'MSE_test': MSE_test, 'MSE_train': MSE_train, 'MAE_test': MAE_test,
            'MAE_train': MAE_train, 'MAPE': MAPE, 'RMSE': RMSE}

In [None]:
def calculate_errorb(test_sales, test_prediction):
    """Return test-only error metrics for a forecast.

    Metric definitions follow
    https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/

    Args:
        test_sales: Actual sales for the test period (1-D array-like).
        test_prediction: Forecast for the same rows, same length as
            ``test_sales``.

    Returns:
        dict with keys ``MSE_test``, ``MAE_test``, ``MAPE`` (in percent) and
        ``RMSE``.

    Raises:
        ValueError: If the two inputs differ in shape.
    """
    # Work on positional float arrays so pandas never tries to re-align the
    # two inputs on a (possibly duplicated) date index.
    actual = np.asarray(test_sales, dtype=float)
    predicted = np.asarray(test_prediction, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            "test_sales and test_prediction must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )
    err = predicted - actual

    MSE_test = float(np.mean(err ** 2))       # Mean Squared Error (MSE)
    MAE_test = float(np.mean(np.abs(err)))    # Mean Absolute Error (MAE)

    # Mean Absolute Percentage Error (MAPE), in percent. Rows with zero actual
    # sales are left out because their percentage error is undefined.
    nonzero = actual != 0
    if nonzero.any():
        MAPE = float(np.mean(np.abs(err[nonzero] / actual[nonzero])) * 100)
    else:
        MAPE = float('nan')

    # Root Mean Squared Error (RMSE): the root is taken AFTER averaging the
    # squared errors; taking it per row first collapses to the MAE.
    RMSE = float(np.sqrt(MSE_test))

    return {'MSE_test': MSE_test, 'MAE_test': MAE_test, 'MAPE': MAPE, 'RMSE': RMSE}

# Rolling Forecast

In [None]:
# Mean of sales over a '7D' (seven calendar day) rolling window, computed
# separately for every (store, item) pair. `df` is indexed by date at this point.
# NOTE(review): with pandas' default window closure this appears to include the
# current day's own sales - confirm before treating the value as a forecast.
df_roll_store_item = df.groupby(["store","item"]).rolling('7D').sales.mean() 

In [None]:
print(df_roll_store_item)

In [None]:
df_roll_store_item =df_roll_store_item.reset_index()

In [None]:
df_roll_store_item.head()

In [None]:
df_roll_store_item.sample(5)

In [None]:
df_roll_store_item =df_roll_store_item.rename(columns={"sales":"Mean_Amount_7D"})

In [None]:
df_roll_store_item.head()

In [None]:
df_roll_final = df_roll.merge(df_roll_store_item, left_on=['date','store','item'], right_on=['date','store','item'] )

In [None]:
df_roll_final_7days = df_roll_final[(df_roll_final.date >= '2017-01-01') & (df_roll_final.date < '2017-01-08')]

In [None]:
df_roll_final_7days.head()

In [None]:
df_roll_final_7days.sum()

In [None]:
df_roll_final.head()

In [None]:
df_roll_final = df_roll_final.drop (['sales-1', 'sales-2','sales-3', 'sales-4', 'sales-5', 'sales-6',  'sales-7'],axis=1)

# Splitting and make Test Train and Main Tracking Option

In [None]:
# Split on the cut-off date with explicit half-open masks. Label slicing
# (`df[:split]` / `df[split:]`) is inclusive at both ends on a DatetimeIndex,
# which placed every row dated `split` in BOTH the training and the test set.
df_train = df[df.index < pd.Timestamp(split)]    # strictly before the cut-off
df_test = df[df.index >= pd.Timestamp(split)]    # the cut-off date onwards

In [None]:
# df_test_final will be the collated way of comparing the sales and all the forecasting options. 
# Every time a new model is run it will be added to this
df_test_final = df_test.copy()

In [None]:
df_test_final =df_test_final.drop (['dayofweek', 'quarter','month', 'year', 'dayofyear', 'weekofyear'],axis=1)

In [None]:
df_test.head()

In [None]:
# Separate the target column (`sales`) from the feature columns for both partitions.
X_train, y_train = df_train.drop(columns=['sales']), df_train['sales']
X_test, y_test = df_test.drop(columns=['sales']), df_test['sales']

In [None]:
print(y_train.shape)
print(y_test.shape)
print(X_train.shape)
print(X_test.shape)

In [None]:
X_train.head()

In [None]:
y_train.head()

# Add Rolling 

In [None]:
df_test_final = df_test_final.merge(df_roll_store_item, left_on=['date','store','item'], right_on=['date','store','item'] )

In [None]:
df_test_final.head()

# Weighted and SMA

In [None]:
df_roll.head()

In [None]:
df_weighted = df_roll.copy() 

In [None]:
df_weighted['date'] =  pd.to_datetime(df_weighted['date'])

In [None]:
df_weighted = df_weighted.set_index('date')

In [None]:
df_weighted.head()

In [None]:
weights = np.arange(1,11) #this creates an array with integers 1 to 10 included
weights

In [None]:
# 10-row weighted moving average (the most recent row gets the largest weight),
# computed separately for each item/store combination. A plain `rolling(10)`
# over the whole frame mixed the last rows of one combination into the first
# windows of the next; `transform` keeps the original row order and index.
wma10 = df_weighted.groupby('ItemStoreCombined')['sales'].transform(
    lambda sales: sales.rolling(10).apply(
        lambda prices: np.dot(prices, weights) / weights.sum(), raw=True
    )
)

In [None]:
wma10.head(20)

In [None]:
df_weighted['10-day-WMA'] = wma10

In [None]:
df_weighted.head()

In [None]:
wma10.sample(5)

In [None]:
df_weighted.info()

In [None]:
df_weighted.tail()

In [None]:
df_weighted_7days = df_weighted[(df_weighted.index >= '2017-01-01')] 

In [None]:
df_weighted_7days = df_weighted_7days[(df_weighted_7days.index < '2017-01-08')]

In [None]:
df_weighted_7days.head()

In [None]:
# Root mean squared error of the 10-day WMA over the first week of 2017.
# The root is taken after averaging the squared errors; taking it per row
# first (as before) yields the mean absolute error instead.
RMSE_Weighted_10 = np.sqrt(np.mean((df_weighted_7days['10-day-WMA'] - df_weighted_7days['sales']) ** 2))

In [None]:
# 10-row simple moving average, computed separately for each item/store
# combination so that a window never spans two different combinations.
# `transform` keeps the original row order and index.
sma10 = df_weighted.groupby('ItemStoreCombined')['sales'].transform(
    lambda sales: sales.rolling(10).mean()
)


In [None]:
df_weighted['sma10'] = sma10

In [None]:
df_weighted_short = df_weighted[split : "2017-03-30"] 
df_weighted_short_1_1 =  df_weighted_short[(df_weighted_short.store==1) & (df_weighted_short.item==1)]

In [None]:
df_weighted_short_1_1.head()

In [None]:
plt.figure(figsize = (12,6))
plt.plot(df_weighted_short_1_1['sales'], label="sales")
plt.plot(df_weighted_short_1_1['10-day-WMA'], label="10-Day WMA")
plt.plot(df_weighted_short_1_1['sma10'], label="10-Day SMA")
plt.xlabel("Date")
plt.ylabel("sales")
plt.legend()
plt.show()

# XG Boost

In [None]:
XG_model = xgb.XGBRegressor(n_estimators=1000) 

In [None]:
X_test = X_test.drop (['ItemStoreCombined'],axis=1)
X_train = X_train.drop (['ItemStoreCombined'],axis=1)

In [None]:
X_test.head()

In [None]:
y_test.head()

In [None]:
y_test.sum()

In [None]:
%%time
# Fit the XGBoost regressor on the training features, monitoring (X_test, y_test)
# and stopping early after 50 rounds without improvement on that set.
# NOTE(review): the monitored set is the test set itself, so the number of trees
# is tuned on the data that is later used for scoring - consider a separate validation split.
# NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
# estimator constructor rather than on `fit` - confirm against the installed version.
XG_model.fit(X_train, y_train,eval_set=[(X_test, y_test)],early_stopping_rounds=50,verbose=False)


In [None]:
_ = plot_importance(XG_model, height=0.9)

In [None]:
XG_test_prediction = XG_model.predict(X_test)

In [None]:
XG_test_all =X_test.copy()
XG_train_all =X_train.copy()
XG_test_all['XG prediction'] = XG_model.predict(X_test)
XG_train_all['XG prediction'] =XG_model.predict(X_train)
XG_test_all['sales'] = y_test
XG_train_all['sales'] = y_train
df_xg_all = pd.concat([XG_test_all, XG_train_all], sort=False)

In [None]:
# Root mean squared error of the XGBoost test predictions. The root is taken
# after averaging the squared errors; taking it per row first gives the MAE.
RMSE_XG_initial = np.sqrt(np.mean((XG_test_all['XG prediction'] - XG_test_all['sales']) ** 2))
print(RMSE_XG_initial)

In [None]:
_ = df_xg_all[['sales','XG prediction']].plot(figsize=(15, 5))
# There are too many stores and products for this graph to be useful, apart from seeing the outliers.
# Where the blue line is visible, those points are the outliers.
# There are very few days with extremely low sales.
# There are plenty of days with very high sales - which are good for business but hard to forecast.

In [None]:
df_xg_all.sample(10)

In [None]:
XG_test_all.head()

In [None]:
XG_test_all['sales']

In [None]:
XG_test_all['XG prediction']

In [None]:
XG_test_all.head()

In [None]:
df_test_all_1_1 = XG_test_all[(XG_test_all.store==1)&(XG_test_all.item==1)]

In [None]:
_ = df_test_all_1_1[['sales','XG prediction']].plot(figsize=(15, 5))

In [None]:
df_test_all_2_1 = XG_test_all[(XG_test_all.store==2)&(XG_test_all.item==1)]

In [None]:
_ = df_test_all_2_1[['sales','XG prediction']].plot(figsize=(15, 5))

In [None]:
df_test_all_2_2 = XG_test_all[(XG_test_all.store==2)&(XG_test_all.item==2)]

In [None]:
_ = df_test_all_2_2[['sales','XG prediction']].plot(figsize=(15, 5))

In [None]:
XG_test_all.head()

In [None]:
# This calls the error calculating function
XG_Results= calculate_error(XG_test_all['sales'],XG_train_all['sales'],XG_test_all['XG prediction'],XG_train_all['XG prediction'])

In [None]:
print(XG_Results)

In [None]:
print(color.BOLD +"XG Boost Results ")
print ('\033[0m')

print("Mean Squared Error -MSE")
print("MSE_test",XG_Results['MSE_test'])
print("MSE_train",XG_Results['MSE_train'])
print(" ")
print("Mean Absolute Error - MAE")
print("MAE_test",XG_Results['MAE_test'])
print("MAE_train",XG_Results['MAE_train'])
print(" ")
print("Mean Absolute Percentage Error - MPE")
print("MAPE",XG_Results['MAPE'])
print(" ")
print("Root Mean Squared Error -RMSE")
print("RMSE",XG_Results['RMSE'])


In [None]:
XGaccuracy = accuracy_score(XG_test_all['sales'], XG_test_all['XG prediction'].round()) 
print("Accuracy: %.2f%%" % (XGaccuracy * 100.0))
# This accuracy score does not reflect the accuracy of the result. Instead I looked at the forecasts. I have therefore not used accuracy score further and instead used RMSE and other metrics.

In [None]:
XG_test_all['error'] = XG_test_all['sales'] - XG_test_all['XG prediction']
XG_test_all['abs_error'] = XG_test_all['error'].apply(np.abs)
XG_test_all['abs_error_percent'] = (XG_test_all['abs_error'] / XG_test_all['sales'])*100
error_by_day = XG_test_all.groupby(['year','month','dayofmonth']).mean()[['sales','XG prediction','error','abs_error','store','item']]

In [None]:
# Per-calendar-day means of actuals, predictions and errors, followed by the
# five days with the lowest mean error (error = sales - XG prediction).
error_by_day = XG_test_all.groupby(['year', 'month', 'dayofmonth'])[
    ['sales', 'XG prediction', 'error', 'abs_error', 'store', 'item']
].mean()
error_by_day.sort_values('error').head(5)

In [None]:
df_xg_all.head()

In [None]:
XG_test_all['error']

In [None]:

num_bins = 100
plt.title('XG by prod abs error percent')
plt.hist(XG_test_all['abs_error_percent'], bins =num_bins)
plt.xlim((0,50))
plt.show()

In [None]:
XG_test_all.head()

In [None]:
XG_test_all.abs_error_percent.quantile([0.01,0.05,0.1,0.25,0.5,0.75,0.995])
# used this information for presentation in pack to look at the accuracy of the model

In [None]:
XG_test_all.abs_error_percent.quantile([0.01,0.05,0.1,0.25,0.5,0.75,0.995])
# used this information for presentation in pack to look at the accuracy of the model

# Add XG Boost Results to final(master tab)

In [None]:
XG_test_predictions = XG_test_all.copy()

In [None]:
XG_test_predictions.head()

In [None]:
XG_test_predictions = XG_test_predictions.drop (['dayofweek', 'dayofmonth','quarter','month', 'year', 'dayofyear', 'weekofyear'],axis=1)

In [None]:
df_test_final.sum()

In [None]:
df_test_final = df_test_final.merge(XG_test_predictions, left_on=['date','store','item'], right_on=['date','store','item'] )

In [None]:
df_test_final.sample(10)

# Cat Boost

In [None]:
#CatBoostModel=CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE')
CatBoostModel=CatBoostRegressor()

In [None]:
CatBoostModel.fit(X_train, y_train,eval_set=(X_test, y_test),plot=True)

In [None]:
catboostpred = CatBoostModel.predict(X_test)

In [None]:
print(catboostpred)

In [None]:
CAT_test_all =X_test.copy()
CAT_train_all =X_train.copy()
CAT_test_all['CAT prediction'] = CatBoostModel.predict(X_test)
CAT_train_all['CAT prediction'] =CatBoostModel.predict(X_train)
CAT_test_all['sales'] = y_test
CAT_train_all['sales'] = y_train
df_CAT_all = pd.concat([CAT_test_all, CAT_train_all], sort=False)


In [None]:
CAT_test_all.sum()

In [None]:
df_test_all_1_1 = CAT_test_all[(CAT_test_all.store==1)&(CAT_test_all.item==1)]

In [None]:
_=df_test_all_1_1[['sales','CAT prediction']].plot(figsize=(15, 5))

In [None]:
CAT_Results= calculate_error(CAT_test_all['sales'],CAT_train_all['sales'],CAT_test_all['CAT prediction'],CAT_train_all['CAT prediction'])

In [None]:
print(color.BOLD +"CAT Boost Results ")
print ('\033[0m')

print("Mean Squared Error -MSE")
print("MSE_test",CAT_Results['MSE_test'])
print("MSE_train",CAT_Results['MSE_train'])
print(" ")
print("Mean Absolute Error - MAE")
print("MAE_test",CAT_Results['MAE_test'])
print("MAE_train",CAT_Results['MAE_train'])
print(" ")
print("Mean Absolute Percentage Error - MPE")
print("MAPE",CAT_Results['MAPE'])
print(" ")
print("Root Mean Squared Error -RMSE")
print("RMSE",CAT_Results['RMSE'])

In [None]:
df_test_final = df_test_final.merge(CAT_test_all, left_on=['date','store','item'], right_on=['date','store','item'] )

In [None]:
df_test_final.head()

# Forecasts from Amazon were also added and compared, but they are NOT included here

# Final Compare

In [None]:
df_test_final.sum()

In [None]:
df_test_final.sample(5)

In [None]:
# Store 1 / item 1 rows of the combined comparison table.
df_test_final_1_1 = df_test_final[(df_test_final.store == 1) & (df_test_final.item == 1)]

#df_test_final_1_1= df_test_final_Auto[(df_test_final_Auto.store==1) & (df_test_final_Auto.item==1)]
# January subset. The mask is built from the slice itself (a mask taken from the
# full table only works through implicit index re-alignment), and the upper
# bound is 1 February so that 31 January is included.
df_test_final_1_1_Jan = df_test_final_1_1[df_test_final_1_1.date < '2017-02-01']

In [None]:
df_test_final_1_1.sample(3)

In [None]:
# Root mean squared error of the XGBoost forecast for store 1 / item 1. The
# root is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_1_1_XG = np.sqrt(np.mean((df_test_final_1_1['XG prediction'] - df_test_final_1_1['sales']) ** 2))
print(RMSE_1_1_XG)

In [None]:
# Rows after 31 October 2017 for store 1 / item 1. The mask is built from the
# slice itself; a mask taken from the full table only works through implicit
# index re-alignment.
df_test_final_1_1_NovDec = df_test_final_1_1[df_test_final_1_1.date > '2017-10-31']

In [None]:
df_test_final_1_1_Jan.info()

# Plotting all together

In [None]:
df_test_final_1_1_Jan.head(2)

In [None]:
f, ax = plt.subplots(1)
f.set_figheight(5)
f.set_figwidth(10)
_ = df_test_final_1_1_Jan[['XG prediction', 'CAT prediction','sales_x']].plot(ax=ax, style=['-','-','.'])
ax.set_ylim(0, 50)
#ax.set_xbound(lower='12-12-2017', upper='31-12-2017')
plot = plt.suptitle('Jan 2017 sales and forecast for product 1 in store 1')

In [None]:
# Lower bound of the first-week window. Uses `>=` so that 1 January is part of
# the window, matching the other first-week filters in this notebook
# (a strict `>` left only six days once the upper bound was applied).
df_test_final_7days = df_test_final[df_test_final.date >= '2017-01-01']

In [None]:
df_test_final_7days = df_test_final_7days[(df_test_final_7days.date<'2017-01-08')]

In [None]:
df_test_final_7days.head()

In [None]:
# Root mean squared error of the CatBoost forecast over the 7-day window. The
# root is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_7_days_Cat = np.sqrt(np.mean((df_test_final_7days['CAT prediction'] - df_test_final_7days['sales_x']) ** 2))
print(RMSE_7_days_Cat)

# Convert to By Day

In [None]:
df_test_final_new = df_test_final.copy()

In [None]:
df_test_final_new['date'] =  pd.to_datetime(df_test_final_new['date'])

In [None]:
df_test_final_new = df_test_final_new.set_index('date')

In [None]:
DailyFinal = df_test_final_new.resample('D').sum()

In [None]:
DailyFinal.head()

In [None]:
# Root mean squared error of the XGBoost forecast on daily totals. The root is
# taken after averaging the squared errors; per-row roots give the MAE.
RMSE_daily_XG = np.sqrt(np.mean((DailyFinal['XG prediction'] - DailyFinal['sales_x']) ** 2))
print(RMSE_daily_XG)

In [None]:
# Root mean squared error of the CatBoost forecast on daily totals. The root is
# taken after averaging the squared errors; per-row roots give the MAE.
RMSE_daily_CAT = np.sqrt(np.mean((DailyFinal['CAT prediction'] - DailyFinal['sales_x']) ** 2))
print(RMSE_daily_CAT)

# Monthly Final

In [None]:
MonthlyFinal = df_test_final_new.resample('M').sum()

In [None]:
MonthlyFinal.head()

In [None]:
MonthlyFinal.info()

In [None]:
# Root mean squared error of the XGBoost forecast on monthly totals. The root
# is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_monthly_XG = np.sqrt(np.mean((MonthlyFinal['XG prediction'] - MonthlyFinal['sales_x']) ** 2))
print(RMSE_monthly_XG)

In [None]:
# Root mean squared error of the CatBoost forecast on monthly totals. The root
# is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_monthly_CAT = np.sqrt(np.mean((MonthlyFinal['CAT prediction'] - MonthlyFinal['sales_x']) ** 2))
print(RMSE_monthly_CAT)

# Group by Store

In [None]:
df_test_final.sample(10)

In [None]:
Store_Month_Test_Final = df_test_final_new.groupby(['store']).resample('M').sum()

In [None]:
Store_Month_Test_Final.tail(10)

In [None]:
# Root mean squared error of the XGBoost forecast on per-store monthly totals.
# The root is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_Store_Month_XG = np.sqrt(np.mean((Store_Month_Test_Final['XG prediction'] - Store_Month_Test_Final['sales_x']) ** 2))
print(RMSE_Store_Month_XG)

In [None]:
# Root mean squared error of the CatBoost forecast on per-store monthly totals.
# The root is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_Store_Month_Cat = np.sqrt(np.mean((Store_Month_Test_Final['CAT prediction'] - Store_Month_Test_Final['sales_x']) ** 2))
print(RMSE_Store_Month_Cat)

In [None]:
Store_Month_Test_Final.info()

# By Store by Item

In [None]:
Store_item_Month_Test_Final = df_test_final_new.groupby(['store','item']).resample('M').sum()

In [None]:
Store_item_Month_Test_Final.head()

In [None]:
# Root mean squared error of the XGBoost forecast on per-store, per-item monthly
# totals. The root is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_Store_item_XG = np.sqrt(np.mean((Store_item_Month_Test_Final['XG prediction'] - Store_item_Month_Test_Final['sales_x']) ** 2))
print(RMSE_Store_item_XG)

In [None]:
# Root mean squared error of the CatBoost forecast on per-store, per-item monthly
# totals. The root is taken after averaging the squared errors; per-row roots give the MAE.
RMSE_Store_item_Month_Cat = np.sqrt(np.mean((Store_item_Month_Test_Final['CAT prediction'] - Store_item_Month_Test_Final['sales_x']) ** 2))
print(RMSE_Store_item_Month_Cat)

# Final Results

In [None]:
# Side-by-side summary of the test RMSE reported for each model.
print(color.BOLD +"RMSE ")
print ('\033[0m')  # reset the bold formatting

print("Root Mean Squared Error -RMSE")
print("RMSE XG Boost",XG_Results['RMSE'])
# Label the CatBoost line as well so the two figures can be told apart.
print("RMSE Cat Boost",CAT_Results['RMSE'])

In [None]:
df_test_final.sum()