# Introduction

In this notebook I look at the data from the Kaggle store sales time series forecasting competition and attempt to accurately model sales for a variety of product families across the competition's stores (store numbers run from 1 to 54 in the data).

# Imports

In [None]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from learntools.time_series.style import *  # plot style settings
#from learntools.time_series.utils import plot_lags, make_lags, make_leads
from learntools.time_series.utils import (create_multistep_example,
                                          load_multistep_data,
                                          make_lags,
                                          make_leads,
                                          plot_lags,
                                          make_multistep_target,
                                          plot_multistep,
                                          plot_periodogram, 
                                          seasonal_plot)
from statsmodels.graphics.tsaplots import plot_pacf
import matplotlib.pyplot as plt
import plotly as py
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.multioutput import RegressorChain
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Switch plotly and cufflinks to offline/notebook mode so figures render inline.
init_notebook_mode(connected=True)
cf.go_offline()

# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Load Data

In [None]:
# Single root for all competition files, so every read below uses the same location
# (previously some reads used an absolute /kaggle/input path and others this relative one).
comp_dir = Path('../input/store-sales-time-series-forecasting')

# Auxiliary tables: store metadata, daily transactions, and oil prices.
stores = pd.read_csv(comp_dir / 'stores.csv')
trans = pd.read_csv(comp_dir / 'transactions.csv')
oil = pd.read_csv(comp_dir / 'oil.csv')

# Training data. Only the columns used in this notebook are read, with compact dtypes.
# `infer_datetime_format` is intentionally not passed: it is deprecated in recent pandas
# and `parse_dates` alone parses the date column.
store_sales = pd.read_csv(
    comp_dir / 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)
# Index by (store, product family, day) with daily periods, sorted for fast slicing.
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()

# Wide frame of per-family values averaged over stores (one column per variable/family).
family_sales = (
    store_sales
    .groupby(['family', 'date'])
    .mean()
    .unstack('family')
    .loc[:, ['sales', 'onpromotion']]
)

# Holiday/event calendar, indexed by daily period.
holidays_events = pd.read_csv(
    comp_dir / "holidays_events.csv",
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
)
holidays_events = holidays_events.set_index('date').to_period('D')

# Test set, indexed the same way as the training data.
df_test = pd.read_csv(
    comp_dir / 'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

Below we see the stacked version of our data.

In [None]:
# Long ("stacked") form: one row per (store_nbr, family, date).
store_sales

And here is the wide form of our time series data.

In [None]:
# Wide form: dates on the rows, store_nbr and family moved into the column levels.
store_sales.unstack(['store_nbr', 'family'])

Let's get a look at a few of the time series that we will be working with.

In [None]:
# Wide 2017 slice used for the preview plots.
y = store_sales.unstack(['store_nbr', 'family']).loc['2017']

# Three (store, family) pairs to preview; store numbers are strings from 1 - 54.
STORE_NBR, STORE_NBR_2, STORE_NBR_3 = '3', '5', '15'
FAMILY, FAMILY_2, FAMILY_3 = 'AUTOMOTIVE', 'MAGAZINES', 'BEAUTY'

# Show the list of product families available in the data.
display(store_sales.index.get_level_values('family').unique())

# One panel per pair, sharing both axes so the series are directly comparable.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, sharey=True)
for _axis, _store, _family in (
    (ax1, STORE_NBR, FAMILY),
    (ax2, STORE_NBR_2, FAMILY_2),
    (ax3, STORE_NBR_3, FAMILY_3),
):
    _column = ('sales', _store, _family)
    y.loc(axis=1)[_column].plot(**plot_params, ax=_axis)
    _axis.set_title(f'{_family} Sales at Store {_store}');

# Trend

It looks like many of the stores' product families follow a fairly flat or ranging trend (close to a slope of 0). Let's see if we can fit a trend to these sales using a time dummy.

In [None]:
# Target: every (variable, store, family) series in wide form over the full history.
y = store_sales.unstack(['store_nbr', 'family'])

# Time-dummy features: a constant plus a first-order (linear) trend.
dp = DeterministicProcess(index=y.index, constant=True, order=1, drop=True)
X = dp.in_sample()

# The constant is already a feature, so the regression's own intercept is disabled.
model = LinearRegression(fit_intercept=False).fit(X, y)
y_pred = pd.DataFrame(
    model.predict(X),
    index=X.index,
    columns=y.columns,
)

Using a linear model we have fit a simple straight line to our data to show how sales are trending over time.

Let's look at the trend our model predicts below.

In [None]:
# Actual 2017 values and the matching slice of the fitted trend.
y = store_sales.unstack(['store_nbr', 'family']).loc['2017']
short_y_pred = y_pred.loc['2017']
STORE_NBR = '3'  # 1 - 54
STORE_NBR_2 = '5'  # 1 - 54
STORE_NBR_3 = '15'  # 1 - 54
FAMILY = 'AUTOMOTIVE'
FAMILY_2 = 'MAGAZINES'
FAMILY_3 = 'BEAUTY'

display(store_sales.index.get_level_values('family').unique())

# Draw observed sales and the trend line on the same panel for each (store, family) pair.
# Each axes keeps its own name; the original rebound `ax1` to the third panel by mistake.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, sharey=True)
for _axis, _store, _family in (
    (ax1, STORE_NBR, FAMILY),
    (ax2, STORE_NBR_2, FAMILY_2),
    (ax3, STORE_NBR_3, FAMILY_3),
):
    _column = ('sales', _store, _family)
    y.loc(axis=1)[_column].plot(**plot_params, ax=_axis)
    short_y_pred.loc(axis=1)[_column].plot(ax=_axis)
    _axis.set_title(f'{_family} Sales at Store {_store}');

And we can see from the line plotted in each of the graphs that the trend line is fairly flat over different product families. The flat trend line is indicative of a ranging market that is neither growing nor shrinking.

# Seasonality

Now let's consider seasonality. Presumably sales at a supermarket would follow normal social patterns like weekly grocery runs, monthly expenditures, heavier traffic during the holidays, and the like.

Let's use a periodogram to see what seasons are present in our store sales data. To get an idea for all product families and stores, we will work with their average.

In [None]:
# Mean sales per date over all stores and families, restricted to 2017.
_daily_mean_sales = store_sales.loc[:, ['sales']].groupby('date').mean()
average_sales = _daily_mean_sales.squeeze().loc['2017']

# Periodogram of the averaged series to reveal the dominant seasonal frequencies.
plot_periodogram(average_sales);

Based on this periodogram we can see strong seasonality monthly, biweekly, weekly, and semiweekly. Below we will use fourier features to model longer seasons (multiple weeks) and one-hot features to model shorter seasons (weekly). I think by modeling weekly seasons that will take care of biweekly and semiweekly features as well.

In [None]:
# Target in wide form over the full history.
y = store_sales.unstack(['store_nbr', 'family'])
# Fourier pairs of order 4 at monthly frequency, for the longer (multi-week) seasons.
fourier = CalendarFourier(order = 4, freq = 'M')
# Constant + linear trend + seasonal indicators + the Fourier terms above.
# `dp` is reused later in the notebook to rebuild these features.
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    additional_terms=[fourier],
    seasonal = True,
    drop=True,
)
X = dp.in_sample()
X

Below you can see a glimpse at the fourier features that we created. We created these features based on monthly seasonality with a variety of 4 subdivisions of this season. Also we created one-hot encodings of the days of the week.

In [None]:
# One panel per Fourier pair. The pairs are taken by position: columns 8-9, 10-11,
# 12-13 and 14-15 of X.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True, sharey=True)
for _k, _axis in enumerate((ax1, ax2, ax3, ax4), start=1):
    _first = 8 + 2 * (_k - 1)
    X[X.columns[_first:_first + 2]].loc['2017'].plot(ax=_axis)
    _axis.set_title(f'Fourier Features {_k}');

Let's look at the output when we try to use a linear model to model this data's seasonality with the features we created.

In [None]:
# Preview the 2017 slice of the wide target that the next cell fits on.
y.loc['2017']

In [None]:
# Restrict both target and features to 2017 before fitting the seasonal model.
y, X = y.loc['2017'], X.loc['2017']
model = LinearRegression()
model.fit(X, y)
y_pred = pd.DataFrame(
    model.predict(X),
    index=X.index,
    columns=y.columns,
)

# Compare observed and fitted values for one example series.
STORE_NBR, FAMILY = '3', 'AUTOMOTIVE'  # store numbers run 1 - 54
_column = ('sales', STORE_NBR, FAMILY)
ax = y.loc['2017'].loc(axis=1)[_column].plot(**plot_params)
ax = y_pred.loc['2017'].loc(axis=1)[_column].plot(ax=ax)
ax.set_title(f'{FAMILY} Sales at Store {STORE_NBR}');

To see how well we model seasonality we will subtract the model's prediction from the ground truth values, giving the deseasoned values.

In [None]:
# Residual left after subtracting the seasonal model's fit from the observed values.
y_deseason = y - y_pred

# Average the 'sales' residuals over all stores and families for each date.
average_sales_deseason = (
    y_deseason.stack(['store_nbr', 'family']).loc[:, ['sales']]
    .groupby('date').mean()
    .squeeze()
    .loc['2017']
)

ax = average_sales_deseason.plot(**plot_params)
ax.set_title('Average of Deseasoned Data')

It looks like there is still a lot of movement, so we can look at a periodogram to see whether this is just a result of cycles in the market or seasonality that wasn't fully accounted for.

In [None]:
# Periodograms of the raw average (top) and the deseasoned average (bottom) on shared
# axes, so any remaining seasonal peaks are directly comparable.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(10, 7))
ax1 = plot_periodogram(average_sales, ax=ax1)
ax1.set_title("Product Sales Frequency Components")
ax2 = plot_periodogram(average_sales_deseason, ax=ax2);
ax2.set_title("Deseasonalized");

Based on a periodogram of the deseasoned values it looks like we did well to remove seasonality and the little variance left is likely noise or cycles.

**Holidays**

Let's look at how holidays impact sales.

In [None]:
# National and regional holidays in the training set
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc['2017':'2017-08-15', ['description']]
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)

display(holidays)

A list of holidays in Ecuador was supplied.

In [None]:
# Deseasoned average sales with a marker on every holiday date.
ax = average_sales_deseason.plot(**plot_params)
# NOTE(review): `plt.plot_date` is reported as deprecated in recent Matplotlib releases;
# confirm against the installed version before upgrading.
plt.plot_date(holidays.index, average_sales_deseason[holidays.index], color='C3')
ax.set_title('National and Regional Holidays');

We can see from this plot that some of these holidays may help our predictions by removing variance.

Let's add one-hot encoding features to account for these holidays.

In [None]:
# One indicator column per holiday description.
X_holidays = pd.get_dummies(holidays)

# Attach the indicators to the seasonal features by date; non-holiday dates get 0.
# NOTE(review): assumes each date appears at most once in `holidays` -- a repeated date
# would duplicate rows in the join. TODO confirm.
X2 = X.join(X_holidays, on='date').fillna(0.0)

Let's see how this impacts our predictions.

In [None]:
# Seasonal features with the holiday indicator columns appended.
X2

In [None]:
 model = LinearRegression().fit(X2, y)

y_pred = pd.DataFrame(model.predict(X2), index=X2.index, columns=y.columns)

avg = (
    y_pred.stack(['store_nbr', 'family']).loc[:, ['sales']]
    .groupby('date').mean()
    .squeeze()
    .loc['2017']
)

deseasoned_average = average_sales - avg

In [None]:
# Top: observed average sales with the seasonal + holiday fit overlaid.
# Bottom: the remainder after subtracting that fit.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(10, 7))
ax1 = average_sales.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold", ax=ax1)
ax1 = avg.plot(ax=ax1, label="Seasonal")
ax2 = deseasoned_average.plot(title="Deseasoned Sales", ylabel="items sold", ax=ax2)

That looks great! From the chart of deseasoned sales, we can see that adding holiday data reduced a lot of the noise in the sales data.

# Cycles

Now let's see if we can capture the rest of the noise present in our data by modeling market cycles.

To judge how well we are doing this, we will look at the column with the largest standard deviation (or variance), which is the GROCERY I sales column for store 46.

In [None]:
# Residuals of every series after the seasonal + holiday fit.
deseasoned = y - y_pred
# Label of the column whose *observed* values (y, not the residuals) have the largest
# standard deviation; y and deseasoned share the same column order.
deseasoned.columns[y.std().argmax()]

In [None]:
# Deseasoned sales of GROCERY I at store 46, the series used to study cycles below.
cycle_col = deseasoned.loc(axis=1)['sales', '46', 'GROCERY I']

In [None]:
# Record the series under study in the shared constants; the title below is hard-coded
# and does not read them.
STORE_NBR = '46'  # 1 - 54
FAMILY = 'GROCERY I'
ax = cycle_col.plot(**plot_params)
ax.set_title('Deseasoned Grocery Sales at Store 46');

Below we set up a moving average over the non-deseasoned data that smooths over weekly seasons, but preserves cycles in the data.

In [None]:
# Centered 7-day rolling mean of the observed (not deseasoned) series; the first and
# last few days have no full window and are therefore NaN.
y_ma = y.loc(axis=1)['sales', '46', 'GROCERY I'].rolling(window=7, center=True).mean()

ax = y_ma.plot()
ax.set_title("Seven-Day Moving Average");

Seeing how the moving average looks a lot like the deseasoned data plot makes a case for cyclical movement.

In [None]:
# Partial autocorrelation and lag scatter plots of the deseasoned series for lags 1-8,
# used to pick which lags to add as features.
plot_pacf(cycle_col, lags=8);
plot_lags(cycle_col, lags=8, nrows=2);

Based on the partial autocorrelation, it looks like lags 1 and 4 may be useful. Also, we see some potentially useful non-linear results from our lag plots, especially 1 and 4.

On promotion data refers to the number of items that the stores put on promotion in each product family. This may supply useful lead features as the store decides when to put items on promotion.

In [None]:
# 2017 on-promotion counts for the same series (GROCERY I at store 46).
promo = store_sales.unstack(['store_nbr', 'family']).loc(axis=1)['onpromotion', '46', 'GROCERY I'].loc['2017']

In [None]:
# Scatter the deseasoned sales against promotion values at 3 lags and 3 leads.
plot_lags(x=promo, y=cycle_col, lags=3, leads=3, nrows=1);

Lead 2 and Lag 3 both look like they could lend useful insights into our data.

In [None]:
# Unstack once and take both the sales target and the promotion counts from it.
_wide = store_sales.unstack(['store_nbr', 'family'])
y = _wide.loc['2017', 'sales']
all_promotion = _wide.loc(axis=1)['onpromotion'].loc['2017']

# Sales lags plus promotion lags, current values and leads.
X_lags = make_lags(y, lags=4)
_promo_parts = [
    make_lags(all_promotion, lags=3),
    all_promotion,
    make_leads(all_promotion, leads=2),
]
X_promo = pd.concat(_promo_parts, axis=1)

# Combine trend/seasonality/holiday features with the lag and lead features, dropping
# the rows made incomplete by shifting, then keep only dates present in both X and y.
X = pd.concat([X2, X_lags, X_promo], axis=1).dropna()
y, X = y.align(X, join='inner', axis = 0)
X

In [None]:
# Chronological split: the last 30 rows are held out (no shuffling for time series).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=30, shuffle=False)

# X already contains a constant column, so the model's own intercept is disabled.
model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
# Predictions are clipped at zero because the log-based metric needs non-negative values.
y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns = y_train.columns).clip(0.0)
y_pred = pd.DataFrame(model.predict(X_valid), index=X_valid.index, columns = y_valid.columns).clip(0.0)
rmsle_train = mean_squared_log_error(y_train, y_fit) ** 0.5
rmsle_valid = mean_squared_log_error(y_valid, y_pred) ** 0.5
print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')
# Per-date averages over all stores and families, for plotting.
y_avg = (y.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
y_fit_avg = (y_fit.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
y_pred_avg = (y_pred.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

In [None]:
# Observed average sales with the training fit and the held-out forecast overlaid.
ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = y_fit_avg.plot(ax=ax, label="Fitted", color='C0')
ax = y_pred_avg.plot(ax=ax, label="Forecast", color='C3')
ax.legend();

0.74409 (root mean squared log error) is pretty good! Visually it looks like a simple linear regression model does a pretty good job of capturing sales trends.

# Error

# Hybrid Modeling

In [None]:
class BoostedHybrid:
    """Container for a two-stage model.

    Holds the two estimators and the target column labels recorded during fitting.
    The `fit` and `predict` methods are attached to the class in later cells.
    """

    def __init__(self, model_1, model_2):
        # Column labels of the training target; filled in by the fit method.
        self.y_columns = None
        self.model_1, self.model_2 = model_1, model_2


In [None]:
def fit(self, X_1, X_2, y):
    """Fit model_1 on (X_1, y), then model_2 on X_2 against model_1's residuals.

    Parameters
    ----------
    X_1 : DataFrame of features for the first model, indexed like `y`.
    X_2 : features for the second model, passed through unchanged; its rows must be
        in the same order as the stacked residuals, since no alignment is done here.
    y : wide DataFrame whose columns carry 'family' and 'store_nbr' levels.

    Side effects: stores `y_columns`, `y_fit` and `y_resid` on the instance.
    """
    first_stage = self.model_1
    first_stage.fit(X_1, y)

    # In-sample fit of the first model, labelled like the target.
    y_fit = pd.DataFrame(first_stage.predict(X_1), index=X_1.index, columns=y.columns)

    # Residuals, reshaped from wide to long for the second model.
    y_resid = (y - y_fit).stack(['family', 'store_nbr']).squeeze()

    self.model_2.fit(X_2, y_resid)

    # Column labels are needed by predict to rebuild the wide frame.
    self.y_columns = y.columns
    # Kept on the instance for inspection.
    self.y_fit = y_fit
    self.y_resid = y_resid


# Attach the function above to BoostedHybrid as its `fit` method.
BoostedHybrid.fit = fit


In [None]:
def predict(self, X_1, X_2):
    """Return model_1's prediction plus model_2's correction, in wide form.

    Parameters
    ----------
    X_1 : DataFrame of features for the first model; its index becomes the row index.
    X_2 : features for the second model; its predictions are added positionally to
        the stacked first-stage predictions, so row order must match.

    Returns a DataFrame with 'family' and 'store_nbr' moved back into the columns.
    """
    long_levels = ['family', 'store_nbr']

    first_stage = pd.DataFrame(
        self.model_1.predict(X_1), index=X_1.index, columns=self.y_columns
    )
    # wide to long, so the second model's output can be added row by row
    y_pred = first_stage.stack(long_levels).squeeze()
    y_pred = y_pred + self.model_2.predict(X_2)

    # long to wide
    return y_pred.unstack(long_levels)


# Attach the function above to BoostedHybrid as its `predict` method.
BoostedHybrid.predict = predict

In [None]:

# Features for the second (boosted) model: the 2017 on-promotion counts.
X_2 = store_sales.unstack(['store_nbr', 'family']).loc['2017', 'onpromotion']  # onpromotion feature
# Keep only the dates that are also present in the target.
# (A stray `X_2.stack(...)` whose result was discarded has been removed here.)
X_2, y = X_2.align(y, join='inner', axis = 0)
# wide to long, in the same level order BoostedHybrid.fit uses for the residuals
X_2 = X_2.stack(['family', 'store_nbr']).squeeze()
# Label encoding for 'family'
le = LabelEncoder()  # from sklearn.preprocessing
X_2 = X_2.reset_index('family')
X_2['family'] = le.fit_transform(X_2['family'])
X_2

In [None]:
# Linear regression for the first stage, gradient-boosted trees for the residuals.
model = BoostedHybrid(LinearRegression(), XGBRegressor())

# Fit and predict on the same data, so these are in-sample (training) predictions.
model.fit(X, X_2, y)
y_pred = model.predict(X, X_2)

# Clip at zero so the log-based metric is defined.
y_pred = y_pred.clip(0.0)
#converting back to the right order
# (predict returns columns as (family, store_nbr); this restores (store_nbr, family))
y_pred = y_pred.stack(['store_nbr', 'family']).unstack(['store_nbr', 'family'])

In [None]:
# Clipped in-sample predictions of the hybrid model, in wide form.
y_pred

In [None]:
# Training RMSLE of the hybrid model (the predictions were made on the fitting data).
# NOTE(review): the metric compares values by position, so this presumes y_pred's
# columns are in the same order as y's after the re-ordering above -- TODO confirm.
rmsle_train = mean_squared_log_error(y, y_pred) ** 0.5
rmsle_train

In [None]:
# The clipped, re-ordered hybrid predictions computed above are already the in-sample
# fit, so reuse them. The original re-ran `model.predict` and wrapped the returned
# DataFrame in `pd.DataFrame(..., columns=y.columns)`, which selects by label rather
# than relabelling, and the result was never used.
y_fit = y_pred
# Per-date averages over all stores and families, for plotting.
y_avg = (y.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
y_fit_avg = (y_fit.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

In [None]:
# Observed average sales with the hybrid model's in-sample fit overlaid.
ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = y_fit_avg.plot(ax=ax, label="Fitted", color='C0')
ax.legend();

# Preparing to Forecast

Submission to the competition requires a 16-step forecast with a one-step lead time (16 days into the future starting 1 day after the last date we have available). So, instead of training on all of the data that we have as we have been doing, we will restructure our data to be multistep to account for this forecast.

In [None]:
# Rebuild the deterministic features from `dp`, the process defined in the seasonality
# section (this cell depends on that earlier cell having run).
X = dp.in_sample()
# Same holiday selection as in the Holidays section.
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc['2017':'2017-08-15', ['description']]
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)

X_holidays = pd.get_dummies(holidays)
X = X.join(X_holidays, on='date').fillna(0.0)

# 2017 sales target and promotion counts in wide form.
y = store_sales.unstack(['store_nbr', 'family']).loc['2017', 'sales']
all_promotion = store_sales.unstack(['store_nbr', 'family']).loc(axis=1)['onpromotion'].loc['2017']
X_lags = make_lags(y, lags=4)

X_promo = pd.concat([
    make_lags(all_promotion, lags=3),
    all_promotion,
    make_leads(all_promotion, leads=1),
], axis=1)

#putting together on promotion lag and lead data, seasonality, trends, holidays, and sales lags
# dropna removes the rows left incomplete by the shifts (and, since X_lags/X_promo only
# cover 2017, every date outside 2017).
X_whole = pd.concat([X, X_lags, X_promo], axis=1).dropna()
# Un-shifted 2017 sales, kept for plotting against the forecasts.
y_whole = store_sales.unstack(['store_nbr', 'family']).loc['2017', 'sales']
# 16-column-per-series multistep target; rows without a full 16-step horizon are dropped.
y = make_multistep_target(y, steps=16).dropna()
# Keep only the dates present in both the target and the features.
y, X = y.align(X_whole, join='inner', axis = 0)
# Feature rows for August 2017, used later as forecast origins.
X_fore = X_whole.loc['2017-08']
y

In [None]:
# Feature rows for the August 2017 forecast origins.
X_fore

In [None]:
# Chronological split holding out the last 16 origins.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=16, shuffle=False)
# X already contains a constant column, so the model's own intercept is disabled.
linear_model = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# Predictions are clipped at zero so the log-based metric is defined.
linear_y_fit = pd.DataFrame(linear_model.predict(X_train), index=X_train.index, columns = y_train.columns).clip(0.0)
linear_y_pred = pd.DataFrame(linear_model.predict(X_valid), index=X_valid.index, columns = y_valid.columns).clip(0.0)

In [None]:
# Multistep target aligned with the features.
y

In [None]:
# Step labels in horizon order. Sorting these labels as strings would put
# 'y_step_10' before 'y_step_2', so the order is enforced explicitly wherever an
# operation may have re-sorted them.
step_cols = [f'y_step_{i}' for i in range(1, 17)]

y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

# Per-origin averages over all stores and families, columns in horizon order.
linear_y_fit_avg = (
    linear_y_fit.stack(['store_nbr', 'family']).groupby('date').mean().squeeze()
).reindex(columns=step_cols)
linear_y_pred_avg = (
    linear_y_pred.stack(['store_nbr', 'family']).groupby('date').mean().squeeze()
).reindex(columns=step_cols)

#create the forecasted values:
# the 16-step forecast made from the 2017-07-31 origin, averaged over stores/families
check = linear_y_pred.loc['2017-07-31']
# NOTE(review): depending on the pandas version this in-place rename may also rename
# the column levels of the frame `check` was sliced from (the index object can be
# shared). Later cells group similar slices by 'date', so the line is kept as is.
check.index.names=['date', 'store_nbr', 'family']
# groupby may return the step labels sorted as strings; restore horizon order before
# the dates are assigned positionally below.
check = check.groupby('date').mean().reindex(step_cols)
check.index = y_whole.loc['2017-07-31':'2017-08-15'].index

In [None]:
# RMSLE on the training origins, on the 16 held-out origins, and on the single
# forecast made from the 2017-07-31 origin.
rmsle_train = mean_squared_log_error(y_train, linear_y_fit) ** 0.5
rmsle_valid = mean_squared_log_error(y_valid, linear_y_pred) ** 0.5
rmsle_forecast= mean_squared_log_error(y_valid.loc['2017-07-31'], linear_y_pred.loc['2017-07-31']) ** 0.5
print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')
print(f'Forecast RMSLE: {rmsle_forecast:.5f}')

In [None]:
# Observed average sales with multistep fits/forecasts drawn from every 16th origin,
# plus the forecast made from the 2017-07-31 origin.
ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = plot_multistep(linear_y_fit_avg, ax=ax, every=16)
ax = plot_multistep(linear_y_pred_avg, ax=ax, every=16)
ax = check.plot(ax=ax, label="Forecasted", color='C0')

In [None]:
# Refit the linear model on every available origin (no hold-out).
linear_model = LinearRegression(fit_intercept=False).fit(X, y)

# In-sample fit and predictions for the August origins, both clipped at zero.
linear_y_fit_full = pd.DataFrame(linear_model.predict(X), index=X.index, columns = y.columns).clip(0.0)
linear_y_forecast = pd.DataFrame(linear_model.predict(X_fore), index=X_fore.index, columns = y.columns).clip(0.0)

In [None]:
# Step labels in horizon order (string sorting would put 'y_step_10' before 'y_step_2').
step_cols = [f'y_step_{i}' for i in range(1, 17)]

y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

# Averages of the full-data fit and of the August forecasts. The original averaged the
# validation-split frames (`linear_y_fit` / `linear_y_pred`) here, so the plot labelled
# as the full fit actually showed the earlier hold-out model.
linear_y_fit_full_avg = (
    linear_y_fit_full.stack(['store_nbr', 'family']).groupby('date').mean().squeeze()
).reindex(columns=step_cols)
linear_y_forecast_avg = (
    linear_y_forecast.stack(['store_nbr', 'family']).groupby('date').mean().squeeze()
).reindex(columns=step_cols)

#create the forecasted values:
# the 16-step forecast made from the last available origin
linear_forecast = linear_y_forecast.loc['2017-08-15']
linear_forecast.index.names=['date', 'store_nbr', 'family']
# groupby may return the step labels sorted as strings; restore horizon order before
# the test dates are assigned positionally below.
linear_forecast_avg = linear_forecast.groupby('date').mean().reindex(step_cols)
linear_forecast_avg.index = df_test.unstack(['store_nbr', 'family']).index

In [None]:
# Training RMSLE of the linear model fitted on all origins.
rmsle_fit_full = mean_squared_log_error(y, linear_y_fit_full) ** 0.5
print(f'Full Training RMSLE: {rmsle_fit_full:.5f}')

In [None]:
# Observed average sales, the multistep fits/forecasts from every 16th origin, and the
# forecast re-indexed onto the test dates.
ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = plot_multistep(linear_y_fit_full_avg, ax=ax, every=16)
ax = plot_multistep(linear_y_forecast_avg, ax=ax, every=16)
ax = linear_forecast_avg.plot(ax=ax, label="Forecasted", color='C0')

We're going to make a dataset that is cleaned up so as not to use series information for a model like an xgb regressor.

In [None]:
# Long-format feature set: one row per (date, store_nbr, family), no calendar features.
all_promotion = store_sales.unstack(['store_nbr', 'family']).loc(axis=1)['onpromotion'].loc['2017']
#using lags
X_2_lags = make_lags(y_whole, lags=4)
X_2_lags = X_2_lags.stack(['store_nbr', 'family'])

#using promotional data
X_2_promo = pd.concat([
    make_lags(all_promotion, lags=3).stack(['store_nbr', 'family']),
    all_promotion.stack(['store_nbr', 'family']),
    make_leads(all_promotion, leads=1).stack(['store_nbr', 'family']),
], axis=1)

#putting them both together
# dropna removes the rows left incomplete by the shifts
X_2_whole = pd.concat([X_2_lags, X_2_promo], axis=1).dropna()

#label encoding the family column
le = LabelEncoder()
X_2_whole = (X_2_whole
    .reset_index('family')  # convert index to column
    .assign(family=lambda x: le.fit_transform(x.family)))

#stacking the y value for the benefit of a not linear model like XGBoost
y_2 = y.stack(['store_nbr', 'family'])
#fixing the order
# (restores horizon order for the step columns after stacking)
y_2 = y_2.reindex(columns=['y_step_1', 'y_step_2', 'y_step_3',
       'y_step_4', 'y_step_5', 'y_step_6', 'y_step_7', 'y_step_8', 'y_step_9', 'y_step_10', 'y_step_11', 'y_step_12', 'y_step_13',
       'y_step_14', 'y_step_15', 'y_step_16'])
# Training rows run through 2017-07-31; the August rows are kept as forecast origins.
# NOTE(review): X_2 and y_2 are later passed to a model together, which presumes they
# have the same rows in the same order -- TODO confirm.
X_2 = X_2_whole.loc['2017':'2017-07-31']
X_2_fore = X_2_whole.loc['2017-08':'2017-08-15']

In [None]:
# Long-format multistep target: one row per (date, store_nbr, family).
y_2

In [None]:
# Long-format training features through 2017-07-31.
X_2

In [None]:
# Chain of XGBoost regressors, one per forecast step.
DirRec_xgboost = RegressorChain(base_estimator=XGBRegressor())
DirRec_xgboost.fit(X_2, y_2)
# In-sample predictions, labelled like the target and clipped at zero.
y_2_fit = pd.DataFrame(
   DirRec_xgboost.predict(X_2),
   index=y_2.index,
   columns=y_2.columns,
).clip(0.0)
# Predictions for the August origins. The index is borrowed from the stacked sales
# frame and the columns from a slice of y_2.
# NOTE(review): this presumes X_2_fore has the same rows, in the same order, as the
# stacked August sales -- TODO confirm.
y_2_pred = pd.DataFrame(
   DirRec_xgboost.predict(X_2_fore),
    index=y_whole.stack(['store_nbr', 'family']).loc['2017-08'].index,
   columns=y_2.loc['2017-07-15':'2017-07-31'].columns,
).clip(0.0)

In [None]:
# In-sample predictions of the XGBoost chain.
y_2_fit

In [None]:
# XGBoost chain predictions for the August origins.
y_2_pred

In [None]:
# Per-date averages over all stores and families.
y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
y_2_fit_avg = (y_2_fit.groupby('date').mean().squeeze())
y_2_pred_avg = (y_2_pred.groupby('date').mean().squeeze())

#create the forecasted values:
# rows of the prediction frame for the 2017-08-15 origin
y_2_forecast = y_2_pred.loc['2017-08-15']
# average 16-step forecast from that origin, re-indexed onto the test dates
y_2_forecast_avg = y_2_pred.groupby('date').mean()
y_2_forecast_avg = y_2_forecast_avg.loc['2017-08-15']
y_2_forecast_avg.index = df_test.unstack(['store_nbr', 'family']).index
y_2_forecast_avg

In [None]:
# Training RMSLE of the XGBoost chain. The original recomputed the linear model's
# score (`y` vs `linear_y_fit_full`) in this section, so the printed number did not
# describe the model being evaluated.
rmsle_fit_full = mean_squared_log_error(y_2, y_2_fit) ** 0.5
print(f'Full Training RMSLE: {rmsle_fit_full:.5f}')

In [None]:
# NOTE: this overwrites `y_whole` with a version truncated at 2017-07-31; every later
# cell that reads `y_whole` sees the truncated series.
y_whole = store_sales.unstack(['store_nbr', 'family']).loc['2017':'2017-07-31', 'sales']
y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

# Observed average sales with the XGBoost chain's fits and forecasts overlaid.
ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = plot_multistep(y_2_fit_avg, ax=ax, every=16)
ax = plot_multistep(y_2_pred_avg, ax=ax, every=16)
ax = y_2_forecast_avg.plot(ax=ax, label="Forecast", color='C3')

Not that great at predicting. Let's use our hybridized model!

In [None]:
#splitting data for linear model
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=16, shuffle=False)
#splitting data for XGBRegressor
X_2_train = X_2.loc['2017':'2017-07-15']
X_2_valid = X_2.loc['2017-07-16':'2017-07-31']

In [None]:
# Long-format training features for the second stage of the hybrid model.
X_2_train

In [None]:
# Hybrid model: linear regression first, then an XGBoost regressor chain on the residuals.
hybrid_forecast_model = BoostedHybrid(LinearRegression(), RegressorChain(base_estimator=XGBRegressor()))


hybrid_forecast_model.fit(X_train, X_2_train, y_train)
hybrid_forecast_fit = hybrid_forecast_model.predict(X_train, X_2_train)
hybrid_forecast_pred = hybrid_forecast_model.predict(X_valid, X_2_valid)

# Clip at zero so the log-based metric is defined.
hybrid_forecast_fit = hybrid_forecast_fit.clip(0.0)
hybrid_forecast_pred = hybrid_forecast_pred.clip(0.0)

In [None]:
# Clipped hybrid predictions for the held-out origins.
hybrid_forecast_pred

In [None]:
#reorder the columns based on the proper order of steps
hybrid_forecast_fit = hybrid_forecast_fit.stack(['store_nbr', 'family']).reindex(columns=['y_step_1', 'y_step_2', 'y_step_3',
       'y_step_4', 'y_step_5', 'y_step_6', 'y_step_7', 'y_step_8', 'y_step_9', 'y_step_10', 'y_step_11', 'y_step_12', 'y_step_13',
       'y_step_14', 'y_step_15', 'y_step_16'])
hybrid_forecast_fit = hybrid_forecast_fit.unstack(['store_nbr', 'family'])

hybrid_forecast_pred = hybrid_forecast_pred.stack(['store_nbr', 'family']).reindex(columns=['y_step_1', 'y_step_2', 'y_step_3',
       'y_step_4', 'y_step_5', 'y_step_6', 'y_step_7', 'y_step_8', 'y_step_9', 'y_step_10', 'y_step_11', 'y_step_12', 'y_step_13',
       'y_step_14', 'y_step_15', 'y_step_16'])
hybrid_forecast_pred = hybrid_forecast_pred.unstack(['store_nbr', 'family'])

y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
hybrid_forecast_fit_avg = (hybrid_forecast_fit.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
hybrid_forecast_pred_avg = (hybrid_forecast_pred.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())


#create the forecasted values:
check = hybrid_forecast_pred.loc['2017-07-31']
check = check.groupby('date').mean()
check.index = store_sales.unstack(['store_nbr', 'family']).loc['2017', 'sales'].loc['2017-07-31':'2017-08-15'].index

In [None]:
# RMSLE of the hybrid model on the training and held-out origins.
# NOTE(review): the metric compares values by position, so this presumes the
# re-ordered prediction columns line up with y_train / y_valid -- TODO confirm.
rmsle_train = mean_squared_log_error(y_train, hybrid_forecast_fit) ** 0.5
rmsle_valid = mean_squared_log_error(y_valid, hybrid_forecast_pred) ** 0.5
# Score of the single forecast made from the 2017-07-31 origin (computed, not printed).
rmsle_forecast= mean_squared_log_error(y_valid.loc['2017-07-31'], hybrid_forecast_pred.loc['2017-07-31']) ** 0.5
print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')
#print(f'Forecast RMSLE: {rmsle_forecast:.5f}')

In [None]:
# Observed average sales with the hybrid model's multistep fits/forecasts and the
# forecast made from the 2017-07-31 origin.
ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = plot_multistep(hybrid_forecast_fit_avg, ax=ax, every=16)
ax = plot_multistep(hybrid_forecast_pred_avg, ax=ax, every=16)
ax = check.plot(ax=ax, label="Forecasted", color='C0')

Looks like the new model performs pretty well!

In [None]:
# Fresh hybrid model for the final forecast.
hybrid_forecast_model = BoostedHybrid(LinearRegression(), RegressorChain(base_estimator=XGBRegressor()))

#fit the model on the full data
hybrid_forecast_model.fit(X, X_2, y)
# Predictions for the August origins, and the in-sample fit.
hybrid_forecast = hybrid_forecast_model.predict(X_fore, X_2_fore)
hybrid_forecast_full_fit = hybrid_forecast_model.predict(X, X_2)


# Clip at zero so the log-based metric is defined.
hybrid_forecast = hybrid_forecast.clip(0.0)
hybrid_forecast_full_fit = hybrid_forecast_full_fit.clip(0.0)

In [None]:
#reorder the columns based on the proper order of steps
hybrid_forecast_full_fit = hybrid_forecast_full_fit.stack(['store_nbr', 'family']).reindex(columns=['y_step_1', 'y_step_2', 'y_step_3',
       'y_step_4', 'y_step_5', 'y_step_6', 'y_step_7', 'y_step_8', 'y_step_9', 'y_step_10', 'y_step_11', 'y_step_12', 'y_step_13',
       'y_step_14', 'y_step_15', 'y_step_16'])
hybrid_forecast_full_fit = hybrid_forecast_full_fit.unstack(['store_nbr', 'family'])

hybrid_forecast = hybrid_forecast.stack(['store_nbr', 'family']).reindex(columns=['y_step_1', 'y_step_2', 'y_step_3',
       'y_step_4', 'y_step_5', 'y_step_6', 'y_step_7', 'y_step_8', 'y_step_9', 'y_step_10', 'y_step_11', 'y_step_12', 'y_step_13',
       'y_step_14', 'y_step_15', 'y_step_16'])
hybrid_forecast = hybrid_forecast.unstack(['store_nbr', 'family'])

y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
hybrid_forecast_full_fit_avg = (hybrid_forecast_full_fit.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
hybrid_forecast_avg = (hybrid_forecast.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())


#create the forecasted values:
forecast = hybrid_forecast.loc['2017-08-15']
forecast.index.names=['date', 'store_nbr', 'family']
forecast_avg = check.groupby('date').mean()
forecast_avg.index = df_test.unstack(['store_nbr', 'family']).index

In [None]:
# Training RMSLE of the hybrid model fitted on all origins.
# NOTE(review): compared by position; presumes the re-ordered fit columns line up
# with y -- TODO confirm.
rmsle_train = mean_squared_log_error(y, hybrid_forecast_full_fit) ** 0.5
print(f'Full Training RMSLE: {rmsle_train:.5f}')

In [None]:
# Observed average sales, the hybrid model's multistep fits/forecasts, and the averaged
# forecast on the test dates.
ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = plot_multistep(hybrid_forecast_full_fit_avg, ax=ax, every=16)
ax = plot_multistep(hybrid_forecast_avg, ax=ax, every=16)
ax = forecast_avg.plot(ax=ax, label="Forecasted", color='C0')

Here's what our forecast will look like.

# Submission to Competition

Now that we have our model ready we can submit to the store sales forecasting competition! Below is code getting our forecast in the right format so that it can be graded for the competition.

In [None]:
# 16-step hybrid forecast from the 2017-08-15 origin, indexed by (step, store_nbr, family).
forecast

In [None]:
# Reorder the forecast so the step level (named 'date') varies fastest within each
# (store_nbr, family) pair, matching the layout of the sorted test index.
# - The original `unstack('date', 'family')` passed 'family' as `fill_value`, not as
#   a second level; only the 'date' level is meant to be unstacked.
# - The step columns are put in horizon order explicitly, because unstacking may sort
#   the labels as strings ('y_step_10' before 'y_step_2') and a later cell assigns
#   the test index positionally.
step_cols = [f'y_step_{i}' for i in range(1, 17)]
forecast_formatted = (
    forecast
    .unstack('date')
    .loc[:, step_cols]
    .stack()
)
forecast_formatted

In [None]:
# Test set, indexed by (store_nbr, family, date); its index is the target layout.
df_test

In [None]:
# Take over the test set's index positionally, then wrap the values in a one-column
# frame named 'sales'. This relies on the forecast rows already being in the same
# order as the test rows.
forecast_formatted.index = df_test.index
forecast_formatted = pd.DataFrame(
    forecast_formatted,
    columns=['sales'],
)
forecast_formatted

In [None]:
# Attach the test row ids and keep only the two columns written to the submission file.
y_submit = forecast_formatted.join(df_test.id).reindex(columns=['id', 'sales'])
y_submit

In [None]:
# Write the submission file without the index.
y_submit.to_csv('submission.csv', index=False)
