In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
sns.set_style('ticks')
import plotly.express as px
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
# Deterministic Process is used to construct basic configurations such as a constant, a time trend of any order, and either a seasonal or a Fourier transformation.
# CalendarFourier generates sine/cosine (Fourier) terms tied to a calendar frequency. They are used as regressors to model smooth seasonal patterns in Calendar data.
from statsmodels.graphics.tsaplots import plot_pacf
# To plot the Partial Autocorrelation Function (PACF). I want to plot PACF to be specific with the regression model of Time Series Data.
from statsmodels.tsa.stattools import adfuller
# The Augmented Dickey-Fuller test (adfuller) is the statistical test used to determine whether a Time Series Data is Stationary, by looking at the trend of the data.
from matplotlib import pyplot as plt, style
# 'seaborn-ticks' was renamed to 'seaborn-v0_8-ticks' in matplotlib 3.6 and the old name was removed in later releases.
# Use the new name when this matplotlib provides it and fall back to the legacy name on older installs.
style.use('seaborn-v0_8-ticks' if 'seaborn-v0_8-ticks' in style.available else 'seaborn-ticks')
from tqdm import tqdm
# tqdm is a library in Python which is used for creating Progress Meters/Bars.
# Progress Meters/Bars are used as a visual cue that is a reliable estimate of the execution time of my code.
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
# The TransformedTargetRegressor is useful for applying non-linear transformation to the Target in Regression.
# I decided to import ColumnTransformer because I'll be doing different transformations on different columns, and I want to avoid transforming each column separately then stitching them together; Column Transformer will do this work for me.
from sklearn.preprocessing import PowerTransformer
# PowerTransformer is used to make Features more Gaussian-like, which is useful for Heteroscedasticity (non-constant variance) issues.
# In other words, PowerTransformer changes the Distribution of the Features into more Gaussian-like to be able to perform Regression.
from sklearn.pipeline import make_pipeline
# To create a Pipeline that would do the needed transforms on the Numerical and Categorical Columns and combine several other steps that can be cross-validated together, while keeping the original dataset.
from sklearn.linear_model import Ridge, ARDRegression
# Ridge adds a tolerable Bias in exchange for the significant drop in Variance of the Model to prevent an overfitted Regression Line.
# Automatic Relevance Determination (ARD) Regression places a separate Gaussian prior on each Coefficient, which shrinks many of them towards 0. This means that the fitted Features are more stable as there is less variance.
from sklearn.neighbors import KNeighborsRegressor
# I imported KNeighborsRegressor to predict continuous values by taking the Mean of the nearest kneighbor.
from sklearn.svm import SVR
# Support Vector Regression can capture non-linearity in the data (through kernels) and predicts continuous values by fitting a line/surface that is defined by the Support Vectors.
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
# ExtraTreesRegressor implements a meta-estimator that fits randomized decision trees (extra trees) on various subsamples of the dataset then uses averages to improve accuracy and control over-fitting.
# GradientBoostingRegressor is used for tabular datasets to create a new model that can predict the Errors of prior models. It then adds the weaker models together and makes a final prediction with the least loss when adding new models.
from sklearn.ensemble import BaggingRegressor, VotingRegressor
# BaggingRegressor implements a meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction and reduce the variance by introducing randomization into its construction procedure, and then making an ensemble out of it.
# In contrast, VotingRegressor implements a meta-estimator that fits several base regressors, each on the whole dataset, then averages the individual predictions to form a final prediction.
%pip install -Uq upgini
from upgini import FeaturesEnricher, SearchKey, ModelTaskType
from upgini.metadata import RuntimeParameters, CVType
# Upgini is a simple feature search & enrichment library in Python. With Upgini, I spend less time for external data search and feature engineering, which it will do automatically.
# Feature Enrichment means combining existing Features in the dataset with Features received from external sources, a popular way to augment existing datasets and improve the quality of the conclusions.
# SearchKey is used to define search keys that are used to initiate search with the Upgini library. Search Keys are a set of columns that join external data sources and features.
# ModelTaskType is declared to use the appropriate task type for the Model.
# RuntimeParameters is imported to use optional parameters.
# CVType is the Cross-Validation Type parameter for Features Enricher.
import gc
gc.enable()
# To enable automatic garbage collection.
# Garbage collection helps in memory management by using the “stop-the-world” process where all execution stops while the garbage collector looks for and deletes objects to be collected.

import warnings
from warnings import filterwarnings, simplefilter
filterwarnings('ignore')
# filterwarnings is a function that, when used with (’ignore’), does not display the warning message.
simplefilter('ignore')
# simplefilter is a function that, when used with (’ignore’), allows me to use known-deprecated code without having to see the warning.
# But simplefilter does not suppress the warning for other code that might not be aware of its use of deprecated code.

In [None]:
# Load the training data, keeping only the columns used in this notebook.
# 'store_nbr' and 'family' are read as categoricals, 'date' becomes a daily Period,
# and the frame is indexed and sorted by (date, store_nbr, family).
train = (
    pd.read_csv(
        '../input/store-sales-time-series-forecasting/train.csv',
        usecols = ['date', 'store_nbr', 'family', 'sales'],
        dtype = {'store_nbr' : 'category', 'family' : 'category'},
        parse_dates = ['date'],
        infer_datetime_format = True,
    )
    .assign(date = lambda frame: frame.date.dt.to_period('D'))
    .set_index(['date', 'store_nbr', 'family'])
    .sort_index()
)
print(train.shape)
train.head(15)

In [None]:
# Load the test data, convert 'date' to a daily Period, index it like the training frame
# and keep the rows ordered by their submission 'id'.
test = (
    pd.read_csv(
        '../input/store-sales-time-series-forecasting/test.csv',
        parse_dates = ['date'],
        infer_datetime_format = True,
    )
    .assign(date = lambda frame: frame.date.dt.to_period('D'))
    .set_index(['date', 'store_nbr', 'family'])
    .sort_values('id')
)
print(test.shape)
test.head()

## Feature Engineering

In [None]:
# Daily calendar (one row per day, indexed by daily Period) spanning the training and forecast dates.
calendar = pd.DataFrame(index = pd.date_range('2013-01-01', '2017-08-31')).to_period('D')
oil = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv',
                  parse_dates = ['date'], infer_datetime_format = True,
                  index_col = 'date').to_period('D')
# 7-observation rolling mean of the oil price.
oil['avg_oil'] = oil['dcoilwtico'].rolling(7).mean()
# I joined the calendar with the averaged oil price; days without an oil record get NaN.
calendar = calendar.join(oil.avg_oil)
# Replace NaN with the last valid observed value.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a column selection is a
# chained in-place operation that no longer updates the parent DataFrame under pandas Copy-on-Write,
# and fillna(method=...) is deprecated in favour of ffill().
calendar['avg_oil'] = calendar['avg_oil'].ffill()
# Rows before the first valid averaged price cannot be forward-filled, so they are dropped.
calendar = calendar.dropna()

In [None]:
_ = plot_pacf(calendar.avg_oil, lags = 10)
# I plotted the partial autocorrelation of the averaged oil price up to lag 10 to select proper lags for the oil price features.
# A lag is the number of time steps separating an observation from an earlier observation of the same series.
# The Partial Autocorrelation Function (PACF) at lag k is the correlation between the series and its own k-step lag, after removing the effect of the shorter lags in between.

In [None]:
# Number of lagged oil-price features to create.
# I chose 3 lags based on the PACF plot: beyond the first lags the partial autocorrelation is close to zero ("white noise").
n_lags = 3
for lag in range(1, n_lags + 1):
    calendar[f'oil_lags{lag}'] = calendar['avg_oil'].shift(lag)
# Shifting leaves NaN in the first rows of the lag columns; rows containing NaN are dropped.
calendar = calendar.dropna()

In [None]:
# This is the data set containing the Holidays in Ecuador. I included this to know the spending behavior on National holidays.
# I only included National Holidays to prevent false positives (There might be non-National Holidays that weren't transferred but were mistakenly labeled as transferred).
# The dates are parsed into daily Periods, and duplicate dates are collapsed to a single row (first non-null value per column).
hol = (
    pd.read_csv(
        '../input/store-sales-time-series-forecasting/holidays_events.csv',
        parse_dates = ['date'],
        infer_datetime_format = True,
        index_col = 'date',
    )
    .to_period('D')
    .loc[lambda frame: frame.locale == 'National']
    .groupby(level = 'date')
    .first()
)
print(hol.shape)
hol.head(30)

In [None]:
# Build the working-day flag 'wd' and one-hot calendar features.
# The .loc assignments below overlap, so their order matters: later rules override earlier ones.
calendar = calendar.join(hol)
# Here, I joined the calendar and holiday datasets; days without a National holiday row get NaN in the holiday columns.
calendar['dofw'] = calendar.index.dayofweek
# This outputs the ordinal values of the day of the week, starting at 0 for Monday and ending at 6 for Sunday.
calendar['wd'] = 1
# 'wd' is the working-day flag; every day starts as a working day (1).
calendar.loc[calendar.dofw > 4, 'wd'] = 0
# Saturdays and Sundays (dofw 5 and 6) are marked as non-working days (0).
calendar.loc[calendar.type == 'Work Day', 'wd'] = 1.
# Dates whose holiday type is 'Work Day' are flagged as working days (1), overriding the weekend rule above.
# NOTE(review): 'Work Day' presumably marks make-up working days that may fall on a weekend - confirm against the dataset description.
calendar.loc[calendar.type == 'Transfer', 'wd'] = 0
# 'Transfer' contains the dates to which holidays were transferred, meaning it's not a work day.
# Since there is no work and a 'Transfer' day is essentially a holiday, I decided to have it represented as 0.
calendar.loc[calendar.type == 'Bridge', 'wd'] = 0
# 'Bridge' days are extra days added to a Holiday, so I used 0 to represent it as well.
calendar.loc[(calendar.type == 'Holiday') & (calendar.transferred == False), 'wd'] = 0
# If the Holiday is not transferred, then that means it is not a work day. Hence, it is represented by 0.
calendar.loc[(calendar.type == 'Holiday') & (calendar.transferred == True), 'wd'] = 1
# In this case however, since the Holiday has been tagged as transferred, it is treated as a normal day on which there is work - hence the value 1.
# Note that this rule runs last, so it also applies when the original date falls on a weekend.

calendar = pd.get_dummies(calendar, columns = ['dofw'], drop_first = True)
calendar = pd.get_dummies(calendar, columns = ['type'])
# I used One-hot Encoding to create Dummies for Nominal Data and transform them into Binary (0s and 1s) so that these values can be understood by the algorithm.
# For 'dofw' the first level (Monday) is dropped and acts as the baseline; for 'type' every level gets its own column.
calendar.drop(['locale', 'locale_name', 'description', 'transferred'], axis = 1, inplace = True)
# I dropped these columns because they are not used in the rest of my analysis.

print(calendar.shape)
calendar.head()

In [None]:
# Since Wages are paid every two weeks, on the 15th and on the last day of the month, I engineered a feature
# in which 1 marks the 15th and the last day of each month (when customers receive their wages) and 0 marks all other days.
# This is a time dependent feature.
calendar['wageday'] = (
    calendar.index.to_timestamp().is_month_end | (calendar.index.day == 15)
).astype('int64')

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it so that 'ax' really is the Axes and draw on it explicitly.
fig, ax = plt.subplots(1, 1, figsize = (20,4))
train.loc["2016-07-01":].filter(like = 'SCHOOL AND OFFICE SUPPLIES', axis=0).groupby(["date"]).sales.sum().plot(ax = ax, title = "SCHOOL AND OFFICE SUPPLIES")
plt.show()
# Here, I plotted the School and Office Supplies family (total daily sales from July 2016 onward) because I figured this is a time series with a regular pattern at the start of the School Season.
# This plot shows the peak sales of school and office supplies, which coincide with the school seasons: April and May, then August and September.

In [None]:
# Based on the plot above, I created the variable 'school_season' to represent the months of the School Season
# (April, May, August, September) as 1, indicating the spike, and every other month as 0.
school_season = [1 if day.month in [4, 5, 8, 9] else 0 for day in calendar.index]
calendar['school_season'] = school_season

## Zero Forecasting

> 

In [None]:
# It is a common practice of grocery stores that some of their store branches don't sell specific products or stop selling some products after some time.
# So I decided to do a zero forecast on (store, family) pairs whose sales sum to zero over their last 15 recorded rows,
# except for School and Office Supplies products, which sell on a seasonal basis.
# NOTE(review): the original narrative speaks of 14 consecutive days while the code uses tail(15) - confirm which window is intended.
c = (
    train.groupby(["store_nbr", "family"]).tail(15)
    .groupby(["store_nbr", "family"]).sales.sum()
    .reset_index()
    .loc[lambda totals: (totals.sales == 0) & (totals.family != "SCHOOL AND OFFICE SUPPLIES")]
    .drop("sales", axis = 1)
)
c.shape

In [None]:
# Remove from the Training Dataset every (store_nbr, family) pair listed in 'c' (an anti-join):
# the merge indicator marks matching rows as 'both', and only the non-matching rows are kept.
print("Shape of train before zero forecasting:", train.shape)
outer_join = pd.merge(train.reset_index(), c, how = 'outer', indicator = True)
train = (
    outer_join.loc[outer_join._merge != 'both']
    .drop(columns = '_merge')
    .set_index(['date', 'store_nbr', 'family'])
    .sort_index()
)
del outer_join
gc.collect()
print("Shape of train after zero forecasting:", train.shape)
# Here I compared the shape of the Training Dataset before and after removing the zero-sales series. The reduction in the number of rows is easily noticeable.
# I will save the products with zero sales into a separate dataframe, then append it to the Test Set during the submission phase of this Notebook.

In [None]:
# Here I saved the products with zero sales into a separate dataframe called 'zero_prediction':
# one row with sales = 0 per forecast day (16-31 August 2017) for every (store_nbr, family) pair in 'c'.
# I will use this dataframe later in the submission phase.
forecast_days = pd.date_range("2017-08-16", "2017-08-31").tolist()
zero_prediction = pd.concat([
    pd.DataFrame({
        "date": forecast_days,
        "store_nbr": store,
        "family": family,
        "sales": 0
    })
    for store, family in zip(c.store_nbr, c.family)
])
del c
gc.collect()
zero_prediction = (
    zero_prediction
    .assign(date = lambda frame: frame.date.dt.to_period('D'))
    .set_index(['date', 'store_nbr', 'family'])
)
zero_prediction.head()

In [None]:
# I want to have the data of all stores, so here I check the opening dates of the stores (the first date with positive total sales)
# and list the five most recent ones, to be sure that I start my analysis on a date when all stores are in operation.
# Since the latest store opened was 20 April 2017, I will start my forecasting on 21 April 2017.
a = (
    train.groupby(["date", "store_nbr"]).sum().reset_index()
    .loc[lambda daily: daily["sales"] > 0]
    .groupby("store_nbr")[["date"]].min()
    .sort_values(by = "date", ascending = False)
    .head(5)
    .rename(columns = {'date': 'open_date'})
)
a

## Training

In [None]:
# Here I unstacked the columns 'store_nbr' and 'family' (one column per store/family pair, one row per date)
# and kept the rows from 21 April 2017 onward to prepare my data for training.
y = train.unstack(['store_nbr', 'family']).loc["2017-04-21":]

# CalendarFourier produces sine/cosine terms for the given calendar frequency; here weekly ('W') with 3 sine/cosine pairs (order = 3).
# TODO: document why freq = 'W' and order = 3 were chosen.
fourier = CalendarFourier(freq = 'W', order = 3)

# Deterministic Process is used to construct basic configurations such as a constant, a time trend of any order, and either a seasonal or a Fourier component.
# Here: no constant, a linear trend (order = 1), no seasonal dummies, the Fourier terms above, and perfectly collinear terms dropped.
dp = DeterministicProcess(
    index = y.index,
    constant = False,
    order = 1,
    seasonal = False,
    additional_terms = [fourier],
    drop = True,
)

# .in_sample() returns the deterministic terms for the dates in y.index; the calendar features are joined on the date.
x = dp.in_sample().join(calendar).rename_axis("date")

# I used out_of_sample because the Test Data needs a prediction for the next 16 days, from 16.08 till 31.08.
xtest = dp.out_of_sample(steps = 16).join(calendar).rename_axis("date")

# These intermediate objects are no longer needed after this point.
del hol, calendar, dp, oil
_ = gc.collect()
# Invoking the garbage collector manually during the execution of a program is a good idea to handle memory being consumed by reference cycles.

In [None]:
# Spearman correlation between product families, computed on sales from 21 April 2017 onward.
a = train.loc["2017-04-21":].reset_index()
# 'ind' is a running row counter within each family; it is used as the pivot index so that each family becomes one column.
# NOTE(review): this aligns families by position, which assumes every family has the same (date, store) rows - confirm this still holds after the zero-forecast filtering.
a["ind"] = 1
a["ind"] = a.groupby("family").ind.cumsum().values
a = pd.pivot(a, index = "ind", columns = "family", values = "sales").corr(method="spearman")
# I used Spearman Rank Correlation to measure monotonic relationships between the sales of the product families.
# Boolean mask hiding the upper triangle (including the diagonal). It is built from the shape of the matrix only,
# not from a second correlation of the correlation matrix, so no cell is shown or hidden by accident.
mask = np.triu(np.ones_like(a, dtype=bool))
plt.figure(figsize=(20, 20))
sns.heatmap(a,
        annot=True,
        fmt='.1f',
        cmap='YlGnBu',
        square=True,
        mask=mask,
            # Here I used a mask to reduce redundancy in the Heatmap, since the correlation matrix is symmetric.
        linewidths=1,
            # 'linewidths' is an optional float parameter that sets the width of the lines that divide each cell.
        cbar=False)

plt.title("Sales Correlations for Product Families",fontsize = 24)
plt.show()
# I created this Heatmap to analyze the Sales Correlations for all 33 Products in the Dataset.
# It is apparent in this Heatmap that the Products Baby Care, Books, Home Appliances, Ladieswear, Lawn and Garden, and Pet Supplies aren't correlated with other products at all.
# There are other Products that have very low correlation with each other. Nonetheless, since the rest of the products are correlated, this Heatmap shows that this is a Multivariate Time Series problem.

In [None]:
# Each entry is one figure with two side-by-side panels; every panel is (product family, first date to plot).
panel_rows = [
    (("BABY CARE", "2016-08-01"), ("BOOKS", "2016-08-01")),
    (("HOME APPLIANCES", "2016-01-01"), ("LADIESWEAR", "2016-08-01")),
    (("LAWN AND GARDEN", "2016-08-01"), ("PET SUPPLIES", "2016-08-01")),
    (("SCHOOL AND OFFICE SUPPLIES", "2016-08-01"), ("LINGERIE", "2016-08-01")),
    (("GROCERY II", "2016-08-01"), ("HARDWARE", "2016-08-01")),
    (("LIQUOR,WINE,BEER", "2016-08-01"), ("MAGAZINES", "2016-08-01")),
]
for panel_row in panel_rows:
    fig, ax = plt.subplots(1,2,figsize = (20,4))
    for panel, (family, start) in zip(ax, panel_row):
        # Total daily sales of the family across all stores, from the start date onward.
        train.loc[start:].filter(like = family, axis=0).groupby(["date"]).sales.sum().plot(ax = panel, title = family)
    plt.show()
# Here, I plotted the Sales of products that aren't correlated with other products at all: Baby Care, Books, Home Appliances, Ladieswear, Lawn and Garden, Pet Supplies.
# I also plotted the Sales of School and Office Supplies products and the products with very low correlation with it: Lingerie, Grocery II, Hardware, Liquor, Wine, & Beer, and Magazines.
# I did this to check for Trends, Seasonality, and Anomalies for these low-correlated Products.
# The plots of Pet Supplies, Hardware, Ladieswear, Grocery II, Lingerie products show a monotonic relationship.
# The plot of School and Office Supplies products stands out as it looks like a Non-stationary Time Series (TS) Data. I need to verify this with the Augmented Dickey-Fuller Test.
# The plot of Home Appliances products also appears to be a Non-stationary TS, which must be verified using the Augmented Dickey-Fuller Test.
# The plot of Books products shows that Sales started sometime in October 2016, and has sharply declined since. The near-zero daily sales for this product might be a result of assortment decline.
# The plot of Lawn and Garden products shows an uptick in Sales starting in December 2016, which may be due to the introduction of these products in more store branches.
# The plot of Liquor, Wine, Beer products shows a significant uptick in the last weeks of December 2016, peaking therein. In the first week of January 2017, its sales dropped significantly, which may reflect the demand for these products during the Holiday season.
# After running the Augmented Dickey-Fuller Test below, I discovered that the sales of products Books, Baby Care, and Lawn and Garden are Stationary, while the sales of the rest are not.

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean BABY CARE sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["BABY CARE"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean BOOKS sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["BOOKS"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean HOME APPLIANCES sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["HOME APPLIANCES"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean LADIESWEAR sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["LADIESWEAR"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean LAWN AND GARDEN sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["LAWN AND GARDEN"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean SCHOOL AND OFFICE SUPPLIES sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["SCHOOL AND OFFICE SUPPLIES"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean LINGERIE sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["LINGERIE"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean GROCERY II sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["GROCERY II"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean LIQUOR,WINE,BEER sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["LIQUOR,WINE,BEER"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean MAGAZINES sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["MAGAZINES"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Augmented Dickey-Fuller (ADF) Test for Stationarity on log1p of the mean HARDWARE sales across its store columns.
# Null hypothesis (Ho): the Time Series has a unit root, i.e. it is Non-Stationary.
# Note: y was already restricted to 2017-04-21 onward when it was built, so the "2016-08-01" lower bound does not add earlier data.
result = adfuller(
    y.loc["2016-08-01":, y.columns.get_level_values("family").isin(["HARDWARE"])]
    .mean(axis="columns")
    .pipe(np.log1p)
)
# The ADF Statistic is compared with the Critical Values: Ho is rejected when the statistic is below (more negative than) the chosen Critical Value.
print(f'ADF Statistic: {result[0]:f}')
# A small p-value (for example below 0.05) is evidence against Ho (Stationary); a large p-value means Ho cannot be rejected (Non-Stationary).
print(f'p-value: {result[1]:f}')
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value:.3f}')

# Decision at the 1% level.
print("Reject Ho - Time Series is Stationary"
      if result[0] < result[4]["1%"]
      else "Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
non_st_ts = ["SCHOOL AND OFFICE SUPPLIES","BOOKS"]
# Product families that are treated as non-stationary series with a low multivariate correlation; Tree-based Regressors are used for them.
# Here I use School and Office Supplies and Books products because only for these series do the Tree-based Regressors improve on the baseline Ridge.
# I used the GradientBoostingRegressor + ExtraTreesRegressor ensemble models here.
# NOTE(review): the notes under the family plots describe Books as Stationary, while this list treats it as non-stationary - confirm which is intended.

In [None]:
# Mean sales per product family, sorted from the best-selling family down, shown as a bar chart.
a = (
    train.groupby("family")["sales"]
    .mean()
    .sort_values(ascending = False)
    .reset_index()
)
px.bar(a, x = "sales", y = "family", color = "family", title = "Top Selling Product Families")
# Here I wanted to know the top selling Products, which are inelastic and have a strong intra-week seasonality.
# The inelasticity of the top selling Products is due to the fact that almost all of them are necessities.
# I decided to use K-nearest Neighbors and Bayesian Regressors for the top 15 Products, and separate the bottom selling Products.
# K-nearest Neighbors Regression predicts the numerical value by taking the mean of the k nearest neighbors. Deciding on the value of k is a trial and error process.

In [None]:
# The 18 product families that are handled separately as the low-selling group.
# NOTE(review): presumably picked from the bottom of the mean-sales bar chart - confirm the selection against it.
low_sales_ts = ["MAGAZINES","LAWN AND GARDEN","BABY CARE",
                "CELEBRATION","GROCERY II","HARDWARE","AUTOMOTIVE",
                "HOME AND KITCHEN I","HOME AND KITCHEN II",
                "HOME APPLIANCES","LINGERIE",
                "LADIESWEAR","PLAYERS AND ELECTRONICS",
                "PET SUPPLIES","BEAUTY","PREPARED FOODS",
                "HOME CARE","CLEANING"]

In [None]:
# First date of the modelling window; the features (x) and the targets (y) are both restricted to rows from this date onward.
sdate = '2017-04-21'
x, y = x.loc[sdate:], y.loc[sdate:]

In [None]:
# Upgini is a Python library used for simple feature search & enrichment.
enricher = FeaturesEnricher(
    # Here I defined the 'date' column as the search key that initiates the search for enriching features in the Upgini library.
    # I want to use this low-code library to boost my accuracy while saving time on feature engineering. The external Features from Upgini enrich my Dataset's features.
    search_keys={
        "date": SearchKey.DATE
    },
    country_code = "EC",
    # 'EC' stands for Ecuador
    cv = CVType.time_series,
)
# Here I passed the time series Cross Validation type to FeaturesEnricher.
# Cross Validation lets me estimate how well a model generalises to unseen data; a time series split keeps the validation data later in time than the training data.

In [None]:
# Maps a product family to the external feature columns found for it by the enricher.
enriched_ts_map = {}

# Targets for the feature search: the mean sales across all columns of one family, on a log1p
# scale (log(1 + x), the inverse of expm1) to bring the distribution closer to Gaussian.
family_level = y.columns.get_level_values("family")
y_fe_1 = np.log1p(y.loc[:, family_level.isin(["LIQUOR,WINE,BEER"])].mean(axis="columns"))
y_fe_2 = np.log1p(y.loc[:, family_level.isin(["CLEANING"])].mean(axis="columns"))

In [None]:
%%time

# Feature search for the LIQUOR,WINE,BEER target: fit the enricher on the training features and
# keep the input columns plus at most 3 external features, then restore "date" as the index.
# Author's notes: the added features are ranked by SHAP (SHapley Additive exPlanations) value.
# A higher SHAP value means a larger contribution of the feature to the prediction, but it does
# not by itself measure the quality/accuracy of that prediction.
X_enriched = enricher.fit_transform(
    x.copy().reset_index(),
    y_fe_1.reset_index(drop=True).values,
    calculate_metrics=True,
    keep_input=True,
    max_features=3,
)
X_enriched = X_enriched.set_index("date")

In [None]:
# Record only the columns that enrichment added (present in X_enriched but not in x).
enriched_ts_map["LIQUOR,WINE,BEER"] = list(set(X_enriched.columns).difference(x.columns))

In [None]:
# Apply the enricher fitted above to the test features (xtest), so that the test set carries the
# same external feature columns as the enriched training set.
X_test_enriched = enricher.transform(
    xtest.copy().reset_index(),
    keep_input=True,
    max_features=3,
)
X_test_enriched = X_test_enriched.set_index("date")

In [None]:
# Second feature search, this time for the CLEANING target, using the same process as for
# LIQUOR,WINE,BEER. It starts from the already enriched frames, so the first round's external
# features are kept as input columns.
X_enriched2 = enricher.fit_transform(
    X_enriched.reset_index(),
    y_fe_2.reset_index(drop=True).values,
    calculate_metrics=True,
    keep_input=True,
    max_features=3,
)
X_enriched2 = X_enriched2.set_index("date")

# Same transformation for the test features, with the enricher as re-fitted just above.
X_test_enriched2 = enricher.transform(
    X_test_enriched.reset_index(),
    keep_input=True,
    max_features=3,
)
X_test_enriched2 = X_test_enriched2.set_index("date")

In [None]:
# Record only the columns added by the second round (present in X_enriched2 but not in X_enriched).
enriched_ts_map["CLEANING"] = list(set(X_enriched2.columns).difference(X_enriched.columns))

In [None]:
print("Number of Features, Initial -> After Enrichment:",x.shape[1],"->",X_enriched2.shape[1])

# Internal features: the columns x had before enrichment.
int_features = set(x.columns)
# External features: every column added by any enrichment round, flattened into one list.
ext_features = [col for added in enriched_ts_map.values() for col in added]

# From here on x / xtest are the enriched frames; the intermediates are released to free memory.
x = X_enriched2
xtest = X_test_enriched2
del X_enriched, X_enriched2, X_test_enriched, X_test_enriched2
del y_fe_1, y_fe_2
_ = gc.collect()

# Author's notes for the next step: to capture joint sales, predictions of the other products are
# used as features. FMCG categories are strongly correlated (a customer buying product X often buys
# product Y too); the correlation check found most products correlated (25 out of 33).

In [None]:
# Baseline multi-output linear model.
# TransformedTargetRegressor fits the regressor on log1p(y) and maps predictions back with expm1.
lnr_reg = TransformedTargetRegressor(
    regressor=LinearRegression(fit_intercept=True, n_jobs=-1),
    func=np.log1p,
    inverse_func=np.expm1,
)

# Pipeline steps, applied in order:
#   1. ColumnTransformer: drop the external (enriched) features, pass every other column through.
#   2. PowerTransformer: reshape the remaining features towards a Gaussian distribution.
#   3. The log-target linear regression defined above.
drop_external = ColumnTransformer([("drop_f", "drop", ext_features)], remainder="passthrough")
lnr = make_pipeline(drop_external, PowerTransformer(), lnr_reg)
lnr.fit(x, y)

# In-sample and test predictions, laid out like y and floored at zero.
yfit_lnr = pd.DataFrame(lnr.predict(x), index=x.index, columns=y.columns)
yfit_lnr = yfit_lnr.clip(0.)
ypred_lnr = pd.DataFrame(lnr.predict(xtest), index=xtest.index, columns=y.columns)
ypred_lnr = ypred_lnr.clip(0.)

# Long-format view of the targets with the baseline fit alongside.
# These predictions are added to the train and test features in the next cell.
stack_levels = ['store_nbr', 'family']
y_ = y.stack(stack_levels)
y_['lnr'] = yfit_lnr.stack(stack_levels)['sales']

In [None]:
# Stack the baseline's in-sample and test predictions into one frame and add them as features to
# both the training and the test matrices (joined on the index).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
ylnr = pd.concat([yfit_lnr, ypred_lnr])
x = x.join(ylnr)
xtest = xtest.join(ylnr)

# The prediction frames are no longer needed once joined; release them.
del yfit_lnr
del ypred_lnr
del ylnr
_ = gc.collect()

In [None]:
SEED = 6192022
# Shared random_state for every seeded estimator below, for reproducibility of results.


class CustomRegressor():
    """Fit one independent model per target column and predict them column by column.

    Each column of ``y`` is one time series; its label is indexed as ``label[1]`` for the
    store and ``label[2]`` for the product family. The family decides which model is built,
    checked in this order (a family listed in several containers uses the first match):

    1. family in ``non_st_ts``: GradientBoostingRegressor + bagged ExtraTreesRegressor,
       averaged by a VotingRegressor. Intended for the selected non-stationary series.
    2. family in ``enriched_ts_map``: PowerTransformer + Ridge on a log1p target, with
       predictions rounded to integers. Keeps only that family's own external features.
    3. family in ``low_sales_ts``: Ridge + SVR VotingRegressor, both on a log1p target.
    4. anything else: Ridge + SVR + ARDRegression + KNeighborsRegressor VotingRegressor.

    Author's note: in the original notebook this amounts to 1658 independent models.

    NOTE(review): ``normalize=``, ``base_estimator=`` and ``n_iter=`` are spelled exactly as
    in the original notebook; they appear to be deprecated/removed in later scikit-learn
    releases -- confirm against the pinned version before upgrading.

    Parameters
    ----------
    ext_features : list of str, optional
        External (enriched) feature columns. ``None`` is treated as an empty list.
    knn_features : list of str, optional
        Columns fed to the KNN member of group 4; required whenever a series falls in it.
    non_st_ts, low_sales_ts : list of str, optional
        Family names for groups 1 and 3. ``None`` is treated as an empty list.
    enriched_ts_map : dict, optional
        Family name -> external feature columns found for it. ``None`` means empty.
    n_jobs : int, default -1
        Passed to the estimators that accept ``n_jobs``.
    """

    def __init__(self, ext_features = None, knn_features = None, non_st_ts = None, low_sales_ts = None, enriched_ts_map = None, n_jobs=-1):
        self.n_jobs = n_jobs
        # The containers below are used with `in` / iteration, which fails on None,
        # so a missing argument is normalised to an empty container.
        self.ext_features = [] if ext_features is None else ext_features
        self.knn_features = knn_features
        self.non_st_ts = [] if non_st_ts is None else non_st_ts
        self.low_sales_ts = [] if low_sales_ts is None else low_sales_ts
        self.enriched_ts_map = {} if enriched_ts_map is None else enriched_ts_map
        # Populated by fit(): one entry per column of y, in column order.
        self.estimators_ = None
        self.product_names_ = None
        self.store_names_ = None

    @staticmethod
    def _feature_pattern(product, store):
        """Return the regex that selects the feature columns for one (product, store) series.

        A column is kept when its label, as a string, matches any alternative:
        it ends with ``<product>')``; it contains ``(`` followed by a digit; it consists
        only of letters, digits, ``_``, ``.``, ``,`` and spaces; or it contains
        ``('sales', '<store>',``. Per the author, the first and last alternatives pick the
        predictions for the same product in other stores and for the same store.
        """
        import re  # local import: the notebook's import cell is outside this block

        alternatives = [
            # Names are escaped so they can never be read as regex syntax.
            re.escape(product) + r"'\)$",
            r"\(\d",
            r"^[a-zA-Z_0-9., ]+$",
            r"\('sales', '" + re.escape(store) + r"',",
        ]
        return "|".join(alternatives)

    @staticmethod
    def _select_features(X, product, store):
        """Return the columns of X that match the pattern for (product, store)."""
        return X.filter(regex=CustomRegressor._feature_pattern(product, store), axis=1)

    def _estimator_(self, X, y):
        """Build the model for the single target ``y`` (chosen by ``y.name[2]``), fit it, return it."""
        # Note: this changes the process-wide warning filter, not just this call.
        warnings.simplefilter(action='ignore', category=FutureWarning)

        family = y.name[2]

        # External features are dropped for series that were not used during the feature
        # search; per the author they are not relevant there and would hurt accuracy.
        remove_ext_features = ColumnTransformer([("drop_f", "drop", self.ext_features)],remainder="passthrough")

        if family in self.non_st_ts:
            # Group 1: tree-based ensemble.
            # Gradient boosting adds models sequentially, each fitted to the errors of the
            # previous ones; 'huber' combines least-squares and least-absolute loss.
            b1 = GradientBoostingRegressor(n_estimators = 2, max_depth=3, loss='huber', random_state=SEED)
            # Extra-trees: an averaged forest of randomized decision trees.
            r1 = ExtraTreesRegressor(n_estimators = 250, n_jobs=self.n_jobs, random_state=SEED)
            # Still group 1: the extra-trees forest is itself bagged (10 copies fitted on
            # random subsets) to reduce variance further.
            b2 = BaggingRegressor(base_estimator=r1,
                                  n_estimators=10,
                                  n_jobs=self.n_jobs,
                                  random_state=SEED)
            # VotingRegressor averages its members' predictions with equal weight.
            model = make_pipeline(
                remove_ext_features,
                VotingRegressor ([('gbr', b1), ('et', b2)]))

        elif family in self.enriched_ts_map.keys():
            # Group 2: drop every external feature except those found for this family.
            own_features = self.enriched_ts_map[family]
            ext_features_ = [col for col in self.ext_features if col not in own_features]
            remove_ext_features_ = ColumnTransformer([("drop_f", "drop", ext_features_)],remainder="passthrough")

            # Reshapes the feature distributions closer to Gaussian.
            power_tr = PowerTransformer()

            # Ridge (L2-regularised linear regression, strength alpha) on log1p(y).
            ridge = TransformedTargetRegressor(
                regressor = Ridge(fit_intercept=True, solver='auto', alpha=0.7, normalize=True, random_state=SEED),
                func=np.log1p,
                inverse_func=np.expm1
            )
            # Outer wrapper: no forward func is supplied; predictions are passed through
            # np.rint, i.e. rounded to the nearest integer.
            ridge_round_to_int = TransformedTargetRegressor(
                regressor=ridge,
                inverse_func=np.rint
            )
            model = make_pipeline(
                remove_ext_features_,
                power_tr,
                ridge_round_to_int
            )

        elif family in self.low_sales_ts:
            # Group 3: Ridge + SVR, both on log1p(y).
            # NOTE(review): unlike the other groups, no step drops the external features
            # here -- confirm whether that is intended.
            ridge = TransformedTargetRegressor(
                regressor = Ridge(fit_intercept=True, solver='auto', alpha=0.75, normalize=True, random_state=SEED),
                func=np.log1p,
                inverse_func=np.expm1
            )
            svr = TransformedTargetRegressor(
                regressor = SVR(C = 0.2, kernel = 'rbf'),
                func=np.log1p,
                inverse_func=np.expm1
            )
            model = VotingRegressor([('ridge', ridge), ('svr', svr)])

        else:
            # Group 4 (default): four-member ensemble; every member except KNN works on
            # log1p(y) with the external features dropped.
            ridge = make_pipeline(
                remove_ext_features,
                TransformedTargetRegressor(
                    regressor = Ridge(fit_intercept=True, solver='auto', alpha=0.6, normalize=True, random_state=SEED),
                    func=np.log1p,
                    inverse_func=np.expm1)
            )
            svr = make_pipeline(
                remove_ext_features,
                TransformedTargetRegressor(
                    regressor = SVR(C = 0.2, kernel = 'rbf'),
                    func=np.log1p,
                    inverse_func=np.expm1)
            )
            # KNN predicts the mean target of the 3 nearest training rows. It sees only
            # knn_features; per the author, a feature set chosen to capture week and
            # intra-week seasonality.
            knn = make_pipeline(
                ColumnTransformer([("selector", "passthrough", self.knn_features)], remainder="drop"),
                PowerTransformer(),
                KNeighborsRegressor(n_neighbors=3, n_jobs=self.n_jobs)
            )
            # Bayesian linear regression with an Automatic Relevance Determination prior.
            ard = make_pipeline(
                remove_ext_features,
                TransformedTargetRegressor(
                    regressor = ARDRegression(fit_intercept=True, normalize=True, n_iter=300),
                    func=np.log1p,
                    inverse_func=np.expm1)
            )
            estimators = [
                ('ridge', ridge),
                ('svr', svr),
                ("ard", ard),
                ("knn",knn)
            ]
            model = VotingRegressor(estimators)

        model.fit(X, y)
        return model

    def fit(self, X, y):
        """Fit one estimator per column of ``y`` and return ``self``.

        Each estimator only sees the columns of ``X`` selected for its own product and
        store (see ``_feature_pattern``).
        """
        print("Fitting Stage...")
        self.product_names_ = [str(label[2]) for label in y.columns]
        self.store_names_ = [str(label[1]) for label in y.columns]
        self.estimators_ = []
        series = zip(self.product_names_, self.store_names_)
        # `total` lets tqdm draw a real progress bar; enumerate() has no length.
        for i, (product, store) in tqdm(enumerate(series), total=y.shape[1]):
            estimator_ = self._estimator_(
                self._select_features(X, product, store),
                y.iloc[:, i],
            )
            self.estimators_.append(estimator_)
        return self

    def predict(self, X):
        """Predict every series; returns an array with one column per fitted estimator.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        print("Prediction stage...")
        if self.estimators_ is None:
            raise RuntimeError("CustomRegressor.predict() called before fit().")
        fitted = zip(self.estimators_, self.product_names_, self.store_names_)
        y_pred = [
            # Same per-series column selection as in fit().
            e.predict(self._select_features(X, n, m))
            for e, n, m in tqdm(fitted, total=len(self.estimators_))
        ]
        return np.stack(y_pred, axis=1)
# In this code block, I essentially declared the class CustomRegressor to define my own Regressors to work together with sklearn and hyperparameter optimizations.

In [None]:
%%time 

# Features for the KNN regressor, selected manually: every internal feature except the oil lags
# and the trend.
knn_excluded = {'oil_lags2', 'oil_lags1', "trend"}
knn_features = list(int_features.difference(knn_excluded))

model = CustomRegressor(
    ext_features=ext_features,
    knn_features=knn_features,
    non_st_ts=non_st_ts,
    low_sales_ts=low_sales_ts,
    enriched_ts_map=enriched_ts_map,
    n_jobs=-1,
)
model.fit(x, y)

# In-sample predictions, laid out like y.
y_pred = pd.DataFrame(model.predict(x), index=x.index, columns=y.columns)

In [None]:
from sklearn.metrics import mean_squared_log_error as msle
# MSLE works on log-scaled values, so it measures relative error: a small difference between small
# values is treated like a large difference between large values, and large absolute errors are
# not penalised out of proportion.

# Long format (one row per date x store x family), with negative values clipped to 0.
# .clip(0.) floors every value at 0, which the logarithm in MSLE requires.
y_pred = y_pred.stack(['store_nbr', 'family']).clip(0.)
y_ = y.stack(['store_nbr', 'family']).clip(0.)
y_['pred'] = y_pred.values

# RMSLE is the square root of MSLE, taken exactly once. The earlier double square root printed
# the fourth root of MSLE under the "RMSLE" label.
print(y_.groupby('family').apply(lambda r : np.sqrt(msle(r['sales'], r['pred']))))
print('RMSLE : ', np.sqrt(msle(y_['sales'], y_['pred'])))
# RMSLE is used because it penalises underestimation more than overestimation: in this industry,
# carrying some extra inventory is to some extent preferable to not being able to meet demand.

In [None]:
%%time 

# Test-set predictions, laid out like y, floored at zero and reshaped to long format.
ypred = pd.DataFrame(model.predict(xtest), index = xtest.index, columns = y.columns).clip(0.)
ypred = ypred.stack(['store_nbr', 'family'])
# Add back the series predicted as zero, then restore the index order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): assumes zero_prediction is a DataFrame shaped like ypred -- confirm where it is built.
ypred = pd.concat([ypred, zero_prediction]).sort_index()

# Fill the sample submission positionally with the sorted predictions and write it out.
sub = pd.read_csv('../input/store-sales-time-series-forecasting/sample_submission.csv')
sub['sales'] = ypred.values
sub.to_csv('submission.csv', index = False)