# Store Sales

Further reading:<br>
https://www.kaggle.com/xholisilemantshongo/modeling-sales-3-types-of-regression <br>
https://www.kaggle.com/howoojang/first-kaggle-notebook-following-ts-tutorial <br>
https://www.kaggle.com/maricinnamon/store-sales-time-series-forecast-visualization <br>

# Functions
Helper functions adapted from the Kaggle Time Series course (`learntools.time_series`).

In [None]:
def plot_periodogram(ts, detrend='linear', ax=None):
    """Draw the periodogram of ``ts`` as a step plot on a log frequency axis.

    Parameters
    ----------
    ts : array-like
        Series handed to ``scipy.signal.periodogram``.
    detrend : str, default 'linear'
        Detrending mode forwarded to ``scipy.signal.periodogram``.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    # Sampling frequency of 365.0: frequencies come out in cycles per 365
    # samples, which is what the fixed tick labels below refer to.
    samples_per_year = pd.Timedelta("365D") / pd.Timedelta("1D")
    freqs, power = periodogram(
        ts,
        fs=samples_per_year,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )

    if ax is None:
        ax = plt.subplots()[1]

    # Tick position (cycles per year) -> label shown on the x axis.
    tick_labels = {
        1: "Annual (1)",
        2: "Semiannual (2)",
        4: "Quarterly (4)",
        6: "Bimonthly (6)",
        12: "Monthly (12)",
        26: "Biweekly (26)",
        52: "Weekly (52)",
        104: "Semiweekly (104)",
    }

    ax.step(freqs, power, color="purple")
    ax.set_xscale("log")
    ax.set_xticks(list(tick_labels.keys()))
    ax.set_xticklabels(list(tick_labels.values()), rotation=90)
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [None]:
def seasonal_plot(X, y, period, freq, ax=None):
    """Plot ``X[y]`` against ``X[freq]`` with one coloured line per ``X[period]`` value.

    Parameters
    ----------
    X : DataFrame
        Must contain the columns named by ``y``, ``period`` and ``freq``.
    y : str
        Column plotted on the y axis.
    period : str
        Column whose distinct values each get their own line (hue).
    freq : str
        Column plotted on the x axis.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.

    Returns
    -------
    The axes returned by ``sns.lineplot``.
    """
    if ax is None:
        ax = plt.subplots()[1]

    period_values = X[period]
    colors = sns.color_palette("husl", n_colors=period_values.nunique())
    ax = sns.lineplot(
        x=X[freq],
        y=X[y],
        hue=period_values,
        palette=colors,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")

    # The legend is disabled, so each line is labelled at the right edge of the
    # axes (x=1 in axes coordinates) at the height of its last data point.
    label_style = dict(
        xytext=(6, 0),
        textcoords="offset points",
        size=14,
        va="center",
    )
    for curve, label in zip(ax.lines, period_values.unique()):
        ax.annotate(
            label,
            xy=(1, curve.get_ydata()[-1]),
            color=curve.get_color(),
            xycoords=ax.get_yaxis_transform(),
            **label_style,
        )
    return ax

In [None]:
def seasonality(df, key, freq, col):
    """Draw weekly and yearly seasonal plots plus a periodogram for a grouped series.

    ``df``, ``key``, ``freq`` and ``col`` are passed straight to ``grouped``,
    which is not defined in this section of the notebook. Its result must
    contain a ``'date'`` column (``%Y-%m-%d``) and a ``'mean'`` column, since
    those are the columns read below.

    Parameters
    ----------
    df, key, col
        Forwarded unchanged to ``grouped``.
    freq
        Forwarded to ``grouped`` and also set as the frequency of the
        resulting datetime index.

    Returns
    -------
    None. A 3-row figure is created as a side effect.
    """
    X = grouped(df, key, freq, col)
    X['date'] = pd.to_datetime(X['date'], format="%Y-%m-%d")
    X = X.set_index('date')
    X.index.freq = freq  # manually set the frequency of the index

    # days within a week
    X["day"] = X.index.dayofweek  # the x-axis (freq)
    # the seasonal period (period); pd.Int64Index no longer exists in
    # pandas >= 2.0, so the ISO week is taken as a plain int64 array
    # (assigned positionally, as before).
    X["week"] = X.index.isocalendar().week.to_numpy(dtype="int64")
    # days within a year
    X["dayofyear"] = X.index.dayofyear
    X["year"] = X.index.year

    fig, (ax0, ax1, ax2) = plt.subplots(3, 1, figsize=(20, 30))
    seasonal_plot(X, y='mean', period="week", freq="day", ax=ax0)
    seasonal_plot(X, y='mean', period="year", freq="dayofyear", ax=ax1)
    # The periodogram cannot handle missing values.
    plot_periodogram(X['mean'].dropna(), ax=ax2)

In [None]:
def make_lags(ts, lags, lead_time=1):
    """Build a frame of lagged copies of ``ts``.

    Parameters
    ----------
    ts : Series or DataFrame
        Data to shift.
    lags : int
        Number of lag columns (or column groups) to produce.
    lead_time : int, default 1
        Smallest shift; lags run from ``lead_time`` to ``lead_time + lags - 1``.

    Returns
    -------
    DataFrame whose top-level column keys are ``y_lag_<shift>``.
    """
    shifted = {}
    for step in range(lead_time, lags + lead_time):
        shifted[f'y_lag_{step}'] = ts.shift(step)
    return pd.concat(shifted, axis=1)

In [None]:
def make_multistep_target(ts, steps):
    """Build a frame of forward-shifted copies of ``ts`` for multi-step targets.

    Parameters
    ----------
    ts : Series or DataFrame
        Data to shift.
    steps : int
        Number of horizons; step 1 is ``ts`` itself, step ``k`` is ``ts``
        shifted back by ``k - 1`` rows.

    Returns
    -------
    DataFrame whose top-level column keys are ``y_step_<k>``.
    """
    targets = {}
    for offset in range(steps):
        targets[f'y_step_{offset + 1}'] = ts.shift(-offset)
    return pd.concat(targets, axis=1)

In [None]:
class BoostedHybrid:
    """Two-stage regressor: ``model_1`` fits the target, ``model_2`` fits its residuals.

    Both models only need ``fit(X, y)`` and ``predict(X)``. Predictions are
    the sum of the two models' outputs.
    """

    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None  # target column labels, recorded by fit()

    def fit(self, X_1, X_2, y):
        """Fit ``model_1`` on ``(X_1, y)`` and ``model_2`` on ``(X_2, residuals)``.

        Parameters
        ----------
        X_1 : DataFrame
            Features for the first model; its index labels the fitted values.
        X_2 : DataFrame
            Features for the second (residual) model.
        y : DataFrame
            Targets; its columns are reused for all prediction frames.

        After fitting, ``y_fit``, ``y_resid`` and ``feature_importances_`` are
        available as attributes. ``feature_importances_`` is ``None`` when
        ``model_2`` does not provide one.
        """
        self.model_1.fit(X_1, y)

        y_fit = pd.DataFrame(
            self.model_1.predict(X_1),
            index=X_1.index, columns=y.columns,
        )

        # The second model learns whatever the first one left unexplained.
        y_resid = y - y_fit

        self.model_2.fit(X_2, y_resid)

        self.y_columns = y.columns
        self.y_fit = y_fit
        self.y_resid = y_resid
        # Not every estimator exposes importances (e.g. k-NN or MLP
        # regressors); fall back to None instead of failing after both models
        # have already been fitted.
        self.feature_importances_ = getattr(
            self.model_2, "feature_importances_", None
        )

    def predict(self, X_1, X_2):
        """Return the summed predictions of both models in long format.

        The result of ``DataFrame.unstack()`` on a frame with a single-level
        row index is a Series indexed by the column level(s) followed by the
        row index, i.e. the wide prediction frame is turned into a long Series.
        """
        y_pred = pd.DataFrame(
            self.model_1.predict(X_1),
            index=X_1.index, columns=self.y_columns,
        )
        y_pred += self.model_2.predict(X_2)

        return y_pred.unstack()  # wide to long

# Imports & config

In [None]:
import time
from datetime import datetime

#notebook
#from learntools.time_series.utils import plot_periodogram, seasonal_plot
from learntools.time_series.style import *

#measure notebook running time
start_time = time.time()

%matplotlib inline

# backbone
import numpy as np 
from numpy.random import seed
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

# DNN
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping
from keras import metrics
import tensorflow


from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import xgboost as xgb

# Model 1 (trend)
from pyearth import Earth
from sklearn.linear_model import ElasticNet, Lasso, Ridge

# Model 2
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Render floats with two decimals in pandas output.
pd.options.display.float_format = '{:.2f}'.format
# Global seaborn theme for all plots in the notebook.
sns.set(style='white', context='notebook', palette='deep')
print("loaded ...")

# Load and check data

In [None]:
# Load the competition CSVs. `date` is parsed as a datetime everywhere.
# In train/test, `store_nbr` and `family` are read as categoricals, so store
# numbers are compared as strings further down (e.g. store == '1').
store_sales = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv', parse_dates=['date'], infer_datetime_format=True, dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'int32',
    }, usecols=['store_nbr', 'family', 'date', 'sales','onpromotion'])

# No `usecols` here, so every column of test.csv is kept (the `id` column is
# used later when the submission is assembled).
test_data = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'], infer_datetime_format=True, dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'int32',
    })
OIL = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv', parse_dates=['date'], infer_datetime_format=True, dtype = {'dcoilwtico':'float32'})
HOLIDAY = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv', parse_dates=['date'], infer_datetime_format=True, dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    })
# STORES keeps pandas' default dtypes.
STORES = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
# In TRANS, `store_nbr` is an integer (unlike the categorical in train/test).
TRANS = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv', parse_dates=['date'], infer_datetime_format=True, 
                    dtype={'store_nbr':'int32','transactions':'float32'})

Results were consistently better when only data from 2017 onward was used.<br>

In [None]:
# First date of the training window used throughout the notebook.
# The commented-out values are alternatives that were tried.
#start_date = '2016-01-01'
#start_date = '2014-01-01'
#start_date = '2017-01-01'
start_date = '2017-04-01'

Example of a single series (one store, one product family):<br>

In [None]:
# Plot the sales of one product family in one store from start_date onward.
fam = 'AUTOMOTIVE'
store = '1'  # store_nbr was read as a categorical of strings
fig, ax = plt.subplots(figsize=(20,8))
show = (store_sales.family == fam) & (store_sales.store_nbr == store) & (store_sales.date >= start_date)
# Filter first and plot by column name: handing seaborn the full frame as
# `data` together with a filtered Series as `y` forces it to index-align the
# selection against every row of store_sales.
sns.lineplot(data=store_sales[show], x="date", y="sales", ax=ax);

In [None]:
# Convert the timestamps to daily periods in both train and test.
store_sales['date'] = store_sales.date.dt.to_period('D')
test_data['date'] = test_data.date.dt.to_period('D')

In [None]:
# Index both frames by (store_nbr, family, date) and sort on that index.
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()
test_data = test_data.set_index(['store_nbr', 'family', 'date']).sort_index()

In [None]:
# Display the last rows of the indexed training frame.
store_sales.tail()

## Explore stores

In [None]:
# Display the first rows of the store metadata; the commented lines below
# are optional per-column frequency checks.
STORES.head()
#STORES.city.value_counts()
#STORES.state.value_counts()
#STORES.type.value_counts()
#STORES.cluster.value_counts()

## Explore transactions
Transactions are not available for the target dates, so they cannot be used directly.<br>
Instead they are forecast as well, and the forecast is used as a feature (see `Trans_forecast`).<br>
This slightly improved the score.<br>


In [None]:
# Reshape transactions to one row per day and one column per store.
TRANS['date'] = TRANS.date.dt.to_period('D')
TRANS = TRANS.set_index(['store_nbr', 'date']).sort_index()
TRANS = TRANS.unstack(['store_nbr'])
# Missing values in the first row become 0; every other gap is filled with
# that store's median. Results are assigned back explicitly: calling
# fillna(inplace=True) on an `iloc` row or on a column inside `apply` modifies
# a temporary object and is not guaranteed to change TRANS (it does nothing
# under pandas copy-on-write).
TRANS.iloc[0] = TRANS.iloc[0].fillna(0)
TRANS = TRANS.apply(lambda col: col.fillna(col.median()), axis=0)
TRANS = TRANS.loc[start_date:]
# Zero out January 1 only when the window really starts on that day;
# otherwise the first row holds ordinary data that must not be overwritten.
if len(TRANS) and TRANS.index[0].month == 1 and TRANS.index[0].day == 1:
    TRANS.iloc[0,:] = 0.0 #jan 1

In [None]:
# Plot the first 22 rows of the transactions column for store 1.
TRANS.loc(axis=1)['transactions',1][:22].plot();

In [None]:
# Periodogram of store 1's transactions.
plot_periodogram(TRANS.loc(axis=1)['transactions',1]);

In [None]:
# Deterministic features for the transactions model: constant, linear trend
# (order=1), seasonal indicators and Fourier terms, plus two calendar columns.
# NOTE(review): freq="W" with order=52 requests 52 sine/cosine pairs on a
# weekly base period in addition to the seasonal indicators — confirm this is
# intended rather than an annual base frequency.
fourier_trans = CalendarFourier(freq="W", order=52)
dp_trans = DeterministicProcess(
    index=TRANS.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier_trans],
    drop=True,
)
X_trans = dp_trans.in_sample()
# Day of week and week number, added as plain numeric columns.
X_trans['day'] = X_trans.index.dayofweek
X_trans['week'] = X_trans.index.week

In [None]:
%%time
# One multi-output linear fit for all store columns; no separate intercept
# because the deterministic process already includes a constant column.
transaction_model = LinearRegression(fit_intercept=False).fit(X_trans, TRANS)

In [None]:
# The same features for the 16 steps following the training window.
X_trans_test = dp_trans.out_of_sample(steps=16)
X_trans_test['day'] = X_trans_test.index.dayofweek
X_trans_test['week'] = X_trans_test.index.week

In [None]:
# Forecast transactions for the 16 out-of-sample steps, keeping TRANS's column layout.
Trans_forecast = pd.DataFrame(transaction_model.predict(X_trans_test), index = X_trans_test.index, columns = TRANS.columns)

In [None]:
# Periodogram of the forecast transactions for store 1.
plot_periodogram(Trans_forecast.loc(axis=1)['transactions',1]);

Forecasted transactions can now be used as a feature<br>

In [None]:
# Plot the forecast transactions for store 1.
Trans_forecast.loc(axis=1)['transactions',1].plot();

## Explore OIL

In [None]:
# Raw oil price over time, as loaded (before any gap filling).
sns.lineplot(data = OIL, x = "date", y= OIL.dcoilwtico);

In [None]:
# Put the oil price on a complete daily calendar (2013-01-01 .. 2017-08-31).
# Days without a quote take the previous known price; any gap left at the
# very start is back-filled from the first known price.
OIL = (
    OIL.assign(date=OIL.date.dt.to_period('D'))
    .set_index(['date'])
    .sort_index()
)
idx = pd.DataFrame({'date': pd.period_range(start='2013-01-01', end='2017-08-31')})
OIL = idx.join(OIL, on='date').ffill().bfill()

# Target

In [None]:
# Target: sales in wide format — one row per date from start_date onward,
# one column per (store_nbr, family) pair.
y = store_sales.drop(['onpromotion'], axis=1).unstack(['store_nbr', 'family']).loc[start_date:,'sales']
y.head()

# Features

## Seasonal features

In [None]:
# Deterministic features on the target's date index: constant, linear trend
# (order=1), seasonal indicators and 4 monthly Fourier pairs.
fourier = CalendarFourier(freq="M", order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X_seasonal = dp.in_sample()

## Onpromotion

In [None]:
# Promotion counts in wide format: one row per date, one column per
# ('onpromotion', store_nbr, family).
X_promo = store_sales.drop('sales', axis=1)
X_promo = X_promo.reset_index(['store_nbr','family'])
# store_nbr is converted from categorical to int32 before pivoting, so the
# pivoted columns carry integer store numbers.
X_promo['store_nbr'] = X_promo['store_nbr'].astype('int32')
X_promo = X_promo.reset_index()
X_promo = X_promo.pivot(index = 'date', columns = ['store_nbr','family'], values = ['onpromotion']).loc[start_date:]

## Weekday, Day of month

In [None]:
# Calendar features for each training date: day of week, week number and
# day of month, indexed by date.
X_days = (
    pd.DataFrame(X_seasonal.index)
    .assign(
        Weekday=lambda frame: frame.date.dt.dayofweek,
        Week=lambda frame: frame.date.dt.week,
        Day=lambda frame: frame.date.dt.day,
    )
    .set_index(['date'])
    .sort_index()
)
X_days.shape

## Holidays

In [None]:
# Day of week of each holiday entry.
HOLIDAY['Day'] = HOLIDAY.date.dt.dayofweek

In [None]:
# Display the first ten holiday rows.
HOLIDAY.head(10)

In [None]:
# Number of holiday entries per locale value.
HOLIDAY.locale.value_counts()

In [None]:
# National holidays as one-hot columns, one per holiday description.
X_holidays = (
    HOLIDAY.query("locale in ['National']")
    .loc[:,['date','description']]
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)
            
X_holidays['date'] = X_holidays.date.dt.to_period('D')
# Only holidays from 2016-01-01 onward are kept (fixed cut-off, independent
# of start_date).
X_holidays = X_holidays.set_index(['date']).sort_index().loc['2016-01-01':]
X_holidays = X_holidays.rename(columns={'description':'holiday'})
# When several national entries share a date, only the first one is kept.
X_holidays = X_holidays[~X_holidays.index.duplicated()]
X_holidays = pd.get_dummies(X_holidays)

In [None]:
# Per-store flag (0/1) for regional and local holidays: one row per day from
# start_date to 2017-08-31, one column per store number 1..54.
isHoliday = pd.DataFrame(pd.period_range(start=start_date, end = '2017-08-31'),columns=['date'])
for store in range(1,55):
    isHoliday[store] = 0

# Holiday locale -> STORES column that its locale_name is matched against.
Map = {'Regional':'state', 'Local':'city'}

# Walk the non-national holidays once instead of rescanning HOLIDAY (and
# re-converting its dates to periods) for every calendar day. Values are
# read directly from the filtered rows, so this no longer relies on
# HOLIDAY's index labels being usable as positions.
local_mask = HOLIDAY.locale != "National"
local_days = HOLIDAY.loc[local_mask, 'date'].dt.to_period('D')
local_kinds = HOLIDAY.loc[local_mask, 'locale']
local_names = HOLIDAY.loc[local_mask, 'locale_name']

for day, locale, locale_name in zip(local_days, local_kinds, local_names):
    on_day = isHoliday['date'] == day
    if not on_day.any():
        continue  # holiday falls outside the calendar built above
    nbrs = STORES.loc[STORES[Map[locale]] == locale_name, 'store_nbr'].to_list()
    if nbrs:
        isHoliday.loc[on_day, nbrs] = 1

isHoliday = isHoliday.set_index(['date']).sort_index()

In [None]:
# Split the local/regional holiday flags at the train/test boundary.
HOLY_train = isHoliday.loc[start_date:'2017-08-15']
HOLY_test = isHoliday.loc['2017-08-16':]

## Oil

In [None]:
# Rolling statistics of the oil price. They are computed on the full history
# before trimming to start_date, so the first kept rows already have complete
# windows.
oil_price = OIL.dcoilwtico
OIL = OIL.assign(**{
    '14_Mean': oil_price.rolling(14).mean(),
    '7_Mean': oil_price.rolling(7).mean(),
    '14_Median': oil_price.rolling(14).median(),
    '14_Std': oil_price.rolling(14).std(),
})
OIL = OIL.set_index(['date']).sort_index().loc[start_date:]

In [None]:
# Plot the oil price together with its rolling statistics.
OIL.plot(figsize=(20,10));

In [None]:
# Split the oil features at the train/test boundary.
OIL_train = OIL.loc[start_date:'2017-08-15']
OIL_test = OIL.loc['2017-08-16':]
OIL_train.head()

## Merge X1

In [None]:
# Base features for the first-stage model: seasonal terms, oil statistics
# and per-store transactions, joined on date.
# NOTE(review): TRANS has two-level columns while the other frames have flat
# columns — confirm the installed pandas accepts this join.
X1 = X_seasonal.join(OIL_train, on= 'date').join(TRANS, on='date')

## Merge X2

In [None]:
# Features for the second-stage model: promotions, calendar fields,
# national-holiday dummies and per-store local/regional holiday flags.
# fillna(0) runs before the HOLY_train join, so it mainly zeroes the holiday
# dummies on dates without a national holiday.
# NOTE(review): X_promo has multi-level columns while the joined frames have
# flat columns — confirm the installed pandas accepts this join.
X2 = X_promo.join(X_days, on='date').join(X_holidays, on='date').fillna(0).join(HOLY_train, on='date')

# Model

## Linear Regression

In [None]:
%%time
# Baseline: one linear fit on the seasonal features for all target columns.
# No separate intercept because X_seasonal already includes a constant column.
linear_model = LinearRegression(fit_intercept=False).fit(X_seasonal, y)
# In-sample predictions of the baseline, laid out like y.
y_pred_linear = pd.DataFrame(linear_model.predict(X_seasonal), index=X_seasonal.index, columns=y.columns)

### Lags from Linear regression

In [None]:
# Seasonal features for the 16 steps after the training window; the index is
# named 'date' so it can be joined and merged on that name later.
X_test_seasonal = dp.out_of_sample(steps=16)
X_test_seasonal.index.name = 'date'

In [None]:
# Observed sales from start_date onward, reshaped to long format with a
# single 'sales' column indexed by (date, store_nbr, family).
X_lags = store_sales.drop(['onpromotion'], axis=1).unstack(['store_nbr', 'family']).loc[start_date:,'sales']
cols = X_lags.columns
X_lags = X_lags.stack(['store_nbr', 'family'])
X_lags = X_lags.reset_index()
X_lags = X_lags.rename(columns={0:"sales"}).set_index(['date', 'store_nbr', 'family']).sort_index()

# Linear-model forecast for the test dates (negative values clipped to 0),
# in the same long layout. It stands in for the sales that are unknown over
# the test period so that lag features can be built there too.
y_forecast_linear = pd.DataFrame(linear_model.predict(X_test_seasonal).clip(0.0), index=X_test_seasonal.index, columns=cols)
y_forecast_linear = y_forecast_linear.stack(['store_nbr', 'family'])
y_forecast_linear= y_forecast_linear.reset_index()
y_forecast_linear = y_forecast_linear.rename(columns={0:"sales"}).set_index(['date', 'store_nbr', 'family']).sort_index()

# Observed rows followed by forecast rows. DataFrame.append no longer exists
# in pandas >= 2.0; pd.concat is the equivalent call.
y_complete = pd.concat([X_lags, y_forecast_linear])
y_complete = y_complete.unstack(['store_nbr', 'family'])

In [None]:
# Lags 1..3 of the combined (observed + linear forecast) sales; the rows
# left empty by shifting are filled with 0.
LagsX = make_lags(y_complete, 3).fillna(0)

In [None]:
# Split the lag features at the train/test boundary.
Train_lags = LagsX.loc[start_date:'2017-08-15']
Test_lags = LagsX.loc['2017-08-16':]

## Extend X1

In [None]:
# First-stage features extended with the training lag features.
EX1 = X1.merge(Train_lags, on='date')

## Boosted hybrid
RandomForestRegressor: ~ 11 min to run,  <br>
ExtraTreesRegressor: ~ 5 min to run,  1.5 min with 4 cores<br>

In [None]:
%%time
# Hybrid model: linear regression on EX1, extra-trees on X2 for the residuals.
BH_model = BoostedHybrid(model_1 = LinearRegression(), model_2 = ExtraTreesRegressor(random_state=13, bootstrap=True, verbose=1,n_jobs=-1)) 
BH_model.fit(EX1, X2, y)
# In-sample predictions, with negative values clipped to 0.
y_pred = BH_model.predict(EX1, X2).clip(0.0)

In [None]:
# Back to wide format (dates as rows, one column per store/family pair) for
# plotting and scoring against y.
y_pred_display = y_pred.unstack(['store_nbr','family'])
y_pred_display.head()

In [None]:
# Compare actual and fitted sales for the first six target columns.
# Despite its name, `families` holds (store_nbr, family) column keys of y.
families = y.columns[0:6]
# `plot_params` is not defined in this section; presumably it comes from the
# `learntools.time_series.style` star import — TODO confirm.
axs = y.loc(axis=1)[families].plot(
    subplots=True, sharex=True, figsize=(20, 12), **plot_params, alpha=0.5,
)
# Actual values are drawn again in colour C0, fitted values in C3, on the same axes.
_ = y.loc(axis=1)[families].plot(subplots=True, sharex=True, color='C0', ax=axs)
_ = y_pred_display.loc(axis=1)[families].plot(subplots=True, sharex=True, color='C3', ax=axs)
for ax, family in zip(axs, families):
    ax.legend([])
    ax.set_ylabel(family)

### Train RMSE

In [None]:
# In-sample RMSE (square root of the mean squared error) of the hybrid model
# and of the linear baseline, both measured on the training targets.
train_rmse = mean_squared_error(y,y_pred_display)**0.5
print('train_rmse:', train_rmse)
lin_train_rmse = mean_squared_error(y,y_pred_linear)**0.5
print('linear train_rmse:', lin_train_rmse)

# Features for test set

In [None]:
# Calendar features for the test dates, built the same way as for training:
# day of week, week number and day of month, indexed by date.
X_days_test = (
    pd.DataFrame(X_test_seasonal.index)
    .assign(
        Weekday=lambda frame: frame.date.dt.dayofweek,
        Week=lambda frame: frame.date.dt.week,
        Day=lambda frame: frame.date.dt.day,
    )
    .set_index(['date'])
    .sort_index()
)

In [None]:
# Promotion counts for the test dates in wide format, one column per
# ('onpromotion', store_nbr, family).
X_promo_test = test_data.drop('id', axis=1)
X_promo_test = X_promo_test.reset_index(['store_nbr','family'])
# Cast store_nbr to int32 exactly as the training promotion features do.
# Without the cast the pivot orders the store columns by the categorical's
# string labels, so the second-stage model would receive its features in a
# different column order at predict time than at fit time.
X_promo_test['store_nbr'] = X_promo_test['store_nbr'].astype('int32')
X_promo_test = X_promo_test.reset_index()
X_promo_test = X_promo_test.pivot(index = 'date', columns = ['store_nbr','family'], values = ['onpromotion'])

## Merge X1 test

In [None]:
# First-stage test features, mirroring the training set: seasonal terms, oil
# statistics, forecast transactions and the test-period lag features.
X1_test = X_test_seasonal.join(OIL_test, on='date').join(Trans_forecast, on="date").merge(Test_lags, on='date')

## Merge X2 test

In [None]:
# Second-stage test features, assembled in the same order as the training
# set: promotions, calendar fields, national-holiday dummies (gaps -> 0),
# then the local/regional holiday flags.
X2_test = X_promo_test.join(X_days_test, on='date').join(X_holidays, on='date').fillna(0).join(HOLY_test, on='date')

# Predict

In [None]:
# Forecast the test period; negative predictions are clipped to 0.
y_forecast = BH_model.predict(X1_test, X2_test).clip(0.0)
# Long frame with a single 'sales' column indexed by (date, store_nbr, family).
y_submit = pd.DataFrame(y_forecast).rename(columns={0:"sales"}).reset_index().set_index(['date', 'store_nbr', 'family']).sort_index()
# Attach the submission ids from the test set and keep only id and sales.
y_submit = y_submit.join(test_data.id).reindex(columns=['id', 'sales'])
y_submit = y_submit.sort_index(level=['date',"store_nbr","family"])
y_submit.head(20)

In [None]:
# For a handful of families in one store, plot the last 90 days of actual
# sales next to the hybrid and the linear forecasts.
fams = ['AUTOMOTIVE','BEAUTY','BEVERAGES','GROCERY I', "BOOKS", "BABY CARE", "CELEBRATION"]
STORE_NBR = '1'
fig, axs = plt.subplots(len(fams), figsize=(20,16))

# Reshape each frame to wide format once; doing it inside the loop repeated
# the same full-frame unstack for every family.
sales_wide = store_sales.unstack(['store_nbr', 'family'])
submit_wide = y_submit.unstack(['store_nbr', 'family'])
linear_wide = y_forecast_linear.unstack(['store_nbr', 'family'])

for ax, fam in zip(axs, fams):
    # To show the whole training window instead of the last 90 days, replace
    # `.iloc[-90:]` with `.loc[start_date:]`.
    sales_wide.loc(axis=1)['sales', STORE_NBR, fam].iloc[-90:].plot(ax=ax, label="sales")
    submit_wide.loc(axis=1)['sales', STORE_NBR, fam].plot(ax=ax, label="hybrid forecast")
    linear_wide.loc(axis=1)['sales', STORE_NBR, fam].plot(ax=ax, label="linear forecast")
    ax.set_title(f'{fam} Sales at Store {STORE_NBR}')
    ax.legend()


In [None]:
# Write the submission file. index=False leaves out the
# (date, store_nbr, family) index, so only the id and sales columns are written.
y_submit.to_csv('submission.csv', index=False)
print('Submission completed')

In [None]:
# Report the wall-clock time elapsed since `start_time` was taken in the
# imports cell, together with the current timestamp.
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Notebook run time: {elapsed_seconds:.1f} seconds. Finished at {datetime.now()}")