# Store sales, take #3
previous tries: <br>
https://www.kaggle.com/lovroselic/storesales-ls 0.505 <br>
https://www.kaggle.com/lovroselic/storesales-seasonality-only-ls-take-2 0.510 <br>


---
History:<br>
LR: 0.47621 (V1) <br>
EXT: 0.44987 (V3) <br>
RF: 0.4488 (V2) <br>
-> from april only: 0.44747 <br>
-> dropped type: 0.44706 (V5)<br>
-> added SD rolling: 0.44112 (V8)<br>
-> data from May 2017: 0.4407 (V9)<br>
---

# Imports & config

In [None]:
import time
from datetime import datetime

#measure notebook running time
start_time = time.time()

%matplotlib inline

# backbone
import os, warnings
import numpy as np 
from numpy.random import seed
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
import calendar

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge,LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.multioutput import MultiOutputRegressor

from xgboost import XGBRegressor
import xgboost as xgb

# DNN
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping
from keras import metrics
import tensorflow

# Show floats with two decimals in DataFrame output and apply the seaborn
# theme that the plots below are drawn with.
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set(context='notebook', style='white', palette='deep')
print("loaded ...")

In [None]:
# Reproducibility
def set_seed(sd=13):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and also
    TensorFlow's generator when TensorFlow has already been imported.
    Two environment variables are exported as well.

    Parameters
    ----------
    sd : int, default 13
        Seed value applied to every generator.
    """
    import random
    import sys

    random.seed(sd)
    # `numpy.random.seed` and the bare `seed` imported at the top of the
    # notebook are the same function, so one call is enough.
    np.random.seed(sd)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this export
    # only reaches processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(sd)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    # Seed TensorFlow only if it is loaded; looking it up in sys.modules keeps
    # the helper usable in environments without TensorFlow.
    tf_module = sys.modules.get('tensorflow')
    tf_random = getattr(tf_module, 'random', None)
    if hasattr(tf_random, 'set_seed'):
        tf_random.set_seed(sd)


set_seed(13)

## Data

In [None]:
# Single place to change when the competition data lives somewhere else.
DATA_DIR = '/kaggle/input/store-sales-time-series-forecasting'

# `infer_datetime_format` is no longer passed: it is deprecated in pandas 2.0
# and `parse_dates` alone handles the date column.

# Training sales: one row per (date, store_nbr, family).
store_sales = pd.read_csv(
    f'{DATA_DIR}/train.csv',
    parse_dates=['date'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'int32',
    },
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
)

# Test rows to predict; every column is kept, including `id`.
test_data = pd.read_csv(
    f'{DATA_DIR}/test.csv',
    parse_dates=['date'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'int32',
    },
)

OIL = pd.read_csv(
    f'{DATA_DIR}/oil.csv',
    parse_dates=['date'],
    dtype={'dcoilwtico': 'float32'},
)

HOLIDAY = pd.read_csv(
    f'{DATA_DIR}/holidays_events.csv',
    parse_dates=['date'],
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
)

STORES = pd.read_csv(f'{DATA_DIR}/stores.csv')

TRANS = pd.read_csv(
    f'{DATA_DIR}/transactions.csv',
    parse_dates=['date'],
    dtype={'store_nbr': 'int32', 'transactions': 'float32'},
)

print("data loaded ...")

In [None]:
# Turn the timestamps into daily periods and key both frames by
# (store_nbr, family, date), sorted so that slices by label work.
store_sales = (
    store_sales
    .assign(date=store_sales['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)
test_data = (
    test_data
    .assign(date=test_data['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

---

# Calendar

In [None]:
# One row per day from 2013-01-01 through 2017-08-31; calendar-level
# features are attached to this frame in the cells that follow.
CALENDAR = pd.DataFrame(
    data=pd.period_range(start='2013-01-01', end='2017-08-31'),
    columns=['date'],
)

### OIL

In [None]:
# Index the oil prices by daily period, then add a rolling mean (MA) and a
# rolling standard deviation (SD) over the previous 7 rows. The window counts
# rows of the oil table, not calendar days.
OIL = (
    OIL
    .assign(date=OIL['date'].dt.to_period('D'))
    .set_index('date')
    .sort_index()
    .assign(
        MA=lambda df: df['dcoilwtico'].rolling(window=7).mean(),
        SD=lambda df: df['dcoilwtico'].rolling(window=7).std(),
    )
)

In [None]:
# Attach the oil columns to every calendar day. Days without an oil row take
# the previous known value (ffill); leading gaps take the next one (bfill).
CALENDAR = (
    CALENDAR
    .join(OIL, on='date')
    .ffill()
    .bfill()
    .set_index('date')
    .sort_index()
)

In [None]:
# Raw oil price together with its rolling mean and rolling standard deviation.
CALENDAR.plot(title="Oil", xlabel="date", figsize=(20, 10), rot=90);

## Day of Week

In [None]:
# Date parts derived from the calendar index.
# NOTE(review): `Month // 4` gives uneven buckets (months 1-3 -> 0, 4-7 -> 1,
# 8-11 -> 2, 12 -> 3) — confirm this is the intended "season" encoding.
CALENDAR = CALENDAR.assign(
    Day=CALENDAR.index.dayofweek,
    MonthDay=CALENDAR.index.day,
    Month=CALENDAR.index.month,
    Season=lambda df: df['Month'] // 4,
)

In [None]:
# Number of days in each date's month, i.e. the day-of-month of its last day.
CALENDAR['EndOfMonth'] = CALENDAR.index.map(lambda x: calendar.monthrange(x.year, x.month)[1])
# Payday flag: 1 on the 15th and on the last day of the month, else 0.
# The month-end test has to use the day-of-month column (MonthDay). Comparing
# the day-of-week column (Day, 0-6) with EndOfMonth (28-31) can never match,
# which left every month-end payday unflagged.
CALENDAR['Payday'] = (
    (CALENDAR['MonthDay'] == CALENDAR['EndOfMonth']) | (CALENDAR['MonthDay'] == 15)
).astype(int)

In [None]:
#CALENDAR.tail(20)

## Holidays

In [None]:
# Keep national events only, indexed by daily period, with a single row per
# date (the first row wins when several events share a date).
HOLIDAY = (
    HOLIDAY
    .assign(date=HOLIDAY['date'].dt.to_period('D'))
    .set_index('date')
    .sort_index()
    .loc[lambda df: df['locale'] == 'National']
    .loc[lambda df: ~df.index.duplicated()]
)

## WorkDays

In [None]:
# Default working-day flag: 1 for Day 0-4, 0 for Day 5-6.
CALENDAR['WD'] = (CALENDAR['Day'] <= 4).astype('int64')

In [None]:
# Left merge keeps every calendar day; days without a national event get NaN
# in the holiday columns.
CALENDAR = pd.merge(CALENDAR, HOLIDAY, how='left', on='date')

In [None]:
# Override the weekday/weekend default with the holiday table:
#   Holiday, transferred     -> working day
#   Holiday, not transferred -> day off
#   Work Day                 -> working day
#   Bridge / Transfer        -> day off
# Any other event type leaves WD unchanged.
CALENDAR.loc[(CALENDAR['type'] == 'Holiday') & (CALENDAR['transferred'] == True), 'WD'] = 1
CALENDAR.loc[(CALENDAR['type'] == 'Holiday') & (CALENDAR['transferred'] == False), 'WD'] = 0
CALENDAR.loc[CALENDAR['type'] == 'Work Day', 'WD'] = 1
CALENDAR.loc[CALENDAR['type'].isin(['Bridge', 'Transfer']), 'WD'] = 0

In [None]:
# Columns carried forward as model features; everything else (helper date
# parts and the raw holiday columns) is discarded here.
keep_cols = ['dcoilwtico', 'MA', 'Day', 'WD', 'SD', 'Payday', 'Month', 'Season']
CALENDAR = CALENDAR.loc[:, keep_cols]

# Date selection

In [None]:
# Training window, both ends inclusive. Other start dates that were tried:
#   '2016-01-01', '2017-05-01'
start_date = "2017-04-15"
end_date = "2017-08-15"

# Target

In [None]:
# Wide target matrix: one row per date from start_date onwards, one column per
# (store_nbr, family) pair.
y = (
    store_sales
    .drop(columns=['onpromotion'])
    .unstack(level=['store_nbr', 'family'])
    .loc[start_date:, 'sales']
)
y.head()

## Seasonal

In [None]:
# Deterministic time features over the training dates: a first-order trend
# plus order-4 Fourier terms at weekly frequency. No constant and no seasonal
# dummies are added.
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(
    index=y.index,
    order=1,
    constant=False,
    seasonal=False,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()

## Onpromotion

In [None]:
# Mean `onpromotion` per (date, store_nbr), averaged over families, for the
# train and test periods together; stores end up as columns.
# `pd.concat` replaces `DataFrame.append`, which is deprecated and was removed
# in pandas 2.0.
PROMO = (
    pd.concat([
        store_sales.drop('sales', axis=1),
        test_data.drop('id', axis=1),
    ])
    .reset_index()
    .astype({'store_nbr': 'int32'})
    .groupby(['date', 'store_nbr'])['onpromotion']
    .mean()
    .unstack('store_nbr')
)
PROMO.tail()

## Other features

In [None]:
# Calendar features for the training window. They are assigned positionally
# (`.values`), so the CALENDAR slice must cover exactly the same dates as X.
X['oil'] = CALENDAR.loc[start_date:end_date, 'MA'].values
X['oil_sd'] = CALENDAR.loc[start_date:end_date, 'SD'].values
X['Day'] = CALENDAR.loc[start_date:end_date, 'Day'].values
X['WD'] = CALENDAR.loc[start_date:end_date, 'WD'].values
# 1 on January 1st, 0 otherwise.
X['NewYear'] = (X.index.dayofyear == 1).astype(int)
X['Payday'] = CALENDAR.loc[start_date:end_date, 'Payday'].values
X['Season'] = CALENDAR.loc[start_date:end_date, 'Season'].values
# One-hot encode the day of week.
X = pd.get_dummies(X, columns=['Day'])
#X = X.join(PROMO.loc[start_date:end_date])
X.shape

In [None]:
# Preview the first rows of the training feature matrix.
X.head(10)

---

# Models

## Linear regression

In [None]:
%%time
# Multi-output least squares without an intercept; in-sample predictions are
# floored at zero.
linear_model = LinearRegression(fit_intercept=False)
linear_model.fit(X, y)
y_fit_linear = pd.DataFrame(
    data=np.clip(linear_model.predict(X), 0.0, None),
    index=X.index,
    columns=y.columns,
)
#y_fit_linear.head()

## ExtraTrees

In [None]:
%%time
# Extra-trees ensemble fitted on every target column at once (bootstrap
# sampling enabled); in-sample predictions are floored at zero.
EXT_model = ExtraTreesRegressor(n_jobs=-1, bootstrap=True, random_state=13)
EXT_model.fit(X, y)
y_fit_ext = pd.DataFrame(
    data=np.clip(EXT_model.predict(X), 0.0, None),
    index=X.index,
    columns=y.columns,
)
#y_fit_ext.head()

## Random Forest

In [None]:
%%time
# Random forest with default hyper-parameters, fitted on every target column
# at once; in-sample predictions are floored at zero.
RF_model = RandomForestRegressor(n_jobs=-1, random_state=13)
RF_model.fit(X, y)
y_fit_rf = pd.DataFrame(
    data=np.clip(RF_model.predict(X), 0.0, None),
    index=X.index,
    columns=y.columns,
)
y_fit_rf.head()
#RF_model.get_params()

## Ridge
not used for prediction

In [None]:
%%time
# Ridge regression without an intercept; in-sample predictions are floored at
# zero. `normalize=True` was dropped: scikit-learn ignored it whenever
# fit_intercept=False, and the parameter was removed in scikit-learn 1.2, so
# passing it now raises a TypeError.
Ridge_model = Ridge(fit_intercept=False, alpha=0.5).fit(X, y)
y_fit_ridge = pd.DataFrame(Ridge_model.predict(X).clip(0.0), index=X.index, columns=y.columns)
#y_fit_ridge.head()

## XGB

In [None]:
%%time
#xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective = 'reg:squarederror', random_state = 13, n_jobs=-1, learning_rate=0.01, reg_alpha=0.1, reg_lambda=10, subsample= 0.95)).fit(X, y)

# 0.04131683657406662
# xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective = 'reg:squarederror', random_state = 13, n_jobs=-1,
#                                                   learning_rate=0.3, reg_alpha=0, reg_lambda=1, subsample= 1)).fit(X, y)

# xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective = 'reg:squarederror', random_state = 13, n_jobs=-1,predictor= 'gpu_predictor',
#                                                   learning_rate=0.045, reg_alpha=0.1, reg_lambda=1.1, subsample= 1)).fit(X, y)

# One gradient-boosted regressor per target column.
# NOTE(review): `predictor` looks deprecated in recent XGBoost releases —
# confirm against the installed version.
xgb_model = MultiOutputRegressor(
    estimator=XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.045,
        reg_alpha=0.5,
        reg_lambda=1,
        subsample=1,
        predictor='gpu_predictor',
        n_jobs=-1,
        random_state=13,
    )
)
xgb_model.fit(X, y)

In [None]:
# In-sample XGB predictions, floored at zero.
y_fit_xgb = pd.DataFrame(
    data=np.clip(xgb_model.predict(X), 0.0, None),
    index=X.index,
    columns=y.columns,
)
y_fit_xgb

# Fit, visual

In [None]:
%%time
# One panel per product family for a single store: actual sales against the
# in-sample fit of each tree-based model.
fams = ['AUTOMOTIVE','BEAUTY','BEVERAGES','GROCERY I', "BOOKS", "BABY CARE", "CELEBRATION"]
STORE_NBR = '1'
fig, axs = plt.subplots(nrows=len(fams), figsize=(30, 50))
for ax, fam in zip(axs, fams):
    y.loc(axis=1)[STORE_NBR, fam].loc[start_date:].plot(ax=ax, label="sales")
    #y_fit_linear.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="LR fit")
    y_fit_ext.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="EXT fit")
    y_fit_rf.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="RF fit")
    y_fit_xgb.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="XGB fit")
    #y_fit_ridge.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="Ridge fit")

    ax.set_title(f'{fam} Sales at Store {STORE_NBR}')
    ax.legend()


---

## Fit RMSLE

In [None]:
# LR_rmse = mean_squared_error(y,y_fit_linear)**0.5
# EXT_rmse = mean_squared_error(y,y_fit_ext)**0.5
# RF_rmse = mean_squared_error(y,y_fit_rf)**0.5
# Ridge_rmse = mean_squared_error(y,y_fit_ridge)**0.5
# XGB_rmse = mean_squared_error(y,y_fit_xgb)**0.5

In [None]:
# Root of the mean squared *log* error (RMSLE) of each model on the training
# data. The variables keep their `_rmse` names because later cells use them.
LR_rmse = mean_squared_log_error(y_true=y, y_pred=y_fit_linear) ** 0.5
EXT_rmse = mean_squared_log_error(y_true=y, y_pred=y_fit_ext) ** 0.5
RF_rmse = mean_squared_log_error(y_true=y, y_pred=y_fit_rf) ** 0.5
Ridge_rmse = mean_squared_log_error(y_true=y, y_pred=y_fit_ridge) ** 0.5
XGB_rmse = mean_squared_log_error(y_true=y, y_pred=y_fit_xgb) ** 0.5

In [None]:
# In-sample error per model, grouped as linear / tree ensembles / boosting.
print(
    "----------------------------------------",
    "LR: " + str(LR_rmse),
    "Ridge: " + str(Ridge_rmse),
    "----------------------------------------",
    "EXT: " + str(EXT_rmse),
    "RF: " + str(RF_rmse),
    "----------------------------------------",
    "XGB: " + str(XGB_rmse),
    sep="\n",
)

---

# Features for test set

In [None]:
# Forecast window, both ends inclusive (16 days).
start_test = "2017-08-16"
end_test = "2017-08-31"

In [None]:
# Extend the deterministic terms 16 steps past the training index; 16 is the
# number of days from start_test to end_test inclusive.
X_test = dp.out_of_sample(steps=16)

In [None]:
# Calendar features for the forecast window, built the same way as for X.
# They are assigned positionally (`.values`), so the CALENDAR slice must cover
# exactly the same dates as X_test.
X_test['oil'] = CALENDAR.loc[start_test:end_test, 'MA'].values
X_test['oil_sd'] = CALENDAR.loc[start_test:end_test, 'SD'].values
X_test['Day'] = CALENDAR.loc[start_test:end_test, 'Day'].values
X_test['WD'] = CALENDAR.loc[start_test:end_test, 'WD'].values
# 1 on January 1st, 0 otherwise.
X_test['NewYear'] = (X_test.index.dayofyear == 1).astype(int)
X_test['Payday'] = CALENDAR.loc[start_test:end_test, 'Payday'].values
X_test['Season'] = CALENDAR.loc[start_test:end_test, 'Season'].values
X_test = pd.get_dummies(X_test, columns=['Day'])
# Force the same columns, in the same order, as the training matrix. One-hot
# encoding only creates columns for weekdays present in the window, so a
# shorter window would otherwise yield a feature mismatch at predict time.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
#X_test = X_test.join(PROMO.loc[start_test:end_test])
X_test.head()

---

# Predictions


## LR

In [None]:
# Linear-regression forecast for the test window, floored at zero.
LR_forecast = pd.DataFrame(
    data=np.clip(linear_model.predict(X_test), 0.0, None),
    index=X_test.index,
    columns=y.columns,
)
#LR_forecast.head()

## EXT

In [None]:
# Extra-trees forecast for the test window, floored at zero.
EXT_forecast = pd.DataFrame(
    data=np.clip(EXT_model.predict(X_test), 0.0, None),
    index=X_test.index,
    columns=y.columns,
)
#EXT_forecast.head()

## RF

In [None]:
# Random-forest forecast for the test window, floored at zero.
RF_forecast = pd.DataFrame(
    data=np.clip(RF_model.predict(X_test), 0.0, None),
    index=X_test.index,
    columns=y.columns,
)
RF_forecast.head()

## XGB

In [None]:
# XGBoost forecast for the test window, floored at zero.
XGB_forecast = pd.DataFrame(
    data=np.clip(xgb_model.predict(X_test), 0.0, None),
    index=X_test.index,
    columns=y.columns,
)
XGB_forecast.head()

In [None]:
%%time
# One panel per product family for a single store: the last 90 training days
# of actual sales followed by each model's forecast.
fams = ['AUTOMOTIVE','BEAUTY','BEVERAGES','GROCERY I', "BOOKS", "BABY CARE", "CELEBRATION"]
STORE_NBR = '1'
fig, axs = plt.subplots(nrows=len(fams), figsize=(25, 30))
for ax, fam in zip(axs, fams):
    y.loc(axis=1)[STORE_NBR, fam].loc[start_date:].iloc[-90:].plot(ax=ax, label="sales")
    #LR_forecast.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="LR forecast")
    EXT_forecast.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="EXT_forecast")
    RF_forecast.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="RF_forecast")
    XGB_forecast.loc(axis=1)[STORE_NBR, fam].plot(ax=ax, label="XGB_forecast")
    ax.set_title(f'{fam} Sales at Store {STORE_NBR}')
    ax.legend()

---

# Submit

In [None]:
def forecast_to_submit(forecast):
    """Reshape a wide forecast into the long `id` / `sales` submission layout.

    Parameters
    ----------
    forecast : pandas.DataFrame
        One row per date, with column levels named `store_nbr` and `family`.

    Returns
    -------
    pandas.DataFrame
        Indexed by (date, store_nbr, family) and sorted, with the columns
        `id` (looked up in the global `test_data`) and `sales`.
    """
    long_format = forecast.stack(['store_nbr', 'family']).reset_index()
    # An unnamed date index comes back as 'level_0' and the stacked values as
    # column 0; give both their proper names.
    long_format = long_format.rename(columns={'level_0': 'date', 0: "sales"})
    keyed = long_format.set_index(['date', 'store_nbr', 'family']).sort_index()
    with_ids = keyed.join(test_data.id)
    return with_ids.reindex(columns=['id', 'sales'])

In [None]:
#y_submit = LR_forecast
# Long-format submission frame for each tree-based model.
y_submit_ext, y_submit_rf, y_submit_xgb = map(
    forecast_to_submit,
    (EXT_forecast, RF_forecast, XGB_forecast),
)

#y_submit_xgb.head(20)

In [None]:
# Inspect the XGB submission frame (id and sales per date/store/family).
y_submit_xgb

## Voting

In [None]:
# Gather the per-model predictions side by side, keyed by submission id.
# Each column is aligned on the `id` carried by its submission frame. This
# replaces positional assignment against `sorted(test_data.id)`, which was
# only correct if the frames' row order happened to match ascending id.
VOTING = pd.DataFrame({
    'XGB': y_submit_xgb.set_index('id')['sales'],
    'RF': y_submit_rf.set_index('id')['sales'],
    'EXT': y_submit_ext.set_index('id')['sales'],
}).sort_index()
# Models that take part in the average; add 'XGB' to include it as well.
use_voting = ['RF', 'EXT']
VOTING['sales'] = VOTING[use_voting].mean(axis=1)
VOTING.head(10)

In [None]:
# Keep only the averaged prediction, as a one-column DataFrame.
y_submit_vote = VOTING[['sales']]

In [None]:
# Final submission frame: the voting average with the index exposed as an
# `id` column. The commented lines switch to a single model's output instead.
#y_submit = y_submit_rf
y_submit_vote.index.name = 'id'
y_submit = y_submit_vote.reset_index(drop=False)
#y_submit = y_submit_xgb
y_submit.head()

In [None]:
# Write the submission file to the working directory, without the row index.
y_submit.to_csv(path_or_buf='submission.csv', index=False)
print('Submission completed')

In [None]:
# Report the wall-clock time elapsed since the first cell set `start_time`.
end_time = time.time()
print(f"Notebook run time: {end_time - start_time:.1f} seconds. Finished at {datetime.now()}")