In [None]:
# Setup notebook
from pathlib import Path
from learntools.time_series.style import *  # plot style settings

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.deterministic import CalendarFourier,DeterministicProcess
from xgboost import XGBRegressor

from pathlib import Path
from warnings import simplefilter

import numpy as np
import pandas as pd

from learntools.time_series.ex3 import *
from learntools.time_series.utils import plot_periodogram, seasonal_plot

import seaborn as sns

simplefilter("ignore")  # ignore warnings to clean up output cells

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'


# Input locations: competition files and the course's auxiliary data.
comp_dir = Path('../input/store-sales-time-series-forecasting')
data_dir = Path("../input/ts-course-data")

# Read the training table, convert the timestamps to daily periods and index
# the rows by (store_nbr, family, date), sorted on that index.
store_sales = (
    pd.read_csv(
        comp_dir / 'train.csv',
        usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
        dtype={
            'store_nbr': 'category',
            'family': 'category',
            'sales': 'float32',
        },
        parse_dates=['date'],
        infer_datetime_format=True,
    )
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

# Mean of every column per (family, date), pivoted so each family becomes a
# column group.
family_sales = store_sales.groupby(['family', 'date']).mean().unstack('family')

# Mean of `sales` per date over all rows, restricted to 2017.
average_sales = store_sales['sales'].groupby('date').mean().squeeze().loc['2017']

In [None]:
# Plot a moving average to see what kind of trend the series has.
# The observations are daily, so a 365-day window smooths over short-term
# changes within the year. `center=True` places each average at the middle of
# its window, and `min_periods=183` (about half the window) lets the mean be
# computed where the full window is not available.
moving_average = average_sales.rolling(window=365, center=True, min_periods=183).mean()

ax = average_sales.plot(style=".", color="0.5")
moving_average.plot(
    ax=ax,
    linewidth=3,
    title="Store Sales - 365-Day Moving Average",
    legend=False,
);


In [None]:
# Examine seasonal plots

# Build the plotting frame in one step:
#   day / week       -> days within a week (x-axis `day`, seasonal period `week`)
#   dayofyear / year -> days within a year (x-axis `dayofyear`, period `year`)
X = average_sales.to_frame().assign(
    day=lambda frame: frame.index.dayofweek,
    week=lambda frame: frame.index.week,
    dayofyear=lambda frame: frame.index.dayofyear,
    year=lambda frame: frame.index.year,
)

# One panel per seasonal period: weekly on top, annual below.
fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, figsize=(11, 6))
seasonal_plot(X, y="sales", period="week", freq="day", ax=ax0)
seasonal_plot(X, y="sales", period="year", freq="dayofyear", ax=ax1);

In [None]:
# Examine the periodogram.
# Only `average_sales` is passed to `plot_periodogram`, so nothing is written
# to `X` here: adding "week"/"day" columns would be unused and would silently
# alter whatever `X` is currently bound to if the cell is re-run later.
plot_periodogram(average_sales);

# The periodogram agrees with the seasonal plots above: a strong weekly season.
# The weekly season will be modelled with indicators.

In [None]:
# Create seasonal features and fit a linear model on them

y = average_sales.copy()

dp = DeterministicProcess(
    index=y.index,
    constant=True,               # dummy feature for bias (y-intercept)
    order=1,                     # trend (order 1 means linear)
    seasonal=True,               # weekly seasonality (indicators)
    drop=True,                   # drop terms to avoid collinearity
)

# `in_sample` creates features for the dates given in the `index` argument
X = dp.in_sample()

# The intercept is the same as the `const` feature from
# DeterministicProcess. LinearRegression behaves badly with duplicated
# features, so we need to be sure to exclude it here.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)

# Fitted values on the training dates; these are subtracted from `y` in a
# later cell to obtain the deseasonalised series.
y_pred = pd.Series(model.predict(X), index=X.index)

# The title and label describe this dataset and this model (trend plus
# seasonal indicators), not the tunnel-traffic example they were copied from.
ax = average_sales.plot(
    style=".",
    color="0.5",
    title="Average Sales - Linear Trend and Weekly Seasonality",
)
_ = y_pred.plot(ax=ax, linewidth=3, label="Trend + weekly seasonality")

In [None]:
# Compare the periodogram of the original series with the deseasonalised one.
# The deseasonalised series is the target minus the fitted values.
y_deseason = y.sub(y_pred)

# Two stacked panels sharing both axes so the spectra are directly comparable.
fig, (ax1, ax2) = plt.subplots(
    nrows=2, ncols=1, sharex=True, sharey=True, figsize=(10, 7)
)
ax1 = plot_periodogram(y, ax=ax1)
ax2 = plot_periodogram(y_deseason, ax=ax2)
ax1.set_title("Product Sales Frequency Components")
ax2.set_title("Deseasonalized");

In [None]:
# Create a centred 7-day moving average of the target
y_ma = y.rolling(window=7, center=True).mean()

# Top panel: deseasonalised series. Bottom panel: 7-day moving average.
fig, (ax3, ax4) = plt.subplots(2, 1, figsize=(10, 7))
ax3 = y_deseason.plot(ax=ax3)
ax3.set_title("Deseasonalised");
ax4 = y_ma.plot(ax=ax4)
# Title the bottom panel; setting it on `ax3` would overwrite the top title
# and leave this panel unlabelled.
ax4.set_title("7-day moving average");

In [None]:
# Examine serial dependence in the deseasonalised series

from learntools.time_series.utils import plot_lags, make_lags, make_leads
from statsmodels.graphics.tsaplots import plot_pacf

# Partial autocorrelation and lag scatter plots, both for the first 8 lags.
plot_pacf(x=y_deseason, lags=8);
plot_lags(x=y_deseason, lags=8, nrows=2);

# The correlogram and the lag plots indicate that none of the lags are significant.

In [None]:
# The competition dataset includes a series that could be useful as a leading
# indicator: `onpromotion`, the number of items on a special promotion that day.
# Because the company itself decides when to run a promotion, there is no worry
# about "lookahead leakage"; Tuesday's onpromotion value could be used to
# forecast Monday's sales, for instance.
onpromotion = store_sales['onpromotion'].groupby('date').mean().squeeze().loc['2017']

# Drop the New Year outlier (first observation) from both series before plotting.
plot_lags(
    x=onpromotion.iloc[1:],
    y=y_deseason.iloc[1:],
    lags=3,
    leads=3,
    nrows=1,
);

# The lag plot indicates that onpromotion may not be correlated with average
# sales, which suggests it may not be useful as a feature.

In [None]:
# The competition dataset also includes a series that could be useful as a
# leading indicator: the oil series, which holds daily oil prices.
oil_full = pd.read_csv(
    comp_dir / 'oil.csv',
    usecols=['dcoilwtico','date'],
    dtype={
        'dcoilwtico': 'float32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)

# Full daily calendar from 1 Jan 2013 to 31 Aug 2017. ISO (YYYY-MM-DD) strings
# are unambiguous, whereas '01-01-2013' / '31-08-2017' rely on the parser
# guessing day-first vs month-first for each string.
idx = pd.date_range('2013-01-01', '2017-08-31')

oil_full = oil_full.set_index(['date']).sort_index()

# Put the prices on the full calendar; dates absent from the file take the
# most recent earlier row's value (forward fill on the new index labels).
oil_full = oil_full.reindex(idx, method='ffill')

In [None]:
from sklearn.impute import SimpleImputer

# Fill the values still missing in the oil series with the column mean, then
# rebuild a period-indexed frame and cut out the window that matches the
# deseasonalised training series.
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# `fit_transform` returns a bare array, so the column labels and the index
# are copied back from `oil_full` afterwards.
oil_full_imputed = pd.DataFrame(numerical_transformer.fit_transform(oil_full))
oil_full_imputed.columns = oil_full.columns
oil_full_imputed.index=oil_full.index
oil_full_imputed.index.name='date'
# First reset: moves the 'date' index into a regular column.
oil_full_imputed.reset_index(inplace=True)
# Bare expression in the middle of the cell: evaluated but not displayed.
oil_full_imputed.dtypes
# NOTE(review): second reset adds an extra 'index' column holding the row
# numbers. Later cells explicitly drop 'index', so removing this line would
# require updating those cells as well.
oil_full_imputed.reset_index(inplace=True)
# Convert timestamps to daily periods and index by them.
oil_full_imputed['date'] = oil_full_imputed.date.dt.to_period('D')
oil_full_imputed = oil_full_imputed.set_index(['date']).sort_index()

# Boundaries of the training window, taken from the deseasonalised target.
end_train = y_deseason.index.max()
start_train = y_deseason.index.min()

# Oil rows between the first and last training dates (label-based slice).
oil_train=oil_full_imputed[start_train:end_train]

In [None]:
# Turn y_deseason into a one-column DataFrame. The column labels are borrowed
# from the per-date mean of `store_sales` with 'onpromotion' removed.
average_sales1 = store_sales.groupby('date').mean().drop(columns='onpromotion')
y_deseason_df = y_deseason.to_frame()
y_deseason_df.columns = average_sales1.columns

In [None]:
# Left-merge the oil prices onto the deseasonalised sales by date, keeping
# exactly the dates present in y_deseason (25th December sales are not
# captured there). The helper 'index' column is discarded first.
oil_train_df = oil_train.drop(columns='index')

y_deseason_oil_df = y_deseason_df.merge(
    oil_train_df,
    how='left',
    left_index=True,
    right_index=True,
)

# Drop the New Year outlier (first observation) from both series before plotting.
plot_lags(
    x=y_deseason_oil_df['dcoilwtico'].iloc[1:],
    y=y_deseason_oil_df['sales'].iloc[1:],
    lags=3,
    leads=3,
    nrows=1,
);

# The lag plot indicates that oil prices may not be correlated with average
# sales, which suggests oil prices may not be useful as a feature.

In [None]:
# Load the holiday calendar, indexed by daily period.
holidays_events = pd.read_csv(
    comp_dir / "holidays_events.csv",
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        # This key must be the plain column name; a dtype key that matches no
        # column leaves `description` without the intended category dtype.
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
holidays_events = holidays_events.set_index('date').to_period('D')

# Keep national and regional holidays that were not transferred; only the
# `transferred` column is retained after the drop.
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional'] & transferred== False")
).drop(['locale','locale_name','type','description'],axis=1)

# Create holiday features for the training window. `rename` returns a new
# frame, so the column assignment below does not write into a slice of
# `holidays`.
holidays_train = holidays[start_train:end_train].rename(columns={'transferred': 'holiday'})
# Every remaining row has transferred == False; relabel it as 'Yes' so that
# one-hot encoding later yields a `holiday_Yes` column.
holidays_train['holiday'] = holidays_train['holiday'].replace(False, 'Yes', regex=True)
# There are duplicated dates in the holiday data; keep the first row per date.
holidays_train = holidays_train[~holidays_train.index.duplicated(keep='first')]
holidays_train

In [None]:
# Define the target: 2017 sales in wide form, one column per
# (store_nbr, family) pair and one row per date.
y_target = (
    store_sales
    .drop(columns='onpromotion')
    .unstack(['store_nbr', 'family'])
    .loc['2017']
)
y_target

In [None]:
# X_1: Features for Linear Regression
# Regenerated from the DeterministicProcess `dp` for the dates it was built
# on, rather than reusing the earlier `X` variable.
X_1 = dp.in_sample()

In [None]:
# X_2: Features for XGBoost, in long form indexed by (date, family, store_nbr)
X_2 = (
    store_sales
    .drop(columns='sales')  # keeps the onpromotion feature
    .reorder_levels(['date', 'family', 'store_nbr'], axis=0)
    .groupby(level=['date', 'family', 'store_nbr']).sum()
    .loc['2017']
)

# One-hot encode the holiday flag (produces a `holiday_Yes` column)
X_holidays = pd.get_dummies(holidays_train)

# Merge X_2 with the holiday feature. Dates without a holiday come back as
# NaN from the left join; mark them 0 and force a numeric dtype so the column
# cannot stay object/bool-mixed after the join.
X_2 = X_2.join(X_holidays.holiday_Yes, how='left')
X_2['holiday_Yes'] = X_2['holiday_Yes'].fillna(0.0).astype('float64')

# Merge X_2 with the oil feature and forward-fill any gaps in it.
# `fillna('ffill')` would insert the literal string 'ffill' instead of
# carrying the previous price forward, so use `ffill()`.
X_2 = X_2.join(oil_train_df.dcoilwtico, how='left')
X_2['dcoilwtico'] = X_2['dcoilwtico'].ffill()
X_2

In [None]:
# Fit the linear model on the deterministic features. The intercept is
# switched off on the estimator itself.
model_linear = LinearRegression(fit_intercept=False)
model_linear.fit(X_1, y_target)
y_pred_linear = model_linear.predict(X_1)

# Wrap the predictions with the target's labels, then compute the residuals
# and stack both column levels into the row index (long form).
y_pred_lineardf = pd.DataFrame(
    data=y_pred_linear,
    index=y_target.index,
    columns=y_target.columns,
)
y_pred_resid = y_target.sub(y_pred_lineardf).stack().stack()
y_pred_resid

In [None]:
# Fit an XGBoost regressor on the residuals of the linear model
model_XGB = XGBRegressor()
model_XGB.fit(X_2, y_pred_resid)

# In-sample predictions, labelled like the inputs they were computed from.
y_pred_XGB = pd.DataFrame(
    data=model_XGB.predict(X_2),
    index=X_2.index,
    columns=y_pred_resid.columns,
)
y_pred_XGB

In [None]:
# Combine both models: linear predictions (reshaped to long form) plus the
# XGBoost residual predictions, with negative values clipped to zero.
y_pred_lineardf1 = y_pred_lineardf.stack().stack()
y_pred = y_pred_lineardf1.add(y_pred_XGB).clip(lower=0.0)
y_pred

In [None]:
# Generate submission: load the test set and build its feature inputs

# Test rows, indexed like the training data by (store_nbr, family, date).
df_test = (
    pd.read_csv(
        comp_dir / 'test.csv',
        dtype={
            'store_nbr': 'category',
            'family': 'category',
            'onpromotion': 'uint32',
        },
        parse_dates=['date'],
        infer_datetime_format=True,
    )
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

# Deterministic features for the 16 steps following the training index
X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'

# Oil features for the test window (helper 'index' column discarded)
start_test = X_test.index.min()
end_test = X_test.index.max()
oil_test = oil_full_imputed[start_test:end_test]
oil_test_df = oil_test.drop(columns='index')

# Holiday flags for the test window, prepared the same way as for training
holidays_test = holidays[start_test:end_test]
holidays_test = holidays_test.rename(columns={'transferred': 'holiday'})
holidays_test['holiday'] = holidays_test['holiday'].replace(False, 'Yes', regex=True)
# Dates can be duplicated in the holiday data; keep the first row per date
holidays_test = holidays_test[~holidays_test.index.duplicated(keep='first')]

# One-hot encode the holiday flag
X_holidays_test = pd.get_dummies(holidays_test)

In [None]:
# X_test2: Features for XGBoost on the test set, in long form indexed by
# (date, family, store_nbr)
X_test2 = (
    df_test
    .drop(columns='id')  # keeps the onpromotion feature
    .reorder_levels(['date', 'family', 'store_nbr'], axis=0)
    .groupby(level=['date', 'family', 'store_nbr']).sum()
)

# Pick the holiday indicator under whichever name the encoding produced:
# `holiday_Yes` when the window contains holidays, the untouched `holiday`
# column when it contains none, or an empty series if neither exists.
if 'holiday_Yes' in X_holidays_test.columns:
    holiday_flag_test = X_holidays_test['holiday_Yes']
elif 'holiday' in X_holidays_test.columns:
    holiday_flag_test = X_holidays_test['holiday']
else:
    holiday_flag_test = pd.Series(index=X_holidays_test.index, dtype='float64')

# Left-join the indicator under the training column name; dates without a
# holiday become 0.0 and the column is forced to a numeric dtype.
X_test2 = X_test2.join(holiday_flag_test.rename('holiday_Yes'), how='left')
X_test2['holiday_Yes'] = X_test2['holiday_Yes'].fillna(0.0).astype('float64')

# Merge with the oil feature and forward-fill any gaps in it.
# `fillna('ffill')` would insert the literal string 'ffill' instead of
# carrying the previous price forward, so use `ffill()`.
X_test2 = X_test2.join(oil_test_df.dcoilwtico, how='left')
X_test2['dcoilwtico'] = X_test2['dcoilwtico'].ffill()

# Predict with LinearRegression. The wide test frame supplies the row and
# column labels; its top column level is renamed from 'onpromotion' to
# 'sales' before stacking back to long form.
y_pred_linear_test = model_linear.predict(X_test)
df_test1 = df_test.drop(columns='id').unstack(['store_nbr', 'family'])
y_pred_linear_testdf = pd.DataFrame(y_pred_linear_test, index=df_test1.index, columns=df_test1.columns)
y_pred_linear_testdf = y_pred_linear_testdf.rename(columns={'onpromotion': 'sales'})
y_pred_linear_testdf = y_pred_linear_testdf.stack().stack()
y_pred_linear_testdf

In [None]:
# Predict the residual component on the test features with the XGBoost regressor
y_pred_XGB_test = pd.DataFrame(
    data=model_XGB.predict(X_test2),
    index=X_test2.index,
    columns=y_pred_linear_testdf.columns,
)

# Sum the linear and XGBoost predictions, clip negatives to zero, then put the
# index back into (store_nbr, family, date) order and aggregate per key.
y_pred_test = (
    y_pred_linear_testdf.add(y_pred_XGB_test)
    .clip(lower=0.0)
    .reorder_levels(['store_nbr', 'family', 'date'], axis=0)
    .groupby(level=['store_nbr', 'family', 'date']).sum()
)
y_pred_test

In [None]:
# Attach the test-row ids to the predictions, keep only the two submission
# columns in order, and write them out without the index.
y_submit = y_pred_test.join(df_test['id']).reindex(columns=['id', 'sales'])
y_submit.to_csv('submission.csv', index=False)