In [None]:
pip install pmdarima

# Import Libraries and data

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.linear_model import Ridge
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier
%matplotlib inline

# Load specific forecasting tools
from statsmodels.tsa.statespace.sarimax import SARIMAX

from statsmodels.graphics.tsaplots import plot_acf,plot_pacf # for determining (p,q) orders
from statsmodels.tsa.seasonal import seasonal_decompose      # for ETS Plots
from pmdarima import auto_arima                              # for determining ARIMA orders

# Ignore harmless warnings
# NOTE(review): no category is given, so every warning is silenced, deprecation notices included.
import warnings
warnings.filterwarnings("ignore")

# Load dataset
# Folder holding the competition CSV files; the read_csv calls below prefix it to each file name.
path = '../input/store-sales-time-series-forecasting/'


# **Data Exploration**

## Oil Price Data

In [None]:
# Daily oil price, indexed by date.
# Column interpretation: Crude Oil Prices: West Texas Intermediate (WTI) - Cushing, Oklahoma (DCOILWTICO)
path = '../input/store-sales-time-series-forecasting/'
df_oil = pd.read_csv(
    path + 'oil.csv',
    index_col='date',
    parse_dates=['date'],
    infer_datetime_format=True,
)
px.line(df_oil, title='OIL PRICE')

Notice: the oil price series has missing dates.

In [None]:
# Build a continuous daily calendar and attach a smoothed oil price to it.

# Full daily date range
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

# 7-observation moving average of the oil price
# (NaN until 7 values are available and wherever the window holds a missing price)
df_oil['ma_oil'] = df_oil['dcoilwtico'].rolling(7).mean()

# Left merge: calendar days without an oil record are kept, with NaN prices
calendar = calendar.merge(df_oil, how='left', left_index=True, right_index=True)

# Fill gaps: carry the last average forward, fall back to the raw price,
# then back-fill whatever is still missing at the start.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection, which is not guaranteed to modify `calendar`, and the
# deprecated `method=` argument is replaced by ffill()/bfill().
calendar['ma_oil'] = (
    calendar['ma_oil']
    .ffill()
    .fillna(calendar['dcoilwtico'])
    .bfill()
)

# Day of week (Monday=0 ... Sunday=6)
calendar['dofw'] = calendar.index.day_of_week

px.line(calendar[['dcoilwtico', 'ma_oil']])

In [None]:
# Count missing values per calendar column, then preview the first rows
calendar_nan_counts = calendar.isna().sum()
print(calendar_nan_counts)
calendar.head()

## Events Data

In [None]:
# National holidays and events, reduced to one row per date.
df_hev = pd.read_csv(path + 'holidays_events.csv', parse_dates=['date'], infer_datetime_format=True)

# 'Good Friday' mistake correction: move the 2013-04-29 entry to 2013-03-29.
# The column holds datetimes, so the value to replace is given as a Timestamp
# rather than as a string.
df_hev['date'] = df_hev['date'].replace(pd.Timestamp('2013-04-29'), pd.Timestamp('2013-03-29'))

# Set Index
df_hev = df_hev.set_index('date').sort_index()

# National level only for simplicity
df_hev = df_hev[df_hev.locale == 'National']

# Keep the first event per date so that merging the events into the calendar
# cannot duplicate calendar days.
df_hev = df_hev.groupby(df_hev.index).first()

In [None]:
# Count missing values per events column, then preview the first rows
events_nan_counts = df_hev.isna().sum()
print(events_nan_counts)
df_hev.head()

In [None]:
# Work-day flag: Monday-Friday (dofw 0-4) are work days by default
calendar['wd'] = calendar['dofw'] <= 4

# Merge calendar with events (left merge keeps every calendar day).
# Re-running this cell would merge the event columns a second time.
calendar = calendar.merge(df_hev, how='left', left_index=True, right_index=True)

# Event types that override the weekday rule
for event_type, is_work_day in (('Bridge', False), ('Work Day', True), ('Transfer', False)):
    calendar.loc[calendar['type'] == event_type, 'wd'] = is_work_day

# A holiday is a day off unless it was transferred to another date
is_holiday = calendar['type'] == 'Holiday'
calendar.loc[is_holiday & (calendar['transferred'] == False), 'wd'] = False
calendar.loc[is_holiday & (calendar['transferred'] == True), 'wd'] = True

# Drop extra columns
calendar = calendar.drop(columns=['dcoilwtico', 'locale', 'locale_name', 'description'])

In [None]:
# Missing values per column after the events merge, then a preview
merged_nan_counts = calendar.isna().sum()
print(merged_nan_counts)
calendar.head()

# **Train Data**

In [None]:
# Training data: only the four columns used for modelling are loaded
train_dtypes = {'store_nbr': 'category', 'family': 'category', 'sales': 'float32'}
df_train = pd.read_csv(
    path + 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype=train_dtypes,
    parse_dates=['date'],
    infer_datetime_format=True,
)

In [None]:
# Missing values per training column, then a preview
train_nan_counts = df_train.isna().sum()
print(train_nan_counts)
df_train.head()

In [None]:
# Plot total sales across all stores and families for each date.
# Only `sales` is aggregated: `store_nbr` and `family` are loaded as categoricals,
# which cannot be summed.
px.line(df_train.groupby('date')[['sales']].sum(), title='Total Sales')

Notice that on the first day of each year, total sales drop dramatically.

In [None]:
# set index
# Index by (store_nbr, family, date) and sort, so the frame can later be unstacked by store/family.
# Not re-runnable as is: once the columns have become index levels, set_index cannot find them again.
df_train = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()
# NOTE(review): the index is a three-level MultiIndex at this point, so this assignment
# presumably does not give the dates a frequency -- confirm whether it is needed.
df_train.index.freq = 'D'
df_train.head()

In [None]:
# Pivot to wide format: one row per date, one sales column per (store_nbr, family) pair.
# This wide frame is the dependent variable for the models below.
y = df_train.unstack(level=['store_nbr', 'family'])
y.head()

# **Test Data**

In [None]:
# Test data: same loading logic as the training data, without the sales column
test_dtypes = {'store_nbr': 'category', 'family': 'category'}
df_test = pd.read_csv(
    path + 'test.csv',
    usecols=['store_nbr', 'family', 'date'],
    dtype=test_dtypes,
    parse_dates=['date'],
    infer_datetime_format=True,
)

In [None]:
# Preview the first rows of the raw test frame
df_test.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame
df_test.info()

In [None]:
# set index
# Same (store_nbr, family, date) index and ordering as the training frame
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()
# NOTE(review): the index is a three-level MultiIndex here, so this presumably does not
# give the dates a frequency -- confirm whether it is needed.
df_test.index.freq = 'D'

In [None]:
# Preview the test frame after indexing
df_test.head()

# Train and Test Split

In [None]:
# Set X: first-order trend plus 4th-order weekly calendar Fourier terms on the dates of y
fourier = CalendarFourier(order=4, freq='W')

dp = DeterministicProcess(
    y.index,
    additional_terms=[fourier],
    order=1,
    constant=False,
    seasonal=False,
    drop=True,
)
X = dp.in_sample()
X

In [None]:
# Compare the number of feature rows in X with the number of calendar rows;
# the length of X is used as the standard below
n_feature_rows, n_calendar_rows = X.shape[0], calendar.shape[0]
print(n_feature_rows, n_calendar_rows)

In [None]:
# Add the other explanatory variables (oil price, day of week, work-day flag, event type).
# Calendar rows are matched to X by date rather than by position: taking the first
# len(X) calendar rows only lines up when both frames hold exactly the same dates
# in the same order, which the length comparison above does not guarantee.
calendar_by_date = calendar[~calendar.index.duplicated(keep='first')]
calendar_features = calendar_by_date.reindex(X.index)

X['oil'] = calendar_features['ma_oil'].values
X['dofw'] = calendar_features['dofw'].values
X['wd'] = calendar_features['wd'].values
X['type'] = calendar_features['type'].values

# One-hot encode: Monday is the dropped baseline for day of week,
# while every event type keeps its own column
X = pd.get_dummies(X, columns=['dofw'], drop_first=True)
X = pd.get_dummies(X, columns=['type'], drop_first=False)

# Back-fill any remaining gaps (bfill() replaces the deprecated fillna(method='bfill'))
X = X.bfill()
X.head()

# **Build Ridge Model**

In [None]:
# Build the ridge model and fit it on all stores/families at once (y is the wide frame).
# NOTE(review): `normalize` is presumably only accepted by older scikit-learn releases --
# confirm against the installed version.
ridge_params = dict(alpha=0.5, fit_intercept=True, normalize=True, solver='auto')
model = Ridge(**ridge_params)
model.fit(X, y)

In [None]:
# Test predictions: build the 16-day forecast features and predict with the ridge model.

stest = '2017-08-16'
etest = '2017-08-31'

# Deterministic terms for the 16 steps that follow the training index
X_test = dp.out_of_sample(steps=16)

# Extensions: the same calendar-derived features as in training, selected by date
test_calendar = calendar.loc[stest:etest]
X_test['oil']  = test_calendar['ma_oil'].values
X_test['dofw'] = test_calendar['dofw'].values
X_test['wd']   = test_calendar['wd'].values

X_test = pd.get_dummies(X_test, columns=['dofw'], drop_first=True)

# No national level events in this period, so the event-type dummies are all 0.
# Aligning to the training columns (instead of a hard-coded list of names) keeps
# the set and order of features identical to what the model was fitted on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

sales_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
sales_pred = sales_pred.stack(['store_nbr', 'family'])

# Sales should be >= 0
sales_pred = sales_pred.clip(lower=0.)

In [None]:
# Submission: write the ridge predictions into the sample submission layout.
# NOTE(review): values are assigned by position -- assumes the stacked predictions are in
# the same row order as sample_submission.csv; TODO confirm.
submission_template = path + 'sample_submission.csv'
df_sub = pd.read_csv(submission_template, index_col='id')
df_sub['sales'] = sales_pred.values
df_sub.to_csv('submission_2.0.csv', index=True)

# Build Elastic Net Model

In [None]:
from sklearn.linear_model import MultiTaskElasticNetCV

In [None]:
# Multi-task elastic net; the L1/L2 mix is chosen by cross-validation among these candidates
elastic_model = MultiTaskElasticNetCV(
    tol=0.01,
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
)

In [None]:
# Fit on the same feature matrix and wide target frame as the ridge model
elastic_model.fit(X,y)

In [None]:
# l1_ratio selected by the cross-validated fit
elastic_model.l1_ratio_

In [None]:
# Test predictions: rebuild the 16-day forecast features for the elastic net model.

stest = '2017-08-16'
etest = '2017-08-31'

# Deterministic terms for the 16 steps that follow the training index
X_test = dp.out_of_sample(steps=16)

# Extensions: the same calendar-derived features as in training, selected by date
test_calendar = calendar.loc[stest:etest]
X_test['oil']  = test_calendar['ma_oil'].values
X_test['dofw'] = test_calendar['dofw'].values
X_test['wd']   = test_calendar['wd'].values

X_test = pd.get_dummies(X_test, columns=['dofw'], drop_first=True)

# No national level events in this period, so the event-type dummies are all 0.
# Aligning to the training columns (instead of a hard-coded list of names) keeps
# the set and order of features identical to what the model was fitted on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [None]:
# Show the training features, to compare their columns with X_test
X

In [None]:
# Show the forecast-period features
X_test

In [None]:
# Take one training row (position 1) as a one-row frame for a quick prediction check
ooo =X.iloc[1:2]

In [None]:
# Predict on that one row to check that the fitted model runs
ii = elastic_model.predict(ooo)

In [None]:
# Elastic net predictions for the forecast period, reshaped to one row per
# (date, store_nbr, family).
sales_pred = pd.DataFrame(elastic_model.predict(X_test), index=X_test.index, columns=y.columns)
sales_pred = sales_pred.stack(['store_nbr', 'family'])

# Sales should be >= 0 (same post-processing as the ridge predictions)
sales_pred = sales_pred.clip(lower=0.)

In [None]:
# Show the indexed test frame for comparison with the stacked predictions
df_test

In [None]:
# Submission: write the elastic net predictions into the sample submission layout.
# NOTE(review): values are assigned by position -- assumes the stacked predictions are in
# the same row order as sample_submission.csv; TODO confirm.
submission_template = path + 'sample_submission.csv'
df_sub = pd.read_csv(submission_template, index_col='id')
df_sub['sales'] = sales_pred.values
df_sub.to_csv('submission.csv', index=True)

# Draft Below

In [None]:
# FIXME: `elastic_modeltic_model` is not defined anywhere in this notebook, so calling it
# raises NameError and stops "Run All". Disabled until the intended call is restored.
# elastic_modeltic_model()

In [None]:
# Date bounds used by the draft train/hold-out split in the next cell
sdate = '2013-01-01'
edate = '2017-07-15'

In [None]:
# Hold-out split: train from `sdate` through `edate`, evaluate on everything after it.
X_train = X.loc[sdate:edate]
y_train = y.loc[sdate:edate]

# Label slices include both endpoints, so the hold-out starts one day after `edate`
# to keep that day out of the test set. y_test is a slice (not a single row) so it
# covers the same dates as X_test.
test_start = pd.Timestamp(edate) + pd.Timedelta(days=1)
X_test = X.loc[test_start:]
y_test = y.loc[test_start:]

In [None]:
#

# **Build Ridge Model**

In [None]:
# Training window for the draft ridge model below
sdate = '2013-01-01'
edate = '2017-06-15'

In [None]:
# Set y_train
# Label-based slice of the wide target frame; both end dates are included
y_train = y.loc[sdate:edate]

In [None]:
# Set X_train: first-order trend plus 4th-order weekly calendar Fourier terms on the training dates.
# Note that this rebinds `fourier` and `dp`, which the cells above also use.
fourier = CalendarFourier(order=4, freq='W')

dp = DeterministicProcess(
    y_train.index,
    additional_terms=[fourier],
    order=1,
    constant=False,
    seasonal=False,
    drop=True,
)
X_train = dp.in_sample()
X_train

In [None]:
# Add the other explanatory variables (oil price, day of week, work-day flag, event type).
# Calendar rows are matched to X_train by date rather than by position: taking the
# first len(X_train) calendar rows only lines up when both frames hold exactly the
# same dates in the same order.
calendar_by_date = calendar[~calendar.index.duplicated(keep='first')]
train_calendar = calendar_by_date.reindex(X_train.index)

X_train['oil'] = train_calendar['ma_oil'].values
X_train['dofw'] = train_calendar['dofw'].values
X_train['wd'] = train_calendar['wd'].values
X_train['type'] = train_calendar['type'].values

# One-hot encode: Monday is the dropped baseline for day of week,
# while every event type keeps its own column
X_train = pd.get_dummies(X_train, columns=['dofw'], drop_first=True)
X_train = pd.get_dummies(X_train, columns=['type'], drop_first=False)
X_train.head()

In [None]:
# Check which column has Nan
# Result is a list of the names of columns that contain at least one missing value
X_train.columns[X_train.isna().any()].tolist()

In [None]:
# Back-fill missing values from the next available row.
# bfill() replaces the deprecated fillna(method='bfill'); the result is assigned back
# rather than modified in place.
X_train = X_train.bfill()

In [None]:
# Check which column has Nan again
# Lists any columns that still contain missing values after the back-fill
X_train.columns[X_train.isna().any()].tolist()

In [None]:
# Build the ridge model on the training window and compute in-sample predictions.
# NOTE(review): `normalize` is presumably only accepted by older scikit-learn releases --
# confirm against the installed version.
model = Ridge(fit_intercept=True, solver='auto', alpha=0.5, normalize=True)
model.fit(X_train, y_train)

# The predictions have one row per training date and one column per target series,
# so they are labelled with the index of X_train and the columns of y_train.
y1_pred = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=y_train.columns)

In [None]:
# Show the in-sample ridge predictions
y1_pred

# Test the model

In [None]:
# NOTE(review): the training window above ends on 2017-06-15, so a start of 2013-06-16
# may be a typo for 2017-06-16 -- confirm the intended evaluation window.
sdate = '2013-06-16'
edate = '2017-08-15'

In [None]:
# NOTE(review): `mean_squared_error`, `test` and `predictions` are not defined or imported
# anywhere in this notebook; this line looks carried over from another analysis -- confirm
# the intended inputs before running.
error1 = mean_squared_error(test['Employees'], predictions)

In [None]:
# Show the in-sample ridge predictions (repeats an earlier display cell)
y1_pred

In [None]:
# Show the in-sample ridge predictions (repeats an earlier display cell)
y1_pred

In [None]:
# NOTE(review): `y_pred` is only assigned in the next cell, so this fails on a fresh
# top-to-bottom run.
y_pred

In [None]:
# Rebuild target and features for the [sdate, edate] window, then refit the ridge model.
y = df_train.unstack(['store_nbr', 'family']).loc[sdate:edate]

# First-order trend plus 4th-order weekly calendar Fourier terms on the dates of y
fourier = CalendarFourier(freq='W', order=4)

dp = DeterministicProcess(index=y.index,
                          constant=False,
                          order=1,
                          seasonal=False,
                          additional_terms=[fourier],
                          drop=True)
X = dp.in_sample()

# Extensions: calendar-derived features, matched to X by date.
# A plain `calendar.loc[sdate:edate]` slice returns every calendar row in the window,
# so its length only equals len(X) when y holds exactly the same dates;
# reindexing on X.index makes the alignment explicit.
calendar_by_date = calendar[~calendar.index.duplicated(keep='first')]
window_calendar = calendar_by_date.reindex(X.index)

X['oil']  = window_calendar['ma_oil'].values
X['dofw'] = window_calendar['dofw'].values
X['wd']   = window_calendar['wd'].values
X['type'] = window_calendar['type'].values

X = pd.get_dummies(X, columns=['dofw'], drop_first=True)
X = pd.get_dummies(X, columns=['type'], drop_first=False)

# NOTE(review): `normalize` is presumably only accepted by older scikit-learn releases --
# confirm against the installed version.
model = Ridge(fit_intercept=True, solver='auto', alpha=0.5, normalize=True)
model.fit(X, y)
y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)

In [None]:
# Reload the test data, this time with daily periods as dates,
# indexed and sorted by (store_nbr, family, date)
test_dtypes = {'store_nbr': 'category', 'family': 'category'}
df_test = pd.read_csv(
    path + 'test.csv',
    usecols=['store_nbr', 'family', 'date'],
    dtype=test_dtypes,
    parse_dates=['date'],
    infer_datetime_format=True,
)

df_test['date'] = df_test['date'].dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date'])
df_test = df_test.sort_index()

In [None]:
# Show the re-indexed test frame
df_test

In [None]:
# find missing values
# NOTE(review): `df` is first assigned further down (the GROCERY I filter cell), so this
# fails on a fresh top-to-bottom run.
df.isna().sum()

In [None]:
# find min and max date
# Only the last rows are shown here; `df` is first assigned further down in the notebook.
df.tail()

Notice the date range is **2013-01-01** to **2017-08-15**

# Predict Grocery I Sales First

In [None]:
# filter data
# Keep the GROCERY I family of store 1 and drop the listed columns.
# NOTE(review): earlier in this notebook df_train is loaded with only store_nbr, family,
# date and sales, and is then indexed by store_nbr/family/date. The column lookups and
# the drop list ('id', 'onpromotion') therefore presumably target an older version of
# the frame -- confirm before running.
df = df_train[(df_train['family'] == 'GROCERY I') & (df_train['store_nbr'] == 1)]
cols = ['id','store_nbr','family','onpromotion']
df = df.drop(cols,axis =1)
# Only the last expression of a cell is displayed, so head() produces no output here
df.head()
df.tail()

In [None]:
# Preview df reindexed to the five daily periods 2017-08-11..2017-08-15, using 0 for dates
# without a row. The result is only displayed; `df` itself is not changed.
idx = pd.period_range('2017-08-11', '2017-08-15')
df.reindex(idx, fill_value=0)


In [None]:
# Sum to one row per calendar day and replace remaining NaN with 0.
# NOTE(review): resample requires a datetime-like index on `df` -- confirm it has one here.
df = df.resample('D').sum().fillna(0)

In [None]:
# Show the daily series
df

In [None]:
# check continuity
# Declare a daily frequency on the index, then count missing sales values
df.index.freq = 'D'
df.sales.isnull().sum()

In [None]:
# Line chart of the daily sales series
px.line(df['sales'])

NOTE: A magnitude 7.8 earthquake struck Ecuador on April 16, 2016. People rallied in relief efforts, donating water and other first-need products, which greatly affected supermarket sales for several weeks after the earthquake.


## Run an ETS Decomposition

In [None]:
# Decompose the sales series with seasonal_decompose's default settings and plot the result;
# the trailing semicolon suppresses the figure repr in the cell output
result = seasonal_decompose(df['sales'])
result.plot();

In [None]:
# Additive decomposition of the daily series, drawn on a larger 18x8 canvas
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = (18, 8)

decomposition = seasonal_decompose(df, model='additive')
fig = decomposition.plot()
plt.show()

In [None]:
# For testing purposes: order (1, 1, 0) with seasonal order (1, 1, 0, 7), i.e. a 7-step season
sarimax_options = dict(
    order=(1, 1, 0),
    seasonal_order=(1, 1, 0, 7),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = SARIMAX(df, **sarimax_options)
results = mod.fit()

# Print the second table of the fit summary
coefficient_table = results.summary().tables[1]
print(coefficient_table)

In [None]:
# Investigate any unusual behaviour with the fitted model's diagnostic plots
results.plot_diagnostics(figsize=(16, 8))
plt.show()

In [None]:
# One-step-ahead forecasting and validation from 2017-01-01 onwards.
# dynamic=False: each prediction uses the observations up to the previous step.
pred = results.get_prediction(start=pd.to_datetime('2017-01-01'), dynamic=False)
pred_ci = pred.conf_int()

# Observed series from 2014 on (explicit label-based row slice), forecast on the same axes
ax = df.loc['2014':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))

# Shade the band between the lower and upper confidence bounds
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
# The series being modelled is GROCERY I sales, not the automotive family
ax.set_ylabel('GROCERY I sales')
plt.legend()
plt.show()

In [None]:
# Mean squared error of the one-step-ahead forecasts over 2017.
y_forecasted = pred.predicted_mean

# Compare against the `sales` Series, not the one-column DataFrame: subtracting a
# DataFrame from a Series matches the Series' date index against the frame's column
# labels, which yields only NaN.
y_truth = df['sales'].loc['2017-01-01':]
mse = ((y_forecasted - y_truth) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

In [None]:
# df_train.loc['5','AUTOMOTIVE']