In [None]:
import pandas as pd
import numpy as np

from scipy.stats import boxcox

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline 
plt.rcParams['figure.figsize']=[20,6]
plt.rcParams['image.cmap'] = 'gray'
sns.set_theme(style='darkgrid')

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Folder holding the Kaggle "Rossmann Store Sales" input files.
DATA_DIR = '../input/rossmann-store-sales'

# StateHoliday carries both the "no holiday" marker 0 and the letter codes a/b/c
# (see statHol below), so it is read as text to keep one consistent dtype
# instead of letting the CSV parser infer a mixed int/str column.
train = pd.read_csv(f'{DATA_DIR}/train.csv', parse_dates=['Date'], dtype={'StateHoliday': str})
test  = pd.read_csv(f'{DATA_DIR}/test.csv', parse_dates=['Date'], dtype={'StateHoliday': str})
store = pd.read_csv(f'{DATA_DIR}/store.csv')


### Get the data in the correct format & impute missing vals

In [None]:
# train.isna().sum()
test.isna().sum()

In [None]:
store.isna().sum()

In [None]:
# --- test: fill a missing `Open` flag with the most common value on that date ---
# transform() returns a result aligned to test's own index, so the assignment
# cannot be misaligned by group keys; a date with no known value is left as NaN
# instead of raising on an empty mode().
def _fill_with_mode(values):
    modes = values.mode()
    return values.fillna(modes.iloc[0]) if len(modes) else values

test['Open'] = test.groupby('Date')['Open'].transform(_fill_with_mode)

# --- store: numeric gaps ---
store['CompetitionDistance'] = store['CompetitionDistance'].fillna(store['CompetitionDistance'].median())

# Every remaining gap (competition / promo2 dates, PromoInterval) becomes 0,
# which the steps below treat as "not available".
store = store.fillna(0)
numeric_cols = [col for col in store.columns if store[col].dtype != 'object']
store[numeric_cols] = store[numeric_cols].astype(int)

# --- store: competition opening date ---
# Unknown month -> January, unknown year -> 2013, i.e. an unknown date becomes
# 2013-01-01. The replacements are assigned to new Series rather than done
# in place on a column selection, which is not guaranteed to write back.
competition_month = store['CompetitionOpenSinceMonth'].replace(0, 1)
competition_year = store['CompetitionOpenSinceYear'].replace(0, 2013)
store['CompetitionSince'] = pd.to_datetime(
    pd.DataFrame({'year': competition_year, 'month': competition_month, 'day': 1})
)

# --- store: promo2 start date ---
# The previous int(week / 4) "month" mapped weeks 1-3 to 0, which was then
# rewritten to December, and would map week >= 52 to a non-existent month 13.
# The date is now derived directly as Jan 1st of the year plus (week - 1) weeks.
# Stores without promo2 information keep the old sentinel date 2000-12-01.
has_promo2 = store['Promo2SinceYear'] != 0
promo2_year = store['Promo2SinceYear'].where(has_promo2, 2000)
promo2_week = store['Promo2SinceWeek'].where(has_promo2, 1).clip(lower=1)
promo2_year_start = pd.to_datetime(pd.DataFrame({'year': promo2_year, 'month': 1, 'day': 1}))
store['promo2Since'] = promo2_year_start + pd.to_timedelta((promo2_week - 1) * 7, unit='D')
store.loc[~has_promo2, 'promo2Since'] = pd.Timestamp('2000-12-01')

store = store.drop(columns=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
                            'Promo2SinceWeek', 'Promo2SinceYear'])

In [None]:


def statHol(df):
    """
    Label-encode the ``StateHoliday`` column of ``df`` and return ``df``.

    The letter codes are replaced by integers ('a' -> 1, 'b' -> 2, 'c' -> 3);
    any other value is kept as it is and the whole column is then cast with
    ``astype(int)``. The column is overwritten on the frame that was passed in.
    """
    # a = public holiday, b = Easter holiday, c = Christmas, 0 = None
    holiday_codes = {'a': 1, 'b': 2, 'c': 3}

    encoded = df['StateHoliday'].map(lambda value: holiday_codes.get(value, value))
    df['StateHoliday'] = encoded.astype(int)

    return df

train = statHol(train)
test  = statHol(test)
# df_all = pd.concat([train, test])
df = pd.merge(train,store, on='Store', how='left') 
dft = pd.merge(test, store, on='Store', how='left')
# df.info()


# EDA-I

Since there are a lot of stores, it is impossible to check each and every one of them; it is therefore a better idea to use the power of randomization and inspect randomly chosen stores.

In [None]:
class RandomCheck:
    """
    Namespace of plotting helpers used to spot-check individual stores.

    Every helper is a static method, so it can be called as
    ``RandomCheck.eda()`` or on an instance. The helpers read the
    notebook-level frames directly: ``train`` (raw sales), ``store`` (store
    metadata) and ``df`` (``train`` merged with ``store``).
    """

    def __init__(self):
        pass

    @staticmethod
    def randomStore():
        """Return one store id drawn at random from the ids present in ``train``."""
        return np.random.choice(train.Store.unique())

    @staticmethod
    def eda(*args):
        """
        Draw the per-store overview figures and return the store id used.

        Parameters
        ----------
        *args : optional
            ``args[0]`` is the store id to inspect; a random store is used
            when nothing is passed.

        Returns
        -------
        The id of the store that was plotted.
        """
        nb = args[0] if args else RandomCheck.randomStore()

        sales = train[train['Store'] == nb].sort_values(by='Date')
        store_type = store[store['Store'] == nb]['StoreType']

        # ---- figure 1: opening pattern and weekly profile ----
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
        fig.suptitle(f"STORE NUMBER:{nb}; TYPE:{store_type.iloc[0].upper()}", fontsize=22, fontweight='bold')

        ax1.set_title("Open vs Closed")
        # order= pins both categories, so the two tick labels stay valid even
        # for a store that was never closed (or never open).
        sns.stripplot(x='Open', y='DayOfWeek', data=sales, order=[0, 1], ax=ax1)
        ax1.set_xticklabels(['Closed', 'Open'])

        weekly = sales.groupby('DayOfWeek')[['Sales', 'Customers']].mean()
        ax2.set_title("Sales over the week")
        sns.barplot(x=weekly.index, y='Sales', data=weekly, ax=ax2)
        ax3.set_title("Customers over the week")
        sns.barplot(x=weekly.index, y='Customers', data=weekly, ax=ax3)

        plt.tight_layout()

        # ---- figure 2: promo / holiday effects ----
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3)

        ax1.set_title("Promo effect")
        by_promo = pd.DataFrame(sales.groupby('Promo')['Sales'].mean())
        sns.barplot(x=by_promo.index, y='Sales', data=by_promo, ax=ax1)

        ax2.set_title("School Holiday effect")
        by_school = pd.DataFrame(sales.groupby('SchoolHoliday')['Customers'].mean())
        sns.barplot(x=by_school.index, y='Customers', data=by_school, ax=ax2)

        ax3.set_title("State Holiday effect")
        by_state = pd.DataFrame(sales.groupby(['StateHoliday'])['Sales'].mean())
        # A single store rarely has all four holiday codes; order= keeps four
        # slots so that the four labels below always line up with their bars.
        sns.barplot(x=by_state.index, y='Sales', data=by_state, order=[0, 1, 2, 3], ax=ax3)
        ax3.set_xticklabels(['No holiday', 'Public holiday', 'Easter', 'Christmas'])

        plt.tight_layout()

        # ---- figure 3: raw distributions ----
        measures = ['Sales', 'Customers']

        fig, ax = plt.subplots(1, 2)
        fig.suptitle("Univariate Analysis", fontsize=18, fontweight='bold')
        for axis, column in zip(ax, measures):
            sns.kdeplot(sales[column], ax=axis)
        plt.tight_layout()

        # ---- figure 4: Box-Cox transformed distributions (open days only) ----
        # .copy() so the transformed columns are written to an independent
        # frame, not to a filtered view of `train`.
        open_days = sales[sales['Open'] == 1].copy()
        # boxcox needs strictly positive input, hence 0 -> 1 before transforming
        open_days['Sales'], _ = boxcox(open_days['Sales'].replace(0, 1))
        open_days['Customers'], _ = boxcox(open_days['Customers'].replace(0, 1))

        fig, ax = plt.subplots(1, 2)
        fig.suptitle("Normalized", fontsize=14, fontweight='bold')
        for axis, column in zip(ax, measures):
            sns.kdeplot(open_days[column], ax=axis)
        plt.tight_layout()

        return nb

    @staticmethod
    def printnb():
        """Print the id of a randomly drawn store."""
        nb = RandomCheck.randomStore()
        print(nb)

    @staticmethod
    def store(nb):
        """
        Plot the mean non-zero sales of store ``nb`` at weekly, monthly and
        yearly resolution (one figure per resolution).
        """
        # Inside this method `store` is the notebook-level metadata frame; the
        # method name only exists as an attribute of the class.
        data = train[train['Store'] == nb]
        data = data[['Date', 'Sales']]
        data = data.loc[data['Sales'] != 0]
        data = data.set_index('Date')
        v = store[store['Store'] == nb]['StoreType'].iloc[0]

        # The figure size is handed to each plot call: DataFrame.plot opens its
        # own figure, so a separate plt.figure() would only leave an empty one.
        size = [14, 4]
        data.resample('W').mean().plot(title=f"Store:#{nb}|| Type:{v}\n Weekly view", figsize=size)
        data.resample('M').mean().plot(title="Monthly view", figsize=size)
        data.resample('Y').mean().plot(title="Yearly view", figsize=size)

    @staticmethod
    def eda2():
        """Bar charts of mean sales per store type and per assortment type."""
        # One Axes per chart; drawing both on the same Axes would overlay the
        # bars and keep only the second title.
        fig, (ax_type, ax_assortment) = plt.subplots(1, 2)

        ax_type.set_title("Store Type")
        by_type = pd.DataFrame(df.groupby('StoreType')['Sales'].mean())
        sns.barplot(x=by_type.index, y='Sales', data=by_type, ax=ax_type)

        ax_assortment.set_title("Assortment Type")
        by_assortment = pd.DataFrame(df.groupby('Assortment')['Sales'].mean())
        sns.barplot(x=by_assortment.index, y='Sales', data=by_assortment, ax=ax_assortment)

        plt.tight_layout()

    @staticmethod
    def competition(*args):
        """
        Plot sales and their 26-row moving average for four stores (2x2 grid),
        marking the date stored in ``CompetitionSince`` when it is later than
        2013-01-01.

        Parameters
        ----------
        *args : optional
            ``args[0]`` is a sequence of store ids; panels without a
            corresponding id are filled with random stores.

        Returns
        -------
        list
            The store ids that were plotted, in panel order.
        """
        arr = []
        fig, ax = plt.subplots(2, 2)
        requested = list(args[0]) if args else []

        for h, axis in enumerate(ax.flatten()):

            nb = requested[h] if h < len(requested) else RandomCheck.randomStore()
            arr.append(nb)

            data = df[df['Store'] == nb]
            data = data[['Date', 'Sales']].set_index('Date').sort_index()
            data = data[data['Sales'] != 0].copy()
            data['26DayMovingAverage'] = data['Sales'].rolling(window=26).mean()

            idx = store[store['Store'] == nb]['CompetitionSince'].iloc[0]

            data.plot(ax=axis, rot=0, title=f"Store #{nb}")
            # place the annotation relative to this store's own sales range
            label_height = 0.8 * data['Sales'].max()

            if idx > pd.to_datetime('2013/01/01'):
                axis.axvline(idx, color='r')
                axis.text(idx, label_height, 'Competition enters', fontweight='heavy', rotation=45)
            else:
                # NOTE(review): this branch is also taken for any CompetitionSince
                # on or before 2013-01-01, not only for stores without a date.
                axis.text(data.index[len(data) // 2], label_height, 'No competition', fontweight='heavy')

        plt.tight_layout()

        return arr

    @staticmethod
    def competitionDis(arr):
        """Horizontal bar chart of ``CompetitionDistance`` for the store ids in ``arr``."""
        dis = {
            nb: store[store['Store'] == nb]['CompetitionDistance'].iloc[0]
            for nb in arr
        }
        pd.DataFrame.from_dict(dis, orient='index').plot.barh(xlabel='Store#',
                                                              rot=0,
                                                              legend=False,
                                                              figsize=[18, 2])

    @staticmethod
    def storetype(*args):
        """
        Plot the 12-row moving average of sales for one random store of each
        requested store type (all of 'a', 'b', 'c', 'd' when none is given).
        """
        if not args:
            args = ['a', 'b', 'c', 'd']

        palette = ['red', 'blue', 'green', 'black']
        labels = []
        colors = []

        for n, typ in enumerate(args):

            nb = np.random.choice(df[df['StoreType'] == typ]['Store'].unique())
            color = palette[n % len(palette)]
            labels.append(f"Store#{nb}||Type#{typ}")
            colors.append(color)

            # sorted by date so the rolling window is a trailing average in time
            data = df[(df['Store'] == nb) & (df['Sales'] != 0)].sort_values(by='Date')
            moving_average = data['Sales'].rolling(window=12).mean()
            plt.plot(data['Date'], moving_average, c=color, linewidth=2)

        plt.ylabel('Sales')
        plt.xlabel('Date')
        plt.xticks([])
        plt.yticks([])
        plt.legend(labelcolor=colors, labels=labels)

    @staticmethod
    def promo(arr):
        """
        For every store id in ``arr`` draw one figure comparing sales on promo
        days (filled area) with sales on non-promo days (scatter), together
        with the 26-row moving average of all non-zero sales.
        """
        for nb in arr:

            data = df[df['Store'] == nb]
            data = data[['Date', 'Promo', 'Sales']]
            # sorted by date so the rolling window below follows time order
            data = data[data['Sales'] != 0].sort_values(by='Date')
            promo_days = data[data['Promo'] == 1]
            plain_days = data[data['Promo'] == 0]
            moving_average = data['Sales'].rolling(window=26).mean()

            # a fresh figure per store, created up front so no empty figure is
            # left behind after the last store
            fig, ax = plt.subplots()
            area = ax.fill_between(promo_days['Date'], promo_days['Sales'], label='Promo')
            dots = ax.scatter(plain_days['Date'], plain_days['Sales'], c='orange', label='No promo')
            line, = ax.plot(data['Date'], moving_average, c='r', linewidth=6, label='Moving Average')

            ax.set_title(f'Promo vs No Promo: Store#{nb}')
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_ylabel('Sales')
            # explicit handles tie every legend label to the artist it describes
            ax.legend(handles=[dots, line, area], labelcolor=['orange', 'r', 'b'], framealpha=0.0,
                      frameon=True, shadow=True, loc='upper left')
            fig.tight_layout()

    @staticmethod
    def promo2(*args):
        """
        Plot the non-zero sales of one store and shade the month following its
        ``promo2Since`` date when that date is later than 2013-01.

        ``args[0]`` is the store id; without it a random store with
        ``Promo2 == 1`` is used.
        """
        if args:
            nb = args[0]
        else:
            nb = np.random.choice(store[store['Promo2'] == 1]['Store'].unique())

        meta = store[store['Store'] == nb]
        v1 = meta['promo2Since'].iloc[0]
        v2 = v1 + pd.DateOffset(months=1)
        v3 = meta['PromoInterval'].iloc[0]

        data = train[train["Store"] == nb]
        data = data[data['Sales'] != 0]

        plt.title(f"Store:#{nb} || Promo2 Interval:{v3} || Promo2 start:{v1}")
        plt.plot(data['Date'], data['Sales'])
        if v1 > pd.to_datetime('2013-01'):
            plt.axvspan(v1, v2, alpha=0.5, color='red')
        plt.legend(labelcolor=['blue', 'red'], labels=['sales', 'promo2'])
        plt.yticks([])
        plt.ylabel('Sales')

    @staticmethod
    def fft(*args):
        """
        Plot the magnitude spectrum (real FFT) of one store's daily sales with
        the frequency axis expressed in cycles per year.

        ``args[0]`` is the store id; without it a random store is used.
        """
        if args:
            nb = args[0]
        else:
            nb = np.random.choice(store['Store'].unique())

        stype = store[store['Store'] == nb]['StoreType'].iloc[0]

        data = train[train['Store'] == nb]
        data = data[['Date', 'Sales']].set_index('Date').sort_index()

        spectrum = np.abs(np.fft.rfft(data.Sales.values))
        # rescale the frequency bins from "cycles per series" to "cycles per year"
        n_years = len(data) / 365.2425
        f_years = np.arange(0, len(spectrum)) / n_years

        plt.figure(figsize=[6, 2])
        plt.title(f"SEASONALITY \nStore:{nb} StoreType:{stype}")
        plt.step(f_years, spectrum)
        plt.ylim(0, 2000000)
        plt.xscale('log')
        plt.yticks([])
        plt.xticks([1, 12, 52, 365.24], labels=['1/year', '1/months', '1/weeks', '1/days'])
        plt.show();

The randomized plot shows the details of a new store every time it is executed, which is very useful for getting a feel for the data one is dealing with.

In [None]:
nb = RandomCheck.eda()

In [None]:
RandomCheck.store(nb)

In [None]:
RandomCheck.fft(nb)

### Observations:
* Most (or maybe all) stores are closed on Day7; need to check if there is/are any store(s) which is open on Day7
* Worth checking stores like #512, which seem to operate daily, even on holidays; it even has a consistent linear trend, which most stores don't. Actually, this is a trait of b type stores.
* Stores like #231 & #181 have had periods of 4-6 months of no business; need to check for more like this
* Stores like #109 are examples of how majority of stores are; we must get some info by clustering based on Store type which hasn't been explored yet.
* Problem with store #494 -- Never closed; need to check if there are more like this
* There are a few stores such as #977 which are open even on public holidays; need to check more stores like this
* Apart from the weekly timeframe which is quite evident from the plots, there seems to be no clear seasonality for others.

In [None]:
## Count the rows where a store is flagged as open (Open != 0) but recorded zero sales.
## NOTE(review): the earlier note said this validates the Day7 closures, but no DayOfWeek filter is applied here.

nsales  = train[(train['Sales']==0) & (train['Open']!=0)]
print(f"Stores open but no sales: {len(nsales)}")
# train = train[(train['Sales']!=0) & (train['Open']==1)]
# print(f"Training data after dropping the days where there is no sales:{len(train)}")

Another reason why dropping the zero-sales rows makes sense is that the value on those days is exactly 0 rather than some reduced, non-zero amount. If the holiday effect instead led to a different kind of change (as it would for power consumption, for example), it would be a bad idea to drop the holiday rows.

# EDA-II

The main idea is to achieve the followings:
* Analyze effect of store type
* effect of assortment type
* effect of competition presence

In [None]:
# Number of distinct stores per store type. The counts are looked up by label:
# unique() lists the types in order of appearance while groupby sorts them by
# key, so pairing the two by position can attach a count to the wrong type.
stores_per_type = df.groupby('StoreType')['Store'].nunique()
for store_type in df.StoreType.unique():
    print(f"Total number of '{store_type}' type stores is {stores_per_type[store_type]}")

# Same for assortments: 'a' = basic, 'b' = extra, anything else = extended.
assortment_names = {'a': 'basic', 'b': 'extra'}
stores_per_assortment = df.groupby('Assortment')['Store'].nunique()
for code in df.Assortment.unique():
    label = assortment_names.get(code, 'extended')
    print(f"Total number of '{label}' assortment type is {stores_per_assortment[code]}")

In [None]:

# Mean and total Sales/Customers per store type and per assortment.
# The two columns are selected with a list: selecting several groupby columns
# with a bare tuple (['Sales', 'Customers'] without the inner brackets) is not
# supported by current pandas.
measures = ['Sales', 'Customers']
data1 = df.groupby('StoreType')[measures].mean().sort_index()
data2 = df.groupby('Assortment')[measures].mean().sort_index()
data3 = df.groupby('StoreType')[measures].sum().sort_index()
data4 = df.groupby('Assortment')[measures].sum().sort_index()

assortment_labels = ['basic', 'extra', 'extended']

# top row: mean values, bottom row: summed values
fig, ((ax_type_mean, ax_assort_mean), (ax_type_sum, ax_assort_sum)) = plt.subplots(2, 2)

data1.plot.bar(ax=ax_type_mean, rot=0, title="Mean values")
data2.plot.bar(ax=ax_assort_mean, rot=0, title="Mean values")
ax_assort_mean.set_xticklabels(assortment_labels)

data3.plot.bar(ax=ax_type_sum, rot=0, title="Summed values")
data4.plot.bar(ax=ax_assort_sum, rot=0, title="Summed values")
ax_assort_sum.set_xticklabels(assortment_labels)

plt.tight_layout()

In [None]:
RandomCheck.storetype()

So, it looks like b type stores on an average have much more sales as well as customers, but the surprising thing is they don't account much of the total revenue.

In [None]:
store['StoreType'].value_counts().sort_index()

In [None]:
# Three pies side by side: revenue share, customer share and number of stores,
# each broken down by store type.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
pie_options = dict(legend=True, autopct='%1.1f%%', shadow=True)

revenue_by_type = df.groupby('StoreType')['Sales'].sum()
revenue_by_type.plot.pie(ax=ax1, title='Revenue share', **pie_options)

customers_by_type = df.groupby('StoreType')['Customers'].sum()
customers_by_type.plot.pie(ax=ax2, title='Customer share', **pie_options)

stores_by_type = store['StoreType'].value_counts().sort_index()
stores_by_type.plot.pie(ax=ax3, rot=0, title='Store types', **pie_options)

So, the pie chart provides a lot of info;

1. Customer/Store - a&c=1; b=3.3; d=0.8
1. Sales/Store - a,c&d=1; b=1.8;
1. Sales/Customer - a&c=1; b=0.6; d=1.3

* a & c have similar per store per customer sales figures. Simply put every store has 1 customer for a unit of sale.
* b looked good initially, with the highest average sales, but it has 3 customers for every 2 units of sale. The good thing about this store type is that its per-store revenue (almost 2X) is higher than that of any other type.
* d looks like they could use more promotion to get more customers as they have 1 customer for 1.2 units of sale which is good value. 



Admittedly, this analysis is more meaningful if the product range is the same across the store types.

example: 1086,983,675,578

In [None]:
arr = RandomCheck.competition([1086,983,675,578])

Stores like 1086, 983, 675 showcase that after the competition enters the sales have gone down.

There are stores like 578 where the opposite has happened.

In [None]:
RandomCheck.competitionDis(arr)

This is quite a nice finding

All the above stores were affected by the competition, but store 578 did not suffer the negative effect the others went through, because its competition was too far away to affect it. This effect can be verified by running some more random checks, and it is quite an important finding.



In [None]:
RandomCheck.promo(arr)

It seems quite clear that a store's sales during a promo clearly sees a surge compared to when there is no promo. Lets check promo2 also.


In [None]:
store.info()

In [None]:
RandomCheck.promo2()

In [None]:
RandomCheck.fft()

# Conclusion of EDA
1. Store type b are trending the entire time. In general, other store types have an upward trajectory but there are a few which are in a state of slump.
1. Store type effects on customers & revenue is discussed and elaborated above. Based on that it seems that there are quite a lot of opportunities in store type 'b' & 'd' as they add some value in terms of attracting more number of customer per store & more sales per customer, respectively. Store type a & c are quite similar in terms of "per customer and per store" sales numbers and just because of the sheer volume of a type stores they have best gross numbers. 
2. Also, it seems that in b type stores the product family is quite different from the others, as the revenue per store is substantially higher than for the other types.
2. Majority of stores have Christmas holiday seasonality.
3. Promo seems to add a positive lift to the number of sales.
4. The presence of competition has an interesting effect. If the competition is located close by (let's say within the median or mean distance, or between the 1st and 3rd quartiles), it results in the loss of some revenue. Otherwise, the competition is practically too far away to cause any loss of revenue (which, in turn, even leads to an increase in sales in some cases).
1. It seems that promo 2 is pointless :(



# Forecast using traditional TS models
* VAR family method: We use this because VAR enables multivariate prediction, which means we can predict Sales as well as Customers. In a realistic scenario, a multivariate process is what we would want.
* ARIMA process: We can just do a univariate prediction as this is what the competition demands. We will see how we can integrate the exogenous variables in this process.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.api import ExponentialSmoothing
from tqdm import tqdm_notebook
from itertools import product

In [None]:
class Model:
    """
    Namespace of helpers for the classical time-series models.

    All helpers are static methods. Several of them read notebook-level
    variables produced by ``preWithZero`` / ``preWithoutZero`` (``dt``,
    ``endogTrain``, ``exogTrain``, ``endogTest``, ``exogTest``) in addition to
    the raw ``train`` and ``store`` frames.
    """

    def __init__(self):
        pass

    @staticmethod
    def optimizeSARIMAX(s=12):
        """
        Grid-search SARIMAX orders (p, q, P, Q) in 0..3 with d = D = 0 on the
        notebook-level ``endogTrain['Sales']`` / ``exogTrain``.

        Parameters
        ----------
        s : int, default 12
            Seasonal period used in ``seasonal_order``.

        Returns
        -------
        pandas.DataFrame
            Columns ``combo`` and ``AIC``, sorted by ascending AIC.
            Combinations whose fit raised an error are left out.
        """
        # --> (p,q,P,Q)
        orders = range(0, 4, 1)
        combo = list(product(orders, orders, orders, orders))

        res = []

        for i in tqdm_notebook(combo):

            try:
                model = SARIMAX(endogTrain['Sales'], exog=exogTrain, order=(i[0], 0, i[1]),
                                seasonal_order=(i[2], 0, i[3], s), simple_differencing=False).fit(disp=False)
            except Exception:
                # skip combinations that cannot be fitted; KeyboardInterrupt is
                # deliberately not caught so the long search can be stopped
                continue

            res.append([i, model.aic])

        resDF = pd.DataFrame(res, columns=['combo', 'AIC']).sort_values(by='AIC').reset_index(drop=True)

        return resDF

    @staticmethod
    def optimizeVAR(endog, exog):
        """
        Grid-search VARMAX orders (p, q) in 0..6 and return a frame with the
        columns ``combo``, ``mse`` and ``aic`` sorted by ascending ``mse``.
        Combinations whose fit raised an error are left out.
        """
        p = range(0, 7, 1)
        q = range(0, 7, 1)

        combo = list(product(p, q))

        res = []

        for i in tqdm_notebook(combo):

            try:
                model = VARMAX(endog=endog, exog=exog, order=i, enforce_stationarity=True).fit(disp=False)
            except Exception:
                # see optimizeSARIMAX: only fitting errors are skipped
                continue

            res.append([i, model.mse, model.aic])

        varDF = pd.DataFrame(res, columns=['combo', 'mse', 'aic']).sort_values(by='mse')

        return varDF

    @staticmethod
    def statTests(df):
        """Print the p-value of the augmented Dickey-Fuller test on ``df``."""
        print(f"p-value:{adfuller(df)[1]}")

    @staticmethod
    def causeTest(df, n):
        """
        Run Granger causality tests between Sales and Customers.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with the columns ``Sales`` and ``Customers``.
        n : int
            Maximum lag handed to ``grangercausalitytests``.
        """
        # grangercausalitytests checks whether the SECOND column helps to
        # predict the FIRST one, so the "cause" goes last in each selection.
        print("Sales causes customers")
        grangercausalitytests(df[['Customers', 'Sales']], n)
        print("#####################################################################")
        print("\nCustomer causes sales")
        grangercausalitytests(df[['Sales', 'Customers']], n)

    @staticmethod
    def _prepare(args, drop_zero_sales):
        """Shared implementation of preWithZero / preWithoutZero."""
        if args:
            nb = args[0]
        else:
            # draw from the real store ids, not from positions 0..len(store)-1
            nb = np.random.choice(store['Store'].unique())

        dat = train[train['Store'] == nb]
        if drop_zero_sales:
            dat = dat[dat['Sales'] != 0]
        dat = dat.sort_values(by="Date").reset_index(drop=True)

        endog = dat[['Sales', 'Customers']].astype(float)
        dt = dat.pop('Date')
        exog = dat.drop(['Store', 'Sales', 'Customers'], axis=1)

        # chronological 80 / 20 split
        split = int(0.8 * len(endog))

        return dt, endog[:split], exog[:split], endog[split:], exog[split:]

    @staticmethod
    def preWithZero(*args):
        """
        Build the modelling inputs for one store, keeping zero-sales days.

        ``args[0]`` is the store id; without it a random store is used.

        Returns
        -------
        tuple
            ``(dt, endogTrain, exogTrain, endogTest, exogTest)`` where ``dt``
            is the full, date-sorted ``Date`` column, ``endog*`` hold ``Sales``
            and ``Customers`` as floats and ``exog*`` hold the remaining
            columns of ``train``. The first 80% of the rows form the train
            part.
        """
        return Model._prepare(args, drop_zero_sales=False)

    @staticmethod
    def preWithoutZero(*args):
        """
        Same as ``preWithZero`` but rows with ``Sales == 0`` are dropped before
        the split.
        """
        return Model._prepare(args, drop_zero_sales=True)

    @staticmethod
    def plotPred(model):
        """
        Predict up to the last date in the notebook-level ``dt`` and plot the
        last 50 predictions against the actual test sales.

        Returns the predicted mean as returned by ``get_prediction``.
        """
        preds = model.get_prediction(end=len(dt) - 1, exog=exogTest)
        predDF = preds.predicted_mean
        # the last dates of `dt` belong to the test split, so the matching
        # actual values come from endogTest
        plt.plot(dt[-50:], endogTest['Sales'][-50:], c='g')
        plt.plot(dt[-50:], predDF.iloc[-50:], c='r')
        plt.legend(labelcolor=['green', 'red'], labels=['Actual', 'Forecast'])
        return predDF

    @staticmethod
    def metrics(predDF):
        """
        Print MSE, RMSE, MAPE and RMSPE of ``predDF`` against the
        notebook-level ``endogTest['Sales']``.

        The percentage metrics skip rows whose actual sales are 0, because a
        percentage error is undefined there and would turn both values into
        ``inf``.
        """
        actual = endogTest["Sales"]
        # pandas aligns on the index here; unmatched rows become NaN and are
        # ignored by the means below
        e = predDF - actual
        mse = (e ** 2).mean()
        rmse = np.sqrt(mse)

        pct = e / actual.replace(0, np.nan)
        rmspe = np.sqrt((pct ** 2).mean()) * 100
        mape = pct.abs().mean() * 100
        print(f" MSE:{np.round(mse,2)}\n RMSE:{np.round(rmse,2)}\n MAPE:{np.round(mape,2)}%\n RMSPE:{np.round(rmspe,2)}%")
        


#### Ex: Store#104

In [None]:
dt, endogTrain, exogTrain, endogTest, exogTest = Model.preWithZero(104)

In [None]:
dt, endogTrain, exogTrain, endogTest, exogTest = Model.preWithoutZero(104)
Model.statTests(endogTrain['Sales'])

In [None]:

plot_acf(endogTrain['Sales']);
plot_pacf(endogTrain['Sales']);

In [None]:
seasonal_decompose(endogTrain.Sales, period=15).plot();

In [None]:
seasonal_decompose(endogTrain.Sales, period=313).plot();

## Naive Benchmark

In [None]:
# Naive benchmark: repeat the last observed training value of Sales over the
# whole test horizon. The value is selected by column name instead of by
# position, so it does not depend on Sales being the first column.
p = endogTrain['Sales'].iloc[-1]
predNaive = np.repeat(p, len(endogTest))
plt.plot(dt[-50:], endogTest['Sales'][-50:], c='g')
plt.plot(dt[-50:], predNaive[-50:], c='r')
plt.legend(labelcolor=['green', 'red'], labels=['Actual', 'Forecast'])

In [None]:
Model.metrics(predNaive)

## Holt-Winters

In [None]:
modelHW = ExponentialSmoothing(endogTrain['Sales'], 
                               seasonal_periods=7, 
                               trend='add', seasonal='add',
                              initialization_method="heuristic",).fit()
# modelHW.summary()
predHW = modelHW.forecast(len(exogTest))

In [None]:
plt.plot(dt[-len(endogTest):], endogTest['Sales'], c='g')
plt.plot(dt[-len(endogTest):], predHW, c='r')
plt.legend(labelcolor=['green', 'red'], labels=['Actual', 'Forecast'])

In [None]:
Model.metrics(predHW)

## SARIMAX 

##### Commented as it takes a lot of time

In [None]:
# modelSarimax = SARIMAX(endog=endogTrain['Sales'], exog=exogTrain, order=(4,0,4),
#                       seasonal_order=(4,0,4,7)).fit(disp=False)
# modelSarimax.plot_diagnostics();

In [None]:
# preds = Model.plotPred(modelSarimax)

In [None]:
# Model.metrics(preds)

## VARMAX model

In [None]:
# df = optimizeVAR(endog, exog)  #### takes close to 20 mins

In [None]:
warnings.filterwarnings('ignore')
model = VARMAX(endogTrain, exogTrain, order=(4,4), enforce_stationarity=True).fit(disp=False) ### Not feasble one 
model.plot_diagnostics();

In [None]:
preds = Model.plotPred(model)

In [None]:
Model.metrics(preds['Sales'])

### Conclusion

Only exponential smoothing shows promise.
Uncomment the cell below to make predictions using exponential smoothing.

In [None]:
# stores   = list(store.Store.unique())
# allPreds = test[['Id', 'Store', 'Date']]
# allPreds['pred'] = 0
# dt = list(test.Date.unique())
# dt.sort()
# pdf = pd.DataFrame(columns=['Date', 'Store', 'preds'])

# for st in tqdm_notebook(stores):
    
#     _, endogTrain, _, _, _ = Model.preWithZero(st)
    
#     modelHW = ExponentialSmoothing(endogTrain['Sales'], 
#                                seasonal_periods=7, 
#                                trend='add', seasonal='add',
#                               initialization_method="heuristic").fit()

#     predHW = modelHW.forecast(48)
    
#     ndf = pd.DataFrame(columns=['Date', 'Store', 'preds'])
#     ndf['Date']  = dt
#     ndf['preds'] = predHW.values
#     ndf['Store'] = st
    
#     pdf = pdf.append(ndf)
    
# subDF = pd.merge(test,pdf,on=['Date','Store'],how='left')
# subDF['preds'] = np.where(subDF['Open']==0,0,subDF['preds'])    
    

# Forecast using FB Prophet

In [None]:
from fbprophet import Prophet
def prophetDataProces(nb):
    
    wa = pd.DataFrame()
    d = test.Date.unique()
    d.sort()
    wa['ds'] = d
    
    dat = train[train['Store']==nb]
    dat = dat.sort_values(by="Date").reset_index(drop=True)
    dat = dat[['Date','Sales']]
    dat.columns = ['ds','y']
    return dat,wa


In [None]:
# Fit one Prophet model per store and forecast the dates present in `test`.
stores   = list(store.Store.unique())

# Per-store forecasts are collected in a list and concatenated once: growing a
# DataFrame inside the loop copies all earlier rows on every iteration, and
# DataFrame.append is no longer available in current pandas.
forecasts = []

for st in tqdm_notebook(stores):

    data, wa = prophetDataProces(st)

    model = Prophet()
    model.fit(data)

    fcast = model.predict(wa)
    fcast['Store'] = st
    fcast = fcast[['ds', 'Store', 'yhat']]
    fcast.columns = ['Date', 'Store', 'preds']
    forecasts.append(fcast)

if forecasts:
    pdf = pd.concat(forecasts, ignore_index=True)
else:
    pdf = pd.DataFrame(columns=['Date', 'Store', 'preds'])

subDF = pd.merge(test,pdf,on=['Date','Store'],how='left')
# a closed store sells nothing, whatever the model predicts
subDF['preds'] = np.where(subDF['Open']==0,0,subDF['preds'])

### Forecast with Neural Network 

#### Baseline: Simple DNN

In [None]:
# import warnings
# warnings.filterwarnings('ignore')
# from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae
# import tensorflow as tf

In [None]:
# ###### Processing data -- Univaraite ##########

# def window(arr, window, horizon):
    
#     idx = np.expand_dims(np.arange(window+horizon), axis=0) + \
#                     np.expand_dims(np.arange(len(arr)-(window+horizon-1)), axis=0).T
#     arrWindowed = arr[idx]  
#     return arrWindowed[:,:-1], arrWindowed[:,-1:]

# dfProcessed = df.drop(['Promo2','PromoInterval','CompetitionSince', 'promo2Since', 'Assortment'], axis=1).sort_values(by='Date')

# df_train = dfProcessed[dfProcessed['Date']<pd.to_datetime('2015-07-01')]
# df_test  =  dfProcessed[dfProcessed['Date']>pd.to_datetime('2015-06-30')]
# colsToScale = ['Sales', 'Customers', 'CompetitionDistance']
# # labelEncode = ['StoreType']
# dfProcessed = df.drop(['Promo2','PromoInterval','CompetitionSince', 'promo2Since', 'Assortment'], axis=1)

# encoder = LabelEncoder()
# encoder.fit(df_train['StoreType'])
# scaler = MinMaxScaler()
# scaler.fit(df_train[colsToScale])

# df_train[colsToScale] = scaler.fit_transform(df_train[colsToScale])
# df_test[colsToScale]  = scaler.fit_transform(df_test[colsToScale])

# df_train['StoreType'] = encoder.fit_transform(df_train['StoreType'])
# df_test['StoreType'] = encoder.fit_transform(df_test['StoreType'])

# trainWindows, trainLabels = window(np.asarray(df_train['Sales']), 7, 1)
# testWindows, testLabels = window(np.asarray(df_test['Sales']), 7, 1)

In [None]:
# ### Multivariate 

# dfProcessed = df.drop(['Customers','Promo2','PromoInterval','CompetitionSince', 'promo2Since', 'Assortment'],
#                       axis=1).sort_values(by='Date')
# dft = dft.drop([ 'Promo2','PromoInterval','CompetitionSince', 'promo2Since', 'Assortment'], axis=1).set_index('Date')
# # dfProcessed = df.drop(['Promo2','PromoInterval','CompetitionSince', 'promo2Since', 'Assortment'], axis=1).sort_values(by='Date')
# dfWindowed = dfProcessed.copy()
# dfWindowed.set_index('Date', inplace=True)



# for i in range(7):
    
#     dfWindowed[f"Sales{i+1}"] = dfWindowed['Sales'].shift(periods=i+1)
    
# dfWindowed.dropna(inplace=True)
# yFeat = ['Sales', 'Sales1', 'Sales2', 'Sales3', 'Sales4', 'Sales5', 'Sales6', 'Sales7']
# y = dfWindowed[yFeat]
# X = dfWindowed.drop(yFeat, axis=1)

# X_train = X[X.index<pd.to_datetime('2015-07-01')]
# X_test  =  X[X.index>pd.to_datetime('2015-06-30')]
# y_train = y[y.index<pd.to_datetime('2015-07-01')]
# y_test  =  y[y.index>pd.to_datetime('2015-06-30')]

# # minMaxed = ['CompetitionDistance']
# # minMaxed = ['Customers', 'CompetitionDistance', 'Sales1', 'Sales2', 'Sales3', 'Sales4', 'Sales5', 'Sales6', 'Sales7']

# encoder = LabelEncoder()
# encoder.fit(X_train['StoreType'])
# X_train['StoreType'] = encoder.transform(X_train['StoreType'])
# X_test['StoreType'] = encoder.transform(X_test['StoreType'])
# dft['StoreType'] = encoder.transform(dft['StoreType'])


# scaler = MinMaxScaler()
# scaler.fit(np.asarray(X_train['CompetitionDistance']).reshape(-1,1))
# X_train['CompetitionDistance'] = scaler.transform(np.asarray(X_train['CompetitionDistance']).reshape(-1,1))
# X_test['CompetitionDistance'] = scaler.transform(np.asarray(X_test['CompetitionDistance']).reshape(-1,1))
# dft['CompetitionDistance'] = scaler.transform(np.asarray(dft['CompetitionDistance']).reshape(-1,1))

# # y_train = y_train.values.reshape(-1,1)
# scaler2 = MinMaxScaler()
# scaler2.fit(y_train[yFeat])
# y_train[yFeat] = scaler2.transform(y_train[yFeat])
# y_test[yFeat] = scaler2.transform(y_test[yFeat])


# # mx = y_train.max()
# # mn = y_train.min()
# # y_train = (y_train - mn) / (mx - mn)
# # y_test = (y_test - mn) / (mx - mn)

In [None]:
# model1 = tf.keras.Sequential()
# model1.add(tf.keras.layers.Dense(64, activation='relu'))
# model1.add(tf.keras.layers.Dense(256, activation='relu'))
# model1.add(tf.keras.layers.Dense(256, activation='relu'))
# model1.add(tf.keras.layers.Dense(8))

# model1.compile(optimizer=tf.keras.optimizers.Adam(),
#               loss=tf.keras.losses.mae,
#               metrics=['mae', 'mse'])

# history1 = model1.fit(X_train, y_train, validation_data=(X_test, y_test),
#                      epochs=3, batch_size=128, verbose=1)

In [None]:
# pd.DataFrame(history1.history).plot()

In [None]:
# preds = model1.predict(X_test)
# preds = preds[:,0]
# y_test = y_test.values[:,0]

In [None]:
# print(f"MSE:{mse(y_test, preds)}")
# f"MSE:{mse(y_test, preds)}, MAE:{mae(y_test, preds)}"

In [None]:
# try:
#     e = np.finfo(float).eps
#     rmspe = np.sqrt(np.mean(np.square((y_test-preds)/y_test+e))) * 100
# except ZeroDivisionError:
#     print("Div by zero")

### LSTM

In [None]:
# model2 = tf.keras.Sequential()
# model2.add(tf.keras.layers.LSTM(64, return_sequences=True))
# model2.add(tf.keras.layers.LSTM(32, return_sequences=False))
# model2.add(tf.keras.layers.Dense(8, activation='linear'))

# model2.compile(optimizer=tf.keras.optimizers.Adam(),
#               loss=tf.keras.losses.mae,
#               metrics=['mae', 'mse'])

# history2 = model2.fit(X_train, y_train, validation_data=(X_test, y_test),
#                      epochs=3, batch_size=128, verbose=1)

In [None]:


#f"MSE:{mse(y_test, predLSTM)}, MAE:{mae(y_test, predLSTM)}"

# Submission

In [None]:
# Kaggle submission file: one row per test Id with the predicted Sales.
# rename() returns a new frame here; renaming in place on a column selection
# of subDF triggers SettingWithCopyWarning.
submission = (
    subDF[['Id', 'preds']]
    .rename(columns={'preds': 'Sales'})
    .set_index('Id')
)
submission.to_csv("sub.csv")

In [None]:
# ids = dft.pop('Id')
# subPred = model1.predict(dft)
# subPred = scaler2.inverse_transform(subPred)
# subPred = subPred[:,0]
# subDF = pd.DataFrame(ids.reset_index(drop=True))
# subDF['Sales'] = subPred
# subDF = subDF.set_index('Id')
# subDF.to_csv("sub.csv")