In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import time
import math
import datetime
from math import log, floor
from sklearn.neighbors import KDTree

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.utils import shuffle
from tqdm.notebook import tqdm as tqdm

import seaborn as sns
from matplotlib import colors
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# import pywt
from statsmodels.robust import mad

import scipy
import statsmodels
from scipy import signal
import statsmodels.api as sm
from fbprophet import Prophet
from scipy.signal import butter, deconvolve
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

import warnings
warnings.filterwarnings("ignore")

# Load data

In [None]:
# Read the three raw competition tables in one pass (same order as before:
# calendar, validation sales, sell prices).
# sales_train_eval = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv')
calendar, sales_train_val, sell_prices = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/m5-forecasting-accuracy/calendar.csv',
        '../input/m5-forecasting-accuracy/sales_train_validation.csv',
        '../input/m5-forecasting-accuracy/sell_prices.csv',
    )
)

In [None]:
# #Add zero sales for the remaining days 1942-1969
# for d in range(1942,1970):
#     col = 'd_' + str(d)
#     sales_train_val[col] = 0
#     sales_train_val[col] = sales_train_val[col].astype(np.int16)

In [None]:
# Memory footprint of each table BEFORE downcasting.
# memory_usage() reports bytes; dividing by 1024*1024 converts to megabytes,
# so the printed unit must be MB (it was previously mislabelled as "byte").
sales_train_val_size = np.round(sales_train_val.memory_usage().sum()/(1024*1024),1)
# sales_train_eval_size = np.round(sales_train_eval.memory_usage().sum()/(1024*1024),1)
calendar_size = np.round(calendar.memory_usage().sum()/(1024*1024),1)
sell_prices_size = np.round(sell_prices.memory_usage().sum()/(1024*1024),1)

print(f'sales_train_val_size   {sales_train_val_size} MB')
print(f'calendar_size          {calendar_size} MB')
print(f'sell_prices_size       {sell_prices_size} MB')

In [None]:
%%time
#Reduce size in order to save memory
def downcast(df):
    """Shrink the numeric columns of ``df`` to the smallest dtype that holds them.

    The frame is modified in place and also returned.

    * Integer columns become int8 / int16 / int32 when every value lies
      strictly inside that type's range, otherwise int64.
    * Float columns become float16 / float32 under the same rule, otherwise
      float64.
    * A text column named ``'date'`` is parsed with the ``%Y-%m-%d`` format.
    * Every other column is left untouched.

    NOTE: float16 keeps only about three significant decimal digits, so
    values such as prices are rounded when they are stored in it.
    """
    int_ladder = (np.int8, np.int16, np.int32)
    float_ladder = (np.float16, np.float32)

    for col, dtype in df.dtypes.items():
        dtype_name = str(dtype)
        if 'int' in dtype_name:
            ladder, limits, widest = int_ladder, np.iinfo, np.int64
        elif 'float' in dtype_name:
            ladder, limits, widest = float_ladder, np.finfo, np.float64
        else:
            # `np.object` was removed from NumPy; compare against the builtin
            # `object` (and accept pandas string dtypes as well).
            is_text = dtype == object or pd.api.types.is_string_dtype(dtype)
            if is_text and col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            continue

        lowest, highest = df[col].min(), df[col].max()
        for candidate in ladder:
            bounds = limits(candidate)
            if lowest > bounds.min and highest < bounds.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # Nothing narrower fits (or the column is all-NaN): use the widest type.
            df[col] = df[col].astype(widest)

    return df

# Downcast the three loaded tables (same order as before) and rebind the names.
# sales_train_eval = downcast(sales_train_eval)
sales_train_val, calendar, sell_prices = (
    downcast(table) for table in (sales_train_val, calendar, sell_prices)
)

In [None]:
# Memory footprint of each table AFTER downcasting.
# memory_usage() reports bytes; dividing by 1024*1024 converts to megabytes,
# so the printed unit must be MB (it was previously mislabelled as "byte").
sales_train_val_size = np.round(sales_train_val.memory_usage().sum()/(1024*1024),1)
# sales_train_eval_size = np.round(sales_train_eval.memory_usage().sum()/(1024*1024),1)
calendar_size = np.round(calendar.memory_usage().sum()/(1024*1024),1)
sell_prices_size = np.round(sell_prices.memory_usage().sum()/(1024*1024),1)

print(f'sales_train_val_size   {sales_train_val_size} MB')
print(f'calendar_size          {calendar_size} MB')
print(f'sell_prices_size       {sell_prices_size} MB')

# Data Information

<img src="https://i0.wp.com/mofc.unic.ac.cy/wp-content/uploads/2020/01/diagram.png?fit=1276%2C705&ssl=1" width="1200">


# Data Preparation

## Amount Sale by State

In [None]:
# Treemap levels from the root label down to the department.
hierarchy = ['walmart', 'state_id', 'store_id', 'cat_id', 'dept_id']

# Number of item rows per state/store/category/department, plus a constant
# root label so the treemap has a single top-level node.
group = (
    sales_train_val
    .groupby(hierarchy[1:], as_index=False)['item_id']
    .count()
    .dropna()
    .assign(walmart='Walmart Distribution')
)

fig = px.treemap(
    group,
    path=hierarchy,
    values='item_id',
    color='item_id',
    color_continuous_scale='RdBu',
    title='Walmart: Distribution of items',
)

fig.update_layout()
fig.show()

In [None]:
# Names of the per-day sales columns (d_1, d_2, ...).
d_col = [c for c in sales_train_val.columns if 'd_' in c]

# Sales with days as rows and one column per series id.
sales_by_day = sales_train_val.set_index('id')[d_col].T
calendar_by_day = calendar.set_index('d')

# One row per day: calendar attributes followed by every series' sales.
df = calendar_by_day.merge(sales_by_day, left_index=True, right_index=True, validate='1:1')

# Re-key by calendar date, keeping the day label as an ordinary column 'd'.
df = df.reset_index().rename(columns={'index':'d'}).set_index('date')

In [None]:
# 30-day rolling mean of total daily sales, one line per state.
state_list = sales_train_val.state_id.unique() #['CA', 'TX', 'WI']
means = []
fig = go.Figure()
for s in state_list:
    # Select the state's series by id lookup. A substring test on the state
    # code is not safe here: df also carries the calendar columns, and any of
    # them whose name contains the code (in the M5 calendar: snap_CA, snap_TX,
    # snap_WI) would be summed into the sales total.
    state_items = sales_train_val.loc[sales_train_val.state_id == s, 'id'].tolist()
    data = df[state_items].sum(axis=1).rolling(30).mean()
    fig.add_trace(go.Scatter(x=data.index, y=data, name=s)) #plot each state CA, TX ...

fig.update_layout(yaxis_title="Sales", xaxis_title="Time", title="Rolling 30 days Average Sales vs. Time (per state)")

In [None]:
# Distribution of the 30-day rolling mean of total daily sales, one box per state.
fig = go.Figure()

for s in state_list:
    # Select the state's series by id lookup. A substring test on the state
    # code would also match calendar columns whose name contains it (in the
    # M5 calendar: snap_CA, snap_TX, snap_WI) and add them into the total.
    state_items = sales_train_val.loc[sales_train_val.state_id == s, 'id'].tolist()
    data = df[state_items].sum(axis=1).rolling(30).mean()
    fig.add_trace(go.Box(x=[s]*len(data), y=data, name=s))

# The x axis enumerates states, not time.
fig.update_layout(yaxis_title="Sales", xaxis_title="State", title="Rolling 30 days Average Sales vs. State")

* CA has significantly higher sales than the other states because CA has 4 stores while the others have only 3.

In [None]:
# 30-day rolling mean of total daily sales, one line per store; the overall
# daily mean of each store is collected in `means`.
store_list = sales_train_val.store_id.unique() #['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
means = []
fig = go.Figure()
for store_name in store_list:
    # Columns whose name contains the store id, e.g. HOBBIES_1_001_WI_3_validation
    store_items = [col for col in df.columns if store_name in col]
    daily_total = df[store_items].sum(axis=1)
    means.append(np.mean(daily_total))
    smoothed = daily_total.rolling(30).mean()
    fig.add_trace(go.Scatter(x=smoothed.index, y=smoothed, name=store_name))

fig.update_layout(yaxis_title="Sales", xaxis_title="Time", title="Rolling Average 30 days Sales vs. Time (per store)")

* Sales of WI_2 increased drastically in 2012, and by 2016 it had the 2nd highest sales.

In [None]:
# Distribution of the 30-day rolling mean of total daily sales, one box per store.
fig = go.Figure()

for s in store_list:
    # Columns whose name contains the store id, e.g. HOBBIES_1_001_WI_3_validation
    store_items = [c for c in df.columns if s in c]
    data = df[store_items].sum(axis=1).rolling(30).mean()
    fig.add_trace(go.Box(x=[s]*len(data), y=data, name=s))

# The x axis enumerates stores, not time.
fig.update_layout(yaxis_title="Sales", xaxis_title="Store", title="Rolling 30 days Average Sales vs. Store")

In [None]:
# 30-day rolling mean of total daily sales, one line per category; the overall
# daily mean of each category is collected in `means`.
cat_list = sales_train_val.cat_id.unique() #['HOBBIES', 'HOUSEHOLD', 'FOODS']
means = []
fig = go.Figure()
for category in cat_list:
    # Columns whose name contains the category, e.g. HOBBIES_1_001_WI_3_validation
    cat_items = [col for col in df.columns if category in col]
    daily_total = df[cat_items].sum(axis=1)
    means.append(np.mean(daily_total))
    smoothed = daily_total.rolling(30).mean()
    fig.add_trace(go.Scatter(x=smoothed.index, y=smoothed, name=category))

fig.update_layout(yaxis_title="Sales", xaxis_title="Time", title="Rolling Average 30 Sales vs. Time (per category)")

In [None]:
# Distribution of the 30-day rolling mean of total daily sales, one box per category.
fig = go.Figure()
means = []
for s in cat_list:
    # Columns whose name contains the category, e.g. HOBBIES_1_001_WI_3_validation
    cat_items = [c for c in df.columns if s in c]
    data = df[cat_items].sum(axis=1).rolling(30).mean()
    fig.add_trace(go.Box(x=[s]*len(data), y=data, name=s))

# The x axis enumerates categories, not time.
fig.update_layout(yaxis_title="Sales", xaxis_title="Category", title="Rolling 30 days Average Sales vs. category")

In [None]:
# 30-day rolling mean of total daily sales, one line per department; the
# overall daily mean of each department is collected in `means`.
dept_list = sales_train_val.dept_id.unique() #['HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS_1', 'FOODS_2', 'FOODS_3']
fig = go.Figure()
means = []
for department in dept_list:
    # Columns whose name contains the department, e.g. HOBBIES_1_001_WI_3_validation
    dept_items = [col for col in df.columns if department in col]
    daily_total = df[dept_items].sum(axis=1)
    means.append(np.mean(daily_total))
    smoothed = daily_total.rolling(30).mean()
    fig.add_trace(go.Scatter(x=smoothed.index, y=smoothed, name=department))

fig.update_layout(yaxis_title="Sales", xaxis_title="Time", title="Rolling Average 30 Sales vs. Time (per department)")

In [None]:
# Distribution of the 30-day rolling mean of total daily sales, one box per department.
fig = go.Figure()
means = []
for s in dept_list:
    # Columns whose name contains the department, e.g. HOBBIES_1_001_WI_3_validation
    dept_items = [c for c in df.columns if s in c]
    data = df[dept_items].sum(axis=1).rolling(30).mean()
    fig.add_trace(go.Box(x=[s]*len(data), y=data, name=s))

# The x axis enumerates departments, not time.
fig.update_layout(yaxis_title="Sales", xaxis_title="Department", title="Rolling 30 days Average Sales vs. Department")

In [None]:
# Names of the per-day sales columns (d_1, d_2, ...).
d_col = [c for c in sales_train_val.columns if 'd_' in c]

# Total units sold per store and day (rows: stores, columns: days).
store = sales_train_val.groupby('store_id')[d_col].sum()

# Days as rows, one column per store. Transposing the grouped frame directly
# keeps the sales numeric and takes the column labels from the data; the
# previous reset_index/transpose round-trip mixed the store_id strings into
# the values (object dtype) and relabelled the columns positionally from a
# hard-coded list.
ts = store.T.rename_axis(index='d', columns=None)

# Attach the calendar attributes of each day.
ts = ts.merge(calendar.set_index('d'), left_index=True, right_index=True, validate='1:1')

# State totals and the overall total.
ts['CA'] = ts.CA_1 + ts.CA_2 + ts.CA_3 + ts.CA_4
ts['TX'] = ts.TX_1 + ts.TX_2 + ts.TX_3
ts['WI'] = ts.WI_1 + ts.WI_2 + ts.WI_3
ts['sale'] = ts.CA + ts.TX + ts.WI
ts

In [None]:
# Mean daily sales per calendar month, one line per state.
fig = go.Figure()

states = sales_train_val.state_id.unique().tolist()
ts['CA'] = ts['CA'].astype('int64')
ts['TX'] = ts['TX'].astype('int64')
ts['WI'] = ts['WI'].astype('int64')

# Select several columns from a groupby with a LIST (double brackets): the
# tuple form ['CA','TX','WI'] is rejected by pandas >= 2.0. The aggregate is
# also computed once instead of twice per trace.
monthly_mean = ts.groupby('month')[['CA', 'TX', 'WI']].mean()

for i in states:
    fig.add_trace(go.Scatter(x=monthly_mean.index, y=monthly_mean[i],
                        mode='lines+markers',
                        name=i))
fig.update_layout(yaxis_title="Mean daily sales", xaxis_title="Month", title="Mean daily sales per month (per state)")
fig.show()

In [None]:
# Mean daily sales per weekday, one line per state, ordered Saturday..Friday.
# Select several columns from a groupby with a LIST (double brackets): the
# tuple form ['CA','TX','WI'] is rejected by pandas >= 2.0.
tw = ts.groupby('weekday')[['CA', 'TX', 'WI']].mean().reindex(['Saturday', 'Sunday', 'Monday','Tuesday','Wednesday','Thursday', 'Friday'])

fig = go.Figure()

states = sales_train_val.state_id.unique().tolist()

for i in states:
    fig.add_trace(go.Scatter(x=tw.index, y=tw[i],
                        mode='lines+markers',
                        name=i))
fig.update_layout(yaxis_title="Mean daily sales", xaxis_title="Weekday", title="Mean daily sales per weekday (per state)")
fig.show()

In [None]:
# Total daily sales over all three states.
# (The make_subplots figure that used to be created first was overwritten
# immediately and has been dropped; the trace name typo "Walart" is fixed.)
fig = go.Figure()

fig.add_trace(go.Scatter(x=ts.date, y=ts.sale,
                        mode='lines',
                        name='Sales of total state of Walmart'))
fig.show()

In [None]:
# Five stacked box plots of total daily sales, each grouped by one calendar attribute.
fig = plt.figure(figsize=(20, 35))

grouping_columns = ['year', 'month', 'weekday', 'event_type_1', 'event_name_1']
panels = []
for position, column in enumerate(grouping_columns, start=1):
    panel = fig.add_subplot(5, 1, position)
    sns.boxplot(data=ts, x=column, y=ts.sale, ax=panel)
    panels.append(panel)
ax1, ax2, ax3, ax4, ax5 = panels

# Event names are long: rotate the tick labels of the last panel.
ax5.tick_params(axis='x', labelrotation=90)
# ax5.set_xticklabels(xlabels, rotation=90 )
plt.show()

In [None]:
# Share of days per primary event type (days without an event are not counted
# by value_counts).
events_1_data = ts['event_type_1'].value_counts()
fig, ax = plt.subplots(figsize=(11, 5))
pie_options = dict(labels=events_1_data.index, shadow=True, radius=1, autopct='%1.1f%%')
ax.pie(x=events_1_data.values, **pie_options)
ax.set_title('Distribution of Type 1 Events')
plt.show()

In [None]:
# One figure per calendar year: total daily sales as a line, with the days
# that carry a type 1 event marked in red. (Title typo "Walart" fixed.)
for year in ts.year.unique():
    t = ts[ts.year == year]
    fig = plt.figure(figsize=(20, 6))
    ax = fig.add_subplot()
    # Rows of this year that have a type 1 event.
    t_event_1 = t.loc[t.event_type_1.notnull()]
    ax.plot(t.date, t.sale)
    ax.scatter(t_event_1.date,
               t_event_1.sale,
               color='red',
               label='Type 1 Event')
    # One x tick every 30 rows (days).
    ax.set_xticks(t.date.values[::30])
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales')
    ax.grid()
    ax.set_title(f'Sales of total state of Walmart for {year}')
    ax.legend()
    plt.show()

# Analysis

`sales_train_validation` Dataset is our train data set: [D1 - D1913].

`sales_train_evaluation` Dataset is the data used to evaluate our models; in addition to the training days it contains [D1914 - D1941].

In [None]:
# Read the evaluation sales and a fresh (un-downcast) copy of the calendar,
# in the same order as before.
sales_train_eval, calendar = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/m5-forecasting-accuracy/sales_train_evaluation.csv',
        '../input/m5-forecasting-accuracy/calendar.csv',
    )
)

In [None]:
# Length of the forecast horizon in days (D1914-D1941).
forecast_period = 28

# The horizon is the last `forecast_period` columns of the evaluation table (d_1914-d_1941).
all_eval_columns = sales_train_eval.columns.tolist()
d_fcst_columns = all_eval_columns[-forecast_period:]

In [None]:
def get_ground_truth(idx, df, d_fcst_columns):
    """Return the values of row ``idx`` (an index label of ``df``) restricted
    to the columns ``d_fcst_columns``, as a NumPy array."""
    horizon_values = df.loc[idx, d_fcst_columns]
    return horizon_values.values

def plot_results(fcst, y_eval, rmse, algo, item):
    """Draw the forecast (red) against the ground truth (blue) in one figure.

    ``algo``, ``item`` and ``rmse`` are only used to build the title.
    """
    fig, ax = plt.subplots(figsize=(11, 5))
    series_to_draw = (
        (fcst, 'red', 'Forecast'),
        (y_eval, 'blue', 'Ground Truth'),
    )
    for values, line_color, legend_label in series_to_draw:
        ax.plot(values, color=line_color, label=legend_label)
    ax.set_title(f' {algo} for {item}, RMSE: {rmse}')
    ax.grid()
    ax.legend()
    plt.show()

In [None]:
# Per-series mean over the d_col days, appended as 'd_val' to a copy of the
# evaluation table from which those day columns have been removed.
mean_daily_sales = sales_train_eval[d_col].mean(axis=1)

choice_data = sales_train_eval.drop(columns=d_col).assign(d_val=mean_daily_sales)

In [None]:
# Sample (reproducibly) one series with a mean of at least 50 and one with a
# mean in (1, 5].
high_volume = choice_data['d_val'] >= 50
low_volume = (choice_data['d_val'] <= 5) & (choice_data['d_val'] > 1)

idx1 = choice_data.loc[high_volume].sample(n=1, random_state=1).index
idx2 = choice_data.loc[low_volume].sample(n=1, random_state=1).index

# Plain list of the two sampled index labels.
idx = idx1.tolist()
idx.extend(idx2.tolist())

In [None]:
# `idx` holds index LABELS (taken from choice_data.index), so select the rows
# with .loc. Positional .iloc only returned the same rows because the frame
# has its default 0..n-1 index.
ts_test = sales_train_eval.loc[idx]
test_items = ts_test.id.unique().tolist()
# Drop the trailing '_evaluation' suffix from each id.
test_items = [x[:-len('_evaluation')] for x in test_items]

In [None]:
# Display the rows of the evaluation table selected for the experiments.
ts_test

In [None]:
# Display the ids of the selected series (with the trailing suffix removed).
test_items

In [None]:
# Summary table for the RMSE results: one row per selected series, keyed by
# its index label.
rmse_summary = pd.DataFrame(test_items, index=idx, columns=["items"])

# ARIMA (AutoRegressive Integrated Moving Average)

* ARIMA: Non-Seasonal.
* SARIMA: Seasonal ARIMA.
* SARIMAX: Seasonal ARIMA with eXogenous variables.

In [None]:
from matplotlib.gridspec import GridSpec
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_components_plotly, add_changepoints_to_plot
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
# import calendar

In [None]:
def get_ts_example(data, d_cols, calendar_data, item_id, store_id, idx=None):
    """Extract one sales series as a long frame joined with the calendar.

    The series is the row with index label ``idx`` when ``idx`` is given,
    otherwise the row(s) matching ``item_id`` and ``store_id``. Only the
    ``d_cols`` columns are kept; they become rows with columns ``d`` and
    ``sales`` (cast to int) and are passed to ``merge_with_calendar``.

    Any exception is caught and printed, in which case ``None`` is returned.
    """
    try:
        if idx is not None:
            series_frame = data.loc[idx][d_cols].reset_index()
        else:
            matches = data.loc[(data['item_id'] == item_id) & (data['store_id'] == store_id)]
            series_frame = matches[d_cols].T.reset_index()
        series_frame.columns = ['d', 'sales']
        # Make sure that sales column's type is int
        series_frame["sales"] = series_frame["sales"].astype("int")
        return merge_with_calendar(series_frame, calendar_data)
    except Exception as e:
        print(f'Can not extract time series: {e}')
        return None
         
def merge_with_calendar(data, calendar_data):
    """Join ``data`` with the calendar on the day key ``d``.

    Parameters
    ----------
    data : DataFrame with a column ``d``.
    calendar_data : DataFrame holding at least ``d``, ``date``, ``wm_yr_wk``,
        ``weekday``, ``wday``, ``month``, ``year`` and the four event columns.

    Returns
    -------
    DataFrame with the selected calendar columns followed by the remaining
    columns of ``data`` (inner join on ``d``). Missing values in the event
    columns are replaced by the string ``'None'``. Neither input is modified.

    Raises
    ------
    AssertionError if ``data`` has no column ``d``.
    """
    # data should have a date column "d"
    assert 'd' in data.columns, 'DataFrame should have a column "d" !'
    event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    # Merge With Calendar
    cal = calendar_data[['d', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year'] + event_cols]
    d = pd.merge(cal, data, on="d")
    # Fill Missing Event Values with None. Assign the result back instead of
    # calling fillna(inplace=True) on d[col]: that chained in-place form acts on
    # a temporary object and does nothing under pandas Copy-on-Write.
    d[event_cols] = d[event_cols].fillna('None')
    return d

In [None]:
def choose_sarimax_order_and_forecast(ps,ds,qs, y_train, y_eval):
    """Grid-search the SARIMAX (p, d, q) order and return the best forecast.

    For every combination from ``ps`` x ``ds`` x ``qs`` a SARIMAX model with a
    constant trend is fitted on ``y_train``, the next ``len(y_eval)`` steps are
    predicted and rounded to integers, and the RMSE against ``y_eval`` is
    computed. The order with the lowest RMSE wins (the first one on ties).

    Note that the order is selected on ``y_eval`` itself, so the returned RMSE
    is a best case for that horizon.

    Returns
    -------
    (best_rmse, best_forecast) : RMSE rounded to 3 decimals and the rounded
        forecast as a list, or ``(None, None)`` when no order produced a valid
        result (including an empty grid).
    """
    best_rmse, best_order, best_forecast = None, None, None
    first_step = len(y_train)
    last_step = len(y_train) - 1 + len(y_eval)
    for p in ps:
        for d in ds:
            for q in qs:
                order = (p,d,q)
                # Fit, predict and score inside one try: a single order that
                # fails to fit or yields non-finite forecasts is reported and
                # skipped instead of aborting the whole search.
                try:
                    model = sm.tsa.SARIMAX(y_train,
                                   order=order,
                                   trend='c',
                                   enforce_invertibility=False,
                                   enforce_stationarity=False).fit(disp=False, warn_convergence=False)
                    fcst = [round(x) for x in model.predict(start=first_step, end=last_step)]
                    rmse = round(np.sqrt(mean_squared_error(y_eval, fcst)), 3)
                except Exception as e:
                    print(f'For order={order}, model results are invalid: {e}')
                    continue
                if (best_rmse is None) or (rmse < best_rmse):
                    best_rmse, best_order, best_forecast = rmse, order, fcst
    print(f"Best Order: {best_order}")
    return best_rmse, best_forecast

In [None]:
# Training series for the first selected item (idx[0]),
# e.g. FOODS_3_586_CA_3_evaluation.
first_series = get_ts_example(ts_test, d_col, calendar, item_id=None, store_id=None, idx=idx[0])

# df0['date'] = df0['date'].apply(lambda x : pd.to_datetime(x))

# Keep only the date and the sales value.
df0 = first_series.loc[:, ['date', 'sales']]

# Sales as a plain NumPy array for the models below.
y_train = df0["sales"].values

df0

In [None]:
# Display the training sales array extracted above.
y_train

In [None]:
# Additive decomposition of the training series with a period of 365 observations.
decompose_options = dict(model='additive', period=365)
result = seasonal_decompose(y_train, **decompose_options)

fig = result.plot()

In [None]:
# Autocorrelation (top) and partial autocorrelation (bottom) of the training
# series for the first 30 lags. (Title typo "Autocorreation" fixed.)
fig, ax = plt.subplots(2,1,figsize=(20,7))

sm.tsa.graphics.plot_acf(y_train, lags=30, ax=ax[0])

ax[0].set_title('Autocorrelation Function: lags=30')

sm.tsa.graphics.plot_pacf(y_train, lags=30, ax=ax[1])

ax[1].set_title('Partial Autocorrelation Function: lags=30')

plt.show()

In [None]:
# Autocorrelation of the training series over all lags, drawn by pandas.
fig, ax = plt.subplots(figsize=(20, 7))
pd.plotting.autocorrelation_plot(series=y_train, ax=ax)
plt.show()

In [None]:
# Specify a (7, 1, 7) SARIMAX model with a constant trend, without enforcing
# stationarity or invertibility, then fit it quietly.
sarimax_spec = sm.tsa.SARIMAX(
    y_train,
    order=(7,1,7),
    trend='c',
    enforce_invertibility=False,
    enforce_stationarity=False,
)
sarimax_model = sarimax_spec.fit(disp=False, warn_convergence=False)
sarimax_model.summary()

In [None]:
# Observed values of the first selected series over the forecast columns.
y_eval = get_ground_truth(idx=idx[0], df=sales_train_eval, d_fcst_columns=d_fcst_columns)
y_eval

In [None]:
# Out-of-sample prediction covering exactly len(y_eval) steps after the
# training data.
first_step = len(y_train)
last_step = len(y_train) - 1 + len(y_eval)
fcst = sarimax_model.predict(start=first_step, end=last_step)
fcst

In [None]:
# Root mean squared error between forecast and ground truth, 3 decimals.
squared_error = mean_squared_error(fcst, y_eval)
rmse = round(np.sqrt(squared_error), 3)
rmse

In [None]:
# Forecast vs. ground truth for the first selected series.
plot_results(fcst=fcst, y_eval=y_eval, rmse=rmse, algo="SARIMAX", item=test_items[0])

In [None]:
# SARIMAX parameter grid: p in 1..7, d in 0..1, q in 0..7.
ps, ds, qs = range(1, 8), range(0, 2), range(0, 8)

In [None]:
# Grid-search SARIMAX for every selected series and collect the best RMSE of each.
rmse_sarimax = []
for i, ix in enumerate(idx):
    item_name = test_items[i]

    print(f"Processing {item_name}...")

    # Training series: sales joined with the calendar, dates parsed
    df0 = get_ts_example(ts_test, d_col, calendar, item_id=None, store_id=None, idx=ix)
    df0['date'] = pd.to_datetime(df0['date'])
    df0 = df0.loc[:, ['date', 'sales']]
    y_train = df0["sales"].values

    # Observed values over the forecast columns
    y_eval = get_ground_truth(ix, sales_train_eval, d_fcst_columns)

    # Best order, its RMSE and its forecast
    rmse, fcst = choose_sarimax_order_and_forecast(ps, ds, qs, y_train, y_eval)
    print(f'rmse: {rmse}')

    # Plot, then record the score
    plot_results(fcst, y_eval, rmse, "SARIMAX", item_name)
    rmse_sarimax.append(rmse)

In [None]:
# NOTE: this rebinds d_col to the day columns of the EVALUATION table.
d_col = [c for c in sales_train_eval.columns if 'd_' in c]

# Total daily sales per state, days as rows with the day label in column 'd'.
state_daily = (
    sales_train_eval.groupby(['state_id'])[d_col].sum()
    .T.reset_index()
    .rename(columns={'index': 'd'})
)

# Keep the last `forecast_period` days. Take an explicit copy so that the
# 'sale' column below is added to an independent frame rather than to a slice
# of the merge result.
eval_data = pd.merge(calendar, state_daily, on="d").iloc[-forecast_period:].copy()
eval_data['sale'] = eval_data.CA+eval_data.TX+eval_data.WI
eval_data.head()

In [None]:
# Grid-search SARIMAX for each state total and for the overall total,
# collecting the best RMSE of each.
states_rmse = []
states = ['CA', 'TX', 'WI', 'sale']

for i in states:

    print(f"Processing {i}...")

    # Training values come from `ts`, ground truth from `eval_data`
    y_train = ts[i].to_numpy().astype(int)
    y_eval = eval_data[i].to_numpy()

    best = choose_sarimax_order_and_forecast(ps, ds, qs, y_train, y_eval)
    rmse, fcst = best
    print(f'rmse: {rmse}')

    # Plot, then record the score
    plot_results(fcst, y_eval, rmse, "SARIMAX", i)
    states_rmse.append(rmse)