In [None]:
#Import Important libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import plotly.express as px
from fbprophet.plot import plot_plotly, plot_components_plotly
from fbprophet import Prophet
from statsmodels.tsa.seasonal import seasonal_decompose as sd
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import missingno as msno
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from datetime import timedelta
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the input CSV files; the `Date` column is parsed to datetime where it is requested.
# The directory is spelled once so the notebook can be pointed elsewhere in a single place.
DATA_DIR = "../input/walmart-sales-prediction"

features = pd.read_csv(f"{DATA_DIR}/features.csv", parse_dates=['Date'])
stores = pd.read_csv(f"{DATA_DIR}/stores.csv")
train = pd.read_csv(f"{DATA_DIR}/train.csv", parse_dates=['Date'])
test = pd.read_csv(f"{DATA_DIR}/test.csv", parse_dates=['Date'])

In [None]:
# Display the first 3 rows

print(features.head(3))
print('\n')
print(stores.head(3))
print('\n')
print(train.head(3))

In [None]:
# Display the dataset shape

print(features.shape)
print(stores.shape)
print(train.shape)

In [None]:
# Attach the feature rows and then the store metadata to every training row.
# No `on=` is given, so each merge joins on whatever column names the two frames share;
# `how="left"` keeps every row of `train` even when no match is found.
tdf = (
    train
    .merge(features, how="left")
    .merge(stores, how="left")
)

In [None]:
# Merged data head

tdf.head(5)

In [None]:
# Display general information

tdf.info()

In [None]:
# Data Description

tdf.describe().transpose()

In [None]:
# Percentage of missing Values

tdf.isna().sum()/len(tdf)*100

In [None]:
# Visualize our missing data

msno.bar(tdf, color="dodgerblue")
plt.show()

# IMPUTING MISSING DATA

In [None]:
# According to the missing-value summary above, the gaps are in the MarkDown columns only
# (quantitative variables). We impute them with 0, read as "no markdown was applied".
# NOTE(review): fillna(0) fills *every* column of tdf, not just the markdowns -- confirm that
# no other column has missing values before relying on this.

tdf= tdf.fillna(0)
# Re-draw the completeness chart to confirm nothing is missing any more
msno.bar(tdf, color="dodgerblue")
plt.show()

In [None]:
# Correlation matrix
# Only numeric and boolean columns are handed to corr(): pandas 2.x no longer drops
# non-numeric columns silently and raises when it meets one.
numeric_tdf = tdf.select_dtypes(include=["number", "bool"])

plt.figure(figsize= (15,10))
sns.heatmap(numeric_tdf.corr(), annot= True, cmap= 'coolwarm')

In [None]:
# DISTRIBUTION OF THE DEPENDENT VARIABLE

plt.figure(figsize=(20,5))
sns.distplot(tdf['Weekly_Sales'], bins=40, kde=True, color='red')
plt.title('Weekly_Sales distribution')
plt.show()

In [None]:
# Sales by different variables: one scatter panel per candidate driver of Weekly_Sales.
# Each entry is (column in tdf, panel title); panels are filled row by row.
panels = [
    ('Temperature', 'Weekly_Sales by temperature'),
    ('Fuel_Price', 'Weekly_Sales by fuel price'),
    ('CPI', 'Weekly_Sales by CPI'),
    ('IsHoliday', 'Weekly_Sales in holidays and not holidays'),
]

fig, ax = plt.subplots(2, 2, figsize= (10,10))
# ax.flat walks the 2x2 grid in row-major order: [0,0], [0,1], [1,0], [1,1]
for panel_ax, (column, title) in zip(ax.flat, panels):
    panel_ax.scatter(tdf[column], tdf['Weekly_Sales'])
    panel_ax.set_title(title)
plt.show()

 # Modeling

## Decomposing Time Series Data into Trend and Seasonality
A time series can be viewed as a combination of four components. Every series has a level and noise; the trend and seasonality components are optional.
* Level: The average value in the series.
* Trend: The increasing or decreasing value in the series.
* Seasonality: The repeating short-term cycle in the series.
* Noise: The random variation in the series.

In [None]:
ts=train.groupby("Date")["Weekly_Sales"].sum()

In [None]:
# Visualize Residuals, Seasonal, Trend, and level

res = sm.tsa.seasonal_decompose(ts.values,period=52,model="multiplicative")
res.plot()
plt.show()

Here we can see a very small increasing trend and an obvious seasonality.

# Model 1: Prophet

In [None]:
# Display the top rows in ts
ts.head()

In [None]:
# Visualize the weekly sales by year: one trace per calendar year, plotted against the
# ISO week number so the years overlay each other.
fig = go.Figure()
# "YS" = year-start frequency; consecutive entries delimit one calendar year each.
years = pd.date_range("2010-01-01","2013-01-01", freq="YS").tolist() # range dates by year
for year_start, next_year_start in zip(years, years[1:]):
    # Label-based slice on the date index; both end points are inclusive.
    ts_year = ts[year_start:next_year_start]
    fig.add_trace(
        go.Scatter(
            y=ts_year.values,
            # DatetimeIndex.week was removed in pandas 2.0; isocalendar().week is its replacement.
            x=ts_year.index.isocalendar().week.astype(int).to_numpy(),
            name=year_start.year,
        ))

fig.update_layout(
    title="weekly sales by year",
    xaxis_title="weeks",
    yaxis_title="sales",
    legend_title="year",
    yaxis_tickprefix = '$',
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ))
fig.show()

In [None]:
# Reset index
p_ts = ts.reset_index()
p_ts.columns = ["ds","y"]
p_ts.head()

In [None]:
# Fitting Prophet Model
m = Prophet(yearly_seasonality = True)
m.fit(p_ts)

In [None]:
# Build the frame to predict on: the training dates plus 26 further weekly steps.
# A bare "W" means weeks anchored on Sunday. Inferring the frequency from the observed
# dates keeps the future rows on the same weekday as the data; when pandas cannot infer
# a regular frequency (infer_freq returns None) we fall back to the previous "W".
future_freq = pd.infer_freq(p_ts["ds"]) or 'W'
future = m.make_future_dataframe(periods=26, freq=future_freq)
future.tail()

In [None]:
# Predict future sales
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Shape of the in-sample part of the forecast: the first len(p_ts) rows are the rows that
# were fitted on (length taken from the data instead of a literal 143).
forecast[:len(p_ts)].shape

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# In-sample MAPE of the Prophet fit. The history rows come first in `forecast`, followed by
# the future rows, so the first len(p_ts) predictions line up with the observed values.
# The length is taken from p_ts rather than hard-coded so the slice stays right if the
# history changes. Note: the scikit-learn function returns a fraction, not a percentage.
mean_absolute_percentage_error(p_ts['y'], forecast['yhat'][:len(p_ts)])

In [None]:
# Visualize the forcasted sales
plot_plotly(m, forecast)

In [None]:
# Visualize the components
plot_components_plotly(m, forecast)

# Model 2: SARIMA

In [None]:
def tsplot(ts, lags):
    """Draw a series together with its ACF and PACF.

    The top panel spans the full width and shows the series itself; its title carries the
    p-value of the augmented Dickey-Fuller test. The bottom row holds the autocorrelation
    (left) and partial autocorrelation (right) plots.

    ts   -- series to inspect; must offer ``.plot(ax=...)``.
    lags -- number of lags shown in both correlograms.
    """
    grid_shape = (2, 2)
    with plt.style.context("bmh"):
        fig = plt.figure(figsize=(12, 7))
        series_axis = plt.subplot2grid(grid_shape, (0, 0), colspan=2)
        acf_axis = plt.subplot2grid(grid_shape, (1, 0))
        pacf_axis = plt.subplot2grid(grid_shape, (1, 1))

        ts.plot(ax=series_axis)
        # adfuller returns a tuple; element 1 is the p-value
        adf_p_value = sm.tsa.stattools.adfuller(ts)[1]
        series_axis.set_title(f'Dickey-Fuller: p={adf_p_value:.5f}')

        smt.graphics.plot_acf(ts, lags=lags, ax=acf_axis)
        smt.graphics.plot_pacf(ts, lags=lags, ax=pacf_axis)
        plt.tight_layout()
        
tsplot(ts, 26)

In [None]:
# Seasonal differencing: subtract the value observed 52 rows earlier.
ts_diff = ts.diff(52)
# The first 52 entries have nothing to be compared with (NaN), so they are left out.
tsplot(ts_diff[52:], 26)

In [None]:
# Seasonal difference (52 rows) followed by a first difference. It is derived from `ts`
# itself rather than from the previous `ts_diff`, so re-running this cell gives the same
# series instead of differencing once more on every execution.
ts_diff = ts.diff(52).diff(1)
# 52 + 1 leading entries are NaN after the two differences.
tsplot(ts_diff[52+1:], 40)

In [None]:
# Specify SARIMA components & fit the model.
# (p, d, q) is the non-seasonal order, (P, D, Q) the seasonal order and s the length of the
# seasonal cycle in observations. The names are kept at module level because the SARIMA
# plotting helper reads `s` and `d`.
p, d, q = 2, 1, 3
P, D, Q = 2, 1, 3
s = 52

model = sm.tsa.statespace.SARIMAX(
    ts,
    order=(p, d, q),
    seasonal_order=(P, D, Q, s),
).fit(disp=-1)
print(model.summary())

In [None]:
# Residual diagnostics. The first s + d residuals fall inside the differencing warm-up
# (one seasonal cycle plus the first difference), so they are skipped. The skip is tied to
# the model's own s and d rather than the literal 24 + 1, which did not match s = 52.
tsplot(model.resid[s+d:], lags=40)

In [None]:
# Calculate MAPE (mean absolute percentage error)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    y_true -- observed values; each error is divided by the matching entry of this argument.
    y_pred -- predicted values, element-wise compatible with ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.abs(relative_error).mean() * 100

In [None]:
# Plot the observed series against the SARIMA fit and forecast, titled with the in-sample MAPE

def plotSARIMA(ts, model, n_steps):
    """Plot actual values, fitted values and an out-of-sample forecast.

    ts      -- observed data the model was fitted on; turned into a one-column frame.
    model   -- fitted results object; ``fittedvalues`` and ``predict`` are used.
    n_steps -- how far beyond the sample to predict; ``predict`` runs from position
               ``len(data)`` to ``len(data) + n_steps`` inclusive.

    Besides its arguments the function reads the module-level ``s`` and ``d``; they must
    hold the seasonal period and differencing order of ``model`` at call time. It also calls
    the notebook's ``mean_absolute_percentage_error``.
    The figure is shown as a side effect; nothing is returned.
    """
    data = pd.DataFrame(ts)
    data.columns = ['actual']
    data['model'] = model.fittedvalues

    # The first s + d fitted values fall inside the differencing warm-up and are blanked out.
    # Positional .iloc assignment is used: chained `data['model'][:k] = ...` is not guaranteed
    # to write through to the frame.
    warmup = s + d
    data.iloc[:warmup, data.columns.get_loc('model')] = np.nan

    forecast = model.predict(start = data.shape[0], end = data.shape[0]+n_steps)
    # Series.append was removed in pandas 2.0; concat is the replacement.
    forecast = pd.concat([data['model'], forecast])
    error = mean_absolute_percentage_error(data['actual'][warmup:], data['model'][warmup:])

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=data.index,
            y=data["actual"],
            name="Actual",
        ))
    fig.add_trace(
        go.Scatter(
            x=forecast.index,
            y=forecast,
            name="Model",
    ))
    # Shade the out-of-sample region (from the last observation to the end of the forecast).
    fig.add_vrect(
    x0=data.index[-1], x1=forecast.index[-1],
    fillcolor="LightSalmon", opacity=0.5,
    layer="below", line_width=0)
    
    fig.update_layout(
        title=f"Mean Absolute Percentage Error: {error:.2f}%",
        xaxis_title="weeks",
        yaxis_title="sales",
        yaxis_tickprefix = '$',
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple"
        ))
    fig.update_xaxes(rangeslider_visible=True)
    fig.show()
    
plotSARIMA(ts, model, 26)

# Model 3: Linear Models

In [None]:
# Adding lags: the target column `y` plus one column per lag, named l26 ... l53.
# Each lag column holds the value of `y` from that many rows earlier.
l_ts = pd.DataFrame(ts)
l_ts.columns = ["y"]

first_lag, last_lag = 26, 53
for i in range(first_lag, last_lag + 1):
    l_ts[f"l{i}"] = l_ts["y"].shift(i)

# The leading rows of every lag column are NaN; the bar chart makes that visible.
msno.bar(l_ts, color="lightgreen")

In [None]:
# Drop NANs
l_ts.dropna(inplace=True)
msno.bar(l_ts,color="lightgreen");
l_ts.shape

In [None]:
 #  5 folds cross-validation
tscv = TimeSeriesSplit(n_splits=5)

def ts_train_test_split(X, y, test_size):
    """Split features and target chronologically, without shuffling.

    The last ``int(test_size * len(X)) + 1`` rows form the test set and everything before
    them the training set.

    X, y      -- objects supporting ``.iloc`` slicing, aligned row by row.
    test_size -- fraction of the rows to hold out at the end.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    n_test = int(test_size * len(X)) + 1
    head, tail = slice(None, -n_test), slice(-n_test, None)
    return X.iloc[head], X.iloc[tail], y.iloc[head], y.iloc[tail]

In [None]:
# Visualize test-set predictions with an error band and the mean absolute percentage error
def plotLMResults(model, X_train, X_test):
    """Plot actual values, test-set predictions and a band around the predictions.

    model   -- fitted regressor exposing ``predict``; it is also passed to
               ``cross_val_score`` together with the training data.
    X_train -- training features used for the cross-validation.
    X_test  -- test features; predictions are plotted against its index.

    Besides its arguments the function reads these module-level names: ``X`` and ``y``
    (the full data, for the "Actual" trace), ``y_train`` (cross-validation target),
    ``y_test`` (for the error in the title) and ``tscv`` (the CV splitter). They must
    belong to the same split as ``X_train`` / ``X_test``.
    The figure is shown as a side effect; nothing is returned.
    """
    pred = model.predict(X_test)
    
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=X.index,
            y=y,
            name="Actual",
        ))
    fig.add_trace(
        go.Scatter(
            x=X_test.index,
            y=pred,
            name="Model",
        ))

    cv = cross_val_score(model, X_train, y_train, cv=tscv, scoring="neg_mean_squared_error")
    
    # Heuristic band width: square root of the spread of the per-fold (negated) MSE scores.
    deviation = np.sqrt(cv.std())
    lower = pred - (1.5 * deviation)
    upper = pred + (1.5 * deviation)
    
    fig.add_trace(
        go.Scatter(
            x=X_test.index,
            y=lower,
            name="lower bond",
            line = dict(shape = 'linear', color = 'rgb(255, 12, 24)', width=0.7, dash = 'dash')
        ))
    fig.add_trace(
        go.Scatter(
            x=X_test.index,
            y=upper,
            name="upper bond",
            line = dict(shape = 'linear', color = 'rgb(255, 12, 24)', width=0.7, dash = 'dash')
        ))
    
    # Actual values go first: the metric divides by its first argument, and a percentage
    # error is measured relative to the observed value, not the prediction.
    error = mean_absolute_percentage_error(y_test, pred)
    fig.update_layout(
        title=f"Mean Absolute Percentage Error: {error:.2f}%",
        xaxis_title="weeks",
        yaxis_title="sales",
        yaxis_tickprefix = '$',
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple"
        ))
    fig.update_xaxes(rangeslider_visible=True)
    fig.show()
    
def plotCoefs(model):
    """Show the model's coefficients as a bar chart, largest magnitude first.

    model -- fitted estimator exposing ``coef_``.

    The bars are labelled with the columns of the module-level ``X_train``, so the model
    must have been fitted on a frame with those columns in that order.
    """
    coefs = pd.DataFrame(model.coef_, index=X_train.columns, columns=["coef"])
    # Order by absolute value via a temporary helper column, then discard it.
    coefs = (
        coefs.assign(abs=coefs["coef"].abs())
        .sort_values(by="abs", ascending=False)
        .drop(columns=["abs"])
    )

    fig = px.bar(coefs.coef)
    # Emphasise the zero line so the sign of each coefficient is easy to read.
    fig.update_yaxes(zeroline=True, zerolinewidth=2, zerolinecolor='gold')
    fig.show()
    

y = l_ts.y
X = l_ts.drop(['y'], axis=1)

X_train, X_test, y_train, y_test = ts_train_test_split(X, y, test_size=0.3)

lr = LinearRegression()
lr.fit(X_train, y_train)

plotLMResults(lr, X_train, X_test)
plotCoefs(lr)

By adding only a few lags to our linear model, we get almost the same results as the SARIMA model.

# Model 4: XGBoost

In [None]:
# A date is marked as a holiday when any training row for that date carries the flag.
holidays = train.groupby("Date")["IsHoliday"].any().sort_index()
fig = px.line(ts, title='Holidays')

# Shade the week that ends on each holiday date.
holiday_dates = holidays[holidays].index
for holiday in holiday_dates:
    week_before = holiday - timedelta(weeks=1)
    fig.add_vrect(
        x0=week_before, x1=holiday,
        fillcolor="LightSalmon", opacity=0.7,
        layer="below", line_width=0)

fig.show()

In [None]:
# Tempreture by date
temperature = features.groupby(["Date"])["Temperature"].mean().sort_index()
fig = px.scatter(ts, title='Temperature', color=ts.index.map(lambda x: round(temperature[x])),
                 color_continuous_scale=["blue", "yellow", "red"], labels={"color":"Temperature","value":"total sales"})
fig.update_traces(mode='lines+markers')
fig.update_yaxes(tickprefix="$")
fig.show()

In [None]:
# Fuel prices by date
fuel_price = features.groupby(["Date"])["Fuel_Price"].mean().sort_index()
fig = px.scatter(ts, title='Fuel Price', color=ts.index.map(lambda x: round(fuel_price[x],2)),
                 labels={"color":"Fuel Price","value":"total sales"})
fig.update_traces(mode='lines+markers')
fig.update_yaxes(tickprefix="$")
fig.show()

In [None]:
ts.index[0], ts.index[0]-timedelta(weeks=0)

## Fitting the model

In [None]:
lxgb_ts = l_ts.copy()
for i in range(0,8):
    lxgb_ts[f"h{i}"] = lxgb_ts.index.map(lambda x: holidays[x-timedelta(weeks=i)])
    lxgb_ts[f"t{i}"] = lxgb_ts.index.map(lambda x: temperature[x-timedelta(weeks=i)])
    lxgb_ts[f"f{i}"] = lxgb_ts.index.map(lambda x: fuel_price[x-timedelta(weeks=i)])

In [None]:
# Target and feature matrix for the total-sales model.
y = lxgb_ts["y"]
X = lxgb_ts.drop(columns=["y"])

X_train, X_test, y_train, y_test = ts_train_test_split(X, y, test_size=0.3)

# Fit the scaler on the training rows only and apply the same transform to the test rows.
# The scaled frames keep the original row index but get positional column labels.
standard_scaler = StandardScaler()
X_train_standard = pd.DataFrame(standard_scaler.fit_transform(X_train), index=X_train.index)
X_test_standard = pd.DataFrame(standard_scaler.transform(X_test), index=X_test.index)

In [None]:
from xgboost import XGBRegressor 

xgb = XGBRegressor()
xgb.fit(X_train_standard, y_train)

plotLMResults(xgb, X_train_standard, X_test_standard)

# stores

In [None]:
# Stores Time series
sts = train.groupby(["Store","Date"])["Weekly_Sales"].sum().reset_index()

In [None]:
# Display top rows
sts.head()

In [None]:
vsts = sts.groupby(["Store"])["Weekly_Sales"].agg(["sum","mean"]).reset_index()
fig = px.bar(vsts, x='Store', y='sum',
             hover_data=['Store', 'sum', 'mean'], color='mean',
             labels={'sum':'Weekly Sales'}, height=400)
fig.show()

In [None]:
# Walmart weekly sales by Store and Date(Using Plotly)
fig = go.Figure()
# One trace per store. groupby(sort=False) yields the stores in order of first appearance
# and splits the frame in a single pass instead of filtering it twice per store.
# The loop variable is deliberately not named `s`: that name holds the SARIMA seasonal
# period, which the SARIMA plotting helper reads at call time.
for store_id, store_sales in sts.groupby("Store", sort=False):
    fig.add_trace(
        go.Scatter(
            x=store_sales.Date,
            y=store_sales.Weekly_Sales,
            name="Store_"+str(store_id)
        ))
fig.show()

The highest sales were on Dec 24 and Dec 23, i.e. in the weeks just before Christmas (Thanksgiving falls in late November).

In [None]:
# Per-store lag features: for every store add the sales from 26..53 rows earlier as columns
# l26 ... l53 (sts has one row per store and date) and keep only rows where all lags exist.
store_frames = []
# The loop variable is not named `s` so the SARIMA seasonal period stored under that name
# is left untouched.
for store_id, store_sales in sts.groupby("Store", sort=False):
    lag_columns = {
        f"l{lag}": store_sales["Weekly_Sales"].shift(lag) for lag in range(26, 54)
    }
    store_frames.append(store_sales.assign(**lag_columns).dropna())

# A single concat at the end: DataFrame.append was removed in pandas 2.0 and copied the
# whole accumulated frame on every iteration.
l_sts = pd.concat(store_frames) if store_frames else pd.DataFrame()

In [None]:
l_sts = l_sts.set_index("Date")
l_sts.head()

In [None]:
def sts_train_test_split(l_sts, test_size):
    """Split a multi-store frame chronologically, store by store.

    The cut position is derived once from the number of rows of store 1,
    ``cut = -int(test_size * n_rows_store_1) + 1``, and applied to every store: rows
    ``[:cut]`` of each store go to the training set and rows ``[cut:]`` to the test set,
    i.e. the test set holds the last ``int(test_size * n_rows_store_1) - 1`` rows per store.

    l_sts     -- frame with a ``Store`` and a ``Weekly_Sales`` column, rows in time order
                 within each store.
    test_size -- fraction used to size the held-out tail.

    Returns ``(X_train, X_test, y_train, y_test)``; the X frames are the input without
    ``Weekly_Sales``.

    Raises ``ValueError`` when the cut would not be negative (store 1 missing or too few
    rows for ``test_size``): slicing with a cut of 0 or more would silently produce an
    empty or one-row training set.
    """
    reference_rows = len(l_sts[l_sts.Store == 1])
    cut = -int(test_size * reference_rows) + 1
    if cut >= 0:
        raise ValueError(
            f"test_size={test_size} with {reference_rows} rows for store 1 "
            "leaves no rows for the test set"
        )

    train_parts, test_parts = [], []
    for store_id in l_sts.Store.unique():
        store_rows = l_sts[l_sts.Store == store_id]
        train_parts.append(store_rows.iloc[:cut])
        test_parts.append(store_rows.iloc[cut:])
    train_set = pd.concat(train_parts)
    test_set = pd.concat(test_parts)

    y_train = train_set.Weekly_Sales
    X_train = train_set.drop(['Weekly_Sales'], axis=1)

    y_test = test_set.Weekly_Sales
    X_test = test_set.drop(['Weekly_Sales'], axis=1)
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = sts_train_test_split(l_sts, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fit the scaler on the training rows only and apply the same transform to the test rows.
# The scaled frames keep the original row index but get positional column labels.
standard_scaler = StandardScaler()
scaled_train = standard_scaler.fit_transform(X_train)
scaled_test = standard_scaler.transform(X_test)
X_train_standard = pd.DataFrame(scaled_train, index=X_train.index)
X_test_standard = pd.DataFrame(scaled_test, index=X_test.index)

In [None]:
xgb = XGBRegressor()
xgb.fit(X_train_standard, y_train)

In [None]:
pred = xgb.predict(X_test_standard)
matest = X_test.copy()
matest["Pred_Sales"] = pred
matest["Actual_Sales"] = y_test
matest.head()

In [None]:
# Predicted vs. actual weekly sales on the test window for stores 1 to 4.
fig = go.Figure()
# The loop variable is not named `s` so the SARIMA seasonal period stored under that name
# is left untouched.
for store_id in range(1,5):
    store_rows = matest[matest.Store==store_id]
    fig.add_trace(
        go.Scatter(
            x=store_rows.index,
            y=store_rows.Pred_Sales,
            name="Store_"+str(store_id)+"_pred"
        ))
    fig.add_trace(
        go.Scatter(
            x=store_rows.index,
            y=store_rows.Actual_Sales,
            name="Store_"+str(store_id)+"_actual",
            line = dict(shape = 'linear', color = 'rgb(255, 12, 24)', width=0.7, dash = 'dash')
        ))
# Error over the whole test set (all stores, not only the four plotted). Actual values go
# first: the metric divides by its first argument, and a percentage error is measured
# relative to the observed value, not the prediction.
error = mean_absolute_percentage_error(y_test, pred)
fig.update_layout(
    title=f"Mean Absolute Percentage Error: {error:.2f}%",
    xaxis_title="weeks",
    yaxis_title="sales",
    yaxis_tickprefix = '$',
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ))
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
store = tdf.groupby(["Store","Size","Type"])["Weekly_Sales"].sum().reset_index()


fig = px.bar(store, x='Store', y="Weekly_Sales",
             hover_data=['Store', 'Weekly_Sales'], color='Type',height=400, title="Weekly_Sales by Store Type")
fig.show()


fig = px.bar(store, x='Store', y="Weekly_Sales",
             hover_data=['Store', 'Size'], color='Size', height=400, title="Weekly_Sales by Store Size")
fig.show()

In [None]:
# Per-store feature frame for the XGBoost model, indexed by Date.
# 1) Sales lags l26 ... l53 per store; rows without a full set of lags are dropped.
store_frames = []
# The loop variable is not named `s` so the SARIMA seasonal period stored under that name
# is left untouched.
for store_id, store_sales in sts.groupby("Store", sort=False):
    by_date = store_sales.set_index("Date")
    lag_columns = {
        f"l{lag}": by_date["Weekly_Sales"].shift(lag) for lag in range(26, 54)
    }
    store_frames.append(by_date.assign(**lag_columns).dropna())

# A single concat: DataFrame.append was removed in pandas 2.0 and copied the accumulated
# frame on every iteration.
lxgb_sts = pd.concat(store_frames)

# 2) Holiday flag, mean temperature and mean fuel price for the row's date and for each of
#    the 11 preceding weeks (h0..h11, t0..t11, f0..f11), looked up by date.
for i in range(0,12):
    lxgb_sts[f"h{i}"] = lxgb_sts.index.map(lambda x: holidays[x-timedelta(weeks=i)])
    lxgb_sts[f"t{i}"] = lxgb_sts.index.map(lambda x: temperature[x-timedelta(weeks=i)])
    lxgb_sts[f"f{i}"] = lxgb_sts.index.map(lambda x: fuel_price[x-timedelta(weeks=i)])

# 3) Static store attributes. One lookup table indexed by Store replaces filtering the
#    `store` frame once per row; Type is encoded as integer category codes.
store_info = store.set_index("Store")
lxgb_sts["Size"] = lxgb_sts["Store"].map(store_info["Size"])
lxgb_sts["Type"] = lxgb_sts["Store"].map(store_info["Type"]).astype('category').cat.codes

In [None]:
X_train, X_test, y_train, y_test = sts_train_test_split(lxgb_sts, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fit the scaler on the training rows only and apply the same transform to the test rows.
# The scaled frames keep the original row index but get positional column labels.
standard_scaler = StandardScaler()
scaled_train = standard_scaler.fit_transform(X_train)
scaled_test = standard_scaler.transform(X_test)
X_train_standard = pd.DataFrame(scaled_train, index=X_train.index)
X_test_standard = pd.DataFrame(scaled_test, index=X_test.index)

In [None]:
xgb = XGBRegressor()
xgb.fit(X_train_standard, y_train)

In [None]:
pred = xgb.predict(X_test_standard)
matest = X_test.copy()
matest["Pred_Sales"] = pred
matest["Actual_Sales"] = y_test
matest.head()

In [None]:
# Predicted vs. actual weekly sales on the test window for stores 1 to 4.
fig = go.Figure()
# The loop variable is not named `s` so the SARIMA seasonal period stored under that name
# is left untouched.
for store_id in range(1,5):
    store_rows = matest[matest.Store==store_id]
    fig.add_trace(
        go.Scatter(
            x=store_rows.index,
            y=store_rows.Pred_Sales,
            name="Store_"+str(store_id)+"_pred"
        ))
    fig.add_trace(
        go.Scatter(
            x=store_rows.index,
            y=store_rows.Actual_Sales,
            name="Store_"+str(store_id)+"_actual",
            line = dict(shape = 'linear', color = 'rgb(255, 12, 24)', width=0.7, dash = 'dash')
        ))
# Error over the whole test set (all stores, not only the four plotted). Actual values go
# first: the metric divides by its first argument, and a percentage error is measured
# relative to the observed value, not the prediction.
error = mean_absolute_percentage_error(y_test, pred)
fig.update_layout(
    title=f"Mean Absolute Percentage Error: {error:.2f}%",
    xaxis_title="weeks",
    yaxis_title="sales",
    yaxis_tickprefix = '$',
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ))
fig.update_xaxes(rangeslider_visible=True)
fig.show()