# Store Sales Time Series Analysis

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# colors = ['#126E82', '#0A043C', '#F25287', '#F0A500', '#7D1935']
plt.style.use('tableau-colorblind10')
sns.set_style('whitegrid')
# sns.set_palette(colors)

## Loading DataSets

In [None]:
# All competition files live in one folder; keeping the folder in a single
# constant means the location only has to be changed in one place.
DATA_DIR = "../input/store-sales-time-series-forecasting"


def _read_dated_csv(file_name):
    """Read one CSV from DATA_DIR and parse its 'date' column as datetime."""
    return pd.read_csv(f"{DATA_DIR}/{file_name}", parse_dates=['date'])


train_data = _read_dated_csv("train.csv")
test_data = _read_dated_csv("test.csv")
holidays = _read_dated_csv("holidays_events.csv")
oil = _read_dated_csv("oil.csv")
transaction = _read_dated_csv("transactions.csv")
# stores.csv is read without date parsing.
stores = pd.read_csv(f"{DATA_DIR}/stores.csv")

In [None]:
# Give the oil-price column a readable name. rename() returns a new frame, so
# nothing is mutated in place.
oil = oil.rename(columns={'dcoilwtico': 'oilPrice'})

# Attach the oil price to every train/test row by date.
# how='left' keeps all sales rows: with the default inner join, every row whose
# date has no entry in the oil frame was silently dropped. Such rows now get
# oilPrice = NaN instead.
train = train_data.merge(oil, on='date', how='left')
test = test_data.merge(oil, on='date', how='left')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
print("train shape :", train.shape)
print("test shape :", test.shape)

In [None]:
# Let's examine holidays dataframe.
holidays.head()

In [None]:
# Attach holiday information by date.
# - how='left' keeps the sales rows of dates that are not in the holidays
#   frame; the default inner join kept only dates that appear in it.
# - The holiday side is reduced to one row per date so that a date listed
#   several times cannot duplicate (and therefore double-count) sales rows.
holiday_info = holidays[['date', 'type', 'transferred']].drop_duplicates(subset='date')
train = train.merge(holiday_info, on='date', how='left')

# Attach store metadata. Both sides carry a 'type' column, which pandas
# suffixes as type_x (holiday side) and type_y (store side).
train = train.merge(stores, on='store_nbr')
train = train.rename(columns={'type_x': 'holiday_type', 'type_y': 'store_type'})

In [None]:
# Calendar features derived from the parsed 'date' column.
# .dt.strftime formats the whole column at once and yields the same
# 'YYYY-MM' strings as calling strftime on each value.
dates = train['date']
train['Year'] = dates.dt.year
train['Year-Month'] = dates.dt.strftime('%Y-%m')
train['Month'] = dates.dt.month
train['Day'] = dates.dt.day

In [None]:
# Express sales in thousands.
# Note: re-running this cell divides by 1000 again.
train['sales'] = train['sales'] / 1000

### Summary Statistics

In [None]:
train.describe()

### Missing Values

In [None]:
# Missing data in train dataset
train.isna().sum()

# Data Analysis

### Sales Distribution

In [None]:
# One box per year showing the distribution of sales.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Year', y='sales', ax=ax)

fig.tight_layout()
plt.show()

We can see that outliers are present in every year. In 2016 in particular there are some extreme outliers, which are due to the earthquake on April 16, 2016: people rallied in relief efforts, donating water and other basic necessities, which greatly affected supermarket sales for several weeks after the earthquake. So, let's remove these extreme outliers in 2016.

In [None]:
# Pull out the 2016 sales and renumber them 0..n-1 so each value can be
# scattered against its position.
data2016 = train.loc[train['Year'] == 2016, 'sales'].reset_index(drop=True)

# plot
fig, ax = plt.subplots(figsize=(14, 8))
ax.scatter(data2016.index, data2016.values)
plt.show()

In [None]:
# 'sales' was divided by 1000 in an earlier cell, so a cut-off of 40,000 units
# is 40 on this column. Comparing the scaled column with 40000 matched no rows
# and therefore removed nothing.
SALES_OUTLIER_THRESHOLD = 40  # thousands, i.e. 40,000 units

# Number of rows above the cut-off. It is printed explicitly because only the
# last expression of a cell is displayed automatically.
print("rows above threshold:", int((train.sales > SALES_OUTLIER_THRESHOLD).sum()))

# Drop the extreme values; rows exactly at the threshold are kept.
train = train.loc[train.sales <= SALES_OUTLIER_THRESHOLD]
train.sales.max()

### Line plot

In [None]:
# Sales against date on a single axes.
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=train, x='date', y='sales', label='Sales', ax=ax)

ax.set_xlabel('Date')
ax.set_ylabel('Sales')
ax.set_title("Sales observation over time")

ax.legend()
plt.show()

There is an **increasing trend** (**growth**) in sales over time.

In [None]:
# Total sales per 'Year-Month'.
# 'sales' is selected *before* aggregating: summing every column is wasted
# work, and pandas >= 2.0 raises TypeError when asked to sum the datetime
# 'date' column.
year_data = train.groupby('Year-Month')['sales'].sum().to_frame()

# plot
year_data.plot(kind='line', figsize=(14, 8), marker="o")

plt.xlabel("Year-Month")
plt.ylabel("Sales")
plt.title("Year-Month observation of Total Sales")
plt.show()

We can see that every year there is a rise in sales in December and a drop in January. This may be because of Christmas. Let's examine sales by month and see whether the same pattern is observed in each year.

In [None]:
# Total sales for every (Year, Month) pair.
# 'sales' is selected before aggregating so that no other column is summed
# (pandas >= 2.0 raises TypeError on the datetime 'date' column otherwise).
monthly_sales = train.groupby(by=['Year', 'Month'])['sales'].sum().to_frame()

# Pad months 9-12 of 2017 with 0 so the 2017 panel spans all twelve months
# like the other years.
re_months = [9, 10, 11, 12]
for month in re_months:
    monthly_sales.loc[(2017, month), :] = 0

yrs = [2013, 2014, 2015, 2016, 2017]

# One panel per year, stacked vertically. The y-label names the year, so the
# per-panel legend is switched off.
fig, axs = plt.subplots(nrows=len(yrs), ncols=1, figsize=(12, 10))
for ax, yr in zip(axs, yrs):
    monthly_sales.loc[yr].plot(ax=ax, marker="o", legend=False)
    ax.set_ylabel(f"{yr} Sales")  # previously rendered without a space, e.g. "2013Sales"

fig.suptitle("Monthly Trend Pattern Observations")
plt.tight_layout()
plt.show()

We can see that there is a peak in December in every year, although at different levels. <br>
Wages in the public sector are paid every two weeks, on the 15th and on the last day of the month. Let's look at the sales on the 15th and on the last day of each month.

In [None]:
# prepare data
# Pay days: the 15th and the last calendar day of each month.
# dt.is_month_end selects the true month end. Matching Day against
# [28, 29, 30, 31] also selected days 28-30 of a 31-day month.
filter1 = (train.Day == 15)
filter2 = train['date'].dt.is_month_end

# Total pay-day sales per 'Year-Month'. Only 'sales' is summed; the datetime
# 'date' column cannot be summed in pandas >= 2.0.
Sales = train.loc[(filter1 | filter2), ['date', 'Year-Month', 'sales']]
sales_data = Sales.groupby(by=['Year-Month'])[['sales']].sum()

#plot
sales_data.plot(kind='bar', figsize=(14, 8), edgecolor="black", fill=True, alpha=0.75, linewidth=1.5)
sns.lineplot(data = sales_data, x = sales_data.index, y = 'sales', color="orange")

plt.title('Monthly Wages Observation')

plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

We can see that there is an increase in sales in April 2016, as expected.

### HeatMap

In [None]:
# prepare data
# Date-indexed sales frame. set_index returns a new frame, so the column
# selection taken from `train` is not modified in place.
d = train[['date', 'sales']].set_index('date')

# Rows: year, columns: quarter; pivot_table aggregates with its default function.
ptable = pd.pivot_table(data=d, index=d.index.year, columns=d.index.quarter)

# plot
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(ptable, square=True, cmap='Blues', xticklabels=["Q1", "Q2", "Q3", "Q4"], ax=ax)
plt.show()

The heatmap shows a peak in quarter Q4 of every year. An upward trend from year to year is observed in all quarters.

## Seasonality Factor

In [None]:
# Seasonal factors per quarter.
# Only years up to 2016 are used because 2017 is partial. A boolean mask on
# the year is used instead of a label slice so the selection does not depend
# on the index being sorted.
data_2016 = d.loc[d.index.year <= 2016]

# Overall mean of 'sales', kept as a float. int() truncation discarded the
# fractional part and yields 0 whenever the mean is below 1 (likely, since
# sales are expressed in thousands) - which turned the multiplicative factor
# into a division by zero and made the additive factor equal to the raw mean.
avg_2016 = data_2016["sales"].mean()

# Avg sales per quarter
qrt_avg = data_2016.groupby(data_2016.index.quarter)["sales"].mean()

# Quarter (rows) by year (columns) table
qrt_table = pd.pivot_table(data_2016, index=data_2016.index.quarter, columns=data_2016.index.year)

# add qrt_avg to qrt_table
qrt_table["avg"] = qrt_avg

# Additive seasonality factor: quarter average minus overall average
qrt_table["additive"] = (qrt_table["avg"] - avg_2016).round(2)

# Multiplicative seasonality factor: quarter average divided by overall average
qrt_table["multiplicative"] = (qrt_table["avg"] / avg_2016).round(2)

qrt_table.index.name = "Quarters"
print("Seasonal Factor Analysis Table")
qrt_table

The seasonality analysis table shows that in quarter 4 sales increase by ~69k compared with the other quarters, and that there is a sudden drop in quarter 1. The table also shows that sales are not stable across quarters, so multiplicative seasonality would capture the pattern better than additive seasonality.

## Stationarity
A time series is stationary if it has:
* Constant Mean
* Constant Variance
* Constant Covariance

Let's verify it by observing change in mean, variance and statistical test (**adfuller**)

In [None]:
# prepare_data
# Total sales per date as a one-column frame. 'sales' is selected before the
# aggregation so that no other column of train_data is summed.
data = train_data.groupby(by=['date'])['sales'].sum().to_frame()

def test_stationarity(timeseries, title):
    """Plot a series with its centred 91-observation rolling mean and std.

    Parameters
    ----------
    timeseries : object exposing a 'sales' column (read both as
        ``timeseries['sales']`` and ``timeseries.sales``).
    title : legend label used for the plotted series.
    """
    # One centred rolling window object serves both statistics.
    window = timeseries['sales'].rolling(window=91, center=True)
    roll_mean, roll_std = window.mean(), window.std()

    # Series first, then the two rolling statistics on the same figure.
    plt.figure(figsize=(14, 5), dpi=100)
    plt.plot(timeseries.sales, label=title, marker=".", alpha=0.6)
    plt.plot(roll_mean, label="Rolling Mean", color="red", linestyle="--")
    plt.plot(roll_std, label="Rolling Standard Deviation")

    plt.title("Rolling Statistics")
    plt.legend()
    plt.show()

test_stationarity(data, 'raw data')

The plot above shows that both the mean and the standard deviation increase over time. Therefore, this time series is not stationary.

### Coefficient of Variation
C.V = std/mean

* If C.V<0.75 **-** Low Variability
* If 0.75<C.V<1.3 **-** Medium Variability
* If C.V>1.3 **-** High Variability

In [None]:
# Coefficient of variation: standard deviation of sales relative to its mean.
sales_std = data['sales'].std()
sales_mean = data['sales'].mean()
cv = sales_std / sales_mean
cv

This is a low-variability process.

In [None]:
# Autocorrelation and partial-autocorrelation plots of the sales series.
# The figure size is set globally, so it also applies to later figures.
plt.rcParams['figure.figsize'] = (14, 4)
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(data.sales)
    plt.tight_layout()

The ACF plot shows a momentum process, since all autocorrelations are positive. Let's check the stationarity of the data using the Augmented Dickey-Fuller test (adfuller).

### ADFuller Test.

In [None]:
# Let's take a adfuller test on sales data.

def adfuller_test(data, description):
    """Run the Augmented Dickey-Fuller test on ``data`` and print the result.

    Missing values are dropped before the test. For every critical value the
    series is reported as not stationary when that critical value is smaller
    than the test statistic.

    Parameters
    ----------
    data : object with a ``dropna()`` method, passed on to ``adfuller``.
    description : text inserted into the header line.
    """
    print(f"Augmented Dickey-fuller test result for {description}")
    outcome = adfuller(data.dropna(), autolag="AIC")
    adf_stat, p_value, critical_values = outcome[0], outcome[1], outcome[4]

    print("ADF test statistic: {:.3f}".format(adf_stat))
    print("p-value:{:.3f}".format(p_value))

    print("Critical Values:")
    row_template = '\t{}: {} - The data is {} stationary with {}% confidence'
    for level, threshold in critical_values.items():
        verdict = 'not' if threshold < adf_stat else ''
        # level looks like '5%'; strip the sign to get the confidence.
        confidence = 100 - int(level[:-1])
        print(row_template.format(level, threshold, verdict, confidence))
        
adfuller_test(data, 'raw data')

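The Augmented Dickey-Fuller test is a statistical test for stationarity; its null hypothesis is that the series is non-stationary (has a unit root). If the p-value is less than 0.05, we reject the null hypothesis and treat the series as stationary. Here the p-value is ~0.08989, so we cannot reject it: the time series is not stationary.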

# To make Time Series Stationary
## Detrend

In [None]:
# De-trend: standardise each value against the trailing 91-observation window
# (subtract the rolling mean, divide by the rolling std); rows without a full
# window are dropped.
trailing = data.rolling(91)
data_detrend = ((data - trailing.mean()) / trailing.std()).dropna()

# Check the de-trended series: ADF test first, then the rolling-statistics plot.
for check in (adfuller_test, test_stationarity):
    check(data_detrend, "de-trended data")

Both the adfuller statistical test and the rolling-statistics graph show that the series is now stationary. The relative smoothness of the rolling mean and rolling standard deviation curves indicates stationarity in the time series.

## Differencing

This method removes the underlying seasonal or cyclical patterns in the time series. I used a 15-lag difference since wages in the public sector are paid every two weeks.

In [None]:
# 15-step difference: each value minus the value 15 rows earlier.
diff_data = data.diff(periods=15)

# Rolling-statistics plot first, then the ADF test.
for check in (test_stationarity, adfuller_test):
    check(diff_data, 'difference data')

Both stationarity checks show that this time series is stationary. Differencing performs much better than de-trending.

In [None]:
# First-order difference of the sales series; the leading entry (NaN, since it
# has no predecessor) is skipped.
diff_data = data['sales'].diff().iloc[1:]

In [None]:
# acf from difference data
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARMA

In [None]:
# Correlograms of the differenced series. The inline notes record the MA and
# AR orders chosen by the author; the models built later use order=(2, 2) and
# order=(2, 1, 2).
plot_acf(diff_data); # chosen MA order (q) = 2
plot_pacf(diff_data); # chosen AR order (p) = 2

In [None]:
from datetime import datetime
from datetime import timedelta
# Split the differenced series by date: everything up to 31 Jul 2017 is used
# for fitting, 1-15 Aug 2017 is held out.
train_end = datetime(2017, 7, 31)
test_end = datetime(2017, 8, 15)
test_start = train_end + timedelta(days=1)

train_df = diff_data.loc[:train_end]
test_df = diff_data.loc[test_start:test_end]

In [None]:
# Silence all warnings from here on: no category or module filter is given,
# so this applies to every library, not only statsmodels.
import warnings
warnings.filterwarnings('ignore')

def plots(model_name, preds):
    """Draw the forecast on top of the held-out values (module-level ``test_df``).

    ``model_name`` is currently unused.
    """
    # Same marker for both series; only the data and legend label differ.
    for series, legend_label in ((test_df, 'actual'), (preds, 'forecast')):
        plt.plot(series, marker=".", label=legend_label)
    plt.legend()

    plt.title("Actual Vs Forecast Sales", fontsize=18, loc="left")
    plt.show()
    
def residual_plot(model_name, residuals):
    """Plot the residuals with a dashed zero reference line.

    Parameters
    ----------
    model_name : name shown in the plot title.
    residuals : values to plot (passed directly to ``plt.plot``).
    """
    plt.plot(residuals)
    # Zero line: points above/below it are under-/over-forecasts.
    plt.axhline(0, linestyle="--", color='red', alpha=0.6)

    heading = f"Residuals from {model_name} model"
    plt.title(heading, loc='left', fontsize=18)
    plt.xlabel("Date")
    plt.ylabel("Error")
    plt.show()
        
def error(model_name, preds, actual=None):
    """Print the Root Mean Squared Log Error (RMSLE) of a forecast.

    RMSLE = sqrt(mean((log(1 + pred) - log(1 + actual)) ** 2))

    Parameters
    ----------
    model_name : str
        Name inserted into the printed message.
    preds : array-like
        Forecast values.
    actual : array-like, optional
        Observed values. Defaults to the module-level ``test_df``, which is
        what the function always used before this parameter existed.

    Notes
    -----
    log(1 + x) is undefined for x <= -1, so inputs below that bound (for
    example large negative day-over-day differences) produce NaN.
    """
    if actual is None:
        actual = test_df

    # The log is taken of each series separately and the *logs* are
    # subtracted. The previous log(actual - preds) is not RMSLE: it is NaN
    # whenever a forecast exceeds the observed value and infinite for a
    # perfect forecast.
    log_error = np.log1p(preds) - np.log1p(actual)
    RMSLE = np.sqrt(np.mean(log_error ** 2))
    print(f"Root Mean Squared Log Error of {model_name} Model: {RMSLE}")
    
def model_eval(model, model_name):
    """Fit ``model``, print its summary and forecast the hold-out period.

    Parameters
    ----------
    model : unfitted statsmodels model exposing ``fit()`` and
        ``data.orig_endog`` (the series it was built on).
    model_name : str
        Currently unused.

    Returns
    -------
    (preds, residuals)
        ``preds`` is the forecast re-indexed to the module-level ``test_df``;
        ``residuals`` is ``test_df - preds``.
    """
    # fitting model
    model_fit = model.fit()
    print(model_fit.summary())

    # Forecast the positions that immediately follow the training sample, one
    # per hold-out row. Deriving them from the model and from test_df replaces
    # the hard-coded 1668..1682, which was only valid for one exact split.
    first_step = len(model.data.orig_endog)
    last_step = first_step + len(test_df) - 1
    preds = model_fit.predict(start=first_step, end=last_step)

    # Align the forecast with the hold-out dates before subtracting.
    preds.index = test_df.index
    residuals = test_df - preds

    return preds, residuals

In [None]:
import warnings
warnings.filterwarnings('ignore')

# The legacy statsmodels.tsa.arima_model.ARMA class is no longer usable in
# current statsmodels releases. ARMA(p, q) is the same model as ARIMA(p, 0, q),
# so it is built with the current ARIMA class. The import is kept local to
# this cell so the cell works on its own.
from statsmodels.tsa.arima.model import ARIMA

model = ARIMA(train_df, order=(2, 0, 2))
preds, residuals = model_eval(model, "ARMA")

In [None]:
error("ARMA",preds)
plots("ARMA", preds)

In [None]:
residual_plot("ARMA", residuals)

In [None]:
from statsmodels.tsa.api import ARIMA 

# build ARIMA model with order (p, d, q) = (2, 1, 2)
# NOTE(review): train_df is a slice of diff_data, which is already a
# first-order difference of sales, so d=1 differences the series a second
# time - confirm this is intended.
model = ARIMA(train_df, order=(2, 1, 2))
preds, residuals = model_eval(model, 'ARIMA')

# Forecast vs. hold-out values.
plots("ARIMA", preds)

In [None]:
residual_plot("ARIMA", residuals)
error("ARIMA", preds)

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Non-seasonal order (p, d, q).
order = (0, 1, 0)
# Seasonal order (P, D, Q, s); s is the season length in observations.
# NOTE(review): the series has one observation per date, so s=12 means a
# 12-observation cycle rather than 12 months - confirm this is intended.
seasonal_order = (1, 0, 1, 12)

# trend='t' is passed through to SARIMAX unchanged.
model = SARIMAX(train_df, order = order, seasonal_order = seasonal_order, trend='t')
preds, residuals = model_eval(model, 'SARIMAX')

In [None]:
plots("SARIMAX", preds)

In [None]:
residual_plot("SARIMAX", residuals)

In [None]:
error("SARIMAX", preds)

In [None]:
# Date-indexed 'sales' column of the raw training data. set_index returns a
# new frame, so the column selection is not modified in place.
# Note: this rebinds `data`, which held the per-date totals in earlier cells.
data = train_data[['date', 'sales']].set_index('date')

In [None]:
data

In [None]:
# build model with order (p, d, q) = (2, 1, 2)
# NOTE(review): `data` here is the 'sales' column of train_data indexed by
# date without any aggregation, so dates presumably repeat - confirm that
# fitting on un-aggregated rows is intended.
model = ARIMA(data, order=(2, 1, 2))

print("Fitting model")
model_fit = model.fit()


In [None]:
# `test_predict` and `index` are not defined in any visible cell, so this cell
# raised NameError on a fresh kernel. Both are built here: one forecast step
# per distinct date in the test set.
# NOTE(review): this assumes the intended submission is one forecast value per
# test date (the next cell merges on 'date') - confirm against the original
# forecasting step, which is missing from the notebook.
index = pd.DatetimeIndex(sorted(test_data['date'].unique()))
test_predict = np.asarray(model_fit.forecast(steps=len(index)))

# One row per test date: the forecast in 'sales', the date alongside it.
test_predict = pd.DataFrame(test_predict)
test_predict['date'] = index
test_predict = test_predict.rename(columns={0: 'sales'})

In [None]:
# Attach the per-date forecast to the test rows.
result = test_data.merge(test_predict, on='date')
result.head()

In [None]:
# Keep only the id and the forecast, indexed by id. set_index returns a new
# frame rather than modifying the column selection in place.
submission = result[['id', 'sales']].set_index('id')

In [None]:
submission.to_csv("submission.csv")