In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is part of the challenge.

Time series data definition: Data collected on the same metrics or same objects at regular time intervals. It could be stock market records or sales records.

### Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from datetime import timedelta
from pandas import Series


# ignore warnings
warnings.filterwarnings('ignore')

%matplotlib inline


from statsmodels.tsa.stattools import adfuller

In [None]:
# Directory that holds the competition CSV files
path = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Read every table in one pass; the target names on the left line up,
# position by position, with the file names listed below.
train, test, sample_submission, items, item_categories, shops = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'sales_train.csv',
        'test.csv',
        'sample_submission.csv',
        'items.csv',
        'item_categories.csv',
        'shops.csv',
    )
)

### Inspect the data

In [None]:
# Summary statistics followed by the first rows of the training table
print("************** TRAIN **************")
print(train.describe(), train.head(), sep='\n')

In [None]:
# Summary statistics followed by the first rows of the test table
print("************** TEST**************")
print(test.describe(), test.head(), sep='\n')

### Data fields
ID - an Id that represents a (Shop, Item) tuple within the test set

shop_id - unique identifier of a shop

item_id - unique identifier of a product

item_category_id - unique identifier of item category

item_cnt_day - number of products sold. You are predicting a monthly amount of this measure

item_price - current price of an item

date - date in format dd/mm/yyyy

date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33

item_name - name of item

shop_name - name of shop

item_category_name - name of item category

In [None]:
# Number of missing values in each column of the training table
train.isna().sum()

In [None]:
# Print the structural summary pandas keeps for the training table
train.info()

### Simplify Time Series Data
We will start by simplifying the input data a bit to explore data types. To do so, we will look at item price and item count by date. This allows us to look at a Time Series dataset with multiple time series.

In [None]:
#grouping sales per day
# `date` is loaded from the CSV as a day-first (dd/mm/yyyy) string, so the groupby
# result comes back in lexicographic order (every "01" of every month first, ...).
# The rows are re-ordered chronologically so that anything computed along the rows
# afterwards (rolling windows, line plots) follows time. The `date` column itself
# is left as the original strings.
base = train.groupby(['date'])['item_cnt_day'].sum().reset_index()
chronological_order = np.argsort(pd.to_datetime(base['date'], dayfirst=True).values)
base = base.iloc[chronological_order].reset_index(drop=True)
base.head()

In [None]:
# Total units sold in each month block, one row per `date_block_num`
base_m = train.groupby('date_block_num', as_index=False)['item_cnt_day'].sum()
base_m.head()

In [None]:
# Total units sold by each shop over the whole training period, one row per `shop_id`
base_s = train.groupby('shop_id', as_index=False)['item_cnt_day'].sum()
base_s.head()

In [None]:
# Rolling statistics are computed on the sales column only. Rolling over the whole
# frame would also include the grouping key (`date`, `date_block_num`, `shop_id`):
# the string `date` column cannot be averaged, and a rolling mean of a month number
# or a shop id carries no meaning.
# Each result stays a one-column DataFrame so it can be passed straight to plt.plot.

#creating rolling averages for the base grouped data (window = 12 rows, i.e. 12 days)
b_rolling_mean = base[['item_cnt_day']].rolling(window = 12).mean() # rolling average
b_rolling_std = base[['item_cnt_day']].rolling(window = 12).std() # rolling std

#creating rolling averages for the month base grouped data
m_rolling_mean = base_m[['item_cnt_day']].rolling(window = 12).mean() # rolling average of 12 months
m_rolling_std = base_m[['item_cnt_day']].rolling(window = 12).std() # rolling std of 12 months

#creating rolling averages for the shop grouped sales (window = 12 consecutive shops)
s_rolling_mean = base_s[['item_cnt_day']].rolling(window = 12).mean() # rolling average
s_rolling_std = base_s[['item_cnt_day']].rolling(window = 12).std() # rolling std


In [None]:
# Daily sales with the rolling mean and rolling std drawn on the same axes
ax = base.plot(color = 'blue', label = 'Sales',figsize=(16,8), title= 'Sales per day', xlabel='date', ylabel='Items sold')
ax.plot(b_rolling_std, color = 'black', label = 'Rolling Std')
ax.plot(b_rolling_mean, color = 'red', label = 'Rolling Mean')
ax.legend(loc = 'best')
plt.show()

In [None]:
#visualizing sales per month count
# Each line is plotted as sales-vs-month explicitly. Passing the whole frame to
# plt.plot would draw `date_block_num` itself as a second "Sales" line and use the
# row index, rather than the month number, as the x axis.
fig, ax = plt.subplots(figsize=(16, 8))
months = base_m['date_block_num']
ax.plot(months, base_m['item_cnt_day'], color = 'blue', label = 'Sales')
ax.plot(months, m_rolling_mean['item_cnt_day'], color = 'red', label = 'Rolling Mean')
ax.plot(months, m_rolling_std['item_cnt_day'], color = 'black', label = 'Rolling Std')
ax.set_title('Total Sales per month')
ax.set_xlabel('Month')
ax.set_ylabel('Units Sold')
ax.legend(loc = 'best')
plt.show()

In [None]:
#visualizing sales per shop
# Each line is plotted as sales-vs-shop explicitly. Passing the whole frame to
# plt.plot would draw `shop_id` itself as a second "Sales" line and use the row
# index, rather than the shop id, as the x axis.
fig, ax = plt.subplots(figsize=(16, 8))
shop_ids = base_s['shop_id']
ax.plot(shop_ids, base_s['item_cnt_day'], color = 'blue', label = 'Sales')
ax.plot(shop_ids, s_rolling_mean['item_cnt_day'], color = 'red', label = 'Rolling Mean')
ax.plot(shop_ids, s_rolling_std['item_cnt_day'], color = 'black', label = 'Rolling Std')
ax.set_title('Sales per shop')
ax.set_xlabel('shop id')
ax.set_ylabel('Items sold')
ax.legend(loc = 'best')
plt.show()

## Time Series Visualizations
There are a number of packages to help analyze Time Series data and create relevant plots. One example is __[statsmodels](https://www.statsmodels.org/stable/graphics.html#time-series-plots)__, which includes a number of methods for plotting Time Series-specific visualizations:
- __[plot_acf](https://www.statsmodels.org/stable/generated/statsmodels.graphics.tsaplots.plot_acf.html#statsmodels.graphics.tsaplots.plot_acf)__: Plot of the Autocorrelation Function
- __[plot_pacf](https://www.statsmodels.org/stable/generated/statsmodels.graphics.tsaplots.plot_pacf.html#statsmodels.graphics.tsaplots.plot_pacf)__: Plot of the Partial Autocorrelation Function
- __[month_plot](https://www.statsmodels.org/stable/generated/statsmodels.graphics.tsaplots.month_plot.html#statsmodels.graphics.tsaplots.month_plot)__: Seasonal Plot for Monthly Data
- __[quarter_plot](https://www.statsmodels.org/stable/generated/statsmodels.graphics.tsaplots.quarter_plot.html#statsmodels.graphics.tsaplots.quarter_plot)__: Seasonal Plot for Quarterly Data

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

print('Monthly data Autocorrelation Plots')

# Autocorrelation and partial autocorrelation of the monthly sales totals, up to lag 15
monthly_units = base_m['item_cnt_day']
acf_plot = plot_acf(monthly_units, lags= 15, title='Autocorrelation in monthly Sales Data')
pacf_plot = plot_pacf(monthly_units, lags= 15, title='Partial Autocorrelation in monthly Sales Data')


In [None]:
# Daily totals indexed by a real, chronologically sorted DatetimeIndex.
# The raw `date` strings are day-first (dd/mm/yyyy), which pandas does not assume
# by default, so dayfirst=True is passed explicitly. The index is then sorted
# because the strings are not guaranteed to arrive in chronological order.
base_new = base.copy()
base_new.index = pd.to_datetime(base_new['date'], dayfirst=True)
base_new = base_new.drop(['date'], axis = 1).sort_index()
base_new

### Stationary Time Series
In order for time series data to be stationary, the data must exhibit four properties over time:
1. constant mean
2. constant variance
3. constant autocorrelation structure
4. no periodic component

Mean, variance, and periodic component (aka seasonality) should be familiar to you. Autocorrelation may not be. Autocorrelation simply means that the current time series measurement is correlated with a past measurement. For example, today's stock price is often highly correlated with yesterday's price.

Perhaps the easiest way to check for constant mean and variance is to chop up the data into separate chunks, calculate statistics for each chunk, and compare. It's not the most rigorous method but it gives you a good sense of whether your data is approximately stationary.

In [None]:
# split data into 22 chunks of (near-)equal length, in row order
# np.array_split is used instead of np.split so the cell does not fail when the
# number of rows is not an exact multiple of 22; the chunks are plain 1-D arrays
# of the daily totals.
chunks = np.array_split(base_new['item_cnt_day'].values, 22)
print(''''''''''mean of each chunk''''''''')
print(np.array([chunk.mean() for chunk in chunks]))

print('''''''''''variance of each chunk''''''''')
print(np.array([chunk.var() for chunk in chunks]))

We can see that the mean of each chunk is close to being constant while the variance changes significantly; hence we can say the data is NON-stationary.


> Note: We do expect some fluctuation in values. It's highly unlikely that either the mean or the variance will be exactly the same from chunk to chunk, but it should be close.

If you wanted to get even more sophisticated, you could run a statistical test to determine if the difference in means or the difference in variances is statistically significant.

### Augmented Dickey-Fuller Test
This is a statistical procedure to suss out whether a time series is stationary or not. We won't go into all the nitty gritty details but here's what you need to know:
1. **Null hypothesis:** the series is nonstationary.
2. **Alternative hypothesis:** the series is stationary.

Like any statistical test you should set a significance level or threshold that determines whether you should accept or reject the null. 
> The value 0.05 is common but depends upon numerous factors.


#### Stationary Data & ADF

In [None]:
from statsmodels.tsa.stattools import adfuller

# Monthly sales totals as a float series.
# astype() returns a new Series rather than converting in place, so the cast is
# applied as part of the assignment (a bare `a.astype(float)` has no effect).
a = train.groupby(['date_block_num'])['item_cnt_day'].sum().astype(float)

# Augmented Dickey-Fuller test with regression='c'
adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(a, regression='c')

In [None]:
# Report the ADF outputs: each label on its own line, followed by its value
print('''''''adf value''''''', adf, sep='\n')
print('''''''p value''''''', pvalue, sep='\n')
print('''''''nobs''''''', nobs, sep='\n')
print('''''''critical value''''''', critical_values, sep='\n')

First, **adf** is the value of the test statistic. The more negative the value, the more confident we can be that the series is stationary. Here we see a value of -2.39. That may not mean anything to you just yet, but the **pvalue** should.

Next, **pvalue** is interpreted like any p-value. Once we set a threshold, we can compare this p-value to that threshold. Either we reject or fail to reject the null. Here **pvalue** is 0.14 which is greater than the threshold so we fail to reject the null that this data is nonstationary. Hence, data is non stationary.

The variable **nobs** is the number of observations actually used in the ADF regression (the length of the series minus the observations consumed by differencing and lags), in this case 33.

Finally, the **critical_values** variable provides test statistic thresholds for common significance levels.

 ## Common Nonstationary-to-Stationary Transformations

###  Remove Changing Variance w/Log Transformation
This trick works well when you're dealing with heteroscedastic data. Let's plot that again to remind you what that looks like.

#### Log Transformation

We can apply a log transformation. However, we cannot take the log of nonpositive values. The way we can get around this is by adding a constant to all values to make them positive. 

In [None]:
#create the log of the monthly sales
# No constant is added before taking the log, so this assumes every monthly
# total in `a` is strictly positive — TODO confirm
log_new = np.log(a)

In [None]:
def run_sequence_plot(x, y, title, xlabel="time", ylabel="sales"):
    """Draw `y` against `x` as a solid black line with a title, axis labels and a faint grid.

    Uses the pyplot state machine, so it draws on whatever figure/axes is current.
    """
    plt.plot(x, y, 'k-')
    plt.grid(alpha=0.3)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)


# Log of the monthly totals plotted against the month number
run_sequence_plot(a.index, log_new, title = 'log transformed data')

In [None]:
# Stationarity (ADF) test on the log-transformed monthly series
adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(log_new, regression='c')

# Each label on its own line, followed by its value
print('''''''adf value''''''', adf, sep='\n')
print('''''''p value''''''', pvalue, sep='\n')
print('''''''nobs''''''', nobs, sep='\n')
print('''''''critical value''''''', critical_values, sep='\n')

The p-value is still above the threshold. Hence the log transformation didn't remove the non-stationarity in this data. We can then try differencing.

It turns out we can transform this series into a stationary one by applying what's called a difference. It's a fancy term that simply means you're going to subtract a past value from a current value. An example will make this clear.

We know *lagged* was created with a lag of one. So let's subtract $O_{t-1}$ from $O_{t}$ where $O_{t}$ is the observed data at time *t* and $O_{t-1}$ is the observed data at *t-1*.

In [None]:
def difference(data, interval=1):
    """Return the lag-`interval` differences of `data` as a pandas Series.

    Element k of the result is ``data[k + interval] - data[k]``, where both
    elements are taken by position. The result has ``len(data) - interval``
    entries and a fresh 0-based index.

    Parameters
    ----------
    data : sequence or pandas Series of numbers
    interval : int, default 1
        Distance, in positions, between the two terms of each difference.
    """
    # Work on a positional snapshot of the values. Indexing a Series with
    # data[i] looks the integer up as an index *label*, which fails (or silently
    # picks the wrong element) when the index does not run 0..n-1.
    values = list(data)
    diff = [values[i] - values[i - interval] for i in range(interval, len(values))]
    return Series(diff) # Return the differenced values as a time series

In [None]:
new_a = difference(a) # difference the time series

# Two stacked panels: the original monthly totals on top, the differenced series below
plt.figure(figsize=(16,16))

#plot original data
plt.subplot(211)
plt.title('Original')
plt.xlabel('Month')
plt.ylabel('Units Sold')
plt.plot(a)

# plot the differenced data
plt.subplot(212)
plt.title('differenced data')
plt.xlabel('Month')
plt.ylabel('Units Sold')
plt.plot(new_a)

# Render the figure (an argument-less plt.plot() draws nothing)
plt.show()

In [None]:
# Stationarity (ADF) test on the differenced monthly series
adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(new_a, regression='c')

# Each label on its own line, followed by its value
print('''''''adf value''''''', adf, sep='\n')
print('''''''p value''''''', pvalue, sep='\n')
print('''''''nobs''''''', nobs, sep='\n')
print('''''''critical value''''''', critical_values, sep='\n')

We see that the adf value is strongly negative and the p-value is very small (approximately 0), well below the threshold; hence we reject the null, and differencing has transformed our data into a STATIONARY series.

In [None]:
# ACF of the monthly series after applying the first difference twice
# NOTE(review): the ADF test above was run on the once-differenced series (new_a);
# confirm that differencing a second time here is intended.
plot_acf(difference(difference(a)), title='Differencing ACF')
plt.show()

### What is Smoothing?
Any data collection process is subject to noise. Oftentimes this noise can obscure useful patterns. Smoothing is a well-known and oft used technique to extract those patterns. 

Smoothing comes in two flavors:
1. Simple 
2. Exponential 


###  Exponential Smoothing
Exponential smoothing is a way to weight observations differently. Specifically, recent observations are weighted moreso than more distant ones.

There are three key exponential smoothing techniques you need to be aware of:
1. Single Exponential Smoothing - no trend or seasonality
2. Double Exponential Smoothing - captures trend
3. Triple Exponential Smoothing - captures trend & seasonality

#### Single Exponential
This method is useful if your data lacks trend and seasonality and you want to approximately track patterns in your data. Furthermore, this method removes the lag associated with the moving average techniques discussed above. 

#### Double Exponential 
Should your data exhibit a trend, you'll want to use this smoothing method. It has all the benefits of Single Exponential with the ability to pickup on trend. 

#### Triple Exponential
Should your data exhibit trend and seasonality, you'll want to use this smoothing method. It has all the benefits of Double Exponential with the ability to pickup on seasonality. 

In [None]:
# Hold out the last `test_size` points of the differenced series for evaluation;
# everything before them is used for fitting.
test_size = 10
train_1, test_1 = new_a.iloc[:-test_size], new_a.iloc[-test_size:]

In [None]:
from statsmodels.tsa.api import ExponentialSmoothing

# Exponential smoothing with no trend and no seasonal component, fitted on the training part
smoothing_spec = ExponentialSmoothing(
    train_1,
    trend=None,
    seasonal=None,
    seasonal_periods=None,
)
model_1 = smoothing_spec.fit(optimized=True)

# Forecast as many steps as there are held-out points
preds_1 = model_1.forecast(len(test_1))

In [None]:
# Training data, held-out data and the smoothing forecast on one chart
plt.plot(train_1.index, train_1, 'b--', label="train")
plt.plot(test_1.index, test_1, color='orange', linestyle="--", label="test")
plt.plot(test_1.index, preds_1, 'r--', label="predictions")
plt.legend(loc='upper left')
# model_1 is fitted with trend=None and seasonal=None, i.e. single (simple)
# exponential smoothing, so the chart is titled accordingly
plt.title("Single Exponential Smoothing")
plt.grid(alpha=0.3);

When we are determining our ARIMA model we will come across the following standard inputs:
- order(p,d,q):
    - p is number of AR terms
    - d is number of times that we would difference our data
    - q is number of MA terms
    
When we work with SARIMA models 'S' refers to 'seasonal' and we have the additional standard inputs:
- seasonal order(P,D,Q,s):
    - P is number of AR terms in regards to seasonal lag
    - D is number of times that we would difference our seasonal lag (as seen above)
    - Q is number of MA terms in regards to seasonal lag
    - s is number of periods in a season

In [None]:
import statsmodels.api as sm
import warnings

# Try every (p, 0, q) order with p and q in 0..4 and keep the one with the lowest AIC
rng = range(5)
best_aic, best_model, best_order = np.inf, None, None

warnings.filterwarnings('ignore')

candidate_orders = [(i, 0, j) for i in rng for j in rng]
for order in candidate_orders:
    temp_model = sm.tsa.statespace.SARIMAX(train_1, order = order)
    temp_aic = temp_model.fit().aic
    if temp_aic < best_aic:
        # best_model keeps the model specification, not the fitted results
        best_aic, best_order, best_model = temp_aic, order, temp_model

print('Best AIC: %s | Best order: %s' % (best_aic, best_order))

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# SARIMA on the training part: order (0,0,1), seasonal order (0,1,0) with a
# 12-step season, and trend='t'
sar_spec = SARIMAX(
    train_1,
    order=(0,0,1),
    seasonal_order=(0,1,0,12),
    trend='t',
)
sar = sar_spec.fit()

sar.summary()

In [None]:
# Forecast as many steps as there are held-out points
sar_preds = sar.forecast(len(test_1))

# Training data, held-out data and the SARIMA forecast on one chart
plt.title("SARIMA modelling")
plt.grid(alpha=0.3)
plt.plot(train_1.index, train_1, 'b--', label="train")
plt.plot(test_1.index, test_1, color='orange', linestyle="--", label="test")
plt.plot(test_1.index, sar_preds, 'r--', label="predictions")
plt.legend(loc='upper left');

We see that the predictions made were quite close to the original test data.

In [None]:
# Refit the SARIMA specification on the full differenced series and forecast
# `forecast_steps` periods past its end (14 steps, matching the original 33..46 range).
# The values are forecasts of the differenced series, i.e. month-over-month changes.

from statsmodels.tsa.statespace.sarimax import SARIMAXResults

sarima = sm.tsa.statespace.SARIMAX(new_a, order = (0,0,1),trend = 't', seasonal_order=(0,1,0,12))
result = sarima.fit()

forecast_steps = 14
forecast_start = len(new_a)  # first position after the observed data
preds = result.predict(start = forecast_start, end = forecast_start + forecast_steps - 1)


#plot to visualize
ax = new_a.plot(label = 'Observed')
preds.plot(ax = ax, label = 'SARIMA_forecast')
plt.legend()
plt.title('Sales forecast')
ax.set_xlabel('Month')
ax.set_ylabel('Units Sold')
plt.show()


### Prophet Model

The model revolves around two main observations in the practice of creating a variety of business forecasts:
- Completely automatic forecasting techniques can be brittle and they are often too inflexible to incorporate useful assumptions or heuristics.
- Analysts who can produce high quality forecasts are quite rare because forecasting is a specialized data science skill requiring substantial experience.

Prophet is a generalized additive model that includes a number of highly advanced, intelligent forecasting methods:


- For trend, a piecewise linear or logistic growth curve trend is used. 
    - Prophet automatically detects changes in trends by selecting changepoints from the data.
- For seasonalities, different seasonality components are modeled using Fourier series.
- One can either use fb provided list or incorporate their own holidays into model.


In [None]:
# Prophet expects the dates as an ordinary column, so move the index back into the frame
base_n = base_new.reset_index()
print(base_n.shape)

In [None]:
# The package was renamed from `fbprophet` to `prophet`; try the current name
# first and fall back to the old one so the cell runs on either install.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# Prophet requires you to name your columns the following:
base_n.columns = ['ds','y']
prophet_model = Prophet(yearly_seasonality = True)
prophet_model.fit(base_n)

# Extend the frame by 12 future dates
# 'MS' = month start
# NOTE(review): the model is fitted on daily totals, so each of the 12 added rows
# is a prediction for a single day (the first of a month), not a monthly total —
# confirm this is the intended horizon/granularity.
future = prophet_model.make_future_dataframe(periods = 12, freq = 'MS')
forecast = prophet_model.predict(future)
forecast.head()

In [None]:
# Plot the Prophet forecast; the trailing ';' keeps the returned object from being echoed
prophet_model.plot(forecast);

In [None]:
# Plot the individual components of the forecast (trend and the fitted seasonalities);
# the trailing ';' keeps the returned object from being echoed
prophet_model.plot_components(forecast);

In [None]:
# Overlay the observed (differenced, monthly) series, the SARIMA forecast and a
# slice of the Prophet output on one set of axes.
ax = new_a.plot(label = 'Observed')
preds.plot(ax = ax, label = 'SARIMA forecast', alpha = 0.9, linestyle = '-')
# NOTE(review): `forecast` comes from a Prophet model fitted on the daily totals,
# so rows 33..45 of `yhat` look like in-sample daily fitted values rather than
# future monthly figures, and they are not differenced — confirm that this slice
# is comparable with `new_a` / `preds` before drawing conclusions from the chart.
forecast.yhat[33:46].plot(ax = ax, label = 'Prophet forecast', alpha = 0.9, linestyle = '-')

plt.legend()
plt.title('Sales')
ax.set_xlabel('Month')
ax.set_ylabel('Units Sold')
plt.show()

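It seems the SARIMA model did a better job at generalising. Keep in mind, however, that the Prophet curve above comes from a model fitted to daily, undifferenced sales, whereas SARIMA was fitted to the differenced monthly series, so the two curves are not on the same scale and the comparison is only indicative.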