In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf


from warnings import filterwarnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings from pandas/statsmodels; remove it while
# debugging or upgrading libraries.
filterwarnings('ignore')

In [None]:
# Every competition CSV lives in the same read-only Kaggle input directory;
# keeping that path in one place makes it easy to run the notebook elsewhere.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

item_cat = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
item = pd.read_csv(f'{DATA_DIR}/items.csv')
sales_train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# -I.DATA UNDERSTANDING-

In [None]:
# Preview the first rows of the item-category table.
item_cat.head()

In [None]:
# Preview the first rows of the items table.
item.head()

In [None]:
# Preview the first rows of the raw sales records.
sales_train.head()

In [None]:
# Preview the first rows of the shops table.
shops.head()

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# (rows, columns) of the sales records.
sales_train.shape

In [None]:
# Column dtypes and non-null counts; shows how the `date` column was read in.
sales_train.info()

In [None]:
# Parse the date strings with an explicit day-first format. The competition's
# sales_train.csv writes dates as DD.MM.YYYY (e.g. "02.01.2013" = 2 Jan 2013;
# verify against sales_train.head()). Without a format, pandas guesses
# month-first whenever the day is <= 12, silently swapping day and month for
# those rows and corrupting the day/month columns and daily aggregates.
sales_train["date"] = pd.to_datetime(sales_train["date"], format="%d.%m.%Y")

In [None]:
# Check the converted `date` column.
sales_train.head()

In [None]:
import datetime as dt
# Split the parsed `date` column into calendar parts via the `.dt` accessor,
# appended in the order day, month, year.
sales_train = sales_train.assign(
    day=sales_train['date'].dt.day,
    month=sales_train['date'].dt.month,
    year=sales_train['date'].dt.year,
)

In [None]:
# Check the new day / month / year columns.
sales_train.head()

## Monthly Sales

In [None]:
# One row per (month block, shop, item): mean price and total units sold.
monthly_keys = ["date_block_num", "shop_id", "item_id"]
monthly_sales = sales_train.groupby(monthly_keys).agg(
    item_price=("item_price", "mean"),
    item_cnt_day=("item_cnt_day", "sum"),
)

monthly_sales.head()

## Overall Daily Sales

In [None]:
# Total units sold across all shops and items for each calendar date.
allover_sales_by_date = sales_train.groupby("date")["item_cnt_day"].sum()

daily_plot_options = dict(
    xlabel="Days",
    ylabel="Sales",
    title="Allover Sales by Date",
    figsize=(26, 8),
)
allover_sales_by_date.plot.line(**daily_plot_options);

# -II.DATA ANALYSIS-

# 1. Types of Time-Series

Time-series are of generally two types:

    * Additive Time-Series: Additive time-series is time-series where components (trend, seasonality, noise) are added to generate time series.
    
        Time-Series = trend + seasonality + noise
        
    * Multiplicative Time-Series: Multiplicative time-series is time-series where components (trend, seasonality, noise) are multiplied to generate time series. One can notice an increase in the amplitude of seasonality in multiplicative time-series.
    
        Time-Series = trend * seasonality * noise



### MoM Total Daily Sales (continuous / time series)

In [None]:
# Total units sold per month block. `date_block_num` is the grouping key, so
# each point on the line is one month, not one day.
mom_sales = sales_train.groupby(["date_block_num"])["item_cnt_day"].sum()

mom_sales.plot(kind="line",
               xlabel="Month index (date_block_num)",  # was "Days", but the x-axis is the month counter
               ylabel="Sales",
               title="Month-over-Month Total Daily Sales",
               figsize=(16, 8));

# 2. Trend, Seasonality and Stationary


## a.Trend:

    The trends represent an increase or decrease in time-series value over time. If we notice that the value of measurement over time is increasing or decreasing then we can say that it has an upward or downward trend.
    
    How to remove trend from time-series data?

    There are various ways to de-trend a time series. We have explained a few below.

    1. Log Transformation.
    2. Power Transformation.
    3. Local Smoothing - Applying moving window functions to time-series data.
    4. Differencing a time-series.
    5. Linear Regression.
    
    
## b.Seasonality:  

    The seasonality represents variations in measured value which repeats over the same time interval regularly. If we notice that particular variations in value are happening every week, month, quarter or half-yearly then we can say that time series has some kind of seasonality.
    
    How to remove seasonality from time-series data?
    
    Average de-trended values.
    Differencing a time-series.
    Use the loess method.

    (There are various ways to remove seasonality. The task of removing seasonality is a bit complicated. We have explained a few ways below to remove seasonality.)

    
    
## c.Stationary 
    
    If a time series shows an upward or downward trend, swings whose amplitude changes over time (a multiplicative pattern), or otherwise irregular variation, it is a non-stationary time series. 
    To proceed with the analysis, we need to convert it to a stationary time series.
    
    There are multiple tests that can be used to check stationarity as well.

    1. ADF (Augmented Dickey-Fuller test)
    2. KPSS
    3. PP (Phillips-Perron test)

    



- Let's check the trend, seasonality and stationarity of our time series by visualizing its rolling mean and rolling standard deviation.


In [None]:
# Rolling 12-month mean (left) and standard deviation (right) of monthly sales.
# With the default min_periods, the first 11 points of each curve are NaN.
rolling_12m = mom_sales.rolling(window=12)

# Explicit axes instead of the pyplot state machine, so each series is
# guaranteed to land on its intended panel.
fig, (mean_ax, std_ax) = plt.subplots(1, 2, figsize=(25, 5))
rolling_12m.mean().plot(ax=mean_ax,
                        color="tab:blue",
                        title="Rolling Mean Over 12 Month Period",
                        legend=True)
# The plotted quantity is .std(), so the title says standard deviation
# (it previously said "Variance").
rolling_12m.std().plot(ax=std_ax,
                       color="tab:orange",
                       title="Rolling Standard Deviation Over 12 Month Period",
                       legend=True);

The rolling mean shows a descending, additive trend, whereas the rolling standard deviation shows an ascending, multiplicative-looking trend.

# 3. Decompose Time Series To Its Components

    There are two common decomposition models: "multiplicative" and "additive". We will use the additive model, on the assumption that the series has no multiplicative behaviour.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the monthly series with a 12-month seasonal cycle.
# `period` replaces the `freq` keyword, which statsmodels deprecated and later
# removed; `mom_sales` has a plain integer index, so the period must be passed
# explicitly.
decompose_result = seasonal_decompose(mom_sales, period=12, model="additive")
trend = decompose_result.trend
seasonal = decompose_result.seasonal
residual = decompose_result.resid

decompose_result.plot();

## 4. Stationarity Testing with Dickey-Fuller 

In [None]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries):
    """Run the Augmented Dickey-Fuller test on `timeseries` and print a summary.

    The printed Series holds the test statistic, the p-value, the number of
    lags used, the number of observations used, and one critical value per
    significance level returned by `adfuller`. Nothing is returned.
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')

    labels = ['1.Test Statistic','2.p-value','3.Lags Used','4.Number of Observations Used']
    summary = dict(zip(labels, adf_result[0:4]))
    # Element 4 of the adfuller result maps significance level -> critical value.
    for level, critical_value in adf_result[4].items():
        summary['5.Critical Value (%s)'%level] = critical_value

    dfoutput = pd.Series(summary)
    print (dfoutput)

# ADF test on the raw monthly sales series.
test_stationarity(mom_sales)

    We can interpret above results based on p-values of result.
    1. p-value > 0.05 - This implies that time-series is non-stationary.
    2. p-value <=0.05 - This implies that time-series is stationary
                
    Since P-value is greater than 0.05, our time-series is not stationary. It has time-dependent components present that we need to remove.                

# -III.DATA PREPROCESSING-

## Remove Trend and Seasonality

    There are various ways like differencing, power transformation, log transformation, etc. to remove trends from data as we have discussed above. 
    
    We'll practice differencing.

In [None]:
# create a differencing series to remove trend

def difference(dataset, interval=1):
    """Return the lag-`interval` differences of `dataset` as a new Series.

    Element k of the result is dataset[k + interval] - dataset[k], taken by
    position. `interval=1` removes a linear trend; `interval=12` on monthly
    data removes a yearly seasonal pattern.

    Parameters
    ----------
    dataset : pandas.Series, list or array-like
        Ordered observations. A Series is read positionally, so its index
        (dates, non-zero-based integers, ...) does not affect the result.
    interval : int, default 1
        Lag between the two observations being subtracted.

    Returns
    -------
    pandas.Series
        The differenced values with a fresh 0..n-1 index; `interval` elements
        shorter than the input.
    """
    # `dataset[i]` on a Series is a label lookup when the index holds integers,
    # so it only worked for an index that was exactly 0..n-1. Going through the
    # underlying array makes every lookup positional.
    if isinstance(dataset, pd.Series):
        values = dataset.to_numpy()
    else:
        values = dataset
    diff = [values[i] - values[i - interval] for i in range(interval, len(values))]
    return pd.Series(diff)


In [None]:
# Monthly total sales as floats. The original `ts.astype('float')` discarded
# its result, so the cast is now assigned.
ts = sales_train.groupby(["date_block_num"])["item_cnt_day"].sum().astype('float')

# Lag-1 differencing removes the trend; lag-12 differencing removes a yearly
# seasonal pattern (assuming the seasonality is 12 months long).
detrended_ts = difference(ts)
# `new_ts` keeps the seasonally differenced series because later cells
# (stationarity test, ARMA order search) read it under this name.
new_ts = difference(ts, 12)

fig, axes = plt.subplots(3, 1, figsize=(16, 16))
panels = [
    ('Original Time Series', ts),
    ('After Removing Trend', detrended_ts),
    ('After Removing Seasonality', new_ts),
]
for ax, (panel_title, series) in zip(axes, panels):
    ax.plot(series)
    ax.set_title(panel_title)
    ax.set_xlabel('Time')
    ax.set_ylabel('Sales')

In [None]:
# now testing the stationarity again after removing trend and seasonality

# ADF test on `new_ts`, the differenced series produced by the previous cell.
test_stationarity(new_ts)


    Now after the transformations, our p-value for the Dickey Fuller Test is within 5 %. So we can assume Stationarity of the series.

# -IV.PREDICTIVE ANALYSIS / FORECASTING- 

## AR, MA and ARMA (Autoregressive Moving Average) models:

In [None]:
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

def tsplot(y, lags=None, figsize=(10, 8), style='bmh',title=''):
    """Draw a five-panel diagnostic figure for a time series.

    Panels: the series itself (full-width top row), ACF and PACF (middle row),
    a QQ plot and a probability plot (bottom row).

    Parameters
    ----------
    y : pandas.Series or array-like
        The series to plot; anything that is not a Series is wrapped in one.
    lags : int or None
        Passed through to the ACF and PACF plots.
    figsize : tuple
        Figure size handed to `plt.figure`.
    style : str
        Matplotlib style applied only while this figure is drawn.
    title : str
        Title of the top panel.

    Returns
    -------
    None
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)
        grid = (3, 2)
        series_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        autocorr_ax = plt.subplot2grid(grid, (1, 0))
        partial_ax = plt.subplot2grid(grid, (1, 1))
        quantile_ax = plt.subplot2grid(grid, (2, 0))
        prob_ax = plt.subplot2grid(grid, (2, 1))

        series.plot(ax=series_ax)
        series_ax.set_title(title)

        # alpha=0.5 is forwarded to statsmodels for the confidence bands.
        smt.graphics.plot_acf(series, lags=lags, ax=autocorr_ax, alpha=0.5)
        smt.graphics.plot_pacf(series, lags=lags, ax=partial_ax, alpha=0.5)

        sm.qqplot(series, line='s', ax=quantile_ax)
        quantile_ax.set_title('QQ Plot')
        scs.probplot(series, sparams=(series.mean(), series.std()), plot=prob_ax)

        plt.tight_layout()
    return None

In [None]:
# Simulate an AR(1) process x_t = a * x_{t-1} + w_t with a = 0.6
np.random.seed(1)
n_samples = int(1000)
a = 0.6

# Keep the noise and the simulated series in separate arrays. The original
# `x = w = ...` bound both names to one array, so every write to x also
# overwrote w.
w = np.random.normal(size=n_samples)
x = np.zeros(n_samples)

# Start the recursion at t = 1. Starting at t = 0 read x[-1], which wraps
# around to the last noise sample and leaks it into the first value.
x[0] = w[0]
for t in range(1, n_samples):
    x[t] = a*x[t-1] + w[t]
limit=12
_ = tsplot(x, lags=limit,title="AR(1)process")

## AR(1) process -- has ACF tailing out and PACF cutting off at lag=1

In [None]:
# Simulate an AR(2) process with coefficients 0.444 and 0.333
n = 1000
alphas = np.array([.444, .333])
betas = np.array([0.])  # no MA terms for a pure AR(p) model

# arma_generate_sample takes lag-polynomial coefficients: the zero-lag term
# (1) must be given explicitly, and the AR coefficients are negated.
# For more information see the examples at statsmodels.org
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betas))

ar2 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
_ = tsplot(ar2, lags=12,title="AR(2) process")

## AR(2) process -- has ACF tailing out and PACF cutting off at lag=2

In [None]:
# Simulate an MA(1) process with coefficient 0.8
n = 1000
alphas = np.array([0.])   # no AR terms for a pure MA(q) model
betas = np.array([0.8])

# Lag polynomials: explicit zero-lag 1, AR coefficients negated.
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betas))

ma1 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
limit = 12
_ = tsplot(ma1, lags=limit,title="MA(1) process")

## MA(1) process -- has ACF cut off at lag=1

In [None]:
# Simulate an MA(2) process with betas 0.6 and 0.4
n = 1000
alphas = np.array([0.])   # no AR terms
betas = np.array([0.6, 0.4])

# Lag polynomials: explicit zero-lag 1, AR coefficients negated.
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betas))

ma3 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
_ = tsplot(ma3, lags=12,title="MA(2) process")

## MA(2) process -- has ACF cut off at lag=2

In [None]:
# Simulate an ARMA(2, 2) model with alphas=[0.8, -0.65] and betas=[0.5, -0.7]
max_lag = 12

n = 5000        # lots of samples to help estimates
burn = n // 10  # number of initial samples discarded by the generator

alphas = np.array([0.8, -0.65])
betas = np.array([0.5, -0.7])

# Lag polynomials: explicit zero-lag 1, AR coefficients negated.
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betas))

arma22 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n, burnin=burn)
_ = tsplot(arma22, lags=max_lag,title="ARMA(2,2) process")

- It's not very clear or straightforward. Let's use a systematic approach to find the order of the AR and MA processes.

In [None]:
# Grid-search ARMA(p, q) orders for the simulated ARMA(2,2) series and keep
# the fit with the smallest AIC.
best_aic = np.inf
best_order = None
best_mdl = None
failed_orders = []  # (order, error) for every candidate that could not be fitted

rng = range(5)
for i in rng:
    for j in rng:
        try:
            tmp_mdl = smt.ARMA(arma22, order=(i, j)).fit(method='mle', trend='nc')
            tmp_aic = tmp_mdl.aic
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_order = (i, j)
                best_mdl = tmp_mdl
        except Exception as exc:
            # Some orders legitimately fail to converge, so the search stays
            # best-effort. Unlike a bare `except:`, this lets KeyboardInterrupt
            # through and records what went wrong instead of hiding it.
            failed_orders.append(((i, j), repr(exc)))
            continue


print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
if failed_orders:
    # If every order failed (best_order is None), the first recorded error
    # usually points at the real cause, e.g. an API that is not available.
    print('{} order(s) could not be fitted; first failure: {}'.format(len(failed_orders), failed_orders[0]))

## We've correctly identified the order of the simulated process as ARMA(2,2)

     Let's use it for the sales time series.

In [None]:
# Grid-search ARMA(p, q) orders for the differenced sales series `new_ts` and
# keep the fit with the smallest AIC.
best_aic = np.inf
best_order = None
best_mdl = None
failed_orders = []  # (order, error) for every candidate that could not be fitted

rng = range(5)
for i in rng:
    for j in rng:
        try:
            tmp_mdl = smt.ARMA(new_ts.values, order=(i, j)).fit(method='mle', trend='nc')
            tmp_aic = tmp_mdl.aic
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_order = (i, j)
                best_mdl = tmp_mdl
        except Exception as exc:
            # Some orders legitimately fail to converge, so the search stays
            # best-effort. Unlike a bare `except:`, this lets KeyboardInterrupt
            # through and records what went wrong instead of hiding it.
            failed_orders.append(((i, j), repr(exc)))
            continue


print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
if failed_orders:
    # If every order failed (best_order is None), the first recorded error
    # usually points at the real cause, e.g. an API that is not available.
    print('{} order(s) could not be fitted; first failure: {}'.format(len(failed_orders), failed_orders[0]))

### Simply use best_mdl.predict() to predict the next values

In [None]:
# adding the dates to the Time-series as index
# Build a two-column frame: month-start dates (column `index`) and the monthly
# sales totals (column `item_cnt_day`).
monthly_totals = sales_train.groupby("date_block_num")["item_cnt_day"].sum()
month_starts = pd.date_range(start='2013-01-01', end='2015-10-01', freq='MS')
ts = pd.DataFrame({
    "index": month_starts,
    monthly_totals.name: monthly_totals.to_numpy(),
})
ts.head()

## Prophet:

    Recently open-sourced by Facebook research. It's a very promising tool, that is often a very handy and quick solution to the frustrating flatline.
    
    Sure, one could argue that with proper pre-processing and carefully tuning the parameters the above graph would not happen.

    But the truth is that most of us have neither the patience nor the expertise to make it happen.

    Also, in most practical scenarios there are often many time series that need to be predicted. E.g. this competition: it requires us to predict next month's sales for each store-item combination, of which there could be thousands (i.e. predict 1000s of parameters!).

    Another neat functionality is that it follows the typical sklearn syntax.

    At its core, the Prophet procedure is an additive regression model with four main components:

    A piecewise linear or logistic growth curve trend. Prophet automatically detects changes in trends by selecting changepoints from the data.
    A yearly seasonal component modeled using Fourier series.
    A weekly seasonal component using dummy variables.
    A user-provided list of important holidays.

   Resources for learning more about prophet:

   https://www.youtube.com/watch?v=95-HMzxsghY \
   https://facebook.github.io/prophet/docs/quick_start.html#python-api \
   https://research.fb.com/prophet-forecasting-at-scale/ \
   https://blog.exploratory.io/is-prophet-better-than-arima-for-forecasting-time-series-fa9ae08a5851


In [None]:
from fbprophet import Prophet
# Prophet requires a pandas DataFrame in the layout below
# (date column named `ds`, value column named `y`).
ts.columns=['ds','y']
model = Prophet( yearly_seasonality=True) # instantiate Prophet with yearly seasonality switched on, since the data is monthly
model.fit(ts) # fit the model on the monthly totals

In [None]:
# Predict five months into the future; 'MS' (month start) is the frequency
future = model.make_future_dataframe(periods = 5, freq = 'MS')  
# Run the forecast over the `future` frame
forecast = model.predict(future)
# Show the last rows: point forecast plus lower/upper uncertainty bounds
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Plot the fitted values and the forecast.
model.plot(forecast)

In [None]:
# Plot the individual components of the forecast.
model.plot_components(forecast)

    The trend and seasonality from Prophet look similar to the ones that we had earlier using the traditional methods.

## Hierarchical time series:

In [None]:
# Month-start dates for the training period; `dates` is reused by later cells.
dates = pd.date_range(start='2013-01-01', end='2015-10-01', freq='MS')

# Total sales per month block, re-indexed by calendar month.
total_sales = sales_train.groupby("date_block_num")["item_cnt_day"].sum()
total_sales.index = dates
total_sales.head()

## Middle out:

    Let's predict for the store level

In [None]:
# Wide table of monthly sales: one row per month, one column per shop.
# Months in which a shop has no sales records are filled with 0.
shop_month_totals = sales_train.groupby(["date_block_num", "shop_id"])["item_cnt_day"].sum()
monthly_shop_sales = (
    shop_month_totals
    .unstack(level=1)   # shops become columns
    .fillna(0)
    .set_index(dates)   # replace date_block_num with month-start dates
    .reset_index()      # dates become the first column, named `index`
)
monthly_shop_sales.head()

In [None]:
import time
start_time=time.time()  # wall-clock start of the forecasting loop

# Calculating the base forecasts using prophet
# From the HTSprophet package -- https://github.com/CollinRooney12/htsprophet/blob/master/htsprophet/hts.py
forecastsDict = {}

# One node per shop column. Column 0 of monthly_shop_sales is the date, so the
# node count is the column count minus one. The original looped over
# len(monthly_shop_sales), which is the number of month ROWS, so it only
# forecast as many shops as there are months and skipped the rest.
n_nodes = monthly_shop_sales.shape[1] - 1
for node in range(n_nodes):
    # take the date column and the column to be forecasted
    nodeToForecast = monthly_shop_sales.iloc[:, [0, node + 1]].copy()
    # rename for prophet compatibility
    nodeToForecast.columns = ['ds', 'y']
    growth = 'linear'
    m = Prophet(growth, yearly_seasonality=True)
    m.fit(nodeToForecast)
    # extend the history by one month-start period and predict over it
    future = m.make_future_dataframe(periods = 1, freq = 'MS')
    forecastsDict[node] = m.predict(future)

In [None]:
# Stack every node's `yhat` column side by side into one 2-D array:
# rows are forecast dates, column k is the node stored under key k.
# A single column_stack replaces np.concatenate inside a loop, which re-copied
# the whole matrix on every iteration.
# NOTE: raises ValueError if forecastsDict is empty.
nCols = len(forecastsDict) + 1  # kept from the original: node count plus one
predictions = np.column_stack(
    [np.asarray(forecastsDict[key].yhat) for key in range(nCols - 1)]
)

In [None]:
# Last row of the matrix: the forecast for the final date, one value per node.
prediction=predictions[-1]
prediction