In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found anywhere under the Kaggle input
# directory, then echo them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reading the datasets.
# All competition CSVs live in the same directory, so the directory is kept in
# one constant instead of being repeated in every read_csv call.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

sales_train = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
item_categories = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))

# Quick look at the training data: its dimensions, then the first rows
# (the last expression is rendered as the cell output).
print(sales_train.shape)
sales_train.head()


***The `item_price` and `item_cnt_day` columns contain negative values, which probably correspond to items returned to the shop, i.e. orders on which the shop made no sale. Hence, I will drop every row whose `item_price` or `item_cnt_day` is zero or negative.***

In [None]:
# Flag records whose daily count or price is zero or negative, then remove
# those records from sales_train in place.
non_positive_count = sales_train['item_cnt_day'] <= 0
non_positive_price = sales_train['item_price'] <= 0
rows_to_drop = sales_train[non_positive_count | non_positive_price].index
sales_train.drop(rows_to_drop, axis=0, inplace=True)

***convert the date to datetime object.***

In [None]:
# Parse the 'date' strings into real datetimes (dayfirst=True: the day is the
# first component of each string). While the column is left as text, grouping
# or sorting by 'date' orders the rows alphabetically rather than
# chronologically, which scrambles any time series built from it.
sales_train['date'] = pd.to_datetime(sales_train['date'], dayfirst=True)


**I will not be focusing on exploratory data analysis in this notebook; instead I will dive into time-series modelling and try to explain the concepts clearly.**

*In practice we can assume the series to be stationary if it has constant statistical properties over time and these properties can be:*

*• constant mean*

*• constant variance*

*• an auto co-variance that does not depend on time.*

*These details can be easily retrieved using stat commands in python.*

*The best way to understand stationarity in a time series is by eyeballing the plot:*

In [None]:
# Sum the daily counts per (date, month block, shop, item); `data` keeps that
# detailed breakdown.
data = (
    sales_train
    .groupby(["date", "date_block_num", "shop_id", "item_id"])["item_cnt_day"]
    .sum()
    .reset_index()
)

# Collapse to a single total per date. astype() returns a new Series, so the
# cast is chained into the assignment; calling it on its own line and
# discarding the result would leave `ts` unchanged.
ts = data.groupby(['date'])['item_cnt_day'].sum().astype('float')

plt.figure(figsize=(12,10))
plt.title('Total Sales of the item')
plt.xlabel('Month-Year')
plt.ylabel('Quantity of Sales')
plt.plot(ts)

**We can see a decreasing trend from the above line plot.** 
 ***In order to apply a time series model, it is important for the Time series to be stationary; in other words all its statistical properties (mean,variance) remain constant over time***

Performing the Dickey-Fuller Test

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    rolmean = timeseries.rolling(window=12,center=False).mean()
    rolstd = timeseries.rolling(window=12,center=False).std()#window=12, because of yearly trend for both mean and variance
#Plot rolling statistics:
    plt.figure(figsize=(15,10))
    plt.plot(timeseries, color='blue',label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()
    #Perform Dickey-Fuller test:
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)

In [None]:
# Run the rolling-statistics plot and the Dickey-Fuller summary on the sales series.
test_stationarity(timeseries=ts)

***If the test statistic is greater than the critical value, the data is not stationary.***
***There are two major factors that make a time series non-stationary. They are:***

***• Trend: non-constant mean***

***• Seasonality: Variation at specific time-frames***
***Trend:***
***The first step is to reduce the trend using transformation, as we can see here that there is a strong positive trend. These transformation can be log, sq-rt, cube root etc . Basically it penalizes larger values more than the smaller. In this case we will use the logarithmic transformation.***

We will not be doing this, as the test statistic in our case is less than all the critical values.


In [None]:
# NOTE: this whole cell is disabled - every line below is commented out, so
# nothing here runs. If it is re-enabled, the final call has to pass a
# dataset: encoding_categorical is declared with one required parameter but
# is called with none.
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# def encoding_categorical(dataset):
#     categorical_columns=['shop_id','item_id']
#     for column in categorical_columns:
#         dataset[str(column)]=le.fit_transform(dataset[str(column)])
#     return dataset
# complete_data=encoding_categorical()



**Decomposing:**
***we model both the trend and the seasonality, then the remaining part of the time series is returned.***

In [None]:
# Single-column DataFrame view of the sales series, used as input for the decomposition.
ts_data = ts.to_frame()


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Split the series into trend, seasonal and residual components.
# NOTE(review): period=100 is kept unchanged from the original analysis -
# confirm it matches the seasonality actually expected in the data.
decomposition = seasonal_decompose(ts_data,period=100)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per component. Each line carries a label, and legend() is
# what actually draws it - without that call the labels are never displayed.
plt.figure(figsize=(15,8))
panels = [
    (411, ts, 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonal'),
    (414, residual, 'Residual'),
]
for position, component, label in panels:
    plt.subplot(position)
    plt.plot(component, label=label)
    plt.legend(loc='best')
plt.tight_layout()

**Forecasting a Time Series**:

Now that we have made the Time series stationary, let’s make models on the time series using differencing because it is easy to add the error , trend and seasonality back into predicted values .
We will use statistical modelling method called ARIMA to forecast the data where there are dependencies in the values.
Auto Regressive Integrated Moving Average (ARIMA) — It is like a linear regression equation where the predictors depend on the parameters (p, d, q) of the ARIMA model.


• p : This is the number of AR (Auto-Regressive) terms . Example — if p is 3 the predictor for y(t) will be y(t-1),y(t-2),y(t-3).


• q : This is the number of MA (Moving-Average) terms, i.e. lagged forecast errors. Example — if q is 3 the predictors for y(t) will be e(t-1), e(t-2), e(t-3), where e(t-k) is the forecast error at time t-k.


• d :This is the number of differences or the number of non-seasonal differences .


Now let’s check out on how we can figure out what value of p and q to use. We use two popular plotting techniques; they are:


• Autocorrelation Function (ACF): It measures the correlation between the time series and a lagged version of itself. For example, at lag 4, the ACF compares the series at time instants t1…t2 with the series at instants t1–4…t2–4.


• Partial Autocorrelation Function (PACF): It measures the degree of association between y(t) and y(t-p) after removing the effect already explained by the intermediate lags y(t-1), …, y(t-p+1).


In [None]:
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import acf,pacf

# Correlogram values for the series: autocorrelation (computed without FFT)
# and partial autocorrelation (OLS method).
lag_acf = acf(ts, fft=False)
lag_pacf = pacf(ts, method='ols')

# Half-width of the dashed band drawn around zero in both panels.
band = 1.96/np.sqrt(len(ts))

plt.figure(figsize=(11,8))
for position, values, heading in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.subplot(position)
    plt.plot(values)
    # Zero line first, then the lower and upper edges of the band.
    for level in (0, -band, band):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(heading)
plt.tight_layout()

The two dotted lines on either sides of 0 are the confidence intervals. These can be used to determine the ‘p’ and ‘q’ values as:


• p: The lag at which the PACF first crosses the upper confidence interval; here it is close to 2, hence p = 2.


• q: The lag at which the ACF first crosses the upper confidence interval; here it is close to 2, hence q = 2.

In [None]:
# statsmodels.tsa.arima_model.ARIMA has been retired (recent statsmodels
# releases raise NotImplementedError when it is instantiated), so the
# maintained implementation is imported here and used instead.
from statsmodels.tsa.arima.model import ARIMA

# ARIMA order (p=2, d=0, q=2): two AR terms, no differencing, two MA terms.
model = ARIMA(ts, order=(2,0,2))
# The maintained fit() has no `disp` argument, so it is called without it.
result = model.fit()

# Overlay the in-sample fitted values on the observed series; legend() makes
# the two labels visible.
plt.plot(ts, label="Original")
plt.plot(result.fittedvalues, color='red', label="Predicted")
plt.legend(loc='best')

In [None]:
# In-sample forecast errors, paired by position. Working on the underlying
# arrays avoids integer-key lookups (ts[i]) on a label-indexed Series, which
# newer pandas versions deprecate.
residuals = np.asarray(ts) - np.asarray(result.fittedvalues)
forecast_errors = residuals.tolist()
bias = residuals.mean()
print('Bias: %f' % bias)

# Read the test set from the same absolute input directory as the other
# competition files instead of a path relative to the working directory.
test = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/test.csv")

# Fitted values as a two-column frame: the former index and the fitted value.
predictions = pd.DataFrame(result.fittedvalues).reset_index()
predictions.columns = ["date", "predictions"]
predictions.head()  # Monthly sales forecasting