In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pmdarima

In [None]:
import datetime
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
import pmdarima as pm


In [None]:
# Load every competition table from the read-only input directory.
_competition_dir = '../input/competitive-data-science-predict-future-sales'
sales = pd.read_csv(f'{_competition_dir}/sales_train.csv')
test = pd.read_csv(f'{_competition_dir}/test.csv')
shops = pd.read_csv(f'{_competition_dir}/shops.csv')
items = pd.read_csv(f'{_competition_dir}/items.csv')
item_cats = pd.read_csv(f'{_competition_dir}/item_categories.csv')


In [None]:
# Convert the raw date column to datetimes; dayfirst=True makes ambiguous
# values be interpreted with the day before the month.
parsed_dates = pd.to_datetime(sales['date'], dayfirst=True)
sales['date'] = parsed_dates

In [None]:
# Monthly aggregate for each (month block, shop, item) combination:
# first/last sale date, mean price and total units sold in that month.
#
# * "date" must NOT be a grouping key: grouping by it yields one row per day
#   (not per month) and a 4-level index, which the 3-element index lookup
#   further down in the notebook could never match.
# * The columns are selected with a list; selecting several columns with a
#   bare tuple of names is rejected by pandas >= 2.0.
monthly_sales_df = (
    sales
    .groupby(["date_block_num", "shop_id", "item_id"])[["date", "item_price", "item_cnt_day"]]
    .agg({'date': ['min', 'max'], 'item_price': 'mean', 'item_cnt_day': 'sum'})
)

In [None]:
# Count the non-null entries of every `items` column within each category id.
item_sold_per_cat = items.groupby(by='item_category_id').count()

In [None]:
# Display the aggregated row(s) whose index entry equals this key.
lookup_key = (33, 59, 21427)
key_mask = monthly_sales_df.index == lookup_key
monthly_sales_df.loc[key_mask]

In [None]:
# Sum every numeric column per month block. numeric_only=True is required
# because `date` has been converted to datetime64 above: pandas >= 2.0 no
# longer drops non-summable columns silently and raises a TypeError instead.
item_sold_per_month = sales.groupby(["date_block_num"]).sum(numeric_only=True)

In [None]:
# Bar chart of how many catalogue items belong to each category.
# `item_sold_per_cat` is a row count of the `items` table per category, so the
# bars show catalogue size, not units sold; the labels say so explicitly.
plt.figure(figsize=(16,8))
items_per_category = item_sold_per_cat['item_id']
# Scale the counts to [0, 1] so the largest category maps to the top of the colormap.
data_color_normalized = (items_per_category / items_per_category.max()).to_numpy()
# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and
# absent from recent Matplotlib releases.
my_cmap = plt.get_cmap('plasma')
colors = my_cmap(data_color_normalized)
plt.bar(item_sold_per_cat.index, items_per_category, color=colors)
plt.title("Number of items per category")
plt.xlabel('category_id')
plt.ylabel('Number of items in the category')

In [None]:
# Line plot of the summed item_cnt_day value for every month block.
monthly_fig, monthly_ax = plt.subplots(figsize=(16, 8))
monthly_ax.plot(item_sold_per_month['item_cnt_day'], 'bo-')
monthly_ax.set_title("Monthly sale of the Company")
monthly_ax.set_xlabel("Date Block number")
monthly_ax.set_ylabel("Total amount of sale")

In [None]:
# Additive decomposition of the monthly totals with a 12-month period.
result = seasonal_decompose(item_sold_per_month['item_cnt_day'], model='additive', period=12)
# DecomposeResult.plot() builds and returns its own figure, so resize that
# figure; calling plt.figure(figsize=...) first only opens an empty extra one.
decomposition_fig = result.plot()
decomposition_fig.set_size_inches(16, 8)
plt.show()

In [None]:
# Multiplicative decomposition of the monthly totals with a 12-month period.
result = seasonal_decompose(item_sold_per_month['item_cnt_day'], model='multiplicative', period=12)
# DecomposeResult.plot() builds and returns its own figure, so resize that
# figure; calling plt.figure(figsize=...) first only opens an empty extra one.
decomposition_fig = result.plot()
decomposition_fig.set_size_inches(16, 8)
plt.show()

## Check for stationarity
The data set appears to be non-stationary, because it shows both trend and seasonality.
There are several ways to check whether a time series is stationary:
1. Look at plots: review a plot of the time series for any visible trend or seasonality.
2. Summary statistics: compare the mean and variance of different parts of the time series.
3. Statistical tests: use a hypothesis test (such as the augmented Dickey-Fuller test) to check whether the null hypothesis of non-stationarity can be rejected.

In [None]:
# Monthly item_cnt_day totals as a plain NumPy array (the index is dropped).
monthly_sales = item_sold_per_month['item_cnt_day'].to_numpy()

In [None]:
# Display the full per-month aggregate table for inspection.
item_sold_per_month

## Here we split the dataset into two spans and compare their means and variances; if they are not similar, we can conclude that the series is non-stationary

In [None]:
# Split the series into two halves and compare their mean and variance;
# clearly different values point to a non-stationary series.
split = len(monthly_sales) // 2
print(split)
x1, x2 = monthly_sales[:split], monthly_sales[split:]
mean1, mean2 = x1.mean(), x2.mean()
var1, var2 = x1.var(), x2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
# The values must be applied with the % operator; passing them as a second
# print() argument would emit the raw format string followed by the tuple.
print('variance1=%f, variance2=%f' % (var1, var2))

### Obviously, we have a non-stationary time series. Let's check with a statistical test; here the **augmented Dickey-Fuller test** is used

In [None]:
# Augmented Dickey-Fuller test on the monthly totals: report the test
# statistic, the p-value and the critical values returned by adfuller().
adfuller_res = adfuller(monthly_sales)
adf_statistic = adfuller_res[0]
adf_p_value = adfuller_res[1]
adf_critical_values = adfuller_res[4]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_p_value)
print('Critical Values:')
for level, threshold in adf_critical_values.items():
    print('\t%s: %.3f' % (level, threshold))

#### The ADF test statistic is -2.4. The more negative this value, the stronger the evidence against the null hypothesis (that the time series has a unit root, i.e. is non-stationary).
#### In the output, the value of -2.4 is greater than the critical values, so we fail to reject the null hypothesis.
### To make the distribution of values more linear and better meet the expectations of the statistical test, we'll use a log transform

In [None]:
# Repeat the augmented Dickey-Fuller test on the natural log of the series.
monthly_sales_log = np.log(monthly_sales)
adfuller_res_log = adfuller(monthly_sales_log)
log_statistic = adfuller_res_log[0]
log_p_value = adfuller_res_log[1]
log_critical_values = adfuller_res_log[4]
print('ADF Statistic: %f' % log_statistic)
print('p-value: %f' % log_p_value)
print('Critical Values:')
for level, threshold in log_critical_values.items():
    print('\t%s: %.3f' % (level, threshold))

### The log transform does not make the time series stationary

## Automatic differencing using pandas

In [None]:
# Preview the first rows of the per-month aggregate table.
item_sold_per_month.head()

In [None]:
# Rebind monthly_sales to the indexed Series (it was a bare array above),
# take the difference between consecutive months and plot it.
monthly_sales = item_sold_per_month['item_cnt_day']
monthly_sales_diff = monthly_sales.diff(periods=1)
plt.plot(monthly_sales_diff)



In [None]:
# The first element of a differenced series is NaN because it has no
# predecessor. Drop it rather than filling it: filling with the mean of the
# *level* series would insert a value on a completely different scale into the
# differences and distort the stationarity test that follows.
# NOTE(review): re-check the ADF statistic quoted in the markdown below after
# re-running, since the test input changes by one observation.
monthly_sales_diff = monthly_sales_diff.dropna()

In [None]:
# Augmented Dickey-Fuller test on the differenced series.
adfuller_res = adfuller(monthly_sales_diff.values)
diff_statistic = adfuller_res[0]
diff_p_value = adfuller_res[1]
diff_critical_values = adfuller_res[4]
print('ADF Statistic: %f' % diff_statistic)
print('p-value: %f' % diff_p_value)
print('Critical Values:')
for level, threshold in diff_critical_values.items():
    print('\t%s: %.3f' % (level, threshold))

## Here the test statistic is negative enough to reject the null hypothesis.
## After differencing, the test statistic of -8 is less than the critical value of -3.6 at the 1% level

# ARIMA Model

In [None]:
# Compare the series and its autocorrelation before and after seasonal
# differencing. diff(periods=12) subtracts the value 12 months earlier, i.e.
# it is a lag-12 (seasonal) difference, not a first-order difference, so the
# panel titles name it accordingly. Each application leaves 12 leading NaNs.
seasonal_diff = monthly_sales.diff(periods=12)
seasonal_diff_twice = seasonal_diff.diff(periods=12)

fig, axes = plt.subplots(3, 2, sharex=True, figsize=(10,16))

# Original Series
axes[0, 0].plot(monthly_sales); axes[0, 0].set_title('Original Series')
plot_acf(monthly_sales, ax=axes[0, 1])

# Seasonal differencing applied once
axes[1, 0].plot(seasonal_diff); axes[1, 0].set_title('Seasonal Differencing (lag 12)')
plot_acf(seasonal_diff.dropna(), ax=axes[1, 1])

# Seasonal differencing applied twice
axes[2, 0].plot(seasonal_diff_twice); axes[2, 0].set_title('Seasonal Differencing (lag 12) applied twice')
plot_acf(seasonal_diff_twice.dropna(), ax=axes[2, 1])

plt.show()

In [None]:
from statsmodels.tsa.arima_model import ARIMA
import pmdarima as pm


# Stepwise search for a non-seasonal ARIMA order on the monthly sales series.
arima_search_options = dict(
    start_p=1, start_q=1,
    max_p=3, max_q=3,        # maximum p and q
    test='adf',              # use adftest to find optimal 'd'
    d=None,                  # let model determine 'd'
    m=1,                     # frequency of series
    seasonal=False,          # No Seasonality
    start_P=0,
    D=0,
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)
model = pm.auto_arima(monthly_sales, **arima_search_options)

In [None]:
# Print the text summary of the model selected by the non-seasonal search.
arima_summary = model.summary()
print(arima_summary)

In [None]:
# Stepwise search for a seasonal ARIMA model with a 12-step season and one
# seasonal difference (D=1); the non-seasonal d is chosen via the ADF test.
sarima_search_options = dict(
    start_p=1, start_q=1,
    max_p=3, max_q=3,
    test='adf',
    d=None,
    seasonal=True,
    m=12,
    start_P=0,
    D=1,
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)
smodel = pm.auto_arima(monthly_sales, **sarima_search_options)

smodel.summary()

In [None]:
import numpy as np
# Forecast horizon, in month blocks.
n_periods = 36
# Ask the seasonal model for point forecasts together with their confidence bounds.
forecast_output = smodel.predict(n_periods=n_periods, return_conf_int=True)
fitted, confint = forecast_output

In [None]:
# x-positions for the forecast. They must continue after the last observed
# month block; starting again at 0 would draw the forecast on top of the
# historical series instead of after it.
forecast_start = int(monthly_sales.index[-1]) + 1
index_of_fc = np.arange(forecast_start, forecast_start + n_periods, dtype='int64')
index_of_fc

In [None]:
# make series for plotting purpose
# np.asarray() strips any index the forecast may already carry (newer pmdarima
# releases may return a pandas Series); without it pd.Series(..., index=...)
# would re-align on labels and could fill the result with NaN.
fitted_series = pd.Series(np.asarray(fitted), index=index_of_fc)
confint_values = np.asarray(confint)
lower_series = pd.Series(confint_values[:, 0], index=index_of_fc)
upper_series = pd.Series(confint_values[:, 1], index=index_of_fc)

# Plot
plt.figure(figsize=(8,4))
plt.plot(monthly_sales, label='Observed')
plt.plot(fitted_series, color='darkgreen', label='Forecast')
plt.fill_between(lower_series.index,
                 lower_series,
                 upper_series,
                 color='k', alpha=.15, label='Confidence interval')

plt.title("SARIMA - Final Forecast")
plt.xlabel("Date Block number")
plt.ylabel("Total amount of sale")
plt.legend()
plt.show()