* ID - an Id that represents a (Shop, Item) tuple within the test set
* shop_id - unique identifier of a shop
* item_id - unique identifier of a product
* item_category_id - unique identifier of item category
* item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
* item_price - current price of an item
* date - date in format dd/mm/yyyy
* date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
* item_name - name of item
* shop_name - name of shop
* item_category_name - name of item category

In [None]:
import os
# Show which files ship with the competition dataset.
os.listdir(path='../input/competitive-data-science-predict-future-sales')

In [None]:
import numpy as np 
import pandas as pd
import random as rd 
import math
import datetime 
import matplotlib.pyplot as plt 
import seaborn as sns 

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

import warnings
warnings.filterwarnings("ignore")

In [None]:
# All competition CSVs live in the same Kaggle input directory.
DATA_DIR = "../input/competitive-data-science-predict-future-sales"

# Lookup tables and the daily sales history.
shops = pd.read_csv(f"{DATA_DIR}/shops.csv")
sales = pd.read_csv(f"{DATA_DIR}/sales_train.csv")
item_cat = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
items = pd.read_csv(f"{DATA_DIR}/items.csv")

# Submission template.
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# (shop, item) pairs to predict.
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Items

In [None]:
# Preview the first five rows of the item catalogue.
items.head(n=5)

In [None]:
# Count how many catalogue items fall in each category and keep the 20 most
# frequent ones (value_counts returns counts in descending order).
category_counts = items["item_category_id"].value_counts().head(20)
topItemCats = category_counts.index
topItemCatsValues = category_counts.values

plt.figure(figsize=(18, 6))
# x/y must be passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. `order` keeps the bars ranked by count, matching the other
# "top N" plots, instead of seaborn's default sort by category id.
ax = sns.barplot(x=topItemCats, y=topItemCatsValues, order=topItemCats, alpha=0.8)
plt.title("Top 20 item categories sold")
plt.ylabel('Items values', fontsize=12)
plt.xlabel('Category', fontsize=12)
plt.show()

# Sales

In [None]:
# Print the column names, dtypes and non-null counts of the sales table.
sales.info()

In [None]:
# The dataset stores dates day-first (dd/mm/yyyy). Without dayfirst=True
# pandas reads ambiguous values such as 02/01/2013 as February 1st, which
# would silently corrupt every date-derived column.
sales["date"] = pd.to_datetime(sales["date"], dayfirst=True)

sales.head()

In [None]:
# Number of sales records at each price; keep the 10 most frequent prices.
price_counts = sales["item_price"].value_counts().head(10)
price_idx = price_counts.index
unitsSold = price_counts.values

plt.figure(figsize=(18, 6))
# x/y must be passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. `order` keeps the bars ranked by frequency.
ax = sns.barplot(x=price_idx, y=unitsSold, order=price_idx, alpha=0.8)
plt.title("Top 10 most sold amount")
plt.ylabel('Units Sold', fontsize=12)
plt.xlabel('Price', fontsize=12)
plt.show()

In [None]:
# Number of sales records per shop; keep the 10 busiest shops.
shop_counts = sales["shop_id"].value_counts().head(10)
shop_id_idx = shop_counts.index
unitsSold = shop_counts.values

plt.figure(figsize=(18, 6))
# x/y must be passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. `order` keeps the bars ranked by frequency.
ax = sns.barplot(x=shop_id_idx, y=unitsSold, order=shop_id_idx, alpha=0.8)
plt.title("Top 10 shops with most sales")
plt.ylabel('Units Sold', fontsize=12)
plt.xlabel('Shop ID', fontsize=12)
plt.show()

In [None]:
# Break the parsed timestamp into separate calendar columns.
for part in ("year", "month", "day"):
    sales[part] = getattr(sales["date"].dt, part)

In [None]:
# Horizontal bar chart of the number of sales records in each year.
plt.figure(figsize=(8, 4))
sns.countplot(data=sales, y="year")
plt.title("Total sales yearly")

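Sales are decreasing each year. Note that 2015 only covers January through October, so its total is not directly comparable with the two full years.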

In [None]:
# We want to predict total products sold, so we are grouping no. of products sold on each "date_block_num"
ts = sales.groupby(["date_block_num"])["item_cnt_day"].sum()
ts

In [None]:
# Line chart of the monthly company-wide sales totals.
plt.figure(figsize=(16, 6))
plt.xlabel('Time')
plt.ylabel('Sales')
plt.title('Total sales of the company')
plt.plot(ts)

In [None]:
# 12-month rolling mean and standard deviation of the monthly totals.
rolling_12 = ts.rolling(12)

plt.figure(figsize=(16, 6))
plt.plot(rolling_12.mean().values, label='Rolling mean')
plt.plot(rolling_12.std().values, label='Rolling std')
plt.legend()

There is a trend in our data

Residuals =>

When you remove the trend, seasonality and other observable patterns from the data, what is left is white noise; this remainder is termed the residuals.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the monthly totals into trend, seasonal and residual components,
# assuming a 12-month cycle and a multiplicative model.
# `period` replaces the `freq` keyword, which statsmodels deprecated in 0.11
# and later removed.
res = seasonal_decompose(ts.values, period=12, model="multiplicative")
fig = res.plot()

In [None]:
# Same decomposition with an additive model, for comparison.
# `period` replaces the `freq` keyword, which statsmodels deprecated in 0.11
# and later removed.
res = sm.tsa.seasonal_decompose(ts.values, period=12, model="additive")
fig = res.plot()

ADF test for stationarity

* Null Hypothesis: data is non-stationary
* large p-values are indicative of non-stationarity, and small p-values suggest stationarity. Using the usual 5% threshold, differencing is required if the p-value is greater than 0.05.

In [None]:
def test_stationarity(data):
    """Run the augmented Dickey-Fuller test on ``data`` and print the results.

    The printed Series holds the first four values returned by ``adfuller``
    (labelled test statistic, p-value, lags used and observations used)
    followed by one entry per critical value.
    """
    adf_result = adfuller(data, autolag="AIC")
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of observation Used']
    summary = pd.Series(adf_result[0:4], index=labels)
    # adf_result[4] is a mapping from significance level to critical value.
    for level, critical_value in adf_result[4].items():
        summary['Critical Value (%s)' % level] = critical_value
    print(summary)


test_stationarity(ts)

To make the series stationary (and so reduce the p-value), we take the log of the series and then its first difference.

In [None]:
# log1p(x) == log(1 + x), applied to the whole series at once instead of
# element by element through a Python lambda.
log_transform = pd.DataFrame(np.log1p(ts))

# First difference of the log series (x_t - x_{t-1}) removes the trend.
diff = log_transform.diff()
# The first month has no predecessor; fill it with 0 so `diff` keeps the
# same length as `ts`.
diff = diff.fillna(0)
test_stationarity(diff)

The p-value is reduced

In [None]:
# Stack the raw monthly series above its log-differenced version.
plt.figure(figsize=(16, 16))

panels = (
    (311, 'Original', ts),
    (312, 'After De-trend', diff),
)
for position, heading, series in panels:
    plt.subplot(position)
    plt.title(heading)
    plt.xlabel('Time')
    plt.ylabel('Sales')
    plt.plot(series)
plt.show()

Now, by calculating the ACF and PACF, we can get the values of p and q, which will be used in the ARIMA model.

In [None]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf

# Partial autocorrelations can only be estimated for lags below half the
# sample size: with more lags the OLS regression has more coefficients than
# rows, and recent statsmodels releases raise an error. Cap the requested
# 20 lags accordingly.
n_lags = max(1, min(20, len(diff) // 2 - 1))
lag_acf = acf(diff, nlags=n_lags)
lag_pacf = pacf(diff, nlags=n_lags, method='ols')

# Approximate 95% confidence band for a white-noise series.
conf_bound = 1.96 / np.sqrt(len(diff))

plt.figure(figsize=(16, 7))
panels = (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
)
for position, values, heading in panels:
    plt.subplot(position)
    plt.plot(values, marker="o")
    # Zero line plus the lower and upper confidence bounds.
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(heading)
plt.tight_layout()

q - The lag value where the ACF chart crosses the upper confidence interval for the first time, 
    q=1 has +ve correlation
    
p - The lag value where the PACF chart crosses the upper confidence interval for the first time, 
     p=1 has +ve correlation and p=11 has high -ve correlation

In [None]:
# NOTE(review): statsmodels.tsa.arima_model.ARIMA is the legacy implementation;
# it was deprecated in statsmodels 0.12 and no longer works from 0.13 on. The
# replacement is statsmodels.tsa.arima.model.ARIMA, whose fit() does not take
# `disp` and whose results object behaves differently, so migrate together
# with any cells that consume `results_AR`.
from statsmodels.tsa.arima_model import ARIMA

# Fit an ARIMA model with order (p, d, q) = (1, 1, 1) on the raw monthly
# totals `ts` (not the log-transformed series examined above).
model = ARIMA(ts, order=(1, 1, 1))
# disp=-1 presumably silences the optimizer's convergence output; confirm
# against the installed statsmodels version.
results_AR = model.fit(disp=-1)