In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition's input folder and print the full path of every file found.
input_root = '../input/competitive-data-science-predict-future-sales/'
for folder, _subdirs, names in os.walk(input_root):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
import itertools

# Viz
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots

import plotly
import plotly.express as px
import plotly.graph_objects as go

#Modeling
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from fbprophet import Prophet
from math import sqrt


#Evaluation
from sklearn.metrics import mean_squared_error
from fbprophet.diagnostics import performance_metrics
from fbprophet.diagnostics import cross_validation
from fbprophet.plot import plot_cross_validation_metric

import warnings

# settings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the imported libraries.
warnings.filterwarnings('ignore')
# Folder holding the competition CSV files; it ends with '/' because file names
# are appended to it by plain string concatenation.
path = '../input/competitive-data-science-predict-future-sales/'

In [None]:
# First pass: load every table with pandas' inferred dtypes so they can be inspected.
train = pd.read_csv(f'{path}sales_train.csv')
test = pd.read_csv(f'{path}test.csv')
item_categories = pd.read_csv(f'{path}item_categories.csv')
items = pd.read_csv(f'{path}items.csv')
shops = pd.read_csv(f'{path}shops.csv')

In [None]:
def eda(data):
    """Print a quick structural summary of a DataFrame.

    Two sections are written to stdout: the dtype of every column and the
    number of missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    print("Data Types")
    print(data.dtypes)
    print("Missing value")
    # `isnull` is an alias of `isna`, so one table is enough; printing both
    # only repeated the same counts.
    print(data.isna().sum())

In [None]:
# Column dtypes and missing-value counts for the sales history table.
eda(train)

In [None]:
# Column dtypes and missing-value counts for the item category table.
eda(item_categories)

In [None]:
# Column dtypes and missing-value counts for the item table.
eda(items)

In [None]:
# Column dtypes and missing-value counts for the shop table.
eda(shops)

In [None]:
# Column dtypes and missing-value counts for the test table.
eda(test)

In [None]:
# Second pass: re-read every table with explicit, compact dtypes.
#
# The sales file stores `date` day-first (dd.mm.yyyy per the competition's data
# description — NOTE(review): confirm against the CSV). Without `dayfirst=True`
# an ambiguous value such as 02.01.2013 is read as 1 February instead of
# 2 January, which moves sales into the wrong month for every aggregation that
# is later done on the date index.
train = pd.read_csv(path + 'sales_train.csv', parse_dates=['date'], dayfirst=True,
                    dtype={'date': 'str', 'date_block_num': 'int32', 'shop_id': 'int32', 'item_id': 'int32', 'item_price': 'float32', 'item_cnt_day': 'int32'})
test = pd.read_csv(path + 'test.csv', dtype={'ID': 'int32', 'shop_id': 'int32', 'item_id': 'int32'})
item_categories = pd.read_csv(path + 'item_categories.csv', dtype={'item_category_name': 'str', 'item_category_id': 'int32'})
items = pd.read_csv(path + 'items.csv', dtype={'item_name': 'str', 'item_id': 'int32', 'item_category_id': 'int32'})
shops = pd.read_csv(path + 'shops.csv', dtype={'shop_name': 'str', 'shop_id': 'int32'})

In [None]:
# Attach item, category and shop attributes to the sales history.
# Inner joins: a row survives only if its key exists in every lookup table.
train = (
    train
    .merge(items, on='item_id', how='inner')
    .merge(item_categories, on='item_category_id', how='inner')
    .merge(shops, on='shop_id', how='inner')
)

# Same enrichment for the test set.
test = (
    test
    .merge(items, on='item_id', how='inner')
    .merge(item_categories, on='item_category_id', how='inner')
    .merge(shops, on='shop_id', how='inner')
)

In [None]:
# Display the training frame after joining the item, category and shop tables.
train

In [None]:
# Revenue of each row: unit price multiplied by the quantity sold that day.
train['total_sales'] = train['item_price'].mul(train['item_cnt_day'])
train.head()

In [None]:
# Box plot of the item_price distribution.
sns.boxplot(x=train["item_price"])

In [None]:
# List the rows whose item_price exceeds 100000.
train[train["item_price"] > 100000]

In [None]:
# Per-column count of non-null values among rows with a zero or negative price.
train[train['item_price'] <= 0].count()

In [None]:
# Keep only rows with a strictly positive price.
train = train[train['item_price'] > 0]

In [None]:
# Box plot of the item_cnt_day distribution.
sns.boxplot(x=train["item_cnt_day"])

In [None]:
# Per-column count of non-null values among rows with a zero or negative quantity.
train[train['item_cnt_day'] <= 0].count()

In [None]:
# Keep only rows with a strictly positive quantity.
train = train[train['item_cnt_day'] > 0]

In [None]:
# Display the training frame after the price and quantity filters.
train

In [None]:
# Monthly aggregate per (month block, shop, category, item).
# `date_block_num` is the month key; the rows are ordered by date before grouping.
monthly_keys = ['date_block_num', 'shop_id', 'item_category_id', 'item_id']
train_monthly = (
    train[['date', 'date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_price', 'item_cnt_day']]
    .sort_values('date')
    .groupby(monthly_keys, as_index=False)
    .agg({'item_price': ['sum', 'mean'], 'item_cnt_day': ['sum', 'mean', 'count']})
)
# Flatten the two-level column header produced by the multi-function aggregation.
# Note that `item_price` here is the SUM of prices and `transactions` the row count.
train_monthly.columns = monthly_keys + ['item_price', 'mean_item_price', 'item_cnt', 'mean_item_cnt', 'transactions']

In [None]:
# Preview the first rows of the monthly aggregate.
train_monthly.head()

In [None]:
# Move `date` from a column into the index.
# NOTE: this cell is not re-runnable as-is; once `date` is the index there is no
# `date` column left for a second `set_index('date')` call.
train =  train.set_index('date')
train.head()

In [None]:
# Aggregate the daily rows to month-end totals.
# Only numeric columns are summed: the merged frame also carries text columns
# (item, category and shop names) for which a sum is meaningless and which,
# depending on the pandas version, are either silently dropped, concatenated
# at great cost, or raise.
train_arima = train.select_dtypes(include='number').resample("M").sum()
# Keep the revenue column as a one-column DataFrame; this is the modelled series.
ts_sales = train_arima[["total_sales"]]
ts_sales.head()

In [None]:
# Discard any month that falls after the end of October 2015.
last_month_end = pd.Timestamp('2015-10-31')
ts_sales = ts_sales.loc[ts_sales.index <= last_month_end]
ts_sales.head()

In [None]:
# Line plot of the monthly `total_sales` series.
# The series is revenue (price x quantity), not an item count, so the title and
# y-axis label say "sales" rather than "Item".
plt.figure(figsize=(16,9))
plt.title('Total monthly sales of the company')
plt.xlabel('Month')
plt.ylabel('Total sales')
plt.plot(ts_sales['total_sales'])

In [None]:
# Stationarity check
def test_stationarity(timeseries):
    """Run the augmented Dickey-Fuller test on `timeseries` and print the result.

    The lag length is selected automatically by AIC. The printed Series holds
    the test statistic, the p-value, the number of lags used, the number of
    observations used, and one critical value per significance level.

    Parameters
    ----------
    timeseries : array-like
        One-dimensional series passed straight to `adfuller`.

    Returns
    -------
    None
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')

    # The first four entries of the result tuple are scalars; entry 4 is a dict
    # mapping each significance level to its critical value.
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    values = list(adf_result[:4])
    for level, threshold in adf_result[4].items():
        labels.append('Critical Value (%s)' % level)
        values.append(threshold)

    dfoutput = pd.Series(values, index=labels)
    print(dfoutput)

In [None]:
# Dickey-Fuller test on the monthly total-sales series.
test_stationarity(ts_sales['total_sales'])

In [None]:
# Seasonal decomposition of the monthly series with a 12-step season; the name
# `item_cnt_dec` ends up bound to the value returned by `.plot()`, not to the
# decomposition result itself.
# NOTE(review): newer statsmodels releases call this argument `period` instead of
# `freq` — confirm against the installed version.
item_cnt_dec = sm.tsa.seasonal_decompose(ts_sales['total_sales'],freq=12).plot()

In [None]:
# Autocorrelation plot of the monthly series, up to lag 12.
sales_acf = sm.graphics.tsa.plot_acf(ts_sales['total_sales'], lags=12)

In [None]:
# Partial autocorrelation plot of the monthly series, up to lag 12.
# NOTE: the result is stored under the name `sales_acf`, so it replaces whatever
# that name held before.
sales_acf = sm.graphics.tsa.plot_pacf(ts_sales['total_sales'], lags=12)

**[SARIMA/SARIMAX(Seasonal Autoregressive Integrated Moving Average/Exogenous Model)]**

SARIMA는 말 그대로 계절성을 포함하는 모형입니다. 계절성이란 어떤 특정한 패턴이 주기적으로 나타나는 것을 뜻합니다. 예를 들어 우리가 가지고 있는 시계열 데이터가 월별(Monthly) 데이터라면, 1년 주기의 패턴이 발생할 가능성이 높습니다. 이렇게 주기적으로 나타나는 특성은 AR 모형이나 MA 모형만으로는 나타내기 어렵습니다.

In [None]:
# Candidate orders: every p, d, q in {0, 1}.
p = d = q = range(2)
pdq = list(itertools.product(p, d, q))
# Seasonal candidates reuse the same triples with a fixed seasonal period of 12.
seasonal_pdq = [(*order, 12) for order in pdq]
seasonal_pdq

In [None]:
# Grid search over pdq range(0, 2): fit one SARIMAX model per combination of
# non-seasonal and seasonal order, print its AIC, and remember the lowest one.
best_aic = float('inf')
best_order = None
best_seasonal_order = None

for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(ts_sales['total_sales'],
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=True,
                                            enforce_invertibility=True)

            results = mod.fit()
        except Exception as err:
            # A combination that cannot be fitted is still skipped, but it is
            # reported instead of vanishing silently. Catching `Exception`
            # rather than using a bare `except` lets Ctrl-C interrupt the search.
            print('ARIMA{}x{}12 - skipped: {}'.format(param, param_seasonal, err))
            continue

        print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))

        # A NaN AIC never compares as smaller, so it cannot become the best.
        if results.aic < best_aic:
            best_aic = results.aic
            best_order = param
            best_seasonal_order = param_seasonal

print('Lowest AIC: {} for order={} seasonal_order={}'.format(best_aic, best_order, best_seasonal_order))

In [None]:
# Display the monthly series that is passed to SARIMAX.
ts_sales['total_sales']

In [None]:
# Fit the specification picked from the pdq range(0, 2) search:
# non-seasonal order (0, 1, 0) with seasonal order (0, 1, 1) at period 12.
model = SARIMAX(
    endog=ts_sales['total_sales'],
    order=(0, 1, 0),
    seasonal_order=(0, 1, 1, 12),
    enforce_stationarity=True,
    enforce_invertibility=True,
)
results = model.fit()
# Table 1 of the summary is the coefficient table.
print(results.summary().tables[1])

In [None]:
# Show the fitted results object.
results

In [None]:
# pdq range(0, 3)
# for param in pdq:
#     for param_seasonal in seasonal_pdq:
#         try:
#             mod = sm.tsa.statespace.SARIMAX(ts_sales['total_sales'],
#                                             order=param,
#                                             seasonal_order=param_seasonal,
#                                             enforce_stationarity=True,
#                                             enforce_invertibility=True)

#             results = mod.fit()

#             print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
#         except:
#             continue
# #result of pdq range(0, 3)
# model = sm.tsa.statespace.SARIMAX(ts_sales['total_sales'],
#                                 order=(0,1,0),
#                                 seasonal_order=(0, 1, 1, 12),
#                                 enforce_stationarity=True,
#                                 enforce_invertibility=True)
# results = model.fit()
# print(results.summary().tables[1])

In [None]:
# Diagnostic plots for the fitted model.
results.plot_diagnostics(figsize=(16, 9))
plt.show()

In [None]:
# Predict from December 2014 onwards; dynamic=False means each prediction is
# conditioned on the observed values that precede it.
pred = results.get_prediction(start=pd.to_datetime('2014-12-31'), dynamic=False)
# Confidence bounds for each predicted point.
pred_ci = pred.conf_int()

In [None]:
# Predicted values and the observed values over the same window.
train_sarima_forecasted = pred.predicted_mean
train_sarima_truth = ts_sales.loc['2014-12-31':]

# Observed series with the predictions and their confidence band drawn on top.
ax = ts_sales.loc['2013-01-31':].plot(label = "observed", figsize=(16, 9))
train_sarima_forecasted.plot(ax=ax, label='Forecast')
ax.fill_between(
    pred_ci.index,
    pred_ci.iloc[:, 0],
    pred_ci.iloc[:, 1],
    color='k',
    alpha=.2,
)
ax.set(xlabel='Month', ylabel='Sales')
plt.legend()
plt.show()

# Compute the RMSE between observed and predicted values.
rmse_sarima = sqrt(mean_squared_error(train_sarima_truth, train_sarima_forecasted))
print("Root Mean Squared Error: ", rmse_sarima)

In [None]:
# Forecast three steps past the end of the series, with confidence bounds.
pred_uc = results.get_forecast(steps=3)
pred_ci = pred_uc.conf_int()

# Observed history followed by the forecast and its confidence band.
ax = ts_sales.loc['2013-01-31':].plot(label='observed', figsize=(16, 9))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
ax.fill_between(
    pred_ci.index,
    pred_ci.iloc[:, 0],
    pred_ci.iloc[:, 1],
    color='k',
    alpha=.2,
)
ax.set(xlabel='Date', ylabel='Amount of Sales')
plt.legend()
plt.show()

In [None]:
#Reset index first to make date into column
ts_sales = ts_sales.reset_index()
#Pertama kita harus merubah dates menjadi ds dan total_sales menjadi y
ts_sales.rename(columns={'date':'ds','total_sales':'y'},inplace=True)

In [None]:
# Make predictions with the Prophet
# Model with default settings, fitted on the ds/y frame.
m_p = Prophet()
m_p.fit(ts_sales)
# Build the prediction frame with 3 additional monthly ('M') periods.
future = m_p.make_future_dataframe(periods = 3, freq = 'M')
prediction = m_p.predict(future)
# Show the last three rows of the prediction frame.
prediction.tail(3)

In [None]:
# Plot the Prophet prediction frame.
m_p.plot(prediction)
plt.show()

In [None]:
# Plot the components of the Prophet prediction.
m_p.plot_components(prediction)
plt.show()

# Cross validation

In [None]:
# Prophet cross-validation: initial training window of 720 days, a new cutoff
# every 120 days, and a 240-day forecast horizon per cutoff.
cv = cross_validation(m_p,initial='720 days', period='120 days', horizon = '240 days')

In [None]:
# Preview the cross-validation output.
cv.head()

In [None]:
# Performance metrics computed from the cross-validation output.
df_pm= performance_metrics(cv)
df_pm.head()

In [None]:
# Plot the cross-validation RMSE.
plot_cross_validation_metric(cv, metric='rmse')
plt.show()

In [None]:
# Display `pred_ci`; when the notebook runs top to bottom this is the confidence
# interval frame of the 3-step SARIMAX forecast, the last value assigned to it.
pred_ci