**Predict Future Sales: ARIMA**

In [None]:
!pip install pmdarima

In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

from pmdarima.arima import auto_arima

In [None]:
# Read the training sales log and the test (shop, item) pairs.
train_csv = '../input/competitive-data-science-predict-future-sales/sales_train.csv'
test_csv = '../input/competitive-data-science-predict-future-sales/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

# Shapes of both frames, shown as the cell output.
(df_train.shape, df_test.shape)

In [None]:
# Column dtypes of the training frame and the column labels of the test frame.
df_train.dtypes, df_test.columns

In [None]:
# Optional filter on non-positive prices (left disabled):
#df_train = df_train[df_train['item_price'] > 0]

# Parse the 'dd.mm.yyyy' strings into datetime64 values.
# pd.to_datetime is vectorised (no per-row Python strptime call) and, unlike
# strptime, accepts an already-parsed column, so this cell can be re-run safely.
df_train['date'] = pd.to_datetime(df_train['date'], format = '%d.%m.%Y')

We want to forecast monthly item sales, so we first sum the daily sales counts within each calendar month.

In [None]:
# Snap every sale date to the first day of its calendar month so daily rows can
# be grouped per month. Converting Period -> Timestamp directly avoids the
# Period -> str -> datetime round trip and yields the same datetime64 values.
df_train['month'] = df_train['date'].dt.to_period('M').dt.to_timestamp()
df_train.dtypes

In [None]:
# Total units sold per calendar month; the result is indexed by month.
dff_train = (
    df_train.groupby(['month'])['item_cnt_day']
    .sum()
    .to_frame(name = 'item_cnt_month')
)
# Keep the month available as an ordinary column in addition to the index.
dff_train['month'] = dff_train.index
print(dff_train.shape, dff_train.columns)

In [None]:
#dff_train.head(10)
# Number of monthly observations (row count once the 'month' column is dropped).
len(dff_train.drop(columns = ['month']))

In [None]:
# Line chart of the monthly totals over time, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize = (9, 6))
ax.grid()
ax.plot(dff_train['item_cnt_month'])
ax.set_title('Monthly Sales of items')
ax.set_xlabel('Time')
ax.set_ylabel('Sales count')
plt.show()

In [None]:
# Autocorrelation plot of the monthly totals, plus the coefficient returned by
# Series.autocorr() rounded to four decimals.
monthly_sales = dff_train['item_cnt_month']
pd.plotting.autocorrelation_plot(monthly_sales)
print('Autocorrelation =', round(monthly_sales.autocorr(), 4))

In [None]:
# ACF first, then PACF; a grid is switched on right after each call.
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(dff_train['item_cnt_month'])
    plt.grid()
plt.show()

In [None]:
# Additive decomposition of the monthly totals ('additive' is the default model).
additive_fig = seasonal_decompose(dff_train['item_cnt_month'], model = 'additive').plot()
additive_fig.set_size_inches(10, 8)

In [None]:
# Multiplicative decomposition of the same series.
multiplicative_fig = seasonal_decompose(dff_train['item_cnt_month'], model = 'multiplicative').plot()
multiplicative_fig.set_size_inches(10, 8)

In [None]:
#Stationarity test
def adf_test(dataseries):
    """Run ``adfuller`` on ``dataseries`` and print a labelled summary.

    The printed Series holds the first three values returned by ``adfuller``
    (labelled 'ADF Statistic', 'p-value' and 'Lags') followed by one
    'Critical Value (<level>)' entry per item of the critical-value mapping.

    Returns None; the summary is only printed.
    """
    result = adfuller(dataseries)
    labelled = {'ADF Statistic': result[0], 'p-value': result[1], 'Lags': result[2]}
    labelled.update(
        ("Critical Value (%s)" % level, threshold)
        for level, threshold in result[4].items()
    )
    print(pd.Series(labelled))

In [None]:
#check p-value
# Run the ADF test on the monthly totals; check the printed p-value.
adf_test(dff_train['item_cnt_month'])

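The series shows a clear downward trend and is non-stationary, so it must be differenced before an ARIMA model is fitted; `auto_arima` below selects the differencing order automatically.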

In [None]:
#Grid search for parameters
def order_parameters(training_data):
    """Search for seasonal ARIMA orders with ``auto_arima`` and return its result.

    The search is stepwise with a seasonal period of 12, starts from
    p = q = 0, passes ``test="adf"`` with ``d=None``, and uses AIC as the
    information criterion. The AIC of the returned model is printed, rounded
    to two decimals.
    """
    search_settings = dict(
        start_p = 0,
        start_q = 0,
        m = 12,
        seasonal = True,
        test = "adf",
        d = None,
        trace = True,
        alpha = 0.05,
        information_criterion = 'aic',
        suppress_warnings = True,
        stepwise = True,
    )
    best_fit = auto_arima(training_data, **search_settings)

    print("AIC = ", round(best_fit.aic(), 2))

    return best_fit

In [None]:
# Run the parameter search on the monthly totals and keep the returned model.
model = order_parameters(dff_train['item_cnt_month'])

In [None]:
# Print the summary of the selected model.
print(model.summary())

In [None]:
# Six-step-ahead forecast together with its interval bounds (95% CI default).
forecast_output = model.predict(n_periods = 6, return_conf_int = True)
prediction, confint = forecast_output
df_confint = pd.DataFrame(confint)
# Show the interval bounds first, then the point forecasts, both rounded to 2 decimals.
for values in (confint, prediction):
    print(values.round(2))

In [None]:
# The model forecasts the months AFTER the last observed one, so the labels must
# start one month past the end of the training index. Starting on the last
# training month itself would label every forecast one month too early.
first_forecast_month = pd.Period(dff_train.index[-1], freq = 'M') + 1
period_index = pd.period_range(start = first_forecast_month, periods = 6, freq = 'M')

# np.asarray drops any index attached to `prediction` (it may be a pandas Series
# depending on the pmdarima version), so values are placed positionally instead
# of being re-aligned against period_index, which would produce NaN.
df_predict = pd.DataFrame({'Predicted item_cnt_month': np.asarray(prediction).round(2)}, index = period_index)
#df_predict['month'] = df_predict.index
print(df_predict)

In [None]:
# November 2015 sales: preview the first two rows of the forecast table.

df_predict.head(2)

In [None]:
# Actual monthly totals, the forecast line, and a shaded band between the two
# confidence-interval columns.
fig, ax = plt.subplots(figsize = (10, 6))
ax.plot(dff_train['item_cnt_month'], label = 'Actuals')
ax.plot(df_predict.to_timestamp(), color = 'orange', label = 'Predicted')
forecast_dates = period_index.to_timestamp()
lower_bound, upper_bound = df_confint[0], df_confint[1]
ax.fill_between(forecast_dates, lower_bound, upper_bound, color = 'grey', alpha = 0.25, label = 'Confidence Interval')
ax.legend(loc = 'lower left')
ax.set_title('Time-series Forecasting (SARIMA)')
ax.grid()
plt.show()

In [None]:
# Total units sold for every (shop_id, item_id) pair present in the training data.
# Only item_cnt_day is aggregated, so it is selected on its own with a list;
# the earlier tuple-style selection ['date', 'item_cnt_day'] on a groupby is
# rejected by pandas 2.x.
train_df_tuple = (
    df_train.groupby(['shop_id', 'item_id'])[['item_cnt_day']]
    .sum()
    .reset_index()
)
print(train_df_tuple)

In [None]:
# First forecast value, taken by position: np.asarray avoids a label lookup
# when `prediction` carries a pandas index.
first_forecast = np.asarray(prediction)[0].round(2)

# Spread that total evenly over the (shop, item) pairs seen in training and give
# every test row the same value. The original expression multiplied by
# len(df_test) and then divided by it again; that factor cancels and is dropped.
df_test['item_cnt_month'] = first_forecast / len(train_df_tuple)
#df_test['item_cnt_month'] = prediction[0]*len(df_test)/len(train_df_tuple)

# Drop the shop/item keys; the remaining columns form the submission frame.
submission  = df_test.drop(columns = ['shop_id', 'item_id'])
print(submission)

In [None]:
# Write the submission frame to submission.csv without the row index.
submission.to_csv('submission.csv', index = False)