In [None]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error as mse
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
#import data
train_data = pd.read_csv('../input/train.csv',parse_dates=['date'],index_col='date')
test_data = pd.read_csv('../input/test.csv',parse_dates=['date'],index_col='date')

**Data Exploration**

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

In [None]:
# Summary statistics for the numeric columns of the training frame.
train_data.describe()

We have 10 stores and 50 items. 

**For store 1 and item 1**

In [None]:
# Sales series (indexed by date) for store 1, item 1.
sales = train_data.loc[
    (train_data['store'] == 1) & (train_data['item'] == 1),
    'sales',
]

In [None]:
# Plain-text dump of the selected series.
# NOTE(review): this prints the whole series; `sales.head()` would be a lighter preview.
print(sales)

In [None]:
# Raw sales series for store 1, item 1.
fig, ax = plt.subplots()
ax.plot(sales)
ax.set_xlabel('date')
ax.set_ylabel('sales')
ax.set_title('STORE 1 ITEM 1')
plt.show()

In [None]:
# Mean sales per calendar year.
yearly_avg = sales.groupby(pd.Grouper(freq='Y')).mean()

fig, ax = plt.subplots()
ax.plot(yearly_avg)
ax.set_xlabel('date')
ax.set_ylabel('sales')
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Yearly average')
plt.show()

We can see a clear trend here.

In [None]:
# Mean sales per calendar month.
monthly_avg = sales.groupby(pd.Grouper(freq='M')).mean()

fig, ax = plt.subplots()
ax.plot(monthly_avg)
ax.set_xlabel('date')
ax.set_ylabel('sales')
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Monthly average')
plt.show()

Sales peak around the middle of the year, i.e. the data is seasonal.

In [None]:
# Mean sales per week.
weekly_avg = sales.groupby(pd.Grouper(freq='W')).mean()

fig, ax = plt.subplots()
ax.plot(weekly_avg)
ax.set_xlabel('date')
ax.set_ylabel('sales')
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Weekly average')
plt.show()

In [None]:
# Multiplicative decomposition of the monthly-mean series; the plot shows the
# observed, trend, seasonal and residual panels.
monthly_mean_sales = sales.groupby(pd.Grouper(freq='M')).mean()
decomposition = seasonal_decompose(monthly_mean_sales, model='multiplicative')
decomposition.plot()
plt.show()

Here we can see the trend, seasonal, and residual components of the series.

**ARIMA Model**

In [None]:
# Distinct store and item identifiers present in the training data.
stores, items = (train_data[column].unique() for column in ('store', 'item'))

In [None]:
# Choose the store/item pair to model and pull out its sales as float64.
store, item = 1, 1
pair_mask = (train_data['store'] == store) & (train_data['item'] == item)
sales = train_data.loc[pair_mask, 'sales'].astype('float64')

In [None]:
# Sales series of the selected store/item pair.
fig, ax = plt.subplots()
ax.plot(sales)
ax.set_xlabel('date')
ax.set_ylabel('sales')
plt.show()

In [None]:
#train-validation split
train_size = int(len(sales)*0.7)
train, val = sales[:train_size], sales[train_size:]

In [None]:
# Autocorrelation plot of the full store/item series (training and validation
# periods together).
autocorrelation_plot(sales)
plt.show()

In [None]:
#model 
#model
# ARIMA with order (p, d, q) = (5, 1, 0): five autoregressive lags, first-order
# differencing and no moving-average terms, fitted on the training split only.
model = ARIMA(train,order=(5,1,0))
# NOTE(review): disp=0 presumably silences the optimizer's convergence output;
# confirm against the installed statsmodels version.
model_fit = model.fit(disp=0)

In [None]:
# Text summary of the fitted model.
print(model_fit.summary())

In [None]:
#plot residual error
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.title('Residual Error')
plt.show()

In [None]:
# Kernel density estimate of the residual distribution.
density_ax = residuals.plot.kde()
density_ax.set_xlabel('Residual')
plt.show()

In [None]:
#predict on validation set
history = [x for x in train]
prediction_val = list()
for t in range(len(val)):
    model = ARIMA(history,order=(5,1,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    prediction_val.append(int(output[0]))
    history.append(val[t])
    

In [None]:
# Mean squared error of the one-step forecasts, passing the actual values
# first and the predictions second.
error = mse(val, prediction_val)
print('Mean Squared Error = {}'.format(error))

In [None]:
#plot
plt.plot(prediction_val,color='red')
plt.xlabel('date')
plt.ylabel('sales')
plt.title('Prediction on Validation set')
plt.show()

In [None]:
# Actual sales over the validation period.
fig, ax = plt.subplots()
ax.plot(val)
ax.set_xlabel('date')
ax.set_ylabel('sales')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
#predict on test set
history = [x for x in sales]
prediction_test = list()
for t in range(90):
    model = ARIMA(history,order=(5,1,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    prediction_test.append(int(output[0]))
    history.append(val[t])
    

In [None]:
#creating a dataframe
dates = pd.date_range('1/1/2018',periods=90,freq='D')
prediction_test_df = pd.DataFrame(prediction_test,index=dates)
prediction_test_df.columns = ['sales']

In [None]:
#plot
plt.plot(prediction_test_df,color='green')
plt.xlabel('date')
plt.ylabel('sales')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Submission ids of the test rows that belong to the current store/item pair.
test_pair_mask = (test_data['store'] == store) & (test_data['item'] == item)
ids = list(test_data.loc[test_pair_mask, 'id'])

In [None]:
# Pair each test-row id with its forecast; `ids` and `prediction_test` must
# have the same length or the DataFrame constructor raises.
submission = pd.DataFrame({'id':ids,'sales':prediction_test})

In [None]:
# Plain-text dump of the single-pair submission frame.
# NOTE(review): `submission.head()` would give a lighter preview.
print(submission)

**Similarly, for the other store-item combinations:**

In [None]:
def _forecast_sales(series, steps=90, order=(5, 1, 0)):
    """Fit an ARIMA model on `series` and return `steps` integer forecasts.

    Parameters
    ----------
    series : iterable of float
        Historical sales in chronological order.
    steps : int
        Number of periods to forecast beyond the end of `series`.
    order : tuple
        ARIMA (p, d, q) order.

    Returns
    -------
    list of int
        Point forecasts, each truncated to whole units by int().
    """
    fitted = ARIMA(list(series), order=order).fit(disp=0)
    # forecast() returns (point forecasts, standard errors, confidence
    # intervals); only the point forecasts are needed.
    point_forecasts = fitted.forecast(steps=steps)[0]
    return [int(point) for point in point_forecasts]


# Forecast every store/item pair from its own history only. No sales are
# observed in the test period, so each series gets one multi-step forecast.
# Validation values from another pair must never be mixed into the history.
submission_parts = []
for store in stores:
    for item in items:
        train_pair_mask = (train_data['store'] == store) & (train_data['item'] == item)
        sales = train_data.loc[train_pair_mask, 'sales'].astype('float64')

        #predict on test set
        prediction_test = _forecast_sales(sales, steps=90)

        test_pair_mask = (test_data['store'] == store) & (test_data['item'] == item)
        ids = list(test_data.loc[test_pair_mask, 'id'])
        submission = pd.DataFrame({'id': ids, 'sales': prediction_test})
        submission_parts.append(submission)

# Concatenate once at the end: DataFrame.append is removed in pandas 2.0 and
# growing a frame inside a loop copies it on every iteration.
complete_submission = pd.concat(submission_parts, ignore_index=True)

        

In [None]:
# Write the combined forecasts to submission.csv without the row index.
complete_submission.to_csv('submission.csv',index=False)