In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the input directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Objective**

Forecast the total number of products sold in every shop for November 2015, for the shop/item pairs in the given test set.

**File descriptions Provided**

sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.

test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.

sample_submission.csv - a sample submission file in the correct format.

items.csv - supplemental information about the items/products.

item_categories.csv - supplemental information about the item categories.

shops.csv - supplemental information about the shops.


**Data fields**

- ID - an Id that represents a (Shop, Item) tuple within the test set
- shop_id - unique identifier of a shop
- item_id - unique identifier of a product
- item_category_id - unique identifier of item category
- item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
- item_price - current price of an item
- date - date in format dd.mm.yyyy
- date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
- item_name - name of item
- shop_name - name of shop
- item_category_name - name of item category

**Load Libraries**

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

import warnings
warnings.filterwarnings('ignore')

# ARIMA
!pip install pmdarima > /dev/null
import pmdarima as pm
from pmdarima.arima import auto_arima

In [None]:
# Daily sales history used for training.
TRAIN_CSV_PATH = '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# (rows, columns) of the training frame
df_train.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
df_train.info()

The `date` column has dtype `object` rather than datetime, so after checking for null values we need to convert it to a proper datetime dtype.

In [None]:
# Number of missing values in each column
df_train.isnull().sum()

In [None]:
# Peek at the first two rows of the training frame
df_train.head(2)

# Datetime re-format

In [None]:
# Vectorised parse of the dd.mm.yyyy strings. Unlike a row-wise strptime this is
# also safe to re-run: values that are already datetimes pass through unchanged.
df_train['date'] = pd.to_datetime(df_train['date'], format='%d.%m.%Y')

In [None]:
# Check the column dtypes after the date conversion
df_train.dtypes

In [None]:
# Shop/item pairs to forecast. Uses the same absolute input directory as the
# training file so the load does not depend on the current working directory.
df_test = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv')

# Show only the first rows instead of rendering the whole frame
df_test.head()

**Get month and year of date from df_train**

In [None]:
# Collapse each sale date to its calendar month (monthly Period) for aggregation.
sale_months = df_train['date'].dt.to_period('M')
df_train['month_year'] = sale_months
df_train.head()

**Sum all the item_cnt_day by month_year and plot**

In [None]:
# Total items sold per calendar month, indexed by the monthly period.
# Columns are selected with a list: tuple selection on a GroupBy
# (['a', 'b'] written as 'a', 'b') raises on pandas >= 2.0, and the grouping
# key itself does not need to be selected.
grouped_df = (
    df_train
    .groupby('month_year')[['item_cnt_day']]
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
grouped_df.head(2)




**Predict using S-ARIMA**

In [None]:
# Order-search settings for the seasonal ARIMA fit on the monthly totals.
# m=12 is the seasonal period in observations (the series is monthly);
# d and D are left as None, which per the pmdarima docs lets auto_arima
# choose the differencing orders itself.
sarima_search_space = dict(
    seasonal=True,
    m=12,
    start_p=1, max_p=5,
    start_q=1, max_q=5,
    d=None,
    start_P=1, max_P=5,
    start_Q=1, max_Q=5,
    D=None,
)
model = auto_arima(y=grouped_df, **sarima_search_space)

In [None]:
# Print the summary report of the model selected by auto_arima
print(model.summary())

In [None]:
# Forecast 12 periods ahead, requesting the confidence bounds with the point forecast.
forecast_with_bounds = model.predict(n_periods=12, return_conf_int=True)
prediction, confint = forecast_with_bounds
# Wrapped in a frame so the two bound columns (0 and 1) can be addressed by label later.
confint_df = pd.DataFrame(data=confint)
prediction

In [None]:
# The forecast covers the months *after* the last observed one, so its index must
# start one period past the end of the history (Period + 1 advances by one month).
# Starting at index[-1] itself would draw the forecast one month too early.
period_index = pd.period_range(
    start=grouped_df.index[-1] + 1,
    periods=len(prediction),  # keep the index length tied to the forecast horizon
    freq='M'
)
# np.asarray drops any index the forecast may carry, so values are assigned positionally.
predicted_df = pd.DataFrame({'value': np.asarray(prediction)}, index=period_index)
predicted_df

In [None]:
# History, point forecast and confidence band on one labelled axis.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(grouped_df.to_timestamp(), label='Actual data')
ax.plot(predicted_df.to_timestamp(), color='orange', label='Predicted data')
ax.fill_between(
    period_index.to_timestamp(),
    confint_df[0],
    confint_df[1],
    color='grey',
    alpha=.2,
    label='Confidence Intervals Area',
)
# Title and axis labels so the figure can be read on its own
ax.set_title('Monthly items sold: history and S-ARIMA forecast')
ax.set_xlabel('Month')
ax.set_ylabel('Items sold per month')
ax.legend()
plt.show()

In [None]:
# Compare the last observed monthly total with the first forecast value.
last_month_sales = grouped_df.values[-1][0]
next_month_sales = prediction[0]
print(f'sales last month: {last_month_sales}')
print(f'sales next month: {next_month_sales}')

In [None]:
# One row per (shop_id, item_id) pair seen in training, with its total units sold.
# A single column is selected by name: tuple selection on a GroupBy raises on
# pandas >= 2.0, and 'date' was never aggregated anyway.
group_pair_train = (
    df_train
    .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
    .sum()
)
group_pair_train.head(2)

In [None]:
# Spread next month's total forecast evenly over the (shop, item) pairs seen in
# training and give every test row that same value. The original expression
# multiplied by len(df_test) and divided by it again, which cancels out.
next_month_total = np.asarray(prediction)[0]  # first forecast step, read positionally
df_test['item_cnt_month'] = next_month_total / len(group_pair_train)
submission = df_test.drop(columns=['shop_id', 'item_id'])
submission.to_csv('submission.csv', index=False)

In [None]:
# Print the first 10 lines of the written submission file as a quick sanity check
!sed -n 1,10p submission.csv