In [None]:
# Import Libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20,10)  # Set default figure size
pd.options.display.float_format = '{:,.2f}'.format  # Set default float format (2 digits after decimal place)

In [None]:
# Read the training sales CSV into a DataFrame and display it.
data = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/sales_train.csv'
)
data

In [None]:
# Show the dtype pandas assigned to each column of the sales DataFrame.
data.dtypes

In [None]:
# Read the items CSV into a DataFrame and preview the first rows.
items = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/items.csv'
)
items.head()

In [None]:
# Separating the year from date column.
# The strings are day-first (dd.mm.yyyy), so dayfirst=True is passed here just
# as it is for the month column; without it, rows such as 02.01.2013 are read
# month-first while rows such as 15.01.2013 fall back to day-first, and the
# year only comes out right by coincidence.
data['year'] = pd.DatetimeIndex(data['date'], dayfirst=True).year
data.head(100)

In [None]:
# Derive the month number from the day-first 'date' column.
data['month'] = pd.to_datetime(data['date'], dayfirst=True).dt.month
data.head(10)

In [None]:
# Convert the 'date' column from dd.mm.yyyy strings to datetime64.
# pd.to_datetime with an explicit format parses the whole column in one
# vectorized call (no per-row Python strptime) and is safe to re-run: values
# that are already datetimes pass through instead of raising a TypeError.
import datetime  # not used by this cell any more; kept in case other cells rely on it
data['date'] = pd.to_datetime(data['date'], format="%d.%m.%Y")
data.dtypes

Since the dates in the `date` column are stored day-first (dd.mm.yyyy), I am using the `dayfirst=True` parameter. This parameter tells pandas to interpret the first number of each date as the day. By default it is `False`; if we did not set it to `True`, ambiguous dates would be parsed month-first (mm.dd.yyyy).

In [None]:
# Total item count per date_block_num.
ts = data.groupby('date_block_num')['item_cnt_day'].sum()

In [None]:
# Line chart of the summed item counts per date block.
plt.figure(figsize=(16, 8))
plt.plot(ts)
plt.gca().set(
    title="Total sales of the company",
    xlabel="Consecutive month from Jan 2013 to Oct 2015",
    ylabel="Sales count",
)
plt.show()

In [None]:
# Revenue of each row = unit price times number of items in that row.
data['revenue'] = data['item_price'].mul(data['item_cnt_day'])
data.head()

In [None]:
# Total revenue per date_block_num.
month_revenue = data.groupby('date_block_num')['revenue'].sum()

In [None]:
# Line chart of the summed revenue per date block.
plt.figure(figsize=(16, 8))
plt.plot(month_revenue)
plt.gca().set(
    title="Monthly revenue of the company",
    xlabel="Consecutive month from Jan 2013 to Oct 2015",
    ylabel="Revenue",
)
plt.show()

In [None]:
# Revenue per date block with the y-axis labelled in millions.
from matplotlib.ticker import FuncFormatter


def millions(x, pos):
    """Format a tick value in millions with one decimal, e.g. 1500000 -> '1.5M'.

    The two args are the value and tick position; ``pos`` is not used in the
    body but is part of the callback signature FuncFormatter calls with.
    """
    return '%1.1fM' % (x * 1e-6)


formatter = FuncFormatter(millions)

# plt.subplots() places the axes inside the figure. An add_axes rect whose
# left edge is 10 (in figure-fraction units) would put them ten figure-widths
# to the right of the canvas.
fig, ax = plt.subplots()
ax.yaxis.set_major_formatter(formatter)

# Sum the revenue per date block before plotting: drawing a line through every
# individual row of `data` only produces overlapping vertical strokes.
ax.plot(data.groupby('date_block_num')['revenue'].sum())
ax.set_xlabel("date_block_num")
ax.set_ylabel("Revenue")
plt.show()

In [None]:
# Thanks to https://www.geeksforgeeks.org/python-pandas-dataframe-groupby/

# Sum revenue for every (year, month) pair; the group keys stay as ordinary
# columns so the result is a flat table.
total_revenue_per_month = (
    data.groupby(['year', 'month'], as_index=False)['revenue'].sum()
)
total_revenue_per_month.head(20)

## Yearwise revenue

In [None]:
# Sum revenue per year; 'year' stays an ordinary column of the result.
total_revenue_per_year = data.groupby('year', as_index=False)['revenue'].sum()
total_revenue_per_year.head()

In [None]:
# Thanks to https://matplotlib.org/3.1.0/gallery/ticks_and_spines/custom_ticker1.html
# Thanks to https://www.tutorialspoint.com/matplotlib/matplotlib_bar_plot.htm

from matplotlib.ticker import FuncFormatter


def millions(x, pos):
    """Format a tick value in millions with one decimal, e.g. 1500000 -> '1.5M'.

    The two args are the value and tick position; ``pos`` is not used in the
    body but is part of the callback signature FuncFormatter calls with.
    """
    return '%1.1fM' % (x * 1e-6)


formatter = FuncFormatter(millions)

# plt.subplots() keeps the axes inside the figure (an add_axes rect starting
# at left=10 lies far outside the canvas).
fig, ax = plt.subplots()
ax.set_xticks(np.arange(2013, 2016, 1))
ax.yaxis.set_major_formatter(formatter)

# One bar per year from the yearly totals. Feeding the per-month table with
# 'year' as x would stack twelve bars on the same position, so each visible
# bar would show the largest month rather than the year's total.
ax.bar(total_revenue_per_year['year'], total_revenue_per_year['revenue'])
ax.set_xlabel("Year")
ax.set_ylabel("Revenue")
plt.show()

In [None]:
# Yearly revenue as a line with point markers, y-axis in millions.
# A fresh figure is created here: the `ax` left over from the previous cell
# belongs to a figure that has already been shown, and under the inline
# backend further drawing on it is not displayed.
fig, ax = plt.subplots()
ax.set_xticks(np.arange(2013, 2016, 1))

# `formatter` (FuncFormatter(millions)) is defined in the cells above, so the
# tick-label function is not redefined here.
ax.yaxis.set_major_formatter(formatter)

# Plot the yearly totals: with the per-month table and 'year' as x, all twelve
# months of a year share one x position and the connecting line degenerates
# into vertical strokes.
ax.scatter(total_revenue_per_year['year'], total_revenue_per_year['revenue'])
ax.plot(total_revenue_per_year['year'], total_revenue_per_year['revenue'])
ax.set_xlabel("Year")
ax.set_ylabel("Revenue")
plt.show()

In [None]:
# Yearly revenue with the years treated as categorical (string) x values.
# The formatter has to be attached to the axes that is actually drawn on.
# Setting it on an `ax` left over from an earlier cell while plotting through
# plt.plot() leaves the new plot with default tick labels.
fig, ax = plt.subplots()

# `formatter` (FuncFormatter(millions)) is defined in the cells above.
ax.yaxis.set_major_formatter(formatter)

# The column is converted in place, so the next cell sees string years.
total_revenue_per_year['year'] = total_revenue_per_year['year'].astype(str)
ax.plot(total_revenue_per_year['year'], total_revenue_per_year['revenue'])
plt.show()

In [None]:
# Display the 'year' column of the yearly revenue table.
total_revenue_per_year['year']

In [None]:
# Cast the years to integers and plot yearly revenue with integer-only ticks.
total_revenue_per_year['year'] = total_revenue_per_year['year'].astype(int)
plt.gca().locator_params(integer=True)
plt.plot(total_revenue_per_year['year'], total_revenue_per_year['revenue'])

# ARIMA model Implementation

In [None]:
# ARIMA lives in statsmodels.tsa.arima.model; the older
# statsmodels.tsa.arima_model module was deprecated and later removed.
from statsmodels.tsa.arima.model import ARIMA

# Model the monthly revenue series (one value per date_block_num) with an
# ARIMA(4, 1, 0). The raw data['revenue'] column holds one value per
# transaction row, which is not an evenly spaced time series.
model = ARIMA(month_revenue, order=(4, 1, 0))

In [None]:
model.fit()

In [None]:
predictions = model.fit().predict()

In [None]:
# Display the values returned by the fitted model's predict() call.
predictions

In [None]:
# Line plot of the model predictions on the current axes.
plt.gca().plot(predictions)