<a href="https://colab.research.google.com/github/sandhrabijoy/Store-Item-Demand-Forecasting/blob/main/SalesForecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setting up the basics for the forecasting pipeline: imports, pandas display options, plotting defaults, and helper functions.


In [4]:
import numpy as np
import pandas as pd

import math
import itertools

#matplotlib libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
import seaborn as sns

#date libraries
from dateutil import parser
from datetime import datetime, timedelta, date
import holidays

#prophet library
from prophet import Prophet
from prophet.diagnostics import performance_metrics
from prophet.plot import plot_cross_validation_metric
from prophet.diagnostics import cross_validation

# pandas display options: render floats with two decimals and lift the
# column/row display limits (None disables the limit).
for _option, _setting in (
    ('display.float_format', lambda x: '%.2f' % x),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(_option, _setting)

# Global plot styling: seaborn theme plus matplotlib rc overrides.
# `sns.set_theme` is the preferred entry point; `sns.set` is only an alias of
# it that seaborn documents as possibly being removed in the future.
# NOTE(review): 'Arial' may not be installed on every runtime (e.g. Colab);
# confirm the font resolves, otherwise matplotlib falls back to its default.
sns.set_theme(font='Arial',rc={
    # axes: no grid, light spines on the left/bottom only
    "axes.axisbelow":False,
    "axes.edgecolor":"lightgrey",
    "axes.facecolor":"None",
    "axes.grid":False,
    "axes.labelcolor":"dimgrey",
    "axes.spines.right":False,
    "axes.spines.top":False,
    # figure, lines and patches
    "figure.facecolor":"white",
    "lines.solid_capstyle":"round",
    "patch.edgecolor":"w",
    "patch.force_edgecolor":True,
    "text.color":"dimgrey",
    # ticks: marks hidden on every side, labels kept in dim grey
    "xtick.bottom":False,
    "xtick.color":"dimgrey",
    "xtick.direction":"out",
    "xtick.top":False,
    "ytick.color":"dimgrey",
    "ytick.direction":"out",
    "ytick.left":False,
    "ytick.right":False
})

In [5]:
def missing_data(input_data):
  """Summarise missing values and dtypes for every column of a DataFrame.

  Parameters
  ----------
  input_data : pandas.DataFrame
    Frame to inspect. Duplicate column labels are supported.

  Returns
  -------
  pandas.DataFrame
    One row per column of ``input_data`` (indexed by the column labels) with:
    ``Total``   - number of null values in the column,
    ``Percent`` - nulls as a percentage of the row count (NaN for an empty frame),
    ``Types``   - the column dtype rendered as a string.
  """
  # Scan for nulls once and derive both statistics from the same counts.
  total = input_data.isnull().sum()
  percent = total / len(input_data) * 100
  # Read dtypes positionally from `.dtypes`; `input_data[col].dtype` breaks when
  # labels repeat because `input_data[col]` is then a DataFrame, not a Series.
  types = [str(dtype) for dtype in input_data.dtypes]
  # Assemble from plain arrays so no index alignment is attempted on
  # potentially non-unique column labels.
  return pd.DataFrame(
    {'Total': total.to_numpy(), 'Percent': percent.to_numpy(), 'Types': types},
    index=input_data.columns,
  )

def mape(actual,pred):
  """Mean absolute percentage error, in percent.

  Computes ``mean(|actual - pred| / |actual|) * 100``.

  Parameters
  ----------
  actual : array-like
    Observed values.
  pred : array-like or scalar
    Predicted values; broadcast against ``actual``.

  Returns
  -------
  float
    MAPE in percent. Observations whose actual value is exactly zero are
    skipped because their percentage error is undefined; NaN is returned
    when no observation has a non-zero actual.
  """
  actual, pred = np.broadcast_arrays(
    np.asarray(actual, dtype=float), np.asarray(pred, dtype=float)
  )
  # A zero actual would turn the whole mean into inf/nan, so leave it out.
  valid = actual != 0
  if not valid.any():
    return np.nan
  # Divide by |actual| so negative actuals still yield a non-negative error.
  abs_pct_error = np.abs(actual[valid] - pred[valid]) / np.abs(actual[valid])
  return np.mean(abs_pct_error) * 100

Reading Data

In [11]:
# Load the raw training data.
# NOTE(review): the path points into a mounted Google Drive; no mount step is
# visible in this part of the notebook - confirm it exists before Run All.
train_csv_path = "/content/drive/MyDrive/Colab Notebooks/train.csv"
df = pd.read_csv(train_csv_path)

In [12]:
# Preview the first rows of the loaded frame as a sanity check.
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [13]:
# Show the column labels of the loaded frame.
df.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion'], dtype='object')

In [16]:
# Parse the 'YYYY-MM-DD' date strings into datetime64 values.
# The column is passed straight to `pd.to_datetime` (no `.str` preprocessing)
# so the cell stays re-runnable: once the column is already datetime64 a
# `.str` accessor would raise AttributeError, whereas `to_datetime` simply
# returns the values unchanged.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

In [18]:
# First and last date covered by the data.
# Series.min()/max() are vectorised and skip NaT, unlike the builtin min()/max(),
# which iterate the column element by element in Python.
df['date'].min(), df['date'].max()

(Timestamp('2013-01-01 00:00:00'), Timestamp('2017-08-15 00:00:00'))

In [19]:
# Sum sales over all stores for each (date, family) pair, then order the rows
# family-first so each family's daily series is contiguous.
agg_df = (
    df.groupby(['date', 'family'])['sales']
    .sum()
    .reset_index()
    .sort_values(['family', 'date'])
)
agg_df.head()

Unnamed: 0,date,family,sales
0,2013-01-01,AUTOMOTIVE,0.0
33,2013-01-02,AUTOMOTIVE,255.0
66,2013-01-03,AUTOMOTIVE,161.0
99,2013-01-04,AUTOMOTIVE,169.0
132,2013-01-05,AUTOMOTIVE,342.0
