# Demand Forecasting
 Dataset : Historical Product Demand from Kaggle
 
 The dataset contains historical product demand for a manufacturing company with a global footprint. The company provides thousands of products within dozens of product categories. There are four central warehouses, each shipping products within the region it is responsible for. Since the products are manufactured in different locations all over the world, it normally takes more than one month to ship products via ocean to the different central warehouses. If the monthly demand for the month after next can be forecast with reasonable accuracy for each product at each central warehouse, it would be beneficial to the company in multiple ways.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

IMPORT LIBRARY

**Import libraries for data visualization**

In [None]:
import matplotlib.pyplot as plt
import sys 
import matplotlib 
import seaborn as sns

DATA SET FROM KAGGLE

**Import dataset from kaggle**

**Check the data types and data set description**

In [None]:
FilePath = r'/kaggle/input/productdemandforecasting/Historical Product Demand.csv'
df = pd.read_csv(FilePath)

# Print the column dtypes, the first rows and the summary statistics,
# each followed by a tab-only spacer line.
print("Data Types")
for overview in (df.dtypes, df.head(), df.describe()):
    print(overview)
    print("\t")

DATA WRANGLING AND VISUALIZATION

From describe function we know that there are:
1. 2160 unique products
2. 4 different warehouses
3. 33 product categories

From the dtypes output we know that every column is stored as object, so we need to convert the data types:
1. Date column to datetime data types
2. Order_Demand to integer data types



In [None]:
# The visualization of raw dataset
# Groupby Warehouse

ProdWarehouse = df.groupby(['Warehouse']).count()['Product_Category'].reset_index(name='counts').sort_values(['counts'],ascending=False)


%matplotlib inline
plt.figure(figsize=(15,15))
ax = sns.barplot(y='counts',x='Warehouse',data=ProdWarehouse,palette="tab10")
ax.set(ylabel='counts', xlabel='Warehouse')
plt.show()

In [None]:
#Groupby Category
ProdCategory = df.groupby(['Product_Category']).size().reset_index(name='counts').sort_values(['counts'],ascending=False)
 
%matplotlib inline
plt.figure(figsize=(15,15))
ax = sns.barplot(              y='Product_Category',x='counts',data=ProdCategory,palette="tab10")
ax.set(ylabel='Product_Category', xlabel='counts')
plt.show()

In [None]:

def _demand_by_category(frame, warehouse, total_column):
    """Return total order demand per product category for one warehouse.

    The result has the columns ``Product_Category`` and ``total_column`` and is
    sorted by total demand, largest first.
    """
    rows = frame.loc[frame['Warehouse'] == warehouse, ['Product_Category', 'Order_Demand']]
    # Order_Demand is still raw text at this point in the notebook (it is only
    # converted in the later "Change data types" cell), so summing it directly
    # would not add up quantities. Parse it the same way that cell does; the
    # astype(str) round-trip keeps this safe if the column is already numeric.
    demand = rows['Order_Demand'].astype(str).map(lambda a: a.lstrip('(').rstrip(')')).astype(int)
    return (
        demand.groupby(rows['Product_Category'])
        .sum()
        .reset_index(name=total_column)
        .sort_values([total_column], ascending=False)
    )


f, axes = plt.subplots(2, 2, figsize=(20, 20), sharex=True)
sns.despine(left=True)

# One demand table per warehouse (names kept so they remain available to other cells)
prod_WHouseJ = _demand_by_category(df, 'Whse_J', 'Order_Demand_WHouseJ')
prod_WHouseA = _demand_by_category(df, 'Whse_A', 'Order_Demand_WHouseA')
prod_WHouseS = _demand_by_category(df, 'Whse_S', 'Order_Demand_WHouseS')
prod_WHouseC = _demand_by_category(df, 'Whse_C', 'Order_Demand_WHouseC')

# Subplot layout: J (top-left), A (top-right), S (bottom-left), C (bottom-right)
panels = [
    (axes[0, 0], prod_WHouseJ, 'Order_Demand_WHouseJ'),
    (axes[0, 1], prod_WHouseA, 'Order_Demand_WHouseA'),
    (axes[1, 0], prod_WHouseS, 'Order_Demand_WHouseS'),
    (axes[1, 1], prod_WHouseC, 'Order_Demand_WHouseC'),
]
for panel_axis, totals, total_column in panels:
    sns.barplot(x=total_column, y='Product_Category', data=totals, palette="tab10", ax=panel_axis)
    panel_axis.set(xlabel=total_column, ylabel='Product_Category')

plt.show()

In [None]:
# Convert the raw text columns to proper dtypes

df['Date'] = pd.to_datetime(df['Date'])
df['Order_Demand'] = (
    df['Order_Demand']
    .astype(str)
    .str.lstrip('(')   # quantities wrapped in parentheses lose the brackets
    .str.rstrip(')')
    .astype(int)
)


# Chronologically ordered copy with a fresh 0..n-1 index

df_SortedDate = df.sort_values('Date').reset_index(drop=True)
df_SortedDate

In [None]:
# Re-check the dtypes and a data sample after the conversion.
print("Data Types")
for overview in (df.dtypes, df.head()):
    print(overview)
    print("\t")

# Count the null entries of each column, in column order.
missing_per_column = df.isnull().sum().tolist()
print('Number of missing values by column', missing_per_column)
print('All missing values are in Date column.')

In [None]:
# Show the rows whose Date is missing.

bool_series = df["Date"].isnull()
df.loc[bool_series]

In [None]:
# Discard every row that contains a missing value (i.e. the orders without a Date)
df_Date = df.dropna()

print("Data Dimension", df_Date.shape)
print("\t")
print("The number of products is", df_Date['Product_Code'].nunique())
print("Period range is from", df_Date['Date'].min(), "to", df_Date['Date'].max())
print("Order Quantity is from", df_Date['Order_Demand'].min(), "to", df_Date['Order_Demand'].max())
print("\t")

In [None]:
# Rows that agree on product, date, quantity and warehouse are treated as the
# same order entered more than once by mistake; only the last occurrence is kept.

duplicate_key = ['Product_Code', 'Date', 'Order_Demand', 'Warehouse']
df_NoDup = df_Date.drop_duplicates(subset=duplicate_key, keep='last')
df_NoDup

In [None]:
# The data runs from 2011-01-08 to 2017-01-09, so January 2017 is only partly
# covered. Because the forecast horizon is monthly, keeping that partial month
# would distort the model, so every order dated 2017-01-01 or later is removed.

df_11_to_16 = df_NoDup[df_NoDup['Date'] < pd.Timestamp('2017-01-01')]
df_11_to_16

In [None]:
# Total demand per product and calendar month, ordered by month.
monthly_totals = (
    df_11_to_16
    .assign(Period=df_11_to_16['Date'].dt.to_period('M'))
    .groupby(['Product_Code', 'Period'])['Order_Demand']
    .sum()
)
df_monthly = monthly_totals.reset_index().sort_values('Period').reset_index(drop=True)
df_monthly

In [None]:
# Compare the months present in the data with the full 2011-01 .. 2016-12 range;
# the displayed set contains the months that have no data at all.

period_range = set(pd.period_range('2011-01', '2016-12', freq='M'))
data_period = set(df_monthly['Period'])
period_range - data_period

In [None]:
# Five months of 2011 (Feb, Mar, Apr, Jul and Aug) have no data.
# Keep only the records from 2012-01 onward, i.e. drop everything up to and including 2011-12.

df_monthly = df_monthly[df_monthly['Period'] >= '2012-01']
df_monthly

In [None]:
# A product counts as stopped when its most recent order month is not in 2016
# (2016-12 is the last month left after January 2017 was removed).

latest_datamonth = df_monthly.groupby('Product_Code')['Period'].max().reset_index()
ordered_in_2016 = latest_datamonth['Period'] > '2015-12'
latest_datamonth = latest_datamonth[ordered_in_2016]

data_ready = df_monthly[df_monthly['Product_Code'].isin(latest_datamonth['Product_Code'])]
data_ready

In [None]:
# Remove new products for statistical forecast method
# A minimum of 24 months of data is required to forecast trends and seasonality using statistical forecasting methods.

from operator import attrgetter

# First and last month with demand for every product
duration_data = data_ready.groupby('Product_Code').agg({'Period': ['min', 'max']}).reset_index()
# Period - Period yields a month offset; `.n` is its length in months. +1 makes the span inclusive.
duration_data['Duration (Month)'] = (duration_data[('Period', 'max')] - duration_data[('Period', 'min')]).apply(attrgetter('n')) + 1
# ">= 24": a history of exactly 24 months already meets the stated minimum
# (the previous "> 24" silently required 25 months).
duration_data = duration_data.loc[duration_data['Duration (Month)'] >= 24]
data = data_ready.loc[data_ready['Product_Code'].isin(duration_data['Product_Code'])]
data

In [None]:
# Products whose history is too short for the statistical methods are treated
# as new products; they need different forecasting methods.

has_long_history = data_ready['Product_Code'].isin(duration_data['Product_Code'])
data_NewProd = data_ready[~has_long_history]
data_NewProd

In [None]:
# The dataset includes 2061 products
# Choose several products for demonstration

# Sum only Order_Demand: summing the whole frame would also try to add up the
# Period column, which is not summable.
data_sorted = (
    data.groupby('Product_Code')['Order_Demand']
    .sum()
    .reset_index()
    .sort_values('Order_Demand', ascending=False)
)
data_sorted.head(5)

In [None]:
# One row per month, one column per product. Months without a record for a
# product are filled with 0, i.e. they are taken to mean zero demand.
data_transpose = (
    data.pivot_table(values='Order_Demand', index='Period', columns='Product_Code', aggfunc=np.sum)
    .rename_axis("", axis="columns")
    .fillna(0)
)
data_transpose

In [None]:
# Modelling all 2061 products takes too long, so only a handful of
# high-demand products are kept for demonstration.
demo_products = ['Product_1359', 'Product_1248', 'Product_0083', 'Product_1341', 'Product_1241']
data_set = data_transpose[demo_products]
data_set.shape

In [None]:
# Display the monthly demand table restricted to the selected products.
data_set

TIME SERIES VISUALIZATION

In [None]:
# Draw one line chart per product column before the train/test split
import warnings
for product in data_set.columns:
    product_axes = data_set[product].plot(figsize=(15, 5), color ='green')
    plt.show(block=False)
    # Close the figure so the next product starts on a fresh one.
    plt.close(product_axes.get_figure())

DECOMPOSE THE DATA

By looking at the graph of sales data above, we can see a general increasing trend with no clear pattern of seasonal or cyclical changes. The next step is to decompose the data to view more of the complexity behind the linear visualization. A useful Python function called seasonal_decompose within the 'statsmodels' package can help us to decompose the data into four different components:

* Observed
* Trended
* Seasonal
* Residual

https://www.bounteous.com/insights/2020/09/15/forecasting-time-series-model-using-python-part-one/

In [None]:
import statsmodels.api as sm

# graphs to show seasonal_decompose

#def seasonal_decompose (i):
# Run an additive seasonal decomposition on every selected product and plot
# the resulting components on a 14x7-inch figure.
for i in data_set.columns[0:]: 
    y=data_set[i]
    # Rebinds the index of the Series returned by data_set[i] to timestamps.
    # NOTE(review): later cells that read data_set[...] may rely on this
    # conversion having happened — confirm before changing this line.
    y.index=y.index.to_timestamp()
    # NOTE(review): extrapolate_trend='freq' presumably fills the trend at the
    # series ends instead of leaving NaN — confirm against the statsmodels docs.
    decomposition = sm.tsa.seasonal_decompose(y, model='additive',extrapolate_trend='freq')
    fig = decomposition.plot()
    fig.set_size_inches(14,7)
    plt.show()

CHECK FOR STATIONARY

Next, we need to check whether the dataset is stationary or not. A dataset is stationary if its statistical properties like mean, variance, and autocorrelation do not change over time.

Most time series datasets related to business activity are not stationary since there are usually all sorts of non-stationary elements like trends and economic cycles. But, since most time series forecasting models use stationarity—and mathematical transformations related to it—to make predictions, we need to ‘stationarize’ the time series as part of the process of fitting a model.

Two common methods to check for stationarity are Visualization and the Augmented Dickey-Fuller (ADF) Test. 

https://www.bounteous.com/insights/2020/09/15/forecasting-time-series-model-using-python-part-one/

https://www.kaggle.com/sumi25/understand-arima-and-tune-p-d-q

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, window=12, cutoff=0.01, maxlag=20):
    """Plot rolling statistics and print an Augmented Dickey-Fuller test report.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to examine. Missing values are allowed; they are ignored by
        the ADF test.
    window : int
        Window length used for the rolling mean and standard deviation.
    cutoff : float
        p-value threshold below which the series is reported as likely
        stationary.
    maxlag : int
        Largest lag order the ADF test may try. It is lowered automatically
        when the series is too short for the requested value.

    Returns
    -------
    None
        The function only plots and prints.
    """
    # Rolling statistics
    rolling_window = timeseries.rolling(window)
    rolmean = rolling_window.mean()
    rolstd = rolling_window.std()

    # Plot the series together with its rolling statistics
    plt.figure(figsize=(12, 8))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    # Series produced by rolling windows or shifts start with NaN, which
    # adfuller cannot process, so the test runs on the observed values only.
    observed = timeseries.dropna()
    # adfuller rejects maxlag > nobs // 2 - ntrend - 1; with the default
    # constant-only regression ntrend is 1, hence the "- 2".
    usable_maxlag = max(0, min(maxlag, len(observed) // 2 - 2))
    dftest = adfuller(observed, autolag='AIC', maxlag=usable_maxlag)
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    pvalue = dftest[1]
    if pvalue < cutoff:
        print('p-value = %.4f. The series is likely stationary.' % pvalue)
    else:
        print('p-value = %.4f. The series is likely non-stationary.' % pvalue)

    print(dfoutput)

In [None]:
# Run the stationarity report for each of the five selected products
# (Product_1359, Product_1248, Product_0083, Product_1341, Product_1241);
# the product code is printed after its report.

for product in data_set.columns:
    test_stationarity(data_set[product])
    print (product)

MAKE THE DATA STATIONARY

From the stationarity test above, we know that Product 0083 and Product 1341 are likely non-stationary.

To proceed with our time series analysis, we need to stationarize the dataset. There are many approaches to stationarize data, but we’ll use de-trending, differencing, and then a combination of the two.



In [None]:
# Detrending

def detrending_func(y, window=12):
    """Return ``y`` standardised against its own rolling window.

    Each value has the rolling mean subtracted and is divided by the rolling
    standard deviation. The first ``window - 1`` results are NaN because the
    window is not yet full.

    Parameters
    ----------
    y : pandas.Series
        Series to detrend.
    window : int
        Rolling window length; the default of 12 matches one year of monthly data.
    """
    rolling_window = y.rolling(window=window)
    return (y - rolling_window.mean()) / rolling_window.std()

In [None]:
# Detrend the Product_0083 series and display the result.
y=data_set['Product_0083']
a=detrending_func(y)
#test_stationarity(a)
a

In [None]:
# Differencing

def differencing_func(y, lag=12):
    """Return the seasonal difference ``y[t] - y[t - lag]``.

    The first ``lag`` results are NaN because no earlier value exists.

    Parameters
    ----------
    y : pandas.Series
        Series to difference.
    lag : int
        Number of steps to look back; the default of 12 compares each month
        with the same month one year earlier.
    """
    return y - y.shift(lag)

In [None]:
# Apply the 12-step differencing to the Product_0083 series and display the result.
y=data_set['Product_0083']
b=differencing_func(y)
#test_stationarity(b)
b

In [None]:
# Detrending + Differencing
def diff_detrend_func(y, window=12, lag=12):
    """Detrend ``y`` with rolling statistics, then difference the result.

    The series is first standardised against its rolling mean and standard
    deviation, and the standardised series is then differenced at ``lag``
    steps. The first ``window - 1 + lag`` results are NaN.

    Parameters
    ----------
    y : pandas.Series
        Series to transform.
    window : int
        Rolling window length used for detrending (default 12).
    lag : int
        Number of steps used for differencing (default 12).
    """
    rolling_window = y.rolling(window=window)
    y_detrend = (y - rolling_window.mean()) / rolling_window.std()
    return y_detrend - y_detrend.shift(lag)

In [None]:
# Detrend and difference the Product_0083 series, then run the stationarity report on it.
y=data_set['Product_0083']
c=diff_detrend_func(y)
test_stationarity(c)

In [None]:
# Product 0083 is likely non-stationary
# Stationarize this product

y=data_set['Product_0083']

# Reduce its trend using a log transformation
# NOTE(review): np.log gives -inf for zero values, and months without demand
# were filled with 0 earlier — confirm this product has no zero months.

y_log=np.log(y)
plt.plot(y_log)

There is some noise in realizing the forward trend here. There are some methods to model these trends and then remove them from the series. Some of the common ones are:
• Smoothing: using rolling/moving average
• Aggregation: by taking the mean for a certain time period (year/month)

https://medium.com/@stallonejacob/time-series-forecast-a-basic-introduction-using-python-414fcb963000

In [None]:
# Smoothing with a moving average
# Smoothing works on rolling estimates built from the most recent observations.
# Two variants are covered: the plain moving average (here) and the
# exponentially weighted moving average (further below).
# The window length follows the data frequency: 12 values for one year of monthly data.

moving_avg = y_log.rolling(window=12).mean()
plt.plot(y_log)
plt.plot(moving_avg, color='red')

In [None]:
# Deviation of the log series from its 12-month moving average
y_log_moving_avg_diff = y_log.sub(moving_avg)
y_log_moving_avg_diff

In [None]:
# Remove the leading entries for which the moving average is undefined
y_log_moving_avg_diff = y_log_moving_avg_diff.dropna()
y_log_moving_avg_diff

In [None]:
# Run the stationarity report on the moving-average-adjusted log series.
test_stationarity(y_log_moving_avg_diff)

In [None]:
# Smoothing with an exponentially weighted moving average (half-life of 12 months);
# all other ewm options are left at their pandas defaults.

expweighted_avg = y_log.ewm(halflife=12).mean()
plt.plot(y_log)
plt.plot(expweighted_avg, color='red')

In [None]:
# Stationarity report for the log series minus its exponentially weighted average

y_log_ewma_diff = y_log.sub(expweighted_avg)
test_stationarity(y_log_ewma_diff)

Stationary test for product 1341

The previous test result states that, after exponentially weighted smoothing, the series is still non-stationary. That conclusion is based on a 1% cut-off (the 1% critical value).

However, the test statistic is smaller than the 10% critical value, so non-stationarity can be rejected at the 10% significance level; in other words, we are roughly 90% confident that the series is stationary.



SEASONALITY ( ALONG WITH TREND )

Previously we saw just trend part of the time series, now we will see both trend and seasonality. 

Most Time series have trends along with seasonality. 

There are two common methods to remove trend and seasonality, they are:

• Differencing: by taking difference using time lag

• Decomposition: model both trend and seasonality, then remove them

In [None]:
# Seasonality (along with trend)
# Differencing: each value minus the value one step earlier

y_log_diff = y_log.diff()
plt.plot(y_log_diff)


In [None]:
# The plot looks fine, but confirm it with the stationarity report.
# The first entry is undefined after differencing, so it is removed first.

y_log_diff = y_log_diff.dropna()
test_stationarity(y_log_diff)



In [None]:
# Seasonality (along with trend)
# Decomposition

import statsmodels.api as sm

# graphs to show seasonal_decompose

def seasonal_decompose (x):
    """Plot the additive seasonal decomposition of ``x`` on a 14x7-inch figure."""
    components = sm.tsa.seasonal_decompose(x, model='additive', extrapolate_trend='freq')
    figure = components.plot()
    figure.set_size_inches(14, 7)
    plt.show()
    

In [None]:
# Plot the decomposition of the log-transformed series.
seasonal_decompose (y_log)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
def seasonal_decom (a):
    """Decompose ``a`` and plot the original, trend, seasonal and residual series.

    The four series are drawn as stacked panels, top to bottom, each with its
    own legend.

    Parameters
    ----------
    a : pandas.Series
        Time series to decompose.

    Returns
    -------
    The result object of ``seasonal_decompose``, so callers can read its
    ``trend``, ``seasonal`` and ``resid`` components.
    """
    decomposition = seasonal_decompose(a)

    # (subplot position, series, legend label) for each of the four panels
    panels = (
        (411, a, 'Original'),
        (412, decomposition.trend, 'Trend'),
        (413, decomposition.seasonal, 'Seasonality'),
        (414, decomposition.resid, 'Residuals'),
    )
    for position, series, label in panels:
        plt.subplot(position)
        plt.plot(series, label=label)
        # The last panel previously called legend(residual='best'), which is
        # not a valid keyword; every panel now uses loc='best'.
        plt.legend(loc='best')
    plt.tight_layout()

    return decomposition



In [None]:
# Plot the original, trend, seasonal and residual panels for the log-transformed series.
seasonal_decom (y_log)

In [None]:
# `residual` is a local variable of seasonal_decom and is not visible here,
# so take the residual component from a decomposition computed in this cell.
y_log_decompose = sm.tsa.seasonal_decompose(y_log).resid
# Drop the missing residuals before running the stationarity report.
y_log_decompose = y_log_decompose.dropna()
test_stationarity(y_log_decompose)

In [None]:
# ARIMA is imported from statsmodels.tsa.arima.model, the same module the
# later ARIMA cells of this notebook use, instead of the old arima_model module.
from statsmodels.tsa.arima.model import ARIMA
# ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf

# Autocorrelation and partial autocorrelation of the differenced log series, up to lag 20
lag_acf=acf(y_log_diff, nlags=20)
lag_pacf=pacf(y_log_diff, nlags=20, method='ols')

# Plot ACF
plt.subplot(121)
plt.plot(lag_acf)


Of the five products we chose before, three have a stationary time series:

Product 1359, Product 1248, and Product 1241. We will build a forecasting model for these three products.

In [None]:
# function to split train and test set
# history length of a product starts from the month of first order, not all products have the same history length
# the function is to split train-test based on product's history length instead of dataset length

def train_test_split(data_set):
    """Split one product's demand history into train and test parts.

    The history starts at the first non-zero value; leading zeros are
    discarded. The first 80% of the remaining observations form the training
    set and the rest the test set, so the split depends on the product's own
    history length rather than on the length of the whole dataset.

    Parameters
    ----------
    data_set : pandas.Series
        Demand values in chronological order.

    Returns
    -------
    tuple
        ``(train_set, test_set, data_set)`` where ``data_set`` is the trimmed
        history.

    Raises
    ------
    ValueError
        If the series contains no non-zero value.
    """
    values = data_set.values.tolist()
    # Position of the first order (the first value that is not zero)
    first_order = next((pos for pos, value in enumerate(values) if value != 0), None)
    if first_order is None:
        raise ValueError("data_set contains no non-zero demand value")

    data_set = data_set.iloc[first_order:]
    # Split on the trimmed history length, not on the length of another object.
    split_at = int(0.8 * len(data_set))
    train_set = data_set.iloc[:split_at]
    test_set = data_set.iloc[split_at:]
    return train_set, test_set, data_set

In [None]:
# Build the list of index values to forecast for.
# NOTE(review): `start` is the last index label of `data` plus one — confirm
# that `data` is indexed by the intended period values at this point.
start = data.index.tolist()[-1] + 1
fcastperiods = 12  # number of periods to forecast; forecast users may change it
full_period = [start + step for step in range(fcastperiods)]

In [None]:
# The import keyword was misspelled ("rom"), which made this cell a syntax error.
# ARIMA is imported from statsmodels.tsa.arima.model, the same module the
# later ARIMA cells of this notebook use, instead of the old arima_model module.
from statsmodels.tsa.arima.model import ARIMA
# ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf

# Autocorrelation and partial autocorrelation of the differenced log series, up to lag 20
lag_acf=acf(y_log_diff, nlags=20)
lag_pacf=pacf(y_log_diff, nlags=20, method='ols')

# Plot ACF
plt.subplot(121)
plt.plot(lag_acf)

In [None]:

# ARIMA example
from statsmodels.tsa.arima.model import ARIMA
from random import random
# Contrived dataset: an upward ramp with random noise.
# It has its own name so that the `data_set` DataFrame used by the earlier
# cells is not overwritten when this example runs.
contrived_series = [x + random() for x in range(1, 100)]
# fit model
model = ARIMA(contrived_series, order=(1, 1, 1))
model_fit = model.fit()
# Make a one-step-ahead prediction for the first point after the sample.
# `typ='levels'` is dropped: it is an argument of the old arima_model API.
yhat = model_fit.predict(len(contrived_series), len(contrived_series))
print(yhat)

In [None]:
# fit an ARIMA model and plot residual errors
from pandas import datetime
from pandas import read_csv
from pandas import DataFrame
from statsmodels.tsa.arima.model import ARIMA
from matplotlib import pyplot
# load dataset
def parser(x):
	return datetime.strptime('190'+x, '%Y-%m')
series = read_csv('shampoo-sales.csv', header=0, index_col=0, parse_dates=True, squeeze=True, date_parser=parser)
series.index = series.index.to_period('M')
# fit model
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit()
# summary of fit model
print(model_fit.summary())
# line plot of residuals
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
# density plot of residuals
residuals.plot(kind='kde')
pyplot.show()
# summary stats of residuals
print(residuals.describe())