In [1]:
import urllib
import urllib.parse

import numpy as np
import pandas as pd
import requests

from sqlalchemy import create_engine

# Custom upload with connection string
from engine_info import server_info

import warnings
warnings.filterwarnings('ignore')

from matplotlib import rcParams

In [2]:
rcParams['figure.figsize'] = 12,8

In [3]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [4]:
# Create a connection to MS SQL Server.
# `server_info` is the ODBC connection string imported from the local `engine_info`
# module; it must be URL-encoded before being embedded in the SQLAlchemy URL.
# `quote_plus` lives in the `urllib.parse` submodule, which has to be imported
# explicitly -- a bare `import urllib` does not load it.
params = urllib.parse.quote_plus(server_info)
engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}')
connection = engine.connect()

In [19]:
# Read the full market sales table into a DataFrame, parsing 'Date' as datetime
sales = pd.read_sql_table(
    'Durban_Fresh_produce_market',
    connection,
    parse_dates=['Date'],
)

## Data preparation

In [20]:
sales.head()

Unnamed: 0,Commodities,Weight_Kg,Size_Grade,Container,Province,Low_Price,High_Price,Average_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Date
0,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,CAPE,50.0,100.0,51.39,9250,180,1620,1397,2020-02-26
1,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,ORANGE FREE STATE,0.0,0.0,0.0,0,0,0,80,2020-02-26
2,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,NATAL,10.0,10.0,10.0,30,3,27,6,2020-02-26
3,APPLE GOLDEN DELICIOUS,9.0,2U,JE090,CAPE,50.0,50.0,50.0,9000,180,1620,0,2020-02-26
4,APPLE GOLDEN DELICIOUS,18.3,2X,M4183,CAPE,0.0,0.0,0.0,0,0,0,1,2020-02-26


In [21]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151571 entries, 0 to 151570
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Commodities     151571 non-null  object        
 1   Weight_Kg       151571 non-null  object        
 2   Size_Grade      151571 non-null  object        
 3   Container       151571 non-null  object        
 4   Province        151571 non-null  object        
 5   Low_Price       151571 non-null  object        
 6   High_Price      151571 non-null  object        
 7   Average_Price   151571 non-null  object        
 8   Sales_Total     151571 non-null  object        
 9   Total_Qty_Sold  151571 non-null  object        
 10  Total_Kg_Sold   151571 non-null  object        
 11  Stock_On_Hand   151571 non-null  object        
 12  Date            151571 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(12)
memory usage: 15.0+ MB


In [22]:
# Columns that arrive from the database as `object` but hold numeric values
float_columns = ['Weight_Kg', 'Low_Price', 'High_Price', 'Average_Price',
                 'Sales_Total', 'Total_Kg_Sold', 'Total_Qty_Sold', 'Stock_On_Hand']

# Cast every listed column to a numeric dtype in one column-wise pass
sales[float_columns] = sales[float_columns].apply(pd.to_numeric)

In [23]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151571 entries, 0 to 151570
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Commodities     151571 non-null  object        
 1   Weight_Kg       151571 non-null  float64       
 2   Size_Grade      151571 non-null  object        
 3   Container       151571 non-null  object        
 4   Province        151571 non-null  object        
 5   Low_Price       151571 non-null  float64       
 6   High_Price      151571 non-null  float64       
 7   Average_Price   151571 non-null  float64       
 8   Sales_Total     151570 non-null  float64       
 9   Total_Qty_Sold  151570 non-null  float64       
 10  Total_Kg_Sold   151571 non-null  float64       
 11  Stock_On_Hand   151570 non-null  float64       
 12  Date            151571 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(8), object(4)
memory usage: 15.0+ MB


In [24]:
sales.head()

Unnamed: 0,Commodities,Weight_Kg,Size_Grade,Container,Province,Low_Price,High_Price,Average_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Date
0,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,CAPE,50.0,100.0,51.39,9250.0,180.0,1620.0,1397.0,2020-02-26
1,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,ORANGE FREE STATE,0.0,0.0,0.0,0.0,0.0,0.0,80.0,2020-02-26
2,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,NATAL,10.0,10.0,10.0,30.0,3.0,27.0,6.0,2020-02-26
3,APPLE GOLDEN DELICIOUS,9.0,2U,JE090,CAPE,50.0,50.0,50.0,9000.0,180.0,1620.0,0.0,2020-02-26
4,APPLE GOLDEN DELICIOUS,18.3,2X,M4183,CAPE,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2020-02-26


In [27]:
# Check for the number of days in the database
print(f"{sales['Date'].nunique()} days recorded in the database")

147 days recorded in the database


In [28]:
# Drop the rows whose Sales_Total is exactly 0: nothing was sold on those rows,
# and their Average_Price is registered as zero.
filtered_sales = sales.loc[sales['Sales_Total'].ne(0)]

In [29]:
print(f"{filtered_sales['Date'].nunique()} days recorded in the database")

147 days recorded in the database


Therefore no days were lost due to filtering by 'Sales_Total'

In [31]:
filtered_sales.head()

Unnamed: 0,Commodities,Weight_Kg,Size_Grade,Container,Province,Low_Price,High_Price,Average_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Date
0,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,CAPE,50.0,100.0,51.39,9250.0,180.0,1620.0,1397.0,2020-02-26
2,APPLE GOLDEN DELICIOUS,9.0,2S,JE090,NATAL,10.0,10.0,10.0,30.0,3.0,27.0,6.0,2020-02-26
3,APPLE GOLDEN DELICIOUS,9.0,2U,JE090,CAPE,50.0,50.0,50.0,9000.0,180.0,1620.0,0.0,2020-02-26
7,APPLE GRANNY SMITH,12.0,1S,EC120,CAPE,60.0,60.0,60.0,6000.0,100.0,1200.0,164.0,2020-02-26
8,APPLE GRANNY SMITH,12.0,1S,EC120,CAPE,60.0,96.0,62.3,13020.0,209.0,2508.0,0.0,2020-02-26


In [53]:
# Inspect a single commodity / container / province combination
is_queen_victoria = sales['Commodities'] == 'PINEAPPLE QUEEN VICTORIA'
in_lm080 = sales['Container'] == 'LM080'
from_natal = sales['Province'] == 'NATAL'
sales[is_queen_victoria & in_lm080 & from_natal].head(20)

Unnamed: 0,Commodities,Weight_Kg,Size_Grade,Container,Province,Low_Price,High_Price,Average_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Date
791,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,60.0,120.0,113.82,18780.0,165.0,1320.0,149.0,2020-04-07
792,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,50.0,100.0,92.0,65410.0,711.0,5688.0,430.0,2020-04-07
793,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,40.0,90.0,83.25,44290.0,532.0,4256.0,44.0,2020-04-07
794,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,15.0,70.0,57.78,9765.0,169.0,1352.0,46.0,2020-04-07
795,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,15.0,120.0,120.0,840.0,7.0,56.0,5.0,2020-04-07
1892,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,225.0,225.0,225.0,80775.0,359.0,2872.0,6.0,2020-05-19
1893,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,200.0,200.0,200.0,224800.0,1124.0,8992.0,100.0,2020-05-19
1894,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,200.0,200.0,200.0,88600.0,443.0,3544.0,2.0,2020-05-19
1895,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,180.0,180.0,180.0,13140.0,73.0,584.0,0.0,2020-05-19
1896,PINEAPPLE QUEEN VICTORIA,8.0,,LM080,NATAL,160.0,160.0,160.0,2400.0,15.0,120.0,5.0,2020-05-19


In [64]:
# Collapse all rows of the same product combination on the same date into a single
# record: lowest low price, highest high price, and summed totals.
# Aggregations are given by name ('min', 'max', 'sum') rather than as the Python
# builtins, which recent pandas releases deprecate as groupby aggregation callables.
group_keys = ['Province', 'Container', 'Size_Grade', 'Weight_Kg', 'Commodities', 'Date']
df = filtered_sales.groupby(group_keys).agg(
    {
        'Low_Price': 'min',
        'High_Price': 'max',
        'Sales_Total': 'sum',
        'Total_Qty_Sold': 'sum',
        'Total_Kg_Sold': 'sum',
        'Stock_On_Hand': 'sum',
    }
)

In [65]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand
Province,Container,Size_Grade,Weight_Kg,Commodities,Date,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,A8075,,7.5,HABANERO YELLOW,2020-07-01,350.0,350.0,700.0,2.0,15.0,6.0
,A8075,,7.5,HABANERO YELLOW,2020-07-08,350.0,350.0,1400.0,4.0,30.0,12.0
,A8075,,7.5,HABANERO YELLOW,2020-07-09,350.0,350.0,1400.0,4.0,30.0,12.0
,A8075,,7.5,HABANERO YELLOW,2020-07-14,350.0,350.0,1050.0,3.0,22.5,2.0
,A8075,,7.5,HABANERO YELLOW,2020-07-20,300.0,320.0,620.0,2.0,15.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
WESTERN FREESTATE,PP100,4M,10.0,POTATO LANORMA (WASHED),2020-03-10,8.0,10.0,1080.0,130.0,1300.0,0.0
WESTERN FREESTATE,PP100,4R,10.0,POTATO (WASHED) MONDIAL,2020-01-23,5.0,10.0,5470.0,1049.0,10490.0,0.0
WESTERN FREESTATE,PP100,4S,10.0,POTATO (WASHED) MONDIAL,2020-01-23,5.0,5.0,1440.0,288.0,2880.0,0.0
WESTERN FREESTATE,PP100,4S,10.0,POTATO (WASHED) MONDIAL,2020-06-19,5.0,5.0,835.0,167.0,1670.0,0.0


In [68]:
# Move every grouping level except 'Date' out of the index and back into regular
# columns, leaving the frame indexed by 'Date' only.
# NOTE: this mutates `df` in place, so the cell cannot be re-run on its own -- on a
# second run the named levels are no longer part of the index.
df.reset_index(['Province', 'Container', 'Size_Grade', 'Weight_Kg', 'Commodities'], inplace=True)

In [70]:
df.head()

Unnamed: 0_level_0,Province,Container,Size_Grade,Weight_Kg,Commodities,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-07-01,,A8075,,7.5,HABANERO YELLOW,350.0,350.0,700.0,2.0,15.0,6.0
2020-07-08,,A8075,,7.5,HABANERO YELLOW,350.0,350.0,1400.0,4.0,30.0,12.0
2020-07-09,,A8075,,7.5,HABANERO YELLOW,350.0,350.0,1400.0,4.0,30.0,12.0
2020-07-14,,A8075,,7.5,HABANERO YELLOW,350.0,350.0,1050.0,3.0,22.5,2.0
2020-07-20,,A8075,,7.5,HABANERO YELLOW,300.0,320.0,620.0,2.0,15.0,0.0


In [None]:
# Average price per kilogram for each daily record: total sales value divided by
# the total mass sold. A zero Total_Kg_Sold is mapped to NaN first, so such rows
# yield NaN instead of +/-inf.
# NOTE(review): the original right-hand side was `df['']`, which raises a KeyError;
# the formula below is inferred from the column name and the available columns --
# confirm it is the intended definition.
df['avg_per_kg'] = df['Sales_Total'] / df['Total_Kg_Sold'].replace(0, np.nan)

In [None]:
# Number of distinct (non-null) dates present after filtering; used to check which
# product is sold on a daily basis
day_count = len(filtered_sales['Date'].dropna().unique())


In [None]:
# Check which product is sold on a daily basis.
# For every product combination, compute the share of recorded days on which it was
# sold. Distinct dates are counted rather than rows, because one combination can
# have several rows on the same date; counting rows could push the ratio above 1.
day_count = filtered_sales['Date'].nunique()
product_keys = ['Province', 'Container', 'Size_Grade', 'Weight_Kg', 'Commodities']
(
    filtered_sales
    .groupby(product_keys)['Date']
    .nunique()
    .div(day_count)
    .sort_values(ascending=False)
)

In [None]:
# Restrict both frames to the APPLES commodity.
# NOTE(review): `inventory` is not defined in any cell shown above, and the `sales`
# frame loaded earlier has a 'Commodities' column rather than 'commodity' -- this
# cell appears to be carried over from a different dataset; confirm before running.
inventory = inventory[inventory['commodity'] == 'APPLES']
sales = sales[sales['commodity'] == 'APPLES']

In [None]:
# Keep only the records with a strictly positive total sales value
df = sales.loc[sales['total_value_sold'].gt(0)]

In [None]:
df['container'].value_counts()

In [None]:
# Count how often each product combination occurs within the 18.50KG CARTON container
carton_rows = df[df['container'] == '18.50KG CARTON']
carton_rows['product_combination'].value_counts()

In [None]:
# 10KG POCKET will be the focus of this notebook since it's the most active
# NOTE(review): the neighbouring cells filter on '18.50KG CARTON', not '10KG POCKET' --
# confirm which container is actually intended.
# Number of rows currently in `df`
len(df)

In [None]:
# Narrow `df` down to one container and one product combination
is_carton = df['container'] == '18.50KG CARTON'
is_target_product = df['product_combination'] == 'GOLDEN DELICIOUS,CL 1,*,100,*'
filtered_df = df[is_carton & is_target_product]

In [None]:
filtered_df.head()

In [None]:
# Keep only the two columns used by the time-series cells below: the date and
# the 'ave_per_kg' price column
price = filtered_df[['date', 'ave_per_kg']]

In [None]:
# Promote 'date' to the index (in place) so `price` becomes a date-indexed frame.
# NOTE: because of the in-place mutation this cell cannot be run twice in a row --
# after the first run 'date' is no longer a column.
price.set_index('date', inplace=True)

In [None]:
price.head()

In [None]:
# Order the rows by their index (in place)
price.sort_index(inplace=True)

In [None]:
price.index

The freq of the index is currently set to None; it needs to be changed to daily, since the data is recorded daily. Furthermore, since no data is available for weekends, the freq has to be set to business day (Mon-Fri), with a backfill method to account for holidays on which no data was recorded.

In [None]:
# Conform the index to business-day frequency; business days that are absent from
# the data are filled from the next available observation (backfill).
price = price.asfreq(freq='B', method='bfill')

In [None]:
price.index

The index freq has been set to 'B' for business day, and additional dates such as '2020-08-24' have been included, with their values filled in by the backfill method.

In [None]:
# Line chart of the price series with the y-axis labelled in R/kg
ax = price.plot(title="Average R/kg of GOLDEN DELICIOUS Class 1 Apples", figsize=(8, 5))
ax.set_ylabel('R/kg');

## Introduction

Due to competition, retailers aim to increase profits and reduce costs, increasing the profit margin for perishable food products. This means avoiding costs due to lost sales and, because of the short shelf life of their products, ensuring that there is no build-up of inventory. An efficient forecasting system can result in reduced inventory, be flexible to changes and increase profits.

Time series forecasting uses past observations of the same variable to develop a model describing the underlying relationship. The model is then used to extrapolate time series into the future. This approach is useful when there are no other explanatory variables influencing the generation of the underlying data. 

### Trend

The Hodrick-Prescott filter is used to get the trend of the data. This approach separates the time-series into a trend component and a cyclical component.

In [None]:
from statsmodels.tsa.filters.hp_filter import hpfilter

In [None]:
# hpfilter returns a (cycle, trend) pair, in that order.
# No `lamb` is passed, so the library's default smoothing parameter is used.
hp_components = hpfilter(price)
price_cycle = hp_components[0]
price_trend = hp_components[1]

In [None]:
price['trend'] = price_trend

In [None]:
# Overlay the HP trend on the observed series and fit the x-axis tightly to the data
ax = price[['trend', 'ave_per_kg']].plot()
ax.autoscale(tight=True, axis='x');

Method will probably work better once data is viewed on a monthly basis. For now the approach is acknowledged.

In [None]:
# Remove the temporary 'trend' column again so `price` holds only the observed series
# for the cells that follow (in-place; fails if the column is not present).
del price['trend']

### Seasonal Decomposition

Time series decomposition involves the deconstruction of the time series data into the trend, seasonal and noise component.

In [None]:
# Additive decomposition of the price series (model='mul' also works),
# followed by the library's standard multi-panel plot of the result
result = seasonal_decompose(
    price['ave_per_kg'],
    model='additive',
)
result.plot();

From above, it can be seen that the trend component is much better than when using the Hodrick-Prescott filter. This might be due to the data having a daily frequency.

## Holt - Winters method

The Holt-Winters method is a generalized exponential smoothing method that incorporates **trend** and **seasonal** variation in the model. The model makes use of exponential weighting of the coefficients of past observations in order to give more weight to the most recent observations.

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
# Hold out the final 15 observations for testing; everything before them is training data
train_data, test_data = price.iloc[:-15], price.iloc[-15:]

In [None]:
# Holt-Winters model with additive trend and additive seasonality, fitted on the
# training series.
# `price` was conformed to business-day frequency ('B') earlier, so one weekly cycle
# spans 5 observations rather than 7 -- seasonal_periods is set accordingly.
model = ExponentialSmoothing(
    train_data['ave_per_kg'],
    trend='add',
    seasonal='add',
    seasonal_periods=5,
)
fitted_model = model.fit()

In [None]:
# Forecast exactly as many steps as the hold-out set contains, so the predictions
# always line up with `test_data` even if the split size is changed.
test_predictions = fitted_model.forecast(len(test_data)).rename('Forecast')

In [None]:
# Draw the training data, the hold-out data and the forecast on one set of axes
ax = train_data['ave_per_kg'].plot(legend=True, label='TRAIN')
test_data['ave_per_kg'].plot(ax=ax, legend=True, label='TEST')
test_predictions.plot(ax=ax, legend=True, label='PREDICTION');

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error of the Holt-Winters forecast on the hold-out set.
# Only the target column is scored, so the metric does not depend on `test_data`
# containing exactly one column.
hw_pred = np.sqrt(mean_squared_error(test_data['ave_per_kg'], test_predictions))

In [None]:
hw_pred

## Autoregressive (AR) model

The Holt-Winters method forecasts the variable of interest using a linear combination of predictors. These predictors are the set of level, trend and seasonal predictors. 

The autoregression model uses a linear combination of past values of the variable. This is a regression equation whereby the variable of interest is regressed against a set of its lagged values of order $p$.

### $y_{t} = c + \phi_{1}y_{t-1} + \phi_{2}y_{t-2} + \dots + \phi_{p}y_{t-p} + \varepsilon_{t}$

where $c$ is a constant, $\phi_{1}, \phi_{2}, \dots, \phi_{p}$ are the lag coefficients up to order $p$, and $\varepsilon_{t}$ is white noise.

For example, an <strong>AR(1)</strong> model would follow the formula

&nbsp;&nbsp;&nbsp;&nbsp;$y_{t} = c + \phi_{1}y_{t-1} + \varepsilon_{t}$

whereas an <strong>AR(2)</strong> model would follow the formula

&nbsp;&nbsp;&nbsp;&nbsp;$y_{t} = c + \phi_{1}y_{t-1} + \phi_{2}y_{t-2} + \varepsilon_{t}$

and so on.

In [None]:
# Import AR model
# NOTE(review): `AR` is deprecated in statsmodels (from 0.11) and is not available in
# later releases, where `AutoReg` from the same module replaces it -- confirm the
# installed statsmodels version before running the AR cells.
from statsmodels.tsa.ar_model import AR

In [None]:
model = AR(train_data['ave_per_kg'])

### AR(1) model

In [None]:
ar1 = model.fit(maxlag=1)

In [None]:
# General pattern for obtaining predictions: the window starts at the first position
# after the training data and ends at the last position of the test data
start = len(train_data)
end = start + len(test_data) - 1
ar1_forecast = ar1.predict(start=start, end=end, dynamic=False)
predictions1 = ar1_forecast.rename('AR(1) Predictions')

### AR(2) model

In [None]:
# Fit with maxlag=2 on the training series, then predict over the test window
model = AR(train_data['ave_per_kg'])
ar2 = model.fit(maxlag=2)
ar2_forecast = ar2.predict(start=start, end=end, dynamic=False)
predictions2 = ar2_forecast.rename('AR(2) Predictions')

In [None]:
# Compare the AR(1) and AR(2) predictions with the hold-out data on shared axes
ax = test_data['ave_per_kg'].plot(legend=True)
predictions1.plot(ax=ax, legend=True)
predictions2.plot(ax=ax, legend=True);

### AR(5) model

In [None]:
# Fit with maxlag=5 on the training series, then predict over the test window
model = AR(train_data['ave_per_kg'])
ar5 = model.fit(maxlag=5)
ar5_forecast = ar5.predict(start=start, end=end, dynamic=False)
predictions5 = ar5_forecast.rename('AR(5) Predictions')

In [None]:
# Compare the AR(1), AR(2) and AR(5) predictions with the hold-out data on shared axes
ax = test_data['ave_per_kg'].plot(legend=True)
for forecast in (predictions1, predictions2, predictions5):
    forecast.plot(ax=ax, legend=True);

In [None]:
# Identify the best AR() model to use for forecasting
# NOTE(review): `fit()` is called without `maxlag` or an information criterion, so the
# lag order is presumably whatever the library picks by default rather than the
# result of an explicit model-selection step -- confirm against the statsmodels docs.
model = AR(train_data['ave_per_kg'])
arfit = model.fit()

In [None]:
arfit.params

### AR(7) model

In [None]:
# Fit with maxlag=7 on the training series, then predict over the test window
model = AR(train_data['ave_per_kg'])
ar7 = model.fit(maxlag=7)
ar7_forecast = ar7.predict(start=start, end=end, dynamic=False)
predictions7 = ar7_forecast.rename('AR(7) Predictions')

In [None]:
# Compare the AR(5) and AR(7) predictions with the hold-out data on shared axes
ax = test_data['ave_per_kg'].plot(legend=True)
predictions5.plot(ax=ax, legend=True)
predictions7.plot(ax=ax, legend=True);

## Autoregressive Moving Average (ARMA) model

The ARMA model is a combination of two models: the AR model, which utilizes past values of the time series data, and the Moving Average (MA) model, which uses past values of the forecast errors. As seen earlier, these models can also be used separately or, as in this section, combined.

## Autoregressive Integrated Moving Average (ARIMA) model