In [1]:
import numpy as np
import pandas as pd
import re
import jdatetime
import datetime
import calendar

In [2]:
sale_df = pd.read_excel('P2-SalesData.xlsx')
sale_df.head()

Unnamed: 0,Date,Sales
0,14010115,1395.898445
1,14010114,1654.535199
2,14010113,767.028903
3,14010112,1323.005591
4,14010111,1294.524908


In [3]:
sale_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111 entries, 0 to 1110
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1111 non-null   int64  
 1   Sales   1111 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 17.5 KB


In [4]:
def to_gregorian(datetime):
    datetime = int(datetime)
    return jdatetime.date(datetime//10000, datetime//100%100, datetime%100).togregorian()

In [5]:
sale_df['Date'] = sale_df['Date'].apply(to_gregorian)

In [6]:
sale_df['Date'] = pd.to_datetime(sale_df['Date'])
sale_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111 entries, 0 to 1110
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1111 non-null   datetime64[ns]
 1   Sales   1111 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 17.5 KB


In [7]:
def week_of_month(tgtdate):
    days_this_month = calendar.mdays[tgtdate.month]
    for i in range(1, days_this_month):
        d = datetime.datetime(tgtdate.year, tgtdate.month, i)
        if d.day - d.weekday() > 0:
            startdate = d
            break
    # now we canuse the modulo 7 appraoch
    return (tgtdate - startdate).days //7 + 1

In [8]:
# Create Cols
sale_df['Month Day'] = sale_df['Date'].dt.day
sale_df['Week Day'] = sale_df['Date'].dt.weekday
sale_df['Month'] = sale_df['Date'].dt.month
sale_df['Month Week'] = sale_df['Date'].apply(week_of_month)
sale_df['Year Week'] = sale_df['Date'].dt.weekofyear
sale_df['Year Day'] = sale_df['Date'].dt.dayofyear

  


In [9]:
sale_df

Unnamed: 0,Date,Sales,Month Day,Week Day,Month,Month Week,Year Week,Year Day
0,2022-04-04,1395.898445,4,0,4,1,14,94
1,2022-04-03,1654.535199,3,6,4,0,13,93
2,2022-04-02,767.028903,2,5,4,0,13,92
3,2022-04-01,1323.005591,1,4,4,0,13,91
4,2022-03-31,1294.524908,31,3,3,4,13,90
...,...,...,...,...,...,...,...,...
1106,2019-03-25,1314.944083,25,0,3,4,13,84
1107,2019-03-24,1278.500591,24,6,3,3,12,83
1108,2019-03-23,1316.051583,23,5,3,3,12,82
1109,2019-03-22,1281.464259,22,4,3,3,12,81


In [10]:
import keras
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
from keras.optimizers import Adam 
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers import LSTM
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from keras import backend as K


In [11]:
train_set = sale_df.drop(axis=1, labels='Date')
train_set

Unnamed: 0,Sales,Month Day,Week Day,Month,Month Week,Year Week,Year Day
0,1395.898445,4,0,4,1,14,94
1,1654.535199,3,6,4,0,13,93
2,767.028903,2,5,4,0,13,92
3,1323.005591,1,4,4,0,13,91
4,1294.524908,31,3,3,4,13,90
...,...,...,...,...,...,...,...
1106,1314.944083,25,0,3,4,13,84
1107,1278.500591,24,6,3,3,12,83
1108,1316.051583,23,5,3,3,12,82
1109,1281.464259,22,4,3,3,12,81


In [12]:
from sklearn.preprocessing import MinMaxScaler
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

In [13]:
scaled_train_set_x = scaler_x.fit_transform(train_set.drop(axis=1, labels='Sales'))
scaled_train_set_y = scaler_y.fit_transform(train_set[['Sales']])

In [14]:
scaled_train_set_x.shape

(1111, 6)

In [15]:
scaled_train_set_y.shape

(1111, 1)

In [16]:
train_x = []
train_y = []

n_past = 30

for i in range(n_past, len(scaled_train_set_x)):
    train_x.append(scaled_train_set_x[i - n_past:i])
    train_y.append(scaled_train_set_y[i][0])

train_x , train_y = np.array(train_x) , np.array(train_y)

In [17]:
train_x.shape

(1081, 30, 6)

In [18]:
def training_model(X_train, y_train, X_test, y_test):
    model = Sequential()
    model.add(LSTM(30, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu', return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(1))


    model.compile(loss = 'mean_squared_error', optimizer='adam')
    model.summary()

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    model.fit(X_train, y_train, epochs=500, batch_size=16, shuffle=True, callbacks=[callback], validation_data=(X_train[-100:], y_train[-100:]))

    prediction = model.predict(X_test)

    return mean_squared_error(y_true=y_test, y_pred=prediction), model

In [26]:
tscv = TimeSeriesSplit(n_splits = 3, test_size=100)
mse = []
for train_index, test_index in tscv.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]

    error, model = training_model(X_train, y_train, X_test, y_test)

    mse.append(error)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30)                4440      
                                                                 
 dropout_3 (Dropout)         (None, 30)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 31        
                                                                 
Total params: 4,471
Trainable params: 4,471
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500


Forecasting

In [20]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 30)                4440      
                                                                 
 dropout_2 (Dropout)         (None, 30)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 4,471
Trainable params: 4,471
Non-trainable params: 0
_________________________________________________________________


In [28]:
mse

[0.015193827493609697, 0.0075717436274011654, 0.011162105983504844]

In [29]:
print('Total Error: ', sum(mse) / 3)

Total Error:  0.011309225701505235


In [30]:
# Create dataset
prediction_df = pd.DataFrame({
    'Date': pd.date_range(end='2022-05-04', periods=30)})
prediction_df

Unnamed: 0,Date
0,2022-04-05
1,2022-04-06
2,2022-04-07
3,2022-04-08
4,2022-04-09
5,2022-04-10
6,2022-04-11
7,2022-04-12
8,2022-04-13
9,2022-04-14


In [31]:
# Create Cols
prediction_df['Month Day'] = prediction_df['Date'].dt.day
prediction_df['Week Day'] = prediction_df['Date'].dt.weekday
prediction_df['Month'] = prediction_df['Date'].dt.month
prediction_df['Month Week'] = prediction_df['Date'].apply(week_of_month)
prediction_df['Year Week'] = prediction_df['Date'].dt.weekofyear
prediction_df['Year Day'] = prediction_df['Date'].dt.dayofyear

  


In [45]:
prediction_df

Unnamed: 0,Date,Month Day,Week Day,Month,Month Week,Year Week,Year Day
0,2022-04-05,5,1,4,1,14,95
1,2022-04-06,6,2,4,1,14,96
2,2022-04-07,7,3,4,1,14,97
3,2022-04-08,8,4,4,1,14,98
4,2022-04-09,9,5,4,1,14,99
5,2022-04-10,10,6,4,1,14,100
6,2022-04-11,11,0,4,2,15,101
7,2022-04-12,12,1,4,2,15,102
8,2022-04-13,13,2,4,2,15,103
9,2022-04-14,14,3,4,2,15,104


In [75]:
prediction_df[::-1]

Unnamed: 0,Date,Month Day,Week Day,Month,Month Week,Year Week,Year Day
29,2022-05-04,4,2,5,1,18,124
28,2022-05-03,3,1,5,1,18,123
27,2022-05-02,2,0,5,1,18,122
26,2022-05-01,1,6,5,0,17,121
25,2022-04-30,30,5,4,4,17,120
24,2022-04-29,29,4,4,4,17,119
23,2022-04-28,28,3,4,4,17,118
22,2022-04-27,27,2,4,4,17,117
21,2022-04-26,26,1,4,4,17,116
20,2022-04-25,25,0,4,4,17,115


In [79]:
final_test_df = pd.concat([prediction_df[::-1].drop(axis=1, labels='Date'), train_set.drop(axis=1, labels='Sales')], ignore_index=True)

In [80]:
out_put_x = scaler_x.transform(final_test_df)
out_put_x

array([[0.1       , 0.33333333, 0.36363636, 0.2       , 0.32692308,
        0.3369863 ],
       [0.06666667, 0.16666667, 0.36363636, 0.2       , 0.32692308,
        0.33424658],
       [0.03333333, 0.        , 0.36363636, 0.2       , 0.32692308,
        0.33150685],
       ...,
       [0.73333333, 0.83333333, 0.18181818, 0.6       , 0.21153846,
        0.22191781],
       [0.7       , 0.66666667, 0.18181818, 0.6       , 0.21153846,
        0.21917808],
       [0.66666667, 0.5       , 0.18181818, 0.6       , 0.21153846,
        0.21643836]])

In [81]:
out_put_x.shape

(1141, 6)

In [86]:
final_test_x = []

n_past = 30

for i in range(len(out_put_x)-2*n_past , len(out_put_x)-1*n_past):
    final_test_x.append(out_put_x[i : i + n_past])

final_test_x = np.array(final_test_x)

In [87]:
final_test_x.shape

(30, 30, 6)

In [89]:
final_test_y = model.predict(final_test_x) 



In [90]:
scaler_y.inverse_transform(final_test_y)

array([[1809.2671],
       [1839.1765],
       [1818.8552],
       [1760.4587],
       [1669.5164],
       [1545.556 ],
       [1633.1686],
       [1697.9066],
       [1733.7761],
       [1719.2046],
       [1648.9055],
       [1540.1945],
       [1400.8114],
       [1494.7985],
       [1571.3604],
       [1620.5121],
       [1611.1039],
       [1538.0114],
       [1414.6427],
       [1272.272 ],
       [1374.2333],
       [1502.8148],
       [1610.588 ],
       [1683.0647],
       [1703.4943],
       [1682.0061],
       [1627.0371],
       [1738.5524],
       [1817.0245],
       [1864.4706]], dtype=float32)

In [92]:
model.save(r'D:\retail-sales-prediction\retailmodel.h5')

In [94]:
import pickle
with open(r'D:\retail-sales-prediction\retailmodel.mdl', 'wb') as fff:
    pickle.dump(model, fff, pickle.HIGHEST_PROTOCOL)

INFO:tensorflow:Assets written to: ram://1c0a5682-8d54-45e2-b3ee-f1895d46f86d/assets
