In [None]:
# The pre-processing ideas here benefited greatly from:
# https://www.analyticsvidhya.com/blog/2020/10/multivariate-multi-step-time-series-forecasting-using-stacked-lstm-sequence-to-sequence-autoencoder-in-tensorflow-2-0-keras/
# https://aeturrell.github.io/coding-for-economists/time-series.html
# https://coderedirect.com/questions/673308/interpolate-pandas-df
# https://www.kaggle.com/limweixuan1994/store-sales-predictions
# and many more~
## a huge shout-out to them

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
#from tensorflow.keras.engine.input_layer import Input
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import BatchNormalization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings
# NOTE(review): with no category given, this silences every warning for the rest of
# the session, deprecation warnings from the libraries used below included.
warnings.filterwarnings('ignore')

In [None]:
# Folder that holds every CSV of the competition data set.
data_path = '../input/store-sales-time-series-forecasting/'

# Auxiliary tables. `oil` and `holidays_events` are indexed by their `date` column;
# no date parsing is requested for them, so that index is left as read from the file.
oil = pd.read_csv(os.path.join(data_path, 'oil.csv'), index_col='date')
holidays_events = pd.read_csv(os.path.join(data_path, 'holidays_events.csv'), index_col='date')
stores = pd.read_csv(os.path.join(data_path, 'stores.csv'))
transactions = pd.read_csv(os.path.join(data_path, 'transactions.csv'))

# Main tables, with `date` parsed to datetimes. `train` is indexed by `id`, while
# `test` keeps `id` as a regular column.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0 and
# `parse_dates` alone yields the same parsed column.
train = pd.read_csv(os.path.join(data_path, 'train.csv'), index_col='id', parse_dates=['date'])
test = pd.read_csv(os.path.join(data_path, 'test.csv'), parse_dates=['date'])


In [None]:
# # Function that outputs some of the data characteristics
# def get_charas(df):
#     print("\n > head <")
#     print(df.head())
#     print("\n> info <")
#     print(df.info())
#     print("\n> describe <")
#     print(df.describe())
#     print("\n> cols <")
#     print(df.columns)
#     print("\n> dtypes <")
#     print(df.dtypes)
#     print("\n> null <")
#     print(df.isnull().sum())
#     print("\n> n/a values <")
#     print(df.isna().sum())
#     print("\n> Shape Of Data <")
#     print(df.shape)
#     return 

In [None]:
# get_charas(oil) ## there is missing data using interpolation strategy as needed

# # fill in missing date, offset alias "D" means calendar day frequency
# oil = oil.set_index("date").asfreq(freq = "D")

# # fill the NaN value by interpolation
# oil["dcoilwtico"] = oil["dcoilwtico"].interpolate(limit_direction="both")

In [None]:
# stores.head(n=8)
# # stores.shape

In [None]:
train["family"].nunique(dropna = True) 
# nunique() returns the number of distinct values; with dropna=True missing values are not counted.

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# `onpromotion` is not used as a model input; according to
# https://www.kaggle.com/limweixuan1994/store-sales-predictions?scriptVersionId=80578071&cellId=9
# it is not that useful.
# Each frame is copied before the column is dropped, so later edits to
# `train_data` / `test_data` cannot reach the original `train` / `test`.
train_data = train.copy().drop(['onpromotion'], axis=1)
test_data = test.copy().drop(['onpromotion'], axis=1)

In [None]:
## Encode the product family names as integer codes (the data showed ~33 distinct families).
# The encoder is fitted on the training set only; the same mapping is then reused for the test set.
ordinal_encoder = OrdinalEncoder(dtype=int)
train_data[['family']] = ordinal_encoder.fit_transform(train_data[['family']])
test_data[['family']] = ordinal_encoder.transform(test_data[['family']])

In [None]:
# Display the training table after dropping `onpromotion` and encoding `family`.
train_data

In [None]:
## Size of each axis of the training panel: distinct dates, stores and product families.
# dropna=False means a missing value, if present, counts as one more distinct value.
n_o_days_train, n_o_stores_train, n_o_families_train = (
    train[column].nunique(dropna=False)
    for column in ("date", "store_nbr", "family")
)

print('number of day train:', n_o_days_train)
print('number of stores train:', n_o_stores_train)
print('number of family/type of prod train:', n_o_families_train)

In [None]:
## Size of each axis of the test panel: distinct dates, stores and product families.
# dropna=False means a missing value, if present, counts as one more distinct value.
n_o_days_test, n_o_stores_test, n_o_families_test = (
    test[column].nunique(dropna=False)
    for column in ("date", "store_nbr", "family")
)

print('number of day test:', n_o_days_test)
print('number of stores test:', n_o_stores_test)
print('number of family/type of prod test:', n_o_families_test)

In [None]:
# The long table is reshaped into a discrete-time (per-date) series for the RNN:
# one row per date, one column per (store_nbr, family) pair, `sales` as the cell value.
pivoted_train = train_data.pivot(index=['date'], columns=['store_nbr', 'family'], values='sales')
pivoted_train

In [None]:
# pivoted_train[1][0] #store number 1, product no 0

# **Train_Valid_split**

In [None]:
### Train / validation split ###
# Share of the distinct training dates used for fitting; the rest is held out for validation.
train_fraction = 0.95
train_samples = int(n_o_days_train * train_fraction)

In [None]:
# First `train_samples` rows of the pivoted table: the training part.
train_samples_df = pivoted_train.iloc[:train_samples]
train_samples_df

In [None]:
# Every row after the first `train_samples`: the validation part.
valid_samples_df = pivoted_train.iloc[train_samples:]
valid_samples_df

In [None]:
# The per-column min/max are learned from the training rows only; the validation rows
# are transformed with those same statistics.
minmax_scaler = MinMaxScaler().fit(train_samples_df)

scaled_train_samples, scaled_validation_samples = (
    minmax_scaler.transform(frame)
    for frame in (train_samples_df, valid_samples_df)
)

In [None]:
# sliding window for converting series to sample to be used with supervised learning algorithms
# thanks to
# https://www.analyticsvidhya.com/blog/2020/10/multivariate-multi-step-time-series-forecasting-using-stacked-lstm-sequence-to-sequence-autoencoder-in-tensorflow-2-0-keras/

def split_series(series, n_past, n_future):
    """Cut a 2-D series into overlapping (past, future) window pairs.

    A window starts at every row for which `n_past` rows of history followed by
    `n_future` rows of targets still fit inside `series`.

    Parameters
    ----------
    series : 2-D array, rows are time steps and columns are features.
    n_past : number of past observations in each input window.
    n_future : number of future observations in each target window.

    Returns
    -------
    (X, y) : arrays of shape (n_windows, n_past, n_features) and
        (n_windows, n_future, n_features). When no window fits, both are
        empty arrays of shape (0,).
    """
    total_rows = len(series)
    window_span = n_past + n_future

    # Window starts whose complete past + future span stays inside the series.
    starts = [start for start in range(total_rows) if start + window_span <= total_rows]

    past_windows = [series[start:start + n_past, :] for start in starts]
    future_windows = [series[start + n_past:start + window_span, :] for start in starts]
    return np.array(past_windows), np.array(future_windows)

# Window lengths, in rows of the pivoted table (one row per date):
# n_past rows go into the model, n_future rows are the target.
n_past = n_future = 16
# One feature per (store, family) combination.
n_features = n_o_families_train * n_o_stores_train

In [None]:
# Turn the scaled training and validation blocks into (past window, future window) pairs.
(X_train, y_train), (X_val, y_val) = (
    split_series(scaled_block, n_past, n_future)
    for scaled_block in (scaled_train_samples, scaled_validation_samples)
)

In [None]:
# Report the shape of every windowed array.
for shape_label, windowed_array in (
    ('X_train.shape', X_train),
    ('y_train.shape', y_train),
    ('X_val.shape', X_val),
    ('y_val.shape', y_val),
):
    print(shape_label, windowed_array.shape)

In [None]:
# Clear any Keras state left from earlier runs and fix the NumPy and TensorFlow seeds
# before the model is built.
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

def timemodel():
    """Build and compile the stacked SimpleRNN forecaster.

    Two SimpleRNN layers (200 units each, both returning the full sequence), each
    followed by batch normalisation and a 0.2 dropout, feed a time-distributed
    dense layer that emits `n_features` values per time step. The input shape is
    (n_past, n_features), both read from the notebook globals.

    NOTE(review): every recurrent layer keeps the time axis, so the output has
    n_past time steps while the targets have n_future; this appears to rely on
    n_past == n_future -- confirm before changing either value.

    Returns
    -------
    The compiled model: MAE loss, Adam with learning rate 0.001, MAE as a metric.
    """
    ### basic RNN stack
    recurrent_stack = [
        layers.SimpleRNN(units=200, return_sequences=True, input_shape=[n_past, n_features]),
        keras.layers.BatchNormalization(),
        layers.Dropout(0.2),
        layers.SimpleRNN(units=200, return_sequences=True),
        keras.layers.BatchNormalization(),
        layers.Dropout(0.2),
    ]

    ### LSTM alternative, kept for reference (use in place of the stack above):
    #     layers.LSTM(units=256, return_sequences=True, input_shape=[n_past, n_features])
    #     keras.layers.BatchNormalization()
    #     layers.Dropout(0.2)
    #     layers.LSTM(units=128, return_sequences=True)
    #     keras.layers.BatchNormalization()
    #     layers.Dropout(0.2)

    # One dense projection back to n_features, applied independently at every time step.
    output_head = keras.layers.TimeDistributed(keras.layers.Dense(n_features))

    network = keras.Sequential(recurrent_stack + [output_head])
    network.compile(
        loss="mae",
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=['mae'],
    )
    return network

In [None]:
# Build the network and print its layer-by-layer summary.
model=timemodel()
model.summary()

In [None]:
# Early stopping watches `val_mae`: an epoch only counts as an improvement when the
# metric changes by at least 0.0001, and training stops after 100 epochs without one.
# With restore_best_weights=True the model is rolled back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(monitor = 'val_mae',
                                               min_delta=0.0001,
                                               patience=100, 
                                               restore_best_weights=True)

# Upper bound on the number of epochs; early stopping may end training sooner.
EPOCHS = 1000
#EPOCHS = 1
# Fit on the training windows and evaluate on the validation windows after every epoch.
model_history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val), 
    epochs=EPOCHS, 
    callbacks=[early_stopping], 
    batch_size=512, 
    shuffle=True)

In [None]:
import matplotlib.pyplot as mpl
# Learning curves: training loss (MAE) and validation MAE, one point per epoch.
training_log = model_history.history
for curve_name in ('loss', 'val_mae'):
    mpl.plot(training_log[curve_name])
mpl.xlabel('Epochs')
mpl.ylabel('Loss')
mpl.legend(['Train', 'Validation'])
mpl.show()

In [None]:
# pd.set_option('display.max_columns', 1000)

#reset option to default value
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

#and reset all of them back:
# pd.reset_option('all')

In [None]:
# The forecast is seeded with the last n_past scaled rows of the validation block,
# reshaped into a batch holding a single window.
latest_window = scaled_validation_samples[-n_past:, :]
x_test_pred = latest_window.reshape((1, n_past, n_features))
print(x_test_pred.shape)
scaled_y_predict = model.predict(x_test_pred)

In [None]:
# Shape of the raw (still scaled) model output.
scaled_y_predict.shape

In [None]:
# Undo the min-max scaling fitted earlier so the forecast is back in sales units.
forecast_rows = scaled_y_predict.reshape((n_future, n_features))
y_predict = pd.DataFrame(
    minmax_scaler.inverse_transform(forecast_rows),
    columns=valid_samples_df.columns,
)
## one row per forecast step, one column per (store_nbr, family) pair of the pivoted table
y_predict

In [None]:
# test_data

In [None]:
# Same date x (store_nbr, family) layout as the training pivot. With values=None every
# remaining column is pivoted; presumably that is only `id` here (verify against test.csv),
# in which case each cell holds the submission id of that (date, store, family) combination.
pivoted_test = test_data.pivot(index=['date'], columns=['store_nbr', 'family'], values=None)
pivoted_test ## rows: date; columns: store_nbr, family; cells: the sample ids

In [None]:
# pivoted_test.values

In [None]:
# Submission template, indexed by `id`. It is read from `data_path` like the other CSVs
# rather than from a second hard-coded copy of the folder path.
submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'), index_col='id')

In [None]:
# Number of rows and columns in the submission template.
submission.shape

In [None]:
# for i, j in y_predict.iterrows():
#     print('i',i)
#     print('j',j)

In [None]:
## Map the forecast in `y_predict` onto the submission ids held in `pivoted_test`.
# The two tables are matched purely by position: row i is the i-th forecast day and
# column j the j-th (store_nbr, family) column, so cell (i, j) of `pivoted_test` is
# read as the submission id for cell (i, j) of `y_predict`.
n_forecast_days, n_series = y_predict.shape
forecast_values = y_predict.to_numpy()
sample_ids = pivoted_test.iloc[:n_forecast_days, :n_series].to_numpy()
if sample_ids.shape != forecast_values.shape:
    # The test id table is smaller than the forecast, so positions cannot be paired up.
    raise ValueError(
        f"forecast has shape {forecast_values.shape} but only "
        f"{sample_ids.shape} test ids are available"
    )

# Sales cannot be negative: any value that is not strictly positive is written as 0,
# the same result `max(0, value)` gives cell by cell.
non_negative_forecast = np.where(forecast_values > 0, forecast_values, 0)

# One vectorised assignment replaces a `.iloc` / `.at` lookup for every single cell.
submission.loc[sample_ids.ravel(), 'sales'] = non_negative_forecast.ravel()

In [None]:
# Display the submission table after the forecast has been written into `sales`.
submission

In [None]:
# Write the submission to `submission.csv` in the current working directory.
submission.to_csv('submission.csv')