In [None]:
# importing required libraries

import os
from math import sqrt

import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from keras.models import load_model
from matplotlib import pyplot
from numpy import concatenate
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import backend as K

# Data Preprocessing

Data preprocessing is the first step towards training any model. Here the training data is loaded along with the oil price data. The two dataframes are merged on the date column to obtain a single dataframe that contains all the predictors.

In [None]:
# Load the training sales and order them so that every (store, family) series is
# contiguous and chronological. `date` is an explicit sort key so the order of
# each series does not depend on how the rows happen to be arranged in the csv.
df = pd.read_csv("time_series_dataset/train/train.csv")
df = df.sort_values(['store_nbr', 'family', 'date'])

# Keep only the oil prices inside the training period. The dates are ISO
# formatted strings, so plain string comparison orders them correctly.
# A boolean mask is used so the selection does not depend on the index labels.
df_oil = pd.read_csv("time_series_dataset/oil.csv")
filtered_oil_values = (df_oil['date'] >= '2013-01-01') & (df_oil['date'] <= '2017-08-15')
df_oil = df_oil.loc[filtered_oil_values]

# Left join keeps every sales row; dates without an oil quote get NaN.
df = pd.merge(df, df_oil, on='date', how="left")

# Missing oil prices are replaced by the mean oil price over the merged rows.
oil_mean_price = df['dcoilwtico'].mean()
df['dcoilwtico'] = df['dcoilwtico'].fillna(oil_mean_price)
df = df[["id","date","store_nbr","family","onpromotion","dcoilwtico","sales"]]
df

This step is also part of data preprocessing. The sales of each product family in each store are shifted by one day, so that the `sales` column holds the previous day's sales and the `sales_pred` column holds the target value (the sales of the current day).

In [None]:
# Lag the sales by one row within every (store, family) series.
# After this cell:
#   sales_pred - the sales of the row itself (the training target)
#   sales      - the sales of the previous row of the same series
#                (0.0 for the first row of a series)
# Rows are expected to be in chronological order within each series.
#
# A grouped shift replaces the nested loop that re-scanned the whole frame once
# per (store, family) pair and assigned the result back by position.
# Reading from `sales_pred` when it already exists keeps the cell safe to re-run
# (otherwise a second run would shift the already shifted values again).
store_nbr_list = df['store_nbr'].unique()
family_list = df['family'].unique()

actual_sales = (df['sales_pred'] if 'sales_pred' in df.columns else df['sales']).copy()
df['sales'] = (
    actual_sales
    .groupby([df['store_nbr'], df['family']], sort=False)
    .shift(1, fill_value=0.0)
)
# `sales_pred` must stay the last column: the training code takes the target
# from the last column of the frame.
df['sales_pred'] = actual_sales
df

In [None]:
# Product families are strings; map each one to an integer id with a label encoder.
family = df['family'].unique()
le = preprocessing.LabelEncoder().fit(family)
family_encoder = le.transform(df['family'])
df['family'] = family_encoder
df

# Dataset Creation

In [None]:

def create_dataset(df,family_id,scaler,random_state=None):
    """
    Arguments:
    df(pandas dataframe): preprocessed pandas dataframe
    family_id(int): id of product family for which the dataset needs to be created
    scaler(sklearn object): the scaler which will be used to transform the dataset;
        it is (re)fitted here on the training split
    random_state(int or None): optional seed forwarded to both train_test_split calls
        so the split can be reproduced; None keeps the unseeded behaviour

    Description: This function selects the rows of one product family, drops the
    id, date and family columns, splits the remaining columns into training,
    validation and testing dataframes (90/10, then 80/20 of the 90) and scales
    all three with a scaler fitted on the training split only.
    Note that train_test_split shuffles the rows by default, so the splits are
    random samples and not consecutive time periods.

    Return(tuple): a tuple of training, validation and testing dataframe objects

    """
    # A boolean mask selects the family rows whatever the index labels are
    # (np.where positions fed to .loc are only correct for a default 0..n-1 index).
    family_rows = df.loc[df['family'] == family_id]
    new_df = family_rows.drop(["id","date",'family'], axis=1)

    train_df, test_df = train_test_split(new_df, test_size=0.1, random_state=random_state)
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=random_state)

    # Fit on the training split only; validation and test reuse the fitted scaling.
    train_df = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)
    val_df = pd.DataFrame(scaler.transform(val_df), columns=val_df.columns)
    test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

    return train_df,val_df,test_df

# Model Training

In [None]:

def root_mean_squared_log_error(y_true, y_pred):
    """
    Arguments:
    y_true(tensor object): actual values tensor
    y_pred(tensor object): predicted values tensor

    Description: Custom loss function which computes the Root Mean Squared
    Logarithmic Error for the model during training. Both tensors are clamped
    to a small positive epsilon before the logarithm is taken.

    Return(tensor object): scalar tensor holding the computed RMSLE value

    """
    # Without the clamp a prediction <= -1 makes log(1 + y_pred) undefined and
    # the loss becomes NaN/inf.
    y_pred = K.maximum(y_pred, K.epsilon())
    y_true = K.maximum(y_true, K.epsilon())
    return K.sqrt(K.mean(K.square(K.log(1+y_pred) - K.log(1+y_true))))

def rmsle(y_hat, y):
    """
    Arguments:
    y_hat(numpy array): predicted values array
    y(numpy array): actual values array

    Description: Loss function which computes the Root Mean Squared Logarithmic
    Error during testing. The metric is symmetric in its two arguments.
    Any array-like input is accepted and flattened before the computation.

    Return(float): computed RMSLE value rounded to 4 decimals

    Raises:
    ValueError: if the inputs are empty or do not have the same number of elements

    """
    y_hat = np.asarray(y_hat, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if y_hat.size != y.size:
        raise ValueError("rmsle: y_hat and y must have the same number of elements")
    if y.size == 0:
        raise ValueError("rmsle: cannot compute RMSLE of empty input")

    # log1p(x) == log(x + 1), evaluated on the whole array at once
    log_diff = np.log1p(y_hat) - np.log1p(y)
    metric = np.sqrt(np.mean(np.square(log_diff)))

    return round(float(metric), 4)

In [None]:
def train_model(train_df,val_df,learning_rate,epochs,family_id):

    """
    Arguments:
    train_df(pandas dataframe): transformed pandas dataframe for training the model
    val_df(pandas dataframe): transformed pandas dataframe for validating the model
    learning_rate(float): learning rate for the model to be trained
    epochs(int): number of epochs for which the model needs to be trained
    family_id(int): id of product family for which the model needs to be trained

    Description: This function takes the training dataframe and related parameters
    such as the validation dataframe, learning rate and epochs, and trains an LSTM
    model for one product family. The last column of each dataframe is used as the
    target and all other columns as predictors. The trained model and its loss plot
    are written to ./trained_models (created if it does not exist).

    Return(keras model object): trained model for a particular family

    """
    # break the dataframe into predictors and target arrays
    train_X, train_y = train_df.values[:, :-1], train_df.values[:, -1]
    val_X, val_y = val_df.values[:, :-1], val_df.values[:, -1]

    # reshape input to be 3D [samples, timesteps, features] with a single timestep
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))

    # setting seed value to get consistent results
    np.random.seed(1234)
    tf.random.set_seed(1234)

    # compile the model
    model = Sequential()
    model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    adm = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=root_mean_squared_log_error, optimizer=adm)

    # fit network and save model
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=256, validation_data=(val_X, val_y), verbose=2, shuffle=False)
    # make sure the output directory exists before anything is written to it
    os.makedirs('./trained_models', exist_ok=True)
    model.save('./trained_models/family_'+str(family_id)+'_model.h5')
    print('family_'+str(family_id)+' model trained')

    # plot history
    fig = pyplot.figure(figsize=(8,4))
    pyplot.plot(history.history['loss'], label='train')
    # val_loss is computed on the validation split, not on the test split
    pyplot.plot(history.history['val_loss'], label='validation')
    pyplot.legend()
    pyplot.savefig('./trained_models/family_'+str(family_id)+'_loss_plot.png')
    pyplot.show()
    # this function is called once per family; release the figure explicitly
    pyplot.close(fig)

    return model


# Model Testing

In [None]:
def test_model(test_df,model,scaler,family_id):

    """
    Arguments:
    test_df(pandas dataframe): transformed pandas dataframe for testing the model
    model(keras object): trained model object for a particular product family
    scaler(sklearn object): the fitted scaler that was used to transform the dataset
    family_id(int): id of product family for which the model needs to be tested

    Description: This function takes the testing dataframe and related parameters
    such as the trained model and scaler object, and tests the LSTM model of one
    product family against unseen data. Predictions and actual values are mapped
    back to the original scale before the RMSLE is computed. A plot of the first
    200 actual and forecasted values is written to ./trained_models (created if
    it does not exist).

    Return(float): RMSLE value for a particular product family

    """
    # break the dataframe into predictors and target arrays
    test_X, test_y = test_df.values[:, :-1], test_df.values[:, -1]
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

    # predict the values using model; negative predictions are floored at 0
    yhat = model.predict(test_X)
    yhat[yhat < 0] = 0.0

    # The scaler was fitted on predictors + target, so the predictors are put
    # back next to the (predicted / actual) target before inverting the scaling;
    # only the last column, the target, is kept afterwards.
    test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
    inv_yhat = concatenate((test_X, yhat), axis=1)
    inv_yhat = scaler.inverse_transform(inv_yhat)
    inv_yhat = inv_yhat[:,-1]
    test_y = test_y.reshape((len(test_y), 1))
    inv_y = concatenate((test_X, test_y), axis=1)
    inv_y = scaler.inverse_transform(inv_y)
    inv_y = inv_y[:,-1]

    # compute RMSLE value
    rmsle_value = rmsle(inv_yhat,inv_y)
    print("RMSLE: %s" %rmsle_value)

    # plot the test curve
    fig = pyplot.figure(figsize=(15,5))
    pyplot.plot(inv_y[:200], label='actual sales')
    pyplot.plot(inv_yhat[:200], label='forecasted sales')
    pyplot.legend()
    # make sure the output directory exists before the plot is written to it
    os.makedirs('./trained_models', exist_ok=True)
    pyplot.savefig('./trained_models/family_'+str(family_id)+'_forecasted_sales.png')
    pyplot.show()
    # this function is called once per family; release the figure explicitly
    pyplot.close(fig)

    return rmsle_value

# Driver Code/ Model Pipeline

After preprocessing the dataset, the code below trains one model per product family. Training parameters such as the learning rate and the number of epochs can be changed here. The RMSLE value of each product family is saved in a dictionary.

In [None]:
# Training configuration shared by every per-family model.
learning_rate, epochs = 0.001, 20
scaler = MinMaxScaler(feature_range=(0,1))
family_list = df['family'].unique()
rmsle_dict = {}
# Build the splits, train and evaluate one model per product family.
for family_id in family_list:
    train_df, val_df, test_df = create_dataset(df, family_id, scaler)
    model = train_model(train_df, val_df, learning_rate, epochs, family_id)
    rmsle_value = test_model(test_df, model, scaler, family_id)
    rmsle_dict[family_id] = rmsle_value
    

In [None]:
# Display the RMSLE value collected for each product family id by the training loop.
rmsle_dict

# Test Dataset Preprocessing
The test.csv file is loaded and preprocessed in the same way as the training dataset.

In [None]:
# Load the test rows, order them by store and family, and attach the oil price of each date.
df_oil = pd.read_csv("time_series_dataset/oil.csv")
test_df = (
    pd.read_csv("time_series_dataset/test.csv")
    .sort_values(['store_nbr','family'])
    .merge(df_oil, on='date', how="left")
)

# Dates without an oil quote get the mean oil price of the merged test rows.
oil_mean_price = test_df['dcoilwtico'].mean()
test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(oil_mean_price)

# Reuse the encoder fitted on the training families so the ids match the trained models.
test_family_encoder = le.transform(test_df['family'])
test_df['family'] = test_family_encoder
test_df

# Test Dataset Prediction

After preprocessing the test dataset, predictions are made for each family by loading its saved model. For the first test date, the previous day's sales are taken from the last day of the training dataset. Each day's predictions are then carried forward and act as the previous day's sales for the next day.

In [None]:
# Predictor columns in the order the models were trained on.
FEATURE_COLUMNS = ['store_nbr', 'onpromotion', 'dcoilwtico', 'sales']

family_list = df['family'].unique()
test_df_date_list = sorted(test_df['date'].unique())
# last day of the training data; its actual sales seed the recursive forecast
last_train_date = df['date'].max()

# one frame per (family, date); concatenated once after the loops
forecast_frames = []
for j in family_list:

    family_train_df = df.loc[df['family'] == j]

    # loading the sales values of last day for each family, ordered by store
    last_day_df = family_train_df.loc[family_train_df['date'] == last_train_date].sort_values('store_nbr')
    previous_day_sales = last_day_df['sales_pred'].to_list()

    # The model expects inputs on the training scale. The scaler fitted during
    # training was fitted on a random subset of this family's rows and was not
    # kept per family, so it is rebuilt here from all training rows of the
    # family (predictors + target); its min/max can differ slightly from the
    # one used in training.
    family_scaler = MinMaxScaler(feature_range=(0,1))
    family_scaler.fit(family_train_df[FEATURE_COLUMNS + ['sales_pred']].to_numpy(dtype=float))

    # load the saved model of this family (file name uses the family id itself)
    model = load_model('./trained_models/family_'+str(j)+'_model.h5',custom_objects={'root_mean_squared_log_error': root_mean_squared_log_error})

    for i in test_df_date_list:

        # filtering the dataset for each date for particular family, ordered by store
        input_df = test_df.loc[(test_df['date'] == i) & (test_df['family'] == j)].sort_values('store_nbr')
        if len(input_df) != len(previous_day_sales):
            raise ValueError(
                "family %s, date %s: %d test rows but %d previous-day sales values"
                % (j, i, len(input_df), len(previous_day_sales))
            )
        input_df = input_df.assign(sales=previous_day_sales)

        # transform the dataset; the target column is unknown, so a placeholder
        # column is appended only to match the shape the scaler was fitted on
        features = input_df[FEATURE_COLUMNS].to_numpy(dtype=float)
        placeholder_target = np.zeros((features.shape[0], 1))
        scaled = family_scaler.transform(concatenate((features, placeholder_target), axis=1))
        scaled_features = scaled[:, :-1]
        input_df_X = scaled_features.reshape((scaled_features.shape[0], 1, scaled_features.shape[1]))

        # predict the sales and map them back to the original scale
        test_yhat = model.predict(input_df_X)
        test_yhat[test_yhat < 0] = 0.0
        inv_test_yhat = family_scaler.inverse_transform(concatenate((scaled_features, test_yhat), axis=1))

        # store the results in dataframe
        forecasted_sales_df = input_df[['id', 'date', 'store_nbr', 'family', 'onpromotion', 'dcoilwtico']].reset_index(drop=True)
        forecasted_sales_df['sales_forecasted'] = inv_test_yhat[:, -1]
        forecast_frames.append(forecasted_sales_df)

        # today's forecast is the previous-day sales input for the next date
        previous_day_sales = forecasted_sales_df['sales_forecasted'].to_list()

final_df = pd.concat(forecast_frames, ignore_index=True) if forecast_frames else pd.DataFrame()
       

In [None]:
# Keep the output columns in a fixed order, turn the family ids back into
# their names, and write the forecasts to csv.
submission_columns = ["id", "date","store_nbr", "family", "onpromotion", "dcoilwtico", "sales_forecasted"]
final_df = final_df[submission_columns].assign(
    family=lambda frame: le.inverse_transform(frame['family'])
)
final_df.to_csv('submission.csv')
final_df