In [1]:
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler
import tensorflow as tf
import numpy as np

from tensorflow.keras import callbacks

from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense

from datetime import datetime
import random

import warnings
warnings.filterwarnings('ignore')

import logging
tf.get_logger().setLevel(logging.ERROR)

In [2]:
def getTrainTestData(inputSize, outputSize, df):
        total = inputSize + outputSize
        data_train = df.iloc[:-(total), :]['ARRIVAL']
        data_test = df.iloc[-(total):, :]['ARRIVAL']
        data_train = np.array(data_train)
        data_test = np.array(data_test) 
        return data_train, data_test

In [3]:
def prepareTrainData(inputSize, outputSize, data_train):
    total = inputSize + outputSize
    x_train, y_train = [], []
    for i in range(0, len(data_train)-total):
        x = data_train[i:i+inputSize]
        y = data_train[i+inputSize:i+total]
        x_train.append(x)
        y_train.append(y)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    return x_train, y_train

In [4]:
def prepareTestData(inputSize, outputSize, data_test):
    total = inputSize + outputSize
    x_test, y_test = [], []
    for i in range(0, len(data_test)-(total-1)):
        x = data_test[i:i+inputSize]
        y = data_test[i+inputSize:i+total]
        x_test.append(x)
        y_test.append(y)
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return x_test, y_test

In [5]:
def transformData(x_train, y_train, x_test, y_test):
    x_scalar = MaxAbsScaler()
    y_scalar = MaxAbsScaler()

    x_train = x_scalar.fit_transform(x_train)
    y_train = y_scalar.fit_transform(y_train)

    x_test = x_scalar.transform(x_test)
    y_test = y_scalar.transform(y_test)
    return x_train, y_train, x_test, y_test, x_scalar, y_scalar

In [6]:
def inverseTransformData(y_pred, y_test, y_scalar):
    y_pred = y_scalar.inverse_transform(y_pred)
    y_true = y_scalar.inverse_transform(y_test)
    return y_true, y_pred

In [7]:
def makeData3D(x_train, y_train, x_test):
    x_train = np.expand_dims(x_train, axis = 2)
    y_train = np.expand_dims(y_train, axis = 2)
    x_test = np.expand_dims(x_test, axis = 2)
    return x_train, y_train, x_test

In [8]:
# This function keeps the initial learning rate for the first ten epochs  
# and decreases it exponentially after that.  
def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

In [9]:
def trainModel(x_train, y_train, inputSize, outputSize, unit1, learningRate, batchSize, dropout1, unit2, dropout2):
    # print('in trainModel')
    # print(x_train.shape, y_train.shape)
    reg = Sequential()
    reg.add(LSTM(units = unit1, activation = 'tanh', input_shape = (inputSize,1), return_sequences = True, dropout = dropout1))
    reg.add(LSTM(unit2, dropout = dropout2))
    reg.add(Dense(outputSize))
    opt = tf.keras.optimizers.Adam(learning_rate = learningRate)
    reg.compile(loss = 'mse', optimizer = opt)

    early_stopping = callbacks.EarlyStopping(monitor = 'val_loss',
                                                    patience = 5,
                                                    mode = 'min',
                                                  min_delta = .0001,
                                                  restore_best_weights = True)
    # csv_logger = callbacks.CSVLogger('drive/MyDrive/MSR Thesis Documents/LSTM Forecasting/training.log')
    lr_scheduler = callbacks.LearningRateScheduler(scheduler)
    # logdir = "drive/MyDrive/MSR Thesis Documents/LSTM Forecasting/" + datetime.now().strftime("%Y%m%d-%H%M%S")
    # tensorboard_callback = callbacks.TensorBoard(log_dir=logdir)


    reg.fit(x_train, y_train, epochs = 100, shuffle = False, validation_split = 0.2, callbacks = [early_stopping, lr_scheduler], verbose = 0, batch_size = batchSize)
    #reg.fit(x_train, y_train, epochs = 100, shuffle = False, validation_split = 0.2)
    # print('out trainModel')
    return reg

In [10]:
def testModel(x_test,reg):
    # print('in testModel')
    # print(x_test.shape)
    y_pred = reg.predict(x_test)
    # print('in testModel')
    return y_pred

In [11]:
def forecastingMonthly(inputSize, outputSize, df, unit1, learningRate, batchSize, dropout1 , unit2, dropout2):
    data_train, data_test = getTrainTestData(inputSize, outputSize, df)   
    x_train, y_train = prepareTrainData(inputSize, outputSize, data_train)
    x_test, y_test = prepareTestData(inputSize, outputSize, data_test)
    x_train, y_train, x_test, y_test, x_scalar, y_scaclar = transformData(x_train, y_train, x_test, y_test)
    x_train, y_train, x_test = makeData3D(x_train, y_train, x_test)
    reg = trainModel(x_train, y_train, inputSize, outputSize, unit1, learningRate, batchSize, dropout1, unit2, dropout2)
    y_pred = testModel(x_test,reg)
    y_true, y_pred = inverseTransformData(y_pred, y_test, y_scaclar)
    y_true = y_true[0, :]
    y_pred = y_pred[0, :]

    return y_true, y_pred

In [12]:
def forecast(df, inputSize, unit1, learningRate, batchSize, dropout1, unit2, dropout2):
    start = df[df['DATE']=='2020-06-01'].index[0]     # read the index of the data 2020-06-01
    end = len(df)                                     # end is the length of data frame.

    predicted, actual = [], []                        
    final = pd.DataFrame()                            
    map = {}
    while(start<end):
        if(start+30<end):
            outputSize = 30
            start = start + 1
            dx = df.iloc[:start]
            y_true, y_pred = forecastingMonthly(inputSize, outputSize, dx, unit1, learningRate, batchSize, dropout1, unit2, dropout2)
            for i in range(outputSize):
              map[df.iloc[start + i]['DATE']] += [y_pred[i]]
            #actual.extend(y_true)
            #predicted.extend(y_pred)
        else:
            outputSize = end - start
            dx = df
            y_true, y_pred = forecastingMonthly(inputSize, outputSize, dx, unit1, learningRate, batchSize, dropout1, unit2, dropout2)
            for i in range(outputSize):
              map[df.iloc[start + i]['DATE']] += [y_pred[i]]
            actual.extend(y_true)
            predicted.extend(y_pred)
            start = start + 1
            #break
    date, value = [], []
    for i in map:
      date.append(i)
      value.append(map[i])

    final = pd.DataFrame({'DATE' : date, 'PREDICTED' : value})
    return final

In [13]:
def rmse30DayWindow(df):
    mse = (df['ACTUAL'] - df["PREDICTED"])**2
    rmse = (mse.mean())**.5
    return rmse

def rmse1DayWindow(df):
    mse = (df['ACTUAL'] - df["PREDICTED"])**2
    rmse = (mse.mean())**.5
    return rmse/len(df)    


def normalizedRmse1Day(df):
    mse = (df['ACTUAL'] - df["PREDICTED"])**2
    rmse = ((mse.sum())**.5)/df['ACTUAL'].mean()
    return rmse/len(df) 

In [14]:
def rmse(df):
    df.columns = ['ACTUAL', 'PREDICTED']
    l30, l1, lnormalized = [], [], []
    for i in range(0,len(df),30):
        x30 = rmse30DayWindow(df[i:i+30])
        x1 = rmse1DayWindow(df[i:i+30])
        xnorm = normalizedRmse1Day(df[i:i+30])
        l30.append(x30)
        l1.append(x1)
        lnormalized.append(xnorm)
    return l30, l1, lnormalized

In [15]:
random.seed(20)
learningRates = [.001]
units1 = [16, 32, 64, 128]
dropouts1 = [0.1, 0.2]
units2 = [16, 32, 64, 128]
dropouts2 = [0.1, 0.2]
batchSizes = [64, 128, 256, 512]

print(len(learningRates), len(units1), len(dropouts1), len(units2), len(dropouts2), len(batchSizes))

# trying 10 random samples : 
zipped = random.sample([[i,j,k, l, m, n] for i in learningRates for j in units1 for k in dropouts1 for l in units2 for m in dropouts2 for n in batchSizes], 10)

print(zipped)
for com in zipped:
    print(com)
    # for each sample set the parameter: 
    learningRate = com[0]
    unit1 = com[1]
    dropout1 = com[2]
    unit2 = com[3]
    dropout2 = com[4]
    batchSize = com[5]
    # ask panda to read_csv:
    dir = 'RAJASTHAN_KOTA_Arrival.csv'
    df = pd.read_csv(dir)
    name_of_file = dir.replace('.csv', '')
    # file to save with specific name: 
    fileToSave = "name_of_file"+"_{}_{}_{}_{}_{}_{}.csv".format(com[0], com[1], com[2], com[3], com[4], com[5])
    print(fileToSave)
    
    # asking forecast to performa things
    final = forecast(df, 60, unit1, learningRate, batchSize, dropout1, unit2, dropout2)
    final.to_csv(fileToSave, index = False)
    # lstmDf = pd.read_csv('drive/MyDrive/MSR Thesis Documents/LSTM Forecasting/'+fileToSave)
    # l30, l1, lnormalized = rmse(lstmDf)
    # print(np.mean(l30), np.mean(l1), np.mean(lnormalized))
    print('-'*20)
    break

1 4 2 4 2 4
[[0.001, 32, 0.1, 32, 0.2, 128], [0.001, 64, 0.1, 16, 0.2, 128], [0.001, 16, 0.2, 64, 0.1, 512], [0.001, 64, 0.2, 16, 0.2, 512], [0.001, 32, 0.1, 64, 0.2, 256], [0.001, 16, 0.1, 32, 0.2, 128], [0.001, 128, 0.1, 64, 0.1, 256], [0.001, 128, 0.1, 64, 0.1, 64], [0.001, 16, 0.2, 16, 0.2, 256], [0.001, 16, 0.2, 64, 0.2, 64]]
[0.001, 32, 0.1, 32, 0.2, 128]
name_of_file_0.001_32_0.1_32_0.2_128.csv


KeyError: '2020-06-02'