Using an LSTM to predict time series data with multivariate time series analysis.



In [7]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import os

In [34]:
# Load the price history.
# `parse_dates=True` only asks pandas to parse the *index*; with the default
# integer index that is a no-op and 'Date' would stay a plain string column.
# Naming the column explicitly gives a real datetime 'Date' column.
df = pd.read_csv('../data/price_ts.csv', parse_dates=['Date'])
print(df.columns)


Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')


In [35]:
# Keep the dates as a datetime Series; they are used later to label predictions.
train_dates = df['Date'].pipe(pd.to_datetime)
print(train_dates.iloc[-15:])  # sanity check: the 15 most recent dates

235   2023-07-17
236   2023-07-18
237   2023-07-19
238   2023-07-20
239   2023-07-21
240   2023-07-24
241   2023-07-25
242   2023-07-26
243   2023-07-27
244   2023-07-28
245   2023-07-31
246   2023-08-01
247   2023-08-02
248   2023-08-03
249   2023-08-04
Name: Date, dtype: datetime64[ns]


In [36]:
# Feature columns are selected by position: columns 1 to 5 of the frame.
cols = df.columns[1:6].tolist()
print(cols)  # ['Open', 'High', 'Low', 'Close', 'Adj Close']

['Open', 'High', 'Low', 'Close', 'Adj Close']


In [38]:
# Training frame: the selected feature columns, cast to float because they
# will be replaced by scaled (non-integer) values.
df_for_training = df.loc[:, cols].astype(float)
display(df_for_training)

Unnamed: 0,Open,High,Low,Close,Adj Close
0,59.234974,59.937550,58.438721,58.688526,58.461750
1,58.836845,59.398907,58.446526,58.493366,58.267342
2,59.492584,60.733803,59.258392,60.218578,59.985889
3,60.889931,61.834503,60.858704,61.592506,61.354507
4,61.896957,62.427792,61.327087,62.396564,62.155453
...,...,...,...,...,...
245,114.440002,114.750000,113.720001,114.239998,114.239998
246,113.690002,114.199997,112.269997,113.220001,113.220001
247,112.970001,113.500000,111.930000,111.970001,111.970001
248,111.830002,113.169998,111.550003,112.360001,112.360001


In [39]:
# Standardise every feature column before it is fed to the LSTM.
# The fitted scaler is kept so predictions can be mapped back to price units.
scaler = StandardScaler()
df_for_training_scaled = scaler.fit_transform(df_for_training)
display(df_for_training_scaled)

array([[-1.06765007, -1.07855981, -1.07212694, -1.10397542, -1.10651941],
       [-1.08735237, -1.10508348, -1.07173939, -1.113627  , -1.11610943],
       [-1.05490167, -1.03935099, -1.03142693, -1.02830717, -1.03133466],
       ...,
       [ 1.59154721,  1.55894367,  1.58393278,  1.53104349,  1.53300662],
       [ 1.53513183,  1.54269383,  1.56506438,  1.55033082,  1.55224506],
       [ 1.58412408,  1.57716323,  1.5774779 ,  1.58494895,  1.58677544]])

# Reshape LSTM TS data 
For LSTM networks, we need to reshape the input data into ***n_samples x timesteps x n_features***. 
- timesteps => how much past data is considered as a step for every future prediction
- n_features => how many features are used in training data for prediction output
- n_samples => total # of training data samples

Each sample is a `timesteps x n_features` block, so the LSTM TS data is arranged as n_samples x (timesteps x n_features), where:
- rows 0 to timesteps-1 of a sample => the look-back window:
  - the 1st column holds the variable being predicted (inside the window, i.e. rows 0 to timesteps-1, predicted_value = training value)
  - the other columns are the multivariate features used for prediction & represent training features
- row timestep => holds the predicted output value in the predicted_value (1st) column. This is for a future prediction window of 1. When predicting more than 1 step ahead, the rows at indices (timestep : timestep+lookahead) hold the predicted_value.

 (  &nbsp;&nbsp;&nbsp;&nbsp;look_back&nbsp;&nbsp;&nbsp;&nbsp;    +&nbsp;&nbsp;&nbsp;&nbsp;   look_ahead&nbsp;&nbsp;&nbsp;&nbsp;          +&nbsp;&nbsp;&nbsp;&nbsp;   next_look_back&nbsp;&nbsp;&nbsp;&nbsp;      +&nbsp;&nbsp;&nbsp;&nbsp;   next_look_ahead .....   )  
 (  &nbsp;&nbsp;&nbsp;&nbsp;past_value&nbsp;&nbsp;&nbsp;&nbsp; +&nbsp;&nbsp;&nbsp;&nbsp;      predicted_value&nbsp;&nbsp;&nbsp;&nbsp;     +&nbsp;&nbsp;&nbsp;&nbsp;   past_value&nbsp;&nbsp;&nbsp;&nbsp;          +&nbsp;&nbsp;&nbsp;&nbsp;   predicted_value ....   )

In [40]:
# Windowing configuration.
n_future, n_past = 1, 14   # predict 1 day ahead from the 14 preceding days
trainX, trainY = [], []    # training input vector / training output vector

In [None]:
# Slice the scaled data into supervised-learning samples.
#   input  : the n_past rows that precede position `end` (all feature columns)
#   target : column 0 of the row n_future steps after the window,
#            kept as a length-1 slice so trainY is 2-D (n_samples, 1)
#
# The samples are collected in local lists rather than appended to the
# module-level trainX/trainY: those names are rebound to ndarrays below, so
# appending to them would make this cell fail with AttributeError on a re-run.
window_inputs, window_targets = [], []
for end in range(n_past, len(df_for_training_scaled) - n_future + 1):
    target_row = end + n_future - 1
    window_inputs.append(df_for_training_scaled[end - n_past:end, :])
    window_targets.append(df_for_training_scaled[target_row:target_row + 1, 0])
trainX, trainY = np.array(window_inputs), np.array(window_targets)

# Fail early, with a readable message, if the data is too short to form a window.
assert trainX.ndim == 3 and len(trainX) == len(trainY), (
    'no training windows produced: need at least n_past + n_future rows of data'
)

print('trainX shape == {}.'.format(trainX.shape))
print('trainY shape == {}.'.format(trainY.shape))

# LSTM model specification

In [None]:
# Stacked LSTM regressor: two recurrent layers, dropout, then a dense read-out
# with one unit per target column. The input is (timesteps, n_features), both
# taken from trainX.
model = Sequential([
    LSTM(64, activation='relu', input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=True),
    LSTM(32, activation='relu', return_sequences=False),
    Dropout(0.2),
    Dense(trainY.shape[1]),
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# Short training run with 10% of the samples held out for validation.
history = model.fit(trainX, trainY, epochs=5, batch_size=16, validation_split=0.1, verbose=1)

# Learning curves: training vs. validation loss per epoch.
for metric, curve_label in (('loss', 'Training loss'), ('val_loss', 'Validation loss')):
    plt.plot(history.history[metric], label=curve_label)
plt.legend()

Prediction

In [None]:
# filter out UK non working days
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday, nearest_workday
from pandas.tseries.holiday import EasterMonday, GoodFriday, MO, next_monday, next_monday_or_tuesday
from pandas.tseries.offsets import CustomBusinessDay

# Define the UK holiday calendar
class UKHolidayCalendar(AbstractHolidayCalendar):
    """Recurring bank holidays for England & Wales.

    Rules:
      * New Year's Day, Christmas Day and Boxing Day that fall on a weekend
        are observed on the following Monday (or Tuesday for Boxing Day when
        the Monday is already taken by Christmas). ``nearest_workday`` would
        move a Saturday holiday back to Friday, which is not the UK rule.
      * Good Friday and Easter Monday are computed from the Easter date via
        pandas' built-in ``GoodFriday`` / ``EasterMonday`` rules; they cannot
        be expressed as a weekday offset from a fixed calendar day.
      * Early May bank holiday is the first Monday of May.
      * Spring and Summer bank holidays are the *last* Monday of May and
        August: start from the 31st and step back with ``MO(-1)``.

    One-off holidays and one-off date moves (e.g. royal events) are not
    modelled; only the recurring rules above are.
    """
    rules = [
        Holiday('New Year\'s Day', month=1, day=1, observance=next_monday),
        GoodFriday,
        EasterMonday,
        Holiday('Early May Bank Holiday', month=5, day=1, offset=pd.DateOffset(weekday=MO(1))),
        Holiday('Spring Bank Holiday', month=5, day=31, offset=pd.DateOffset(weekday=MO(-1))),
        Holiday('Summer Bank Holiday', month=8, day=31, offset=pd.DateOffset(weekday=MO(-1))),
        Holiday('Christmas Day', month=12, day=25, observance=next_monday),
        Holiday('Boxing Day', month=12, day=26, observance=next_monday_or_tuesday)
    ]

# Business-day offset that skips weekends and the holidays defined above.
uk_business_day = CustomBusinessDay(calendar=UKHolidayCalendar())

In [None]:
# Predict the targets of the last `n_days_for_prediction` training windows.
# NOTE: the window length `n_past` is deliberately NOT reassigned here; it is
# the look-back size used to build trainX and must keep that value.
n_days_for_prediction=15

# Each window built from range(n_past, len - n_future + 1) targets row
# i + n_future - 1, so the last windows target the last rows of the data set.
# Taking the dates straight from the data keeps every label aligned with its
# prediction. Regenerating them from a business-day calendar that starts one
# row earlier labelled each prediction one trading day too early.
predict_period_dates = train_dates.iloc[-n_days_for_prediction:].tolist()

prediction = model.predict(trainX[-n_days_for_prediction:]) #shape = (n, 1) where n is the n_days_for_prediction

# The scaler was fitted on all feature columns, so repeat the single predicted
# column across every feature slot, invert the scaling, and keep column 0.
prediction_copies = np.repeat(prediction, df_for_training.shape[1], axis=-1)
y_pred_future = scaler.inverse_transform(prediction_copies)[:,0]



Forecasted data snapshot

In [None]:
# Build the forecast frame: one row per predicted day.
# Timestamps are first reduced to plain dates, then parsed back to datetimes.
forecast_dates = [stamp.date() for stamp in predict_period_dates]

df_fore = pd.DataFrame({'Date':np.array(forecast_dates), 'Open':y_pred_future})
df_fore['Date'] = pd.to_datetime(df_fore['Date'])

Original data

In [None]:
# Observed 'Open' prices to compare against the forecast.
# Take an explicit copy: assigning into a column-subset of `df` triggers
# pandas' chained-assignment (SettingWithCopy) warning.
df_org = df[['Date', 'Open']].copy()
df_org['Date'] = pd.to_datetime(df_org['Date'])
# Keep only rows dated on or after 2020-05-01.
df_org = df_org.loc[df_org['Date'] >= '2020-5-1']

In [None]:
# Plot Original vs Forecasted
# x/y are passed by keyword: current seaborn accepts only `data` positionally,
# so lineplot(series_x, series_y) raises a TypeError there.
ax = sns.lineplot(x=df_org['Date'], y=df_org['Open'], label='Original')
sns.lineplot(x=df_fore['Date'], y=df_fore['Open'], label='Forecast', ax=ax)
ax.set(title='Original vs forecasted Open price', xlabel='Date', ylabel='Open');