In [5]:
# Recurrent Neural Network

# Part 1 - Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the health-sector stock price history into a DataFrame
# (the path is relative to the notebook's working directory)
dataset_train = pd.read_csv(filepath_or_buffer="Datasets/SPX_Health_Sector_stocks_all.csv")

In [6]:
# Preview the loaded frame; as the last expression of the cell it is rendered as the cell output
dataset_train


Unnamed: 0,Ticker,Date,Open,High,Low,Close,Trading Volume,Volume Weighted Average Price,Number of Transactions
0,ABT,2020-03-11,73.28,78.1500,72.01,74.74,14123240.0,74.9395,113015
1,ABT,2020-03-12,77.91,82.6700,74.83,81.65,18521541.0,77.7958,160884
2,ABT,2020-03-15,70.65,79.2500,70.00,73.66,14358978.0,75.4936,113523
3,ABT,2020-03-16,75.52,79.7900,73.34,79.49,13441141.0,77.9694,124950
4,ABT,2020-03-17,74.98,81.9900,74.39,79.26,17680823.0,78.7228,141919
...,...,...,...,...,...,...,...,...,...
30130,BNTX,2021-12-26,253.24,259.9000,250.23,254.84,1515642.0,256.4526,31055
30131,BNTX,2021-12-27,250.35,256.4854,238.27,241.40,2555171.0,245.9939,49045
30132,BNTX,2021-12-28,235.12,244.3000,231.73,240.61,2713692.0,238.4607,43521
30133,BNTX,2021-12-29,239.02,258.8200,238.00,256.81,2353520.0,251.8848,36782


In [7]:
# Select the model input by column name instead of by position: the previous
# `iloc[:, 2:3]` would silently pick a different column if the CSV layout changed.
# NOTE(review): position 2 corresponds to "Open" according to the frame preview and
# the values printed by this cell - confirm against the CSV header.
# The double brackets keep the result 2-D, shape (n_rows, 1), which is the layout
# scikit-learn scalers expect.
training_set = dataset_train[["Open"]].to_numpy()
training_set

array([[ 73.28],
       [ 77.91],
       [ 70.65],
       ...,
       [235.12],
       [239.02],
       [253.56]])

In [10]:
# Feature scaling
# Min-max normalisation (rather than standardisation) is used here; it is the usual
# choice for RNNs with sigmoid activation functions.
# 'MinMaxScaler' is the scikit-learn class that performs this normalisation.
from sklearn.preprocessing import MinMaxScaler

# 'feature_range=(0, 1)' maps the smallest training value to 0 and the largest to 1
sc = MinMaxScaler(feature_range=(0, 1))
# Learn the min/max from the training values, then apply the scaling to those same values
sc.fit(training_set)
training_set_scaled = sc.transform(training_set)

In [11]:
# Build the supervised samples: each input is a window of TIMESTEPS consecutive scaled
# values and the target is the value that immediately follows the window.
# The window length is what the RNN gets to "remember" for each prediction; a poor
# choice can lead to overfitting or meaningless results.
TIMESTEPS = 60

# Take the row count from the data instead of hard-coding 30135, so the cell neither
# raises IndexError on a shorter dataset nor silently ignores rows of a longer one.
n_rows = len(training_set_scaled)
if n_rows <= TIMESTEPS:
    raise ValueError(
        f"Need more than {TIMESTEPS} rows to build a training window, got {n_rows}"
    )

# NOTE(review): the frame preview shows several tickers stacked in one file, so windows
# near a ticker boundary mix prices of two different stocks - consider windowing per ticker.

# 'X_train': the TIMESTEPS scaled values preceding row i
X_train = []
# 'y_train': the scaled value at row i (the value to predict)
y_train = []
for i in range(TIMESTEPS, n_rows):
    X_train.append(training_set_scaled[i - TIMESTEPS:i, 0])
    y_train.append(training_set_scaled[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)

In [12]:
# Reshaping: append a trailing feature axis
# Keras recurrent layers take 3-D input laid out as (samples, timesteps, features):
#   - X_train.shape[0] is the number of samples (one row per window)
#   - X_train.shape[1] is the number of timesteps in each window
#   - the last axis holds the indicators fed to the network; it has size 1 here because
#     a single price series is used. Further indicators that may correlate with the
#     stock price would be added along this axis.
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))

# Show the dataset we're working with
display(dataset_train)

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Trading Volume,Volume Weighted Average Price,Number of Transactions
0,ABT,2020-03-11,73.28,78.1500,72.01,74.74,14123240.0,74.9395,113015
1,ABT,2020-03-12,77.91,82.6700,74.83,81.65,18521541.0,77.7958,160884
2,ABT,2020-03-15,70.65,79.2500,70.00,73.66,14358978.0,75.4936,113523
3,ABT,2020-03-16,75.52,79.7900,73.34,79.49,13441141.0,77.9694,124950
4,ABT,2020-03-17,74.98,81.9900,74.39,79.26,17680823.0,78.7228,141919
...,...,...,...,...,...,...,...,...,...
30130,BNTX,2021-12-26,253.24,259.9000,250.23,254.84,1515642.0,256.4526,31055
30131,BNTX,2021-12-27,250.35,256.4854,238.27,241.40,2555171.0,245.9939,49045
30132,BNTX,2021-12-28,235.12,244.3000,231.73,240.61,2713692.0,238.4607,43521
30133,BNTX,2021-12-29,239.02,258.8200,238.00,256.81,2353520.0,251.8848,36782


In [13]:
# Part 2 - Building the RNN
# Building a robust stacked LSTM with dropout regularization

# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

In [14]:
# Initialising the RNN
# The model is called 'regressor' because it predicts a continuous value (regression)
# Sequential() starts as an empty stack; layers are appended to it with regressor.add(...)
regressor = Sequential()

In [15]:
# First LSTM layer, followed by Dropout regularisation
#   - 50 units: the number of LSTM memory cells, i.e. the dimensionality of the layer output
#   - return_sequences=True: the full output sequence is returned because more LSTM
#     layers are stacked on top of this one
#   - input_shape: (timesteps, features) of a single X_train sample
regressor.add(
    LSTM(
        50,
        return_sequences=True,
        input_shape=(X_train.shape[1], 1),
    )
)
# Dropout randomly zeroes a fraction of 0.2 of the layer's outputs during training
# to reduce overfitting
regressor.add(Dropout(rate=0.2))

In [16]:
# Second LSTM layer with Dropout regularisation
# No input_shape is needed from here on: the input size is inferred from the previous layer
regressor.add(LSTM(50, return_sequences=True))
regressor.add(Dropout(rate=0.2))

# Third LSTM layer with Dropout regularisation
regressor.add(LSTM(50, return_sequences=True))
regressor.add(Dropout(rate=0.2))

# Fourth and last LSTM layer with Dropout regularisation
# 'return_sequences' defaults to False, so it is omitted and only the final output is returned
regressor.add(LSTM(50))
regressor.add(Dropout(rate=0.2))