# Long Short Term Memory Stock Prediction Model

### Importing Libraries

In [1]:
# Data Manipulation libraries
import pandas as pd
print(f'pandas: {pd.__version__}')
import numpy as np
print(f'numpy: {np.__version__}')

# Tesorfolw and Keras
import tensorflow as tf
from tensorflow import keras 
print(f'Tensorflow/Keras: {keras.__version__}')
from keras.models import Sequential
from keras import Input
from keras.layers import Bidirectional,LSTM,Dropout,RepeatVector,Dense,TimeDistributed

# Statistical tools
import sklearn
print(f'sklearn: {sklearn.__version__}')
from sklearn.preprocessing import MinMaxScaler

# Visualization
import plotly
import plotly.express as px
import plotly.graph_objects as go
print(f'plotly: {plotly.__version__}')

# Other tools
import pickle
import sys 
import os

main_dir=os.path.dirname(sys.path[0])
print(f'Working right now in {main_dir}')
%autosave 120


pandas: 1.4.3
numpy: 1.23.1
Tensorflow/Keras: 2.9.0
sklearn: 1.1.1
plotly: 5.9.0
Working right now in D:\Coding\Machine learning\Projects


Autosaving every 120 seconds


### Ingesting and preparing data

In [2]:
#Taking input data
stonk = pd.read_csv("data/tata_stock_data.csv", delimiter=',')

#Converting Date column to date format
stonk['Date'] = pd.to_datetime(stonk['Date'], dayfirst=True)
# stonk.index = stonk['Date'] 

#Printing data info
print(f'Dataframe contains stock prices between {stonk.Date.min().date()} and {stonk.Date.max().date()}')
print(f'Total days = {(stonk.Date.max() - stonk.Date.min()).days} days')
print(stonk.head())
stonk.info()

# plt.rcParams["figure.figsize"]=(25,12)
# stonk.plot(y='Close',kind='line', color='purple')
# plt.show()

Dataframe contains stock prices between 2017-01-02 and 2021-12-31
Total days = 1824 days
        Date    Open    High     Low   Close  Volume  High-low  Open-close
0 2017-01-02  474.95  488.50  469.20  487.25  415721     19.30      -12.30
1 2017-01-03  491.00  492.75  479.45  481.25  823203     13.30        9.75
2 2017-01-04  485.00  493.60  484.60  486.50  414780      9.00       -1.50
3 2017-01-05  496.10  503.20  495.00  501.95  539024      8.20       -5.85
4 2017-01-06  506.00  507.85  496.40  497.80  451099     11.45        8.20
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239 entries, 0 to 1238
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        1239 non-null   datetime64[ns]
 1   Open        1239 non-null   float64       
 2   High        1239 non-null   float64       
 3   Low         1239 non-null   float64       
 4   Close       1239 non-null   float64       
 5   Volume      12

In [38]:
# Set Pandas options to display more columns
pd.options.display.max_columns=150

In [39]:
data = pd.DataFrame(index= range(0,len(stonk)), columns = ['Date', 'Close'])

days=60

for i in range(0,len(stonk)):
    data['Date'][i] = stonk['Date'][i]
    data['Close'][i] = stonk['Close'][i]
    
data.index = data.Date
data.drop('Date', axis=1, inplace=True)
print(data)

final_data = data.values

train_data = final_data[0:int(len(stonk)*0.8)]
print(f'Taking the training data from index 0 to { int(len(stonk)*0.8) - 1} : {int(len(stonk)*0.8)} data points')
test_data = final_data[int(len(stonk)*0.8):]
print(f'Taking the test data from index {int(len(stonk)*0.8)} to {len(stonk)} : {int(len(stonk))-int(len(stonk)*0.8)+1} data points' )

# scaler = MinMaxScaler(feature_range=(0,1))
# scaled_data = scaler.fit_transform(final_data)


             Close
Date              
2017-01-02  487.25
2017-01-03  481.25
2017-01-04   486.5
2017-01-05  501.95
2017-01-06   497.8
...            ...
2021-12-27   471.3
2021-12-28   480.1
2021-12-29   475.8
2021-12-30  470.35
2021-12-31  482.35

[1239 rows x 1 columns]
Taking the training data from index 0 to 990 : 991 data points
Taking the test data from index 991 to 1239 : 249 data points


In [40]:
# Plotting the data taken as input
fig =go.Figure()
fig.add_trace(go.Scatter(x=data.index, 
                             y=data.Close,
                             mode='lines',
                             name='',
                             opacity=1,
                             line=dict(width=1)
                            ))

fig.update_xaxes(showgrid=True, gridwidth=0.1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black',
                 title='Date'
                )
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black',
                 title='Price (INR)'
                )

fig.update_layout(dict(plot_bgcolor = 'black'), showlegend = False)

# Set figure title
fig.update_layout(title=dict(text="TATA", font=dict(color='black')))

fig.show()

### Function to prepare data for lstm


In [41]:
def shaping(datain, timestep, scaler):
    
    # Convert input dataframe to array and flatten
    arr=datain.flatten()
    
    # Scale 
    arr_scaled=scaler.fit_transform(arr.reshape(-1, 1)).flatten()
    
    cnt=0
    for mth in range(0, len(datain)-(2*timestep)+1): # Define range 
        cnt=cnt+1 # Gives us the number of samples. Later used to reshape the data
        X_start=mth # Start month for inputs of each sample
        X_end=mth+timestep # End month for inputs of each sample
        Y_start=mth+timestep # Start month for targets of each sample. Note, start is inclusive and end is exclusive, that's why X_end and Y_start is the same number
        Y_end=mth+2*timestep # End month for targets of each sample.  
        
        # Assemble input and target arrays containing all samples
        if mth==0:
            X_comb=arr_scaled[X_start:X_end]
            Y_comb=arr_scaled[Y_start:Y_end]
        else: 
            X_comb=np.append(X_comb, arr_scaled[X_start:X_end])
            Y_comb=np.append(Y_comb, arr_scaled[Y_start:Y_end])
    
    # Reshape input and target arrays arrays 
    X_out=np.reshape(X_comb, (cnt, timestep, 1))
    Y_out=np.reshape(Y_comb, (cnt, timestep, 1))
    return X_out, Y_out

### Training and evaluating LSTM stock prediction model

In [42]:
# Step 1 - Specify parameters
timestep=45
scaler = MinMaxScaler(feature_range=(-1,1))

# Step 2 - Prepare data

df_train = train_data
df_test = test_data


# Use previously defined shaping function to reshape the data for LSTM
X_train, Y_train = shaping(datain=df_train, timestep=timestep, scaler=scaler)
X_test, Y_test = shaping(datain=df_test, timestep=timestep, scaler=scaler)

In [44]:
%%time
##### Step 3 - Specify the structure of a Neural Network
model = Sequential(name="LSTM-Model") # Model
model.add(Input(shape=(X_train.shape[1],X_train.shape[2]), name='Input-Layer')) # Input Layer - need to speicfy the shape of inputs
model.add(Bidirectional(LSTM(units=32, activation='tanh', recurrent_activation='sigmoid', stateful=False), name='Hidden-LSTM-Encoder-Layer')) # Encoder Layer
model.add(RepeatVector(Y_train.shape[1], name='Repeat-Vector-Layer')) # Repeat Vector
model.add(Bidirectional(LSTM(units=32, activation='tanh', recurrent_activation='sigmoid', stateful=False, return_sequences=True), name='Hidden-LSTM-Decoder-Layer')) # Decoder Layer
model.add(TimeDistributed(Dense(units=1, activation='linear'), name='Output-Layer')) # Output Layer, Linear(x) = x


##### Step 4 - Compile the model
model.compile(optimizer='adam', # default='rmsprop', an algorithm to be used in backpropagation
              loss='mean_squared_error', # Loss function to be optimized. A string (name of loss function), or a tf.keras.losses.Loss instance.
              metrics=['MeanSquaredError', 'MeanAbsoluteError'], # List of metrics to be evaluated by the model during training and testing. Each of this can be a string (name of a built-in function), function or a tf.keras.metrics.Metric instance. 
              loss_weights=None, # default=None, Optional list or dictionary specifying scalar coefficients (Python floats) to weight the loss contributions of different model outputs.
              weighted_metrics=None, # default=None, List of metrics to be evaluated and weighted by sample_weight or class_weight during training and testing.
              run_eagerly=None, # Defaults to False. If True, this Model's logic will not be wrapped in a tf.function. Recommended to leave this as None unless your Model cannot be run inside a tf.function.
              steps_per_execution=None # Defaults to 1. The number of batches to run during each tf.function call. Running multiple batches inside a single tf.function call can greatly improve performance on TPUs or small models with a large Python overhead.
             )


##### Step 5 - Fit the model on the dataset
history = model.fit(X_train,# input data
                    Y_train,      # target data
                    batch_size=1,# Number of samples per gradient update. If unspecified, batch_size will default to 32.
                    epochs=60,      # default=1, Number of epochs to train the model. An epoch is an iteration over the entire x and y data provided
                    verbose=0,      # default='auto', ('auto', 0, 1, or 2). Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch. 'auto' defaults to 1 for most cases, but 2 when used with ParameterServerStrategy.
                    callbacks=None,# default=None, list of callbacks to apply during training. See tf.keras.callbacks
                    validation_split=0.2,# default=0.0, Fraction of the training data to be used as validation data. The model will set apart this fraction of the training data, will not train on it, and will evaluate the loss and any model metrics on this data at the end of each epoch. 
                    shuffle=True,             # default=True, Boolean (whether to shuffle the training data before each epoch) or str (for 'batch').
                    class_weight=None,      # default=None, Optional dictionary mapping class indices (integers) to a weight (float) value, used for weighting the loss function (during training only). This can be useful to tell the model to "pay more attention" to samples from an under-represented class.
                    sample_weight=None,                        # default=None, Optional Numpy array of weights for the training samples, used for weighting the loss function (during training only).
                    initial_epoch=0,            # Integer, default=0, Epoch at which to start training (useful for resuming a previous training run).
                    steps_per_epoch=None,# Integer or None, default=None, Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. When training with input tensors such as TensorFlow data tensors, the default None is equal to the number of samples in your dataset divided by the batch size, or 1 if that cannot be determined. 
                    validation_steps=None,# Only relevant if validation_data is provided and is a tf.data dataset. Total number of steps (batches of samples) to draw before stopping when performing validation at the end of every epoch.
                    validation_batch_size=None,# Integer or None, default=None, Number of samples per validation batch. If unspecified, will default to batch_size.
                    validation_freq=100,# default=1, Only relevant if validation data is provided. If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. validation_freq=2 runs validation every 2 epochs.
                    max_queue_size=10,# default=10, Used for generator or keras.utils.Sequence input only. Maximum size for the generator queue. If unspecified, max_queue_size will default to 10.
                    workers=1,# default=1, Used for generator or keras.utils.Sequence input only. Maximum number of processes to spin up when using process-based threading. If unspecified, workers will default to 1.
                    use_multiprocessing=True,      # default=False, Used for generator or keras.utils.Sequence input only. If True, use process-based threading. If unspecified, use_multiprocessing will default to False. 
                   )


##### Step 6 - Use model to make predictions
# Predict results on training data
pred_train = model.predict(X_train)
# Predict results on test data
pred_test = model.predict(X_test)


##### Step 7 - Print Performance Summary
print("")
print('-------------------- Model Summary --------------------')
model.summary() # print model summary
print("")
print('-------------------- Weights and Biases --------------------')
print("Too many parameters to print but you can use the code provided if needed")
print("")
#for layer in model.layers:
#    print(layer.name)
#    for item in layer.get_weights():
#        print("  ", item)
#print("")

#Print the last value in the evaluation metrics contained within history file
print('-------------------- Evaluation on Training Data --------------------')
for item in history.history:
    print("Final", item, ":", history.history[item][-1])
print("")

#Evaluate the model on the test data using "evaluate"
print('-------------------- Evaluation on Test Data --------------------')
results = model.evaluate(X_test, Y_test)
print("")


-------------------- Model Summary --------------------
Model: "LSTM-Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Hidden-LSTM-Encoder-Layer (  (None, 64)               8704      
 Bidirectional)                                                  
                                                                 
 Repeat-Vector-Layer (Repeat  (None, 45, 64)           0         
 Vector)                                                         
                                                                 
 Hidden-LSTM-Decoder-Layer (  (None, 45, 64)           24832     
 Bidirectional)                                                  
                                                                 
 Output-Layer (TimeDistribut  (None, 45, 1)            65        
 ed)                                                             
                                                                 

In [45]:
print(pred_train,pred_test)

[[[ 0.5910037 ]
  [ 0.56822556]
  [ 0.5640673 ]
  ...
  [ 0.48055148]
  [ 0.48882568]
  [ 0.5076539 ]]

 [[ 0.58139426]
  [ 0.56057835]
  [ 0.5564326 ]
  ...
  [ 0.4767341 ]
  [ 0.48540187]
  [ 0.50400627]]

 [[ 0.5896315 ]
  [ 0.5686246 ]
  [ 0.5638881 ]
  ...
  [ 0.48836404]
  [ 0.49849987]
  [ 0.5172737 ]]

 ...

 [[-0.6958587 ]
  [-0.6858544 ]
  [-0.69869393]
  ...
  [-0.51568466]
  [-0.50866205]
  [-0.5113278 ]]

 [[-0.70349914]
  [-0.69440717]
  [-0.70811975]
  ...
  [-0.5182877 ]
  [-0.5111154 ]
  [-0.5130172 ]]

 [[-0.7053118 ]
  [-0.6951885 ]
  [-0.708926  ]
  ...
  [-0.5177129 ]
  [-0.5103245 ]
  [-0.51129836]]] [[[-0.387883  ]
  [-0.38261443]
  [-0.3744871 ]
  ...
  [-0.39921713]
  [-0.41209948]
  [-0.4199474 ]]

 [[-0.42832738]
  [-0.43577844]
  [-0.43841887]
  ...
  [-0.5757973 ]
  [-0.5928308 ]
  [-0.60049325]]

 [[-0.44924432]
  [-0.47126663]
  [-0.48498058]
  ...
  [-0.6190018 ]
  [-0.6387149 ]
  [-0.64748853]]

 ...

 [[ 0.6692044 ]
  [ 0.7524021 ]
  [ 0.7754998 ]
  ..

### Visualizing the results

In [50]:
# Plot stock prices (actual and predicted) for test (out of time) data
fig = go.Figure()

# Trace for actual stock prices
fig.add_trace(go.Scatter(x=np.array(data.index[-2*timestep:]),
                         y=np.array(data[-2*timestep:]).flatten(),
                         mode='lines',
                         name='Stock Daily Prices - Actual (Test)',
                         opacity=0.8,
                         line=dict(color='black', width=1)
                        ))

# Trace for predicted stock prices
fig.add_trace(go.Scatter(x=np.array(data.index[-timestep:]),
                         y=scaler.inverse_transform(pred_train.reshape(-1,1)).flatten(),
#                          y=scaler.inverse_transform(pred_test.reshape(-1,1)).flatten(),
                         mode='lines',
                         name='Stock Prices - Predicted (Test)',
                         opacity=0.8,
                         line=dict(color='red', width=1)
                        ))

# Change chart background color
# fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black',
                 title='Date'
                )

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black',
                 title='Prices (in INR)'
                )

# Set figure title
fig.update_layout(title=dict(text="TATA Stock Prices", font=dict(color='black')),
                  legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
                 )

fig.show()

### Saving the Model

In [1]:
# pickle.dumps(model)

# Resources used :
# https://www.projectpro.io/recipes/save-trained-model-in-python
# https://towardsdatascience.com/lstm-recurrent-neural-networks-how-to-teach-a-network-to-remember-the-past-55e54c2ff22e
# https://www.geeksforgeeks.org/understanding-of-lstm-networks/
# https://medium.com/analytics-vidhya/lstms-explained-a-complete-technically-accurate-conceptual-guide-with-keras-2a650327e8f2
# https://www.diva-portal.org/smash/get/diva2:1213449/FULLTEXT01.pdf
# https://towardsdatascience.com/predicting-stock-prices-using-a-keras-lstm-model-4225457f0233

In [48]:

# print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

Num GPUs Available:  1
