# Python implementation for forecasting price of Dow Jones Industrial Average index using linear regression (LR), support vector regression (SVR), long short term memory (LSTM) neural network, artificial neural network (ANN), and stage two hybrid technique with all above four in stage 1 and LSTM in stage 2. Evaluating their forecasting performance using root mean square error (RMSE) and mean absolute percentage error (MAPE). Calculating the importance of inputs in forecasting the output.

## Importing basic libraries like Numpy, Pandas, and others along with Tensorflow.

In [52]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import datetime
import os
import os.path
from numpy.random import seed
seed(11)
import tensorflow as tf
tf.random.set_seed(12)
try:
    import pandas as pd
    print("  pandas: %s"% pd.__version__)
except:
    print("Missing pandas package")



  pandas: 1.4.2


## Import yahoo finance API, and extract the stock index data from yahoo finance.

## NASDAQ 100, S&P 500, Dow Jones Industrial average are the important stock indices of United States of America (USA).

In [53]:
import yfinance as yf
df = yf.download("^DJI", start="1985-01-01", end="2022-07-30", interval="1mo") 
data=df['Close']
data
# ^NDX is symbol for NASDAQ 100
# ^GSPC is symbol for S&P 500
# ^DJI is symbol for Dow Jones Industrial Average (it consists of 30 companies)


[*********************100%***********************]  1 of 1 completed


Date
1992-02-01     3267.699951
1992-03-01     3235.500000
1992-04-01     3359.100098
1992-05-01     3396.899902
1992-06-01     3318.500000
                  ...     
2022-03-01    34678.351562
2022-04-01    32977.210938
2022-05-01    32990.121094
2022-06-01    30775.429688
2022-07-01    32845.128906
Name: Close, Length: 366, dtype: float64

## Exponential moving average (EMA) is calculated with 90 percent weightage to the current value.

In [54]:
df['EMA']=df['Adj Close'].ewm(alpha=0.9).mean()

## The onbalance volume (OBV) is calculated in below steps.

In [55]:
#sign of price variation is calculated to get the know whether cash flow is positive or negative
df['sign']=np.sign(df['Adj Close'].pct_change()) 

In [56]:
df['sign']=df['sign']*df['Volume'] #now sign is multiplied with volume to get the volume effect of cash flow trend

In [57]:
df['OBV']=np.cumsum(df['sign']) # the cumulative effect of volume of cash flow is obtained which is known as on-balance volume.


## Average true range (ATR) is calculated in below steps.

In [58]:
# the three different arguments of the true range are calculated and assigned to arbitrary variables a,b,c
a=pd.DataFrame(df['High']-df['Low'])
b=pd.DataFrame(abs(df['High']-df['Close'].shift(1)))
c=pd.DataFrame(abs(df['High']-df['Close'].shift(1)))


In [59]:
# all the three arguments of the true range are combined into a single dataframe
a['1']=b
a['2']=c


In [60]:
tr=a.max(axis=1) # true range (TR) is the maximum value amoung its three arguments 


In [61]:
df['TR']=tr


In [62]:
df['ATR']=df['TR'].rolling(3).mean() # average true range (ATR) is calculated from true range

## The close price of the index is converted into a dataframe

In [63]:
data = data.to_frame()

## The name of the columns are changed inorder to be consistent with the remaining program

In [64]:
data = data.rename(columns = {'Close':'CLOSE'})


## The datareader function is imported for the purpose of remote data access. The Fama French five factor model data is read from their database. This Fama French data is for stocks markets in United States of America (USA).

In [65]:
import pandas_datareader.data as web
START_DATE = '1985-1-1'
END_DATE = '2022-07-30'
df_five_factor = web.DataReader('F-F_Research_Data_5_Factors_2x3', 
                                'famafrench', 
                                start=START_DATE)[0]
df_five_factor.index = df_five_factor.index.format()

## All the technical indiactors are change into percentage values and multiplied with 100 to get percentages as they have to be in the same format as the five factors and the five factors are in the percentage values. Also the time format is changed to year and month as the five factors are in this format.


In [66]:
df2 = df['ATR'].pct_change()
tr=df2*100
tr.index = tr.index.strftime('%Y-%m')
df_five_factor['ATR'] = tr


In [67]:
df3 = df['OBV'].pct_change()
obv=df3*100
obv.index = obv.index.strftime('%Y-%m')
df_five_factor['OBV'] = obv


In [68]:
df4 = df['EMA'].pct_change()
ema=df4*100
ema.index = ema.index.strftime('%Y-%m')
df_five_factor['EMA'] = ema


## Consumer price index (CPI) inflation for United States of America (USA) in the format of comma seperated values (csv) are downloaded form organisation for economic cooperation and development (OCED) website (https://data.oecd.org/price/inflation-cpi.htm)  .  This data is read and added to the dataframe.

In [69]:
cpi = pd.read_csv('cpi.csv', index_col=0)

## Long term interest rates(IR) for United States of America (USA) in the format of comma seperated values (csv) are downloaded form organisation for economic cooperation and development (OCED) website (https://data.oecd.org/interest/long-term-interest-rates.htm)  .  This data is read and added to the dataframe.

In [70]:
interest = pd.read_csv('IR.csv', index_col=0)

## CPI and IR are also converted in percentages in the steps below.


In [71]:
df_five_factor['CPI']=cpi.pct_change()*100


In [72]:
df_five_factor['IR']=interest.pct_change()*100


## All the ten parameters which include five factors, technical indicators, and economic data are now converted from percentages to normal values with the stock market index start price as reference.

In [73]:
df_five_factor = 1+df_five_factor/100 # Obtain the rate changes effect as a multiplicative factor


In [74]:
df_five_factor =  np.cumprod(df_five_factor) # compute the compounding effect of the rate changes

In [75]:
df_five_factor = df_five_factor*data.iloc[0,0] # consider the start price value of the index as reference.

## Column length of the dataframe is obtained which can be used to determine the dimensionality of input for our machine learning techniques.

In [76]:
col_len = len(list(df_five_factor.columns))
col_len

11

## Renaming coloumns.

In [77]:
df_five_factor.rename(columns = {'Mkt-RF': 'MKT'} , inplace = True)

In [78]:
df_five_factor.rename(columns = {'Mom   ': 'MOM'} , inplace = True)

In [79]:
list_factor=list(df_five_factor.columns)
list_factor

['MKT', 'SMB', 'HML', 'RMW', 'CMA', 'RF', 'ATR', 'OBV', 'EMA', 'CPI', 'IR']

## Adjusting the time format of close value.

In [80]:
y = data.dropna()

y.index = y.index.strftime('%Y-%m')
y.name = 'return'

## Determine the period (how many months) for which you want to forecast. 

In [81]:
predict_period=24
five_factor_data=df_five_factor.shift(predict_period) 
# advance the factors and other parameters as per the forecast period so that current factors can forecast future values.

## Add the stock index close price (the value that has to be forecasted) to the dataframe containing the input parameters.

In [82]:
five_factor_data = five_factor_data.join(y)

## Remove the row entries with missing values so that data is clean for processing and will not cause any issue while the machine is being trained, validated and tested.

In [83]:
five_factor_data = five_factor_data.dropna()

## Convert all the data into floating point numbers so that computations donot through any errors due to incompatibility of data types.

In [84]:
five_factor_data= five_factor_data.astype(float)

In [85]:
five_factor_data=five_factor_data.dropna()
five_factor_data2 = five_factor_data


## Adjusting the length of the dataset to make it suitable for proper splitting and pre-processing.

In [86]:
five_factor_data=five_factor_data.iloc[len(five_factor_data)%10:,:]


In [87]:
from statsmodels.tsa.stattools import adfuller

adfuller(five_factor_data['CLOSE'])

(0.23292605068778946,
 0.9740490166922292,
 7,
 322,
 {'1%': -3.4508226600665037,
  '5%': -2.870558121868621,
  '10%': -2.571574731684734},
 4973.387131799391)

## Use logarthimic scaling to reduce the non-stationarity of the data.

In [88]:
five_factor_data=abs(five_factor_data) # eliminating the negative sign so that the log operation can be applied on the data.
five_factor_data=np.log10(five_factor_data)


In [89]:
adfuller(five_factor_data['CLOSE'])

(-1.4366977156109846,
 0.5645618851903482,
 0,
 329,
 {'1%': -3.4503836022181056,
  '5%': -2.8703653471616826,
  '10%': -2.571471939191249},
 -1581.5868526999059)

## Import sklearn and its functions that are used in this program.

In [90]:
from sklearn import metrics
import sklearn as sk
from sklearn.model_selection import train_test_split


## Split the data into train and test.

In [91]:
train_five_factor_data, test_five_factor_data = train_test_split(five_factor_data, test_size=0.2, random_state=0, shuffle=False)

## Import MinMaxScalar and use it to scale the data as machine learning techniques are efficient when working with data in the range 0 to 1.

## Scalar associated with train dataset is named 's1' and scalar associated with test dataset is named 's2'.

In [92]:
from sklearn.preprocessing import MinMaxScaler
s1 = MinMaxScaler(feature_range=(0,1))
train_five_factor_data[:]=s1.fit_transform(train_five_factor_data)


In [93]:
s2 = MinMaxScaler(feature_range=(0,1))
test_five_factor_data[:]=s2.fit_transform(test_five_factor_data)


## Obtain the column and index names to assign them to dataframe obtained as outputs of the machine learning techniques.

In [94]:
list_columns=test_five_factor_data.columns.to_list()

In [95]:
list_index_test = test_five_factor_data.index.to_list()

In [96]:
list_index_train = train_five_factor_data.index.to_list()


## Import statsmodel library and its formula API so that linear regression model using ordinary least squares criteria be implemented. Train the linear regression model using training dataset.

In [97]:
import statsmodels.formula.api as smf

five_factor_model = smf.ols(
    formula='CLOSE ~ MKT + SMB + HML + RMW + CMA+ RF + IR + CPI + ATR + OBV + EMA', 
    data=train_five_factor_data
).fit()
print(five_factor_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  CLOSE   R-squared:                       0.867
Model:                            OLS   Adj. R-squared:                  0.861
Method:                 Least Squares   F-statistic:                     148.8
Date:                Thu, 06 Oct 2022   Prob (F-statistic):          1.89e-103
Time:                        22:16:13   Log-Likelihood:                 309.04
No. Observations:                 264   AIC:                            -594.1
Df Residuals:                     252   BIC:                            -551.2
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1108      0.074      1.500      0.1

## Predict the close price for test data using the trained linear regression model.

In [98]:
df = (five_factor_model.predict(test_five_factor_data.iloc[:,0:col_len]))


## Predict the close price for train data using the trained linear regression model.

In [99]:
df_tr = (five_factor_model.predict(train_five_factor_data.iloc[:,0:col_len]))


## Appending predicted values to the train and test dataframe. 

In [100]:
test_five_factor_data['LR']=df


In [101]:
train_five_factor_data['LR']=df_tr


## Creating dataframes which are similar to train and test data but close prices replaced by predicted data. This is done so that the dimensions match and the inverse transformation of MinMaxScalar are applied.

In [102]:
new_df = test_five_factor_data[list_factor+['LR']]


In [103]:
new_df_tr = train_five_factor_data[list_factor+['LR']]


## Applying inverse transformation of MinMaxScalar associated to test data and train data to data sets where close is replaced by predicted value.

## Due to the dimentionality requirement of inverse MinMaxScalar we cannot have close and predicted value in a single dataframe and then apply the inverse scalar. Thus we are doing them seperately and later we will combine them.

In [104]:
new_df[:] = s2.inverse_transform(new_df)


In [105]:
new_df_tr[:] = s1.inverse_transform(new_df_tr)


## Applying inverse transformation of logarthimic scaling to test and train like data where close is replaced by predicted values.

In [106]:
new_df=10**(new_df)


In [107]:
new_df_tr=10**(new_df_tr)


## Applying inverse transform of MinMaxScalar to test and train data.

In [108]:
new_df_test = test_five_factor_data[list_factor+['CLOSE']]
new_df_test[:] = s2.inverse_transform(new_df_test)


In [109]:
new_df_train = train_five_factor_data[list_factor+['CLOSE']]
new_df_train[:] = s1.inverse_transform(new_df_train)


## Applying inverse transformation of logarthimic scaling to test and train data.

In [110]:
new_df_test=10**(new_df_test)


In [111]:
new_df_train=10**(new_df_train)


## Combining predicted and close value along with input parameters into a single dataframe so that this dataframe can be used for calculating metrics.

In [112]:
new_df_test['LR']=new_df['LR']


In [113]:
new_df_train['LR']=new_df_tr['LR']


## Import sklearn libraries and associated functions for implementing support vector regression. Random search is used for hyperparameter tuning of penality factor C and gamma.

In [114]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, norm, lognorm, expon
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
import scipy

svr=SVR(tol=0.0001)
distributions=[{'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1), 'epsilon':scipy.stats.expon(scale=.1),
  'kernel': ['rbf']}]
scorer = make_scorer(mean_squared_error, greater_is_better=False)
svr1 = RandomizedSearchCV(svr, distributions, n_iter=100,random_state=0,scoring=scorer,cv=10)
search = svr1.fit(train_five_factor_data.iloc[:,0:col_len], train_five_factor_data.iloc[:,col_len])
search.best_params_


{'C': 95.59664156488698,
 'epsilon': 0.013218371012746116,
 'gamma': 0.18839288998793566,
 'kernel': 'rbf'}

## Display the success status of the model in fitting the data. It is zero for success.

In [115]:
search.best_estimator_.fit_status_

0

## Calculate the importance of each input factor (use one for only one factor and zeros for other factors to calculate its overall weight or coefficient).

In [116]:
w_imp=np.zeros(train_five_factor_data.iloc[:,0:col_len].shape[1]) # initiate the array to store importance of input parameters
for k in range(0,train_five_factor_data.iloc[:,0:col_len].shape[1]):
    imp=np.zeros((21,train_five_factor_data.iloc[:,0:col_len].shape[1])) #initiate the array to store values to compute sum
    for j in range(0,21): # run from 0 to 1 in steps of 0.05, 0.05 is used within the loop
        for i in range(0,train_five_factor_data.iloc[:,0:col_len].shape[1]):
            imp[j][k]=j*0.05
    w_imp[k]=sum(search.best_estimator_.predict(imp))
w_imp/21  


array([ 0.2040052 ,  0.16034498,  0.18919832,  0.03402029, -0.04157044,
        0.66082598,  0.37166989,  0.56063515,  0.43911123,  0.14780707,
        0.44592289])

## Predict the close price using the trained SVR model

In [117]:
y1 = pd.DataFrame(search.best_estimator_.predict(test_five_factor_data.iloc[:,0:col_len]),index=list_index_test,columns=['SVR'])


In [118]:
y1_tr = pd.DataFrame(search.best_estimator_.predict(train_five_factor_data.iloc[:,0:col_len]),index=list_index_train,columns=['SVR'])


In [119]:
test_five_factor_data['SVR']=y1


In [120]:
train_five_factor_data['SVR']=y1_tr


## Make necessary dimension changes and apply inverse scalar of MinMaxScalar to SVR predicted output.

In [121]:
new_dfsvr_test = test_five_factor_data[list_factor+['SVR']]
new_dfsvr_test[:] = s2.inverse_transform(new_dfsvr_test)


In [122]:
new_dfsvr_train = train_five_factor_data[list_factor+['SVR']]
new_dfsvr_train[:] = s1.inverse_transform(new_dfsvr_train)


## Apply inverse logarithmic scaling.

In [123]:
new_dfsvr_test=10**(new_dfsvr_test)


In [124]:
new_dfsvr_train=10**(new_dfsvr_train)


## Add SVR prediction results to the existing dataframes for calculating metrics.

In [125]:
new_df_test['SVR']=new_dfsvr_test['SVR']


In [126]:
new_df_train['SVR']=new_dfsvr_train['SVR']


## Import the libraries, functions, and layers necesary to implement the LSTM model.

In [127]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from tensorflow import keras 
from tensorflow.keras import layers
import keras_tuner
import keras
from keras.callbacks import EarlyStopping
import tensorflow as tf


## Reshape the data such that it fits the input dimensional requirements of LSTM. LSTM input shape is (number of rows or time steps,1,number of columns or features).

In [128]:
train_five_factor_data_re = train_five_factor_data.iloc[:,0:col_len].values.reshape(train_five_factor_data.iloc[:,0:col_len].shape[0],1, train_five_factor_data.iloc[:,0:col_len].shape[1])
test_five_factor_data_re = test_five_factor_data.iloc[:,0:col_len].values.reshape(test_five_factor_data.iloc[:,0:col_len].shape[0],1, test_five_factor_data.iloc[:,0:col_len].shape[1])


## Define the regressor for LSTM with hyperparameter tuning, and run hyperparameter tuning using keras tuner with number of layers, units per layer, dropout rate, and leraning rate as hyperparameters. Early stopping is also included while evaluating the model to find the best model.

In [129]:
def build_regressor(hp):
    model = keras.Sequential()
    for i in range(hp.Int("num",min_value=2, max_value=5, step=1)):
        model.add(layers.LSTM(units=hp.Int("units"+str(i), min_value=16, max_value=256, step=16), input_dim=col_len,return_sequences=True, activation='tanh'))
        model.add(layers.Dropout(rate=0.01*hp.Int("rate"+str(i), min_value=30, max_value= 50, step=10)))
    model.add(layers.Dense(units=1))
    learning_rate = hp.Float("lr", min_value=1e-5, max_value=1e-3, sampling="log")

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mean_squared_error",
        # Objectieve is one of the metrics.
        metrics=[keras.metrics.MeanSquaredError()],
    )
    return model


tuner = keras_tuner.RandomSearch(
    hypermodel=build_regressor,
    # The objective name and direction.
    # Name is the f"val_{snake_case_metric_class_name}".
    objective=keras_tuner.Objective("val_mean_squared_error", direction="min"),
    max_trials=15,
    seed=111,
    overwrite=True,
    directory="my_dir",
    project_name="built_in_metrics",
)

tuner.search(
    x=train_five_factor_data_re,
    y=train_five_factor_data.iloc[:,col_len],
    validation_data=(test_five_factor_data_re,test_five_factor_data.iloc[:,col_len]),
    callbacks=[tf.keras.callbacks.EarlyStopping("val_mean_squared_error")],
    epochs=100,

)

tuner.results_summary()


Trial 15 Complete [00h 00m 04s]
val_mean_squared_error: 0.2735501527786255

Best val_mean_squared_error So Far: 0.07033675909042358
Total elapsed time: 00h 01m 23s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in my_dir\built_in_metrics
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x000001B9AFF033D0>
Trial summary
Hyperparameters:
num: 4
units0: 112
rate0: 30
units1: 208
rate1: 40
lr: 0.0009022079415210089
units2: 256
rate2: 30
units3: 160
rate3: 40
Score: 0.07033675909042358
Trial summary
Hyperparameters:
num: 3
units0: 176
rate0: 40
units1: 208
rate1: 30
lr: 0.0005343634220268847
units2: 176
rate2: 40
units3: 256
rate3: 50
units4: 48
rate4: 50
Score: 0.13849397003650665
Trial summary
Hyperparameters:
num: 3
units0: 32
rate0: 40
units1: 128
rate1: 40
lr: 0.0006906159408436501
units2: 144
rate2: 40
units3: 224
rate3: 40
units4: 176
rate4: 40
Score: 0.1916838139295578
Trial summary
Hyperparameters:
num: 4
units0: 112
rate0: 30
units1: 144
rate

In [130]:
num_crv = 15

models1 = tuner.get_best_models(num_models=num_crv)


## Do 10-fold cross validation for find the consistent model amoung the top models.

In [131]:
from sklearn.model_selection import KFold
num_splits = 10
kf=KFold(n_splits=num_splits)


In [132]:
score = pd.DataFrame(index=list(range(num_crv)),columns=list(range(num_splits)))
for i in range(0,num_crv):
    j=0
    for train_index,val_index in kf.split(train_five_factor_data_re):
        x_train,x_val=train_five_factor_data_re[train_index],train_five_factor_data_re[val_index]
        y_train,y_val=train_five_factor_data.iloc[:,col_len][train_index],train_five_factor_data.iloc[:,col_len][val_index]
        score.iloc[i,j]=models1[i].evaluate(x_val,y_val)
        j=j+1




In [133]:
arr1 = pd.DataFrame(index=list(range(num_crv)),columns=list(range(num_splits)))
for i in range(0,num_crv):
    for j in range(0,num_splits):
        arr1.iloc[i,j]=np.array(score)[i][j][1]


In [134]:
arr1['sum']=arr1.sum(axis=1)


In [135]:
arr1['sum'].idxmin()

0

## Assign the topmost consistent model as the best model.

In [136]:
best_model=models1[arr1['sum'].idxmin()]

In [137]:
best_model.build()


## Display the best model that is built.

In [138]:
best_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, None, 112)         55552     
                                                                 
 dropout (Dropout)           (None, None, 112)         0         
                                                                 
 lstm_1 (LSTM)               (None, None, 208)         267072    
                                                                 
 dropout_1 (Dropout)         (None, None, 208)         0         
                                                                 
 lstm_2 (LSTM)               (None, None, 256)         476160    
                                                                 
 dropout_2 (Dropout)         (None, None, 256)         0         
                                                                 
 lstm_3 (LSTM)               (None, None, 160)         2

## Define early stopping criteria for epoches while fitting the model and fit the model.

In [139]:
import tensorflow as tf
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', patience=15, verbose=1)


In [140]:
best_model.fit(train_five_factor_data_re, train_five_factor_data.iloc[:,col_len] ,batch_size = 8, epochs = 500, verbose=1,callbacks=[callback])#batch_size=20 default

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x1b9c3a774c0>

## Calculate importance (overall coefficient) of each input on the output.


In [141]:
w_imp=np.zeros(train_five_factor_data_re.shape[2]) # initiate the array to store importance of input parameters
for k in range(0,train_five_factor_data_re.shape[2]):
    imp=np.zeros((21,train_five_factor_data_re.shape[2])) #initiate the array to store values to compute sum
    for j in range(0,21): # run from 0 to 1 in steps of 0.05, 0.05 is used within the loop
        for i in range(0,train_five_factor_data_re.shape[2]):
            imp[j][k]=j*0.05
    w_imp[k]=sum(best_model.predict(imp.reshape(21,1,train_five_factor_data_re.shape[2])))
w_imp/21




array([0.60449378, 0.3978365 , 0.28391062, 0.65568452, 0.44200466,
       0.25560781, 0.44630332, 0.49025427, 0.50852562, 0.2535939 ,
       0.20928074])

## Predict the close price using the fit LSTM model.

In [142]:
y_pred = best_model.predict(test_five_factor_data_re)



In [143]:
y_pred_tr = best_model.predict(train_five_factor_data_re)



## Do the inverse reshapeing so that the data can be obtained in the required format to apply inverse scaling.

In [144]:
y_pred=pd.DataFrame(y_pred.reshape(-1,1),index=list_index_test,columns=['LSTM'])

In [145]:
y_pred_tr=pd.DataFrame(y_pred_tr.reshape(-1,1),index=list_index_train,columns=['LSTM'])

In [146]:
test_five_factor_data['LSTM']=y_pred


In [147]:
train_five_factor_data['LSTM']=y_pred_tr


## Apply inverse scaling related to MinMaxScalar and logarithm scalar.

In [148]:
new_dflstm_test = test_five_factor_data[list_factor+['LSTM']]
new_dflstm_test[:] = s2.inverse_transform(new_dflstm_test)


In [149]:
new_dflstm_test=10**(new_dflstm_test)


In [150]:
new_dflstm_train = train_five_factor_data[list_factor+['LSTM']]
new_dflstm_train[:] = s1.inverse_transform(new_dflstm_train)


In [151]:
new_dflstm_train=10**(new_dflstm_train)


## Make data suitable for calculating metrics.

In [152]:
new_df_test['LSTM']=new_dflstm_test['LSTM']


In [153]:
new_df_train['LSTM']=new_dflstm_train['LSTM']


## Reshaping data for inputing it to ANN

In [154]:
train_five_factor_data_rec = train_five_factor_data.iloc[:,0:col_len].values.reshape(train_five_factor_data.iloc[:,0:col_len].shape[0],1, train_five_factor_data.iloc[:,0:col_len].shape[1])
test_five_factor_data_rec = test_five_factor_data.iloc[:,0:col_len].values.reshape(test_five_factor_data.iloc[:,0:col_len].shape[0],1, test_five_factor_data.iloc[:,0:col_len].shape[1])
train_five_factor_data_y=train_five_factor_data.iloc[:,col_len]
test_five_factor_data_y=test_five_factor_data.iloc[:,col_len]

## Define the regressor with hyperparameter tuning for ANN, and do hyperparameter tuning.

In [155]:
def build_regressor(hp):
    model1 = keras.Sequential()
    for i in range(hp.Int("num",min_value=2, max_value=5, step=1)):
        model1.add(layers.Dense(units=hp.Int("units"+str(i), min_value=16, max_value=256, step=16), activation='tanh'))
        model1.add(layers.Dropout(rate=0.01*hp.Int("rate"+str(i), min_value=30, max_value= 50, step=10)))
    model1.add(layers.Dense(units=1))
    learning_rate = hp.Float("lr", min_value=1e-5, max_value=1e-3, sampling="log")

    model1.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mean_squared_error",
        # Objective is one of the metrics.
        metrics=[keras.metrics.MeanSquaredError()],
    )
    return model1


tuner1 = keras_tuner.RandomSearch(
    hypermodel=build_regressor,
    # The objective name and direction.
    # Name is the f"val_{snake_case_metric_class_name}".
    objective=keras_tuner.Objective("val_mean_squared_error", direction="min"),
    max_trials=15,
    seed=111,
    overwrite=True,
    directory="my_dir",
    project_name="built_in_metrics1",
)

tuner1.search(
    x=train_five_factor_data_rec,
    y=train_five_factor_data_y,
    validation_data=(test_five_factor_data_rec,test_five_factor_data_y),
    callbacks=[tf.keras.callbacks.EarlyStopping("val_mean_squared_error")],
    epochs=100,

)

tuner1.results_summary()


Trial 15 Complete [00h 00m 01s]
val_mean_squared_error: 0.078128881752491

Best val_mean_squared_error So Far: 0.018722105771303177
Total elapsed time: 00h 00m 16s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in my_dir\built_in_metrics1
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x000001B9D900B370>
Trial summary
Hyperparameters:
num: 2
units0: 240
rate0: 40
units1: 144
rate1: 40
lr: 6.681248953196665e-05
units2: 128
rate2: 40
units3: 176
rate3: 40
Score: 0.018722105771303177
Trial summary
Hyperparameters:
num: 4
units0: 112
rate0: 30
units1: 208
rate1: 40
lr: 0.0009022079415210089
units2: 256
rate2: 30
units3: 160
rate3: 40
Score: 0.02348429523408413
Trial summary
Hyperparameters:
num: 3
units0: 224
rate0: 40
units1: 112
rate1: 40
lr: 0.00035797804918054376
units2: 144
rate2: 40
units3: 32
rate3: 40
units4: 96
rate4: 40
Score: 0.029852472245693207
Trial summary
Hyperparameters:
num: 2
units0: 64
rate0: 30
units1: 208
rate1: 40
lr: 0.000146

In [156]:
num_crv = 15

models2 = tuner1.get_best_models(num_models=num_crv)



## Implement 10-fold cross validation.

In [157]:
from sklearn.model_selection import KFold
num_splits = 10
kf1=KFold(n_splits=num_splits)


In [158]:
score1 = pd.DataFrame(index=list(range(num_crv)),columns=list(range(num_splits)))
for i in range(0,num_crv):
    j=0
    for train_index,val_index in kf1.split(train_five_factor_data_rec):
        x_train,x_val=train_five_factor_data_rec[train_index],train_five_factor_data_rec[val_index]
        y_train,y_val=train_five_factor_data.iloc[:,col_len][train_index],train_five_factor_data.iloc[:,col_len][val_index]
        score1.iloc[i,j]=models2[i].evaluate(x_val,y_val)
        j=j+1




In [159]:
arr2 = pd.DataFrame(index=list(range(num_crv)),columns=list(range(num_splits)))
for i in range(0,num_crv):
    for j in range(0,num_splits):
        arr2.iloc[i,j]=np.array(score1)[i][j][1]


In [160]:
arr2['sum']=arr2.sum(axis=1)


In [161]:
arr2['sum'].idxmin()

1

In [162]:
best_model1=models2[arr2['sum'].idxmin()]

## Use the best model to build and fit the model.

In [163]:
best_model1.build()


In [164]:
best_model1.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1, 112)            1344      
                                                                 
 dropout (Dropout)           (None, 1, 112)            0         
                                                                 
 dense_1 (Dense)             (None, 1, 208)            23504     
                                                                 
 dropout_1 (Dropout)         (None, 1, 208)            0         
                                                                 
 dense_2 (Dense)             (None, 1, 256)            53504     
                                                                 
 dropout_2 (Dropout)         (None, 1, 256)            0         
                                                                 
 dense_3 (Dense)             (None, 1, 160)            4

In [165]:
# Early stopping criteria is also used.
best_model1.fit(train_five_factor_data_rec, train_five_factor_data.iloc[:,col_len] ,batch_size = 8, epochs = 500, verbose=1, callbacks=[callback])#batch_size=20 default

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x1b9e1507e50>

## Calculate importance (overall coefficient) of each input on the output.


In [166]:
w_imp=np.zeros(train_five_factor_data_rec.shape[2]) # initiate the array to store importance of input parameters
for k in range(0,train_five_factor_data_rec.shape[2]):
    imp=np.zeros((21,train_five_factor_data_rec.shape[2])) #initiate the array to store values to compute sum
    for j in range(0,21): # run from 0 to 1 in steps of 0.05, 0.05 is used within the loop
        for i in range(0,train_five_factor_data_rec.shape[2]):
            imp[j][k]=j*0.05
    w_imp[k]=sum(best_model1.predict(imp.reshape(21,1,train_five_factor_data_rec.shape[2])))
w_imp/21  




array([0.51834443, 0.44009604, 0.36822135, 0.60512193, 0.44769696,
       0.25119818, 0.45385134, 0.57298797, 0.51821009, 0.36247646,
       0.30650239])

## Predict the output using the best model for ANN.

In [167]:
yc_pred1 = best_model1.predict(test_five_factor_data_rec)



In [168]:
yc_pred1_tr = best_model1.predict(train_five_factor_data_rec)



## Do necessary reshaping.

In [169]:
yc_pred1=yc_pred1.reshape(yc_pred1.shape[0],1)
yc_pred1_tr=yc_pred1_tr.reshape(yc_pred1_tr.shape[0],1)

## Convert array into dataframe.

In [170]:
y_1=pd.DataFrame(yc_pred1,index=list_index_test,columns=['ANN'])
y_2=pd.DataFrame(yc_pred1_tr,index=list_index_train,columns=['ANN'])

In [171]:
test_five_factor_data['ANN']=y_1


In [172]:
train_five_factor_data['ANN']=y_2


## Do inverse transformations.

In [173]:
new_dfclstm_test = test_five_factor_data[list_factor+['ANN']]
new_dfclstm_test[:] = s2.inverse_transform(new_dfclstm_test)


In [174]:
new_dfclstm_test=10**(new_dfclstm_test)


In [175]:
new_dfclstm_train = train_five_factor_data[list_factor+['ANN']]
new_dfclstm_train[:] = s1.inverse_transform(new_dfclstm_train)


In [176]:
new_dfclstm_train=10**(new_dfclstm_train)


In [177]:
new_df_test['ANN']=new_dfclstm_test['ANN']


In [178]:
new_df_train['ANN']=new_dfclstm_train['ANN']


## Compute the root mean square error for comparision of performance of various models used.

In [179]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_lr = sqrt(mean_squared_error(new_df_test['CLOSE'],new_df_test['LR']))
rms_svr = sqrt(mean_squared_error(new_df_test['CLOSE'],new_df_test['SVR']))
rms_lstm = sqrt(mean_squared_error(new_df_test['CLOSE'],new_df_test['LSTM']))
rms_ANN = sqrt(mean_squared_error(new_df_test['CLOSE'],new_df_test['ANN']))
rms_lr, rms_svr, rms_lstm, rms_ANN

(3604.4845739444017, 5053.33639085165, 4355.912249254291, 2983.297934972057)

In [180]:
rms_lr_tr = sqrt(mean_squared_error(new_df_train['CLOSE'],new_df_train['LR']))
rms_svr_tr = sqrt(mean_squared_error(new_df_train['CLOSE'],new_df_train['SVR']))
rms_lstm_tr = sqrt(mean_squared_error(new_df_train['CLOSE'],new_df_train['LSTM']))
rms_ANN_tr = sqrt(mean_squared_error(new_df_train['CLOSE'],new_df_train['ANN']))
rms_lr_tr, rms_svr_tr, rms_lstm_tr, rms_ANN_tr

(1228.738895904747, 463.73322827900915, 1017.5132403389813, 1350.5845824899027)

## Compute the mean absolute percentage error for comparision of performance of various models used.

In [181]:
from sklearn.metrics import mean_absolute_percentage_error
mape_lr = mean_absolute_percentage_error(new_df_test['CLOSE'],new_df_test['LR'])
mape_svr = mean_absolute_percentage_error(new_df_test['CLOSE'],new_df_test['SVR'])
mape_lstm = mean_absolute_percentage_error(new_df_test['CLOSE'],new_df_test['LSTM'])
mape_ANN = mean_absolute_percentage_error(new_df_test['CLOSE'],new_df_test['ANN'])
mape_lr, mape_svr, mape_lstm, mape_ANN

(0.10556097858165692,
 0.1449511782886755,
 0.14825561308364718,
 0.09483705231984074)

In [182]:
mape_lr_tr = mean_absolute_percentage_error(new_df_train['CLOSE'],new_df_train['LR'])
mape_svr_tr = mean_absolute_percentage_error(new_df_train['CLOSE'],new_df_train['SVR'])
mape_lstm_tr = mean_absolute_percentage_error(new_df_train['CLOSE'],new_df_train['LSTM'])
mape_ANN_tr = mean_absolute_percentage_error(new_df_train['CLOSE'],new_df_train['ANN'])
mape_lr_tr, mape_svr_tr, mape_lstm_tr, mape_ANN_tr

(0.08968471540504219,
 0.03206710897065926,
 0.06809542845473876,
 0.10163890699710597)

## Define the MinMaxScalar and use them along with long scaling to scale the data for stage two of the hybrid technique.

In [183]:
s1_2=MinMaxScaler(feature_range=(0,1))
s2_2=MinMaxScaler(feature_range=(0,1))

In [184]:
stage2_test=new_df_test[['LR','SVR','LSTM','ANN','CLOSE']]
stage2_test[:]=s2_2.fit_transform(np.log10(abs(stage2_test)))
stage2_test_data=stage2_test[['LR','SVR','LSTM','ANN']]


In [185]:
stage2_train=new_df_train[['LR','SVR','LSTM','ANN','CLOSE']]
stage2_train[:]=s1_2.fit_transform(np.log10(abs(stage2_train)))
stage2_train_data=stage2_train[['LR','SVR','LSTM','ANN']]


## Shape the dimensions of the data to fit the model requirements.

In [186]:
col_len_2=len(list(stage2_train.columns))-1


In [187]:
train_five_factor_data_2 = stage2_train.iloc[:,0:col_len_2].values.reshape(stage2_train.iloc[:,0:col_len_2].shape[0],1, stage2_train.iloc[:,0:col_len_2].shape[1])
test_five_factor_data_2 = stage2_test.iloc[:,0:col_len_2].values.reshape(stage2_test.iloc[:,0:col_len_2].shape[0],1, stage2_test.iloc[:,0:col_len_2].shape[1])


In [188]:
train_five_factor_data_2_y=stage2_train.iloc[:,col_len_2]
test_five_factor_data_2_y=stage2_test.iloc[:,col_len_2]


## Define the model along with hyperparameter tuning and compile it.

In [189]:
def build_regressor(hp):
    model3 = keras.Sequential()
    for i in range(hp.Int("num",min_value=2, max_value=5, step=1)):
        model3.add(layers.LSTM(units=hp.Int("units"+str(i), min_value=16, max_value=256, step=16), input_dim=col_len_2,return_sequences=True, activation='tanh'))
        model3.add(layers.Dropout(rate=0.01*hp.Int("rate"+str(i), min_value=30, max_value= 50, step=10)))
    model3.add(layers.Dense(units=1))
    learning_rate = hp.Float("lr", min_value=1e-5, max_value=1e-3, sampling="log")

    model3.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mean_squared_error",
        # Objective is one of the metrics.
        metrics=[keras.metrics.MeanSquaredError()],
    )
    return model3


tuner3 = keras_tuner.RandomSearch(
    hypermodel=build_regressor,
    # The objective name and direction.
    # Name is the f"val_{snake_case_metric_class_name}".
    objective=keras_tuner.Objective("val_mean_squared_error", direction="min"),
    max_trials=15,
    seed=111,
    overwrite=True,
    directory="my_dir",
    project_name="built_in_metrics3",
)

tuner3.search(
    x=train_five_factor_data_2,
    y=train_five_factor_data_2_y,
    validation_data=(test_five_factor_data_2,test_five_factor_data_2_y),
    callbacks=[tf.keras.callbacks.EarlyStopping("val_mean_squared_error")],
    epochs=100,

)

tuner3.results_summary()


Trial 15 Complete [00h 00m 04s]
val_mean_squared_error: 0.29043909907341003

Best val_mean_squared_error So Far: 0.0621536485850811
Total elapsed time: 00h 01m 26s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in my_dir\built_in_metrics3
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x000001B9E17338E0>
Trial summary
Hyperparameters:
num: 4
units0: 112
rate0: 30
units1: 208
rate1: 40
lr: 0.0009022079415210089
units2: 256
rate2: 30
units3: 160
rate3: 40
Score: 0.0621536485850811
Trial summary
Hyperparameters:
num: 3
units0: 176
rate0: 40
units1: 208
rate1: 30
lr: 0.0005343634220268847
units2: 176
rate2: 40
units3: 256
rate3: 50
units4: 48
rate4: 50
Score: 0.19090750813484192
Trial summary
Hyperparameters:
num: 3
units0: 32
rate0: 40
units1: 128
rate1: 40
lr: 0.0006906159408436501
units2: 144
rate2: 40
units3: 224
rate3: 40
units4: 176
rate4: 40
Score: 0.21678368747234344
Trial summary
Hyperparameters:
num: 4
units0: 112
rate0: 30
units1: 144
rat

In [190]:
num_crv = 15
models3 = tuner3.get_best_models(num_models=num_crv)

## Perform 10 fold cross validation.

In [191]:
from sklearn.model_selection import KFold
num_splits = 10
kf3=KFold(n_splits=num_splits)


In [192]:
score3 = pd.DataFrame(index=list(range(num_crv)),columns=list(range(num_splits)))
for i in range(0,num_crv):
    j=0
    for train_index,val_index in kf3.split(train_five_factor_data_2):
        x_train,x_val=train_five_factor_data_2[train_index],train_five_factor_data_2[val_index]
        y_train,y_val=train_five_factor_data_2_y[train_index],train_five_factor_data_2_y[val_index]
        score3.iloc[i,j]=models3[i].evaluate(x_val,y_val)
        j=j+1




In [193]:
arr3 = pd.DataFrame(index=list(range(num_crv)),columns=list(range(num_splits)))
for i in range(0,num_crv):
    for j in range(0,num_splits):
        arr3.iloc[i,j]=np.array(score3)[i][j][1]


In [194]:
arr3['sum']=arr3.sum(axis=1)


In [195]:
arr3['sum'].idxmin()

0

## Select the best model after cross validation.

In [196]:
best_model3=models3[arr3['sum'].idxmin()]

In [197]:
best_model3.build()


In [198]:
best_model3.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, None, 112)         52416     
                                                                 
 dropout (Dropout)           (None, None, 112)         0         
                                                                 
 lstm_1 (LSTM)               (None, None, 208)         267072    
                                                                 
 dropout_1 (Dropout)         (None, None, 208)         0         
                                                                 
 lstm_2 (LSTM)               (None, None, 256)         476160    
                                                                 
 dropout_2 (Dropout)         (None, None, 256)         0         
                                                                 
 lstm_3 (LSTM)               (None, None, 160)         2

## Fit the train data with the best model obtained and predict the output.

In [199]:
best_model3.fit(train_five_factor_data_2, train_five_factor_data_2_y ,batch_size = 8, epochs = 500, verbose=1, callbacks=[callback])#batch_size=20 default

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x1b9fd9ec730>

## Importance of LR, SVR, LSTM, ANN predicted values on stage two.

In [200]:
w_imp=np.zeros(train_five_factor_data_2.shape[2]) # initiate the array to store importance of input parameters
for k in range(0,train_five_factor_data_2.shape[2]):
    imp=np.zeros((21,train_five_factor_data_2.shape[2])) #initiate the array to store values to compute sum
    for j in range(0,21): # run from 0 to 1 in steps of 0.05, 0.05 is used within the loop
        for i in range(0,train_five_factor_data_2.shape[2]):
            imp[j][k]=j*0.05
    w_imp[k]=sum(best_model3.predict(imp.reshape(21,1,train_five_factor_data_2.shape[2])))
w_imp/21  




array([-1.39538731e-05,  4.67769986e-01,  8.28707105e-02, -7.45407598e-03])

In [201]:
yc_pred3 = best_model3.predict(test_five_factor_data_2)
yc_pred3_tr = best_model3.predict(train_five_factor_data_2)



## Reshape the data and apply inverse transformations.

In [202]:
yc_pred3=yc_pred3.reshape(yc_pred3.shape[0],1)
yc_pred3_tr=yc_pred3_tr.reshape(yc_pred3_tr.shape[0],1)

In [203]:
pred_test_2=pd.DataFrame(yc_pred3,index=list_index_test,columns=['LSTM_2'])
pred_train_2=pd.DataFrame(yc_pred3_tr,index=list_index_train,columns=['LSTM_2'])

In [204]:
stage2_test_data['LSTM_2']=pred_test_2


In [205]:
stage2_train_data['LSTM_2']=pred_train_2


In [206]:
stage2_test_data[:] = s2_2.inverse_transform(stage2_test_data)
stage2_train_data[:] = s1_2.inverse_transform(stage2_train_data)


In [207]:
stage2_test_data= 10**(stage2_test_data)
stage2_train_data= 10**(stage2_train_data)


In [208]:
stage2_test_data['CLOSE']=new_df_test['CLOSE']
stage2_train_data['CLOSE']=new_df_train['CLOSE']

## Calculating the metrics.

In [209]:
rms_lr = sqrt(mean_squared_error(stage2_test_data['CLOSE'],stage2_test_data['LR']))
rms_svr = sqrt(mean_squared_error(stage2_test_data['CLOSE'],stage2_test_data['SVR']))
rms_lstm = sqrt(mean_squared_error(stage2_test_data['CLOSE'],stage2_test_data['LSTM']))
rms_ANN = sqrt(mean_squared_error(stage2_test_data['CLOSE'],stage2_test_data['ANN']))
rms_lstm_2 = sqrt(mean_squared_error(stage2_test_data['CLOSE'],stage2_test_data['LSTM_2']))
rms_lr, rms_svr, rms_lstm, rms_ANN, rms_lstm_2

(3604.4845739444017,
 5053.33639085165,
 4355.912249254293,
 2983.297934972057,
 5430.5557103612855)

In [210]:
rms_lr_tr = sqrt(mean_squared_error(stage2_train_data['CLOSE'],stage2_train_data['LR']))
rms_svr_tr = sqrt(mean_squared_error(stage2_train_data['CLOSE'],stage2_train_data['SVR']))
rms_lstm_tr = sqrt(mean_squared_error(stage2_train_data['CLOSE'],stage2_train_data['LSTM']))
rms_ANN_tr = sqrt(mean_squared_error(stage2_train_data['CLOSE'],stage2_train_data['ANN']))
rms_lstm_2_tr = sqrt(mean_squared_error(stage2_train_data['CLOSE'],stage2_train_data['LSTM_2']))
rms_lr_tr, rms_svr_tr, rms_lstm_tr, rms_ANN_tr, rms_lstm_2_tr

(1228.7388959047469,
 463.7332282790089,
 1017.5132403389815,
 1350.5845824899025,
 485.5298000768504)

In [211]:
mape_lr = mean_absolute_percentage_error(stage2_test_data['CLOSE'],stage2_test_data['LR'])
mape_svr = mean_absolute_percentage_error(stage2_test_data['CLOSE'],stage2_test_data['SVR'])
mape_lstm = mean_absolute_percentage_error(stage2_test_data['CLOSE'],stage2_test_data['LSTM'])
mape_ANN = mean_absolute_percentage_error(stage2_test_data['CLOSE'],stage2_test_data['ANN'])
mape_lstm_2 = mean_absolute_percentage_error(stage2_test_data['CLOSE'],stage2_test_data['LSTM_2'])
mape_lr, mape_svr, mape_lstm, mape_ANN, mape_lstm_2


(0.10556097858165692,
 0.1449511782886755,
 0.1482556130836472,
 0.09483705231984074,
 0.16078206127805575)

In [212]:
mape_lr_tr = mean_absolute_percentage_error(stage2_train_data['CLOSE'],stage2_train_data['LR'])
mape_svr_tr = mean_absolute_percentage_error(stage2_train_data['CLOSE'],stage2_train_data['SVR'])
mape_lstm_tr = mean_absolute_percentage_error(stage2_train_data['CLOSE'],stage2_train_data['LSTM'])
mape_ANN_tr = mean_absolute_percentage_error(stage2_train_data['CLOSE'],stage2_train_data['ANN'])
mape_lstm_2_tr = mean_absolute_percentage_error(stage2_train_data['CLOSE'],stage2_train_data['LSTM_2'])
mape_lr_tr, mape_svr_tr, mape_lstm_tr, mape_ANN_tr, mape_lstm_2_tr

(0.08968471540504218,
 0.032067108970659224,
 0.06809542845473879,
 0.10163890699710597,
 0.03380193673774823)