In [None]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
plt.style.use('fivethirtyeight')

from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

np.random.seed(7)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.preprocessing.sequence import TimeseriesGenerator
!pip install livelossplot
from livelossplot.keras import PlotLossesCallback

## Data Preparation
Reading the market data of BAJAJFINSV stock and preparing a training dataset and validation dataset.

In [None]:
# Load the BAJAJFINSV price history and index the rows by the "Date" column,
# keeping "Date" available as an ordinary column too (drop=False).
df = pd.read_csv("/kaggle/input/nifty50-stock-market-data/BAJAJFINSV.csv")
df = df.set_index("Date", drop=False)

In [None]:
# Keep only the date and the price columns; every other column is discarded here.
df = df[['Date', 'Prev Close', 'Open', 'High', 'Low', 'Last', 'Close']]
df.head()

Plotting the target variable **Close** over time

In [None]:
# Line plot of the target column against the Date index.
df["Close"].plot(figsize=(14, 7))

In [None]:
# Sizes: 20% of the rows for validation (cv), 20% for test, the remainder for train
num_cv = int(0.2*len(df))
num_test = int(0.2*len(df))
num_train = len(df) - num_cv - num_test
print("num_train = " + str(num_train))
print("num_cv = " + str(num_cv))
print("num_test = " + str(num_test))

# Positional split in file order (no shuffling); only Date and Close are kept.
# train_cv is train followed by cv.
train = df[['Date', 'Close']].iloc[:num_train]
cv = df[['Date', 'Close']].iloc[num_train:num_train+num_cv]
train_cv = df[['Date', 'Close']].iloc[:num_train+num_cv]
test = df[['Date', 'Close']].iloc[num_train+num_cv:]

print("train.shape = " + str(train.shape))
print("cv.shape = " + str(cv.shape))
print("train_cv.shape = " + str(train_cv.shape))
print("test.shape = " + str(test.shape))

In [None]:
def get_x_y(data, N, offset):
    """
    Split data into x (features) and y (target).

    For every index i in range(offset, len(data)) the window data[i-N:i]
    becomes one sample of x and data[i] becomes the matching entry of y.

    Inputs
        data   : sequence or array supporting len(), indexing and slicing
        N      : number of preceding observations used as features
        offset : first index of data to emit a target for; must be >= N
    Outputs
        x      : np.array with one N-long window per target
        y      : np.array with the targets
    Raises
        ValueError : if offset < N. The slice start i-N would be negative, so
                     the window would be empty or wrap around to the end of
                     data instead of holding the N preceding observations.
    """
    if offset < N:
        raise ValueError(
            "offset (%d) must be >= N (%d) so that every target has N preceding observations"
            % (offset, N)
        )

    indices = range(offset, len(data))
    x = np.array([data[i-N:i] for i in indices])
    y = np.array([data[i] for i in indices])

    return x, y

In [None]:
# Window length: how many preceding scaled closes make up one input sample.
# The training cell also passes it as the `offset` argument of get_x_y.
N=9

In [None]:
# Fit the min-max scaler on the training closes only, then scale them to [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
train_scaled = scaler.fit_transform(train[['Close']].to_numpy())

print("scaler.data_min_ = " + str(scaler.data_min_))
print("scaler.data_max_ = " + str(scaler.data_max_))

# Sliding windows of N scaled closes (x) and the close that follows each window (y)
x_train, y_train = get_x_y(train_scaled, N, N)
print("x_train.shape = " + str(x_train.shape))
print("y_train.shape = " + str(y_train.shape))

In [None]:
# Scale train+cv with the scaler that was fitted on the train set only
train_cv_scaled = scaler.transform(train_cv[['Close']].to_numpy())

# Targets start at index num_train, i.e. at the first cv row; the leading
# windows therefore reach back into the end of the train rows.
x_cv, y_cv = get_x_y(train_cv_scaled, N, num_train)
print("x_cv.shape = " + str(x_cv.shape))
print("y_cv.shape = " + str(y_cv.shape))

In [None]:
# A second scaler, fitted on train+cv, intended for the final model
scaler_final = MinMaxScaler(feature_range=(0, 1))
train_cv_scaled_final = scaler_final.fit_transform(train_cv[['Close']].to_numpy())

print("scaler_final.data_min_ = " + str(scaler_final.data_min_))
print("scaler_final.data_max_ = " + str(scaler_final.data_max_))

# The test closes are scaled with the min and max learned from train+cv
test_scaled = scaler_final.transform(test[['Close']].to_numpy())

In [None]:
# Baseline network: two stacked 50-unit LSTM layers, each followed by 30% dropout,
# and a single-output dense layer. One input feature per time step.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    Dropout(0.3),
    LSTM(units=50),
    Dropout(0.3),
    Dense(1),
])

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=1,
    callbacks=[PlotLossesCallback()],
)

In [None]:
def get_mape(y_true, y_pred):
    """
    Mean absolute percentage error (MAPE) of y_pred against y_true, in percent.

    Both inputs are converted with np.array; the error is taken relative to y_true.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

In [None]:
# Bring the validation targets back to the original price scale
y_cv_inv = scaler.inverse_transform(y_cv)

# Predict on the validation windows and undo the scaling in the same way
est = model.predict(x_cv)
est_inv = scaler.inverse_transform(est)

# RMSE, in price units
rmse_bef_tuning = sqrt(mean_squared_error(y_cv_inv, est_inv))
print("RMSE = %0.3f" % rmse_bef_tuning)

# MAPE, in percent
mape_pct_bef_tuning = get_mape(y_cv_inv, est_inv)
print("MAPE = %0.3f%%" % mape_pct_bef_tuning)

In [None]:
from pylab import rcParams

In [None]:
# Overlay the validation-set predictions on the train / dev / test close prices
rcParams['figure.figsize'] = 10, 8 # width 10, height 8

# Predictions and actuals side by side, dated with the cv rows
est_df = pd.DataFrame({'est_inv': est_inv.ravel(),
                       'y_cv_inv': y_cv_inv.ravel(),
                       'Date': cv['Date']})

# Every series is drawn on the axes created by the first plot call
ax = train.plot(x='Date', y='Close', style='b-', grid=True)
cv.plot(x='Date', y='Close', style='y-', grid=True, ax=ax)
test.plot(x='Date', y='Close', style='g-', grid=True, ax=ax)
est_df.plot(x='Date', y='est_inv', style='r-', grid=True, ax=ax)

ax.legend(['train', 'dev', 'test', 'est'])
ax.set_xlabel("Date")
ax.set_ylabel("Close")

In [None]:
import time
import math
from tqdm import tqdm_notebook

In [None]:
def train_pred_eval_model(x_train_scaled,
                          y_train_scaled,
                          x_cv_scaled,
                          y_cv_scaled,
                          scaler,
                          lstm_units=50,
                          dropout_prob=0.5,
                          optimizer='adam',
                          epochs=1,
                          batch_size=1):
    '''
    Build and fit a two-layer LSTM, predict on the validation inputs, map
    predictions and targets back to the original scale and score them.
    Inputs
        x_train_scaled  : scaled training windows, e.g. shape (451, 9, 1) when the
                          past 9 values are used to predict the next one
        y_train_scaled  : scaled training targets, e.g. shape (451, 1)
        x_cv_scaled     : scaled windows to predict on
        y_cv_scaled     : scaled actual values for those windows
        scaler          : fitted scaler; its inverse_transform is applied to both
                          the predictions and y_cv_scaled
        lstm_units      : units in each of the two LSTM layers
        dropout_prob    : dropout rate applied after each LSTM layer
        optimizer       : optimizer passed to model.compile
        epochs          : epochs passed to model.fit
        batch_size      : batch size passed to model.fit
    Outputs
        rmse            : root mean square error, on the original scale
        mape            : mean absolute percentage error, in percent
        est             : predictions on the original scale
    '''
    # One input feature per time step; the window length comes from the data
    timesteps = x_train_scaled.shape[1]
    network = Sequential([
        LSTM(units=lstm_units, return_sequences=True, input_shape=(timesteps, 1)),
        Dropout(dropout_prob),
        LSTM(units=lstm_units),
        Dropout(dropout_prob),
        Dense(1),
    ])

    # Fit silently (verbose=0)
    network.compile(loss='mean_squared_error', optimizer=optimizer)
    network.fit(x_train_scaled, y_train_scaled, epochs=epochs, batch_size=batch_size, verbose=0)

    # Predictions and actuals, both mapped back through the same scaler
    est = scaler.inverse_transform(network.predict(x_cv_scaled))
    y_cv = scaler.inverse_transform(y_cv_scaled)

    rmse = math.sqrt(mean_squared_error(y_cv, est))
    mape = get_mape(y_cv, est)

    return rmse, mape, est

In [None]:
# Sweep the window length N over 2..59: a fresh model is trained for each value
# and its validation RMSE / MAPE are recorded.
param_label = 'N'
param_list = range(2, 60)

error_rate = {param_label: [], 'rmse': [], 'mape_pct': []}
tic = time.time()
for param in tqdm_notebook(param_list):

    # Rebuild the train and cv windows for this window length
    x_train_scaled, y_train_scaled = get_x_y(train_scaled, param, param)
    x_cv_scaled, y_cv_scaled = get_x_y(train_cv_scaled, param, num_train)

    # Train, predict and evaluate; the predictions themselves are not kept
    rmse, mape, _ = train_pred_eval_model(
        x_train_scaled,
        y_train_scaled,
        x_cv_scaled,
        y_cv_scaled,
        scaler,
        lstm_units=50,
        dropout_prob=0.3,
        optimizer='adam',
        epochs=10,
        batch_size=1,
    )

    # One entry per tried value in each result column
    error_rate[param_label].append(param)
    error_rate['rmse'].append(rmse)
    error_rate['mape_pct'].append(mape)

# From here on error_rate is a DataFrame with one row per tried N
error_rate = pd.DataFrame(error_rate)
toc = time.time()
print("Minutes taken = " + str((toc-tic)/60.0))
error_rate

In [None]:
# RMSE (blue) and MAPE (red) against the window length N, on shared axes
rcParams['figure.figsize'] = 10, 8 # width 10, height 8

ax = error_rate.plot(x='N', y='rmse', style='bx-', grid=True)
error_rate.plot(x='N', y='mape_pct', style='rx-', grid=True, ax=ax)
ax.set_xlabel("N")
ax.set_ylabel("RMSE/MAPE(%)")

In [None]:
# Rows whose RMSE equals the minimum; the first of them supplies the optimum N
lowest_rmse = error_rate['rmse'].min()
temp = error_rate.loc[error_rate['rmse'] == lowest_rmse]
N_opt = temp['N'].values[0]
print("min RMSE = %0.3f" % error_rate['rmse'].min())
print("min MAPE = %0.3f%%" % error_rate['mape_pct'].min())
print("optimum " + param_label + " = " + str(N_opt))

## Feature Engineering
Rolling statistics (mean and standard deviation) of the High, Low and Open prices are computed over three look-back windows of 3, 5 and 7 rows. Each statistic is shifted by one row, so the features of a given day use only values from earlier days.

In [None]:
# Rolling mean / std features for each column in lag_features over three window sizes.
# A positional index is used while building them so the new columns line up row by row.
df.reset_index(drop=True, inplace=True)
lag_features = ["High", "Low", "Open"]
window1 = 3
window2 = 5
window3 = 7

# min_periods=0 lets the windows at the start of the frame use however many rows exist
df_rolled_3d = df[lag_features].rolling(window=window1, min_periods=0)
df_rolled_5d = df[lag_features].rolling(window=window2, min_periods=0)
df_rolled_7d = df[lag_features].rolling(window=window3, min_periods=0)

# shift(1) moves every statistic down one row, so a row only carries statistics
# computed from the rows before it.
df_mean_3d = df_rolled_3d.mean().shift(1).reset_index().astype(np.float32)
df_mean_5d = df_rolled_5d.mean().shift(1).reset_index().astype(np.float32)
df_mean_7d = df_rolled_7d.mean().shift(1).reset_index().astype(np.float32)

df_std_3d = df_rolled_3d.std().shift(1).reset_index().astype(np.float32)
df_std_5d = df_rolled_5d.std().shift(1).reset_index().astype(np.float32)
df_std_7d = df_rolled_7d.std().shift(1).reset_index().astype(np.float32)


# Column names follow "<feature>_<stat>_lag<window>"
for feature in lag_features:
    df[f"{feature}_mean_lag{window1}"] = df_mean_3d[feature]
    df[f"{feature}_mean_lag{window2}"] = df_mean_5d[feature]
    df[f"{feature}_mean_lag{window3}"] = df_mean_7d[feature]

    df[f"{feature}_std_lag{window1}"] = df_std_3d[feature]
    df[f"{feature}_std_lag{window2}"] = df_std_5d[feature]
    df[f"{feature}_std_lag{window3}"] = df_std_7d[feature]

# The shift (and std over a single row) leaves NaNs in the first rows; they are
# replaced by the column mean. numeric_only=True is required because "Date" is a
# non-numeric column here, and DataFrame.mean() raises on it in pandas >= 2.0.
# NOTE(review): the column mean is taken over all rows, including ones that are
# later used for testing.
df.fillna(df.mean(numeric_only=True), inplace=True)

df.set_index("Date", drop=False, inplace=True)
df.head()

In [None]:
# Calendar features derived from the parsed Date column
df.Date = pd.to_datetime(df.Date, format="%Y-%m-%d")
df["month"] = df.Date.dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week
# gives the ISO week number. It is cast to int64, the dtype dt.week used to return.
df["week"] = df.Date.dt.isocalendar().week.astype("int64")
df["day"] = df.Date.dt.day
df["day_of_week"] = df.Date.dt.dayofweek
df.head()

Splitting the data into train and validation along with features.     
* **train:** Data from 26th May, 2008 to 31st December, 2018.
* **valid:** Data from 1st January, 2019 to 31st December, 2019.

In [None]:
# Model inputs: the rolling mean / std columns built in the feature-engineering cell
# (High, Low and Open over windows of 3, 5 and 7 rows) plus the calendar features.
# The previous list named Volume_* and "Prev Close_*" columns that no cell creates,
# so selecting them from df raised a KeyError.
exogenous_features = [
    f"{column}_{stat}_lag{window}"
    for column in ("High", "Low", "Open")
    for stat in ("mean", "std")
    for window in (3, 5, 7)
] + ['month', 'week', 'day', 'day_of_week']

In [None]:
# Feature columns followed by the target; Close is deliberately the last column
values = df[exogenous_features + ["Close"]].astype('float32')

In [None]:
# Rebinds `scaler` (previously the close-only scaler) to one fitted on every
# column of `values`, then scales all columns to [0, 1].
# NOTE(review): the fit uses all rows, including those later assigned to the test split.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit(values).transform(values)

In [None]:
# Sanity check: dimensions of the scaled array
scaled.shape

In [None]:
# Positional split at row 2772. `train` and `test` are rebound here from the
# earlier DataFrames to slices of the scaled array.
train, test = scaled[:2772], scaled[2772:]
# Inputs are every column but the last; the target is the last column
train_X, test_X = train[:, :-1], test[:, :-1]
train_y, test_y = train[:, -1], test[:, -1]

In [None]:
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

In [None]:
# Single LSTM layer over the engineered features, followed by dropout and two dense layers
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(20))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')
# The test split doubles as validation data; batches are kept in row order (shuffle=False)
model.fit(
    train_X,
    train_y,
    epochs=100,
    batch_size=10,
    validation_data=(test_X, test_y),
    callbacks=[PlotLossesCallback()],
    shuffle=False,
)

In [None]:
# Alternative architecture: two stacked LSTM layers (50 and 100 units) with dropout
# and one linear output, taking sequences of any length with one feature per step.
# It is bound to its own name. It is neither compiled nor fitted, and rebinding
# `model` here would replace the trained network that the following prediction
# cell relies on.
stacked_model = Sequential()

# units / input_shape replace the Keras 1 arguments output_dim / input_dim
stacked_model.add(LSTM(
    units=50,
    input_shape=(None, 1),
    return_sequences=True))
stacked_model.add(Dropout(0.2))

stacked_model.add(LSTM(
    100,
    return_sequences=False))
stacked_model.add(Dropout(0.2))

# The linear activation is set on the Dense layer itself, so the separate
# Activation layer (never imported in this notebook) is not needed.
stacked_model.add(Dense(
    units=1,
    activation='linear'))

In [None]:
# make a prediction
yhat = model.predict(test_X)

# 2D view of the inputs [samples, features]. A separate name is used so that
# test_X keeps its 3D shape and this cell can be re-run.
test_X_2d = test_X.reshape((test_X.shape[0], -1))

# invert scaling for forecast
# The scaler was fitted on [features..., Close], so the prediction has to sit in
# the LAST column, next to all of the feature columns, before inverse_transform.
inv_yhat = np.concatenate((test_X_2d, yhat), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, -1]

# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_X_2d, test_y), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, -1]

# calculate RMSE on the original Close scale
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [None]:
# NOTE(review): `test_generator` is not defined in any visible cell, so this fails on a
# fresh kernel. Presumably it is a TimeseriesGenerator built elsewhere; TODO confirm.
test_predictions = model.predict(test_generator)
test_predictions.shape

In [None]:
# Drops the last 6 rows of df_test. Not idempotent: every re-run removes 6 more rows.
# NOTE(review): `df_test` is not defined in any visible cell; TODO confirm its origin.
df_test = df_test[:-6]

In [None]:
# Shape of df_test after trimming, to compare with the number of predictions
df_test.shape

In [None]:
# Shape of the predictions with the last one dropped
test_predictions[:-1].shape

In [None]:
# Keep the first element of every prediction row, giving a flat Python list.
# Rebinds test_predictions, so this cell must not be run twice.
test_predictions = [row[0] for row in test_predictions]

In [None]:
# Displays the whole prediction list.
# NOTE(review): consider showing only a slice (e.g. the first few values).
test_predictions

In [None]:
# Shape of df_test, shown again next to the prediction count below
df_test.shape

In [None]:
# Length of the prediction Series (last prediction dropped)
pd.Series(test_predictions[:-1]).shape

In [None]:
# The new Series has a default 0..n-1 index, and assigning a Series aligns on index labels.
# NOTE(review): if df_test is indexed by anything else (e.g. Date), this column
# ends up all-NaN. Verify df_test's index, which no visible cell defines.
df_test['Forecast_LSTM'] = pd.Series(test_predictions[:-1])

In [None]:
df_test[["Close"]].plot(figsize=(14, 7))

In [None]:
print("RMSE of LSTM:", np.sqrt(mean_squared_error(df_test.Close, df_test.Forecast_LSTM)))
print("\nMAE of LSTM:", mean_absolute_error(df_test.Close, df_test.Forecast_LSTM))

Trying to predict the next 3 values

In [None]:
# Targets for 3-step prediction: row k holds the closes at positions k, k+1 and k+2,
# so each array has len(closes) - 2 rows.
train_close = df_train['Close'].tolist()
train_target2 = np.array([train_close[start:start+3] for start in range(len(train_close)-2)])

test_close = df_test['Close'].tolist()
test_target2 = np.array([test_close[start:start+3] for start in range(len(test_close)-2)])

In [None]:
# Drop the last 2 rows; train_target2 is also 2 rows shorter than the close series.
# NOTE(review): `train_dataset` is not defined in any visible cell; TODO confirm.
train_dataset2 = train_dataset[:-2]

In [None]:
# Drop the last 4 rows of the test inputs.
# NOTE(review): `test_dataset` is not defined in any visible cell, and the reason for
# 4 here versus 2 for the train inputs is not visible. Verify that the length matches
# test_target2.
test_dataset2 = test_dataset[:-4]

In [None]:
# use last 2 values
train_generator2 = TimeseriesGenerator(train_dataset2, train_target2, length=2, batch_size=1)

# use last 2 values
test_generator2 = TimeseriesGenerator(test_dataset2, test_target2, length=2, batch_size=1)

In [None]:
# 3-output network: one 200-unit LSTM over windows of 2 time steps, dropout, and a
# dense layer with one output per predicted value.
# NOTE(review): input_shape hard-codes 28 features per step; verify it against the
# width of train_dataset2.
model = Sequential()
model.add(LSTM(200, activation='relu', input_shape=(2, 28)))
model.add(Dropout(0.15))
model.add(Dense(3))

model.compile(optimizer='adam', loss='mse')
model.fit(
    train_generator2,
    epochs=90,
    callbacks=[PlotLossesCallback()],
)