In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Long Short Term Memory [LSTM]:**
**A Recurrent Neural Network (RNN) is a machine learning model that is well suited to problems
involving sequential data. An RNN carries information about the previous data points forward, and
its weights are optimized during training so that those earlier points inform the prediction of the
next data point. Because of this feature we use Recurrent Neural Network models in time series
problems such as stock market prediction and market price prediction, and in Natural Language
Processing, since in these tasks past input data has a high influence on future outputs.**

**The whole idea behind training a Recurrent Neural Network is to modify the weight values, which is
achieved by weight optimization techniques. To calculate the loss gradient for the weight update we
have to use the chain rule across time steps, which can sometimes lead to the vanishing gradient
problem when there are long-term dependencies. In order to mitigate the vanishing gradient problem
we use an LSTM, whose gates use the sigmoid function to forget the inputs which are less
important.**



**I will take our dataset from the training stock prices file, which has about 12 columns. Among those columns we only need the data from the close column as input, as these values are likely to be significant in predicting future values.**

In [None]:
dataset = "/kaggle/input/jpx-tokyo-stock-exchange-prediction"
train_file = "train_files/stock_prices.csv"
train_all_file = os.path.join(dataset, train_file)

**I will read the csv file from the given path.**

In [None]:
df = pd.read_csv(train_all_file)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
len(df)

**As can clearly be seen, there are almost 2400000 stock price rows.**

In [None]:
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
plt.plot(df['Close'])

**The graph seen above is quite incomprehensible.**

**We need to calculate the adjusted close price by multiplying the cumulative adjustment factor by the "Close" value.**

In [None]:

from decimal import ROUND_HALF_UP, Decimal


def adjusted_price(price):
    """Add a split-adjusted close price to a stock price table.

    Args:
        price: DataFrame with at least the columns "Date" (parsed with the
            format %Y-%m-%d), "SecuritiesCode", "Close" and
            "AdjustmentFactor". The frame passed in is left untouched.

    Returns:
        A new DataFrame sorted by "SecuritiesCode" then "Date" and indexed by
        "Date", with two extra columns:
        "CumulativeAdjustmentFactor" (product of the adjustment factors from
        the newest row back to the current row, per security) and
        "AdjustedClose" (that factor times "Close", rounded half-up to one
        decimal; zeros are treated as missing and forward-filled).
    """

    def _round_half_up(value):
        # Decimal gives true half-up rounding; the built-in round() would
        # round halves to the nearest even digit instead.
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    def generated_close(df):
        # Accumulate factors from the most recent row backwards so that the
        # latest prices keep a factor built only from newer adjustments.
        newest_first = df.sort_values("Date", ascending=False)
        cumulative = newest_first["AdjustmentFactor"].cumprod()
        adjusted = (cumulative * newest_first["Close"]).map(_round_half_up)
        oldest_first = newest_first.assign(
            CumulativeAdjustmentFactor=cumulative,
            AdjustedClose=adjusted,
        ).sort_values("Date")
        # A price of exactly 0 is treated as "no data" and replaced by the
        # previous available adjusted close.
        filled = oldest_first["AdjustedClose"].mask(
            oldest_first["AdjustedClose"] == 0, np.nan
        ).ffill()
        return oldest_first.assign(AdjustedClose=filled)

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # and the "Date" column reliably ends up with a datetime dtype.
    prepared = price.assign(
        Date=pd.to_datetime(price["Date"], format="%Y-%m-%d")
    ).sort_values(["SecuritiesCode", "Date"])

    # Iterating over the groups (instead of groupby.apply) keeps the
    # "SecuritiesCode" column in every pandas version.
    per_security = [
        generated_close(group) for _, group in prepared.groupby("SecuritiesCode")
    ]
    if not per_security:
        # No rows / no groups: still return a frame with the expected columns.
        per_security = [generated_close(prepared)]

    return pd.concat(per_security, ignore_index=True).set_index("Date")

**I have added a new column, the adjusted close value, which will be used as the input to train my neural network.**

In [None]:
df = adjusted_price(df)

In [None]:
df.head()

In [None]:
len(df)

In [None]:
df.info()

In [None]:
plt.plot(df["AdjustedClose"])
plt.ylabel("Adjusted_Close_Value")
plt.xlabel("Year")
plt.show()

In [None]:
len(df["AdjustedClose"])

**To make the graph clearer and to show the rise and fall of the stock prices, I have taken only the first 100 adjusted close prices. The graph can be seen below.**

In [None]:
dataset = df["AdjustedClose"]
len(dataset)

In [None]:
df1 = dataset[:100]

In [None]:
len(df1)

In [None]:
df1

**I have chosen 100 data points to influence the future stock prices.** **It is clearly seen that the number of values is more than is practical, as training on all of the data might strain the computer. So we need to reduce the values to a more manageable amount and then divide them into a train set and a test set.**

In [None]:
df1.isnull().sum()

**Importing the important Packages, that are needed.**

In [None]:
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, Activation, Dropout
import seaborn as sns

In [None]:
plt.plot(df1)
plt.ylabel("AdjustedClose_Values")
plt.xlabel("Year")
plt.show()

**Since the dataset is very large, fitting and running the whole model on all of these close values would be a straining and time-consuming task. So I have taken only the first 200000 values for my model training process.**

In [None]:
df2 = dataset[:200000]
len(df2)

In [None]:
df2

In [None]:
df2.plot(figsize=(12, 4), legend=True, linestyle="--", marker="o")

In [None]:
sns.displot(df2, bins=100, color='magenta')

**We keep the whole values between 0 and 1, by using the MinMaxScaler**


In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the selected prices into the range [0, 1].
scaler = MinMaxScaler(feature_range=(0,1))
# The values are reshaped into a single-column 2-D array before scaling.
# NOTE(review): the scaler is fitted on all of df2, i.e. before the train/test
# split done in a later cell, so the test rows influence the min/max used for
# training — consider fitting on the training portion only.
df2 = scaler.fit_transform(np.array(df2).reshape(-1, 1))

In [None]:
print(df2)

In [None]:
# Positional split without shuffling: the first 75% of the rows are used for
# training and the remaining rows for testing.
n_rows = len(df2)
train_size = int(n_rows*0.75)
test_size = n_rows - train_size
train_data = df2[:train_size, :]
test_data = df2[train_size:n_rows, :1]

**Here I have taken the time step as 100: the model is trained by taking 100 consecutive inputs and using the 101st value as the output, and this window slides across the 200000 data points. The model is built this way to predict the stock close value at the next time step.**

In [None]:
def create_dataset(dataset, time_step=1):
    """Turn a single-column series into supervised (window, next value) pairs.

    Args:
        dataset: 2-D array-like whose first column holds the series values.
        time_step: Number of consecutive values in each input window.

    Returns:
        A tuple (dataX, dataY) of numpy arrays. dataX has shape
        (n_samples, time_step); dataY has shape (n_samples,) and holds the
        value that immediately follows each window.
        n_samples is len(dataset) - time_step - 1, floored at 0.

    NOTE(review): the "- 1" means the last possible window is never emitted.
    It is kept on purpose so the number of samples stays what the rest of the
    notebook expects.
    """
    values = np.asarray(dataset)
    n_samples = max(len(values) - time_step - 1, 0)

    if n_samples == 0:
        # Too little data for a single window: return correctly shaped empty
        # arrays so a later reshape to (samples, time_step, 1) still works.
        return (
            np.empty((0, time_step), dtype=values.dtype),
            np.empty((0,), dtype=values.dtype),
        )

    series = values[:, 0]
    dataX = np.array([series[start:start + time_step] for start in range(n_samples)])
    dataY = np.array(series[time_step:time_step + n_samples])
    return dataX, dataY

In [None]:
time_step = 100
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

In [None]:
print(X_train.shape)

In [None]:
print(y_train.shape)

In [None]:
print(X_test.shape)

In [None]:
print(X_train)

In [None]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [None]:
X_train.shape

**A stacked (multi-layer) LSTM is used, with the layers in the order shown below.**

In [None]:
# Stacked LSTM regressor: three recurrent layers followed by a small dense
# head that outputs a single value (the next scaled price).
model_1 = Sequential([
    # The input is one window of `time_step` values with a single feature,
    # matching the (samples, time_step, 1) reshape applied to X_train, so the
    # model stays consistent if the window length is changed.
    LSTM(units=100, return_sequences=True, input_shape=(time_step, 1)),
    Dropout(0.2),
    LSTM(100, return_sequences=True),
    Dropout(0.1),
    # return_sequences=True keeps the per-step outputs; Flatten() below
    # collapses them into one vector for the dense layers.
    LSTM(50, return_sequences=True),
    Dropout(0.2),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.2),
    Dense(25, activation="relu"),
    Dropout(0.1),
    Dense(1)
])

In [None]:
model_1.summary()

In [None]:
from tensorflow.keras.utils import plot_model
plot_model(model_1, 'model_1.jpg', show_shapes=True)

In [None]:
model_1.compile(optimizer="adam", loss="mean_squared_error")

In [None]:
history_1 = model_1.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

In [None]:
loss = model_1.evaluate(X_test, y_test)

In [None]:
import pandas as pd
pd.DataFrame(history_1.history).plot(figsize=(10, 7))

**The loss obtained from the given model is as shown below.**

In [None]:
# `loss` is the mean squared error returned by model_1.evaluate on the
# min-max-scaled targets; it is not a percentage, so report it as-is.
print(f"Test loss (MSE on scaled values): {loss:.6f}")

**The loss calculated above, 0.0119 when multiplied by 100, is a mean squared error on the min-max-scaled values rather than a percentage; it is quite low.**

In [None]:
train_predict = model_1.predict(X_train)
test_predict = model_1.predict(X_test)


**I predicted the stock market values using the model as shown above, and converted them back to the original scale with the inverse transform of the MinMaxScaler.**

In [None]:
train_p = scaler.inverse_transform(train_predict) 
test_p = scaler.inverse_transform(test_predict)

**I calculated the root mean squared error (RMSE) between the predicted values and the real values, and found an RMSE of 2705.614 for the train set and 3368.504 for the test set.**

In [None]:
import math
from sklearn.metrics import mean_squared_error

# y_train is still min-max scaled while train_p has been inverse-transformed,
# so convert the targets back to price units before computing the RMSE.
y_train_actual = scaler.inverse_transform(y_train.reshape(-1, 1))
math.sqrt(mean_squared_error(y_train_actual, train_p))

In [None]:
import math
from sklearn.metrics import mean_squared_error

# y_test is still min-max scaled while test_p has been inverse-transformed,
# so convert the targets back to price units before computing the RMSE.
y_test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))
math.sqrt(mean_squared_error(y_test_actual, test_p))

**Now, I have to test our model on test stock market. I have also calculated the adjusted close price of the given test dataset.**

In [None]:
dataset_test = "/kaggle/input/jpx-tokyo-stock-exchange-prediction"
test_file = "example_test_files/stock_prices.csv"
test_all_file = os.path.join(dataset_test, test_file)

In [None]:
df_t = pd.read_csv(test_all_file)
df_t.head()

In [None]:
df_t.tail()

In [None]:
len(df_t)

In [None]:
plt.plot(df_t['Close'])

In [None]:
df_t = adjusted_price(df_t)

In [None]:
df_t.info()

In [None]:
plt.plot(df_t["AdjustedClose"])
plt.xlabel("Year")
plt.ylabel("Adjusted_Close_Value")
plt.show()

**The graph is quite incomprehensible, so I have taken only the first 1000 values for prediction.**

In [None]:
df_t_1 = df_t[:1000]

In [None]:
df_t_1 = df_t_1.dropna()

In [None]:
df3 = df_t["AdjustedClose"][:1000]
len(df3)


In [None]:
df3

In [None]:
len(df3)

In [None]:
df3.isnull().sum()

In [None]:
df3.plot(figsize=(12, 4), legend=True, linestyle="--", marker="o")

In [None]:
sns.displot(df3, bins=100, color="magenta")

In [None]:
df3

In [None]:
df3 = df3.dropna()

In [None]:
len(df3)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the hold-out prices with the scaler that was fitted for training
# rather than fitting a new one: the model was trained on values in that
# scale, and re-creating `scaler` here would also overwrite the object used
# for the earlier inverse transforms.
df3 = scaler.transform(np.array(df3).reshape(-1, 1))

In [None]:
time_step = 100
X_test_1, y_test_1 = create_dataset(df3, time_step)
X_test_1

In [None]:
X_test_1.shape

In [None]:
X_test_1 = X_test_1.reshape(X_test_1.shape[0], X_test_1.shape[1], 1)

In [None]:
X_test_1.shape

In [None]:
test_predict_1 = model_1.predict(X_test_1)

In [None]:
test_p_1 = scaler.inverse_transform(test_predict_1)

In [None]:
# Make the targets a single column; -1 lets numpy infer the number of rows
# instead of hard-coding the sample count of one particular run.
y_test_1 = y_test_1.reshape(-1, 1)

In [None]:
test_p_1.shape

**The root mean squared error (RMSE) is found to be 3094.5207 on the test values for stock price prediction.**

In [None]:
import math
from sklearn.metrics import mean_squared_error

# y_test_1 is still scaled while test_p_1 has been inverse-transformed, so
# convert the targets back to price units before computing the RMSE.
y_test_1_actual = scaler.inverse_transform(y_test_1.reshape(-1, 1))
math.sqrt(mean_squared_error(y_test_1_actual, test_p_1))

In [None]:
import numpy as np

# Each prediction belongs to the position right after its input window, so
# the predictions are shifted by the window length; positions without a
# prediction stay NaN and are not drawn.
back = time_step

testPredictPlot = np.full_like(df3, np.nan)
testPredictPlot[back:len(test_p_1)+back, :] = test_p_1

plt.plot(scaler.inverse_transform(df3), label="Actual")
plt.plot(testPredictPlot, label="Predicted")
plt.xlabel("Row position")
plt.ylabel("Adjusted_Close_Value")
plt.legend()
plt.show()

In [None]:
test_p_1 = np.squeeze(test_p_1)

In [None]:
test_p_1 = pd.DataFrame(test_p_1, columns = ['Stock_Price'])

In [None]:
test_p_1 = np.squeeze(test_p_1)

In [None]:
# One row per prediction; the id is the 0-based position of the prediction.
# (The length is taken from test_p_1 itself — the former `len(test_p_1-1)`
# subtracted 1 from every value before counting, which was a misplaced
# parenthesis.)
te_p_1 = pd.DataFrame({'id': list(range(len(test_p_1))), 'Stock_Price': test_p_1})
te_p_1

In [None]:
te_p_1.to_csv("submission.csv", index=False)

# **"model_1" is the trained model with very low losses and high accuracy, you could use that to predict your stock prices.**