In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, fname) for fname in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing standard libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
my_df = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")

In [None]:
my_df.head()

In [None]:
my_df.sample(5)

In [None]:
my_df.Date.describe()

In [None]:
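#Count shows 2332531 rows; the date displayed by describe() (03.12.2021) is not the start date, see the note below

Count shows 2,332,531 rows. The date that `describe()` displays (03.12.2021) cannot be the start date, because the training split below keeps rows dated before 2021. For a string column, `describe()` reports the most frequent value (`top`), not the first one; the actual first and last dates are printed in the next cell.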

In [None]:
# Report the smallest and largest Date values. Series.min()/max() are called
# directly: going through .unique() first only adds two extra passes over the
# column without changing the result.
# NOTE(review): this is the chronological range only if Date holds ISO
# (YYYY-MM-DD) strings, as the '2021-01-01' comparisons below suggest — confirm.
print("Start date: {}, end date: {}".format(my_df.Date.min(), my_df.Date.max()))

I will be using the 2021 data for validation and the rest for training.

In [None]:
my_df_train = my_df[my_df['Date'] < '2021-01-01'].copy()
my_df_train.shape

The training set that we will be working with has 1,880,531 rows and 21 columns.

# Validation Data

In [None]:
my_df_valid = my_df[my_df['Date'] >= '2021-01-01'].copy()
my_df_valid.shape

In [None]:
# We hold out roughly 20% of the data for validation (about 19.4% given the row counts quoted above); the next cell computes the exact percentage.

In [None]:
my_df_valid.shape[0] / my_df.shape[0] * 100

In [None]:
my_df.head(5)

# Feature Selection

In [None]:
# We need to select numerical features for our model building: 'Open', 'High', 'Low', 'Close', 'Volume'.
my_df.head(10)

In [None]:
# Keep only the model inputs and the regression target in both splits,
# and renumber the rows from zero.
features = ['Open', 'High', 'Low', 'Close', 'Volume']
target = ['Target']
model_columns = features + target
my_df_train = my_df_train.loc[:, model_columns].reset_index(drop=True).copy()
my_df_valid = my_df_valid.loc[:, model_columns].reset_index(drop=True).copy()
my_df_valid.head()

Before standardizing the data set we will be working with, we need to check it for missing values.

In [None]:
my_df.isnull().sum()

In [None]:
# Discard every row that lacks a value in any model input or in the target.
required_columns = features + target
my_df_train = my_df_train.dropna(subset=required_columns, axis=0)
my_df_valid = my_df_valid.dropna(subset=required_columns, axis=0)

In [None]:
#we also check if training split and valid split has missing values
my_df_train.isnull().sum() + my_df_valid.isnull().sum()

From the above, both our training set and our validation set look clean.

In [None]:
# Statistical summary
my_df_train.describe()

# Preprocessing

In [None]:
#We will need to perform feature normalization and create tensorflow dataset for our model.

In [None]:
# Define encoding function for numerical features
def encode_feature(feature, name, dataset):
    """Normalize one numerical Keras input using statistics learned from `dataset`.

    Args:
        feature: Symbolic input (e.g. a `keras.Input`) to be normalized.
        name: Key under which the feature is stored in the dataset's input dict.
        dataset: Dataset yielding `(inputs_dict, target)` pairs.

    Returns:
        The output of a freshly adapted `layers.Normalization` applied to
        `feature`.

    Note:
        `tf` and `layers` are looked up when the function is called, so the
        TensorFlow imports must have run before the first call.
    """
    # Pull out just this feature and give it a trailing axis.
    column_ds = dataset.map(
        lambda inputs, _target: tf.expand_dims(inputs[name], -1)
    )

    scaler = layers.Normalization()
    scaler.adapt(column_ds)  # learn the feature's statistics from the data
    return scaler(feature)

In [None]:
import random
import jpx_tokyo_market_prediction
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Generate tensorflow dataset
def dataframe_to_dataset(dataframe):
    """Turn a DataFrame with a "Target" column into a shuffled `tf.data.Dataset`.

    Each element is an `(inputs, label)` pair, where `inputs` maps column name
    to value for every non-target column. The caller's DataFrame is not
    modified.
    """
    label_column = dataframe["Target"]
    input_columns = dataframe.drop(columns="Target")
    examples = tf.data.Dataset.from_tensor_slices(
        (dict(input_columns), label_column)
    )
    # The shuffle buffer is as large as the table itself.
    return examples.shuffle(buffer_size=len(input_columns))

train_ds = dataframe_to_dataset(my_df_train)
valid_ds = dataframe_to_dataset(my_df_valid)

Note that each of our datasets yields a tuple `(input, target)`, where `input` is a dictionary of features.

In [None]:
#we create a loop for our train_ds
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

In [None]:
# Batch the dataset
train_ds = train_ds.batch(1024)
valid_ds = valid_ds.batch(1024)

# Building Neural Network Model
We will first define the input layers of our neural network model and then encode (normalize) each of them.

In [None]:
%%time
# Raw numerical features
# One scalar input per raw numerical column.
numeric_columns = ("Open", "High", "Low", "Close", "Volume")
Open, High, Low, Close, Volume = [
    keras.Input(shape=(1,), name=column) for column in numeric_columns
]

all_inputs = [Open, High, Low, Close, Volume]

# Encode numerical features: each input gets its own normalizer, adapted on
# the training dataset (same order as the inputs above).
open_encoded, high_encoded, low_encoded, close_encoded, volume_encoded = [
    encode_feature(tensor, column, train_ds)
    for tensor, column in zip(all_inputs, numeric_columns)
]

We now need to concatenate all the encoded inputs and connect them to several hidden dense layers.

In [None]:
# Concat all features of input layer
all_features = layers.concatenate(
    [open_encoded, high_encoded, low_encoded, close_encoded, volume_encoded]
)

# Hidden stack: three Dense -> BatchNormalization -> Dropout stages of
# decreasing width.
x = all_features
for width in (256, 128, 64):
    x = layers.Dense(width, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)

# Output layer for the regression task: a single linear unit.
output = layers.Dense(1, activation="linear")(x)

# Assemble and compile the functional model (Adam optimizer, MSE loss, RMSE metric).
model = keras.Model(all_inputs, output)
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# What does our model look like?

In [None]:
# Lets check our Neural Network model  structure
model.summary()

# Model Visualization

In [None]:
#Lets visualize our model
keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

# Model Training
We train for up to 50 epochs and use early stopping on the validation loss to end training sooner.

In [None]:
# Early-stopping callback: stop training once `val_loss` has failed to improve
# by at least `min_delta` for `patience` consecutive epochs, then restore the
# best weights seen.
# NOTE(review): min_delta=1e-3 is an absolute change in MSE. If the validation
# MSE itself is of that order or smaller, no later epoch can count as an
# improvement. Compare against the val_loss values printed by model.fit and confirm.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    min_delta=1e-3,
    restore_best_weights=True,
)

In [None]:
model.fit(train_ds, epochs=50, validation_data=valid_ds, callbacks=[early_stopping])

In [None]:
# We save our model
model.save("nn_model.h5")

In [None]:
# Load trained model
best_model = keras.models.load_model("nn_model.h5")

In [None]:
# Generate tensorflow dataset for test data
def dataframe_to_dataset_test(dataframe):
    """Build an unlabeled, unshuffled, unbatched `tf.data.Dataset` from a DataFrame.

    Every element is a dict mapping column name to that row's value; all
    columns of `dataframe` are included. The input frame is copied first and
    left untouched.
    """
    columns_by_name = dict(dataframe.copy())
    return tf.data.Dataset.from_tensor_slices(columns_by_name)

In [None]:
# Seed every random number generator used in this notebook.
# NOTE(review): this cell sits after the model has already been built and
# trained, so it cannot make that training run reproducible. Consider moving
# it above the model-building cells.
seed = 30
os.environ['PYTHONHASHSEED'] = str(seed)
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed)

In [None]:
# Make predictions and submission
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Feed only the columns the model was trained on, and batch them, so each
    # input reaches the model as a (batch,) tensor just as it did in training.
    # The unbatched dataset handed the model one scalar row per predict step.
    test_ds = dataframe_to_dataset_test(prices[features]).batch(1024)

    # predict() returns shape (n_rows, 1); flatten it to one score per row.
    # NOTE(review): assumes `prices` and `sample_prediction` list the
    # securities in the same row order, as the original code did — confirm.
    sample_prediction['target_pred'] = best_model.predict(test_ds).ravel()

    # Rank 0 goes to the highest predicted target. The rank range follows the
    # actual number of rows instead of a hard-coded 2000.
    sample_prediction = sample_prediction.sort_values(by="target_pred", ascending=False)
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)
    sample_prediction = sample_prediction.drop(columns=['target_pred'])
    display(sample_prediction)
    env.predict(sample_prediction)

# Enhanced Recurrent Neural Networks

In [None]:
# Data preparation
# The LSTM windows built below treat consecutive rows as consecutive time
# steps, so the rows must be one security ordered by date. `my_df_train` was
# reduced to features + target (no Date / security identifier) and holds rows
# from the whole file, so the series is rebuilt from `my_df` instead.
# NOTE(review): assumes `my_df` has a 'SecuritiesCode' column like the
# submission frame does — confirm. The first code in the file is used purely as
# an example; pick the security you actually want to model.
rnn_security_code = my_df['SecuritiesCode'].iloc[0]
rnn_rows = my_df[
    (my_df['SecuritiesCode'] == rnn_security_code) & (my_df['Date'] < '2021-01-01')
].sort_values('Date')

# 'High' is the column the original positional slice `iloc[:, 1:2]` selected.
# Double brackets keep the result 2-D with a single column.
training_set = rnn_rows[['High']].dropna().values

# The windowing cell below reads rows 0..750 of the scaled series.
assert len(training_set) >= 751, "selected security has fewer than 751 training rows"
print(training_set[:5], training_set.shape)

In [None]:
# Performing Feature scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0,1))
training_set_scaled = sc.fit_transform(training_set) 

In [None]:
# Creating a dataset with 60 timesteps and 1 output
TIMESTEPS = 60   # length of each input window
MAX_ROWS = 751   # use at most the first 751 rows of the scaled series

# Clamp to the data actually available so a short series cannot index past the end.
n_rows = min(MAX_ROWS, len(training_set_scaled))
if n_rows <= TIMESTEPS:
    raise ValueError(
        "Need more than {} rows to build a window, got {}".format(TIMESTEPS, n_rows)
    )

# Window ending at row `end` covers rows [end - TIMESTEPS, end); its label is
# the value at row `end`.
X_train = np.array(
    [training_set_scaled[end - TIMESTEPS:end, 0] for end in range(TIMESTEPS, n_rows)]
)
Y_train = training_set_scaled[TIMESTEPS:n_rows, 0].copy()

In [None]:
# Reshaping 
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1],1))

# Data Modeling

In [None]:
# Building the RNN
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM

In [None]:
# Initialising the RNN: a stack of four 100-unit LSTM layers, each followed by
# Dropout(0.3), and a single-unit dense output.
regressor = Sequential()

# The first three LSTMs return the full sequence for the next LSTM to consume;
# the last one returns only its final output.
passes_sequence_on = (True, True, True, False)
for position, keep_sequence in enumerate(passes_sequence_on):
    # Only the first layer declares the input shape: (timesteps, 1 feature).
    shape_kwargs = {"input_shape": (X_train.shape[1], 1)} if position == 0 else {}
    regressor.add(LSTM(units=100, return_sequences=keep_sequence, **shape_kwargs))
    regressor.add(Dropout(0.3))

# Adding the output layer
regressor.add(Dense(units=1))

In [None]:
# Compile the RNN 
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
# Fitting the RNN to the training set
regressor.fit(X_train, Y_train, epochs = 50, batch_size = 32)

What can be done to improve the solution?
- Getting more training data
- Increasing the timesteps
- Adding other indicators (features)
- Adding more LSTM layers
- Adding more neurons in the LSTM layers, etc.