# JPX Tokyo Stock Exchange prediction competition

This competition involves building portfolios from stocks listed on the Tokyo Stock Exchange (around 2,000 stocks). Specifically, given historical pricing data, each participant ranks the stocks from highest to lowest expected return and is evaluated on the difference in returns.
 
# Time series modeling with transformer network
There have been many applications of neural network models to the prediction of time series data such as stock prices. The transformer network has proven to be remarkably superior to earlier RNNs in NLP applications. When it comes to stock price prediction with a transformer model, most publicly available samples treat the stock price as a univariate time series, where the input is simply the previous closing prices of a stock, which are then used to predict a future price. This approach ignores other potential features, such as volume, daily low, daily high, options data, etc., that may influence future price movements. The stochastic nature of stock prices aside, a multivariate time series approach should provide an advantage.  
The other challenge of this particular JPX competition is that, with around 2,000 stocks in the pool, a one-model-per-stock approach will likely exceed the total time allowed for training (9 hours) if we simply train and predict each stock individually. This approach also misses the overall market trends, in other words, the covariance between stocks. 
To address these two challenges, a single transformer encoder is trained on the time series data of all 2,000 stocks. Each token of the time series has a dimension of 2,000, and so does the label. Each element of the token represents a single stock. The window size of the time series is a hyperparameter. To include other features besides the stock price itself, a custom layer is introduced before the transformer block. This so-called diagonal dense layer can be thought of as a dense layer in which all off-diagonal elements of the weight matrix are set to 0. This layer acts as a linear regression function that combines all input features of a single stock and outputs a single value, which is then placed in the time series token fed into the transformer. This approach keeps the embedding dimension of the token manageable at 2,000, thus reducing the size of the model and its tendency to overfit. It also retains the association of the features with particular stocks. 

# Imports

In [None]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction

#from IPython.core.debugger import set_trace
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf
from tensorflow import keras
from keras import layers
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()


# Helper functions for preparing the time series data set.

In [None]:
# Trading days do not all list the same stocks, so absent stocks are filled in with zero rows.
def pad_missing_stock_code(sample, codes):
    """Return one day's rows without the code column, zero-padded for absent stocks.

    Args:
        sample: 2-D array for a single day; column 0 holds the encoded stock
            code and the remaining columns hold the features/label.
        codes: collection of all stock codes; only its length is used, the
            expected codes being 0 .. len(codes) - 1.

    Returns:
        A 2-D array with the code column removed and an all-zero row inserted
        at the index of every code that does not occur in ``sample``.

    The inserted rows only line up with their codes when the rows of
    ``sample`` are in ascending code order.
    """
    present_codes = {row[0] for row in sample}
    absent_codes = set(range(len(codes))) - present_codes

    # strip the code column; only feature/label columns are returned
    padded = sample[:, 1:]

    # ascending order, so each insertion lands on its final row index
    for row_index in sorted(absent_codes):
        padded = np.insert(padded, row_index, 0.0, axis=0)
    return padded


def windowed_dataset(series, window_size, batch_size):
    """Turn a sequence of daily arrays into batched (features, label) windows.

    Args:
        series: sequence of per-day arrays; the indexing below uses three axes
            per window, so each day is expected to be 2-D (stocks, columns).
        window_size: number of consecutive days in each sample.
        batch_size: number of windows per emitted batch.

    Returns:
        A ``tf.data.Dataset`` of ``(features, label)`` batches, where
        ``features`` holds every column except the last for all days of a
        window, and ``label`` is the last column of the window's final day.
    """
    def split_features_and_label(window):
        # the last column is the target; everything before it is a feature
        features = window[:, :, 0:-1]
        label = window[-1, :, -1].reshape(-1)
        return features, label

    # sliding windows of `window_size` days, moving one day at a time;
    # incomplete windows at the end are dropped
    windows = tf.data.Dataset.from_tensor_slices(series).window(
        window_size, shift=1, drop_remainder=True)
    # each window is a nested dataset; batching it (the batch size exceeds the
    # window length) collapses it into a single tensor
    stacked = windows.flat_map(lambda days: days.batch(window_size + 1))
    samples = stacked.map(split_features_and_label)
    return samples.batch(batch_size).prefetch(1)


# Relative change between two consecutive days. day1 and day2 have the shape
# (stock_list, features_list + label). col_list names the columns to convert;
# every other column keeps day2's value.
def calculate_change_percentage_per_day(day1, day2, col_list):
    """Return a copy of ``day2`` with the columns in ``col_list`` replaced by relative changes.

    Args:
        day1: 2-D array for the earlier day, one row per stock.
        day2: 2-D array for the later day, with at least as many rows as ``day1``.
        col_list: column indices to convert.

    Returns:
        A new array (``day2`` is not modified). For a column ``k != 0`` the
        value is ``(day2[:, k] - day1[:, k]) / day1[:, k]``. For ``k == 0`` it
        is ``(day2[:, 0] - day1[:, 1]) / day1[:, 0]``. An entry is set to 0.0
        when either value checked by the guard is below 1e-8, which is the
        case for zero-padded rows.

    Raises:
        IndexError: if ``day2`` has fewer rows than ``day1``.
    """
    n_rows = day1.shape[0]
    if day2.shape[0] < n_rows:
        raise IndexError("day2 has fewer rows than day1")

    result = day2.copy()
    for k in col_list:
        if k == 0:
            # the difference of day2 open to day1 close
            # NOTE(review): the guard inspects day2 column 1 and the
            # denominator is day1 column 0, although the numerator uses
            # day2 column 0 and day1 column 1. Kept exactly as before --
            # confirm whether day1 column 1 was meant as the denominator.
            unusable = (day1[:, 0] < 1.e-8) | (day2[:n_rows, 1] < 1.e-8)
            delta = day2[:n_rows, 0] - day1[:, 1]
            base = day1[:, 0]
        else:
            unusable = (day1[:, k] < 1.e-8) | (day2[:n_rows, k] < 1.e-8)
            delta = day2[:n_rows, k] - day1[:, k]
            base = day1[:, k]

        # The division is evaluated for every row, including the ones that are
        # masked out right after, so silence the warnings those rows trigger.
        with np.errstate(divide="ignore", invalid="ignore"):
            result[:n_rows, k] = np.where(unusable, 0.0, delta / base)

    return result


# Convert a list of daily arrays into day-over-day changes, in place.
def calculate_change_percentage(series, cols_to_calculate):
    """Replace each day in ``series`` with its change relative to the next day.

    Args:
        series: list of per-day arrays in chronological order. The list is
            modified in place and ends up one element shorter.
        cols_to_calculate: column indices handed on to
            ``calculate_change_percentage_per_day``.

    Returns:
        The same list object: slot ``i`` holds the result computed from the
        original days ``i`` and ``i + 1``.
    """
    last_index = len(series) - 1
    for today in range(last_index):
        tomorrow = today + 1
        # slot `tomorrow` is still untouched here, so both inputs are raw days
        series[today] = calculate_change_percentage_per_day(
            series[today], series[tomorrow], cols_to_calculate)

    # the final slot still holds the unconverted last day; drop it
    series.pop()
    return series


# prep time series data set for training and validation
def prep_time_series_dataset(prices, window_size, batch_size):
    """Build the windowed dataset and the per-day arrays from a price table.

    Args:
        prices: DataFrame with at least the columns Date, SecuritiesCode,
            Open, High, Low, Close, Volume and Target. SecuritiesCode is used
            as a row index by ``pad_missing_stock_code``, so it is expected to
            be encoded as 0 .. n_codes - 1 already. The caller's frame is not
            modified.
        window_size: number of consecutive days per sample.
        batch_size: number of samples per batch.

    Returns:
        ``(ds, daily)``: ``ds`` is the dataset produced by
        ``windowed_dataset``; ``daily`` is an array stacking the per-day
        arrays after the day-over-day conversion, which removes one day.
    """
    # Taken before any row is dropped, so every code and date of the input is
    # represented; days or stocks without usable rows become zero rows.
    codes = list(prices.SecuritiesCode.unique())
    date_list = list(prices.Date.unique())

    prices = prices[['Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close', 'Volume', 'Target']].dropna()
    prices = prices.assign(Low_high_ratio=1 - prices['Low'] / prices['High'])
    # Drop rows whose ratio is NaN and take an independent copy, so the Target
    # rescaling below does not write into a slice of another frame.
    prices = prices[['Date', 'SecuritiesCode', 'Open', 'Close', 'Volume', 'Low_high_ratio', 'Target']].dropna().copy()

    # normalize target value to percentage
    prices['Target'] = prices['Target'] * 100

    price_series = prices.sort_values(by=['Date', 'SecuritiesCode']).reset_index(drop=True)

    daily_data_list = []
    for dt in date_list:
        # rows are already ordered by SecuritiesCode within each date
        daily_data = price_series[price_series.Date == dt].drop(['Date'], axis=1)
        daily_data_list.append(pad_missing_stock_code(daily_data.to_numpy(), codes))

    # daily_data_list holds one 2-D array per date, shaped
    # (number of codes, 5): Open, Close, Volume, Low_high_ratio, Target.
    # Columns 0-2 are converted to day-over-day changes per stock; the call
    # modifies daily_data_list in place.
    ds = windowed_dataset(calculate_change_percentage(daily_data_list, [0, 1, 2]), window_size, batch_size)
    return ds, np.array(daily_data_list)



# Transformer Block

This is mostly copied from the Keras transformer tutorial on the [transformer model](https://www.tensorflow.org/text/tutorials/transformer), except that no tokenizer is needed here. 

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Both sub-layers are wrapped with dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: size of the last input axis; the feed-forward network
            projects back to this size so the residual additions line up.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate, used inside the attention layer and after each
            sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # key_dim is fixed at 256 rather than tied to embed_dim
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=256,
                                             kernel_initializer="glorot_uniform",
                                             dropout=rate)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="elu",
                          bias_initializer=keras.initializers.HeNormal()
                          ),
             layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: tensor whose last axis has size ``embed_dim``.
            training: forwarded to the two dropout layers. It defaults to
                ``None`` so the layer can be called without the argument;
                previously it was a required parameter and a call that did
                not supply it raised ``TypeError``.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        # self-attention: the sequence attends to itself
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)

        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(layers.Layer):
    """Add a learned position embedding to an input sequence.

    Despite the name, no token embedding is applied: the input is used as is
    and only the position embedding is added.

    Args:
        maxlen: number of positions; kept as ``self.width``.
        embed_dim: size of each position vector.
    """

    def __init__(self, maxlen, embed_dim):
        super().__init__()

        self.width = maxlen
        self.embed_dim = embed_dim
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return ``x`` plus the embeddings of positions 0 .. width - 1."""
        position_ids = tf.range(start=0, limit=self.width, delta=1)
        # the (width, embed_dim) table is added to x by broadcasting
        return x + self.pos_emb(position_ids)

This so-called Diagonal Dense layer inherits from the Keras Dense layer. In essence, it keeps only the diagonal elements of the dense layer's weight matrix and ignores the rest.

In [None]:
class DiagonalDense(layers.Dense):
    """Dense variant that gives every unit its own slice of the input.

    For inputs of rank 3 or higher, unit ``i`` only sees ``inputs[..., i, :]``:
    that slice is multiplied element-wise with the unit's kernel column and
    summed, so the last input axis is reduced away. The constructor takes the
    same arguments as ``layers.Dense`` and forwards them unchanged.
    """

    def __init__(self,
                 units,
                 activation,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        super().__init__(
            units=units,
            activation=activation,
            use_bias=use_bias,
            kernel_initializer=kernel_initializer,
            bias_initializer=bias_initializer,
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer,
            activity_regularizer=activity_regularizer,
            kernel_constraint=kernel_constraint,
            bias_constraint=bias_constraint,
            **kwargs)

    def call(self, inputs):
        """Compute the per-unit weighted sum over the last input axis.

        Raises:
            ValueError: if the second-to-last input dimension differs from
                ``units``.
        """
        if inputs.shape[-2] != self.units:
            raise ValueError(
                'DiagonalDense layer requires the second to last dimension of inputs to be equal to the number of units.'
                f' Received: inputs.shape={inputs.shape}, units={self.units}')

        input_rank = inputs.shape.rank
        # NOTE(review): the units check above runs first, so a rank-2 input
        # only gets here when its first dimension equals `units`, and in that
        # case it receives the regular (non-diagonal) Dense result.
        if input_rank is None or input_rank == 2:
            return super().call(inputs)

        # assumes the inherited Dense kernel is shaped (features, units), so
        # its transpose broadcasts against the last two input axes -- TODO confirm
        weighted = tf.math.multiply(inputs, tf.transpose(self.kernel))
        result = tf.reduce_sum(weighted, axis=-1)

        if self.use_bias:
            result = tf.nn.bias_add(result, self.bias)

        if self.activation is not None:
            result = self.activation(result)

        return result

    # Overrides the parent method: the output shape is simply the input shape
    # without its last dimension.
    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` with the last dimension removed."""
        shape = tf.TensorShape(input_shape).with_rank_at_least(2)
        last_dim = tf.compat.dimension_value(shape[-1])
        if last_dim is None:
            raise ValueError('The last dimension of the input shape of a Dense layer '
                             'should be defined. Found None. '
                             f'Received: input_shape={shape}')
        return shape[:-1]


In [None]:
def learnRatefunction(epoch):
    """Return the learning rate for ``epoch`` from a piecewise-constant schedule.

    Epochs below 100 use 0.1, epochs below 1500 use 0.2, and every later
    epoch uses 0.05.
    """
    # (exclusive upper epoch bound, learning rate), checked in order
    schedule = ((100, 1.0e-1), (1500, 2.0e-1))
    for upper_bound, rate in schedule:
        if epoch < upper_bound:
            return rate
    return 5.e-2

# Generating dataset from the provided csv file

In [None]:
# Load the full training price history provided by the competition.
hist_prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
# use a subset of the data provided for debugging, as this step takes quite some time
#hist_prices = hist_prices[hist_prices.Date > '2020-01-01']

# Replace each SecuritiesCode by an int32 ordinal. The fitted encoder is kept
# because later cells use it to encode new prices and to map predictions back
# to the original codes.
stock_code_encoder = OrdinalEncoder(dtype=np.int32)
encoded_stocks_array = stock_code_encoder.fit_transform(hist_prices[["SecuritiesCode"]])
hist_prices["SecuritiesCode"] = encoded_stocks_array
# distinct encoded codes; the length of this array is the number of stocks
encoded_stocks_list = np.unique( encoded_stocks_array)

# number of consecutive days per sample, and samples per training batch
window_size = 7
batch_size = 64
# ds: windowed (features, label) dataset; daily_price_series: the per-day
# arrays stacked into a single array
ds, daily_price_series = prep_time_series_dataset( hist_prices, window_size, batch_size)

# Build and train the transformer model

In [None]:
embed_dim = encoded_stocks_list.shape[0]  # Embedding size for each token, should be 2000
num_heads = 4  # Number of attention heads
ff_dim =128  # Hidden layer size in feed forward network inside transformer

no_epoches = 1000

# split into 90% train, 10% val
#split =18  # batchs  derived from 90% of total no. of sample/batch_size
#train_ds = ds.take( split)
#val_ds = ds.skip(split)

# Shard the dataset by data (elements) rather than by file when it is
# distributed across replicas.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
#train_ds = train_ds.with_options(options)
#val_ds = val_ds.with_options(options)
# the validation split above is disabled, so the whole dataset is used for training
train_ds = ds.with_options(options)

# Log the device each op is placed on, and replicate the model over all
# logical GPUs.
tf.debugging.set_log_device_placement(True)
gpus = tf.config.list_logical_devices('GPU')
strategy = tf.distribute.MirroredStrategy(gpus)

with strategy.scope():
    # One sample is (window_size days, embed_dim stocks, 4 features); the 4
    # features are the non-label columns produced by prep_time_series_dataset.
    inputs= layers.Input(shape=( window_size, embed_dim, 4))
    # collapse the 4 features of every stock into one value per stock and day
    x= DiagonalDense(embed_dim, activation='elu', use_bias=True)(inputs)
    embedding_layer = TokenAndPositionEmbedding(window_size,  embed_dim)
    x = embedding_layer(x)
    tb_1 = TransformerBlock(embed_dim, num_heads, ff_dim, rate=0.2)
    x = tb_1(x)
    # average over the days of the window, then predict one value per stock
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.8)(x)
    outputs = layers.Dense(embed_dim)(x)

    model = keras.Model(inputs=inputs, outputs=outputs)

    # NOTE(review): the learning rate passed to Adam is presumably replaced
    # at the start of every epoch by the LearningRateScheduler callback, and
    # epsilon=1 is far above the usual default -- confirm both are intended.
    lr_schedule = tf.keras.callbacks.LearningRateScheduler( learnRatefunction  ) 
    opt = tf.keras.optimizers.Adam(learning_rate=5.0e-1, epsilon=1)
    model.compile( optimizer=opt, metrics=["mae"], loss="mse")#, run_eagerly=True)#, keras.losses.Huber(), )

    model.summary()
    history = model.fit( train_ds, epochs=no_epoches, callbacks=[lr_schedule], verbose=0)
    
    # loss recorded for the last epoch (index no_epoches - 1 of the history)
    print('final loss', history.history['loss'][no_epoches-1])#, 'val_loss:'),  history.history['val_loss'][no_epoches-1])

# Submission
Before we can make a prediction for a "future" day's market prices, we need to construct the time series input by combining the historical data with the new data.

In [None]:
def predict_with_latest_price( model, latest_prices, historical_prices,  code_encoder, sharding_options,ts_window_size ):
    """Predict one value per stock for the day described by ``latest_prices``.

    Args:
        model: trained model; its ``predict`` is called on the windowed dataset.
        latest_prices: DataFrame with the new day's prices, carrying the
            original (unencoded) SecuritiesCode values. It is not modified.
        historical_prices: DataFrame of past prices whose SecuritiesCode
            column is already encoded and which has a Target column.
        code_encoder: encoder already fitted on the training codes. It is
            only applied here, never re-fitted, so the code-to-index mapping
            stays the one the model was trained with.
        sharding_options: ``tf.data.Options`` applied to the dataset.
        ts_window_size: number of consecutive days per sample.

    Returns:
        1-D array with the prediction of the last window, one entry per stock.
    """
    # take the most recent chunk of historical data, append the current_date prices to form time series samples
    # NOTE(review): the cutoff date is hard-coded and compared as a string,
    # which presumably relies on ISO-formatted date strings -- confirm.
    recent_prices = historical_prices[historical_prices.Date > '2021-10-01']

    # work on a copy so the caller's frame is left untouched
    latest_prices = latest_prices.copy()
    # pad the latest data with dummy target value so that it can be concatenated with the historical data
    latest_prices['Target'] = 0.0
    # transform(), not fit_transform(): re-fitting on a single day would
    # renumber the codes whenever that day lists a different set of stocks
    latest_prices["SecuritiesCode"] = code_encoder.transform(latest_prices[["SecuritiesCode"]]).ravel()

    recent_prices = pd.concat([recent_prices, latest_prices])
    # use the window size passed in, not the module-level global
    ds, _ = prep_time_series_dataset( recent_prices, ts_window_size, 1)
    pred = model.predict( ds.with_options(sharding_options), batch_size =1 )

    # pred is of shape (no_of_time_series_windows, no_of_all_stocks), take the last prediction for all stocks
    return pred[-1, :]


In [None]:
import time
import jpx_tokyo_market_prediction
# Competition API: iter_test yields the data of one test day at a time, and
# env.predict submits the ranking for that day.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for (prices, ops, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # drop the placeholder Rank column; it is re-created by the merge below
    sample_prediction = sample_prediction.drop('Rank', axis=1)
    pred = predict_with_latest_price( model, prices, hist_prices, stock_code_encoder, options, window_size)
    # one prediction per encoded stock, in encoded-code order
    dfTemp = pd.DataFrame( pred.reshape(-1,1))
    dfTemp.columns = [ 'Prediction']
    # map the encoded codes back to the original SecuritiesCode values
    dfTemp['SecuritiesCode'] = stock_code_encoder.inverse_transform( encoded_stocks_list.reshape(-1,1))
    # rank 0 is the highest prediction; method='first' breaks ties by row order
    dfTemp['Rank'] = dfTemp['Prediction'].rank(ascending=False,method='first') -1
    dfTemp['Rank'] = dfTemp['Rank'].astype(int)
    dfTemp = dfTemp.drop('Prediction', axis=1)
    # NOTE(review): ranks are computed over every stock seen in training. With
    # the left merge, a stock missing from training gets NaN, and a day that
    # lists fewer stocks gets non-contiguous ranks -- confirm the test days
    # always contain the full stock list.
    sample_prediction = sample_prediction.merge(dfTemp, on='SecuritiesCode', how='left')
 
    env.predict(sample_prediction)