# Baseline: RNN (GRU) model

Only the `train_files` data is used here, so no information from the test data leaks into training.

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.models import load_model
import pickle

In [None]:
# Load the raw daily prices and drop every row that has no closing price.
stock_prices = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv', index_col=False)
print(f"Original stock prices: {len(stock_prices)}")

cleaned_stock_prices = stock_prices.dropna(subset=['Close'])
print(f"Cleaned stock prices have: {len(cleaned_stock_prices)}")

# Positional 80/20 split: the first 80% of the rows train the model, the rest evaluate it.
split_time = int(0.8 * len(cleaned_stock_prices))
print(f"Split time: {split_time}")
train_close_series = cleaned_stock_prices.iloc[:split_time]
test_close_series = cleaned_stock_prices.iloc[split_time:]

# One list of closing prices per security, stored as an object array of lists.
train_series = train_close_series.groupby('SecuritiesCode')['Close'].apply(list).values
test_series = test_close_series.groupby('SecuritiesCode')['Close'].apply(list).values

# Dataset

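The competition ranks 2000 stocks by their "Target", the relative change between the close price one day ahead and the close price two days ahead. The model therefore predicts the close prices of the next 2 days, and "Target" is computed from those two predictions according to the [metric definition](https://www.kaggle.com/code/smeitoma/jpx-competition-metric-definition).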

In [None]:
# Notebook-wide configuration, read by the dataset, model and training cells.

# Passed to Dataset.prefetch() in get_dataset.
AUTO = tf.data.experimental.AUTOTUNE
# Number of past close prices fed to the model for each sample.
window_size = 90
# Batch size used for the training/test datasets (and for windowed_test in show_random_test).
batch_size = 2048
# Shuffle buffer size applied to the training dataset only.
buffer_size = 10000
OUT_STEPS = 2   # predict next 2 days
# When RESUME is True the learning-rate schedule is offset by RESUME_EPOCH epochs.
RESUME = False
RESUME_EPOCH = 20
# Number of epochs passed to model.fit and plotted by get_lr_callback(plot=True).
EPOCHS = 20

In [None]:
def windowed_dataset(series, window_size):
    """Cut one price series into overlapping windows.

    Each window holds ``window_size + OUT_STEPS`` consecutive values (model
    inputs followed by the values to predict) and consecutive windows are
    shifted by one step. Trailing values that cannot fill a whole window are
    dropped.

    Args:
        series: 1-D sequence of prices for a single security.
        window_size: number of input steps per window.

    Returns:
        A ``tf.data.Dataset`` yielding one 1-D tensor per window.
    """
    span = window_size + OUT_STEPS
    windows = tf.data.Dataset.from_tensor_slices(series).window(span, shift=1, drop_remainder=True)
    # Each window is itself a small dataset; batching it by its full length turns it into one tensor.
    return windows.flat_map(lambda chunk: chunk.batch(span))

Generate train and test windowed data.

In [None]:
# Materialize every training window (inputs + targets) of every security as a numpy array.
train_windowed_data = [
    window.numpy()
    for series_idx in tqdm(range(len(train_series)))
    for window in windowed_dataset(train_series[series_idx], window_size)
]

# Cache the windows on disk so they can be reused without recomputing them.
with open('windowed_train_data90', 'wb') as fp:
    pickle.dump(train_windowed_data, fp)

In [None]:
# Materialize every test window (inputs + targets) of every security as a numpy array.
test_windowed_data = [
    window.numpy()
    for series_idx in tqdm(range(len(test_series)))
    for window in windowed_dataset(test_series[series_idx], window_size)
]

# Cache the windows on disk so they can be reused without recomputing them.
with open('windowed_test_data90', 'wb') as fp:
    pickle.dump(test_windowed_data, fp)

Normalize training data. The mean and standard deviation should only be computed using the training data so that the models have no access to the values in the validation and test sets.

In [None]:
# Stack the windows into 2-D arrays of shape (number of windows, window_size + OUT_STEPS).
train_d = np.array(train_windowed_data)
test_d = np.array(test_windowed_data)

# A single global mean / standard deviation, taken from the training windows only.
t_mean = train_d.mean()
t_std = train_d.std()

# Per-security series, standardized with the training statistics.
train_series_norm = [(closes - t_mean)/t_std for closes in train_series]
test_series_norm = [(closes - t_mean)/t_std for closes in test_series]

# Windowed data, standardized with the same statistics.
train_windowed_norm = (train_d - t_mean)/t_std
test_windowed_norm = (test_d - t_mean)/t_std

First, let's look at a naive baseline that predicts each value to be the same as the previous one.

In [None]:
# Naive (persistence) baseline: within every test window, predict each value
# to be equal to the value one step earlier.
#
# The errors are computed in one vectorized pass instead of dispatching two
# TensorFlow ops per window. Every window has the same length, so the average
# of the per-window means equals the mean over all step-to-step differences.
step_diff = test_windowed_norm[:, 1:] - test_windowed_norm[:, :-1]
num = len(test_windowed_norm)
# Accumulate in float64 so that averaging many float32 values stays accurate.
avg_abs_error = float(np.mean(np.abs(step_diff), dtype=np.float64))
avg_squ_error = float(np.mean(np.square(step_diff), dtype=np.float64))
print("Test MAE: ", avg_abs_error)
print("Test MSE: ", avg_squ_error)

In [None]:
def plot_series(ids, series):
    """Plot the naive one-step-shift prediction against the real values.

    For every index in ``ids`` the matching entry of ``series`` is compared
    with itself shifted by one step (each value is "predicted" by the previous
    one). The MSE and MAE of that shift are printed and both curves are drawn
    in a new figure.

    Args:
        ids: indices into ``series`` of the series to plot.
        series: indexable collection of 1-D value sequences.
    """
    for series_id in ids:
        values = series[series_id]
        actual, shifted = values[1:], values[:-1]
        mae = tf.keras.metrics.mean_absolute_error(actual, shifted).numpy()
        mse = tf.keras.metrics.mean_squared_error(actual, shifted).numpy()
        print(f"MSE {mse}")
        print(f"MAE {mae}")
        steps = range(len(actual))
        plt.figure(figsize=(10,6))
        plt.plot(steps, actual)
        plt.plot(steps, shifted)
        plt.show()

plot_series([1, 3, 50], test_series_norm)

# Model

In [None]:
def get_dataset(dataset, batch_size, buffer_size, mode='train'):
    """Turn an array of windows into a batched ``(inputs, targets)`` dataset.

    Every window is split into its leading values (model inputs) and its last
    ``OUT_STEPS`` values (targets).

    Args:
        dataset: array-like of windows, one window per row.
        batch_size: number of windows per batch.
        buffer_size: shuffle buffer size; only used when ``mode == 'train'``.
        mode: ``'train'`` shuffles the samples, any other value keeps their order.

    Returns:
        A batched and prefetched ``tf.data.Dataset`` of ``(inputs, targets)`` pairs.
    """
    def split_window(window):
        return window[:-OUT_STEPS], window[-OUT_STEPS:]

    pairs = tf.data.Dataset.from_tensor_slices(dataset).map(split_window)
    if mode == 'train':
        pairs = pairs.shuffle(buffer_size)
    return pairs.batch(batch_size).prefetch(AUTO)

In [None]:
# Only the training pipeline is shuffled; the test pipeline keeps the window order.
train_dataset = get_dataset(train_windowed_norm, batch_size=batch_size, buffer_size=buffer_size, mode='train')
test_dataset = get_dataset(test_windowed_norm, batch_size=batch_size, buffer_size=buffer_size, mode='test')

In [None]:
def create_model():
    """Build the stacked-GRU regression model (not compiled).

    The network takes ``window_size`` values per sample, adds a trailing
    feature axis, runs eight GRU layers followed by five ReLU dense layers and
    ends with a linear layer reshaped to ``[OUT_STEPS, 1]``.

    Returns:
        A ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    sequence_gru_units = (256, 256, 512, 512, 256, 256, 128)
    dense_units = (512, 256, 128, 64, 32)

    # Add a feature axis: (batch, window_size) -> (batch, window_size, 1).
    stack = [layers.Lambda(lambda x: tf.expand_dims(x, axis=-1), input_shape=[window_size])]
    stack += [layers.GRU(units, return_sequences=True) for units in sequence_gru_units]
    # The last GRU only returns its final state.
    stack.append(layers.GRU(128))
    stack += [layers.Dense(units, activation='relu') for units in dense_units]
    stack.append(layers.Dense(OUT_STEPS*1))
    stack.append(layers.Reshape([OUT_STEPS, 1]))
    return tf.keras.Sequential(stack)

In [None]:
def get_lr_callback(plot=False):
    """Create the learning-rate scheduler callback used for training.

    The schedule ramps linearly from ``lr_start`` to ``lr_max`` over
    ``lr_ramp_ep`` epochs, holds ``lr_max`` for ``lr_sus_ep`` epochs and then
    decays exponentially towards ``lr_min``. When ``RESUME`` is set, the epoch
    is offset by ``RESUME_EPOCH`` before the schedule is evaluated.

    Args:
        plot: if True, scatter-plot the schedule over ``EPOCHS`` epochs.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler``.
    """
    lr_start, lr_max, lr_min = 1e-6, 1e-4, 1e-7
    lr_ramp_ep, lr_sus_ep = 1, 0
    lr_decay = 0.8

    def lrfn(epoch):
        if RESUME:
            epoch = epoch + RESUME_EPOCH
        # Warm-up phase.
        if epoch < lr_ramp_ep:
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        # Sustain phase.
        if epoch < lr_ramp_ep + lr_sus_ep:
            return lr_max
        # Exponential decay phase.
        return (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min

    if plot:
        epochs = list(range(EPOCHS))
        learning_rates = list(map(lrfn, epochs))
        plt.scatter(epochs,learning_rates)
        plt.show()

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)
get_lr_callback(plot=True)

In [None]:
# Start from a clean Keras state, then build and compile the model.
tf.keras.backend.clear_session()
model = create_model()
model.summary()
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mae'],
)

# Only the weights of the epoch with the lowest validation loss are kept.
sv_path = './model/mymodel'
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    sv_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=test_dataset,
    callbacks=[get_lr_callback(), cp_callback],
)

Plot history figures

In [None]:
# Training vs. validation curves over all epochs: one figure for the loss, one for the MAE.
curves = history.history
epochs = range(len(curves['loss']))

plt.figure(figsize=(10,6))
plt.plot(epochs, curves['loss'], label='Training loss')
plt.plot(epochs, curves['val_loss'], label="Validation loss")
plt.legend(loc=0)

plt.figure(figsize=(10,6))
plt.plot(epochs, curves['mae'], label='Training mae')
plt.plot(epochs, curves['val_mae'], label="Validation mae")
plt.legend(loc=0)

Zoom in

In [None]:
# Skip the first ~20% of the epochs so the later part of the curves is readable.
zoom_split = int(epochs[-1]*0.2)
zoomed_epochs = epochs[zoom_split:]
zoomed = {key: values[zoom_split:] for key, values in history.history.items()}

plt.figure(figsize=(10,6))
plt.plot(zoomed_epochs, zoomed['loss'], label='Loss')
plt.plot(zoomed_epochs, zoomed['val_loss'], label='Val_loss')
plt.grid(True)
plt.legend(loc=0)

plt.figure(figsize=(10,6))
plt.plot(zoomed_epochs, zoomed['mae'], label='Training mae')
plt.plot(zoomed_epochs, zoomed['val_mae'], label="Validation mae")
plt.legend(loc=0)

# Prediction

In [None]:
# Rebuild the architecture and restore the weights written by the training checkpoint.
model = create_model()
model.load_weights('./model/mymodel')
# Compile with the same optimizer, loss and metric as in training so evaluate() reports them.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mae'],
)

In [None]:
# Evaluate the current `model` on the test windows; the displayed result is the
# compiled loss (MSE) followed by the 'mae' metric.
model.evaluate(test_dataset)

In [None]:
def windowed_test(series, batch_size, shift=1):
    """Build an inference dataset of input-only windows.

    The series is cut into windows of ``window_size`` values (no target part),
    consecutive windows being ``shift`` steps apart; incomplete trailing
    windows are dropped.

    Args:
        series: 1-D sequence of (normalized) prices.
        batch_size: number of windows per batch.
        shift: step between the starts of two consecutive windows.

    Returns:
        A batched ``tf.data.Dataset`` of windows.
    """
    windows = tf.data.Dataset.from_tensor_slices(series).window(window_size, shift=shift, drop_remainder=True)
    windows = windows.flat_map(lambda chunk: chunk.batch(window_size))
    return windows.batch(batch_size).prefetch(1)
def plot_result(time0, time1, result, test_example_groundtruth):
    """Plot the ground truth together with both prediction horizons.

    Args:
        time0: x positions for the ground truth and the next-day predictions.
        time1: x positions for the two-days-ahead predictions.
        result: 2-D array; column 0 is the next-day prediction, column 1 the
            prediction for the day after.
        test_example_groundtruth: real values aligned with ``time0``.
    """
    curves = (
        (time0, test_example_groundtruth, "Groundtruth"),
        (time0, result[:,0], 'Predicted Next Day'),
        (time1, result[:,1], 'Predicted Next Next Day'),
    )
    plt.figure(figsize=(10, 6))
    for x_values, y_values, label in curves:
        plt.plot(x_values, y_values, label=label)
    plt.legend(loc=0)
    plt.show()

def show_random_test(test_ids, series):
    """Predict along whole series with the global ``model`` and plot the result.

    For each selected series, every ``window_size``-long window (the last
    value of the series is excluded from the inputs) is fed to the model, and
    the two predicted steps are plotted against the real values that follow
    the first window.

    Args:
        test_ids: indices into ``series`` of the series to show.
        series: indexable collection of normalized 1-D price series.
    """
    for series_id in test_ids:
        example = series[series_id]
        groundtruth = example[window_size:]
        # Drop the last value so the number of windows matches the number of ground-truth points.
        windows = windowed_test(example[:-1], batch_size, shift=1)
        predictions = model.predict(windows).squeeze()
        n_points = len(groundtruth)
        # The two-days-ahead predictions are drawn one step to the right.
        plot_result(range(n_points), range(1, n_points+1), predictions, groundtruth)

In [None]:
# Visual check of the predictions on a handful of test securities.
show_random_test(test_ids=[1, 3, 20, 50, 60], series=test_series_norm)

# Submission

It is recommended to train the model and save the checkpoint first, and only then run the submission with the saved weights, to avoid unnecessary running time.

In [None]:
# Competition API module; it is imported in this cell rather than with the other imports at the top.
import jpx_tokyo_market_prediction
# NOTE(review): presumably make_env() can only be called once per kernel session — confirm against the competition API docs.
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

In [None]:
# Rolling history of normalized closing prices, seeded once with the training
# rows dated after 2021-01-01. Only the two columns used below are kept, and
# the explicit copy avoids assigning into a slice of `cleaned_stock_prices`.
target_series = cleaned_stock_prices.loc[
    cleaned_stock_prices['Date'] > '2021-01-01', ['SecuritiesCode', 'Close']
].copy()
target_series['Close'] = (target_series['Close'] - t_mean)/t_std

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Append today's valid closes, normalized with the training statistics.
    cleaned_prices = prices.loc[prices['Close'].notna(), ['SecuritiesCode', 'Close']].copy()
    cleaned_prices['Close'] = (cleaned_prices['Close'] - t_mean)/t_std
    # DataFrame.append no longer exists in pandas 2.x; pd.concat is the replacement.
    target_series = pd.concat([target_series, cleaned_prices], ignore_index=True)
    # Only the last `window_size` closes of a security are ever fed to the
    # model, so older rows are dropped to keep the history bounded.
    target_series = target_series.groupby('SecuritiesCode', sort=False).tail(window_size)

    # One input row per security; histories shorter than the window are
    # left-padded with zeros (zero is the training mean in normalized space).
    new_securities = target_series.groupby('SecuritiesCode')['Close'].apply(list)
    model_inputs = np.zeros((len(new_securities), window_size), dtype=np.float32)
    for row, closes in enumerate(new_securities.values):
        recent = closes[-window_size:]
        model_inputs[row, window_size - len(recent):] = recent

    # A single batched forward pass for all securities instead of one predict() call each.
    preds = model.predict(model_inputs, batch_size=batch_size, verbose=0)
    # Flatten to (securities, OUT_STEPS) and convert back to price scale.
    preds = preds.reshape(len(new_securities), OUT_STEPS)*t_std + t_mean
    next_day, next2_day = preds[:, 0], preds[:, 1]

    # Target = relative change between the two predicted closes.
    predicted_target = pd.Series((next2_day - next_day) / next_day, index=new_securities.index)

    # Rank exactly the securities requested in `sample_prediction`. Securities
    # without a usable prediction (no history, or a non-finite Target) get a
    # neutral Target of 0 so that every row receives a unique integer rank.
    target = sample_prediction['SecuritiesCode'].map(predicted_target)
    target = target.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    sample_prediction['Rank'] = (target.rank(ascending=False, method='first') - 1).astype('int')
    env.predict(sample_prediction)