In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'jpx-tokyo-stock-exchange-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F34349%2F3935619%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240430%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240430T211348Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7a47daae27276d549c1f1df0cfeea0f135b5aff16583c390887b5bc146dcd2b15f228d8be8ba8e2a45704852f35e210efa09beb70f5f9c8fad0c5a11e9654c0a3338f4465679883fc18e1078f58fc9751a5f5de9b80901b107d3b7ec41716435cda4163e2d9fbd1476874ac88b46dc056b7fafd5f162dcc1b73393c2d8df25b8ff7dd008041316a8aac3c015fb0881275ecaaceda1b1920d0d798a5330da1acda0dedddda95476c371cd850d89a2eb7f510409c48383db1268385b2d440d6d9ee7efbf1fea6b8322e3c431fd8513bcd456a4443d22b1372f9c81ded0fa4f549da5c7fea8045dd822015be9c2abfb5237e4f4154e5c340585a9c2128a212bf2e5'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading jpx-tokyo-stock-exchange-prediction, 252341457 bytes compressed
Downloaded and uncompressed: jpx-tokyo-stock-exchange-prediction
Data source import complete.


# Import

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_squared_error
from math import sqrt
import os
import random
import gc
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K

def seed_everything(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything()

# Config - GPU.
Either GPU or TPU - Comment one of them

In [3]:
# #GPU
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#     try:
#         # Currently, memory growth needs to be the same across GPUs
#         for gpu in gpus:
#             tf.config.experimental.set_memory_growth(gpu, True)
#         logical_gpus = tf.config.list_logical_devices('GPU')
#         print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
#     except RuntimeError as e:
#         # Memory growth must be set before GPUs have been initialized
#         print(e)
# strategy = tf.distribute.MirroredStrategy()
# print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# Config - TPU.
Either GPU or TPU - Comment one of them

In [4]:
#TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.TPUStrategy(tpu)
print('Running on TPU ', tpu.master())
print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  
REPLICAS:  8


# Data Load

In [5]:
# The following functions are used to adjust the close prices in the raw stock price data.
# We will generate AdjustedClose using AdjustmentFactor value. This should reduce historical price gap caused by split/reverse-split.

from decimal import ROUND_HALF_UP, Decimal

def adjust_price(price):
    """
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose
    """
    # transform Date column into datetime
    price.loc[: ,"Date"] = pd.to_datetime(price.loc[: ,"Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # sort data to generate CumulativeAdjustmentFactor
        df = df.sort_values("Date", ascending=False)
        # generate CumulativeAdjustmentFactor
        df.loc[:, "CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # generate AdjustedClose
        df.loc[:, "AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # reverse order
        df = df.sort_values("Date")
        # to fill AdjustedClose, replace 0 into np.nan
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose
        df.loc[:, "AdjustedClose"] = df.loc[:, "AdjustedClose"].ffill()
        return df

    # generate AdjustedClose
    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(generate_adjusted_close).reset_index(drop=True)

    price.set_index("Date", inplace=True)
    return price

In [6]:
stock_price_data = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
stock_price_adj_data = adjust_price(stock_price_data)
stock_price_adj_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2332531 entries, 2017-01-04 to 2021-12-03
Data columns (total 13 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   RowId                       object 
 1   SecuritiesCode              int64  
 2   Open                        float64
 3   High                        float64
 4   Low                         float64
 5   Close                       float64
 6   Volume                      int64  
 7   AdjustmentFactor            float64
 8   ExpectedDividend            float64
 9   SupervisionFlag             bool   
 10  Target                      float64
 11  CumulativeAdjustmentFactor  float64
 12  AdjustedClose               float64
dtypes: bool(1), float64(9), int64(2), object(1)
memory usage: 233.6+ MB


In [7]:
def get_features_for_predict(price, code):
    """
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame)
    """
    close_col = "AdjustedClose"
    feats = price.loc[price["SecuritiesCode"] == code, ["SecuritiesCode",
      close_col, "ExpectedDividend", "High", "Low", "Open", "Close"]].copy()

    # single case
    feats["return_1day"] = feats[close_col].pct_change(1)

    # ExpectedDividend
    feats["ExpectedDividend"] = feats["ExpectedDividend"].mask(feats["ExpectedDividend"] > 0, 1)

    # Amplitude
    feats["Amplitude"] = feats["High"] - feats["Low"]

    # Open to Close
    feats["OpentoClose"] = feats["Open"] - feats["Close"]

    # 52 Week High
    High52 = feats['AdjustedClose']/feats['High'].rolling(250).max()
    High52.rename('High52',inplace = True)
    feats = feats.merge(High52,left_index = True,right_index = True, how = 'left')

    # MACD
    feats["MACD"] = feats[close_col].ewm(span=12, adjust=False).mean() - feats[close_col].ewm(span=26, adjust=False).mean()

    for period in [5, 10, 20, 40, 60]:

      # calculate return using AdjustedClose
      feats["return_{}day".format(period)] = feats[close_col].pct_change(period)

      # volatility
      feats["volatility_{}day".format(period)] = np.log(feats[close_col]).diff().rolling(period).std()

      # moving average
      feats["MA_{}day".format(period)] = feats[close_col].rolling(period).mean()

      # exponential moving average
      feats["EMA_{}day".format(period)] = feats[close_col].ewm(span=period, adjust=False).mean()

      # RSI
      C_Diff = feats['AdjustedClose'] - feats['AdjustedClose'].shift(1)
      U = C_Diff.apply(lambda series: series if series > 0 else 0)
      D = C_Diff.apply(lambda series: -series if series < 0 else 0)
      EMA_U = U.ewm(span = period, adjust = False).mean()
      EMA_D = D.ewm(span = period, adjust = False).mean()
      RSI = EMA_U/(EMA_U+EMA_D) * 100
      RSI.rename('RSI_{}day'.format(period),inplace = True)
      feats = feats.merge(RSI,left_index = True,right_index = True,how = 'left')

      # MACD
      feats["MACD_{}day".format(period)] = feats[close_col].ewm(span=period,
        adjust=False).mean() - feats[close_col].ewm(span=2*period, adjust=False).mean()

      # BIAS
      BIAS = feats['AdjustedClose'].rolling(period).mean()
      BIAS = (feats['AdjustedClose'] - BIAS)/BIAS
      BIAS.rename('BIAS_{}day'.format(period),inplace = True)
      feats = feats.merge(BIAS,left_index = True,right_index = True, how = 'left')

    # filling data for nan and inf
    feats = feats.fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)
    # drop AdjustedClose column
    feats = feats.drop([close_col], axis=1)

    return feats

In [8]:
# fetch prediction target SecuritiesCodes
# There are 2000 codes
codes = sorted(stock_price_adj_data["SecuritiesCode"].unique())
len(codes)

2000

In [9]:
from tqdm import tqdm  # Smart progress meter
# generate the features for prediction
buff = []
for code in tqdm(codes):
    feat = get_features_for_predict(stock_price_adj_data, code)
    buff.append(feat)
feature = pd.concat(buff)

100%|██████████| 2000/2000 [01:38<00:00, 20.39it/s]


In [10]:
feature.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2332531 entries, 2017-01-04 to 2021-12-03
Data columns (total 46 columns):
 #   Column            Dtype  
---  ------            -----  
 0   SecuritiesCode    int64  
 1   ExpectedDividend  float64
 2   High              float64
 3   Low               float64
 4   Open              float64
 5   Close             float64
 6   return_1day       float64
 7   Amplitude         float64
 8   OpentoClose       float64
 9   High52            float64
 10  MACD              float64
 11  return_5day       float64
 12  volatility_5day   float64
 13  MA_5day           float64
 14  EMA_5day          float64
 15  RSI_5day          float64
 16  MACD_5day         float64
 17  BIAS_5day         float64
 18  return_10day      float64
 19  volatility_10day  float64
 20  MA_10day          float64
 21  EMA_10day         float64
 22  RSI_10day         float64
 23  MACD_10day        float64
 24  BIAS_10day        float64
 25  return_20day      float64
 26 

In [11]:
# Label creation
# Next, we obtain the labels to be used for training the model ...
# ... (this is where we load and split the label data).

def get_label(price, code):
    """ Labelizer
    Args:
        price (pd.DataFrame): dataframe of stock_price.csv
        code (int): Local Code in th/e universe
    Returns:
        df (pd.DataFrame): label data
    """
    df = price.loc[price["SecuritiesCode"] == code].copy()
    df.loc[:, "label"] = df["Target"]

    return df.loc[:, ["SecuritiesCode", "label"]]

In [12]:
# We split the data into **Train** and **Test** sets. ...
# This can also be updated to obtain **Validation** sets later on.

# split data into TRAIN and TEST
TRAIN_END = "2019-02-22"
VALID_START = "2019-03-01"
VALID_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
TEST_START = "2020-01-06"

def get_features_and_label(price, codes, features):
    """
    Args:
        price (pd.DataFrame): loaded price data
        codes  (array) : target codes
        feature (pd.DataFrame): features
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        valid_X (pd.DataFrame): validation data
        valid_y (pd.DataFrame): label for validation
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # to store splited data
    trains_X, valid_X, tests_X = [], [], []
    trains_y, valid_y, tests_y = [], [], []

    # generate feature one by one
    for code in tqdm(codes):

        feats = features[features["SecuritiesCode"] == code].dropna()
        labels = get_label(price, code).dropna()

        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # align label and feature indexes
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]

            assert (labels.loc[:, "SecuritiesCode"] == feats.loc[:, "SecuritiesCode"]).all()
            labels = labels.loc[:, "label"]

            # split data into TRAIN and TEST
            _train_X = feats[: TRAIN_END]
            _valid_X = feats[VALID_START: VALID_END]
            _test_X = feats[TEST_START:]

            _train_y = labels[: TRAIN_END]
            _valid_y = labels[VALID_START: VALID_END]
            _test_y = labels[TEST_START:]

            assert len(_train_X) == len(_train_y)
            assert len(_valid_X) == len(_valid_y)
            assert len(_test_X) == len(_test_y)

            # store features
            trains_X.append(_train_X)
            valid_X.append(_valid_X)
            tests_X.append(_test_X)
            # store labels
            trains_y.append(_train_y)
            valid_y.append(_valid_y)
            tests_y.append(_test_y)

    # combine features for each codes
    train_X = pd.concat(trains_X)
    valid_X = pd.concat(valid_X)
    test_X = pd.concat(tests_X)
    # combine label for each codes
    train_y = pd.concat(trains_y)
    valid_y = pd.concat(valid_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, valid_X, valid_y, test_X, test_y

In [13]:
# generate feature/label
train_X, train_y, valid_X, valid_y, test_X, test_y = get_features_and_label(
    stock_price_adj_data, codes, feature
)

100%|██████████| 2000/2000 [00:25<00:00, 77.14it/s]


In [14]:
# Adding Target price column to the data
train_X.loc[:,'Target'] = train_y
valid_X.loc[:,'Target'] = valid_y
test_X.loc[:,'Target'] = test_y

# Resetting the date index to numbers
train_X.reset_index(inplace= True)
valid_X.reset_index(inplace= True)
test_X.reset_index(inplace= True)
stock_price_adj_data.reset_index(inplace= True)

#
train_X = stock_price_adj_data[["Date", "SecuritiesCode", "Open", "Close", "Volume"]].merge(train_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
valid_X = stock_price_adj_data[["Date", "SecuritiesCode", "Open", "Close", "Volume"]].merge(valid_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
test_X = stock_price_adj_data[["Date", "SecuritiesCode", "Open", "Close", "Volume"]].merge(test_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')

#
train_X.dropna(inplace = True)
valid_X.dropna(inplace = True)
test_X.dropna(inplace = True)

#
train_y = train_X['Target']
valid_y = valid_X['Target']
test_y = test_X['Target']

#
train_X.drop("Target", axis = 1, inplace = True)
valid_X.drop("Target", axis = 1, inplace = True)
test_X.drop("Target", axis = 1, inplace = True)

#
train_X.set_index('Date', inplace = True)
valid_X.set_index('Date', inplace = True)
test_X.set_index('Date', inplace = True)


#
old_test_X = test_X
old_train_X = train_X
old_valid_X = valid_X

# Use below for manually selecting features
# feat_cols = list(range(1, 10))
# train_X = train_X.iloc[:, feat_cols]
# valid_X = valid_X.iloc[:, feat_cols]
# test_X = test_X.iloc[:, feat_cols]


# train & valid split

In [15]:
#X,y split
X_train = train_X.values
X_valid = valid_X.values
X_test = test_X.values
y_train = train_y.values
y_test = test_y.values
y_valid = valid_y.values
print("train_X has shape", X_train.shape)
print("train_y has shape", y_train.shape)
print("valid_X has shape", X_valid.shape)
print("valid_y has shape", y_valid.shape)
print("test_X has shape", X_test.shape)
print("test_y has shape", y_test.shape)

train_X has shape (994642, 49)
train_y has shape (994642,)
valid_X has shape (393968, 49)
valid_y has shape (393968,)
test_X has shape (928606, 49)
test_y has shape (928606,)


In [16]:
train_X.columns

Index(['SecuritiesCode', 'Open_x', 'Close_x', 'Volume', 'ExpectedDividend',
       'High', 'Low', 'Open_y', 'Close_y', 'return_1day', 'Amplitude',
       'OpentoClose', 'High52', 'MACD', 'return_5day', 'volatility_5day',
       'MA_5day', 'EMA_5day', 'RSI_5day', 'MACD_5day', 'BIAS_5day',
       'return_10day', 'volatility_10day', 'MA_10day', 'EMA_10day',
       'RSI_10day', 'MACD_10day', 'BIAS_10day', 'return_20day',
       'volatility_20day', 'MA_20day', 'EMA_20day', 'RSI_20day', 'MACD_20day',
       'BIAS_20day', 'return_40day', 'volatility_40day', 'MA_40day',
       'EMA_40day', 'RSI_40day', 'MACD_40day', 'BIAS_40day', 'return_60day',
       'volatility_60day', 'MA_60day', 'EMA_60day', 'RSI_60day', 'MACD_60day',
       'BIAS_60day'],
      dtype='object')

# model

In [17]:
class GCF:
    SEED = 1
    N_EPOCHS = 100
    BATCH_SIZE = 2048
    EARLY_STOPPING_PATIENCE = 10
    EARLY_STOPPING_MIN_DELTA = 1e-5

    # Transformer Parameters
    EMBED_DIM=256//2
    N_HEAD=8
    FF_DIM=256//2
    DROPOUT=0.001
    N_BLOCK=4

In [18]:
feat_dim = X_train.shape[-1]

class MultiHeadSelfAttention(layers.Layer):
    def __init__(self,embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim=embed_dim
        self.num_heads=num_heads
        if embed_dim % num_heads !=0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim=embed_dim//num_heads
        self.query_dense=layers.Dense(embed_dim)
        self.key_dense=layers.Dense(embed_dim)
        self.value_dense=layers.Dense(embed_dim)
        self.combine_heads=layers.Dense(embed_dim)

    def attention(self,query,key,value):
        score=tf.matmul(query,key,transpose_b=True)
        dim_key=tf.cast(tf.shape(key)[-1],tf.float32)
        scaled_score=score/tf.math.sqrt(dim_key)
        weights=tf.nn.softmax(scaled_score,axis=1)
        output=tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self,inputs):
        batch_size=tf.shape(inputs)[0]
        query=self.query_dense(inputs)
        key=self.key_dense(inputs)
        value=self.value_dense(inputs)

        query=self.separate_heads(
            query,batch_size
        )
        key=self.separate_heads(
            key,batch_size
        )
        value=self.separate_heads(
            value,batch_size
        )

        attention,weights=self.attention(query,key,value)
        attention=tf.transpose(
            attention,perm=[0,2,1,3]
        )
        concat_attention=tf.reshape(
            attention,(batch_size,-1,self.embed_dim)
        )
        output=self.combine_heads(
            concat_attention
        )
        return output

In [19]:
class TransformerBlock(layers.Layer):
    def __init__(self,embed_dim=GCF.EMBED_DIM,feat_dim=feat_dim,num_heads=GCF.N_HEAD,ff_dim=GCF.FF_DIM,rate=GCF.DROPOUT,**kwargs):
        super(TransformerBlock,self).__init__()
        self.att=MultiHeadSelfAttention(num_heads=num_heads,embed_dim=embed_dim)
        self.ffn=keras.Sequential(
            [layers.Dense(ff_dim,activation='gelu'),layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self,inputs,training):
        attn_output=self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1= self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# run

In [20]:
def create_model():
    inputs=layers.Input(shape=(1,feat_dim))

    x=layers.Dense(GCF.EMBED_DIM)(inputs)
    x=layers.LayerNormalization(epsilon=1e-6)(x)

    for k in range(GCF.N_BLOCK):
        transformer_block=TransformerBlock(GCF.EMBED_DIM, feat_dim, GCF.N_HEAD, GCF.FF_DIM, GCF.DROPOUT)
        x=transformer_block(x)

    x=layers.GlobalAveragePooling1D()(x)
    x=layers.Dense(20, activation="relu")(x)

    outputs=layers.Dense(1,activation='linear')(x)

    model=keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
      optimizer=tf.optimizers.Adam(0.001, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9),
      loss='mse',
      metrics=[keras.metrics.RootMeanSquaredError()]
    )
    return model

create_model().summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1, 49)]           0         
                                                                 
 dense (Dense)               (None, 1, 128)            6400      
                                                                 
 layer_normalization (Layer  (None, 1, 128)            256       
 Normalization)                                                  
                                                                 
 transformer_block (Transfo  (None, None, 128)         99584     
 rmerBlock)                                                      
                                                                 
 transformer_block_1 (Trans  (None, None, 128)         99584     
 formerBlock)                                                    
                                                             

In [21]:
# # model For GPU or TPU
# with strategy.scope():
#     model=create_model()

# # model=create_model() # For CPU

# early_stopping=keras.callbacks.EarlyStopping(
#     monitor='val_loss',
#     patience=GCF.EARLY_STOPPING_PATIENCE,
#     min_delta=GCF.EARLY_STOPPING_MIN_DELTA,
#     restore_best_weights=True,
# )

# reduce_lr=ReduceLROnPlateau(
#     monitor='val_loss',
#     factor=0.5,
#     patience=3,
#     min_lr=1e-5,
#     verbose=1
# )

# #fit
# history=model.fit(
#     np.expand_dims(X_train,axis=1),y_train,
#     validation_data=(np.expand_dims(X_valid,axis=1),y_valid),
#     batch_size=GCF.BATCH_SIZE,
#     epochs=GCF.N_EPOCHS,
#     callbacks=[early_stopping,reduce_lr]
# )

# #predict
# # valid_pred=model.predict(np.expand_dims(X_valid,axis=1))

In [22]:
# import matplotlib.pyplot as plt

# cols = [h.replace("val_", "") for h in history.history.keys() if 'val' in h]

# for c in cols:
#     pd.DataFrame(history.history)[[c, "val_"+c]].plot() #plot
#     plt.title(c)
#     plt.show()

# pd.DataFrame(history.history)['lr'].plot()
# plt.title('lr')
# plt.show()

In [23]:
# # Predicition on test
# #predict
# test_pred=model.predict(np.expand_dims(X_test,axis=1))

## Sharpe Ratio

In [24]:
def set_rank(df):
    """
    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): df with Rank
    """
    # sort records to set Rank
    df = df.sort_values("Predict", ascending=False)
    # set Rank starting from 0
    df.loc[:, "Rank"] = np.arange(len(df["Predict"]))
    return df

In [25]:

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

In [26]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [27]:
# result_Transformer = old_test_X[["SecuritiesCode"]].copy()
# result_Transformer.loc[:, "Predict"] = test_pred
# result_Transformer.loc[:, 'Target'] = y_test

# result_Transformer = result_Transformer.sort_values(["Date", "Predict"], ascending=[True, False])
# result_Transformer = result_Transformer.groupby("Date").apply(set_rank)
# Transformer_test_rmse = sqrt(mean_squared_error(test_pred,y_test))

In [28]:
# print("Transformer RMSE", Transformer_test_rmse)
# print("Transformer Sharpe Ratio", calc_spread_return_sharpe(result_Transformer, portfolio_size=200))

## WITH PCA

In [29]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data_key_features = feature.copy()
data_codes = data_key_features[["SecuritiesCode"]]

pca = PCA(n_components = 'mle')
data_components = pca.fit_transform(feature)

data_components = pd.DataFrame(data_components)
data_components["SecuritiesCode"] = data_codes.values
data_components["Date"] = feature.index.values
data_components.set_index("Date", inplace=True)

sum(pca.explained_variance_ratio_[:2]) # >95% of the variance

data_components = data_components[["SecuritiesCode", 0, 1, 2]] # first 3 components

In [30]:
# stock_price_data = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
stock_price_adj_data = adjust_price(stock_price_data)

# fetch prediction target SecuritiesCodes
# There are 2000 codes
codes = sorted(stock_price_adj_data["SecuritiesCode"].unique())
len(codes)

# from tqdm import tqdm  # Smart progress meter
# # generate the features for prediction
# buff = []
# for code in tqdm(codes):
#     feat = get_features_for_predict(stock_price_adj_data, code)
#     buff.append(feat)
# feature = pd.concat(buff)

2000

In [31]:
# generate feature/label
train_X, train_y, valid_X, valid_y, test_X, test_y = get_features_and_label(
    stock_price_adj_data, codes, data_components
)

100%|██████████| 2000/2000 [00:25<00:00, 79.44it/s]


In [32]:
# Adding Target price column to the data
train_X.loc[:,'Target'] = train_y
valid_X.loc[:,'Target'] = valid_y
test_X.loc[:,'Target'] = test_y

# Resetting the date index to numbers
train_X.reset_index(inplace= True)
valid_X.reset_index(inplace= True)
test_X.reset_index(inplace= True)
stock_price_adj_data.reset_index(inplace= True)

#
train_X = stock_price_adj_data[["Date", "SecuritiesCode", "Open", "Close", "Volume"]].merge(train_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
valid_X = stock_price_adj_data[["Date", "SecuritiesCode", "Open", "Close", "Volume"]].merge(valid_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
test_X = stock_price_adj_data[["Date", "SecuritiesCode", "Open", "Close", "Volume"]].merge(test_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')

#
train_X.dropna(inplace = True)
valid_X.dropna(inplace = True)
test_X.dropna(inplace = True)

#
train_y = train_X['Target']
valid_y = valid_X['Target']
test_y = test_X['Target']

#
train_X.drop("Target", axis = 1, inplace = True)
valid_X.drop("Target", axis = 1, inplace = True)
test_X.drop("Target", axis = 1, inplace = True)

#
train_X.set_index('Date', inplace = True)
valid_X.set_index('Date', inplace = True)
test_X.set_index('Date', inplace = True)


#
old_test_X = test_X
old_train_X = train_X
old_valid_X = valid_X

# Use below for manually selecting features
# feat_cols = list(range(1, 10))
# train_X = train_X.iloc[:, feat_cols]
# valid_X = valid_X.iloc[:, feat_cols]
# test_X = test_X.iloc[:, feat_cols]

feat_cols = list(range(1, 4))
train_X = train_X.iloc[:, feat_cols]
valid_X = valid_X.iloc[:, feat_cols]
test_X = test_X.iloc[:, feat_cols]


In [33]:
#X,y split
X_train = train_X.values
X_valid = valid_X.values
X_test = test_X.values
y_train = train_y.values
y_test = test_y.values
y_valid = valid_y.values
print("train_X has shape", X_train.shape)
print("train_y has shape", y_train.shape)
print("valid_X has shape", X_valid.shape)
print("valid_y has shape", y_valid.shape)
print("test_X has shape", X_test.shape)
print("test_y has shape", y_test.shape)

train_X has shape (994642, 3)
train_y has shape (994642,)
valid_X has shape (393968, 3)
valid_y has shape (393968,)
test_X has shape (928606, 3)
test_y has shape (928606,)


In [34]:
feature_subsets = {
    "012": {"X_train": X_train, "X_test": X_test, "X_valid": X_valid},
    "01": {"X_train": X_train[:, [0, 1]], "X_valid": X_valid[:, [0, 1]], "X_test": X_test[:, [0, 1]]},
    "02": {"X_train": X_train[:, [0, 2]], "X_valid": X_valid[:, [0, 2]], "X_test": X_test[:, [0, 2]]},
    "12": {"X_train": X_train[:, [1, 2]], "X_valid": X_valid[:, [1, 2]], "X_test": X_test[:, [1, 2]]},
    "0": {"X_train": X_train[:, [0]], "X_valid": X_valid[:, [0]], "X_test": X_test[:, [0]]},
    "1": {"X_train": X_train[:, [1]], "X_valid": X_valid[:, [1]], "X_test": X_test[:, [1]]},
    "2": {"X_train": X_train[:, [2]], "X_valid": X_valid[:, [2]], "X_test": X_test[:, [2]]}
}

feature_subsets_list = ["012", "01", "02", "12", "0", "1", "2"]

In [35]:
SR_Transformer = dict()
RMSE_Transformer = dict()

In [37]:
for subset_key in tqdm(feature_subsets_list):
    X_train = np.reshape(feature_subsets[subset_key]['X_train'],
                         (feature_subsets[subset_key]['X_train'].shape[0],
                          feature_subsets[subset_key]['X_train'].shape[1]))
    X_test = np.reshape(feature_subsets[subset_key]['X_test'],
                         (feature_subsets[subset_key]['X_test'].shape[0],
                          feature_subsets[subset_key]['X_test'].shape[1]))
    X_valid = np.reshape(feature_subsets[subset_key]['X_valid'],
                         (feature_subsets[subset_key]['X_valid'].shape[0],
                          feature_subsets[subset_key]['X_valid'].shape[1]))
    feat_dim = X_train.shape[-1]

    class MultiHeadSelfAttention(layers.Layer):
      def __init__(self,embed_dim, num_heads=8):
          super(MultiHeadSelfAttention, self).__init__()
          self.embed_dim=embed_dim
          self.num_heads=num_heads
          if embed_dim % num_heads !=0:
              raise ValueError(
                  f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
              )
          self.projection_dim=embed_dim//num_heads
          self.query_dense=layers.Dense(embed_dim)
          self.key_dense=layers.Dense(embed_dim)
          self.value_dense=layers.Dense(embed_dim)
          self.combine_heads=layers.Dense(embed_dim)

      def attention(self,query,key,value):
          score=tf.matmul(query,key,transpose_b=True)
          dim_key=tf.cast(tf.shape(key)[-1],tf.float32)
          scaled_score=score/tf.math.sqrt(dim_key)
          weights=tf.nn.softmax(scaled_score,axis=1)
          output=tf.matmul(weights, value)
          return output, weights

      def separate_heads(self, x, batch_size):
          x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
          return tf.transpose(x, perm=[0, 2, 1, 3])

      def call(self,inputs):
          batch_size=tf.shape(inputs)[0]
          query=self.query_dense(inputs)
          key=self.key_dense(inputs)
          value=self.value_dense(inputs)

          query=self.separate_heads(
              query,batch_size
          )
          key=self.separate_heads(
              key,batch_size
          )
          value=self.separate_heads(
              value,batch_size
          )

          attention,weights=self.attention(query,key,value)
          attention=tf.transpose(
              attention,perm=[0,2,1,3]
          )
          concat_attention=tf.reshape(
              attention,(batch_size,-1,self.embed_dim)
          )
          output=self.combine_heads(
              concat_attention
          )
          return output

    class TransformerBlock(layers.Layer):
      def __init__(self,embed_dim=GCF.EMBED_DIM,feat_dim=feat_dim,num_heads=GCF.N_HEAD,ff_dim=GCF.FF_DIM,rate=GCF.DROPOUT,**kwargs):
          super(TransformerBlock,self).__init__()
          self.att=MultiHeadSelfAttention(num_heads=num_heads,embed_dim=embed_dim)
          self.ffn=keras.Sequential(
              [layers.Dense(ff_dim,activation='gelu'),layers.Dense(embed_dim),]
          )
          self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
          self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
          self.dropout1 = layers.Dropout(rate)
          self.dropout2 = layers.Dropout(rate)

      def call(self,inputs,training):
          attn_output=self.att(inputs)
          attn_output = self.dropout1(attn_output, training=training)
          out1= self.layernorm1(inputs + attn_output)
          ffn_output = self.ffn(out1)
          ffn_output = self.dropout2(ffn_output, training=training)
          return self.layernorm2(out1 + ffn_output)

      def create_model():
          inputs=layers.Input(shape=(1,feat_dim))

          x=layers.Dense(GCF.EMBED_DIM)(inputs)
          x=layers.LayerNormalization(epsilon=1e-6)(x)

          for k in range(GCF.N_BLOCK):
              transformer_block=TransformerBlock(GCF.EMBED_DIM, feat_dim, GCF.N_HEAD, GCF.FF_DIM, GCF.DROPOUT)
              x=transformer_block(x)

          x=layers.GlobalAveragePooling1D()(x)
          x=layers.Dense(20, activation="relu")(x)

          outputs=layers.Dense(1,activation='linear')(x)

          model=keras.Model(inputs=inputs, outputs=outputs)

          model.compile(
            optimizer=tf.optimizers.Adam(0.001, beta_1=0.9, beta_2=0.98,
                                          epsilon=1e-9),
            loss='mse',
            metrics=[keras.metrics.RootMeanSquaredError()]
          )
          return model


      # model For GPU or TPU
      with strategy.scope():
          model=create_model()

      # model=create_model() # For CPU

      early_stopping=keras.callbacks.EarlyStopping(
          monitor='val_loss',
          patience=GCF.EARLY_STOPPING_PATIENCE,
          min_delta=GCF.EARLY_STOPPING_MIN_DELTA,
          restore_best_weights=True,
      )

      reduce_lr=ReduceLROnPlateau(
          monitor='val_loss',
          factor=0.5,
          patience=3,
          min_lr=1e-5,
          verbose=1
      )

      #fit
      history=model.fit(
          np.expand_dims(X_train,axis=1),y_train,
          validation_data=(np.expand_dims(X_valid,axis=1),y_valid),
          batch_size=GCF.BATCH_SIZE,
          epochs=GCF.N_EPOCHS,
          callbacks=[early_stopping,reduce_lr]
      )

    #predict
      test_pred=model.predict(np.expand_dims(X_test,axis=1))
    # test_pred=model.predict(np.expand_dims(X_test,axis=1))


      result_Transformer = old_test_X[["SecuritiesCode"]].copy()
      result_Transformer.loc[:, "Predict"] = test_pred
      result_Transformer.loc[:, 'Target'] = y_test
      result_Transformer = result_Transformer.sort_values(["Date", "Predict"], ascending=[True, False])
      result_Transformer = result_Transformer.groupby("Date").apply(set_rank)
      Transformer_test_rmse = sqrt(mean_squared_error(test_pred,y_test))

      SR_Transformer[subset_key] = calc_spread_return_sharpe(result_Transformer, portfolio_size=200)
      RMSE_Transformer[subset_key] = Transformer_test_rmse

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 13/100


 14%|█▍        | 1/7 [07:51<47:06, 471.10s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 13/100


 29%|██▊       | 2/7 [15:34<38:53, 466.72s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 12/100


 43%|████▎     | 3/7 [23:07<30:40, 460.22s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 12/100


 57%|█████▋    | 4/7 [30:40<22:51, 457.30s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 12/100
Epoch 13/100


 71%|███████▏  | 5/7 [38:24<15:19, 459.98s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 13/100


 86%|████████▌ | 6/7 [46:06<07:40, 460.63s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 11/100


100%|██████████| 7/7 [53:31<00:00, 458.74s/it]


In [None]:
# # model For GPU or TPU
# with strategy.scope():
#     model=create_model()

# # model=create_model() # For CPU

# early_stopping=keras.callbacks.EarlyStopping(
#     monitor='val_loss',
#     patience=GCF.EARLY_STOPPING_PATIENCE,
#     min_delta=GCF.EARLY_STOPPING_MIN_DELTA,
#     restore_best_weights=True,
# )

# reduce_lr=ReduceLROnPlateau(
#     monitor='val_loss',
#     factor=0.5,
#     patience=3,
#     min_lr=1e-5,
#     verbose=1
# )

# #fit
# history=model.fit(
#     np.expand_dims(X_train,axis=1),y_train,
#     validation_data=(np.expand_dims(X_valid,axis=1),y_valid),
#     batch_size=GCF.BATCH_SIZE,
#     epochs=GCF.N_EPOCHS,
#     callbacks=[early_stopping,reduce_lr]
# )

# #predict
# valid_pred=model.predict(np.expand_dims(X_valid,axis=1))

In [38]:
SR_Transformer

{'012': 0.007407379967842348,
 '01': 0.08973948941257964,
 '02': 0.0019416877414624373,
 '12': -0.01569491422462193,
 '0': 0.024679971081948023,
 '1': 0.027527421754791913,
 '2': 0.013697597177703066}

In [39]:
RMSE_Transformer

{'012': 0.02583693035302078,
 '01': 0.025831599081047884,
 '02': 0.025913746311948562,
 '12': 0.025827745096109928,
 '0': 0.025833658552258495,
 '1': 0.025818277014256676,
 '2': 0.02588364796169937}