# Transformer starter

* As a learning exercise, I tried using a **Transformer model** for this competition,
with reference to [this notebook](https://www.kaggle.com/code/takamichitoda/ump-train-transformer-on-tpu).

* The neural network model in this notebook does not seem to be well trained, but its accuracy could probably be improved with feature engineering.

* Note: I am a beginner in machine learning, so if I have missed or misunderstood something, please let me know in the comments.

# Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
import gc
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K

def seed_everything(seed: int = 42) -> None:
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Seed value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # TensorFlow keeps its own global seed; without this call the model's
    # random operations are not covered by the seeding above.
    tf.random.set_seed(seed)
# Seed the RNGs with the function's default seed (42).
# NOTE(review): GCF.SEED is defined in the config cell but is not used here — confirm which seed is intended.
seed_everything()

# TPU setup: resolve the TPU cluster, connect to it, initialise the TPU system
# and build the distribution strategy under which the model is created later.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# NOTE(review): the `experimental` TPUStrategy alias is reportedly deprecated in favour of
# `tf.distribute.TPUStrategy` in recent TensorFlow releases — confirm against the installed version.
strategy = tf.distribute.experimental.TPUStrategy(tpu)
print('Running on TPU ', tpu.master())
print("REPLICAS: ", strategy.num_replicas_in_sync)

# config

In [None]:
class GCF:
    """Global configuration: training schedule and Transformer hyper-parameters."""

    # --- training schedule ---
    SEED = 0
    N_EPOCHS = 100
    BATCH_SIZE = 30000
    EARLY_STOPPING_PATIENCE = 10
    EARLY_STOPPING_MIN_DELTA = 1e-3

    # --- Transformer parameters ---
    EMBED_DIM = 32  # 64 // 2; must be divisible by N_HEAD
    N_HEAD = 8
    FF_DIM = 64  # 128 // 2
    DROPOUT = 0.0
    N_BLOCK = 4

# Data Load

In [None]:
# Load stock_prices.csv from the competition's train_files input directory.
train_df = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
# Bare expression: displays the DataFrame when run as a notebook cell.
train_df

# preprocess

In [None]:
# Convert Date to an integer of the form YYYYMMDD so that it can be compared
# against an integer cut-off date when splitting into train / valid.
train_df['Date']=pd.to_datetime(train_df['Date'])
train_df['Date']=train_df['Date'].dt.strftime('%Y%m%d').astype(int) 

# Drop the columns that are used neither as features nor as the target.
# NOTE: SecuritiesCode is removed here, so it is no longer available downstream.
train_df=train_df.drop(['RowId','SecuritiesCode','AdjustmentFactor','ExpectedDividend','SupervisionFlag'],axis=1)

# Drop every row that still contains a missing value in any remaining column.
train_df=train_df.dropna()

# Standardise the feature columns.
# NOTE(review): the scaler is fitted on all of train_df, i.e. before the train/valid split,
# so validation-period statistics leak into the training features — consider fitting on
# the training period only.
use_cols = ['Open','High','Low','Close','Volume']
ss=StandardScaler()
train_df[use_cols]=ss.fit_transform(train_df[use_cols])

# Bare expression: displays the preprocessed DataFrame in the notebook.
train_df

# train & valid split

In [None]:
# Chronological hold-out split (roughly train 80% / valid 20%):
# rows dated up to 20201222 go to train, later rows go to valid.
train = train_df[train_df['Date'] <= 20201222]
valid = train_df[train_df['Date'] > 20201222]

In [None]:
# Report the share of rows in each partition as a percentage of all rows.
# Both values are rates, so both labels use the same `*_rate` wording.
print('train_rate:',len(train)/len(train_df)*100,'%')
print('valid_rate:',len(valid)/len(train_df)*100,'%')

In [None]:
# Separate each partition into its feature matrix (use_cols) and its Target vector.
X_train, y_train = train[use_cols], train['Target']
X_valid, y_valid = valid[use_cols], valid['Target']

# model

In [None]:
# Number of input features per sample (size of the last axis of X_train).
feat_dim = X_train.shape[-1]

class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention layer.

    The input is projected to queries, keys and values of width ``embed_dim``,
    split into ``num_heads`` heads of width ``embed_dim // num_heads``,
    attended per head, and the heads are merged again by a final dense layer.

    Args:
        embed_dim: Width of the query/key/value projections and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Compute scaled dot-product attention for already separated heads.

        Args:
            query, key, value: Tensors laid out as produced by
                ``separate_heads``: (batch, heads, seq, projection_dim).

        Returns:
            Tuple ``(output, weights)``: the attended values and the attention
            weights of shape (batch, heads, seq_query, seq_key).
        """
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        # Normalise over the key axis (the last one). Axis 1 is the heads axis
        # here, so normalising over it would mix scores across heads instead of
        # producing a distribution over the keys of each query.
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq, embed_dim) to (batch, heads, seq, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` and return a tensor of width ``embed_dim``."""
        batch_size = tf.shape(inputs)[0]

        # Project, then split every projection into heads.
        query = self.separate_heads(self.query_dense(inputs), batch_size)
        key = self.separate_heads(self.key_dense(inputs), batch_size)
        value = self.separate_heads(self.value_dense(inputs), batch_size)

        attention, _ = self.attention(query, key, value)

        # Back to (batch, seq, heads, projection_dim), then merge the heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        return self.combine_heads(concat_attention)

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation, i.e. ``LayerNorm(x + Dropout(SubLayer(x)))``.

    Args:
        embed_dim: Width of the block's input and output.
        feat_dim: Not used by the block; kept so existing positional calls
            keep working.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self,embed_dim=GCF.EMBED_DIM,feat_dim=feat_dim,num_heads=GCF.N_HEAD,ff_dim=GCF.FF_DIM,rate=GCF.DROPOUT,**kwargs):
        # Forward the base-layer keyword arguments instead of silently dropping them.
        super().__init__(**kwargs)
        self.att = MultiHeadSelfAttention(num_heads=num_heads, embed_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation='gelu'), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block.

        Args:
            inputs: Tensor whose last axis has width ``embed_dim``.
            training: Passed to the dropout layers. Defaults to ``None`` so the
                layer can also be called directly without the argument.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# run

In [None]:
def create_model():
    """Build and compile the Transformer regression model.

    The network takes inputs of shape (1, feat_dim), embeds them to
    GCF.EMBED_DIM, applies GCF.N_BLOCK Transformer blocks, averages over the
    sequence axis and regresses a single value through a small dense head.

    Returns:
        The compiled ``keras.Model`` (Adam with learning rate 1e-4, MSE loss,
        RMSE metric).
    """
    model_input = layers.Input(shape=(1, feat_dim))

    # Embed the raw features, then normalise.
    hidden = layers.Dense(GCF.EMBED_DIM)(model_input)
    hidden = layers.LayerNormalization(epsilon=1e-6)(hidden)

    # Stack of Transformer encoder blocks.
    for _ in range(GCF.N_BLOCK):
        block = TransformerBlock(GCF.EMBED_DIM, feat_dim, GCF.N_HEAD, GCF.FF_DIM, GCF.DROPOUT)
        hidden = block(hidden)

    # Pool over the sequence axis and regress one value.
    hidden = layers.GlobalAveragePooling1D()(hidden)
    hidden = layers.Dense(20, activation="relu")(hidden)
    model_output = layers.Dense(1, activation='linear')(hidden)

    model = keras.Model(inputs=model_input, outputs=model_output)
    model.compile(
        optimizer=tf.optimizers.Adam(1e-4),
        loss='mse',
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    return model


# Build a throw-away model just to print the architecture summary.
create_model().summary()

In [None]:
# Build and compile the model inside the TPU distribution strategy scope.
with strategy.scope():
    model=create_model()

# Stop training once val_loss has failed to improve by at least
# EARLY_STOPPING_MIN_DELTA for EARLY_STOPPING_PATIENCE epochs, and restore the
# weights of the best epoch.
# NOTE(review): min_delta is an absolute change in the MSE loss — confirm 1e-3 is
# not larger than the improvements that can realistically occur for this target.
early_stopping=keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=GCF.EARLY_STOPPING_PATIENCE,
    min_delta=GCF.EARLY_STOPPING_MIN_DELTA,
    restore_best_weights=True,
)

# Halve the learning rate after 3 epochs without val_loss improvement,
# but never go below 1e-5.
reduce_lr=ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1
)

# Fit. expand_dims(axis=1) inserts a length-1 axis so the arrays match the
# model's input shape (1, feat_dim).
history=model.fit(
    np.expand_dims(X_train.values,axis=1),y_train.values,
    validation_data=(np.expand_dims(X_valid.values,axis=1),y_valid.values),
    batch_size=GCF.BATCH_SIZE,
    epochs=GCF.N_EPOCHS,
    callbacks=[early_stopping,reduce_lr]
)

# Predict on the validation features (same length-1 axis as for training).
valid_pred=model.predict(np.expand_dims(X_valid.values,axis=1))

In [None]:
import matplotlib.pyplot as plt

# Build the history table once instead of once per plotted metric.
history_df = pd.DataFrame(history.history)

# Metrics that have a validation counterpart: match the exact "val_" prefix and
# strip only that prefix, so a metric that merely contains "val" in its name is
# neither picked up nor mangled.
cols = [h[len("val_"):] for h in history_df.columns if h.startswith("val_")]

# One figure per metric: training curve against validation curve.
for c in cols:
    history_df[[c, "val_" + c]].plot()
    plt.title(c)
    plt.show()

# Learning-rate schedule over the epochs.
history_df['lr'].plot()
plt.title('lr')
plt.show()

# evaluate
Reference: [train-demo notebook](https://www.kaggle.com/code/smeitoma/train-demo/notebook)

In [None]:
# Evaluation frame with one row per validation sample. Only Date is kept as a
# key: SecuritiesCode was dropped during preprocessing, so it cannot be included.
result = valid[['Date']].copy()

# Model prediction for each validation row (assigned positionally from the array).
result["predict"] = valid_pred
# Actual target value (y_valid shares its index with `valid`).
result["Target"] = y_valid


def set_rank(df):
    """Rank the rows of ``df`` by descending prediction.

    Args:
        df (pd.DataFrame): frame with a ``predict`` column
    Returns:
        pd.DataFrame: a sorted copy of ``df`` with an added ``Rank`` column,
        where 0 marks the highest prediction. The input is left unchanged.
    """
    ranked = df.sort_values("predict", ascending=False)
    # After sorting, the rank is simply the row position.
    ranked["Rank"] = np.arange(len(ranked))
    return ranked

# Rank the predictions separately within each Date (Rank 0 = highest prediction of the day).
result = result.groupby("Date").apply(set_rank)

# Remove the 'Date' column.
# NOTE(review): presumably groupby().apply() leaves Date available as an index level, which
# the later groupby('Date') in the metric would then rely on — confirm for the installed pandas version.
result = result.drop('Date',axis=1)

In [None]:
# Display the last rows of the ranked result frame as a quick sanity check.
result.tail()

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Compute the Sharpe ratio of the daily long/short spread return.

    For every ``Date`` group, the ``portfolio_size`` best-ranked rows are
    bought and the ``portfolio_size`` worst-ranked rows are sold short, using
    weights that fall linearly from ``toprank_weight_ratio`` to 1.

    Args:
        df (pd.DataFrame): predicted results, groupable by ``Date`` and
            holding ``Rank`` and ``Target`` columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly
            ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard
        deviation
    """

    def _spread_for_day(day):
        """Return the weighted long-minus-short return for a single day."""
        ranks = day['Rank']
        # Ranks must start at 0 and end at (number of rows - 1).
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        weight_mean = weights.mean()

        best_first = day.sort_values(by='Rank')['Target'].iloc[:portfolio_size]
        worst_first = day.sort_values(by='Rank', ascending=False)['Target'].iloc[:portfolio_size]

        long_leg = (best_first * weights).sum() / weight_mean
        short_leg = (worst_first * weights).sum() / weight_mean
        return long_leg - short_leg

    daily_spread = df.groupby('Date').apply(_spread_for_day)
    return daily_spread.mean() / daily_spread.std()

In [None]:
# Sharpe ratio of the daily spread return on the validation predictions,
# buying / selling the 200 best- and worst-ranked rows of each day.
calc_spread_return_sharpe(result, portfolio_size=200)