In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data Processing

The majority of the code follows the provided template code in the [Competition's Train Demo Notebook](https://www.kaggle.com/code/smeitoma/train-demo/notebook), but this notebook uses an **LSTM model** to make stock price predictions instead of the Demo's LGBM model.

The following functions are used to adjust the close prices in the raw stock price data.

In [None]:
from decimal import ROUND_HALF_UP, Decimal

def adjust_price(price):
    """
    Generate split-adjusted close prices for every security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must contain the
            columns Date, SecuritiesCode, Close and AdjustmentFactor
    Returns:
        price DataFrame (pd.DataFrame): copy of stock_price indexed by Date, ordered by
            SecuritiesCode then Date, with CumulativeAdjustmentFactor and AdjustedClose added.
            The frame passed in is left unchanged.
    """
    # work on a copy so the caller's frame is untouched and the cell can be re-run safely
    price = price.copy()
    # transform Date column into datetime; plain column assignment replaces the column
    # (and its dtype), whereas `.loc[:, col] = ...` may write into the existing object column
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # newest row first, so the cumulative product runs from the latest date back in time
        df = df.sort_values("Date", ascending=False)
        # generate CumulativeAdjustmentFactor
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # generate AdjustedClose, rounded half-up to one decimal place
        df["AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # back to chronological order
        df = df.sort_values("Date")
        # a zero AdjustedClose is treated as missing so that it can be forward filled
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose
        df["AdjustedClose"] = df["AdjustedClose"].ffill()
        return df

    # generate AdjustedClose security by security
    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate over the groups explicitly instead of groupby.apply: every group keeps its
    # SecuritiesCode column regardless of how the installed pandas treats grouping columns.
    adjusted = [
        generate_adjusted_close(group)
        for _, group in price.groupby("SecuritiesCode")
    ]
    if adjusted:
        price = pd.concat(adjusted)
    else:
        # no rows at all: still return a frame that carries the generated columns
        price = generate_adjusted_close(price)
    price = price.reset_index(drop=True)

    return price.set_index("Date")

We import the code necessary for the LSTM Model.

In [None]:
import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam

We load the data, adjust closing prices, and view the resulting data.

In [None]:
# Load stock price data
df_price = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")

# df_supp =  pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")

In [None]:
df_price = adjust_price(df_price)
# df_supp = adjust_price(df_supp)
df_price.info()

In [None]:
display(df_price.head(2))
display(df_price.tail(2))

The following function represents **Feature Engineering**, and we can use this to set features for the stock price data.

In [None]:
def get_features_for_predict(price, code):
    """
    Build the technical-indicator feature table for one security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must contain SecuritiesCode,
            AdjustedClose, ExpectedDividend, High, Low, Open and Close
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): one row per input row of `code`, same index,
            with NaN and +/-inf replaced by 0 and the AdjustedClose column removed
    """
    close_col = "AdjustedClose"
    feats = price.loc[price["SecuritiesCode"] == code, ["SecuritiesCode",
      close_col, "ExpectedDividend", "High", "Low", "Open", "Close"]].copy()
    close = feats[close_col]

    # single case
    feats["return_1day"] = close.pct_change(1)

    # Amplitude
    feats["Amplitude"] = feats["High"] - feats["Low"]

    # Open to Close
    feats["OpentoClose"] = feats["Open"] - feats["Close"]

    # 52 Week High: close relative to the highest High of the last 250 rows.
    # Assigned directly (index-aligned) rather than merged on the index.
    feats["High52"] = close / feats["High"].rolling(250).max()

    # MACD
    feats["MACD"] = close.ewm(span=12, adjust=False).mean() - close.ewm(span=26, adjust=False).mean()

    # Period-independent inputs, computed once instead of once per period.
    log_return = np.log(close).diff()
    dividend_yield = feats["ExpectedDividend"] / close
    close_diff = close.diff()
    # RSI inputs: upward / downward moves. `where` maps the leading NaN difference to 0,
    # exactly like an element-wise "x if x > 0 else 0".
    up_move = close_diff.where(close_diff > 0, 0)
    down_move = (-close_diff).where(close_diff < 0, 0)

    for period in [5, 10, 20, 40, 60]:
        moving_avg = close.rolling(period).mean()
        exp_moving_avg = close.ewm(span=period, adjust=False).mean()

        # calculate return using AdjustedClose
        feats["return_{}day".format(period)] = close.pct_change(period)

        # volatility
        feats["volatility_{}day".format(period)] = log_return.rolling(period).std()

        # moving average
        feats["MA_{}day".format(period)] = moving_avg

        # exponential moving average
        feats["EMA_{}day".format(period)] = exp_moving_avg

        # ExpectedDividend: replaced by 1 where the dividend/close ratio exceeds period / 500
        feats["ExpectedDividend_{}".format(period)] = feats["ExpectedDividend"].mask(
            dividend_yield > period / 500, 1)

        # RSI
        ema_up = up_move.ewm(span=period, adjust=False).mean()
        ema_down = down_move.ewm(span=period, adjust=False).mean()
        feats["RSI_{}day".format(period)] = ema_up / (ema_up + ema_down) * 100

        # MACD (fast span = period, slow span = 2 * period)
        feats["MACD_{}day".format(period)] = exp_moving_avg - close.ewm(span=2 * period, adjust=False).mean()

        # BIAS: relative distance of the close from its moving average
        feats["BIAS_{}day".format(period)] = (close - moving_avg) / moving_avg

    # filling data for nan and inf
    feats = feats.fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)
    # drop AdjustedClose column
    feats = feats.drop([close_col], axis=1)

    return feats

In [None]:
# fetch prediction target SecuritiesCodes
# There are 2000 codes
codes = sorted(df_price["SecuritiesCode"].unique())
len(codes)

In [None]:
from tqdm import tqdm
# generate the features for prediction
# one feature table per security, stacked into a single frame
buff = [get_features_for_predict(df_price, code) for code in tqdm(codes)]
feature = pd.concat(buff)

In [None]:
display(feature.head(2))
display(feature.tail(2))

In [None]:
feature.info()

Execute PCA on the data with full features and retain only the necessary components as features.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data_key_features = feature.copy()
data_codes = data_key_features[["SecuritiesCode"]]

# SecuritiesCode is an identifier, not a signal, so it must not take part in the decomposition.
pca_input = data_key_features.drop(columns=["SecuritiesCode"])
# PCA is variance-driven: without scaling, the raw price-level columns (High, Low, Open,
# Close and their moving averages) would dominate the components. Standardize first.
# NOTE(review): scaler and PCA are fitted on all dates, including the later test period;
# fitting them on the training window only would avoid that look-ahead.
pca_input = StandardScaler().fit_transform(pca_input)

pca = PCA(n_components = 'mle')
data_components = pca.fit_transform(pca_input)

data_components = pd.DataFrame(data_components)
data_components["SecuritiesCode"] = data_codes.values
data_components["Date"] = feature.index.values
data_components.set_index("Date", inplace=True)

# A bare expression in the middle of a cell is never displayed, so print it explicitly.
print("Variance explained by the first 3 components:", pca.explained_variance_ratio_[:3].sum())

data_components = data_components[["SecuritiesCode", 0, 1, 2]] # first 3 components

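We define the label extraction and the train/test split: for every security, features and labels are aligned on date and then split at `TRAIN_END` / `TEST_START`.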

In [None]:
def get_label(price, code):
    """ Labelizer: expose the Target of one security under the name "label"
    Args:
        price (pd.DataFrame): dataframe of stock_price.csv
        code (int): Local Code in the universe
    Returns:
        df (pd.DataFrame): label data with the columns SecuritiesCode and label
    """
    is_code = price["SecuritiesCode"] == code
    # selecting with .loc and renaming both return new objects, so `price` is not modified
    code_target = price.loc[is_code, ["SecuritiesCode", "Target"]]
    return code_target.rename(columns={"Target": "label"})

# split data into TRAIN and TEST
TRAIN_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
TEST_START = "2020-01-06"


def get_features_and_label(price, codes, features):
    """
    Pair features with labels security by security and split them by date.

    Args:
        price (pd.DataFrame): loaded price data
        codes  (array) : target codes
        features (pd.DataFrame): features
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # one bucket per output, filled code by code
    split_parts = {"train_X": [], "train_y": [], "test_X": [], "test_y": []}

    for code in tqdm(codes):
        code_feats = features[features["SecuritiesCode"] == code].dropna()
        code_labels = get_label(price, code).dropna()

        # nothing to pair up for this code
        if code_feats.shape[0] == 0 or code_labels.shape[0] == 0:
            continue

        # keep only the dates that exist on both sides
        code_labels = code_labels.loc[code_labels.index.isin(code_feats.index)]
        code_feats = code_feats.loc[code_feats.index.isin(code_labels.index)]

        assert (code_labels.loc[:, "SecuritiesCode"] == code_feats.loc[:, "SecuritiesCode"]).all()
        target = code_labels.loc[:, "label"]

        # everything up to TRAIN_END is training data, everything from TEST_START on is test data
        code_train_X, code_test_X = code_feats[: TRAIN_END], code_feats[TEST_START:]
        code_train_y, code_test_y = target[: TRAIN_END], target[TEST_START:]

        assert len(code_train_X) == len(code_train_y)
        assert len(code_test_X) == len(code_test_y)

        split_parts["train_X"].append(code_train_X)
        split_parts["test_X"].append(code_test_X)
        split_parts["train_y"].append(code_train_y)
        split_parts["test_y"].append(code_test_y)

    # stack the per-code pieces
    train_X = pd.concat(split_parts["train_X"])
    test_X = pd.concat(split_parts["test_X"])
    train_y = pd.concat(split_parts["train_y"])
    test_y = pd.concat(split_parts["test_y"])

    return train_X, train_y, test_X, test_y

In [None]:
# PCA case only: build the train/test split from the PCA components (data_components).
# Run EITHER this cell OR the no-PCA cell that follows, not both: the following cell assigns
# the same four names, so running both in order leaves the no-PCA split in place.

train_X, train_y, test_X, test_y = get_features_and_label(
    df_price, codes, data_components
)


In [None]:
# No-PCA case only: build the train/test split from the full engineered feature table.
# This overwrites train_X / train_y / test_X / test_y produced by the PCA-case cell above;
# skip this cell when the PCA components are the intended model input.

train_X, train_y, test_X, test_y = get_features_and_label(
    df_price, codes, feature
)



In [None]:

# Attach the labels to the feature frames so that rows and labels are filtered together below.
# NOTE(review): this relies on train_X/train_y (and test_X/test_y) carrying identical indexes,
# as returned by get_features_and_label — confirm before reusing with other inputs.
train_X.loc[:, 'Target'] = train_y
test_X.loc[:, 'Target'] = test_y
# Turn the Date index into a regular column so it can be used as a merge key.
train_X.reset_index(inplace = True)
test_X.reset_index(inplace = True)
# NOTE(review): this mutates the shared df_price in place (its index becomes a column).
# The cell is therefore not idempotent: re-running it, or re-running earlier cells that
# expect df_price to still have its original index, will behave differently.
df_price.reset_index(inplace = True)
# Right joins on (Date, SecuritiesCode): every row of train_X / test_X is kept.
# NOTE(review): presumably this changes nothing as long as the key pairs are unique in
# df_price — verify whether the merge is needed at all.
train_X = df_price[["Date", "SecuritiesCode"]].merge(train_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
test_X = df_price[["Date", "SecuritiesCode"]].merge(test_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
# Drop every row that has a missing value in any column, Target included.
train_X.dropna(inplace = True)
test_X.dropna(inplace = True)

# Split the labels off again; from here on they are aligned with the filtered feature rows.
train_y = train_X['Target']
test_y = test_X['Target']
train_X.drop("Target", axis = 1, inplace = True);
test_X.drop("Target", axis = 1, inplace = True);

print("train_X has shape", train_X.shape)
print("train_y has shape", train_y.shape)
print("test_X has shape", test_X.shape)
print("test_y has shape", test_y.shape)

# Restore Date as the index; SecuritiesCode stays as a column of train_X / test_X.
train_X.set_index('Date', inplace = True)
test_X.set_index('Date', inplace = True)

# Keep references (aliases, not copies) to the full frames for later cells.
old_test_X = test_X
old_train_X = train_X
# Disabled column sub-selection. NOTE: `feat_cols` is only defined inside this comment,
# so any later cell that reads `feat_cols` raises NameError unless it is re-enabled.
# feat_cols = list(range(1, 4))
# train_X = train_X.iloc[:, feat_cols]
# test_X = test_X.iloc[:, feat_cols]

In [None]:
# Point train_X / test_X back at the full frames saved as old_* in the previous cell
# (undoes any column sub-selection applied to them in between).
test_X = old_test_X
train_X = old_train_X

# Sanity check: the feature frames and their label series should have matching row counts.
print("train_X has shape", train_X.shape)
print("train_y has shape", train_y.shape)
print("test_X has shape", test_X.shape)
print("test_y has shape", test_y.shape)

In [None]:
feats_list = train_X.columns.to_list()

In [None]:
# !!! ONLY Use this cell for no PCA case !!!

import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

# Collects [RMSE, Sharpe ratio] for this single run.
# NOTE: set_rank and calc_spread_return_sharpe are defined in the "Training" section
# further down; that cell has to be executed before this one.
results_lstm = []

# Keras LSTM input is (samples, timesteps, features per step): every feature column is fed
# as one timestep holding a single value. Going through to_numpy() avoids reshaping a
# DataFrame into 3-D, which pandas cannot represent.
n_steps = train_X.shape[1]
train_X_lstm = train_X.to_numpy(dtype=float).reshape(-1, n_steps, 1)
test_X_lstm = test_X.to_numpy(dtype=float).reshape(-1, n_steps, 1)

model = Sequential()
# the input shape follows the data instead of a hard-coded (7, 1)
model.add(LSTM(units = 64,input_shape = (n_steps, 1)))
model.add(Dropout(0.4))
model.add(Dense(1))
model.compile(loss = 'mse',optimizer = 'adam', metrics = ['mean_squared_error'])
model.fit(train_X_lstm,train_y,batch_size = 4096,epochs = 10)
LSTM_test_pred = model.predict(test_X_lstm)
result = old_test_X[["SecuritiesCode"]].copy()
# predictions come back as an (n, 1) array; flatten them to a plain column
result["Predict"] = LSTM_test_pred.ravel()
result["Target"] = test_y.values

# RMSE on the test period
results_lstm.append(np.sqrt(mean_squared_error(test_y, LSTM_test_pred)))

result = result.sort_values(["Date", "Predict"], ascending=[True, False])
# group_keys=False keeps a single Date index level; otherwise pandas >= 2.0 prepends the
# group key and the later groupby('Date') becomes ambiguous
result = result.groupby("Date", group_keys=False).apply(set_rank)

# add this run's Sharpe Ratio to results
results_lstm.append(calc_spread_return_sharpe(result, portfolio_size=200))

In [None]:
print(results_lstm)

In [None]:
print(train_X_lstm.shape)

We explore how much each input feature contributes to the LSTM's predictions, using SHAP values.

In [None]:
import shap
# Explain the most recently trained `model`, using the training tensor as background data.
# NOTE(review): the complete training tensor is passed as background and the complete test
# tensor is explained; this is presumably very expensive — consider a sample of each.
DE = shap.DeepExplainer(model, train_X_lstm)
shap_values = DE.shap_values(test_X_lstm, check_additivity=False) # test_X_lstm is a 3d numpy.ndarray

shap.initjs()
# Bar chart of the first entry of shap_values, labelled with the train_X column names.
# NOTE(review): both arrays passed here are 3-D; confirm summary_plot accepts that shape,
# and that train_X still has the columns `model` was trained on.
shap.summary_plot(
    shap_values[0], 
    test_X_lstm,
    feature_names=train_X.columns.tolist(),
    max_display=50,
    plot_type='bar')

Feature Subsets:

In [None]:
# Every subset is named after the PCA component columns it contains ("02" -> components 0 and 2).
feature_subsets_list = ["012", "01", "02", "12", "0", "1", "2"]

# Build all subsets from their key so that each one holds exactly the named component
# columns. In particular "012" no longer carries SecuritiesCode as an extra model input
# while the other subsets do not. Requires the PCA-case train_X / test_X (columns 0, 1, 2).
feature_subsets = {
    subset_key: {
        "train_X": train_X[[int(component) for component in subset_key]],
        "test_X": test_X[[int(component) for component in subset_key]],
    }
    for subset_key in feature_subsets_list
}

Let's see an example of how to access one subset.

In [None]:
print(feature_subsets["012"]["train_X"])

In [None]:
X_train = feature_subsets["01"]["train_X"].values
X_test = feature_subsets["01"]["test_X"].values
y_train = train_y.values
y_test = test_y.values

In [None]:
print("train_X has shape", X_train.shape)
print("train_y has shape", y_train.shape)
print("test_X has shape", X_test.shape)
print("test_y has shape", y_test.shape)

# Training
Let's train the LSTM model on each of the feature subsets defined above:

In [None]:
# Training Code Starts

In [None]:
def set_rank(df):
    """
    Rank the rows of one group by their predicted value.

    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): df with Rank
    """
    # highest prediction first
    ordered = df.sort_values("Predict", ascending=False)
    # Rank counts up from 0 in that order
    return ordered.assign(Rank=np.arange(len(ordered["Predict"])))

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return implied by the ranking.

    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _daily_spread(day, portfolio_size, toprank_weight_ratio):
        """
        Args:
            day (pd.DataFrame): predicted results of a single Date
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        ranks = day['Rank']
        # ranks are expected to run from 0 to (number of rows - 1)
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        # linearly decreasing weights: toprank_weight_ratio for the first pick, 1 for the last
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        best_first = day.sort_values(by='Rank')['Target']
        worst_first = day.sort_values(by='Rank', ascending=False)['Target']
        purchase = (best_first[:portfolio_size] * weights).sum() / weights.mean()
        short = (worst_first[:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    daily_spread = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

Train an LSTM model.

In [None]:
import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

# One entry per feature subset: {subset_key: [Sharpe ratio, RMSE]}
results_lstm = []

# enumerate replaces the manual counter (which was named `iter` and shadowed the builtin)
for run_number, subset_key in enumerate(feature_subsets_list, start=1):

    print("On iteration", run_number, "of", len(feature_subsets_list), ": Running subset with components (features)", subset_key)

    subset_train = feature_subsets[subset_key]['train_X']
    subset_test = feature_subsets[subset_key]['test_X']

    # Keras LSTM input is (samples, timesteps, features per step): every feature column is
    # fed as one timestep holding a single value. Going through to_numpy() avoids reshaping
    # a DataFrame into 3-D, which pandas cannot represent.
    n_steps = subset_train.shape[1]
    train_X_lstm = subset_train.to_numpy(dtype=float).reshape(-1, n_steps, 1)
    test_X_lstm = subset_test.to_numpy(dtype=float).reshape(-1, n_steps, 1)

    model = Sequential()
    # the input shape follows the subset width instead of a hard-coded (7, 1)
    model.add(LSTM(units = 64,input_shape = (n_steps, 1)))
    model.add(Dropout(0.4))
    model.add(Dense(1))
    model.compile(loss = 'mse',optimizer = 'adam', metrics = ['mean_squared_error'])
    model.fit(train_X_lstm,train_y,batch_size = 4096,epochs = 10)
    LSTM_test_pred = model.predict(test_X_lstm)
    result = old_test_X[["SecuritiesCode"]].copy()
    # predictions come back as an (n, 1) array; flatten them to a plain column
    result["Predict"] = LSTM_test_pred.ravel()
    result["Target"] = test_y.values

    result = result.sort_values(["Date", "Predict"], ascending=[True, False])
    # group_keys=False keeps a single Date index level; otherwise pandas >= 2.0 prepends
    # the group key and the later groupby('Date') becomes ambiguous
    result = result.groupby("Date", group_keys=False).apply(set_rank)

    # add this subset's Sharpe Ratio and RMSE to results
    results_lstm.append({subset_key: [calc_spread_return_sharpe(result, portfolio_size=200), np.sqrt(mean_squared_error(test_y, LSTM_test_pred))]})

In [None]:
print(results_lstm)

We learn that the LSTM performs best when only components 0 and 1 are used (subset "01"), with a Sharpe Ratio of 0.12!