In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/options.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/financials.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/secondary_stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/trades.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/__init__.py
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_fin_spec.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/trades_spec.csv
/kaggle/input/jpx-tokyo-stock-

# 1. Data Processing

The majority of the code follows the provided template code in the [Competition's Train Demo Notebook](https://www.kaggle.com/code/smeitoma/train-demo/notebook), but this notebook uses an **LSTM model** to make stock price predictions instead of the Demo's LGBM model.

The following functions are used to adjust the close prices in the raw stock price data.

In [25]:
from decimal import ROUND_HALF_UP, Decimal

def adjust_price(price):
    """
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose
    """
    # transform Date column into datetime
    price.loc[: ,"Date"] = pd.to_datetime(price.loc[: ,"Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # sort data to generate CumulativeAdjustmentFactor
        df = df.sort_values("Date", ascending=False)
        # generate CumulativeAdjustmentFactor
        df.loc[:, "CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # generate AdjustedClose
        df.loc[:, "AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # reverse order
        df = df.sort_values("Date")
        # to fill AdjustedClose, replace 0 into np.nan
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose
        df.loc[:, "AdjustedClose"] = df.loc[:, "AdjustedClose"].ffill()
        return df

    # generate AdjustedClose
    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(generate_adjusted_close).reset_index(drop=True)

    price.set_index("Date", inplace=True)
    return price

We import the code necessary for the LSTM Model.

In [26]:
import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam

We load the data, adjust closing prices, and view the resulting data.

In [27]:
# Load stock price data
df_price = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")

# df_supp =  pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")

In [28]:
df_price = adjust_price(df_price)
# df_supp = adjust_price(df_supp)
df_price.info()

  price = price.groupby("SecuritiesCode").apply(generate_adjusted_close).reset_index(drop=True)


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2332531 entries, 2017-01-04 to 2021-12-03
Data columns (total 13 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   RowId                       object 
 1   SecuritiesCode              int64  
 2   Open                        float64
 3   High                        float64
 4   Low                         float64
 5   Close                       float64
 6   Volume                      int64  
 7   AdjustmentFactor            float64
 8   ExpectedDividend            float64
 9   SupervisionFlag             bool   
 10  Target                      float64
 11  CumulativeAdjustmentFactor  float64
 12  AdjustedClose               float64
dtypes: bool(1), float64(9), int64(2), object(1)
memory usage: 233.6+ MB


  return Index(sequences[0], name=names)


In [7]:
display(df_price.head(2))
display(df_price.tail(2))

Unnamed: 0_level_0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CumulativeAdjustmentFactor,AdjustedClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-01-04,20170104_1301,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073,1.0,2742.0
2017-01-05,20170105_1301,1301,2743.0,2747.0,2735.0,2738.0,17900,1.0,,False,0.00292,1.0,2738.0


Unnamed: 0_level_0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CumulativeAdjustmentFactor,AdjustedClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-12-02,20211202_9997,9997,681.0,692.0,680.0,684.0,342900,1.0,,False,0.014368,1.0,684.0
2021-12-03,20211203_9997,9997,690.0,711.0,686.0,696.0,381100,1.0,,False,0.018414,1.0,696.0


The following function represents **Feature Engineering**, and we can use this to set features for the stock price data.

In [8]:
def get_features_for_predict(price, code):
    """
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame)
    """
    close_col = "AdjustedClose"
    feats = price.loc[price["SecuritiesCode"] == code, ["SecuritiesCode",
      close_col, "ExpectedDividend", "High", "Low", "Open", "Close"]].copy()

    # single case
    feats["return_1day"] = feats[close_col].pct_change(1)

    # Amplitude
    feats["Amplitude"] = feats["High"] - feats["Low"]

    # Open to Close
    feats["OpentoClose"] = feats["Open"] - feats["Close"]

    # 52 Week High
    High52 = feats['AdjustedClose']/feats['High'].rolling(250).max()
    High52.rename('High52',inplace = True)
    feats = feats.merge(High52,left_index = True,right_index = True, how = 'left')

    # MACD
    feats["MACD"] = feats[close_col].ewm(span=12, adjust=False).mean() - feats[close_col].ewm(span=26, adjust=False).mean()

    for period in [5, 10, 20, 40, 60]:

      # calculate return using AdjustedClose
      feats["return_{}day".format(period)] = feats[close_col].pct_change(period)

      # volatility
      feats["volatility_{}day".format(period)] = np.log(feats[close_col]).diff().rolling(period).std()

      # moving average
      feats["MA_{}day".format(period)] = feats[close_col].rolling(period).mean()

      # exponential moving average
      feats["EMA_{}day".format(period)] = feats[close_col].ewm(span=period, adjust=False).mean()

      # ExpectedDividend
      feats["ExpectedDividend_{}".format(period)] = feats["ExpectedDividend"].mask(feats["ExpectedDividend"] / feats[close_col] > period / 500, 1)

      # RSI
      C_Diff = feats['AdjustedClose'] - feats['AdjustedClose'].shift(1)
      U = C_Diff.apply(lambda series: series if series > 0 else 0)
      D = C_Diff.apply(lambda series: -series if series < 0 else 0)
      EMA_U = U.ewm(span = period, adjust = False).mean()
      EMA_D = D.ewm(span = period, adjust = False).mean()
      RSI = EMA_U/(EMA_U+EMA_D) * 100
      RSI.rename('RSI_{}day'.format(period),inplace = True)
      feats = feats.merge(RSI,left_index = True,right_index = True,how = 'left')

      # MACD
      feats["MACD_{}day".format(period)] = feats[close_col].ewm(span=period,
        adjust=False).mean() - feats[close_col].ewm(span=2*period, adjust=False).mean()

      # BIAS
      BIAS = feats['AdjustedClose'].rolling(period).mean()
      BIAS = (feats['AdjustedClose'] - BIAS)/BIAS
      BIAS.rename('BIAS_{}day'.format(period),inplace = True)
      feats = feats.merge(BIAS,left_index = True,right_index = True, how = 'left')

    # filling data for nan and inf
    feats = feats.fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)
    # drop AdjustedClose column
    feats = feats.drop([close_col], axis=1)

    return feats

In [29]:
# fetch prediction target SecuritiesCodes
# There are 2000 codes
codes = sorted(df_price["SecuritiesCode"].unique())
len(codes)

2000

In [30]:
from tqdm import tqdm
# generate the features for prediction
buff = []
for code in tqdm(codes):
    feat = get_features_for_predict(df_price, code)
    buff.append(feat)
feature = pd.concat(buff)

  8%|▊         | 163/2000 [00:14<02:45, 11.10it/s]


KeyboardInterrupt: 

In [31]:
display(feature.head(2))
display(feature.tail(2))

Unnamed: 0_level_0,SecuritiesCode,ExpectedDividend,High,Low,Open,Close,return_1day,Amplitude,OpentoClose,High52,...,MACD_40day,BIAS_40day,return_60day,volatility_60day,MA_60day,EMA_60day,ExpectedDividend_60,RSI_60day,MACD_60day,BIAS_60day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-04,1301,0.0,2755.0,2730.0,2734.0,2742.0,0.0,25.0,-8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2742.0,0.0,0.0,0.0,0.0
2017-01-05,1301,0.0,2747.0,2735.0,2743.0,2738.0,-0.001459,12.0,5.0,0.0,...,-0.096357,0.0,0.0,0.0,0.0,2741.868852,0.0,0.0,-0.065032,0.0


Unnamed: 0_level_0,SecuritiesCode,ExpectedDividend,High,Low,Open,Close,return_1day,Amplitude,OpentoClose,High52,...,MACD_40day,BIAS_40day,return_60day,volatility_60day,MA_60day,EMA_60day,ExpectedDividend_60,RSI_60day,MACD_60day,BIAS_60day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-02,9997,0.0,692.0,680.0,681.0,684.0,-0.00146,12.0,-3.0,0.506292,...,-54.122108,-0.111284,-0.206497,0.013802,799.333333,785.312539,0.0,31.289117,-67.940654,-0.144287
2021-12-03,9997,0.0,711.0,686.0,690.0,696.0,0.017544,25.0,-6.0,0.515174,...,-54.254605,-0.092006,-0.195376,0.014042,796.516667,782.384259,0.0,34.205419,-68.269707,-0.126195


In [32]:
feature.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2332531 entries, 2017-01-04 to 2021-12-03
Data columns (total 51 columns):
 #   Column               Dtype  
---  ------               -----  
 0   SecuritiesCode       int64  
 1   ExpectedDividend     float64
 2   High                 float64
 3   Low                  float64
 4   Open                 float64
 5   Close                float64
 6   return_1day          float64
 7   Amplitude            float64
 8   OpentoClose          float64
 9   High52               float64
 10  MACD                 float64
 11  return_5day          float64
 12  volatility_5day      float64
 13  MA_5day              float64
 14  EMA_5day             float64
 15  ExpectedDividend_5   float64
 16  RSI_5day             float64
 17  MACD_5day            float64
 18  BIAS_5day            float64
 19  return_10day         float64
 20  volatility_10day     float64
 21  MA_10day             float64
 22  EMA_10day            float64
 23  ExpectedDividend_

Execute PCA on the data with full features and retain only the necessary components as features.

In [11]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data_key_features = feature.copy()
data_codes = data_key_features[["SecuritiesCode"]]

pca = PCA(n_components = 'mle')
data_components = pca.fit_transform(feature)

data_components = pd.DataFrame(data_components)
data_components["SecuritiesCode"] = data_codes.values
data_components["Date"] = feature.index.values
data_components.set_index("Date", inplace=True)

sum(pca.explained_variance_ratio_[:2]) # >95% of the variance

data_components = data_components[["SecuritiesCode", 0, 1, 2]] # first 3 components

We perform feature subset selection.

In [12]:
def get_label(price, code):
    """ Labelizer
    Args:
        price (pd.DataFrame): dataframe of stock_price.csv
        code (int): Local Code in the universe
    Returns:
        df (pd.DataFrame): label data
    """
    df = price.loc[price["SecuritiesCode"] == code].copy()
    df.loc[:, "label"] = df["Target"]

    return df.loc[:, ["SecuritiesCode", "label"]]

# split data into TRAIN and TEST
TRAIN_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
TEST_START = "2020-01-06"


def get_features_and_label(price, codes, features):
    """
    Args:
        price (pd.DataFrame): loaded price data
        codes  (array) : target codes
        feature (pd.DataFrame): features
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # to store splited data
    trains_X, tests_X = [], []
    trains_y, tests_y = [], []

    # generate feature one by one
    for code in tqdm(codes):

        feats = features[features["SecuritiesCode"] == code].dropna()
        labels = get_label(price, code).dropna()
        
        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # align label and feature indexes
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]
            
            assert (labels.loc[:, "SecuritiesCode"] == feats.loc[:, "SecuritiesCode"]).all()
            labels = labels.loc[:, "label"]

            # split data into TRAIN and TEST
            _train_X = feats[: TRAIN_END]
            _test_X = feats[TEST_START:]

            _train_y = labels[: TRAIN_END]
            _test_y = labels[TEST_START:]
            
            assert len(_train_X) == len(_train_y)
            assert len(_test_X) == len(_test_y)

            # store features
            trains_X.append(_train_X)
            tests_X.append(_test_X)
            # store labels
            trains_y.append(_train_y)
            tests_y.append(_test_y)
            
    # combine features for each codes
    train_X = pd.concat(trains_X)
    test_X = pd.concat(tests_X)
    # combine label for each codes
    train_y = pd.concat(trains_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, test_X, test_y

In [13]:
# !Use this cell for PCA case only!:

train_X, train_y, test_X, test_y = get_features_and_label(
    df_price, codes, data_components
)


100%|██████████| 2000/2000 [00:25<00:00, 79.53it/s]


In [33]:
# !Use this cell for no PCA case:

train_X, train_y, test_X, test_y = get_features_and_label(
    df_price, codes, feature
)



100%|██████████| 2000/2000 [00:28<00:00, 69.61it/s]


In [34]:

train_X.loc[:, 'Target'] = train_y
test_X.loc[:, 'Target'] = test_y
train_X.reset_index(inplace = True)
test_X.reset_index(inplace = True)
df_price.reset_index(inplace = True)
train_X = df_price[["Date", "SecuritiesCode"]].merge(train_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
test_X = df_price[["Date", "SecuritiesCode"]].merge(test_X, left_on = ['Date', 'SecuritiesCode'],right_on = ['Date', 'SecuritiesCode'], how = 'right')
train_X.dropna(inplace = True)
test_X.dropna(inplace = True)

train_y = train_X['Target']
test_y = test_X['Target']
train_X.drop("Target", axis = 1, inplace = True);
test_X.drop("Target", axis = 1, inplace = True);

print("train_X has shape", train_X.shape)
print("train_y has shape", train_y.shape)
print("test_X has shape", test_X.shape)
print("test_y has shape", test_y.shape)

train_X.set_index('Date', inplace = True)
test_X.set_index('Date', inplace = True)

old_test_X = test_X
old_train_X = train_X
# feat_cols = list(range(1, 4))
# train_X = train_X.iloc[:, feat_cols]
# test_X = test_X.iloc[:, feat_cols]

train_X has shape (1399453, 52)
train_y has shape (1399453,)
test_X has shape (932840, 52)
test_y has shape (932840,)


In [35]:
test_X = old_test_X
train_X = old_train_X

print("train_X has shape", train_X.shape)
print("train_y has shape", train_y.shape)
print("test_X has shape", test_X.shape)
print("test_y has shape", test_y.shape)

train_X has shape (1399453, 51)
train_y has shape (1399453,)
test_X has shape (932840, 51)
test_y has shape (932840,)


In [37]:
feats_list = train_X.columns.to_list()

In [36]:
# !!! ONLY Use this cell for no PCA case !!!

import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

results_lstm = []

train_X_lstm = np.reshape(train_X, (train_X.shape[0], train_X.shape[1], 1))
test_X_lstm = np.reshape(test_X, (test_X.shape[0], test_X.shape[1], 1))

model = Sequential()
model.add(LSTM(units = 64,input_shape = (7, 1)))
model.add(Dropout(0.4))
model.add(Dense(1))
model.compile(loss = 'mse',optimizer = 'adam', metrics = ['mean_squared_error'])
model.fit(train_X_lstm[:, :, :],train_y,batch_size = 4096,epochs = 10)
LSTM_test_pred = model.predict(test_X_lstm[:, :, :])
result = old_test_X[["SecuritiesCode"]].copy()
result.loc[:, "Predict"] = LSTM_test_pred
result.loc[:, 'Target'] = test_y.values

results_lstm.append(np.sqrt(mean_squared_error(test_y, LSTM_test_pred)))

result = result.sort_values(["Date", "Predict"], ascending=[True, False])
result = result.groupby("Date").apply(set_rank)

# add this subset's Sharpe Ratio to results
results_lstm.append(calc_spread_return_sharpe(result, portfolio_size=200))
    
old_train_X.iloc[:, feat_cols]
old_test_X.iloc[:, feat_cols]

  super().__init__(**kwargs)


Epoch 1/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 530ms/step - loss: 0.0290 - mean_squared_error: 0.0290
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 522ms/step - loss: 5.7956e-04 - mean_squared_error: 5.7956e-04
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 517ms/step - loss: 5.0112e-04 - mean_squared_error: 5.0112e-04
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 525ms/step - loss: 4.8768e-04 - mean_squared_error: 4.8768e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 535ms/step - loss: 4.8334e-04 - mean_squared_error: 4.8334e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 516ms/step - loss: 4.7567e-04 - mean_squared_error: 4.7567e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 538ms/step - loss: 4.7368e-04 - mean_squared_error: 4.7368e-04
Epoch 8/10
[1

NameError: name 'feat_cols' is not defined

In [37]:
print(results_lstm)

[0.025800384322248987, 0.04804980860614415]


In [25]:
print(train_X_lstm.shape)

(1399453, 3, 1)


We explore the feature weights found.

In [27]:
import shap
DE = shap.DeepExplainer(model, train_X_lstm)
shap_values = DE.shap_values(test_X_lstm, check_additivity=False) # X_validate is 3d numpy.ndarray

shap.initjs()
shap.summary_plot(
    shap_values[0], 
    test_X_lstm,
    feature_names=train_X.columns.tolist(),
    max_display=50,
    plot_type='bar')

Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
You have provided over 5k background samples! For better performance consider using smaller random sample.


AttributeError: 'tuple' object has no attribute 'as_list'

Feature Subsets:

In [21]:
# feature_subsets = {
#     "012": {"train_X": train_X, "test_X": test_X},
#     "01": {"train_X": train_X[["SecuritiesCode", 0, 1]], "test_X": test_X[["SecuritiesCode", 0, 1]]},
#     "02": {"train_X": train_X[["SecuritiesCode", 0, 2]], "test_X": test_X[["SecuritiesCode", 0, 2]]},
#     "12": {"train_X": train_X[["SecuritiesCode", 1, 2]], "test_X": test_X[["SecuritiesCode", 1, 2]]},
#     "0": {"train_X": train_X[["SecuritiesCode", 0]], "test_X": test_X[["SecuritiesCode", 0]]},
#     "1": {"train_X": train_X[["SecuritiesCode", 1]], "test_X": test_X[["SecuritiesCode", 1]]},
#     "2": {"train_X": train_X[["SecuritiesCode", 2]], "test_X": test_X[["SecuritiesCode", 2]]}
# }

feature_subsets = {
    "012": {"train_X": train_X, "test_X": test_X},
    "01": {"train_X": train_X[[0, 1]], "test_X": test_X[[0, 1]]},
    "02": {"train_X": train_X[[0, 2]], "test_X": test_X[[0, 2]]},
    "12": {"train_X": train_X[[1, 2]], "test_X": test_X[[1, 2]]},
    "0": {"train_X": train_X[[0]], "test_X": test_X[[0]]},
    "1": {"train_X": train_X[[1]], "test_X": test_X[[1]]},
    "2": {"train_X": train_X[[2]], "test_X": test_X[[2]]}
}

feature_subsets_list = ["012", "01", "02", "12", "0", "1", "2"]

Let's see an example of how to access one subset.

In [16]:
print(feature_subsets["012"]["train_X"])

            SecuritiesCode            0            1            2
Date                                                             
2017-01-04            1301 -2645.833380  3660.963101 -3493.631894
2017-01-05            1301 -2646.153157  3661.902444 -3493.244408
2017-01-06            1301 -2653.047560  3652.561464 -3496.837588
2017-01-10            1301 -2639.780956  3669.059209 -3490.556005
2017-01-11            1301 -1904.356269  3255.727480 -3652.977302
...                    ...          ...          ...          ...
2019-12-24            9997 -6554.962401 -1646.892099  3794.387945
2019-12-25            9997 -6564.653168 -1657.029383  3790.513548
2019-12-26            9997 -6568.075486 -1662.473330  3788.458214
2019-12-27            9997 -6544.558814 -1637.252372  3798.014412
2019-12-30            9997 -6558.268184 -1654.917636  3791.307383

[1399453 rows x 4 columns]


In [17]:
X_train = feature_subsets["01"]["train_X"].values
X_test = feature_subsets["01"]["test_X"].values
y_train = train_y.values
y_test = test_y.values

In [18]:
print("train_X has shape", X_train.shape)
print("train_y has shape", y_train.shape)
print("test_X has shape", X_test.shape)
print("test_y has shape", y_test.shape)

train_X has shape (1399453, 2)
train_y has shape (1399453,)
test_X has shape (932840, 2)
test_y has shape (932840,)


# Training
Let's train Transformer, LSTM, XGBoost, and Lasso models on all the subsets:

In [20]:
# Training Code Starts

In [19]:
def set_rank(df):
    """
    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): df with Rank
    """
    # sort records to set Rank
    df = df.sort_values("Predict", ascending=False)
    # set Rank starting from 0
    df.loc[:, "Rank"] = np.arange(len(df["Predict"]))
    return df

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

Train an LSTM model.

In [22]:
import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

results_lstm = []

iter = 1
for subset_key in feature_subsets_list:

    print("On iteration", iter, "of", len(feature_subsets_list), ": Running subset with components (features)", subset_key)
    
    train_X_lstm = np.reshape(feature_subsets[subset_key]['train_X'], (feature_subsets[subset_key]['train_X'].shape[0], feature_subsets[subset_key]['train_X'].shape[1], 1))
    test_X_lstm = np.reshape(feature_subsets[subset_key]['test_X'], (feature_subsets[subset_key]['test_X'].shape[0], feature_subsets[subset_key]['test_X'].shape[1], 1))

    model = Sequential()
    model.add(LSTM(units = 64,input_shape = (7, 1)))
    model.add(Dropout(0.4))
    model.add(Dense(1))
    model.compile(loss = 'mse',optimizer = 'adam', metrics = ['mean_squared_error'])
    model.fit(train_X_lstm[:, :, :],train_y,batch_size = 4096,epochs = 10)
    LSTM_test_pred = model.predict(test_X_lstm[:, :, :])
    result = old_test_X[["SecuritiesCode"]].copy()
    result.loc[:, "Predict"] = LSTM_test_pred
    result.loc[:, 'Target'] = test_y.values

    result = result.sort_values(["Date", "Predict"], ascending=[True, False])
    result = result.groupby("Date").apply(set_rank)
    
    # add this subset's Sharpe Ratio to results
    results_lstm.append({subset_key: [calc_spread_return_sharpe(result, portfolio_size=200), np.sqrt(mean_squared_error(test_y, LSTM_test_pred))]})
    iter += 1
    
old_train_X.iloc[:, feat_cols]
old_test_X.iloc[:, feat_cols]

On iteration 1 of 7 : Running subset with components (features) 012
Epoch 1/10


  super().__init__(**kwargs)


[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 48ms/step - loss: 0.1971 - mean_squared_error: 0.1971
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 48ms/step - loss: 0.0024 - mean_squared_error: 0.0024
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 47ms/step - loss: 9.1162e-04 - mean_squared_error: 9.1162e-04
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 46ms/step - loss: 5.5094e-04 - mean_squared_error: 5.5094e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 48ms/step - loss: 5.2118e-04 - mean_squared_error: 5.2118e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 49ms/step - loss: 4.8790e-04 - mean_squared_error: 4.8790e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 48ms/step - loss: 4.8048e-04 - mean_squared_error: 4.8048e-04
Epoch 8/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━

  super().__init__(**kwargs)


[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 27ms/step - loss: 0.1390 - mean_squared_error: 0.1390
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - loss: 0.0015 - mean_squared_error: 0.0015
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - loss: 6.3373e-04 - mean_squared_error: 6.3373e-04
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - loss: 5.5322e-04 - mean_squared_error: 5.5322e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - loss: 5.2019e-04 - mean_squared_error: 5.2019e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - loss: 4.9213e-04 - mean_squared_error: 4.9213e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - loss: 4.8616e-04 - mean_squared_error: 4.8616e-04
Epoch 8/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━

  super().__init__(**kwargs)


[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 28ms/step - loss: 0.1820 - mean_squared_error: 0.1820
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - loss: 0.0029 - mean_squared_error: 0.0029
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - loss: 6.8194e-04 - mean_squared_error: 6.8194e-04
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - loss: 5.3756e-04 - mean_squared_error: 5.3756e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - loss: 4.9211e-04 - mean_squared_error: 4.9211e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - loss: 4.8180e-04 - mean_squared_error: 4.8180e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - loss: 4.7894e-04 - mean_squared_error: 4.7894e-04
Epoch 8/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━

  super().__init__(**kwargs)


[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 27ms/step - loss: 0.4559 - mean_squared_error: 0.4559
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - loss: 0.0182 - mean_squared_error: 0.0182
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - loss: 0.0033 - mean_squared_error: 0.0033
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - loss: 9.3103e-04 - mean_squared_error: 9.3103e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 27ms/step - loss: 6.1174e-04 - mean_squared_error: 6.1174e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - loss: 5.7143e-04 - mean_squared_error: 5.7143e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - loss: 5.4092e-04 - mean_squared_error: 5.4092e-04
Epoch 8/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

  super().__init__(**kwargs)


[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 0.1279 - mean_squared_error: 0.1279
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - loss: 0.0014 - mean_squared_error: 0.0014
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 5.0977e-04 - mean_squared_error: 5.0977e-04
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 4.7359e-04 - mean_squared_error: 4.7359e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 4.7506e-04 - mean_squared_error: 4.7506e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - loss: 4.7432e-04 - mean_squared_error: 4.7432e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 4.7379e-04 - mean_squared_error: 4.7379e-04
Epoch 8/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━

  super().__init__(**kwargs)


[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 0.1614 - mean_squared_error: 0.1614
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 0.0013 - mean_squared_error: 0.0013
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 0.0010 - mean_squared_error: 0.0010
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 9.4050e-04 - mean_squared_error: 9.4050e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 6.6999e-04 - mean_squared_error: 6.6999e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 6.5300e-04 - mean_squared_error: 6.5300e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 6.0390e-04 - mean_squared_error: 6.0390e-04
Epoch 8/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

  super().__init__(**kwargs)


[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 0.1257 - mean_squared_error: 0.1257
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - loss: 8.2586e-04 - mean_squared_error: 8.2586e-04
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - loss: 5.7696e-04 - mean_squared_error: 5.7696e-04
Epoch 4/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 5.5453e-04 - mean_squared_error: 5.5453e-04
Epoch 5/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 4.7274e-04 - mean_squared_error: 4.7274e-04
Epoch 6/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 4.7089e-04 - mean_squared_error: 4.7089e-04
Epoch 7/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 4.6959e-04 - mean_squared_error: 4.6960e-04
Epoch 8/10
[1m342/342[0m [32m━━━━━━━━━━━━

NameError: name 'feat_cols' is not defined

In [23]:
print(results_lstm)

[{'012': [0.07601781150093627, 0.02582681191079746]}, {'01': [-0.017890288109884064, 0.025852689769790566]}, {'02': [0.023358926049466256, 0.02582368339917325]}, {'12': [0.07993814977563883, 0.02642773473815849]}, {'0': [0.03193787186832697, 0.025816629933549995]}, {'1': [-0.10703956757924109, 0.026590215230674917]}, {'2': [0.01733447339342689, 0.025826951773888503]}]


We learn that LSTM performs the best when only component 1 is used, with a Sharpe Ratio of 0.12!