In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data Processing

Most of the code follows the template in the [Competition's Train Demo Notebook](https://www.kaggle.com/code/smeitoma/train-demo/notebook), but instead of the demo's LGBM model this notebook uses an **LSTM model** to predict stock prices, alongside Lasso, linear-regression and XGBoost baselines for comparison.

The function below computes an adjusted close price from the raw stock price data.

We import the code necessary for the LSTM Model.

We load the data, adjust closing prices, and view the resulting data.

In [None]:
from decimal import ROUND_HALF_UP, Decimal

def adjust_price(price):
    """
    Add a cumulative adjustment factor and an adjusted close price per security.

    Args:
        price (pd.DataFrame)  : stock_price data; must contain the columns
            "Date" (formatted %Y-%m-%d), "SecuritiesCode", "Close" and
            "AdjustmentFactor". The frame passed in is not modified.
    Returns:
        price DataFrame (pd.DataFrame): a new frame ordered by SecuritiesCode
            then Date, indexed by Date, with "CumulativeAdjustmentFactor" and
            "AdjustedClose" appended.
    """
    def _round_half_up(value):
        # Decimal gives explicit half-up rounding to one decimal place
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    # work on a copy so the caller's DataFrame keeps its original Date column
    price = price.copy()
    # transform Date column into datetime
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # CumulativeAdjustmentFactor is the running product of AdjustmentFactor taken
    # from the newest row back to the oldest one, separately for each security;
    # iterating the reversed frame gives that order and the result re-aligns on
    # the row labels when assigned back
    price["CumulativeAdjustmentFactor"] = (
        price.iloc[::-1].groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )
    # generate AdjustedClose
    price["AdjustedClose"] = (
        price["CumulativeAdjustmentFactor"] * price["Close"]
    ).map(_round_half_up)
    # to fill AdjustedClose, replace 0 into np.nan
    price.loc[price["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
    # forward fill AdjustedClose inside each security only, never across securities
    price["AdjustedClose"] = price.groupby("SecuritiesCode")["AdjustedClose"].ffill()

    return price.set_index("Date")

In [None]:
# Load stock price data
# Reads the competition's training stock_prices.csv into df_price; the supplemental
# file is left disabled below.
df_price = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
# df_supp =  pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")

In [None]:
# Replace the raw frame with its adjusted, Date-indexed version.
# NOTE(review): this rebinding is not re-runnable on its own - adjust_price reads a
# "Date" column, while the frame it returns carries Date as the index - so reload the
# CSV before executing this cell a second time.
df_price = adjust_price(df_price)
# df_supp = adjust_price(df_supp)
df_price.info()

In [None]:
# Show the first and last two rows of the adjusted price table.
display(df_price.head(2))
display(df_price.tail(2))

The following function performs the **feature engineering**: for a single security it derives the model features from the stock price data.

In [None]:
def get_features_for_predict(price, code):
    """
    Build the feature table for one security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must contain
            "SecuritiesCode", "AdjustedClose", "ExpectedDividend", "High" and "Low"
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): one row per input row of `code`, same
            index, with NaN and +/-inf replaced by 0 and the AdjustedClose column
            removed. Columns: SecuritiesCode, ExpectedDividend (0/1 flag), High,
            Low, return_*, volatility_*, Amplitude, RSI, High52, BIAS10.
    """
    close_col = "AdjustedClose"
    feats = price.loc[
        price["SecuritiesCode"] == code,
        ["SecuritiesCode", close_col, "ExpectedDividend", "High", "Low"],
    ].copy()
    close = feats[close_col]

    # returns over 1, 10, 21 and 63 rows, computed on AdjustedClose
    for name, periods in (
        ("return_1day", 1),
        ("return_2week", 10),
        ("return_1month", 21),
        ("return_3month", 63),
    ):
        feats[name] = close.pct_change(periods)

    # historical volatility: rolling std of the log returns (computed once)
    log_return = np.log(close).diff()
    for name, window in (
        ("volatility_2week", 10),
        ("volatility_1month", 21),
        ("volatility_3month", 63),
    ):
        feats[name] = log_return.rolling(window).std()

    # ExpectedDividend: any positive value becomes the flag 1
    feats["ExpectedDividend"] = feats["ExpectedDividend"].mask(feats["ExpectedDividend"] > 0, 1)

    # Amplitude
    feats["Amplitude"] = feats["High"] - feats["Low"]

    # RSI: share of upward moves in the exponentially weighted moves (span 10).
    # The first difference is NaN and must count as "no move" (0), hence fillna.
    change = close.diff()
    gains = change.clip(lower=0).fillna(0)
    losses = (-change).clip(lower=0).fillna(0)
    ema_gain = gains.ewm(span=10, adjust=False).mean()
    ema_loss = losses.ewm(span=10, adjust=False).mean()
    # columns are assigned directly instead of index-merged: a merge on the index
    # would multiply rows whenever an index label repeats
    feats["RSI"] = ema_gain / (ema_gain + ema_loss) * 100

    # 52 Week High: close relative to the maximum High of the last 250 rows
    feats["High52"] = close / feats["High"].rolling(250).max()

    # BIAS: relative distance of the close from its 10-row moving average
    moving_avg = close.rolling(10).mean()
    feats["BIAS10"] = (close - moving_avg) / moving_avg

    # filling data for nan and inf
    feats = feats.fillna(0).replace([np.inf, -np.inf], 0)
    # drop AdjustedClose column
    return feats.drop([close_col], axis=1)

In [None]:
# fetch prediction target SecuritiesCodes
# There are 2000 codes
# (sorted unique values of the SecuritiesCode column; the count is shown as the cell output)
codes = sorted(df_price["SecuritiesCode"].unique())
len(codes)

In [None]:
from tqdm import tqdm
# Build one feature frame per security, then stack them into a single table.
buff = [get_features_for_predict(df_price, code) for code in tqdm(codes)]
feature = pd.concat(buff)

In [None]:
# Show the first and last two rows of the stacked feature table.
display(feature.head(2))
display(feature.tail(2))

We define a function to obtain labels for the data.

In [None]:
def get_label(price, code):
    """ Labelizer: expose the Target column of one security under the name "label".
    Args:
        price (pd.DataFrame): dataframe of stock_price.csv
        code (int): Local Code in the universe
    Returns:
        df (pd.DataFrame): rows of `code` with the columns SecuritiesCode and label
    """
    is_code = price["SecuritiesCode"] == code
    selected = price.loc[is_code, ["SecuritiesCode", "Target"]]
    # rename returns a new frame, so the input is never touched
    return selected.rename(columns={"Target": "label"})

We split the data into **Train** and **Test** sets. This can also be updated to obtain **Validation** sets later on.

In [None]:
# Boundaries of the chronological TRAIN / TEST split
TRAIN_END = "2019-12-31"
# TEST starts a few days after TRAIN_END; the gap is meant to keep
# label information of the test period out of the training set
TEST_START = "2020-01-06"

def get_features_and_label(price, codes, features):
    """
    Pair features with labels per security and split them at TRAIN_END / TEST_START.

    Args:
        price (pd.DataFrame): loaded price data
        codes  (array) : target codes
        features (pd.DataFrame): features
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    train_feature_parts, train_label_parts = [], []
    test_feature_parts, test_label_parts = [], []

    for code in tqdm(codes):
        code_feats = features[features["SecuritiesCode"] == code].dropna()
        code_labels = get_label(price, code).dropna()

        # a security without usable features or labels contributes nothing
        if code_feats.shape[0] == 0 or code_labels.shape[0] == 0:
            continue

        # keep only the index values present on both sides
        code_labels = code_labels.loc[code_labels.index.isin(code_feats.index)]
        code_feats = code_feats.loc[code_feats.index.isin(code_labels.index)]

        assert (code_labels.loc[:, "SecuritiesCode"] == code_feats.loc[:, "SecuritiesCode"]).all()
        code_labels = code_labels.loc[:, "label"]

        # label-based slices on the index: up to TRAIN_END, and from TEST_START on
        head_X, tail_X = code_feats[: TRAIN_END], code_feats[TEST_START:]
        head_y, tail_y = code_labels[: TRAIN_END], code_labels[TEST_START:]

        assert len(head_X) == len(head_y)
        assert len(tail_X) == len(tail_y)

        train_feature_parts.append(head_X)
        train_label_parts.append(head_y)
        test_feature_parts.append(tail_X)
        test_label_parts.append(tail_y)

    # stack the per-security pieces in the order of `codes`
    train_X = pd.concat(train_feature_parts)
    test_X = pd.concat(test_feature_parts)
    train_y = pd.concat(train_label_parts)
    test_y = pd.concat(test_label_parts)

    return train_X, train_y, test_X, test_y

In [None]:
# generate feature/label
# Produces the TRAIN/TEST feature frames and label series for every code in `codes`.
train_X, train_y, test_X, test_y = get_features_and_label(
    df_price, codes, feature
)

In [None]:
def _attach_raw_prices(price_cols, feats, target):
    """
    Join Open/Close/Volume onto a feature frame and split the target off again.

    Args:
        price_cols (pd.DataFrame): Date, SecuritiesCode, Open, Close, Volume columns
        feats (pd.DataFrame): Date-indexed features containing SecuritiesCode
        target (pd.Series): labels in the same row order as `feats`
    Returns:
        (pd.DataFrame, pd.Series): merged features without rows holding NaN, and
            the matching Target values
    """
    # The Date index repeats across securities, so the target is attached by
    # position (both objects were built in the same row order) rather than by label.
    with_target = feats.assign(Target=target.to_numpy()).reset_index()
    merged = price_cols.merge(
        with_target, on=["Date", "SecuritiesCode"], how="right"
    ).dropna()
    return merged.drop(columns="Target"), merged["Target"]


# Date has to be a regular column to serve as a merge key.
df_price = df_price.reset_index()
raw_price_cols = df_price[["Date", "SecuritiesCode", "Open", "Close", "Volume"]]

train_X, train_y = _attach_raw_prices(raw_price_cols, train_X, train_y)
test_X, test_y = _attach_raw_prices(raw_price_cols, test_X, test_y)

In [None]:
# Move Date back into the index of both feature tables.
train_X = train_X.set_index('Date')
test_X = test_X.set_index('Date')

In [None]:
# Keep the full tables (they still hold SecuritiesCode, needed for evaluation).
old_test_X = test_X
old_train_X = train_X
# Positions of every column except the SecuritiesCode identifier. Deriving them
# from the column names avoids a hard-coded column count that breaks as soon as
# a feature is added or removed.
feat_cols = [pos for pos, col in enumerate(train_X.columns) if col != "SecuritiesCode"]
train_X = train_X.iloc[:, feat_cols]
test_X = test_X.iloc[:, feat_cols]

In [None]:
# Inspect the selected training feature columns.
train_X

In [None]:
# NumPy views of the feature tables and label series for the estimators below.
X_train, X_test = train_X.to_numpy(), test_X.to_numpy()
y_train, y_test = train_y.to_numpy(), test_y.to_numpy()

In [None]:
# Report the array shapes, one line per array.
for caption, array in (
    ("train_X has shape", X_train),
    ("train_y has shape", y_train),
    ("test_X has shape", X_test),
    ("test_y has shape", y_test),
):
    print(caption, array.shape)

# 2. Training

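We first look at each feature against the target and fit Lasso, linear-regression and XGBoost baselines; then we create the LSTM model, which is where the neural network's structure and parameters can be designed.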

In [None]:
import matplotlib.pyplot as plt

# One scatter plot per feature column: feature value against the training target.
for col_pos, col_name in enumerate(train_X.columns):
    plt.scatter(X_train[:, col_pos], train_y)
    plt.xlabel(col_name)
    plt.ylabel('y_train')
    plt.show()

In [None]:
from sklearn.linear_model import LassoCV, Lasso, LinearRegression

# Candidate regularisation strengths: 100 values, log-spaced from 1e-8 to 10**-0.5.
reg_params = 10**(np.linspace(-8, -0.5, 100))
num_params = len(reg_params)

# Lasso with the penalty chosen by 5-fold cross-validation over reg_params.
model_lasso = LassoCV(n_alphas = num_params, alphas = reg_params,
                      cv = 5, random_state = 0)
model_lasso.fit(X_train, y_train)

# Unregularised least-squares baseline.
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

def _rmse(actual, predicted):
    """Root of the mean squared error between two arrays."""
    return np.sqrt(mean_squared_error(actual, predicted))


best_lasso_alpha = model_lasso.alpha_

# predictions on both splits
lasso_train_pred = model_lasso.predict(X_train)
lasso_test_pred = model_lasso.predict(X_test)
LR_train_pred = model_LR.predict(X_train)
LR_test_pred = model_LR.predict(X_test)

# R^2 as reported by the estimators' score()
lasso_train_r2 = model_lasso.score(X_train, y_train)
lasso_test_r2= model_lasso.score(X_test, y_test)
LR_train_r2 = model_LR.score(X_train, y_train)
LR_test_r2 = model_LR.score(X_test, y_test)

# RMSE of the predictions above
train_lasso_rmse = _rmse(y_train, lasso_train_pred)
test_lasso_rmse = _rmse(y_test, lasso_test_pred)
train_LR_rmse = _rmse(y_train, LR_train_pred)
test_LR_rmse = _rmse(y_test, LR_test_pred)

In [None]:
# Report R^2 and RMSE on both splits; the Lasso lines also show the selected alpha.
# All numbers are printed in scientific notation with two decimals.
print(f"Lasso training R^2 for, {best_lasso_alpha:0.2e}, is: {lasso_train_r2:0.2e}")
print(f"Lasso training RMSE for, {best_lasso_alpha:0.2e}, is: {train_lasso_rmse:0.2e}")
print(f"Lasso test R^2 for, {best_lasso_alpha:0.2e}, is: {lasso_test_r2:0.2e}")
print(f"Lasso test RMSE for, {best_lasso_alpha:0.2e}, is: {test_lasso_rmse:0.2e}")
print(f"LR training R^2 is: {LR_train_r2:0.2e}")
print(f"LR training RMSE is: {train_LR_rmse:0.2e}")
print(f"LR test R^2 is: {LR_test_r2:0.2e}")
print(f"LR test RMSE is: {test_LR_rmse:0.2e}")

In [None]:
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 2 x 4 x 4 combinations.
cv_params = dict(
    n_estimators = [100, 200],
    max_depth = np.linspace(2, 10, 4, dtype = int),
    learning_rate = 10**(np.linspace(-4, 0, 4)),
)

xgb_model = XGBRegressor()

# Exhaustive search over the grid; error_score='raise' surfaces a failing fit
# immediately instead of recording it as a score.
xgb_gs = GridSearchCV(estimator = xgb_model, param_grid = cv_params, error_score = 'raise')
xgb_gs.fit(X_train, y_train)

In [None]:
# Winning parameter combination of the grid search.
best_params = xgb_gs.best_params_

# Predictions and score() on both splits.
train_xgb_pred, test_xgb_pred = xgb_gs.predict(X_train), xgb_gs.predict(X_test)
train_xgb_r2, test_xgb_r2 = xgb_gs.score(X_train, y_train), xgb_gs.score(X_test, y_test)

# RMSE of the predictions above.
train_xgb_rmse = np.sqrt(mean_squared_error(y_train, train_xgb_pred))
test_xgb_rmse = np.sqrt(mean_squared_error(y_test, test_xgb_pred))

In [None]:
# Report the selected XGBoost parameters with R^2 and RMSE on both splits
# (scientific notation, two decimals).
print(f"The best parameters for XGBRegressor is: {best_params}")
print(f"XGBoost training R^2 is: {train_xgb_r2:0.2e}")
print(f"XGBoost training RMSE is: {train_xgb_rmse:0.2e}")
print(f"XGBoost test R^2 is: {test_xgb_r2:0.2e}")
print(f"XGBoost test RMSE is: {test_xgb_rmse:0.2e}")

In [None]:
def set_rank(df):
    """
    Order rows by descending prediction and number them.

    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): a sorted copy of df with a Rank column, where
            Rank 0 is the row with the highest Predict value
    """
    ranked = df.sort_values(by="Predict", ascending=False)
    n_rows = len(ranked["Predict"])
    # consecutive ranks 0 .. n_rows - 1 in sorted order
    ranked.loc[:, "Rank"] = np.arange(n_rows)
    return ranked

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return implied by a ranking.

    Args:
        df (pd.DataFrame): predicted results with the columns Date, Rank and Target
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _daily_spread(day, size, top_ratio):
        """
        Spread return of a single day.

        Args:
            day (pd.DataFrame): predicted results of one Date
            size (int): # of equities to buy/sell
            top_ratio (float): weight of the best-ranked stock relative to the last one.
        Returns:
            (float): weighted Target of the `size` best ranks minus that of the `size` worst ranks
        """
        ranks = day['Rank']
        # ranks must run from 0 to n - 1 within the day
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        # linearly decreasing weights from top_ratio down to 1
        weights = np.linspace(start=top_ratio, stop=1, num=size)
        best_first = day.sort_values(by='Rank')
        worst_first = day.sort_values(by='Rank', ascending=False)
        long_leg = (best_first['Target'][:size] * weights).sum() / weights.mean()
        short_leg = (worst_first['Target'][:size] * weights).sum() / weights.mean()
        return long_leg - short_leg

    daily_spread = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

In [None]:
# Evaluation table for the Lasso model: one row per test observation with the
# prediction and the realised target, indexed by Date.
result = old_test_X[["SecuritiesCode"]].copy()
result.loc[:, "Predict"] = lasso_test_pred
result.loc[:, 'Target'] = test_y.values

result = result.sort_values(["Date", "Predict"], ascending=[True, False])
# group_keys=False keeps the plain Date index: without it, recent pandas versions
# prepend the group key and produce a (Date, Date) MultiIndex that the later
# groupby('Date') in calc_spread_return_sharpe cannot resolve.
result = result.groupby("Date", group_keys=False).apply(set_rank)

In [None]:
# Sharpe ratio of the daily spread return for the ranking currently held in `result`.
print(calc_spread_return_sharpe(result, portfolio_size=200))

In [None]:
# Evaluation table for the tuned XGBoost model: one row per test observation with
# the prediction and the realised target, indexed by Date.
result = old_test_X[["SecuritiesCode"]].copy()
result.loc[:, "Predict"] = xgb_gs.predict(X_test)
result.loc[:, 'Target'] = test_y.values

result = result.sort_values(["Date", "Predict"], ascending=[True, False])
# group_keys=False keeps the plain Date index: without it, recent pandas versions
# prepend the group key and produce a (Date, Date) MultiIndex that the later
# groupby('Date') in calc_spread_return_sharpe cannot resolve.
result = result.groupby("Date", group_keys=False).apply(set_rank)

In [None]:
# Sharpe ratio of the daily spread return for the ranking currently held in `result`.
print(calc_spread_return_sharpe(result, portfolio_size=200))

In [None]:
# Add a trailing axis of length 1 so each sample becomes (n_features, 1), the
# 3-D layout fed to the LSTM below. Converting to an ndarray first avoids asking
# NumPy to wrap a 3-D result back into a DataFrame, and lets this cell be
# re-run on its own output.
train_X = np.asarray(train_X)
train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
test_X = np.asarray(test_X)
test_X = test_X.reshape(test_X.shape[0], test_X.shape[1], 1)

In [None]:
# Confirm the shapes after the reshape for the LSTM.
print("After Expanding Dimensions for LSTM:")
print("train_X has shape", train_X.shape)
print("test_X has shape", test_X.shape)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam

# Sequence length seen by the LSTM: one step per feature column. Taking it from
# the data keeps the model in sync with the feature selection instead of relying
# on a hard-coded length.
n_steps = X_train.shape[1]

# Single LSTM layer -> dropout -> one linear output unit (regression).
model = Sequential()
model.add(LSTM(units = 64,input_shape = (n_steps, 1)))
model.add(Dropout(0.4))
model.add(Dense(1))

model.compile(loss = 'mse',optimizer = 'adam', metrics = ['mean_squared_error'])

# Alternative stacked architecture, kept for reference:
# model.add(LSTM(50, return_sequences=True, input_shape = (n_steps, 1)))
# model.add(LSTM(50, return_sequences=False))
# model.add(Dense(25))
# model.add(Dense(1))
# model.compile(optimizer="adam", loss="mean_squared_error")

model.summary()

In [None]:
# Train for 10 epochs on the reshaped training array in batches of 4096 rows.
model.fit(train_X, train_y, batch_size = 4096, epochs = 10)

In [None]:
# LSTM predictions for the reshaped test array
LSTM_test_pred = model.predict(test_X)

In [None]:
# Evaluation table for the LSTM: one row per test observation with the
# prediction and the realised target, indexed by Date.
result = old_test_X[["SecuritiesCode"]].copy()
# the network has a single output unit, so predictions arrive with a trailing
# axis of length 1; flatten them to one value per row
result.loc[:, "Predict"] = np.asarray(LSTM_test_pred).ravel()
result.loc[:, 'Target'] = test_y.values

result = result.sort_values(["Date", "Predict"], ascending=[True, False])
# group_keys=False keeps the plain Date index: without it, recent pandas versions
# prepend the group key and produce a (Date, Date) MultiIndex that the later
# groupby('Date') in calc_spread_return_sharpe cannot resolve.
result = result.groupby("Date", group_keys=False).apply(set_rank)

In [None]:
# Sharpe ratio of the daily spread return for the ranking currently held in `result`.
print(calc_spread_return_sharpe(result, portfolio_size=200))

In [None]:
# Training feature columns that were selected through feat_cols.
old_train_X.iloc[:, feat_cols]

In [None]:
# Test feature columns that were selected through feat_cols.
old_test_X.iloc[:, feat_cols]

# Open, Close, Volume, ExpectedDividend, High, Low:

Lasso: 0.06492441599033187

XGBoost: 0.08032163291177492

LSTM: 0.056728157347210165

# Open Close Volume ExpectedDividend High Low:

Lasso: 

XGBoost: 

LSTM: 

# 