This notebook builds a simple model of stock returns based on past returns and volatilities. It was copied from https://www.kaggle.com/code/smeitoma/Train_Demo

Some ideas taken from https://www.kaggle.com/code/kotaromiura/jpx-lgbm-demo

In [None]:
import os
from decimal import ROUND_HALF_UP, Decimal

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from tqdm import tqdm
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Root directory of the competition input data (relative path).
base_dir = "../input/jpx-tokyo-stock-exchange-prediction"

# Sub-directories with the training files and the supplemental files.
train_files_dir = base_dir + "/train_files"
supplemental_files_dir = base_dir + "/supplemental_files"

# Generating AdjustedClose price

Calculates adjusted prices (AdClose, AdOpen, AdHigh, AdLow) to take stock splits and reverse splits into account. The adjustment has to be applied in reverse chronological order. For example, if a company splits its stock 5 for 1, the share price drops to 0.2 of the pre-split price while the company valuation stays the same. All earlier prices therefore have to be multiplied by 0.2 so that prices before and after the split are comparable (pears to pears, not orange juice to pears).

In [None]:
def adjust_price(price):
    """Add split-adjusted Close/Open/High/Low columns to a stock price table.

    Args:
        price (pd.DataFrame): stock prices with (at least) the columns Date
            (``%Y-%m-%d`` strings), SecuritiesCode, AdjustmentFactor, Open,
            High, Low and Close. The input frame is left untouched.
    Returns:
        price DataFrame (pd.DataFrame): new frame sorted by SecuritiesCode and
            Date, indexed by Date (datetime), with the additional columns
            CumulativeAdjustmentFactor, AdClose, AdOpen, AdHigh and AdLow.
    """
    def round_half_up(value):
        # round to one decimal place, halves rounded away from zero
        return float(
            Decimal(str(value)).quantize(Decimal("0.1"), rounding=ROUND_HALF_UP)
        )

    # ``assign`` returns a new frame, so the caller's data is not mutated and
    # the Date column really becomes datetime64 (a ``.loc[:, "Date"] = ...``
    # assignment keeps the object dtype on recent pandas versions).
    price = price.assign(Date=pd.to_datetime(price["Date"], format="%Y-%m-%d"))
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # Walk each security from its newest row to its oldest one so that the
    # cumulative product accumulates the adjustment factors backwards in time.
    # The result is aligned back to ``price`` through the (unique) row index.
    newest_first = price.iloc[::-1]
    price["CumulativeAdjustmentFactor"] = (
        newest_first.groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )

    adjusted_to_raw = {
        "AdClose": "Close",
        "AdOpen": "Open",
        "AdHigh": "High",
        "AdLow": "Low",
    }
    for adjusted_col, raw_col in adjusted_to_raw.items():
        adjusted = (price["CumulativeAdjustmentFactor"] * price[raw_col]).map(round_half_up)
        # a price of 0 is treated as missing ...
        adjusted = adjusted.mask(adjusted == 0)
        # ... and filled with the previous value of the same security
        price[adjusted_col] = adjusted.groupby(price["SecuritiesCode"]).ffill()

    price.set_index("Date", inplace=True)
    return price

In [None]:
# Read the supplemental prices and the main training prices, stack them
# (training rows first) and derive the adjusted price columns.
df_price_supplemental = pd.read_csv(f"{supplemental_files_dir}/stock_prices.csv")
df_price = adjust_price(
    pd.concat(
        [pd.read_csv(f"{train_files_dir}/stock_prices.csv"), df_price_supplemental]
    )
)

In [None]:
# show the first two rows of the adjusted price table
df_price.head(2)

In [None]:
# Adding 17SectorCode and 33SectorCode to df_price

lista = pd.read_csv(f"{base_dir}/stock_list.csv")
sectores = lista[["SecuritiesCode", "17SectorCode", "33SectorCode"]]

# Drop the rows whose 17- or 33-sector code is the placeholder '-'.
# Both conditions are evaluated on the same frame, so the mask always lines
# up with the rows it filters.
has_both_codes = (sectores["17SectorCode"] != "-") & (sectores["33SectorCode"] != "-")
sector = sectores.loc[has_both_codes]

# keep only the securities that appear in the price data
df = sector.loc[sector["SecuritiesCode"].isin(df_price["SecuritiesCode"])]

# Explicit join key: the merge no longer depends on which column names the
# two frames happen to share.
df_price = (
    df_price.reset_index()
    .merge(df, how="left", on="SecuritiesCode")
    .set_index("Date")
)

In [None]:
# drop the temporary frames created while attaching the sector codes
del lista, sectores, sector, df

# Pre-processing for model building

This notebook presents a simple model using LightGBM.

First, the features are generated using the price change and historical volatility described above. Changing the past periods to 1, 2 and 3 improves the model from the 99th percentile to the 67th percentile.

In [None]:
def calc_change_rate_base(price, column_name, periods):
    """Append percentage-change columns for several look-back periods.

    For every ``period`` in ``periods`` the column
    ``{column_name}_pct_{period}`` is written into ``price`` (in place); the
    same frame is returned.
    """
    source = price[column_name]
    for lookback in periods:
        price[f"{column_name}_pct_{lookback}"] = source.pct_change(lookback)
    return price

def calc_volatility_base(price, column_name, periods):
    """Append rolling standard deviations of the log returns.

    The log return is the first difference of ``log(price[column_name])``.
    For every ``period`` the column ``{column_name}_vol_{period}`` is written
    into ``price`` (in place); the same frame is returned.
    """
    log_returns = np.log(price[column_name]).diff()
    for window in periods:
        rolling = log_returns.rolling(window=window, min_periods=1)
        price[f"{column_name}_vol_{window}"] = rolling.std()
    return price

def calc_moving_average_rate_base(price, column_name, periods):
    """Append the ratio of the rolling mean to the current value.

    For every ``period`` the column ``{column_name}_mov_{period}`` is written
    into ``price`` (in place); the same frame is returned.
    """
    current = price[column_name]
    for window in periods:
        rolling_mean = current.rolling(window=window, min_periods=1).mean()
        price[f"{column_name}_mov_{window}"] = rolling_mean / current
    return price

In [None]:
def get_features_for_predict(price, code):
    """Build the feature table of a single security.

    Args:
        price (pd.DataFrame): stock prices including the adjusted price
            columns and the sector codes, indexed by Date
        code (int): A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): features of ``code`` indexed by
            Date; rows containing NaN are dropped and the adjusted price
            columns themselves are removed
    """
    close_col, open_col = "AdClose", "AdOpen"
    price_cols = ["AdHigh", "AdLow", open_col, close_col]
    wanted = ["SecuritiesCode", *price_cols, "17SectorCode", "33SectorCode"]

    is_code = price["SecuritiesCode"] == code
    feats = price.loc[is_code, wanted].copy()

    # return / volatility / moving-average features of the adjusted close
    periods = [10, 21, 63]
    for add_features in (
        calc_change_rate_base,
        calc_volatility_base,
        calc_moving_average_rate_base,
    ):
        feats = add_features(feats, close_col, periods)

    # 21-row rolling standard deviation of intraday range ratios
    high, low, close = feats["AdHigh"], feats["AdLow"], feats["AdClose"]
    feats["HLRolling"] = ((high - low) / low).rolling(21).std()
    feats["CLRolling"] = ((close - low) / low).rolling(21).std()
    feats["HCRolling"] = ((high - close) / close).rolling(21).std()

    feats = feats.dropna()

    # calendar features taken from the Date index
    feats = feats.reset_index()
    dates = feats["Date"].dt
    feats["month"] = dates.month
    feats["day"] = dates.day
    feats["dow"] = dates.dayofweek
    feats = feats.set_index("Date")

    # infinite values are replaced by 0
    feats = feats.replace([np.inf, -np.inf], 0)

    # the adjusted prices were only inputs; they are not features themselves
    return feats.drop(columns=price_cols)

In [None]:
# distinct SecuritiesCodes present in the price data, in ascending order
unique_codes = df_price["SecuritiesCode"].unique()
codes = sorted(unique_codes)
len(codes)

In [None]:
# build the per-security feature tables and stack them into one frame
buff = [get_features_for_predict(df_price, code) for code in tqdm(codes)]
feature = pd.concat(buff)

In [None]:
# show the first two rows of the combined feature table
feature.head(2)

In [None]:
# sector codes and calendar fields are stored as categorical columns
for categorical_col in ('17SectorCode', '33SectorCode', 'month', 'day', 'dow'):
    feature[categorical_col] = pd.Categorical(feature[categorical_col])

# Label creation

Next, we obtain the labels to be used for training the model (this is where we load and split the label data).

In [None]:
def get_label(price, code, threshold=0.01):
    """ Labelizer
    Args:
        price (pd.DataFrame): dataframe of stock_price.csv (needs the
            SecuritiesCode and Target columns)
        code (int): Local Code in the universe
        threshold (float): threshold handed to ``fixed_time_horizon``
    Returns:
        df (pd.DataFrame): label data for the rows of ``code`` only, as
            returned by ``fixed_time_horizon``
    """
    df = price.loc[price["SecuritiesCode"] == code]

    # Label this security's rows only. Passing the full ``price`` frame here
    # would return the labels of every security instead of ``code``.
    return fixed_time_horizon(df, threshold)

In [None]:
# Labels of security 1301 as a quick check.
# NOTE: get_label calls fixed_time_horizon, which is defined in a later cell;
# that cell has to be executed before this one.
sample_labels = get_label(df_price, 1301)
labels = sample_labels.dropna()
labels.head(5)

In [None]:
# Copied from https://quantdare.com/4-simple-ways-to-label-financial-data-for-machine-learning/

def fixed_time_horizon(price, threshold, name='Target'):
    """Fixed-time horizon labelling.

    Compute the financial labels using the fixed-time horizon procedure. See
    references to understand how this method works.

    Parameters
    ----------
    price : pandas.DataFrame
        The data from which the labels are to be calculated. It must contain
        a ``SecuritiesCode`` column and the column ``name``; the values in
        ``name`` should be returns and not prices.
    threshold : float
        The predefined constant threshold to compute the labels.
    name : str, optional, default: 'Target'
        Column to extract the labels from.

    Returns
    -------
    labs : pandas.DataFrame
        A pandas dataframe with the same index as ``price`` and the columns
        ``SecuritiesCode`` and ``label``. ``label`` is -1.0 where the return
        is below ``-threshold``, 1.0 where it is above ``threshold``, 0.0
        where its absolute value is at most ``threshold`` and NaN where the
        return is missing.

    References
    ----------
    .. [1] Marcos López de Prado (2018). Advances in Financial Machine Learning
       Wiley & Sons, Inc.

    .. [2] Marcos López de Prado - Machine Learning for Asset Managers.

    """
    returns = price[name]

    # Positional boolean masks: unlike assignment through index labels they
    # stay correct when the index contains repeated dates.
    below = (returns < -threshold).to_numpy()
    inside = (returns.abs() <= threshold).to_numpy()
    above = (returns > threshold).to_numpy()

    # NaN returns satisfy none of the conditions and keep the NaN default
    label = np.select([below, inside, above], [-1.0, 0.0, 1.0], default=np.nan)

    # the labels live in the lower-case 'label' column that callers read
    labs = pd.DataFrame(
        {"SecuritiesCode": price["SecuritiesCode"].to_numpy(), "label": label},
        index=price.index,
    )
    return labs

In [None]:
# split data into TRAIN and TEST
TRAIN_END = "2021-11-25"
# The rows between TRAIN_END and TEST_START (about three months) are left
# out of both sets, to avoid leakage of test data information from label
TEST_START = "2022-03-01"

def get_features_and_label(price, codes, features):
    """Split features and labels into a training and a test set.

    Args:
        price (pd.DataFrame): loaded price data
        codes  (array) : target codes
        features (pd.DataFrame): features of all codes, indexed by Date
    Returns:
        train_X (pd.DataFrame): training data (rows up to TRAIN_END)
        train_y (pd.DataFrame): label for train_X
        test_X (pd.DataFrame): test data (rows from TEST_START on)
        test_y (pd.DataFrame): label for test_X
    """
    train_parts_X, test_parts_X = [], []
    train_parts_y, test_parts_y = [], []

    for code in tqdm(codes):
        code_feats = features[features["SecuritiesCode"] == code].dropna()
        code_labels = get_label(price, code).dropna()

        # a code without features or without labels contributes nothing
        if code_feats.shape[0] == 0 or code_labels.shape[0] == 0:
            continue

        # keep only the dates present on both sides
        code_labels = code_labels.loc[code_labels.index.isin(code_feats.index)]
        code_feats = code_feats.loc[code_feats.index.isin(code_labels.index)]

        assert (code_labels.loc[:, "SecuritiesCode"] == code_feats.loc[:, "SecuritiesCode"]).all()
        label_series = code_labels.loc[:, "label"]

        # date-based split; the rows between TRAIN_END and TEST_START are unused
        fit_X, eval_X = code_feats[: TRAIN_END], code_feats[TEST_START:]
        fit_y, eval_y = label_series[: TRAIN_END], label_series[TEST_START:]

        assert len(fit_X) == len(fit_y)
        assert len(eval_X) == len(eval_y)

        train_parts_X.append(fit_X)
        test_parts_X.append(eval_X)
        train_parts_y.append(fit_y)
        test_parts_y.append(eval_y)

    # stack the per-code pieces
    train_X = pd.concat(train_parts_X)
    test_X = pd.concat(test_parts_X)
    train_y = pd.concat(train_parts_y)
    test_y = pd.concat(test_parts_y)

    return train_X, train_y, test_X, test_y

In [None]:
# generate feature/label
# (train/test features and labels for every code, split by TRAIN_END / TEST_START)
train_X, train_y, test_X, test_y = get_features_and_label(
    df_price, codes, feature
)

# show the first two rows of the test features
test_X.head(2)

# Building a simple model

Using the created features and labels, build a model using the following procedure

In [None]:
# Hyper-parameters handed to LGBMRegressor.
# ('learning_rate' used to be listed twice with the same value; one entry is enough.)
lgbm_params = {
    'seed': 42,
    'learning_rate': 0.001,
    'objective': 'regression',
    'metric': 'rmse',
    'lambda_l1': 0.01,
    'lambda_l2': 0.01,
    'num_leaves': 8,
    'max_depth': 3,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'min_child_samples': 10,
    'n_estimators': 500,
    'n_jobs': -1,
}

# Feature columns used for training and prediction.
# An earlier, immediately overwritten definition also listed AdOpen_pct_*
# columns; those are not produced by get_features_for_predict, so only the
# effective list is kept.
feat_cols = [
    "17SectorCode", "33SectorCode",
    "AdClose_pct_10", "AdClose_pct_21", "AdClose_pct_63",
    "AdClose_vol_10", "AdClose_vol_21", "AdClose_vol_63",
    "AdClose_mov_10", "AdClose_mov_21", "AdClose_mov_63",
    "HLRolling", "CLRolling", "HCRolling",
    "month", "day", "dow",
]

In [None]:
# initialize model
pred_model = LGBMRegressor(**lgbm_params)

# train
# Fit on the DataFrame itself rather than on ``.values``: the categorical
# columns keep their dtype and training receives the same kind of input
# (same columns, same dtypes) that predict() gets below.
pred_model.fit(train_X[feat_cols], train_y)

# prepare result data
result = test_X[["SecuritiesCode"]].copy()

# predict
result.loc[:, "predict"] = pred_model.predict(test_X[feat_cols])

# actual result
# NOTE(review): test_y holds the -1/0/1 labels produced by get_label, not the
# raw Target returns -- confirm that this is what the evaluation should use.
result.loc[:, "Target"] = test_y.values

def set_rank(df):
    """Rank the rows of ``df`` by descending prediction.

    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): new frame sorted by ``predict`` (highest first)
            with a ``Rank`` column counting up from 0
    """
    by_prediction = df.sort_values("predict", ascending=False)
    return by_prediction.assign(Rank=np.arange(len(by_prediction)))

result = result.sort_values(["Date", "predict"], ascending=[True, False])
# group_keys=False keeps the plain Date index. Without it, recent pandas
# versions prepend the group key, which produces a second index level that is
# also named "Date" and breaks the later groupby('Date') calls.
result = result.groupby("Date", group_keys=False).apply(set_rank)

In [None]:
# pair every feature name with the importance reported by the fitted model
feature_imp = pd.DataFrame([train_X[feat_cols].columns, pred_model.feature_importances_]).T
feature_imp.columns = ['feature', 'imp']
# keep the 12 features with the highest importance
feature_imp.sort_values(by='imp', ascending=False, inplace=True)
feature_imp = feature_imp.iloc[0:12]
# convert the importance column to a numeric dtype before plotting
feature_imp['imp'] = pd.to_numeric(feature_imp['imp'])
# horizontal bar chart: importance on x, feature name on y
sns.barplot(x = 'imp', y = 'feature', data = feature_imp)
plt.show()

# Evaluation

Input the output of the forecasts of the constructed model into the evaluation function and plot the daily returns.

The evaluation function for this competition is as follows.

Please read [here](https://www.kaggle.com/code/smeitoma/jpx-competition-metric-definition) to know the evaluation function more.

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio of the daily spread return of a ranked prediction.

    Args:
        df (pd.DataFrame): predicted results; needs 'Date' (column or index
            level), 'Rank' and 'Target'
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard
            deviation
    """
    def _daily_spread(day, size, ratio):
        """Spread return of one day.

        Args:
            day (pd.DataFrame): predicted results of a single date
            size (int): # of equities to buy/sell
            ratio (float): weight of the best ranked stock relative to the
                last one in each leg
        Returns:
            (float): weighted return of the top ``size`` ranks minus the
                weighted return of the bottom ``size`` ranks
        """
        # ranks have to start at 0 and end at (number of rows - 1)
        assert day['Rank'].min() == 0
        assert day['Rank'].max() == len(day['Rank']) - 1

        # linearly decreasing weights from ``ratio`` down to 1
        weights = np.linspace(start=ratio, stop=1, num=size)
        scale = weights.mean()

        top_targets = day.sort_values(by='Rank')['Target'].iloc[:size]
        bottom_targets = day.sort_values(by='Rank', ascending=False)['Target'].iloc[:size]

        long_leg = (top_targets * weights).sum() / scale
        short_leg = (bottom_targets * weights).sum() / scale
        return long_leg - short_leg

    daily_spreads = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spreads.mean() / daily_spreads.std()

In [None]:
# calc spread return sharpe
# (Sharpe ratio of the daily spread returns with 200 stocks per leg)
calc_spread_return_sharpe(result, portfolio_size = 200)

Then, we will show daily spread return of the model.

In [None]:
def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """Spread return of a single day.

    Args:
        df (pd.DataFrame): predicted results of one date ('Rank' and 'Target')
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): weighted return of the top ranks minus the weighted return
            of the bottom ranks
    """
    ranks = df['Rank']
    # ranks have to start at 0 and end at (number of rows - 1)
    assert ranks.min() == 0
    assert ranks.max() == len(ranks) - 1

    # linearly decreasing weights from toprank_weight_ratio down to 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    scale = weights.mean()

    top_targets = df.sort_values(by='Rank')['Target'].iloc[:portfolio_size]
    bottom_targets = df.sort_values(by='Rank', ascending=False)['Target'].iloc[:portfolio_size]

    purchase = (top_targets * weights).sum() / scale
    short = (bottom_targets * weights).sum() / scale
    return purchase - short

# daily spread return with 200 stocks per leg and a top-rank weight ratio of 2
df_result = result.groupby('Date').apply(_calc_spread_return_per_day, 200, 2)

In [None]:
# plot the daily spread returns
df_result.plot(figsize=(16, 8))

# Saving model

You need to save your model parameters in order to use the trained model for your submission.

In [None]:
# write the fitted booster to a text file in the working directory
pred_model.booster_.save_model("./simple-model.txt")

We have created our simple model and evaluated its output.

As a next action, let's submit the model to the leaderboard.

[The Submission Demo notebook](https://www.kaggle.com/code/smeitoma/submission-demo) will explain how to submit with your model.