In [None]:
# Copy the bundled `ta` package sources from the input dataset into the current directory.
%cp -r ../input/TA-package/ta-package ./

In [None]:
# Install ta 0.10.1 from the local copy, then change to /kaggle/working/.
%cd ./ta-package/ta-0.10.1/ta-0.10.1
!python setup.py install
%cd /kaggle/working/

# Intro

In this notebook, I load some external features from the TA package and feed them to a vanilla LightGBM model.

In [None]:
import os
import gc
from decimal import ROUND_HALF_UP, Decimal

import numpy as np
import pandas as pd
import pickle
from lightgbm import LGBMRegressor
from tqdm import tqdm

from sklearn.model_selection import GridSearchCV  # Perforing grid search
from sklearn.model_selection import train_test_split

import ta
from ta import add_all_ta_features
from ta.utils import dropna

import matplotlib.pyplot as plt
import matplotlib as mpl


import copy
from pathlib import Path
import warnings

import warnings
from sklearn.exceptions import DataConversionWarning

from datetime import datetime
from datetime import timedelta

warnings.filterwarnings(action='ignore', category=UserWarning)

mpl.style.use('seaborn')

from sklearn.preprocessing import OrdinalEncoder

In [None]:
def adjust_price(price):
    """
    Add split-adjusted closing prices to a stock price table.

    The input frame is not modified; all work happens on a copy.

    Args:
        price (pd.DataFrame): stock prices indexed by date. The columns
            SecuritiesCode, Close, AdjustmentFactor and SupervisionFlag are read.
    Returns:
        pd.DataFrame: new frame ordered by SecuritiesCode and date, indexed by
            date (index name "Date"), with CumulativeAdjustmentFactor and
            AdjustedClose added, SupervisionFlag cast to int and every
            remaining NaN replaced by 0.
    """
    # Work on a copy so the caller's frame does not silently gain a "Date2" column.
    price = price.copy()
    price["Date2"] = price.index
    # A unique positional index lets the per-code results below align row by row.
    price = price.sort_values(["SecuritiesCode", "Date2"]).reset_index(drop=True)

    # CumulativeAdjustmentFactor: product of the factors from the newest row
    # back to (and including) the current row, computed per security.
    newest_first = price.iloc[::-1]
    price["CumulativeAdjustmentFactor"] = (
        newest_first.groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )

    # AdjustedClose rounded half-up to one decimal place.
    price["AdjustedClose"] = (
        price["CumulativeAdjustmentFactor"] * price["Close"]
    ).map(lambda x: float(
        Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
    ))

    # Treat 0 as missing, then carry the last known value forward per security.
    price.loc[price["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
    price["AdjustedClose"] = price.groupby("SecuritiesCode")["AdjustedClose"].ffill()

    # Cast the flag to int, then fill whatever is still missing with 0.
    price['SupervisionFlag'] = price['SupervisionFlag'].astype(int)
    price = price.fillna(0)

    price = price.set_index("Date2")
    price.index.name = "Date"
    return price

In [None]:
def get_features_for_predict(price, code):
    """
    Build the technical-indicator features for one security.

    Args:
        price (pd.DataFrame)  : stock prices including an AdjustedClose column
        code (int)  : SecuritiesCode whose rows are selected
    Returns:
        pd.DataFrame: the selected rows with momentum_rsi, trend_macd and
            trend_kst added, NaN/inf replaced by 0 and AdjustedClose dropped
    """
    close_col = "AdjustedClose"
    selected = price.loc[price["SecuritiesCode"] == code].copy()
    close = selected[close_col]

    # Three indicators from the `ta` package, all computed on the adjusted close.
    selected['momentum_rsi'] = ta.momentum.RSIIndicator(close, fillna = True).rsi()
    selected['trend_macd'] = ta.trend.MACD(close, fillna = True).macd()
    selected['trend_kst'] = ta.trend.kst(close, fillna = True)

    # Replace NaN first, then +/-inf, and finally drop the adjusted close column.
    cleaned = selected.fillna(0).replace([np.inf, -np.inf], 0)
    return cleaned.drop([close_col], axis=1)

In [None]:
# def get_label(price, code):
#     """ Labelizer
#     Args:
#         price (pd.DataFrame): dataframe of stock_price.csv
#         code (int): Local Code in the universe
#     Returns:
#         df (pd.DataFrame): label data
#     """
#     df = price.loc[price["SecuritiesCode"] == code].copy()
#     df.loc[:, "label"] = df["Target"]

#     return df.loc[:, ["SecuritiesCode", "label"]]

In [None]:
# split data into TRAIN and TEST
# TRAIN_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
# TEST_START = "2020-01-06"

def split_train_test(codes, features, train_end="2019-12-31",
                     val_start="2020-01-06", test_start="2021-10-01"):
    """
    Split the feature table into train / validation / test by date.

    Args:
        codes  (array) : SecuritiesCode values to keep
        features (pd.DataFrame): features indexed by date, with a
            SecuritiesCode column
        train_end (str): last date (inclusive) of the training period
        val_start (str): first date (inclusive) of the validation period;
            the default leaves a gap after train_end
        test_start (str): first date (inclusive) of the test period, which is
            also the exclusive end of the validation period
    Returns:
        trains (pd.DataFrame): rows with date <= train_end
        vals (pd.DataFrame): rows with val_start <= date < test_start
        tests (pd.DataFrame): rows with date >= test_start
        Each frame is sorted by date; rows sharing a date keep their input order.
    """
    # One membership mask instead of one full-frame scan per code.
    selected = features[features["SecuritiesCode"].isin(list(codes))]
    dates = selected.index

    trains = selected[dates <= train_end]
    vals = selected[(dates >= val_start) & (dates < test_start)]
    tests = selected[dates >= test_start]

    # mergesort is stable, so the within-date order is deterministic.
    trains = trains.sort_index(ascending=True, kind="mergesort")
    vals = vals.sort_index(ascending=True, kind="mergesort")
    tests = tests.sort_index(ascending=True, kind="mergesort")
    return trains, vals, tests

In [None]:
def set_rank(df):
    """
    Rank rows by their prediction, best first.

    Args:
        df (pd.DataFrame): frame with a "predict" column
    Returns:
        pd.DataFrame: rows sorted by "predict" in descending order with a
            "Rank" column counting up from 0
    """
    ranked = df.sort_values(by="predict", ascending=False)
    # Positions in the sorted order are the ranks: 0 for the highest prediction.
    return ranked.assign(Rank=np.arange(len(ranked["predict"])))

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predictions with 'Date', 'Rank' and 'Target'
        portfolio_size (int): number of equities bought and number sold per day
        toprank_weight_ratio (float): weight of the best-ranked stock relative
            to the last stock in the portfolio
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Spread return for one day.

        Args:
            df (pd.DataFrame): that day's predictions
            portfolio_size (int): number of equities bought and number sold
            toprank_weight_ratio (float): weight of the top position relative to the last
        Returns:
            (float): weighted return of the top ranks minus that of the bottom ranks
        """
        ranks = df['Rank']
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        # Linearly decreasing weights from toprank_weight_ratio down to 1.
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        weight_mean = weights.mean()

        top_targets = df.sort_values(by='Rank')['Target'][:portfolio_size]
        bottom_targets = df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size]

        purchase = (top_targets * weights).sum() / weight_mean
        short = (bottom_targets * weights).sum() / weight_mean
        return purchase - short

    daily_spread = df.groupby('Date').apply(
        _calc_spread_return_per_day, portfolio_size, toprank_weight_ratio
    )
    return daily_spread.mean() / daily_spread.std()

In [None]:
def cal_log_return(df,codes):
    """
    Add the day-over-day log return (in percent) of Close for each security.

    The input frame is not modified.

    Args:
        df(pd.DataFrame): price rows with SecuritiesCode and Close columns,
            ordered chronologically within each security
        codes(list): securities codes to keep; output rows are grouped in this order
    Returns:
        (pd.DataFrame): the selected rows with a 'return' column. The first row
            of every security has no previous close, so its return (like every
            other NaN in the frame) is filled with 0.
    """
    code_order = {code: position for position, code in enumerate(codes)}
    selected = df[df["SecuritiesCode"].isin(list(code_order))]

    # Group rows by code in the order given by `codes`; the stable sort keeps
    # the original (chronological) order inside each code. Copy so that the
    # new column is never written onto a slice of `df`.
    position = selected["SecuritiesCode"].map(code_order).to_numpy()
    selected = selected.iloc[np.argsort(position, kind="stable")].copy()

    log_close = np.log(selected["Close"].to_numpy(dtype=float))
    code_values = selected["SecuritiesCode"].to_numpy()

    # Difference against the previous row only when it belongs to the same code.
    log_return = np.full(len(selected), np.nan)
    if len(selected) > 1:
        same_code = code_values[1:] == code_values[:-1]
        log_return[1:] = np.where(same_code, (log_close[1:] - log_close[:-1]) * 100, np.nan)

    selected['return'] = log_return
    return selected.fillna(0)
    

In [None]:
def cal_log_return_test(df_today,df_prev,codes):
    '''
    Create the log return (in percent) of today's Close against the previous
    day's Close, for testing (final evaluation).

    Today's rows are matched to the previous close by SecuritiesCode rather
    than by index label: the two frames normally carry different dates, so an
    index-aligned subtraction would produce only NaN.

    Args:
        df_today (pd.DataFrame): today's rows with SecuritiesCode and Close
        df_prev (pd.DataFrame): previous rows with SecuritiesCode and Close; if
            a code occurs more than once, its last row is used
        codes (list): securities codes to keep; output rows follow this order
    Returns:
        (pd.DataFrame): copy of the selected rows of df_today with a 'return'
            column (NaN where no previous close exists)
    '''
    code_order = {code: position for position, code in enumerate(codes)}
    today = df_today[df_today["SecuritiesCode"].isin(list(code_order))]
    position = today["SecuritiesCode"].map(code_order).to_numpy()
    # Copy so the new column is not written onto a slice of df_today.
    today = today.iloc[np.argsort(position, kind="stable")].copy()

    # Previous close looked up by code.
    prev_close = (
        df_prev.drop_duplicates("SecuritiesCode", keep="last")
        .set_index("SecuritiesCode")["Close"]
    )
    prev_for_today = today["SecuritiesCode"].map(prev_close).to_numpy(dtype=float)

    today_close = today["Close"].to_numpy(dtype=float)
    today['return'] = (np.log(today_close) - np.log(prev_for_today)) * 100
    return today

In [None]:
def SecuritiesCode_Mixed(stock_price,stock_info,codes):
    '''
    Attach the NewMarketSegment_num of each security to the price rows.

    Every output row keeps its own index label. The input frame is not modified.

    Args:
        stock_price (pd.DataFrame): price rows with a SecuritiesCode column
        stock_info (pd.DataFrame): one row per security with SecuritiesCode and
            NewMarketSegment_num; if a code is duplicated its first row is used
        codes (list): securities codes to keep; output rows are grouped in this order
    Returns:
        (pd.DataFrame): selected price rows with a NewMarketSegment_num column
            (NaN for codes missing from stock_info)
    '''
    code_order = {code: position for position, code in enumerate(codes)}
    selected = stock_price[stock_price["SecuritiesCode"].isin(list(code_order))]

    # Group rows by code in the order of `codes`; each row carries its own
    # index label along, so dates cannot end up on the wrong rows.
    position = selected["SecuritiesCode"].map(code_order).to_numpy()
    mixed = selected.iloc[np.argsort(position, kind="stable")].copy()

    segment_by_code = (
        stock_info.drop_duplicates("SecuritiesCode")
        .set_index("SecuritiesCode")["NewMarketSegment_num"]
    )
    # Assign positionally: the date index is not unique, so it must not be aligned on.
    mixed["NewMarketSegment_num"] = mixed["SecuritiesCode"].map(segment_by_code).to_numpy()
    return mixed

# Load data and generate features


In [None]:
# Root folder of the competition dataset and the sub-folder with the training CSVs.
base_path = '../input/jpx-tokyo-stock-exchange-prediction/'
train_files_dir = base_path+'train_files/'
# !unzip {base_path}jpx-tokyo-stock-exchange-prediction.zip > /dev/null

In [None]:
# load stock price data (train_files_dir already ends with "/")
df_price = pd.read_csv(f"{train_files_dir}stock_prices.csv")
# Replace the column instead of writing through `.loc[:, "Date"]`: recent pandas
# performs that assignment in place and keeps the object dtype, which breaks
# the string date comparisons used when splitting.
df_price["Date"] = pd.to_datetime(df_price["Date"], format="%Y-%m-%d")
df_price = df_price.set_index("Date")
codes = sorted(df_price["SecuritiesCode"].unique())
# split the raw prices into train / validation / test by date
train,val,test = split_train_test(codes, df_price)

In [None]:
# Row/column counts of the full table followed by the three splits.
for frame in (df_price, train, val, test):
    print(frame.shape)

In [None]:
# Generate AdjustedClose for each split separately.
train, val, test = (adjust_price(split) for split in (train, val, test))
train.head()

In [None]:
# Security master data, restricted to the codes present in the price table.
stock_list = pd.read_csv(f"{base_path}stock_list.csv")
# .copy() so that the column added to stock_list later is not written onto a slice.
stock_list = stock_list[stock_list['SecuritiesCode'].isin(codes)].copy()

In [None]:
# Market segment labels ordered from least to most frequent.
segment_counts = stock_list['NewMarketSegment'].value_counts(ascending = True)
NewMarketSegment = segment_counts.index.to_list()
NewMarketSegment

In [None]:
# Encode each segment label as its position in NewMarketSegment.
encoder = OrdinalEncoder(categories=[NewMarketSegment],dtype = np.int64)
segment_codes = encoder.fit_transform(stock_list[['NewMarketSegment']])
stock_list[['NewMarketSegment_num']] = segment_codes
stock_list.head()

In [None]:
# Attach NewMarketSegment_num to every split.
train, val, test = (
    SecuritiesCode_Mixed(split, stock_list, codes) for split in (train, val, test)
)
# df_price = cal_log_return(df_price,codes)
train

In [None]:
# Per-security indicator features for each split, collected and then stacked.
train_list, val_list, test_list = [], [], []
for code in tqdm(codes):
    train_list.append(get_features_for_predict(train, code))
    val_list.append(get_features_for_predict(val, code))
    test_list.append(get_features_for_predict(test, code))

trains = pd.concat(train_list)
vals = pd.concat(val_list)
tests = pd.concat(test_list)
trains.head()

In [None]:
# Run a garbage collection pass; the explicit deletions below are disabled.
# del train
# del test
# del buff
# del feat
gc.collect()

# LightGBM

Most of the code here is from: https://www.kaggle.com/code/metathesis/feature-engineering-training-with-ta

In [None]:
# cv_params = {#          hyperparameter name : hyperparameter range
#               'max_depth': [10, 20, 30], # 
#               'learning_rate': [0.01, 0.05, 0.1]}

# Keyword arguments for LGBMRegressor.
lgbm_params = {
    'seed': 52,
    'n_jobs': -1,'n_estimators' :300, 'learning_rate': 0.08

}

# Columns fed to the model. 'return' is not listed: the cal_log_return step
# that would create it is disabled, so no feature frame has that column and
# selecting it would raise a KeyError.
feat_cols = [
 'Open', 'Close', 'High', 'Low', 'Volume','ExpectedDividend','SupervisionFlag','NewMarketSegment_num'
#      "trend_macd",
#     "trend_kst",
#     "trend_ema_fast",
#     "volatility_bbm",
#     "volatility_atr",
]

#base_columns = set(df_price.columns)
#feature_columns = set(train_X.columns)

#feat_cols = feature_columns - base_columns
#feat_cols.update(['Open', 'Close', 'High', 'Low', 'Open', 'Volume'])

In [None]:
# initialize model with the keyword arguments from lgbm_params
pred_model = LGBMRegressor(**lgbm_params)
# Grid search over cv_params (disabled)
# gsearch = GridSearchCV(pred_model, param_grid =cv_params, cv =3 )
# gsearch.fit(trains[feat_cols], trains['Target'])

In [None]:
# train on the training split; the validation split is passed as eval_set

pred_model.fit(trains[feat_cols], trains['Target'],eval_set = (vals[feat_cols], vals['Target']))

Here, we can check the feature importance of the LightGBM model.

In [None]:
# Plot the feature importance of the fitted model.
import lightgbm
lightgbm.plot_importance(pred_model)

In [None]:
result = vals.copy()
# predict validation
result.loc[:, "predict"] = pred_model.predict(vals[feat_cols])

# Rank the predictions within each date, best prediction first.
result = result.sort_values(["Date", "predict"], ascending=[True, False])
# group_keys=False keeps a single "Date" index level. Without it, recent pandas
# prepends the group key and produces two levels named "Date", which makes the
# later groupby('Date') in the scoring function ambiguous.
result = result.groupby("Date", group_keys=False).apply(set_rank)

result

Mimic the submission procedure on the test set

In [None]:

# current_date = None
# day_s = 15
# All = []


# for i in  range(len(list_test)):
#     test_splitted = list_test[i]
#     test_splitted = adjust_price(test_splitted)
#     test_splitted = SecuritiesCode_Mixed(test_splitted,stock_list,codes)
#     current_date = test_splitted.index[0].date()
#     print(f"prev: {prev_date} current: {current_date}")
#     if prev_date != None:
#         if current_date <= prev_date:
#             raise ValueError("MyError!!")
       
#         ## sum up the previous stock prices
#         aux_df = pd.concat([aux_df,test_splitted])
#         differences = (current_date-test.index[0].date()).days
#         # if difference > 10, then delete row out of ranges in aux
#         if  differences > day_s:
#             index_threshold = current_date - timedelta(days = day_s)
#             aux_df = aux_df[aux_df.index >= str(index_threshold)]
#             gc.collect()
#         ## get log return
# #         aux_df = cal_log_return(aux_df,codes)
#     else:
#         aux_df = test_splitted.copy()
#         ## first day log return as 0
# #         aux_df['return'] = 0
#     aux_df = features_for_predictions(aux_df, code)
#     aux_df['predict'] = pred_model.predict(aux_df[feat_cols])

#     ## get the current predict with current date
#     prices = aux_df[aux_df.index == str(current_date)].copy()
#     prices = prices.sort_values("predict", ascending=False)
#     prices.loc[:, "Rank"] = np.arange(len(prices["predict"]))
#     All.append(prices)
#     prev_date = current_date
#     ## drop predict column for auxiliary dataframe
#     aux_df = aux_df.drop(columns = ['predict'])
# prediction_result = pd.concat(All)


In [None]:
# Sharpe ratio of the daily spread return on the validation predictions.
calc_spread_return_sharpe(result)

In [None]:
result_tests = tests.copy()
# predict on the held-out test split
result_tests.loc[:, "predict"] = pred_model.predict(tests[feat_cols])

# Rank the test predictions themselves within each date, so the score below is
# for the test split and not a repeat of the validation score.
result_tests = result_tests.sort_values(["Date", "predict"], ascending=[True, False])
# group_keys=False keeps a single "Date" index level for the scoring groupby.
result_tests = result_tests.groupby("Date", group_keys=False).apply(set_rank)

# Sharpe ratio of the daily spread return on the test predictions.
calc_spread_return_sharpe(result_tests)

# Evaluate for submission

In [None]:
# Run a garbage collection pass before the submission loop.
gc.collect()

Here, we define a function that creates the features for the test set.

In [None]:
def features_for_predictions(price, code):
    """
    Build the indicator features for every security in the global `codes` list.

    Args:
        price (pd.DataFrame)  : stock prices including an AdjustedClose column
        code : unused; kept only so existing calls keep working. The function
            always iterates over the module-level `codes`.
    Returns:
        feature DataFrame (pd.DataFrame): per-security feature frames from
            get_features_for_predict, concatenated in the order of `codes`
    """
    # Delegate to get_features_for_predict so training and inference share one
    # feature definition; a separate loop name avoids shadowing `code`.
    buff = [
        get_features_for_predict(price, security_code)
        for security_code in tqdm(codes)
    ]
    feature = pd.concat(buff)
    return feature

In [None]:
import numpy as np
import time
import jpx_tokyo_market_prediction
# Create the competition environment and the iterator that yields the test data.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
# Number of calendar days of price history kept for feature generation.
day_s = 5
prev_date = None
# Raw (un-featured) price history; it must keep AdjustedClose between iterations.
aux_df = None
for prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
    # Replace the column rather than writing through .loc so it really becomes datetime64.
    prices["Date"] = pd.to_datetime(prices["Date"], format="%Y-%m-%d")
    prices = prices.set_index("Date")
    prices = adjust_price(prices)
    current_date = prices.index[0].date()
    prices = SecuritiesCode_Mixed(prices,stock_list,codes)
    print(f"prev: {prev_date} current: {current_date}")
    if prev_date is not None:
        if current_date <= prev_date:
            raise ValueError("MyError!!")
        ## append today's prices to the stored history
        aux_df = pd.concat([aux_df,prices])
        differences = (current_date-aux_df.index.min().date()).days
        # if the history spans more than day_s days, drop the rows that are too old
        if  differences > day_s:
            index_threshold = current_date - timedelta(days = day_s)
            aux_df = aux_df[aux_df.index >= str(index_threshold)]
            gc.collect()
    else:
        aux_df = prices.copy()

    # Build features into a separate frame: the feature frame has AdjustedClose
    # dropped, so storing it back into aux_df would corrupt tomorrow's history.
    # The second argument is ignored by features_for_predictions.
    aux_feats = features_for_predictions(aux_df, None)

    ## keep only today's rows and predict on them
    prices = aux_feats[aux_feats.index == str(current_date)].copy()
    prices['predict'] = pred_model.predict(prices[feat_cols])

    ## rank today's predictions, best first, starting from 0
    prices = prices.sort_values("predict", ascending=False)
    prices.loc[:, "Rank"] = np.arange(len(prices["predict"]))
    sample_prediction = pd.merge(sample_prediction.drop(columns = 'Rank'),prices[['SecuritiesCode','Rank']],on = 'SecuritiesCode')
    prev_date = current_date
    env.predict(sample_prediction)

In [None]:
# Display the submission frame from the last iteration of the loop.
sample_prediction