In [None]:
# Copy the `ta-package` directory from the input dataset into the current directory.
%cp -r ../input/tapackage/ta-package ./


In [None]:
# Install ta 0.10.1 from the copied sources with setup.py,
# then switch back to the Kaggle working directory.
%cd ./ta-package/ta-0.10.1/ta-0.10.1
!python setup.py install
%cd /kaggle/working/

In [None]:
import os
import gc
from decimal import ROUND_HALF_UP, Decimal

import numpy as np
import pandas as pd
import pickle
import lightgbm
from lightgbm import LGBMRegressor
from tqdm import tqdm

import ta
from ta import add_all_ta_features
from ta.utils import dropna

import matplotlib.pyplot as plt
import matplotlib as mpl

import copy
from pathlib import Path
import warnings

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence every UserWarning for the rest of the session (this is global, not scoped).
warnings.filterwarnings(action='ignore', category=UserWarning)

# NOTE(review): recent matplotlib releases renamed the seaborn styles
# (e.g. 'seaborn-v0_8') — confirm the installed version still accepts 'seaborn'.
mpl.style.use('seaborn')

In [None]:
def adjust_price(price):
    """
    Generate a split-adjusted close price for every security.

    The input frame is not modified; all work happens on a copy.

    Args:
        price (pd.DataFrame)  : stock prices with at least the columns
            Date, SecuritiesCode, Close and AdjustmentFactor
    Returns:
        price DataFrame (pd.DataFrame): copy of the input, sorted by
            SecuritiesCode and Date, indexed by Date (datetime), with the
            extra columns CumulativeAdjustmentFactor and AdjustedClose
    """
    # work on a copy so the caller's frame keeps its original "Date" column
    price = price.copy()
    # plain column assignment replaces the column, so the dtype really becomes datetime
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # newest row first, so the cumulative product runs backwards in time
        df = df.sort_values("Date", ascending=False)
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # round half-up to one decimal place (Decimal avoids binary float rounding artefacts)
        df["AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # back to chronological order
        df = df.sort_values("Date")
        # a zero price is treated as missing and replaced by the last known value
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        df["AdjustedClose"] = df["AdjustedClose"].ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    # iterate the groups explicitly: every group keeps its SecuritiesCode column
    # regardless of how the installed pandas treats grouping columns in apply()
    per_code = [
        generate_adjusted_close(group)
        for _, group in price.groupby("SecuritiesCode")
    ]
    # an empty input has no groups; run the helper once so the new columns still exist
    price = pd.concat(per_code) if per_code else generate_adjusted_close(price)
    price = price.reset_index(drop=True)

    return price.set_index("Date")

In [None]:
def get_features_for_predict(price, code):
    """
    Build the technical-analysis feature frame for one security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): rows of `code` with every indicator
            column added by `ta`, NaN/inf replaced by 0 and AdjustedClose removed
    """
    close_col = "AdjustedClose"
    rows_for_code = price["SecuritiesCode"] == code

    # let `ta` append its full indicator set, using AdjustedClose as the close price
    # (single indicators can be added instead, see
    #  https://github.com/bukosabino/ta/blob/master/examples_to_use/bollinger_band_features_example.py)
    with_indicators = ta.add_all_ta_features(
        price.loc[rows_for_code].copy(),
        "Open", "High", "Low", close_col, "Volume",
        fillna=False,
    )

    # neutralise missing and infinite values, then drop the helper close column
    cleaned = with_indicators.fillna(0).replace([np.inf, -np.inf], 0)
    return cleaned.drop(columns=[close_col])

In [None]:
def get_label_new(price, code):
    """ Labelizer: expose the Target of one security under the name "label".
    Args:
        price (pd.DataFrame): dataframe of stock_price.csv
        code (int): Local Code in the universe
    Returns:
        df (pd.DataFrame): columns SecuritiesCode and label for the rows of `code`
    """
    belongs_to_code = price["SecuritiesCode"] == code
    # copy so that adding the label column never touches the caller's frame
    labelled = price.loc[belongs_to_code].copy()
    labelled.loc[:, "label"] = labelled["Target"]

    wanted = ["SecuritiesCode", "label"]
    return labelled.loc[:, wanted]

In [None]:
def set_rank(df):
    """
    Rank rows by descending prediction.

    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): rows ordered by predict (highest first) with a
            Rank column counting up from 0
    """
    ranked = df.sort_values(by="predict", ascending=False)
    # after sorting, the row position is the rank
    ranked.loc[:, "Rank"] = np.arange(ranked.shape[0])
    return ranked

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predicted results (needs Date, Rank and Target columns)
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results of a single day
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): weighted return of the top ranks minus that of the bottom ranks
        """
        ranks = df['Rank']
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        # weights fall linearly from toprank_weight_ratio down to 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

        def _weighted_leg(ascending):
            # first `portfolio_size` targets in the requested rank order
            picked = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (picked * weights).sum() / weights.mean()

        purchase = _weighted_leg(True)
        short = _weighted_leg(False)
        return purchase - short

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

In [None]:
def cal_log_return(df,codes):
    """
    Add a percentage log return of Close versus the previous row of the same security.

    Args:
        df (pd.DataFrame): price rows with SecuritiesCode and Close columns,
            already in chronological order within each security
        codes (iterable): securities to keep; the output is grouped in this order
    Returns:
        (pd.DataFrame): copy of the selected rows (original index kept) with a
            new "return" column; the first row of each security is NaN.
            `df` itself is left untouched.
    """
    # position of every row's code inside `codes` (NaN when the code is not requested)
    code_order = {code: pos for pos, code in enumerate(codes)}
    position = df["SecuritiesCode"].map(code_order).to_numpy(dtype=float)
    keep = ~np.isnan(position)
    # stable sort: rows stay in their original order within each code
    rows = np.flatnonzero(keep)[np.argsort(position[keep], kind="stable")]
    selected = df.iloc[rows].copy()

    # previous close of the same security, taken positionally so that a
    # non-unique index (e.g. dates shared across codes) cannot misalign values
    prev_close = (
        selected.groupby("SecuritiesCode", sort=False)["Close"].shift(1).to_numpy()
    )
    selected["return"] = (np.log(selected["Close"]) - np.log(prev_close)) * 100
    return selected
    

In [None]:
def cal_log_return_test(df_today,df_prev,codes):
    """
    Add a percentage log return of today's Close versus the previous Close of the same security.

    Closes are matched by SecuritiesCode, not by index label, so the two frames
    may carry different indexes (for example different dates).

    Args:
        df_today (pd.DataFrame): today's rows with SecuritiesCode and Close
        df_prev (pd.DataFrame): earlier rows with SecuritiesCode and Close
        codes (iterable): securities to keep; the output is grouped in this order
    Returns:
        (pd.DataFrame): copy of the selected rows of `df_today` with a new
            "return" column (NaN when no previous close exists for the code).
            Neither input is modified.
    """
    # one previous close per code; if a code appears several times the last row wins
    prev_close = (
        df_prev.drop_duplicates("SecuritiesCode", keep="last")
        .set_index("SecuritiesCode")["Close"]
    )

    # keep only requested codes, grouped in the order given by `codes`
    code_order = {code: pos for pos, code in enumerate(codes)}
    position = df_today["SecuritiesCode"].map(code_order).to_numpy(dtype=float)
    keep = ~np.isnan(position)
    rows = np.flatnonzero(keep)[np.argsort(position[keep], kind="stable")]
    today = df_today.iloc[rows].copy()

    # look the previous close up by code and subtract positionally
    matched_prev = today["SecuritiesCode"].map(prev_close).to_numpy(dtype=float)
    today["return"] = (np.log(today["Close"]) - np.log(matched_prev)) * 100
    return today
    

In [None]:
def SecuritiesCode_Mixed(stock_price,stock_info,codes):
    """
    Attach the encoded market segment of each security to its price rows.

    Args:
        stock_price (pd.DataFrame): price rows with a SecuritiesCode column
        stock_info (pd.DataFrame): one row per security with SecuritiesCode
            and NewMarketSegment_num
        codes (iterable): securities to keep; the output is grouped in this order
    Returns:
        (pd.DataFrame): copy of the selected price rows plus a
            NewMarketSegment_num column (NaN for codes missing from
            `stock_info`). Every row keeps its own index label from
            `stock_price`, whatever the input ordering was.
    """
    # lookup table code -> segment number; first entry wins if a code is listed twice
    segment_by_code = (
        stock_info.drop_duplicates("SecuritiesCode")
        .set_index("SecuritiesCode")["NewMarketSegment_num"]
    )

    # keep only requested codes, grouped in the order given by `codes`
    code_order = {code: pos for pos, code in enumerate(codes)}
    position = stock_price["SecuritiesCode"].map(code_order).to_numpy(dtype=float)
    keep = ~np.isnan(position)
    rows = np.flatnonzero(keep)[np.argsort(position[keep], kind="stable")]

    mixed = stock_price.iloc[rows].copy()
    mixed["NewMarketSegment_num"] = mixed["SecuritiesCode"].map(segment_by_code)
    return mixed


In [None]:
# split data into TRAIN and TEST
TRAIN_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
# NOTE(review): TEST_START is assigned again further down in this notebook
# ("2021-10-02"); the function reads the value current at call time.
TEST_START = "2020-01-06"

def get_features_and_label(price, codes, features):
    """
    Build aligned train/test features and labels for the given securities.

    Args:
        price (pd.DataFrame): loaded price data (needs SecuritiesCode and Target)
        codes  (array) : target codes
        features (pd.DataFrame): features, with a SecuritiesCode column
    Returns:
        train_X (pd.DataFrame): training data (rows up to TRAIN_END)
        train_y (pd.Series): label for train_X
        test_X (pd.DataFrame): test data (rows from TEST_START on)
        test_y (pd.Series): label for test_X

    The TRAIN_END / TEST_START string slices assume both frames are indexed
    by date — TODO confirm against the caller.
    """
    # to store splited data
    trains_X, tests_X = [], []
    trains_y, tests_y = [], []

    # generate feature one by one
    for code in tqdm(codes):

        feats = features[features["SecuritiesCode"] == code].dropna()
        # get_label_new is the labelizer defined in this notebook
        labels = get_label_new(price, code).dropna()

        # nothing usable for this code
        if feats.shape[0] == 0 or labels.shape[0] == 0:
            continue

        # keep only the rows present on both sides
        labels = labels.loc[labels.index.isin(feats.index)]
        feats = feats.loc[feats.index.isin(labels.index)]

        assert (labels.loc[:, "SecuritiesCode"] == feats.loc[:, "SecuritiesCode"]).all()
        labels = labels.loc[:, "label"]

        # split data into TRAIN and TEST
        _train_X = feats[: TRAIN_END]
        _test_X = feats[TEST_START:]

        _train_y = labels[: TRAIN_END]
        _test_y = labels[TEST_START:]

        assert len(_train_X) == len(_train_y)
        assert len(_test_X) == len(_test_y)

        # store features
        trains_X.append(_train_X)
        tests_X.append(_test_X)
        # store labels
        trains_y.append(_train_y)
        tests_y.append(_test_y)

    # combine features for each codes
    train_X = pd.concat(trains_X)
    test_X = pd.concat(tests_X)
    # combine label for each codes
    train_y = pd.concat(trains_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, test_X, test_y

# Read files and EDA with stocks info

For details on the stock market segments of the Japan Exchange Group (JPX), see: https://www.jpx.co.jp/english/equities/market-restructure/market-segments/index.html

In [None]:
# root of the competition dataset and its training-files sub-directory
base_path = '../input/jpx-tokyo-stock-exchange-prediction/'
train_files_dir = f"{base_path}train_files/"
# !unzip {base_path}jpx-tokyo-stock-exchange-prediction.zip > /dev/null

In [None]:
# load stock price data
# (os.path.join copes with train_files_dir already ending in "/", so no "//" in the path)
df_price = pd.read_csv(os.path.join(train_files_dir, "stock_prices.csv"))

# generate AdjustedClose
df_price = adjust_price(df_price)
# sorted list of every SecuritiesCode present in the price data
codes = sorted(df_price["SecuritiesCode"].unique())

In [None]:
# read the stock master list from the same dataset root as the price files
stock_list = pd.read_csv(f"{base_path}stock_list.csv")
stock_list

In [None]:
# keep only the securities that appear in the price data;
# .copy() makes this an independent frame that can safely receive new columns
stock_list_1st = stock_list[stock_list['SecuritiesCode'].isin(codes)].copy()
stock_list_1st

In [None]:
import seaborn as sns

stock_list_1st['NewIndexSeriesSizeCode'].value_counts().plot(kind='barh')


In [None]:
stock_list_1st['Universe0'].value_counts().plot(kind='barh')

In [None]:
stock_list_1st['NewMarketSegment'].value_counts().plot(kind='barh')
## Apply this category in the LGBM

In [None]:
stock_list_1st['Section/Products'].value_counts().plot(kind='barh')

In [None]:
g = sns.displot(stock_list_1st, x="MarketCapitalization",hue="NewMarketSegment",kind="kde")
g.fig.set_figwidth(15)
g.fig.set_figheight(8)

In [None]:
stock_list_1st.head()

In [None]:
NewMarketSegment = stock_list_1st['NewMarketSegment'].value_counts(ascending = True).index.to_list()
NewMarketSegment

In [None]:
stock_list_1st[["SecuritiesCode",'NewMarketSegment']]

In [None]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(categories=[NewMarketSegment],dtype = np.int64)
stock_list_1st.loc[:,['NewMarketSegment_num']] = encoder.fit_transform(stock_list_1st.loc[:,['NewMarketSegment']])


In [None]:
stock_list_1st


# Data Preprocessing

<!-- 
some of this is taken from this notebook 
https://www.kaggle.com/code/metathesis/feature-engineering-training-with-ta -->

In [None]:
df_price_new = SecuritiesCode_Mixed(df_price,stock_list_1st,codes)
df_price_new

In [None]:
# generate log returns
df_price_new = cal_log_return(df_price_new,codes)
df_price_new

In [None]:
# deal with bool a na
df_price_new['SupervisionFlag'] = df_price_new['SupervisionFlag'].astype(int)
df_price_new = df_price_new.fillna(0)
df_price_new.head()

In [None]:
# date boundaries for a TRAIN / VAL / TEST split
TRAIN_END = "2019-12-31"
VAL_START = "2020-01-06"
TEST_START = "2021-10-02"

# separate the label column from the feature columns
feat = df_price_new.drop(columns='Target')
target = df_price_new["Target"]

# The date-based hold-out split is currently disabled: the model is trained on
# every available row. To re-enable it, slice with the boundaries above, e.g.
#   feat[: TRAIN_END], feat[VAL_START:], feat[TEST_START:]  (and likewise for target)
_train_X = feat[:]
_train_y = target[:]

# Prepare lgbm model

In [None]:
# keyword arguments forwarded to LGBMRegressor
lgbm_params = dict(
    seed=42,
    n_jobs=-1,
    n_estimators=100,
)
# columns fed to the model, in this order
feat_cols = [
    'Close', 'High', 'Open', 'Low', 'Volume',
    'ExpectedDividend', 'SupervisionFlag', 'SecuritiesCode',
    'return', 'NewMarketSegment_num',
]


In [None]:
# build the regressor from the shared parameter dict
pred_model = LGBMRegressor(**lgbm_params)
# fit on the selected feature columns against the Target labels
pred_model.fit(X=_train_X.loc[:, feat_cols], y=_train_y)


In [None]:
# # predict validation
# result_val = _val_X.copy()
# result_val.loc[:, "predict"] = pred_model.predict(_val_X[feat_cols])
# # validation actual result
# result_val.loc[:, "Target"] = _val_y.values

# result_val = result_val.sort_values(["Date", "predict"], ascending=[True, False])
# result_val = result_val.groupby("Date").apply(set_rank)

# calc_spread_return_sharpe(result_val)

In [None]:
# # prepare test data
# result_test = _test_X.copy()
# # predict validation
# result_test.loc[:, "predict"] = pred_model.predict(_test_X[feat_cols])
# # validation actual result
# result_test.loc[:, "Target"] = _test_y.values

# result_test = result_test.sort_values(["Date", "predict"], ascending=[True, False])
# result_test = result_test.groupby("Date").apply(set_rank)

# calc_spread_return_sharpe(result_test)

In [None]:
# Feature Importance
# Plot the importance LightGBM assigns to each feature of the fitted model.
lightgbm.plot_importance(pred_model)

In [None]:
# Submit the Results
import numpy as np
import time
import jpx_tokyo_market_prediction
# Create the competition environment and the iterator that serves the test data.
# NOTE(review): presumably make_env() can only be called once per kernel session,
# so re-running this cell would need a kernel restart — confirm against the API docs.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
prev_date = None
for prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
    current_date = prices["Date"].iloc[0]
    print(f"prev: {prev_date} current: {current_date}")
    if prev_date != None:
        if current_date <= prev_date:
            raise ValueError("MyError!!")
    # adjust close price
    prices = adjust_price(prices)
    # deal with bool and fill na
    prices['SupervisionFlag'] = prices['SupervisionFlag'].astype(int)
    prices = prices.fillna(0)
    # calculate return rate
    if prev_date == None:
        prices['return'] = 0
        prev_close =prices[['SecuritiesCode','Close']]
    else:
        prices = cal_log_return_test(prices,prev_close,codes)
    # Mix the securiies code info in the stock info
    prices = SecuritiesCode_Mixed(prices,stock_list_1st,codes)
    # predict the test
    prices['predict'] = pred_model.predict(prices[feat_cols])
    prices = prices.sort_values("predict", ascending=False)
    prices.loc[:, "Rank"] = np.arange(len(prices["predict"]))
    # merge prices and sample prediction
    sample_prediction = pd.merge(sample_prediction.drop(columns = 'Rank'),prices[['SecuritiesCode','Rank']],on = 'SecuritiesCode')
    
    prev_date = current_date
    env.predict(sample_prediction)


In [None]:
sample_prediction