# Topological Data Analysis (TDA) for feature extraction with Catboost for training and submission
Topological Data Analysis (TDA) uses techniques from topology to analyse datasets. This notebook demonstrates how to perform TDA feature extraction with <cite>[giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration, Tauzin et al, arXiv:2004.02551, 2020.](https://arxiv.org/abs/2004.02551)</cite>.

- Several non-TDA features were selected heuristically and additional features were then extracted using TDA.
- A Catboost model is used for training and submission.
- The training code is based on [smeitoma](https://www.kaggle.com/smeitoma)'s [https://www.kaggle.com/code/smeitoma/train-demo](https://www.kaggle.com/code/smeitoma/train-demo) and the submission code is based on [https://www.kaggle.com/code/smeitoma/submission-demo](https://www.kaggle.com/code/smeitoma/submission-demo).
- All files including required python packages are provided in the [https://www.kaggle.com/datasets/aemulcahy/jpx-dataset-001](https://www.kaggle.com/datasets/aemulcahy/jpx-dataset-001) dataset.


In [None]:
from decimal import ROUND_HALF_UP, Decimal
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import warnings

from catboost import CatBoostRegressor
from tqdm import tqdm

if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost') != 'Localhost':
    !pip install ../input/jpx-dataset-001/pyflagser-0.4.4-cp37-cp37m-manylinux2010_x86_64.whl
    !pip install ../input/jpx-dataset-001/giotto_tda-0.5.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
else:
    giotto_tda_spec = importlib.util.find_spec('gtda')
    if giotto_tda_spec is None:
        !pip install giotto-tda

from gtda.diagrams import Amplitude
from gtda.homology import VietorisRipsPersistence
from gtda.pipeline import Pipeline
from gtda.time_series import SlidingWindow, TakensEmbedding


In [None]:
class CFG:
    """Notebook-wide settings: run switches, window hyper-parameters and artefact paths."""

    # Run switches.
    TRAIN = True
    SUBMIT = True

    # Sliding-window / Takens-embedding parameters passed to the TDA pipeline.
    WINDOW_SIZE = 5
    STRIDE = 1
    TIME_DELAY = 1
    DIMENSION = 3

    # Length of the rolling window used for the close-price z-score.
    ZWINDOW_SIZE = 200

    # True when KAGGLE_KERNEL_RUN_TYPE is unset or explicitly 'Localhost'.
    LOCALHOST = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') in (None, 'Localhost')

    # Cached feature table and trained model: working directory when running
    # locally, the attached jpx-dataset-001 dataset otherwise.
    if LOCALHOST:
        TDA_DATA_PATH = 'Xy_v2l.bz2'
        MODEL_PATH = 'model_all.cbm'
    else:
        TDA_DATA_PATH = '../input/jpx-dataset-001/Xy_v2l.bz2'
        MODEL_PATH = '../input/jpx-dataset-001/model_all.cbm'


In [None]:
# Root folder of the competition data (relative to the notebook's working
# directory) and the sub-folder that holds the training CSV files.
base_dir = "../input/jpx-tokyo-stock-exchange-prediction"
train_files_dir = f"{base_dir}/train_files"

In [None]:

def adjust_price_v2(price):
    """Add split-adjusted Close, Open and Volume columns to a price table.

    Args:
        price (pd.DataFrame): must contain the columns ``Date`` (``%Y-%m-%d``
            strings or datetimes), ``SecuritiesCode``, ``Close``, ``Open``,
            ``Volume`` and ``AdjustmentFactor``. The frame is not modified.
    Returns:
        pd.DataFrame: a copy of ``price`` ordered by ``SecuritiesCode`` then
        ``Date`` and indexed by ``Date``, with the extra columns
        ``CumulativeAdjustmentFactor``, ``AdjustedClose``, ``AdjustedOpen``
        and ``AdjustedVolume``.
    """
    def round_half_up(value):
        # One decimal place with ties rounded up; going through Decimal avoids
        # the round-half-even behaviour of the built-in round().
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    # Work on a copy so the caller's frame (and its Date dtype) is left untouched.
    price = price.copy()
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")
    # A unique positional index lets the per-security results below be
    # assigned back by index alignment.
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # Cumulative product of AdjustmentFactor per security, accumulated from the
    # latest date backwards (hence the reversed frame).
    price["CumulativeAdjustmentFactor"] = (
        price.iloc[::-1].groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )
    factor = price["CumulativeAdjustmentFactor"]

    # Prices are multiplied by the factor, volume is divided by it.
    adjusted_sources = (
        ("AdjustedClose", factor * price["Close"]),
        ("AdjustedOpen", factor * price["Open"]),
        ("AdjustedVolume", price["Volume"] / factor),
    )
    for adjusted_str, raw_adjusted in adjusted_sources:
        rounded = raw_adjusted.map(round_half_up)
        # A value of 0 is treated as missing and replaced by the security's
        # previous valid value (leading gaps stay NaN).
        price[adjusted_str] = (
            rounded.mask(rounded == 0).groupby(price["SecuritiesCode"]).ffill()
        )

    return price.set_index("Date")


In [None]:
# load stock price data
# Read the raw daily prices and derive the adjusted columns in one step.
df_price = adjust_price_v2(pd.read_csv(f"{train_files_dir}/stock_prices.csv"))

# Distinct security codes in ascending order; features are built per code.
codes = sorted(df_price["SecuritiesCode"].unique())
display('len(codes)', len(codes))
display(df_price)


In [None]:
def set_rank(df):
    """Order rows by descending prediction and number them from 0.

    Args:
        df (pd.DataFrame): frame with a ``predict`` column
    Returns:
        pd.DataFrame: sorted copy of ``df`` whose ``Rank`` column runs
        0, 1, 2, ... from the highest ``predict`` downwards
    """
    ranked = df.sort_values(by="predict", ascending=False)
    n_rows = len(ranked["predict"])
    ranked.loc[:, "Rank"] = np.arange(n_rows)
    return ranked


In [None]:
def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): spread return
    """
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio (mean / standard deviation) of the daily spread returns.

    Args:
        df (pd.DataFrame): predicted results, groupable by ``Date`` and holding ``Rank`` and ``Target``
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    daily_spread = df.groupby('Date').apply(
        _calc_spread_return_per_day, portfolio_size, toprank_weight_ratio
    )
    return daily_spread.mean() / daily_spread.std()


In [None]:
%%time

# Per-security feature frames are collected here before being concatenated.
Xy_buff = []
# TDA pipeline, applied to one series at a time. Stages run in order:
# sliding windows -> Takens embedding of each window -> Vietoris-Rips
# persistence (homology dimensions 0 and 1) -> amplitude of the diagrams.
pipe = Pipeline([
    ('SW', SlidingWindow(size=CFG.WINDOW_SIZE, stride=CFG.STRIDE)),
    ('TE', TakensEmbedding(time_delay=CFG.TIME_DELAY, dimension=CFG.DIMENSION)),
    ('VR', VietorisRipsPersistence(collapse_edges=True, n_jobs=1, homology_dimensions=[0, 1])),
    ('Ampl', Amplitude()),
])

def get_features_for_predict_v2e(price, code):
    """Build the hand-crafted and TDA features for a single security.

    Args:
        price (pd.DataFrame): adjusted prices of all securities; must contain
            ``SecuritiesCode``, ``AdjustedVolume`` and ``AdjustedClose``
        code: ``SecuritiesCode`` value whose rows are used
    Returns:
        pd.DataFrame: the trailing rows of that security (as many rows as the
        module-level ``pipe`` returns) with the added columns ``f_1``,
        ``f_3``, ``zscore`` and six ``tda_*`` columns; NaN and +/-inf are
        replaced by 0 in every column.
    """
    def zero_fill(frame):
        # NaN first, then +/-inf: both end up as 0.
        return frame.fillna(0).replace([np.inf, -np.inf], 0)

    _Xy = price.loc[price["SecuritiesCode"] == code].copy()

    # Day-over-day relative change of the adjusted volume, raw and log1p-transformed.
    volume_change = _Xy['AdjustedVolume'].pct_change()
    _Xy['f_1'] = np.log1p(volume_change)
    _Xy['f_3'] = volume_change

    # Z-score of the adjusted close against its own rolling window.
    close = _Xy['AdjustedClose']
    rolling_close = close.rolling(window=CFG.ZWINDOW_SIZE)
    _Xy['zscore'] = (close - rolling_close.mean()) / rolling_close.std()

    _Xy = zero_fill(_Xy)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        amp_volume = pipe.fit_transform(_Xy['AdjustedVolume'])
        amp_zscore = pipe.fit_transform(_Xy['zscore'])
        amp_f_1 = pipe.fit_transform(_Xy['f_1'])

        # The pipeline returns fewer rows than it receives; keep the trailing
        # rows so each pipeline output row lines up with a date.
        _Xy = _Xy.tail(amp_volume.shape[0])

        # (target column, pipeline output, output column index)
        tda_columns = (
            ('tda_0adjvol', amp_volume, 0),
            ('tda_1adjvol', amp_volume, 1),
            ('tda_0_f_1', amp_f_1, 0),
            ('tda_1_f_1', amp_f_1, 1),
            ('tda_0_zscore', amp_zscore, 0),
            ('tda_1_zscore', amp_zscore, 1),
        )
        for column, amplitudes, col_idx in tda_columns:
            _Xy[column] = amplitudes[:, col_idx].tolist()

    # filling data for nan and inf
    return zero_fill(_Xy)

# Reuse the cached feature table when it is available; otherwise build it
# security by security (and cache it when running locally).
if os.path.isfile(CFG.TDA_DATA_PATH):
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    Xy = pd.read_pickle(CFG.TDA_DATA_PATH)
else:
    for code in tqdm(codes):
        _Xy = get_features_for_predict_v2e(df_price, code)
        Xy_buff.append(_Xy)
    Xy = pd.concat(Xy_buff)
    if CFG.LOCALHOST:
        Xy.to_pickle(CFG.TDA_DATA_PATH)

display(Xy)


In [None]:
# Gradient-boosted regressor used for both the hold-out evaluation and the final fit.
pred_model = CatBoostRegressor(random_seed=42,
                               task_type='GPU',
                               loss_function='RMSE',
                               eval_metric='RMSE',
                               n_estimators=900,
                               verbose=False)

# Model inputs: adjusted prices/volume, hand-crafted features and TDA amplitudes.
feat_cols_v2 = [
    'AdjustedClose',
    'AdjustedOpen',
    'AdjustedVolume',
    'f_1',
    'f_3',
    'zscore',
    'tda_0adjvol',
    'tda_1adjvol',
    'tda_0_f_1',
    'tda_1_f_1',
    'tda_0_zscore',
    'tda_1_zscore',
]

# split data into TRAIN and TEST
TRAIN_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
TEST_START = "2020-01-06"

# When Xy is built in this notebook it is a concatenation of per-security
# frames, so its Date index is not monotonic. Label slicing such as
# Xy[:TRAIN_END] relies on a legacy pandas fallback that newer versions reject
# when the bound is not present in the index; explicit boolean masks select
# the same rows on every version.
row_dates = pd.to_datetime(Xy.index).normalize()
is_train = row_dates <= pd.Timestamp(TRAIN_END)
is_test = row_dates >= pd.Timestamp(TEST_START)
model_cols = ['SecuritiesCode'] + feat_cols_v2

train_X = Xy.loc[is_train, model_cols]
test_X = Xy.loc[is_test, model_cols]
train_full_X = Xy[model_cols]
train_y = Xy.loc[is_train, 'Target']
test_y = Xy.loc[is_test, 'Target']
train_full_y = Xy['Target']


In [None]:
%%time

# Fit on the training period only.
# NOTE(review): no eval_set is passed, so early_stopping_rounds presumably has
# nothing to monitor here - confirm against the CatBoost documentation.
pred_model.fit(train_X[feat_cols_v2], train_y, early_stopping_rounds=10)

# Score the hold-out period and attach the realised targets.
result = test_X[["SecuritiesCode"]].copy()
result.loc[:, "predict"] = pred_model.predict(test_X[feat_cols_v2])
result.loc[:, "Target"] = test_y.values

result = result.sort_values(["Date", "predict"], ascending=[True, False])
# group_keys=False keeps a single Date index level. Without it, pandas >= 2.0
# prepends the group key and produces two levels named "Date", which breaks
# the groupby('Date') calls below.
result = result.groupby("Date", group_keys=False).apply(set_rank)

# Sharpe ratio of the daily spread return, then the daily series itself for plotting.
display(calc_spread_return_sharpe(result, portfolio_size=200))
df_result = result.groupby('Date').apply(_calc_spread_return_per_day, 200, 2)
# value recorded in the original notebook: 0.15730880249765788


In [None]:
# Daily spread return over the hold-out period.
display(df_result.plot(figsize=(20, 8)))


In [None]:
# Running total of the daily spread returns.
display(df_result.cumsum().plot(figsize=(20, 8)))


In [None]:
# Horizontal bar chart of the model's feature importances, smallest at the bottom.
feature_importance = pred_model.feature_importances_
sorted_idx = feature_importance.argsort()
bar_positions = range(len(sorted_idx))

fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
plt.yticks(bar_positions, np.array(feat_cols_v2)[sorted_idx])
plt.title('Feature Importance')

In [None]:
from catboost import Pool, cv

# Time-series cross-validation needs the rows in chronological order, but Xy
# is ordered security by security. Sort by Date first; the stable sort keeps
# the existing order of securities within each day.
Xy_by_date = Xy.sort_index(kind="mergesort")

cv_dataset = Pool(data=Xy_by_date[feat_cols_v2],
                  label=Xy_by_date['Target'])

params = {'random_seed': 42,
          # 'task_type': 'GPU',
          'loss_function': 'RMSE',
          'eval_metric': 'RMSE',
          'n_estimators': 900,
          'verbose': False}

# shuffle=False: NOTE(review) - cv() shuffles by default; it is disabled here so
# the chronological order is kept. Confirm how CatBoost treats `shuffle` for
# type='TimeSeries'.
# plot takes a bool; the previous string "True" only worked because it is truthy.
scores = cv(cv_dataset,
            params,
            fold_count=5,
            type='TimeSeries',
            shuffle=False,
            plot=True)

In [None]:
%%time

# Refit on every available row (training and hold-out period together).
pred_model.fit(Xy[feat_cols_v2], Xy['Target'], early_stopping_rounds=10)
if CFG.LOCALHOST:
    # Save to the same path the model is later loaded from, so the two cannot
    # drift apart (CFG.MODEL_PATH is 'model_all.cbm' when running locally).
    pred_model.save_model(CFG.MODEL_PATH, format="cbm")



In [None]:
# Replace the in-memory model with the one stored at CFG.MODEL_PATH
# (the attached dataset on Kaggle, the working directory locally).
pred_model = CatBoostRegressor()
pred_model.load_model(CFG.MODEL_PATH, format="cbm",)


In [None]:
# Set up the competition's time-series API; skipped unless CFG.SUBMIT is set.
if CFG.SUBMIT:
    # load Time Series API
    import jpx_tokyo_market_prediction
    # make Time Series API environment (this function can be called only once in a session)
    env = jpx_tokyo_market_prediction.make_env()
    # get iterator to fetch data day by day
    iter_test = env.iter_test()


In [None]:
if CFG.SUBMIT:
    # Seed the rolling price history with the training prices; only the columns
    # needed for the adjusted prices and features are kept.
    df_price_raw = pd.read_csv(f"{train_files_dir}/stock_prices.csv") #TODO
    price_cols = [
        "Date",
        "SecuritiesCode",
        "Close",
        "Open",
        "Volume",
        "AdjustmentFactor",
    ]
    df_price_raw = df_price_raw[price_cols]
    # Dates are still "%Y-%m-%d" strings here, so a string sort is chronological.
    # The per-security tail() below relies on this row order.
    df_price_raw = df_price_raw.sort_values(["Date", "SecuritiesCode"], kind="mergesort")

    # Trading rows of history to keep per security. The rolling z-score needs
    # ZWINDOW_SIZE rows and the sliding TDA window needs it to be valid for the
    # last WINDOW_SIZE rows. The previous calendar cut-off (ZWINDOW_SIZE
    # business days) always yielded fewer rows than that, so `zscore` and its
    # TDA features were 0 for every prediction, unlike in training.
    history_rows = CFG.ZWINDOW_SIZE + CFG.WINDOW_SIZE

    counter = 0
    # fetch data day by day
    for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
        current_date = prices["Date"].iloc[0]
        sample_prediction_date = sample_prediction["Date"].iloc[0]
        print(f"current_date: {current_date}, sample_prediction_date: {sample_prediction_date}")

        if counter == 0:
            # to avoid data leakage
            df_price_raw = df_price_raw.loc[df_price_raw["Date"] < current_date]

        # trim the history to reduce calculation cost
        df_price_raw = df_price_raw.groupby("SecuritiesCode").tail(history_rows)

        # to generate AdjustedClose, increment price data
        df_price_raw = pd.concat([df_price_raw, prices[price_cols]])
        # generate AdjustedClose; a copy is passed so the raw history keeps its
        # string dates whether or not adjust_price_v2 modifies its argument
        df_price = adjust_price_v2(df_price_raw.copy())

        # get target SecuritiesCodes
        codes = sorted(prices["SecuritiesCode"].unique())

        feature = pd.concat([get_features_for_predict_v2e(df_price, code) for code in codes])
        # only the rows of the day being predicted; copy() so the column
        # assignments below act on an independent frame
        feature = feature.loc[feature.index == current_date].copy()

        feature.loc[:, "predict"] = pred_model.predict(feature[feat_cols_v2])

        # set rank by predict
        feature = feature.sort_values("predict", ascending=False).drop_duplicates(subset=['SecuritiesCode'])
        feature.loc[:, "Rank"] = np.arange(len(feature))
        feature_map = feature.set_index('SecuritiesCode')['Rank'].to_dict()
        sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(feature_map)

        # check Rank: every security ranked, ranks cover 0..n-1
        assert sample_prediction["Rank"].notna().all()
        assert sample_prediction["Rank"].min() == 0
        assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

        # register your predictions
        env.predict(sample_prediction)
        counter += 1

In [None]:
# Show the first lines of the generated submission file.
if CFG.SUBMIT:
    ! head submission.csv

In [None]:
# Show the last lines of the generated submission file.
if CFG.SUBMIT:
    ! tail submission.csv