In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter
import jpx_tokyo_market_prediction
from lightgbm import LGBMRegressor
from lightgbm import log_evaluation

import seaborn as sns
pd.set_option('display.max_rows', 999)
pd.options.mode.chained_assignment = None  # default='warn'
from decimal import ROUND_HALF_UP, Decimal

from sklearn.metrics import mean_squared_error

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load every CSV under example_test_files/ into its own DataFrame
# (names carry the _ex_test suffix).
sub_ex_test = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv")
options_ex_test = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/options.csv")
fin_ex_test = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/financials.csv")
sec_sprice_ex_test = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/secondary_stock_prices.csv")
trades_ex_test = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/trades.csv")
sprice_ex_test = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv")

In [None]:
# Load the CSVs under train_files/ into DataFrames (names carry the _train suffix).
# Of these, only sprice_train (stock_prices.csv) is referenced by the cells visible in this notebook.
options_train = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/options.csv")
fin_train = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/financials.csv")
sec_sprice_train = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/secondary_stock_prices.csv")
trades_train = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/trades.csv")
sprice_train = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")

In [None]:
# Load the CSVs under supplemental_files/ into DataFrames (names carry the _supp suffix).
# sprice_supp appears only in a commented-out concat further down; the other
# frames are not referenced by the cells visible in this notebook.
options_supp = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/options.csv")
fin_supp = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/financials.csv")
sec_sprice_supp = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/secondary_stock_prices.csv")
trades_supp = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/trades.csv")
sprice_supp = pd.read_csv(r"/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")

# Prepare data

In [None]:
#official adjust close price calc, https://www.kaggle.com/code/smeitoma/train-demo/notebook

def adjust_price(price):
    """
    Add a split-adjusted close price (AdjustedClose) for every security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; needs the
            columns Date, SecuritiesCode, Close and AdjustmentFactor.
            The caller's frame is left untouched.
    Returns:
        price DataFrame (pd.DataFrame): copy of the input ordered by
            SecuritiesCode then Date, indexed by Date (datetime64), with the
            columns CumulativeAdjustmentFactor and AdjustedClose added
    """
    # Work on a copy so the caller keeps its original Date values, and assign
    # the column directly: `.loc[:, "Date"] = ...` is an in-place write that
    # can leave the column as object dtype instead of datetime64.
    price = price.copy()
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # newest row first, so the cumulative product runs backwards in time
        df = df.sort_values("Date", ascending=False)
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # round half-up to one decimal place via Decimal
        df["AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # back to chronological order
        df = df.sort_values("Date")
        # a zero AdjustedClose is treated as missing and forward filled
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        df["AdjustedClose"] = df["AdjustedClose"].ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])

    if price.empty:
        # nothing to adjust: still return the documented columns
        price = price.assign(CumulativeAdjustmentFactor=np.nan, AdjustedClose=np.nan)
    else:
        # Iterate over the groups rather than using groupby.apply: every
        # sub-frame then keeps its SecuritiesCode column and the result has
        # the same layout regardless of the installed pandas version.
        adjusted = [
            generate_adjusted_close(group)
            for _, group in price.groupby("SecuritiesCode")
        ]
        price = pd.concat(adjusted).reset_index(drop=True)

    return price.set_index("Date")

In [None]:
# Only the train-period prices are used here; the commented-out line below
# would stack the supplemental prices underneath them.
#prices = pd.concat([sprice_train, sprice_supp], axis = 0)

# df_price_raw is a second name for sprice_train (no copy is made); it is
# reused later as the price history for the submission loop.
df_price_raw = sprice_train
# Date-indexed prices with AdjustedClose added.
prices_train = adjust_price(df_price_raw)

In [None]:
prices_train

In [None]:
#https://www.kaggle.com/code/smeitoma/train-demo/notebook

def get_features_for_predict(price, code, periods=(10, 21, 63)):
    """
    Build return and volatility features for a single security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; needs the
            columns SecuritiesCode and AdjustedClose
        code (int)  : A local code for a listed company
        periods (iterable of int)  : look-back windows, counted in rows;
            the default reproduces the original 10 / 21 / 63 features
    Returns:
        feature DataFrame (pd.DataFrame): the rows of `code` (index kept)
            with SecuritiesCode plus return_<p> and volatility_<p> for each
            period; NaN and +/-inf are replaced by 0
    """
    close_col = "AdjustedClose"
    feats = price.loc[price["SecuritiesCode"] == code, ["SecuritiesCode", close_col]].copy()

    close = feats[close_col]
    # The row-to-row log return does not depend on the window length,
    # so compute it once instead of once per period.
    log_return = np.log(close).diff()

    for period in periods:
        feats[f"return_{period}"] = close.pct_change(period)
        feats[f"volatility_{period}"] = log_return.rolling(period).std()

    # filling data for nan and inf
    feats = feats.fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)
    # the price itself is not a feature
    return feats.drop(columns=[close_col])

In [None]:
# fetch prediction target SecuritiesCodes:
# the distinct SecuritiesCode values in the training prices, in ascending order
codes = sorted(prices_train["SecuritiesCode"].unique())
# cell output: number of distinct codes
len(codes)

In [None]:
# generate feature
from tqdm import tqdm

# One feature frame per security (with a progress bar), stacked into a single frame.
buff = [get_features_for_predict(prices_train, code) for code in tqdm(codes)]
feature = pd.concat(buff)

In [None]:
feature

In [None]:
# Attach each row's Target to its features by matching on (Date, SecuritiesCode).
# Both frames are indexed by Date, so the index is turned into a regular
# column for the join and restored afterwards.
target_col = prices_train[["SecuritiesCode", "Target"]]
merged_train_data = (
    feature.reset_index()
    .merge(target_col.reset_index(), on=["Date", "SecuritiesCode"])
    .set_index("Date")
)

merged_train_data

# Train and choose model

### Use Time Series CV to choose the model

In [None]:
def set_rank(df):
    """
    Rank rows by predicted target, best prediction first.

    Args:
        df (pd.DataFrame): must contain a target_pred column
    Returns:
        df (pd.DataFrame): new frame sorted by target_pred (descending) with
            a Rank column counting up from 0 in that order
    """
    ranked = df.sort_values("target_pred", ascending=False)
    ranked.loc[:, "Rank"] = np.arange(ranked.shape[0])
    return ranked


# Evaluation function: computes the Sharpe ratio of the daily spread returns

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio (mean / standard deviation) of the per-Date spread return.

    Args:
        df (pd.DataFrame): predicted results; grouped by 'Date' and read
            through its 'Rank' and 'Target' columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _daily_spread(day, size, top_ratio):
        """
        Weighted Target sum of the `size` best-ranked rows minus that of the
        `size` worst-ranked rows, for one day.
        """
        ranks = day['Rank']
        # ranks must start at 0 and end at n - 1
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        # weights fall linearly from top_ratio down to 1
        weights = np.linspace(start=top_ratio, stop=1, num=size)

        def _leg(ascending):
            picked = day.sort_values(by='Rank', ascending=ascending)['Target'][:size]
            return (picked * weights).sum() / weights.mean()

        return _leg(True) - _leg(False)

    daily = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily.mean() / daily.std()

In [None]:
# LightGBM settings used for the cross-validation runs (verbose output on).
lgb_params = dict(
    seed=42,
    n_jobs=-1,
    learning_rate=0.05,
    n_estimators=2000,
    verbose=1,
)

In [None]:
from sklearn.model_selection import TimeSeriesSplit

feature_cols = merged_train_data.drop("Target", axis = 1).columns

tscv = TimeSeriesSplit(n_splits = 5)
score = {}

# merged_train_data is stacked security by security, so splitting its rows
# positionally would cut the folds by SecuritiesCode instead of by time.
# Split the sorted unique dates instead: each validation fold is then strictly
# later than its training fold and always contains whole trading days.
unique_dates = merged_train_data.index.unique().sort_values()

print("beginning CV ...\n")

for fold, (train_date_pos, val_date_pos) in enumerate(tscv.split(unique_dates)):

    train_dates = unique_dates[train_date_pos]
    val_dates = unique_dates[val_date_pos]

    print(f"------------- Fold {fold} -------------")
    print(f"Train index: from {train_dates[0]} to {train_dates[-1]}")
    print(f"Validation index: from {val_dates[0]} to {val_dates[-1]}")

    train_part = merged_train_data.loc[merged_train_data.index.isin(train_dates)]
    val_part = merged_train_data.loc[merged_train_data.index.isin(val_dates)]

    X_train, y_train = train_part[feature_cols], train_part["Target"]
    X_val, y_val = val_part[feature_cols], val_part["Target"]

    #train
    lgb_model = LGBMRegressor(**lgb_params)
    lgb_model.fit(X_train, y_train)

    #predict
    result = X_val[["SecuritiesCode"]].copy()
    result["target_pred"] = lgb_model.predict(X_val)
    # positional assignment: the Date index is not unique, so avoid label alignment
    result["Target"] = y_val.to_numpy()

    #rank
    # set_rank already orders each day by target_pred, so no pre-sort is needed.
    # group_keys=False keeps a single "Date" index level, which the scoring
    # function groups on again.
    result = result.groupby("Date", group_keys = False).apply(set_rank)

    score[f"Fold_{fold}"] = calc_spread_return_sharpe(result, portfolio_size = 200)
    print(f"Sharp Ratio for Fold {fold}: ", score[f"Fold_{fold}"], "\n")

print(f"Average sharp ratio: {sum(score.values())/len(score)}")

In [None]:
# LightGBM settings for the final fit: same values as the CV run, with verbose set to 0.
lgb_params = dict(
    seed=42,
    n_jobs=-1,
    learning_rate=0.05,
    n_estimators=2000,
    verbose=0,
)

In [None]:
# Refit on every available training row; this model is the one used for the submission.
X = merged_train_data[feature_cols]
y = merged_train_data["Target"]

lgb_model = LGBMRegressor(**lgb_params)
lgb_model.fit(X, y)

# Plot result

def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """
    Spread return of one day: weighted Target sum of the best-ranked rows
    minus the weighted Target sum of the worst-ranked rows.

    Args:
        df (pd.DataFrame): predicted results; read through 'Rank' and 'Target'
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): spread return
    """
    ranks = df['Rank']
    # ranks must start at 0 and end at n - 1
    assert ranks.min() == 0
    assert ranks.max() == len(ranks) - 1
    # weights fall linearly from toprank_weight_ratio down to 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

    def _leg(ascending):
        picked = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
        return (picked * weights).sum() / weights.mean()

    return _leg(True) - _leg(False)

# Daily spread return of `result` (the predictions left over from the last CV
# fold), using the top/bottom 200 names and a weight ratio of 2.
df_result = result.groupby('Date').apply(_calc_spread_return_per_day, 200, 2)

# daily spread return
daily_ax = df_result.plot(figsize = (20,8))
daily_ax.grid()

# cumulative spread return
cumulative_ax = df_result.cumsum().plot(figsize=(20, 8))
cumulative_ax.grid()

# Submission baseline

In [None]:
# load Time Series API
import jpx_tokyo_market_prediction
# make Time Series API environment (this function can be called only once in a session,
# so re-running this cell presumably requires a kernel restart -- TODO confirm)
env = jpx_tokyo_market_prediction.make_env()
# get iterator to fetch data day by day; the submission loop unpacks six
# frames from it per step and answers each step with env.predict
iter_test = env.iter_test()

In [None]:
price_cols = [
    "Date",
    "SecuritiesCode",
    "Close",
    "AdjustmentFactor"
]

# Keep only the columns adjust_price needs and hold Date as datetime64, so that
# every date comparison below is Timestamp against Timestamp no matter what the
# column contained when this cell started (raw strings or already-converted values).
df_price_raw = df_price_raw[price_cols].copy()
df_price_raw["Date"] = pd.to_datetime(df_price_raw["Date"])

counter = 0

#fetch data day by day
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:

    current_date = prices["Date"].iloc[0]
    sample_prediction_date = sample_prediction["Date"].iloc[0]
    print(f"current date: {current_date}, sample_prediction_date: {sample_prediction_date}")
    current_ts = pd.Timestamp(current_date)

    if counter == 0:
        # first day only: drop history that overlaps the test period
        df_price_raw = df_price_raw.loc[df_price_raw["Date"] < current_ts]

    # keep a rolling window of 80 business days of history
    threshold_ts = current_ts - pd.offsets.BDay(80)
    threshold = threshold_ts.strftime("%Y-%m-%d")
    print(f"threshold: {threshold}")
    df_price_raw = df_price_raw.loc[df_price_raw["Date"] >= threshold_ts]

    # append today's prices with the same Date dtype as the history
    new_prices = prices[price_cols].copy()
    new_prices["Date"] = pd.to_datetime(new_prices["Date"])
    df_price_raw = pd.concat([df_price_raw, new_prices])
    df_price = adjust_price(df_price_raw)

    codes = sorted(prices["SecuritiesCode"].unique())

    #predict
    feature = pd.concat([get_features_for_predict(df_price, code) for code in codes])
    feature = feature.loc[feature.index == current_ts]
    feature.loc[:, "predict"] = lgb_model.predict(feature[feature_cols])

    #set rank
    feature = feature.sort_values("predict", ascending = False).drop_duplicates(subset = ["SecuritiesCode"])
    feature.loc[:, "Rank"] = np.arange(len(feature))
    feature_map = feature.set_index("SecuritiesCode")["Rank"].to_dict()
    sample_prediction["Rank"] = sample_prediction["SecuritiesCode"].map(feature_map)

    #chk rank
    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    env.predict(sample_prediction)
    counter += 1