# Introduction to simple model submission

This notebook shows how to submit the simple model built in the [train-demo notebook](https://www.kaggle.com/code/smeitoma/train-demo).

In [None]:
import sys
from decimal import ROUND_HALF_UP, Decimal

import numpy as np
import pandas as pd
from lightgbm import Booster, LGBMRegressor

In [None]:
# Root folder of the competition dataset; every data path below is built from it.
base_dir = "../input/jpx-tokyo-stock-exchange-prediction"

# There are three types of folders.
# This notebook reads stock_prices.csv from the train_files folder to check the
# model on the public leaderboard.
train_files_dir = f"{base_dir}/train_files"

# For the forecasting phase leaderboard, you may also want to include
# stock_prices.csv from the supplemental_files folder. To do so, uncomment the
# lines marked "forecasting phase leaderboard" in this notebook.
# forecasting phase leaderboard:
# supplemental_files_dir = f"{base_dir}/supplemental_files"


# Model parameters generated by https://www.kaggle.com/code/smeitoma/train-demo
model_file = "../input/simplemodel/simple-model.txt"

In [None]:
# Names of the feature columns passed to the model at prediction time.
# Each name matches a column produced by get_features_for_predict.
# NOTE(review): presumably this must match the columns (and order) used when the
# model was trained in the train-demo notebook — confirm against that notebook.
feat_cols = [
    "return_2week",
    "return_1month",
    "return_3month",
    "volatility_2week",
    "volatility_1month",
    "volatility_3month",
]

In [None]:
def generate_adjusted_close(df):
    """
    Add CumulativeAdjustmentFactor and AdjustedClose columns for one security.

    Args:
        df (pd.DataFrame)  : stock_price for a single SecuritiesCode
    Returns:
        df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode,
            sorted by Date in ascending order
    """

    def _round_half_up(value):
        # round to one decimal place, with ties rounded away from zero
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    # walk from the newest row to the oldest so the running product accumulates
    # every adjustment factor dated on or after each row
    newest_first = df.sort_values("Date", ascending=False)
    cumulative_factor = newest_first["AdjustmentFactor"].cumprod()
    newest_first.loc[:, "CumulativeAdjustmentFactor"] = cumulative_factor

    # scale the raw close by the cumulative factor and round the result
    scaled_close = newest_first["CumulativeAdjustmentFactor"] * newest_first["Close"]
    newest_first.loc[:, "AdjustedClose"] = scaled_close.map(_round_half_up)

    # back to chronological order
    oldest_first = newest_first.sort_values("Date")

    # a zero adjusted close is treated as missing and takes the previous value
    is_zero = oldest_first["AdjustedClose"] == 0
    oldest_first.loc[is_zero, "AdjustedClose"] = np.nan
    oldest_first.loc[:, "AdjustedClose"] = oldest_first.loc[:, "AdjustedClose"].ffill()
    return oldest_first

In [None]:
def adjust_price(price):
    """
    Build AdjustedClose for every security and index the result by Date.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must contain the
            columns Date, SecuritiesCode, Close and AdjustmentFactor
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose,
            sorted by SecuritiesCode then Date and indexed by Date. The input frame
            is not modified.
    """
    # copy to edit
    price = price.copy()
    # transform Date column into datetime.
    # Assign the column directly: `price.loc[:, "Date"] = ...` writes into the
    # existing column in place on recent pandas versions and may leave it as
    # object dtype instead of datetime64.
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    # generate AdjustedClose
    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate over the groups explicitly instead of using groupby().apply(), so
    # each per-security frame still carries its SecuritiesCode column regardless
    # of how apply() treats grouping columns in the installed pandas version.
    adjusted_groups = [
        generate_adjusted_close(group)
        for _, group in price.groupby("SecuritiesCode")
    ]
    if adjusted_groups:
        price = pd.concat(adjusted_groups)
    else:
        # no rows: still add the generated columns so callers see the same schema
        price = generate_adjusted_close(price)
    price = price.reset_index(drop=True)

    return price.set_index("Date")

In [None]:
def get_features_for_predict(price, code):
    """
    Compute return and volatility features for one security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price with an
            AdjustedClose column
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): one row per input row of `code`, with
            SecuritiesCode plus the return_* and volatility_* columns; NaN and
            infinite values are replaced by 0
    """
    close_col = "AdjustedClose"
    # (column name, look-back window in rows) for each feature family
    return_windows = (
        ("return_2week", 10),
        ("return_1month", 21),
        ("return_3month", 63),
    )
    volatility_windows = (
        ("volatility_2week", 10),
        ("volatility_1month", 21),
        ("volatility_3month", 63),
    )

    is_target = price["SecuritiesCode"] == code
    feats = price.loc[is_target, ["SecuritiesCode", close_col]].copy()
    close = feats[close_col]

    # percentage change of AdjustedClose over each look-back window
    for column, window in return_windows:
        feats[column] = close.pct_change(window)

    # historical volatility: rolling standard deviation of the log price difference
    log_diff = np.log(close).diff()
    for column, window in volatility_windows:
        feats[column] = log_diff.rolling(window).std()

    # neutralise missing values first, then infinities
    feats = feats.fillna(0).replace([np.inf, -np.inf], 0)

    # AdjustedClose was only needed to derive the features
    return feats.drop(columns=[close_col])

In [None]:
# columns required to rebuild AdjustedClose
price_cols = ["Date", "SecuritiesCode", "Close", "AdjustmentFactor"]

# load stock price data, keeping only the required columns
df_price_raw = pd.read_csv(f"{train_files_dir}/stock_prices.csv")[price_cols]

# keep only recent rows to reduce calculation cost
is_recent = df_price_raw["Date"] >= "2021-08-01"
df_price_raw = df_price_raw.loc[is_recent]

# forecasting phase leaderboard:
# df_price_supplemental = pd.read_csv(f"{supplemental_files_dir}/stock_prices.csv")
# df_price_supplemental = df_price_supplemental[price_cols]
# df_price_raw = pd.concat([df_price_raw, df_price_supplemental])

In [None]:
# show the last two rows of the loaded price history as a quick sanity check
df_price_raw.tail(2)

In [None]:
# load the pre-trained LightGBM model from the text file referenced by model_file
pred_model = Booster(model_file=model_file)

In [None]:
# load the competition's Time Series API module
import jpx_tokyo_market_prediction
# create the Time Series API environment
# (make_env can be called only once in a session, so do not re-run this cell)
env = jpx_tokyo_market_prediction.make_env()
# get the iterator that yields the test data one day at a time
iter_test = env.iter_test()

In [None]:
# Daily prediction loop: for each day served by the Time Series API, extend the
# price history, rebuild the features, rank the securities by predicted value and
# register the ranking with the API.
counter = 0
# fetch data day by day
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    current_date = prices["Date"].iloc[0]
    sample_prediction_date = sample_prediction["Date"].iloc[0]
    print(f"current_date: {current_date}, sample_prediction_date: {sample_prediction_date}")

    if counter == 0:
        # to avoid data leakage, drop any history dated on or after the first served day
        df_price_raw = df_price_raw.loc[df_price_raw["Date"] < current_date]

    # filter data to reduce calculation cost: keep roughly the last 80 business days,
    # which covers the longest (63-row) feature window
    threshold = (pd.Timestamp(current_date) - pd.offsets.BDay(80)).strftime("%Y-%m-%d")
    print(f"threshold: {threshold}")
    df_price_raw = df_price_raw.loc[df_price_raw["Date"] >= threshold]

    # to generate AdjustedClose, append today's prices to the history
    df_price_raw = pd.concat([df_price_raw, prices[price_cols]])
    # generate AdjustedClose
    df_price = adjust_price(df_price_raw)

    # get target SecuritiesCodes
    codes = sorted(prices["SecuritiesCode"].unique())

    # generate feature
    feature = pd.concat([get_features_for_predict(df_price, code) for code in codes])
    # filter feature for this iteration.
    # Compare against a Timestamp rather than the raw date string so the match does
    # not depend on the index being a DatetimeIndex, and take a copy so the column
    # assignments below act on an independent frame.
    feature = feature.loc[feature.index == pd.Timestamp(current_date)].copy()

    # prediction
    feature["predict"] = pred_model.predict(feature[feat_cols])

    # set rank by predict: highest prediction gets Rank 0
    feature = feature.sort_values("predict", ascending=False).drop_duplicates(subset=['SecuritiesCode'])
    feature["Rank"] = np.arange(len(feature))
    feature_map = feature.set_index('SecuritiesCode')['Rank'].to_dict()
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(feature_map)

    # check Rank: every security is ranked and the ranks span 0 .. n-1
    assert sample_prediction["Rank"].notna().all(), "some securities have no Rank"
    assert sample_prediction["Rank"].min() == 0, "Rank must start at 0"
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1, "Rank must end at n-1"

    # register your predictions
    env.predict(sample_prediction)
    counter += 1

In [None]:
# show the first lines of submission.csv
! head submission.csv

In [None]:
# show the last lines of submission.csv
! tail submission.csv