# Introduction of a simple model submission

This notebook shows how to submit a simple model (built [here](https://www.kaggle.com/code/jsmithperera/train-lgbm)).

It is built from https://www.kaggle.com/code/smeitoma/Train_Demo

If you want to know how this notebook takes a model as input, see: https://www.kaggle.com/discussions/getting-started/333636

In [None]:
import sys
from decimal import ROUND_HALF_UP, Decimal

import numpy as np
import pandas as pd
from lightgbm import Booster, LGBMRegressor
import datetime as dt

In [None]:
# Root folder of the competition data set.
base_dir = "../input/jpx-tokyo-stock-exchange-prediction"

# There are three types of folders.
# This notebook reads stock_prices.csv from the train_files folder to check the model
# on the public leaderboard.
train_files_dir = base_dir + "/train_files"

# For the forecasting phase leaderboard you may also want stock_prices.csv from the
# supplemental_files folder. Uncomment the "forecasting phase leaderboard" lines in
# this notebook to use it.
# forecasting phase leaderboard:
#supplemental_files_dir = f"{base_dir}/supplemental_files"

# model parameters generated by https://www.kaggle.com/code/smeitoma/train-demo
model_file = "../input/simplemodel/simple-model.txt"

In [None]:
def generate_adjusted_close(df):
    """
    Add adjusted open/high/low/close prices for one security.

    Args:
        df (pd.DataFrame): stock_price rows for a single SecuritiesCode; the columns
            Date, AdjustmentFactor, Open, High, Low and Close are read
    Returns:
        df (pd.DataFrame): the rows in ascending Date order with the columns
            CumulativeAdjustmentFactor, AdClose, AdOpen, AdHigh and AdLow appended
    """
    def _round_to_tenth(value):
        # Round through Decimal so that halves are always rounded up
        # (ROUND_HALF_UP) to one decimal place.
        rounded = Decimal(str(value)).quantize(Decimal("0.1"), rounding=ROUND_HALF_UP)
        return float(rounded)

    # (raw column, adjusted column), in the order the adjusted columns are appended
    column_pairs = (
        ("Close", "AdClose"),
        ("Open", "AdOpen"),
        ("High", "AdHigh"),
        ("Low", "AdLow"),
    )

    # Newest row first: the running product then gives every row the product of
    # its own AdjustmentFactor and those of all later dates.
    df = df.sort_values("Date", ascending=False)
    df.loc[:, "CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()

    # scale each raw price by the cumulative factor and round it
    for raw_name, adjusted_name in column_pairs:
        scaled = df["CumulativeAdjustmentFactor"] * df[raw_name]
        df.loc[:, adjusted_name] = scaled.map(_round_to_tenth)

    # back to chronological order so that the forward fill runs from past to future
    df = df.sort_values("Date")

    for _, adjusted_name in column_pairs:
        # an adjusted price of 0 is treated as missing ...
        df.loc[df[adjusted_name] == 0, adjusted_name] = np.nan
        # ... and replaced by the most recent earlier value
        df.loc[:, adjusted_name] = df.loc[:, adjusted_name].ffill()

    return df

In [None]:
def adjust_price(price):
    """
    Generate adjusted prices for every security and index the result by date.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; needs a "Date"
            column in %Y-%m-%d form, a "SecuritiesCode" column and the columns read
            by generate_adjusted_close
    Returns:
        price DataFrame (pd.DataFrame): copy of the input sorted by SecuritiesCode
            and Date, with the columns added by generate_adjusted_close, indexed
            by "Date" (datetime)
    """
    # copy to edit, the caller's frame is left untouched
    price = price.copy()

    # transform Date column into datetime
    # The column is replaced as a whole on purpose: `price.loc[:, "Date"] = ...`
    # writes into the existing column, and on pandas >= 2.0 that keeps the object
    # dtype, which would leave an object index that never equals a date string.
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    # generate AdjustedClose, one security at a time
    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(generate_adjusted_close).reset_index(drop=True)

    # return a new frame indexed by Date instead of mutating in place
    return price.set_index("Date")

In [None]:
def calc_change_rate_base(price, column_name, periods):
    """
    Add percentage-change columns for one price column.

    Args:
        price (pd.DataFrame): frame holding ``column_name``; modified in place
        column_name (str): column the change rate is computed from
        periods (iterable of int): lags passed to ``pct_change``
    Returns:
        pd.DataFrame: the same ``price`` object with one
            ``{column_name}_pct_{period}`` column per period
    """
    for lag in periods:
        target = f"{column_name}_pct_{lag}"
        price[target] = price[column_name].pct_change(periods=lag)
    return price

def calc_volatility_base(price, column_name, periods):
    """
    Add rolling standard deviations of the log differences of one price column.

    Args:
        price (pd.DataFrame): frame holding ``column_name``; modified in place
        column_name (str): column the log differences are taken from
        periods (iterable of int): rolling window lengths (min_periods is 1)
    Returns:
        pd.DataFrame: the same ``price`` object with one
            ``{column_name}_vol_{period}`` column per period
    """
    for span in periods:
        log_returns = np.log(price[column_name]).diff()
        window = log_returns.rolling(window=span, min_periods=1)
        price[f"{column_name}_vol_{span}"] = window.std()
    return price

def calc_moving_average_rate_base(price, column_name, periods):
    """
    Add the ratio of the rolling mean to the current value of one price column.

    Args:
        price (pd.DataFrame): frame holding ``column_name``; modified in place
        column_name (str): column the moving average is computed from
        periods (iterable of int): rolling window lengths (min_periods is 1)
    Returns:
        pd.DataFrame: the same ``price`` object with one
            ``{column_name}_mov_{period}`` column per period
    """
    for span in periods:
        level = price[column_name]
        average = level.rolling(window=span, min_periods=1).mean()
        price[f"{column_name}_mov_{span}"] = average / level
    return price

In [None]:
def get_features_for_predict(price, code):
    """
    Build the feature rows of one security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price with the adjusted
            price columns and the two sector code columns
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): one row per input row of ``code`` holding
            SecuritiesCode, the sector codes and the generated features; the
            adjusted price columns themselves are dropped
    """
    close_col, open_col, high_col, low_col = "AdClose", "AdOpen", "AdHigh", "AdLow"
    adjusted_cols = [high_col, low_col, open_col, close_col]

    # rows of the requested security, restricted to the columns used below
    wanted = ["SecuritiesCode", *adjusted_cols, "17SectorCode", "33SectorCode"]
    feats = price.loc[price["SecuritiesCode"] == code, wanted].copy()

    # change rate, volatility and moving-average ratio of the adjusted close
    # (the same features for the adjusted open and the calendar features
    # month/day/dow are currently disabled)
    periods = [10, 21, 63]
    for add_features in (
        calc_change_rate_base,
        calc_volatility_base,
        calc_moving_average_rate_base,
    ):
        feats = add_features(feats, close_col, periods)

    # 21-row rolling standard deviation of three intraday range ratios
    high, low, close = feats[high_col], feats[low_col], feats[close_col]
    feats["HLRolling"] = ((high - low) / low).rolling(21).std()
    feats["CLRolling"] = ((close - low) / low).rolling(21).std()
    feats["HCRolling"] = ((high - close) / close).rolling(21).std()

    # filling data for nan and inf
    feats = feats.fillna(0).replace([np.inf, -np.inf], 0)

    # the adjusted prices were only inputs, they are not returned as features
    return feats.drop(columns=[close_col, open_col, high_col, low_col])

In [None]:
# load stock price data
# columns of stock_prices.csv that are kept
price_cols = ["Date", "SecuritiesCode", "High", "Low", "Open", "Close", "AdjustmentFactor"]

# load stock price data, restricted to those columns
df_price_raw = pd.read_csv(f"{train_files_dir}/stock_prices.csv")[price_cols]

In [None]:
# Getting sectors for each stock
# Getting sectors for each stock
lista = pd.read_csv(f"{base_dir}/stock_list.csv")
sectores = lista[["SecuritiesCode", "17SectorCode", "33SectorCode"]]

# discard the rows whose 17-sector or 33-sector code is '-'
sector = sectores.loc[
    (sectores["17SectorCode"] != "-") & (sectores["33SectorCode"] != "-")
]

# sector rows of the securities that appear in the price data
df = sector[sector["SecuritiesCode"].isin(df_price_raw["SecuritiesCode"])]

# attach the sector codes to every price row; no `on=` is given, so the join uses
# the columns both frames share
df_price_raw = df_price_raw.merge(df, how="left")

#df_price_raw['17SectorCode'] = pd.Categorical(df_price_raw['17SectorCode'])
#df_price_raw['33SectorCode'] = pd.Categorical(df_price_raw['33SectorCode'])

In [None]:
#del lista, sectores, sector

In [None]:
# forecasting phase leaderboard: uncomment to append the supplemental prices
#df_price_supplemental = pd.read_csv(f"{supplemental_files_dir}/stock_prices.csv")
#df_price_supplemental = df_price_supplemental[price_cols]
#df_price_raw = pd.concat([df_price_raw, df_price_supplemental])

# keep the rows dated 2021-02-01 or later (Date is compared as a YYYY-MM-DD string)
df_price_raw = df_price_raw.loc[lambda frame: frame["Date"] >= "2021-02-01"]

# quick look at the first rows and at columns that are entirely missing
print(df_price_raw.head(2))
print(df_price_raw.isna().all())

In [None]:
# load pre-trained model
# Build the LightGBM Booster from the saved model file at `model_file`;
# the cells shown here only predict with it, they do not train it.
pred_model = Booster(model_file = model_file)

In [None]:
# load Time Series API
# Competition-provided module that serves the test data one day at a time.
import jpx_tokyo_market_prediction
# make Time Series API environment (this function can be called only once in a session,
# so re-running this cell requires a fresh kernel)
env = jpx_tokyo_market_prediction.make_env()
# get iterator to fetch data day by day; it is consumed by the prediction loop
iter_test = env.iter_test()

In [None]:
# feature column names, in the order they are passed to the model
#
# An earlier feature set also contained AdOpen_pct_10/21/63 and the calendar
# features month, day and dow; they are disabled here and in
# get_features_for_predict.
feat_cols = ["17SectorCode", "33SectorCode"]
feat_cols += [
    f"AdClose_{kind}_{period}"
    for kind in ("pct", "vol", "mov")
    for period in (10, 21, 63)
]
feat_cols += ["HLRolling", "CLRolling", "HCRolling"]

In [None]:
# Number of batches already processed; 0 marks the first iteration, which also
# runs the one-time setup steps guarded by `counter == 0` below.
counter = 0
# fetch data day by day
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # the dates are read from the first row of each frame
    # (assumes every row of a batch carries the same date -- TODO confirm)
    current_date = prices["Date"].iloc[0]
    sample_prediction_date = sample_prediction["Date"].iloc[0]
    print(f"current_date: {current_date}, sample_prediction_date: {sample_prediction_date}")

    if counter == 0:
        # to avoid data leakage: drop stored rows dated on or after the first served date
        df_price_raw = df_price_raw.loc[df_price_raw["Date"] < current_date]

    # filter data to reduce calculation cost: only rows from the last
    # 80 business days before current_date are kept
    threshold = (pd.Timestamp(current_date) - pd.offsets.BDay(80)).strftime("%Y-%m-%d")
    print(f"threshold: {threshold}")
    
    df_price_raw = df_price_raw.loc[df_price_raw["Date"] >= threshold]
    
    # attach the sector codes held in `df`; no `on=` is given, so the join uses
    # the columns both frames share
    prices = prices.merge(df, how="left")

    if counter == 0:
        # extend the column selection once so the sector codes are carried along
        # (this mutates the notebook-level price_cols list)
        price_cols.append("17SectorCode")
        price_cols.append("33SectorCode")
    
    # to generate AdjustedClose, increment price data
    df_price_raw = pd.concat([df_price_raw, prices[price_cols]])
    
    # generate AdjustedClose (recomputed over the whole retained window every iteration)
    df_price = adjust_price(df_price_raw)

    # get target SecuritiesCodes
    codes = sorted(prices["SecuritiesCode"].unique())
    
    # generate feature: one frame per code, stacked in code order
    feature = pd.concat([get_features_for_predict(df_price, code) for code in codes])
    
    if counter == 0:
        # debug output: shape of the feature rows for the first served date
        print("10=",feature.loc[feature.index == current_date].shape)
    
    # calendar features are disabled
    #feature['month'] = pd.Categorical(feature['month'])
    #feature['day'] = pd.Categorical(feature['day'])
    #feature['dow'] = pd.Categorical(feature['dow'])
    
    # sector codes are converted to categoricals before the rows are filtered,
    # so the categories come from every row in the window
    feature['17SectorCode'] = pd.Categorical(feature['17SectorCode'])
    feature['33SectorCode'] = pd.Categorical(feature['33SectorCode'])
    
    # filter feature for this iteration
    feature = feature.loc[feature.index == current_date]

    # prediction
    feature.loc[:, "predict"] = pred_model.predict(feature[feat_cols])
            
    # set rank by predict: highest prediction first, one row kept per SecuritiesCode
    feature = feature.sort_values("predict", ascending=False).drop_duplicates(subset=['SecuritiesCode'])
    
    # Rank 0 goes to the highest prediction
    feature.loc[:,"Rank"] = np.arange(len(feature))
    
    # SecuritiesCode -> Rank lookup
    feature_map = feature.set_index('SecuritiesCode')['Rank'].to_dict()
    
    # codes absent from feature_map become NaN and are caught by the first assert below
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(feature_map)

    # check Rank: no missing value, and the values span 0 .. number of rows - 1
    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    # register your predictions
    env.predict(sample_prediction)
    counter += 1

In [None]:
# show the first lines of submission.csv
! head submission.csv

In [None]:
# show the last lines of submission.csv
! tail submission.csv