# Initial setup.
Import packages and libraries used in most projects, and perform basic project configuration.

In [None]:
# Import the basic packages to get started.
import os                          # Misc. operating system interface
import pandas as pd                # Panel Data manipulation & analysis
import numpy as np                 # Math & Arrays
import gc                          # Garbage Collector. Frees up memory.
import matplotlib.pyplot as plt    # Data visualization. Plots & charts of math functions.
from tqdm import tqdm              # For displaying cool progress indicators. From the Arabic name taqaddum which means 'progress'.


# For EDA:
from sklearn.feature_selection import mutual_info_regression

# Some extras for this project:
from decimal import ROUND_HALF_UP, Decimal    # For quick rounding calculations
from lightgbm import LGBMRegressor            # Light Gradient Boosting Machine, good for ranking
from lightgbm import Booster
import sys


# Packages for the models we'll use:
#import tensorflow as tf
#from tensorflow.keras import layers
#from tensorflow import keras

# Set NumPy's global random seed for reproducibility.
# NOTE: this only seeds NumPy's global generator; the LightGBM model gets
# its own seed through lgbm_params later in this notebook.
np.random.seed(0)

# Notes:
# Input data files are available in the read-only "../input/" directory.
# You can write up to 20GB to the current directory (/kaggle/working/), which is preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session.

# Signal that the setup cell ran to completion.
print("Done with setup.")

# Let's look at the input data.
Files under ../input/jpx-tokyo-stock-exchange-prediction

**Copies of data files exist in multiple folders that cover different time windows and serve different purposes.**

* */stock_prices.csv The core file of interest. Includes the daily closing price for each stock and the target column.

* */options.csv Data on the status of a variety of options based on the broader market. Many options include implicit predictions of the future price of the stock market and so may be of interest even though the options are not scored directly.

* */secondary_stock_prices.csv The core dataset contains only the 2,000 most commonly traded equities, but many less liquid securities are also traded on the Tokyo market. This file contains data for those securities, which aren't scored but may be of interest for assessing the market as a whole.

* */trades.csv Aggregated summary of trading volumes from the previous business week.

* */financials.csv Results from quarterly earnings reports.

* stock_list.csv - Mapping between the SecuritiesCode and company names, plus general information about which industry the company is in.
Folders

data_specifications/ - Definitions for individual columns.

jpx_tokyo_market_prediction/ Files that enable the API. Expect the API to deliver all rows in under five minutes and to reserve less than 0.5 GB of memory.



train_files/ Data folder covering the main training period.

supplemental_files/ Data folder containing a dynamic window of supplemental training data. This will be updated with new data during the main phase of the competition in early May, early June, and roughly a week before the submissions are locked.

example_test_files/ Data folder covering the public test period. Intended to facilitate offline testing. Includes the same columns delivered by the API (ie no Target column). You can calculate the Target column from the Close column; it's the return from buying a stock the next day and selling the day after that. This folder also includes an example of the sample submission file that will be delivered by the API.

# We need to choose the correct directory.
The code blocks below come from Kaggler s-meitoma's [Train Demo Notebook](https://www.kaggle.com/code/smeitoma/train-demo)

In [None]:
# Set base_dir to the competition's input folder; all data is loaded from it.
base_dir = "../input/jpx-tokyo-stock-exchange-prediction"

# stock_prices.csv exists in several folders (see the folder list above).
# This notebook uses the copy in the train_files folder.
train_files_dir = f"{base_dir}/train_files"

# For the forecasting phase leaderboard, you may want to include the
# stock_prices.csv found in the supplemental_files folder as well.
# Uncomment the lines marked "forecasting phase leaderboard" in this notebook to do so.
# forecasting phase leaderboard:
# supplemental_files_dir = f"{base_dir}/supplemental_files"


# Generating AdjustedClose price

We will generate AdjustedClose using AdjustmentFactor value. This should reduce historical price gap caused by split/reverse-split.


In [None]:
def adjust_price(price):
    """Add a split-adjusted close price (AdjustedClose) to raw stock prices.

    Args:
        price (pd.DataFrame)  : stock_price data with at least the columns
            Date ("%Y-%m-%d" strings), SecuritiesCode, Close and AdjustmentFactor
    Returns:
        price DataFrame (pd.DataFrame): a new frame indexed by Date (datetime),
            ordered by SecuritiesCode then Date, with CumulativeAdjustmentFactor
            and AdjustedClose added. The frame passed in is left untouched.
    """
    # copy to edit, so the caller's frame keeps its original Date column
    price = price.copy()
    # transform Date column into datetime.
    # Assign the whole column (not price.loc[:, "Date"] = ...): the .loc form
    # writes into the existing object column on pandas >= 2.0 and would leave
    # Date as object dtype instead of datetime64.
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # sort newest first so that cumprod accumulates the factors backwards in time
        df = df.sort_values("Date", ascending=False)
        # generate CumulativeAdjustmentFactor
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # generate AdjustedClose, rounded half-up to one decimal place
        df["AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # back to chronological order
        df = df.sort_values("Date")
        # treat 0 as "no price" so that it can be forward filled
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose
        df["AdjustedClose"] = df["AdjustedClose"].ffill()
        return df

    # generate AdjustedClose one SecuritiesCode at a time.
    # Iterating over the groups (instead of groupby.apply) keeps the
    # SecuritiesCode column in every piece regardless of the pandas version.
    price = price.sort_values(["SecuritiesCode", "Date"])
    per_code = [
        generate_adjusted_close(group)
        for _, group in price.groupby("SecuritiesCode")
    ]
    if per_code:
        price = pd.concat(per_code)
    else:
        # no rows at all: still return a frame with the generated columns
        price = generate_adjusted_close(price)
    price = price.reset_index(drop=True)

    price.set_index("Date", inplace=True)
    return price

In [None]:
# Load the raw daily stock prices for the main training period.
df_price = pd.read_csv(f"{train_files_dir}/stock_prices.csv")

# forecasting phase leaderboard:
# df_price_supplemental = pd.read_csv(f"{supplemental_files_dir}/stock_prices.csv")
# df_price = pd.concat([df_price, df_price_supplemental])

# Generate AdjustedClose; the returned frame is indexed by Date.
df_price = adjust_price(df_price)


# Generating simple features

Let us visualize various numerical values that can be calculated from stock price data. Two examples are generated here: price change rate and historical volatility. These two are basic characteristic quantities in stock analysis.

For this analysis, we will use Toyota Motor Corporation (SecuritiesCode: 7203)


In [None]:
# Keep only the rows for SecuritiesCode 7203 (Toyota Motor Corporation).
# The copy lets the feature columns added below stay out of df_price.
price = df_price.loc[df_price["SecuritiesCode"] == 7203].copy()


# Percentage change in price
Two main rates of price change are used: the rate of change from the past and the rate of change in the future. The past rate of change is calculated, for example, in terms of how much change occurred from the closing price three days ago to the closing price today.

When calculating the rate of price change over the last 2 weeks, 1 month, or 3 months, the look-back window is usually expressed in business days (10, 21, or 63 business days, as shown below) rather than in calendar weeks or months, because this is simpler to handle in practice. Here, the pct_change function of pandas is used for the calculation.


In [None]:
# Look-back windows in business days (roughly 2 weeks, 1 month, 3 months).
periods = [10, 21, 63]
# One return column per window.
return_names = [f"return_{period}" for period in periods]
for period, column in zip(periods, return_names):
    # Rate of change of AdjustedClose against the value `period` rows earlier.
    price.loc[:, column] = price["AdjustedClose"].pct_change(period)


# Historical volatility
Next, we calculate the historical volatility. The historical volatility calculated here is the standard deviation of the logarithmic price change over the last 10, 21, and 63 business days. Historical volatility is a risk indicator and is used to determine how violently prices have fluctuated. In general, stocks with large historical volatility are considered relatively riskier to hold as assets than stocks with small historical volatility.


In [None]:
# Look-back windows in business days (roughly 2 weeks, 1 month, 3 months).
periods = [10, 21, 63]
# One volatility column per window.
vol_names = [f"volatility_{period}" for period in periods]
# Daily logarithmic price change, shared by every window.
log_returns = np.log(price["AdjustedClose"]).diff()
for period, column in zip(periods, vol_names):
    # Standard deviation of the log price change over the last `period` rows.
    price.loc[:, column] = log_returns.rolling(period).std()

Both price change and historical volatility are very commonly used characteristics in equity analysis. As an example, the price change and volatility of Toyota Motor Corporation can be visualized as follows

In [None]:
# Plot the 10-, 21- and 63-business-day returns together on one wide chart.
price[return_names].plot(figsize=(20, 8))

In [None]:
# Plot the 10-, 21- and 63-business-day historical volatilities on one wide chart.
price[vol_names].plot(figsize=(20, 8))

The financial data (financials.csv) included in this competition also offers a variety of features for fundamental analysis, so we encourage you to design various features and use them in the competition.

# Pre-processing for model building

This notebook presents a simple model using LightGBM.

First, the features are generated using the price change and historical volatility described above.

In [None]:
def get_features_for_predict(price, code):
    """Build return and volatility features for one SecuritiesCode.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must have
            the SecuritiesCode and AdjustedClose columns
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): rows of `code` only, same index as
            in `price`, with the columns SecuritiesCode, return_2week,
            return_1month, return_3month, volatility_2week, volatility_1month
            and volatility_3month. NaN and +/-inf are replaced by 0.
    """
    close_col = "AdjustedClose"
    feats = price.loc[price["SecuritiesCode"] == code, ["SecuritiesCode", close_col]].copy()

    # look-back windows in rows (business days), keyed by the column suffix
    windows = {"2week": 10, "1month": 21, "3month": 63}

    # past returns using AdjustedClose
    for suffix, days in windows.items():
        feats[f"return_{suffix}"] = feats[close_col].pct_change(days)

    # historical volatility using AdjustedClose; the daily log return is the
    # same for every window, so it is computed once and reused
    log_returns = np.log(feats[close_col]).diff()
    for suffix, days in windows.items():
        feats[f"volatility_{suffix}"] = log_returns.rolling(days).std()

    # filling data for nan and inf
    feats = feats.fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)
    # drop AdjustedClose column
    feats = feats.drop([close_col], axis=1)

    return feats

In [None]:
# Collect the distinct SecuritiesCodes in ascending order; these are the prediction targets.
codes = sorted(df_price["SecuritiesCode"].unique())
# Number of distinct securities (shown as the cell output).
len(codes)

In [None]:
# Build the feature table: one frame per SecuritiesCode, stacked into a single frame.
feature = pd.concat(
    [get_features_for_predict(df_price, code) for code in tqdm(codes)]
)

# Label creation
Next, we obtain the labels to be used for training the model (this is where we load and split the label data).

In [None]:
def get_label(price, code):
    """ Labelizer: expose the Target of one SecuritiesCode as `label`.
    Args:
        price (pd.DataFrame): dataframe of stock_price.csv
        code (int): Local Code in the universe
    Returns:
        df (pd.DataFrame): label data with the columns SecuritiesCode and
            label, keeping the index of the selected rows
    """
    is_requested_code = price["SecuritiesCode"] == code
    selected = price.loc[is_requested_code, ["SecuritiesCode", "Target"]]
    # the training code expects the target under the name "label"
    return selected.rename(columns={"Target": "label"})

In [None]:
# Split the data into TRAIN and TEST by date.
TRAIN_END = "2019-12-31"
# Leave a gap between TRAIN_END and TEST_START: the Target of a given day is
# computed from later closing prices, so labels at the very end of the
# training period would otherwise leak information from the test period.
TEST_START = "2020-01-06"

def get_features_and_label(price, codes, features):
    """Split features and labels into train and test sets, code by code.

    Args:
        price (pd.DataFrame): loaded price data
        codes  (array) : target codes
        features (pd.DataFrame): features
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # per-code pieces, collected separately for each returned object
    pieces = {"train_X": [], "train_y": [], "test_X": [], "test_y": []}

    # process one SecuritiesCode at a time
    for code in tqdm(codes):
        feats = features[features["SecuritiesCode"] == code].dropna()
        labels = get_label(price, code).dropna()

        # nothing usable for this code
        if feats.shape[0] == 0 or labels.shape[0] == 0:
            continue

        # keep only the dates present in both labels and features
        labels = labels.loc[labels.index.isin(feats.index)]
        feats = feats.loc[feats.index.isin(labels.index)]

        assert (labels.loc[:, "SecuritiesCode"] == feats.loc[:, "SecuritiesCode"]).all()
        labels = labels.loc[:, "label"]

        # split by date: everything up to TRAIN_END vs. everything from TEST_START
        code_split = {
            "train_X": feats.loc[:TRAIN_END],
            "train_y": labels.loc[:TRAIN_END],
            "test_X": feats.loc[TEST_START:],
            "test_y": labels.loc[TEST_START:],
        }

        assert len(code_split["train_X"]) == len(code_split["train_y"])
        assert len(code_split["test_X"]) == len(code_split["test_y"])

        for name, part in code_split.items():
            pieces[name].append(part)

    # stack the per-code pieces
    train_X = pd.concat(pieces["train_X"])
    test_X = pd.concat(pieces["test_X"])
    train_y = pd.concat(pieces["train_y"])
    test_y = pd.concat(pieces["test_y"])

    return train_X, train_y, test_X, test_y

In [None]:
# Build the train/test features and labels for every code, split at TRAIN_END / TEST_START.
train_X, train_y, test_X, test_y = get_features_and_label(
    df_price, codes, feature
)


# Building a simple model
Using the created features and labels, build a model using the following procedure

In [None]:
# Keyword arguments passed to LGBMRegressor: a fixed seed for repeatable
# training, and n_jobs=-1 (presumably "use all available cores" — confirm
# against the LightGBM documentation).
lgbm_params = {
    'seed': 42,
    'n_jobs': -1,
}

# Feature columns fed to the model, in this order.
feat_cols = [
    "return_2week",
    "return_1month",
    "return_3month",
    "volatility_2week",
    "volatility_1month",
    "volatility_3month",
]

In [None]:
# initialize model
pred_model = LGBMRegressor(**lgbm_params)
# train on the named feature columns. The DataFrame is passed as-is (not
# .values) so that training and prediction receive the same kind of input,
# with the same column names.
pred_model.fit(train_X[feat_cols], train_y)
# prepare result data: one row per (Date, SecuritiesCode) of the test set
result = test_X[["SecuritiesCode"]].copy()
# predict
result.loc[:, "predict"] = pred_model.predict(test_X[feat_cols])
# actual result; .values assigns by position, test_y is in the same row order as test_X
result.loc[:, "Target"] = test_y.values

def set_rank(df):
    """Rank the rows of one day by predicted value, best first.

    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): df sorted by predict (descending) with Rank,
            where Rank 0 is the highest prediction
    """
    # highest prediction first
    ranked = df.sort_values("predict", ascending=False)
    # Rank is simply the position in that order, starting from 0
    ranked.loc[:, "Rank"] = np.arange(len(ranked))
    return ranked

# Order by day, and by descending prediction within each day.
result = result.sort_values(["Date", "predict"], ascending=[True, False])
# Rank within each day. group_keys=False keeps the plain Date index: without
# it, pandas >= 2.0 prepends the group key and the result gets two index
# levels both named "Date", which makes later groupby("Date") calls ambiguous.
result = result.groupby("Date", group_keys=False).apply(set_rank)

In [None]:
# Show the last rows of the ranked predictions (displayed as the cell output).
result.tail()


# Evaluation
Input the output of the forecasts of the constructed model into the evaluation function and plot the daily returns.

The evaluation function for this competition is as follows.

Please read [here](https://www.kaggle.com/code/smeitoma/jpx-competition-metric-definition) to learn more about the evaluation function.

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio of the daily spread return of a ranked long/short portfolio.

    Args:
        df (pd.DataFrame): predicted results; needs Date, Rank and Target
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio (mean of the daily spread returns divided by
            their standard deviation)
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results of a single day
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        ranks = df['Rank']
        # ranks must run from 0 to (number of rows - 1)
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        # weights fall linearly from toprank_weight_ratio down to 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

        def _weighted_leg(ascending):
            # Target of the first portfolio_size rows in the requested Rank order
            targets = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (targets * weights).sum() / weights.mean()

        # best-ranked stocks are bought, worst-ranked stocks are sold short
        return _weighted_leg(True) - _weighted_leg(False)

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

In [None]:
# calc spread return sharpe
# Sharpe ratio of the daily spread return on the test period (shown as the cell output).
calc_spread_return_sharpe(result, portfolio_size=200)

Then, we will show daily spread return of the model.

In [None]:
def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): spread return
    """
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

# Spread return of each day, with a 200-stock portfolio and a top-rank weight ratio of 2.
df_result = result.groupby('Date').apply(_calc_spread_return_per_day, 200, 2)

In [None]:
# Plot the daily spread return over the test period.
df_result.plot(figsize=(20, 8))

We also show the cumulative spread return of the model.

# Plot the running total of the daily spread returns.
df_result.cumsum().plot(figsize=(20, 8))

The model in this notebook is now complete! Try different features and training methods through trial and error!


# Saving model
You need to save your model parameters so that the trained model can be used for your submission.

In [None]:
# Write the trained model's underlying booster to "simple-model.txt".
# The path is relative, so the file lands in the current working directory.
pred_model.booster_.save_model("simple-model.txt")

----
# Submission
The code blocks below come from s-meitoma's [Submission Demo](https://www.kaggle.com/code/smeitoma/submission-demo) notebook.

Don't forget the code requirements from the competition page:


   * CPU Notebook <= 9 hours run-time
   * GPU Notebook <= 9 hours run-time
   * Internet access disabled
   * Freely & publicly available external data is allowed, including pre-trained models
   * Submission file must be named submission.csv. The API will generate this submission file for you.

If training + submission takes longer than the maximum run-time, you may wish to separate your notebook into a fork which does the training and writes the models (all the code above this) and a fork which reads the pre-trained models and submits your work (all the code below this).

In [None]:
# model parameters generated by https://www.kaggle.com/ksadhfoia/train
#model_file = "../input/simplemodel/simple-model.txt"
# Use the model file written by the training part of this notebook.
# NOTE(review): assumes the notebook's working directory is /kaggle/working/,
# since the model was saved with a relative path — confirm.
model_file = "/kaggle/working/simple-model.txt"

In [None]:
# Feature column names fed to the model at prediction time.
# This is the same list, in the same order, as the one used for training.
feat_cols = [
    "return_2week",
    "return_1month",
    "return_3month",
    "volatility_2week",
    "volatility_1month",
    "volatility_3month",
]

In [None]:
def generate_adjusted_close(df):
    """Add CumulativeAdjustmentFactor and AdjustedClose for one security.

    Args:
        df (pd.DataFrame)  : stock_price for a single SecuritiesCode
    Returns:
        df (pd.DataFrame): stock_price with AdjustedClose for a single
            SecuritiesCode, sorted by Date in ascending order
    """
    one_decimal = Decimal('0.1')

    def _round_half_up(value):
        # round to one decimal place, with halves going up
        return float(Decimal(str(value)).quantize(one_decimal, rounding=ROUND_HALF_UP))

    # newest row first, so that cumprod accumulates the factors backwards in time
    newest_first = df.sort_values("Date", ascending=False)
    newest_first.loc[:, "CumulativeAdjustmentFactor"] = newest_first["AdjustmentFactor"].cumprod()
    unrounded = newest_first["CumulativeAdjustmentFactor"] * newest_first["Close"]
    newest_first.loc[:, "AdjustedClose"] = unrounded.map(_round_half_up)

    # back to chronological order
    oldest_first = newest_first.sort_values("Date")
    # a price of 0 is turned into NaN so that it can be forward filled
    is_zero = oldest_first["AdjustedClose"] == 0
    oldest_first.loc[is_zero, "AdjustedClose"] = np.nan
    # carry the last known AdjustedClose forward
    oldest_first.loc[:, "AdjustedClose"] = oldest_first.loc[:, "AdjustedClose"].ffill()
    return oldest_first

In [None]:
def adjust_price(price):
    """Add a split-adjusted close price (AdjustedClose) to raw stock prices.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; Date is
            expected as "%Y-%m-%d" strings
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated
            AdjustedClose, indexed by Date (datetime) and ordered by
            SecuritiesCode then Date. The frame passed in is left untouched.
    """
    # copy to edit
    price = price.copy()
    # transform Date column into datetime.
    # Assign the whole column (not price.loc[:, "Date"] = ...): the .loc form
    # writes into the existing object column on pandas >= 2.0 and would leave
    # Date as object dtype instead of datetime64.
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    # generate AdjustedClose one SecuritiesCode at a time.
    # Iterating over the groups (instead of groupby.apply) keeps the
    # SecuritiesCode column in every piece regardless of the pandas version.
    price = price.sort_values(["SecuritiesCode", "Date"])
    per_code = [
        generate_adjusted_close(group)
        for _, group in price.groupby("SecuritiesCode")
    ]
    if per_code:
        price = pd.concat(per_code)
    else:
        # no rows at all: still return a frame with the generated columns
        price = generate_adjusted_close(price)
    price = price.reset_index(drop=True)

    price.set_index("Date", inplace=True)
    return price

In [None]:
def get_features_for_predict(price, code):
    """Build return and volatility features for one SecuritiesCode.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must have
            the SecuritiesCode and AdjustedClose columns
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): rows of `code` only, same index as
            in `price`, with the columns SecuritiesCode, return_2week,
            return_1month, return_3month, volatility_2week, volatility_1month
            and volatility_3month. NaN and +/-inf are replaced by 0.
    """
    close_col = "AdjustedClose"
    feats = price.loc[price["SecuritiesCode"] == code, ["SecuritiesCode", close_col]].copy()

    # (column suffix, look-back window in rows / business days)
    horizons = (("2week", 10), ("1month", 21), ("3month", 63))

    # past returns using AdjustedClose
    for suffix, days in horizons:
        feats[f"return_{suffix}"] = feats[close_col].pct_change(days)

    # historical volatility using AdjustedClose. The daily log return does not
    # depend on the window, so compute it once; this function runs for every
    # code on every test day.
    daily_log_return = np.log(feats[close_col]).diff()
    for suffix, days in horizons:
        feats[f"volatility_{suffix}"] = daily_log_return.rolling(days).std()

    # filling data for nan and inf
    feats = feats.fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)
    # drop AdjustedClose column
    feats = feats.drop([close_col], axis=1)

    return feats

In [None]:
# Load the raw stock price data and keep only the columns needed to build AdjustedClose.
df_price_raw = pd.read_csv(f"{train_files_dir}/stock_prices.csv")
price_cols = [
    "Date",
    "SecuritiesCode",
    "Close",
    "AdjustmentFactor",
]
df_price_raw = df_price_raw[price_cols]

# Filter data to reduce calculation cost: only recent history is kept.
# Date has not been parsed yet, so this compares "YYYY-MM-DD" strings.
df_price_raw = df_price_raw.loc[df_price_raw["Date"] >= "2021-08-01"]

# forecasting phase leaderboard:
# df_price_supplemental = pd.read_csv(f"{supplemental_files_dir}/stock_prices.csv")
# df_price_supplemental = df_price_supplemental[price_cols]
# df_price_raw = pd.concat([df_price_raw, df_price_supplemental])

In [None]:
# Show the last two rows of the filtered raw prices (displayed as the cell output).
df_price_raw.tail(2)

In [None]:
# Load the pre-trained model from model_file.
# Note that pred_model is a lightgbm Booster from here on, not the
# LGBMRegressor created in the training part.
pred_model = Booster(model_file=model_file)

In [None]:
# Load the competition's Time Series API.
import jpx_tokyo_market_prediction
# Create the Time Series API environment (this function can be called only once in a session).
env = jpx_tokyo_market_prediction.make_env()
# Get the iterator that delivers the test data one day at a time.
iter_test = env.iter_test()

In [None]:
# Number of days processed so far; also used to detect the first iteration.
counter = 0
# Fetch the test data day by day, predict, rank and submit.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Date of the delivered prices and date the prediction is requested for.
    current_date = prices["Date"].iloc[0]
    sample_prediction_date = sample_prediction["Date"].iloc[0]
    print(f"current_date: {current_date}, sample_prediction_date: {sample_prediction_date}")

    if counter == 0:
        # To avoid data leakage: on the first day only, drop every stored row
        # dated on or after the first test date.
        # NOTE(review): assumes the API delivers Date in the same "YYYY-MM-DD"
        # string form as the CSV, so that the comparison is meaningful — confirm.
        df_price_raw = df_price_raw.loc[df_price_raw["Date"] < current_date]

    # Filter data to reduce calculation cost: keep the last 80 business days,
    # which covers the longest look-back used by the features (63 rows).
    threshold = (pd.Timestamp(current_date) - pd.offsets.BDay(80)).strftime("%Y-%m-%d")
    print(f"threshold: {threshold}")
    df_price_raw = df_price_raw.loc[df_price_raw["Date"] >= threshold]

    # Append today's prices to the stored history; AdjustedClose is rebuilt from it.
    df_price_raw = pd.concat([df_price_raw, prices[price_cols]])
    # Generate AdjustedClose. adjust_price works on a copy, so df_price_raw
    # stays in its raw form for the next iteration.
    df_price = adjust_price(df_price_raw)

    # Get the SecuritiesCodes delivered for this day.
    codes = sorted(prices["SecuritiesCode"].unique())

    # Generate the features of every code over the stored history.
    feature = pd.concat([get_features_for_predict(df_price, code) for code in codes])
    # Keep only the rows of the current day (the index is Date).
    feature = feature.loc[feature.index == current_date]

    # Predict with the pre-trained model.
    feature.loc[:, "predict"] = pred_model.predict(feature[feat_cols])

    # Set the rank by prediction: highest prediction gets Rank 0. If a code
    # appears more than once, only its highest-predicted row is kept.
    feature = feature.sort_values("predict", ascending=False).drop_duplicates(subset=['SecuritiesCode'])
    feature.loc[:, "Rank"] = np.arange(len(feature))
    # Copy the ranks into the submission frame by SecuritiesCode.
    feature_map = feature.set_index('SecuritiesCode')['Rank'].to_dict()
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(feature_map)

    # Check Rank: every code has a rank, and ranks run from 0 to (rows - 1).
    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    # Register the predictions for this day.
    env.predict(sample_prediction)
    counter += 1

In [None]:
# Show the first lines of the generated submission file.
! head submission.csv

In [None]:
# Show the last lines of the generated submission file.
! tail submission.csv