# Tokyo Stock Exchange Prediction with XGBoost
In this notebook, I build a Tokyo Stock Exchange prediction model using XGBoost. To keep things simple, I use only the secondary stock prices data.

This notebook is based on [Tokyo Stock Exchange Prediction with CatBoost](https://www.kaggle.com/code/lonnieqin/tokyo-stock-exchange-prediction-with-catboost)

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import xgboost as xgb

In [None]:
class Config:
    """Notebook-wide settings."""

    # Directory that every CSV in this notebook is read from.
    dataset_path = Path("..") / "input" / "jpx-tokyo-stock-exchange-prediction"

### Loading data

In [None]:
# Read stock_list.csv from the dataset directory and preview its first rows.
stock_list = pd.read_csv(Config.dataset_path.joinpath("stock_list.csv"))
stock_list.head()

In [None]:
# Read train_files/trades.csv and preview its last rows.
trades = pd.read_csv(Config.dataset_path.joinpath("train_files", "trades.csv"))
trades.tail()

In [None]:
# Read train_files/stock_prices.csv and preview its first rows.
stock_prices = pd.read_csv(Config.dataset_path.joinpath("train_files", "stock_prices.csv"))
stock_prices.head()

In [None]:
# Read train_files/financials.csv and preview its first rows.
financials = pd.read_csv(Config.dataset_path.joinpath("train_files", "financials.csv"))
financials.head()

In [None]:
# Read train_files/options.csv and preview its first rows.
options = pd.read_csv(Config.dataset_path.joinpath("train_files", "options.csv"))
options.head()

In [None]:
# Read train_files/secondary_stock_prices.csv, the only table used for training
# in this notebook, and preview its first rows.
secondary_stock_prices = pd.read_csv(
    Config.dataset_path.joinpath("train_files", "secondary_stock_prices.csv")
)
secondary_stock_prices.head()

In [None]:
# Count how many Volume entries are missing (True) versus present (False).
secondary_stock_prices["Volume"].isna().value_counts()

In [None]:
# Number of rows in the secondary stock prices table.
secondary_stock_prices.shape[0]

## Feature Engineering

In [None]:
def feature_engineering(df):
    """Derive calendar features from ``Date`` and drop identifier columns.

    The input frame is left untouched; a new frame is returned, so the same
    raw frame can safely be passed in more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse.
        A ``RowId`` column is dropped when present.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Date``/``RowId`` and with the columns
        ``year``, ``month``, ``day``, ``dayofweek`` and ``hour`` appended
        in that order.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    """
    dates = pd.to_datetime(df["Date"])
    # ``assign`` works on a copy, so the caller's frame keeps its columns.
    features = df.assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        dayofweek=dates.dt.dayofweek,
        # NOTE(review): hour is 0 whenever Date carries no time component —
        # confirm this feature adds any signal.
        hour=dates.dt.hour,
    )
    # RowId may be missing from some frames; ignore it instead of raising.
    return features.drop(columns=["Date", "RowId"], errors="ignore")

In [None]:
# Replace the raw table with its engineered version. The name is rebound to a
# frame without a Date column, so re-running this cell without first reloading
# the CSV raises KeyError inside feature_engineering.
secondary_stock_prices = feature_engineering(secondary_stock_prices)
secondary_stock_prices.head()

In [None]:
# Split the label column off the feature table; missing targets become 0.
target = secondary_stock_prices.pop("Target").fillna(0)

## Train Validation Split
I will keep the last 10% of the data as a hold-out set.

In [None]:
# Fraction of rows, taken from the end of the table, reserved for validation.
validation_split = 0.1
# Position of the first validation row; everything before it is training data.
split_index = int(len(secondary_stock_prices) * (1 - validation_split))
X_train, X_val = (
    secondary_stock_prices.iloc[:split_index],
    secondary_stock_prices.iloc[split_index:],
)
y_train, y_val = target.iloc[:split_index], target.iloc[split_index:]

## Modeling

In [None]:
# Hyper-parameters forwarded to the XGBoost scikit-learn wrapper.
params = {
    # NOTE(review): 'gpu_hist' is deprecated in XGBoost 2.x in favour of
    # tree_method='hist' with device='cuda' — confirm the installed version.
    'tree_method': 'gpu_hist',
    # XGBoost's logging-level parameter is 'verbosity' (0 silent .. 3 debug);
    # a 'verbose' key is not a model parameter and is ignored with a warning.
    'verbosity': 2,
    'n_estimators': 1000,
    'max_depth': 7,
    'eta': 0.1,
    'subsample': 0.7,
    'colsample_bytree': 0.8
}
model = xgb.XGBRegressor(**params)
# Both splits are passed as evaluation sets so the metric is reported for the
# training data and the hold-out data on every boosting round.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])

## Submission

In [None]:
# Competition API module; within this notebook it is used only by this cell.
import jpx_tokyo_market_prediction


def _scores_to_ranks(scores):
    """Return the 0-based descending rank of every entry in ``scores``.

    The largest score receives rank 0. ``np.argsort`` yields the sort order
    (rank -> index), so it is inverted here to obtain index -> rank. Ties are
    broken by input position.
    """
    scores = np.asarray(scores, dtype=float)
    order = np.argsort(-scores, kind="stable")
    ranks = np.empty(len(scores), dtype=int)
    ranks[order] = np.arange(len(scores))
    return ranks


env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()
# Last known score per SecuritiesCode (keyed by str), carried across iterations
# so a code without a fresh prediction can fall back to its previous score.
historical_return = {}
counter = 0
# The API will deliver six dataframes in this specific order.
# Note: the loop rebinds `options`, `financials` and `trades`, replacing the
# training frames loaded under the same names earlier in the notebook.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    if counter == 0:
        # Show a sample of every delivered frame once, on the first iteration.
        print(prices.head())
        print(options.head())
        print(financials.head())
        print(trades.head())
        print(secondary_prices.head())
        print(sample_prediction.head())

    # All dictionaries below are keyed by the string form of SecuritiesCode.
    codes = [str(code) for code in sample_prediction["SecuritiesCode"].unique()]
    prediction_dict = {code: 0 for code in codes}

    secondary_prices = feature_engineering(secondary_prices)
    y_pred = model.predict(secondary_prices)

    # Accumulate the predicted score per code.
    # NOTE(review): codes present in secondary_prices but absent from
    # sample_prediction are only remembered in historical_return — confirm the
    # two frames actually share codes.
    row_codes = secondary_prices["SecuritiesCode"].astype(str)
    for code, score in zip(row_codes, y_pred):
        if code in prediction_dict:
            prediction_dict[code] += score
            historical_return[code] = prediction_dict[code]
        else:
            historical_return[code] = score

    # Codes that received no score in this iteration reuse their last known one.
    for code in codes:
        if prediction_dict[code] == 0 and code in historical_return:
            prediction_dict[code] = historical_return[code]

    # Highest score -> rank 0; every code gets a distinct rank in 0..N-1.
    ranks = _scores_to_ranks([prediction_dict[code] for code in codes])
    rank_dict = dict(zip(codes, ranks))
    sample_prediction['Rank'] = sample_prediction["SecuritiesCode"].astype(str).map(rank_dict)
    env.predict(sample_prediction)
    counter += 1