In [None]:
# LIBRARIES

import pandas as pd
import numpy as np
import gc
import datetime
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore")

In [None]:
# LOAD DATA

# Column dtypes handed to read_csv: 32-bit types for the numeric columns.
# Columns that are not listed here are left to pandas' own type inference.
dtypes_stock_prices = {
    'SecuritiesCode': 'int32',
    **dict.fromkeys(('Open', 'High', 'Low', 'Close'), 'float32'),
    'Volume': 'int32',
    **dict.fromkeys(('AdjustmentFactor', 'ExpectedDividend', 'Target'), 'float32'),
}

# Main training prices.
stock_prices = pd.read_csv(
    '../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv',
    dtype=dtypes_stock_prices,
)
# Supplemental prices, read with the same dtypes.
stock_prices_supp = pd.read_csv(
    '../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv',
    dtype=dtypes_stock_prices,
)

In [None]:
# Stack the main and supplemental price tables row-wise (axis=0 is concat's default).
stock_prices_final = pd.concat((stock_prices, stock_prices_supp))

# The two source frames are no longer needed: drop the names and collect garbage.
del stock_prices
del stock_prices_supp
gc.collect()

In [None]:
# Order the rows by Date (ascending is the default) and renumber the index from 0.
stock_prices_final = stock_prices_final.sort_values('Date')
stock_prices_final = stock_prices_final.reset_index(drop=True)
# Preview the first five rows (head's default).
stock_prices_final.head()

In [None]:
# Missing-value count per column of the combined price table.
stock_prices_final.isna().sum()

**MODELLING DATASETS**

In [None]:
# Lookup table with one row per distinct Date value, sorted ascending.
unique_dates = pd.DataFrame({'Date': stock_prices_final['Date'].unique()})
unique_dates = unique_dates.sort_values('Date').reset_index(drop=True)
# Date_id is the 0-based position of each Date in that sorted order.
unique_dates['Date_id'] = np.arange(len(unique_dates))
# Attach Date_id to every price row.
stock_prices_final = stock_prices_final.merge(unique_dates, how='inner', on='Date')

In [None]:
# ExpectedDividend is removed before dropna so that its missing values do not
# cause rows to be discarded; the remaining rows are then renumbered from 0.
train = stock_prices_final.drop(columns='ExpectedDividend')
train = train.dropna().reset_index(drop=True)

# Features, target and per-row Date_id share the same 0..n-1 index.
x_train = train.loc[:, ["SecuritiesCode", "High", "Open", "Close", "Low", "Volume"]]
y_train = train.loc[:, ['Target']]
date_ids_train = train.loc[:, ['Date_id']]

# Release the wide intermediate frames.
del stock_prices_final
del train
gc.collect()

In [None]:
# Missing-value count per feature column of the training matrix.
x_train.isna().sum()

In [None]:
# (rows, columns) of the training feature matrix.
x_train.shape

**MODELLING**

In [None]:
def gts_cv(date_ids, n_folds, holdout_size):
    '''
    Group Time Series Cross-Validation.

    The distinct Date_id values are split into consecutive chunks of roughly
    `holdout_size` days. The `n_folds` most recent chunks are used as validation
    periods and are yielded in chronological order. Each fold trains on every
    row whose Date_id is strictly smaller than the first day of its validation
    period (so the training set can be empty when the oldest chunk is used).

    date_ids (DataFrame): DataFrame with Date_id
    n_folds (int): # Folds (fewer are produced if there are fewer chunks).
    holdout_size (int): Holdout period size (number of days).

    Yields:
        (train_index, val_index): pandas Index objects with the row labels of
        `date_ids` belonging to the training / validation part of the fold.

    Raises:
        ValueError: if holdout_size is not positive or is larger than the number
        of distinct Date_id values. Because this is a generator, the error is
        raised when the first fold is requested.
    '''
    X = date_ids[['Date_id']]
    unique_date_ids = np.unique(X['Date_id'].values)

    if holdout_size <= 0:
        raise ValueError('holdout_size must be a positive number of days')
    n_chunks = len(unique_date_ids) // holdout_size
    if n_chunks < 1:
        raise ValueError('holdout_size is larger than the number of distinct Date_id values')

    # np.array_split returns a list of arrays whose lengths may differ by one,
    # so the list is reversed with slicing: np.flip would first have to turn the
    # (possibly ragged) list into a single array, which fails on recent NumPy.
    chunks = np.array_split(unique_date_ids, n_chunks)
    most_recent_first = chunks[::-1][:n_folds]
    validation_periods = most_recent_first[::-1]

    for val_date_ids in validation_periods:
        test_condition = X['Date_id'].isin(val_date_ids)
        val_index = X.loc[test_condition].index
        # Train only on days strictly before the validation period.
        train_condition = X['Date_id'] < np.min(val_date_ids)
        train_index = X.loc[train_condition].index

        yield train_index, val_index


In [None]:
def sharpe_ratio_(df, portfolio_size=200, toprank_weight_ratio=2):
    '''
    Sharpe ratio of the daily long/short spread return.

    For every Date_id the `portfolio_size` best-ranked rows are bought and the
    `portfolio_size` worst-ranked rows are sold short, each leg weighted
    linearly from `toprank_weight_ratio` down to 1. The result is the mean of
    the daily spreads divided by their standard deviation.

    df (DataFrame): Dataframe with Date_id/Rank/Target.
    portfolio_size (int): # of equities to buy/sell (200 in the competition).
    toprank_weight_ratio (float): The relative weight of the most highly ranked stock compared to the least (2 in the competition).
    '''
    def _daily_spread(day):
        # Within one day the ranks must start at 0 and end at n-1.
        ranks = day['Rank']
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        weight_mean = weights.mean()

        # Long leg: lowest rank numbers first.
        best_first = day.sort_values(by='Rank')
        long_leg = (best_first['Target'][:portfolio_size] * weights).sum() / weight_mean

        # Short leg: highest rank numbers first.
        worst_first = day.sort_values(by='Rank', ascending=False)
        short_leg = (worst_first['Target'][:portfolio_size] * weights).sum() / weight_mean

        return long_leg - short_leg

    daily_spreads = df.groupby('Date_id').apply(_daily_spread)
    return daily_spreads.mean() / daily_spreads.std()

In [None]:
def predict_(data_topred, models):
    '''
    Predict for Models List: equal-weight average of every model's prediction.

    data_topred: Data to predict (passed unchanged to each model's predict()).
    models: Models list; each element must provide predict() returning an array.

    Returns a 1-D array with one averaged prediction per row.
    Raises ValueError if `models` is empty.
    '''
    n_models = len(models)
    if n_models == 0:
        raise ValueError('models must contain at least one fitted model')

    final_pred = None
    for model in models:
        y_pred = model.predict(data_topred)
        # Flatten to one value per row.
        y_pred = y_pred.reshape(y_pred.shape[0],)
        contribution = y_pred / n_models
        # The first iteration is detected through the accumulator itself, not by
        # comparing model objects: the same model may legitimately appear more
        # than once in the list and must not reset the running average.
        if final_pred is None:
            final_pred = contribution
        else:
            final_pred += contribution

    return final_pred

In [None]:
def _build_rank_frame(date_ids, y_true, y_pred):
    '''
    Assemble the Date_id / Target / pred / Rank frame consumed by sharpe_ratio_.

    date_ids (DataFrame): Date_id of every row of the fold.
    y_true (ndarray): true targets, same row order as date_ids.
    y_pred (ndarray): model predictions, same row order as date_ids.
    '''
    # concat(axis=1) aligns on the index, and the frames built from the arrays are
    # numbered 0..n-1, so date_ids is renumbered the same way to pair rows by position.
    frame = pd.concat([date_ids.reset_index(drop=True), pd.DataFrame(y_true), pd.DataFrame(y_pred)], axis=1)
    frame.columns = ['Date_id', 'Target', 'pred']
    # Rank 0 is the highest prediction of the day; ties are broken by row order.
    frame['Rank'] = frame.groupby('Date_id')['pred'].rank(method='first', ascending=False).astype('int32') - 1
    return frame


# Template estimator; an unfitted copy is cloned for every fold.
xgboost_reg = XGBRegressor(tree_method='gpu_hist', grow_policy='lossguide', min_child_weight=250, n_estimators=10000,
                            subsample=0.8, colsample_bytree=0.8, colsample_bylevel=0.8, colsample_bynode=0.8,
                            learning_rate=0.01, max_depth=3, n_jobs=4, random_state=13)

xgboost_models = []
cvs_dict = {'Fold': [], 'Sharpe_Ratio_train': [], 'Sharpe_Ratio_val': []}
# 5 folds, each validated on a period of about 70 days.
cv = gts_cv(date_ids_train, n_folds=5, holdout_size=70)
fold = 1
for train_index, val_index in cv:
    print(f'xgboost Regressor CV Fold {fold} / Start time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    date_ids_train_cv, date_ids_val_cv = date_ids_train.iloc[train_index], date_ids_train.iloc[val_index]
    x_train_cv, x_val_cv = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_cv = y_train.iloc[train_index].to_numpy().ravel()
    y_val_cv = y_train.iloc[val_index].to_numpy().ravel()

    print('Train Shape:', x_train_cv.shape)
    print('Val Shape:', x_val_cv.shape)

    model_ = clone(xgboost_reg)
    # Early stopping on the validation set: 150 rounds of patience, lower metric is
    # better, and the best model is kept instead of the last one.
    # NOTE(review): newer xgboost releases expect callbacks on the estimator rather
    # than in fit() -- confirm against the installed version.
    es = xgb.callback.EarlyStopping(rounds=150, maximize=False, save_best=True)
    model_.fit(x_train_cv, y_train_cv, eval_set=[(x_val_cv, y_val_cv)], callbacks=[es], verbose=True)

    y_pred_train_cv = model_.predict(x_train_cv)
    y_pred_val_cv = model_.predict(x_val_cv)

    # Train and validation frames are built by the same helper so both are aligned
    # by position rather than by whatever index the fold slice carries.
    rank_train_cv = _build_rank_frame(date_ids_train_cv, y_train_cv, y_pred_train_cv)
    rank_val_cv = _build_rank_frame(date_ids_val_cv, y_val_cv, y_pred_val_cv)

    sharpe_Ratio_cv_train = sharpe_ratio_(rank_train_cv)
    sharpe_Ratio_cv_val = sharpe_ratio_(rank_val_cv)
    cvs_dict['Fold'].append(fold)
    cvs_dict['Sharpe_Ratio_train'].append(sharpe_Ratio_cv_train)
    cvs_dict['Sharpe_Ratio_val'].append(sharpe_Ratio_cv_val)

    print(f'xgboost Regressor CV Fold {fold} / End time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    print(f'CV Train Sharpe Ratio: {sharpe_Ratio_cv_train}')
    print(f'CV Val Sharpe Ratio: {sharpe_Ratio_cv_val}')
    print('----------------------------------------------------------')
    # Keep the fitted model; the fold models are averaged at inference time.
    xgboost_models.append(model_)

    del model_, y_pred_train_cv, y_pred_val_cv, rank_train_cv, rank_val_cv
    gc.collect()
    fold += 1

print('Overall Folds Train Sharpe Ratio:', round(np.asarray(cvs_dict['Sharpe_Ratio_train']).mean(), 4))
print('Overall Folds Val Sharpe Ratio:', round(np.asarray(cvs_dict['Sharpe_Ratio_val']).mean(), 4))


**INFERENCE**

In [None]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # prices["Volume"] selection: under pandas Copy-on-Write that chained in-place
    # call leaves `prices` unchanged.
    prices["Volume"] = prices["Volume"].fillna(1)
    # Every remaining missing value becomes 0.
    prices = prices.fillna(0)

    # Average prediction of the fold models, using the training feature order.
    sample_prediction["Prediction"] = predict_(prices[["SecuritiesCode","High","Open","Close","Low","Volume"]], xgboost_models)
    # NOTE(review): this division aligns the two frames on their index, so it assumes
    # sample_prediction and prices list the securities in the same order -- confirm.
    # NOTE(review): a Volume of exactly 0 is not replaced above and yields inf/NaN here.
    sample_prediction["rate"] = sample_prediction["Prediction"] / prices["Volume"]

    # Rank 0 goes to the largest rate; the rank range follows the actual number of
    # rows instead of a hard-coded 2000.
    sample_prediction = sample_prediction.sort_values(by="rate", ascending=False)
    sample_prediction["Rank"] = np.arange(len(sample_prediction))
    print(sample_prediction)

    # Restore SecuritiesCode order and submit only the required columns.
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)