In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%time df = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")
print(df.shape)
df.head()

In order to rank with the LightGBM ranker, we need to bin the target into groups (at most 31 of them). In effect, the model will try to learn which group each id belongs to.

In [None]:
# Turn the raw Target into an integer relevance label:
#   1. dense-rank Target within each Date, highest Target -> rank 1;
#   2. cut those ranks into 30 quantile buckets and keep the bucket code (0..29),
#      so code 0 holds the highest-Target rows.
# NOTE: this overwrites df["Target"] in place, so the cell is not idempotent;
# re-running it without reloading df re-ranks the bucket codes.
daily_rank = df.groupby("Date")["Target"].rank(method="dense", ascending=False).astype(int)
df["Target"] = pd.qcut(daily_rank, 30).cat.codes

In [None]:
# Show the first and last Date present in the data.
print(df["Date"].agg(["min", "max"]))

In [None]:
# Split boundaries (arbitrarily chosen dates). Each split starts at its own
# date (inclusive) and runs up to the next split's date (exclusive).
time_config = dict(
    train_split_date='2021-12-06',
    val_split_date='2022-02-10',
    test_split_date='2022-02-20',
)

train_start = time_config['train_split_date']
val_start = time_config['val_split_date']
test_start = time_config['test_split_date']

# Date is compared against the boundary strings directly.
dates = df.Date
is_train = (dates >= train_start) & (dates < val_start)
is_val = (dates >= val_start) & (dates < test_start)
is_test = dates >= test_start

train = df[is_train]
val = df[is_val]
test = df[is_test]

for part in (train, val, test):
    print(part.shape)

# Feature columns: everything except the row identifier, the date and the label.
excluded = ("RowId", "Date", "Target")
col_use = [c for c in df.columns if c not in excluded]

In [None]:
def _rows_per_date(frame):
    """Return the row count of each Date, in row order, for LightGBM's `group` argument.

    LightGBM reads `group` as "the first group[0] rows form query 0, the next
    group[1] rows form query 1, ...", so the sizes must be integers, must follow
    the row order of `frame`, and must sum to len(frame). One query here is one
    Date, so every Date's rows have to be contiguous.

    Raises:
        ValueError: if `frame` is not sorted by Date (groups would not be contiguous).
    """
    if not frame["Date"].is_monotonic_increasing:
        raise ValueError("rows must be sorted by Date so that each query group is contiguous")
    # sort=False keeps the groups in order of appearance, matching the row order.
    return frame.groupby("Date", sort=False).size().tolist()


# One query group per trading day. Counting the rows of each Date (instead of
# assuming exactly 2000 stocks per day) also works when a day has fewer rows.
query_train = _rows_per_date(train)
query_val = _rows_per_date(val)
query_test = _rows_per_date(test)

In [None]:
from lightgbm import LGBMRanker

# Ranker hyper-parameters: many trees with a very small learning rate;
# early stopping in fit() bounds the number of rounds.
ranker_params = dict(
    n_estimators=15000,
    random_state=42,
    num_leaves=41,
    learning_rate=0.002,
    # max_bin=20,                 (left disabled)
    # subsample_for_bin=20000,    (left disabled)
    colsample_bytree=0.7,
    n_jobs=2,
)
model_return = LGBMRanker(**ranker_params)

# NOTE(review): newer LightGBM releases drop `verbose` / `early_stopping_rounds`
# from fit() in favour of callbacks; confirm against the installed version.
validation_set = (val[col_use], val['Target'])
model_return.fit(
    train[col_use],
    train['Target'],
    group=query_train,
    eval_set=[validation_set],
    eval_group=[query_val],
    eval_at=[1],  # evaluate the ranking metric at cut-off position 1 (chosen arbitrarily)
    early_stopping_rounds=200,
    verbose=100,
)

In [None]:
# `test` was produced by boolean-indexing `df`, so writing a column into it
# in place triggers SettingWithCopyWarning. `assign` builds a new frame instead.
test = test.assign(pred=model_return.predict(test[col_use]))
test["pred"]  # raw ranker scores; these are not ranks yet

In [None]:
# Load the competition's Time Series API module.
import jpx_tokyo_market_prediction
# Create the API environment. make_env() can be called only once per session,
# so presumably this cell cannot be re-run without restarting the kernel.
env = jpx_tokyo_market_prediction.make_env()
# Iterator that hands out the test data one day at a time.
iter_test = env.iter_test()

In [None]:
# Score each batch served by the API, convert the scores into 0-based ranks
# and submit them.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    try:
        # Raw ranker scores, aligned to sample_prediction by position
        # (assumes `prices` and `sample_prediction` list rows in the same order; TODO confirm).
        sample_prediction['Rank'] = model_return.predict(prices[col_use])
        # Lowest score -> Rank 0, highest score -> last rank, independently per Date.
        # method="first" breaks ties by row order, so ranks are unique within a Date.
        # This gives the same ranks as negating the scores, dense-ranking them
        # descending and then first-ranking that result ascending.
        sample_prediction['Rank'] = (
            sample_prediction.groupby("Date")["Rank"].rank(method="first").astype(int) - 1
        )
    except Exception as err:
        # Keep the submission loop alive, but report the failure instead of hiding it.
        print(f"Prediction failed, falling back to row-order ranks: {err!r}")
        # An all-zero column is not a ranking, so fall back to a 0..N-1 permutation
        # in row order (assumes each batch holds a single Date; TODO confirm).
        sample_prediction['Rank'] = np.arange(len(sample_prediction))
    # Defensive clean-up: turn infinities and missing values into 0 before submitting.
    sample_prediction = sample_prediction.replace([-np.inf, np.inf], np.nan).fillna(0.0)
    # register your predictions
    env.predict(sample_prediction)
    display(sample_prediction)