## Only Rankings as Features
Key points:
* Here I create a new target that doesn't look ahead
* Reindex data
* Create ranking matrix
* Multiply predictions by -1

In [None]:
import os
import gc
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.display.max_columns = 80
import warnings
warnings.filterwarnings("ignore")

## Train Files

In [None]:
# Directory that holds the training CSV files.
train_filepath = "../input/jpx-tokyo-stock-exchange-prediction/train_files/"

# Stock Prices: read the price table and parse "Date" into datetimes, so the
# later cells can reindex by date and compare against date strings.
df = pd.read_csv(train_filepath + "stock_prices.csv")
df["Date"] = pd.to_datetime(df["Date"])

Some stocks are missing on some days, so I add rows for those days and fill them by forward fill (then backward fill for any leading gaps).

In [None]:
def reindex_by_date(df, start='2017-01-04 00:00:00', end='2021-12-03 00:00:00'):
    """Expand a frame onto a complete date calendar and fill the gaps.

    The frame is indexed by its "Date" column, reindexed onto every date
    produced by ``pd.date_range(start, end)`` and then forward-filled and
    back-filled, so dates missing from the input take the previous (or, at the
    very beginning, the next) available row's values in every column.

    Args:
        df (pd.DataFrame): rows with a "Date" column whose values are unique
            (intended to be called per security via ``groupby(...).apply``).
        start: first date of the calendar; anything ``pd.to_datetime`` accepts.
        end: last date of the calendar (inclusive); same accepted types.
    Returns:
        (pd.DataFrame): the input indexed by the full calendar, without the
        "Date" column (the dates are in the index).
    """
    calendar = pd.date_range(pd.to_datetime(start), pd.to_datetime(end))
    # ffill first so gaps take the last known values; bfill only covers dates
    # that precede the first available row.
    return df.set_index("Date").reindex(calendar).ffill().bfill()

# First and last date present in the raw data.
# NOTE(review): mind/maxd are not used by any cell shown here; reindex_by_date
# is called below without them.
mind, maxd = min(df.Date), max(df.Date)
# Reindex each security separately, then drop index level 0 from the result so
# that only the date level of the index remains.
%time df = df.groupby('SecuritiesCode').apply(reindex_by_date).reset_index(0, drop=True)
print(df.shape)

# Move the date index back into a column; reset_index() names it "index",
# so rename it to "Date".
df = df.reset_index()
df = df.rename({'index': 'Date'}, axis=1)

In [None]:
# Keep only rows strictly after 2021-09-01.
# reset_index() is called without drop=True, so the old row labels are kept as
# an "index" column; later cells select and assign a column with that name.
df = df.loc[df.Date > '2021-09-01'].reset_index()
print(df.shape)

In [None]:
# Same-day close / open ratio.
df["C_O"] = df["Close"] / df["Open"]
# Step 1: dense ascending rank of C_O within each date (missing values are
# replaced by 0.0 before ranking).
df['Target2'] = df.fillna(0.0).groupby("Date")["C_O"].rank("dense", ascending=True).astype(int)
# Step 2: invert and shift so that, per date, the largest C_O gets 0; equal
# dense ranks are separated by row order (method="first").
df["Target2"] = df.groupby("Date")["Target2"].rank(ascending=False,method="first") -1

Now build the rankings matrix: one row per date, one column per stock.

In [None]:
# One row per date: that date's Target2 values spread across columns, in the
# row order of the date's group.
matris = pd.DataFrame(df.groupby("Date")["Target2"].apply(lambda x: x.values).values)[0].apply(pd.Series)
# Column labels are taken from the SecuritiesCode order of the first date only.
# NOTE(review): this assumes every date lists the same codes in the same order — confirm.
codes = df.groupby("Date")["SecuritiesCode"].apply(lambda x: x.values).values[0].tolist()
matris.columns = [f"code_{str(int(code))}" for code in codes]
# groupby returns its groups in sorted key order by default, so the sorted
# unique dates line up with the matrix rows.
dates = sorted(df["Date"].unique())
matris["Date"] = dates
matris

In [None]:
# Attach the date's full ranking row to every row of that date.
%time df = df.merge(matris, on='Date', how='left')

In [None]:
# Columns that must not be used as model inputs. Names that do not occur in
# df.columns have no effect on the filter below.
# "ExpectedDividend" was previously misspelled here, so that column was never
# excluded and ended up among the features.
col_not_use = ["RowId", "Date", "Open", "High", "Low", "Close", "ExpectedDividend", "SupervisionFlag",
              "Target", "OptionsCode", "WholeDayOpen", "WholeDayHigh", "WholeDayLow", "WholeDayClose",
              "DaySessionOpen", "DaySessionHigh", "DaySessionLow", "DaySessionClose",
              "ContractMonth", "Putcall", "LastTradingDay", "SpecialQuotationDay",
              "Dividend", "lagT_{i}",  # plain string (not an f-string): only matches a column literally named lagT_{i}
              "Target2"]

# Everything that is left is a feature.
col_use = [c for c in df.columns if c not in col_not_use]
print(len(col_use))

## Model

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, train_test_split, KFold

from lightgbm import LGBMRegressor, LGBMClassifier, LGBMRanker
from sklearn.linear_model import LinearRegression

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio of the daily long/short spread return implied by a ranking.

    Args:
        df (pd.DataFrame): predicted results; must contain "Date", "Rank" and "Target".
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio (mean of the daily spread returns divided by
        their standard deviation; NaN when that standard deviation is 0 or
        undefined).
    Raises:
        AssertionError: if, for any date, "Rank" does not start at 0 or does
        not end at (number of rows - 1).
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """Weighted spread return of a single date.

        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return, or 0 when it cannot be computed for this
            date (e.g. fewer rows than `portfolio_size`).
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        try:
            # Linearly decaying weights: best rank gets `toprank_weight_ratio`, last gets 1.
            weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
            # Long leg: the `portfolio_size` rows with the lowest Rank values.
            purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
            # Short leg: the `portfolio_size` rows with the highest Rank values.
            short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
            return purchase - short
        except Exception:
            # Best-effort fallback: a date that cannot be scored contributes 0.
            # Catching Exception (not a bare except) keeps this behaviour while
            # letting KeyboardInterrupt / SystemExit propagate.
            return 0

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

In [None]:
# Earliest and latest date left in df.
df.Date.agg(['min', 'max'])

In [None]:
def setup_cv(df, splits=5):
    """Label each row with a date-based fold id in a new/overwritten "fold" column.

    The frame is modified in place and also returned. Rules are applied in
    order and a later rule overwrites an earlier one, which gives:

        Date >  2021-11-01               -> 9
        2021-09-01 < Date <= 2021-10-01  -> 8
        2021-08-01 < Date <= 2021-09-01  -> 7
        2021-07-01 < Date <= 2021-08-01  -> 6
        Date <= 2021-07-01               -> 5

    NOTE(review): dates after 2021-10-01 up to and including 2021-11-01 match
    no rule and keep the initial value -1 — confirm this gap is intended.

    Args:
        df (pd.DataFrame): frame with a "Date" column comparable to date strings.
        splits (int): accepted but not used by this function.
    Returns:
        (pd.DataFrame): the same object that was passed in.
    """
    df['fold'] = -1

    df.loc[df.Date > '2021-11-01', 'fold'] = 9
    # Upper-bound rules, from the latest cutoff to the earliest.
    upper_bounds = (
        ('2021-10-01', 8),
        ('2021-09-01', 7),
        ('2021-08-01', 6),
        ('2021-07-01', 5),
    )
    for cutoff, fold_id in upper_bounds:
        df.loc[df.Date <= cutoff, 'fold'] = fold_id
    return df

# Add the "fold" column to df; 10 is passed as `splits`.
df = setup_cv(df, 10)

In [None]:
# Number of rows per fold label.
df.fold.value_counts()

In [None]:
def train_lgbm(prices, folds, col_use):
    """Fit one LGBMRegressor per validation fold and print its validation scores.

    For every fold id ``f`` from 8 up to (but excluding) `folds`, a model is
    trained on all rows with ``fold < f`` and evaluated on the rows with
    ``fold == f``. Two numbers are printed per fold: the RMSE against
    "Target2" and the spread-return Sharpe ratio of the ranking derived from
    the predictions.

    Args:
        prices (pd.DataFrame): rows with the columns in `col_use` plus
            "Target2", "Target", "Date" and "fold".
        folds (int): exclusive upper bound of the fold ids to validate on.
        col_use (list): names of the feature columns.
    Returns:
        (list): the fitted models, in fold order.
    """
    first_valid_fold = 8  # lower fold ids are never used for validation
    models = list()

    for f in range(first_valid_fold, folds):
        # Filter each side once. Any row whose fold id is below f (including
        # negative ids) is part of the training set.
        train_part = prices[prices.fold < f]
        valid_part = prices[prices.fold == f].reset_index(drop=True)

        X_train = train_part[col_use]
        y_train = train_part["Target2"]
        X_valid = valid_part[col_use]
        y_valid = valid_part["Target2"]

        model = LGBMRegressor(random_state=42, n_estimators=50, colsample_bytree=.5)
        # 1-D target array: the estimator expects a vector, not an (n, 1) column.
        model.fit(X_train.values, y_train.values)

        # Predict the validation fold once and reuse it for both metrics.
        oof_preds = model.predict(X_valid)
        oof_score = np.sqrt(mean_squared_error(y_valid, oof_preds))
        print(oof_score)
        models.append(model)

        dfval = valid_part
        dfval["pred"] = oof_preds
        # Per date: dense ascending rank, then inverted and shifted so that the
        # highest prediction gets Rank 0 (ties separated by row order).
        # NOTE(review): the submission cell negates the predictions before
        # ranking, so this score is for the opposite ordering — confirm intent.
        dfval['Rank'] = dfval.groupby("Date")["pred"].rank("dense", ascending=True).astype(int)
        dfval["Rank"] = dfval.groupby("Date")["Rank"].rank(ascending=False,method="first") -1
        dfval["Rank"] = dfval["Rank"].astype("int")

        score = calc_spread_return_sharpe(dfval, portfolio_size= 200, toprank_weight_ratio= 2)
        print(f"Fold = {f}, Score =  {score}")

    return models

# With folds=10 the loop validates on folds 8 and 9, so two models are returned.
lgbm_models = train_lgbm(df, 10, col_use)

In [None]:
# Take the rows of a single date (2021-12-03) with raw columns only;
# presumably a dry run of the inference feature pipeline — verify.
df_test = df.loc[df.Date=='2021-12-03'].reset_index(drop=True)[["index","Date", "RowId", "SecuritiesCode", "Open","High", "Low", "Close", "Volume", "AdjustmentFactor","ExpectedDividend", "Target"]]

In [None]:
# First ten feature names.
col_use[:10]

In [None]:
# Recompute C_O and Target2 on the single-day frame, with the same steps as for df.
df_test["C_O"] = df_test["Close"] / df_test["Open"]
df_test['Target2'] = df_test.fillna(0.0).groupby("Date")["C_O"].rank("dense", ascending=True).astype(int)
df_test["Target2"] = df_test.groupby("Date")["Target2"].rank(ascending=False,method="first") -1

# One row per date holding that date's Target2 values as columns.
matris = pd.DataFrame(df_test.groupby("Date")["Target2"].apply(lambda x: x.values).values)[0].apply(pd.Series)
# `codes` is reused from the training cell (the recomputation below is disabled).
# NOTE(review): assumes df_test rows follow the same code order as `codes` — confirm.
#codes = df.groupby("Date")["SecuritiesCode"].apply(lambda x: x.values).values[0].tolist()
matris.columns = [f"code_{str(int(code))}" for code in codes]
dates = sorted(df_test["Date"].unique())
matris["Date"] = dates

In [None]:
# Attach the ranking row to every row of the same date.
df_test = df_test.merge(matris, on='Date', how='left')

In [None]:
# Predict with the last trained model on the rebuilt frame.
lgbm_models[-1].predict(df_test[col_use])

In [None]:
# Competition API: create the environment and the iterator over the test batches.
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# NOTE(review): usecols is not referenced anywhere in the code shown.
usecols = ["Date","SecuritiesCode","Open", "High", "Low", "Close","Target"]

# Each batch yields six items; only the first (prices) and last (submission) are used.
for (prices, _, _, _, _, submission) in iter_test:
    # Set Target to NaN on the served frame; "Target" is listed in col_not_use,
    # so it is not a model input.
    prices.loc[:,"Target"] = np.nan
    df_test = prices.reset_index(drop=True)
    # Constant stand-in for the "index" column.
    # NOTE(review): presumably needed because col_use contains "index" — verify.
    df_test["index"] = 1
    
    # Same C_O / Target2 computation as in training.
    df_test["C_O"] = df_test["Close"] / df_test["Open"]
    df_test['Target2'] = df_test.fillna(0.0).groupby("Date")["C_O"].rank("dense", ascending=True).astype(int)
    df_test["Target2"] = df_test.groupby("Date")["Target2"].rank(ascending=False,method="first") -1

    # One row per date holding that date's Target2 values as columns.
    matris = pd.DataFrame(df_test.groupby("Date")["Target2"].apply(lambda x: x.values).values)[0].apply(pd.Series)
    # `codes` comes from the training cell (the recomputation below is disabled).
    # NOTE(review): assumes each batch has the same codes in the same order — confirm.
    #codes = df.groupby("Date")["SecuritiesCode"].apply(lambda x: x.values).values[0].tolist()
    matris.columns = [f"code_{str(int(code))}" for code in codes]
    dates = sorted(df_test["Date"].unique())
    matris["Date"] = dates
    
    
    df_test = df_test.merge(matris, on='Date', how='left')
    # Equal-weight average of the first and the last trained model, negated.
    df_test["pred"] = ((lgbm_models[0].predict(df_test[col_use])*0.5)+(lgbm_models[-1].predict(df_test[col_use])*0.5)) * -1
    # Per date: the highest (negated) prediction gets Rank 0; ties separated by row order.
    df_test['Rank'] = df_test.groupby("Date")["pred"].rank("dense", ascending=True).astype(int)
    df_test["Rank"] = df_test.groupby("Date")["Rank"].rank(ascending=False,method="first") -1 
    df_test["Rank"] = df_test["Rank"].astype("int")
    
    # Positional assignment.
    # NOTE(review): assumes submission rows are in the same order as prices — confirm.
    submission["Rank"] = df_test["Rank"].values
    
    display(df_test.head())
    display(submission.head())
    
    # Hand the filled-in submission back to the API.
    env.predict(submission)