The idea of this notebook is to show how training and ranking over a selected group of securities affects the competition score.

In [None]:
import numpy as np
import pandas as pd
import math
import os,gc
from scipy import stats
import lightgbm as lgb
import jpx_tokyo_market_prediction
from sklearn.metrics import mean_squared_error

In [None]:
def seed_everything(seed=2021):
    """Seed the random number generators used by this notebook.

    Seeds Python's built-in ``random`` module and NumPy's global generator,
    and exports ``PYTHONHASHSEED``.

    Note: ``PYTHONHASHSEED`` is read when an interpreter starts, so setting
    it here only affects processes launched afterwards, not the hash
    randomisation of the interpreter that is already running.

    Args:
        seed: integer seed applied to every generator.
    """
    # Local import: ``random`` is not part of the notebook's import cell.
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

SEED=2021
seed_everything(SEED)

In [None]:
# Load the main and the supplemental price histories (with "Date" parsed as
# datetime) and stack them into a single training frame.
train = pd.concat(
    [
        pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv", parse_dates=["Date"]),
        pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv", parse_dates=["Date"]),
    ]
)
gc.collect()

# Keep only the price/volume/target columns, drop every row that still has a
# missing value, and renumber the rows from 0.
unused_columns = ['RowId', 'ExpectedDividend', 'AdjustmentFactor', 'SupervisionFlag']
train = train.drop(columns=unused_columns)
train = train.dropna()
train = train.reset_index(drop=True)
train.info()


In [None]:
def feval_pearsonr(y_pred, lgb_train):
    """Custom LightGBM evaluation function: Pearson correlation.

    Args:
        y_pred: predicted values for the dataset being evaluated.
        lgb_train: dataset object exposing ``get_label()`` for the true values.

    Returns:
        Tuple ``('pearsonr', correlation, True)``; the final ``True`` marks
        the metric as higher-is-better.
    """
    labels = lgb_train.get_label()
    correlation = stats.pearsonr(labels, y_pred)[0]
    return 'pearsonr', correlation, True

def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    """Return the weighted long-minus-short spread for one group of rows.

    Args:
        df: frame with a 'Rank' column (must run from 0 to len(df) - 1) and
            a 'Target' column.
        portfolio_size: number of rows taken from each end of the ranking.
        toprank_weight_ratio: weight given to the first row of each leg;
            weights fall linearly to 1 for the last row of the leg.

    Returns:
        Weighted 'Target' sum of the lowest-Rank rows minus the weighted
        'Target' sum of the highest-Rank rows, each divided by the mean weight.

    Raises:
        AssertionError: if the minimum Rank is not 0 or the maximum Rank is
            not len(df) - 1.
    """
    ranks = df['Rank']
    assert ranks.min() == 0
    assert ranks.max() == len(ranks) - 1

    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

    # Long leg: rows with the smallest Rank values come first.
    best_first = df.sort_values(by='Rank')
    long_leg = (best_first['Target'][:portfolio_size] * weights).sum() / weights.mean()

    # Short leg: rows with the largest Rank values come first.
    worst_first = df.sort_values(by='Rank', ascending=False)
    short_leg = (worst_first['Target'][:portfolio_size] * weights).sum() / weights.mean()

    return long_leg - short_leg

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size=200, toprank_weight_ratio=2):
    """Return mean / standard deviation of the per-'Date' spread returns.

    Args:
        df: frame with 'Date', 'Rank' and 'Target' columns.
        portfolio_size: forwarded to ``calc_spread_return_per_day``.
        toprank_weight_ratio: forwarded to ``calc_spread_return_per_day``.

    Returns:
        The mean of the daily spread returns divided by their standard
        deviation.
    """
    per_day = df.groupby('Date')
    daily_spread = per_day.apply(
        calc_spread_return_per_day, portfolio_size, toprank_weight_ratio
    )
    return daily_spread.mean() / daily_spread.std()

def add_rank(df):
    """Add an integer 'Rank' column to ``df`` in place and return ``df``.

    Within each 'Date', the row with the largest 'Target' gets Rank 0, the
    next largest Rank 1, and so on; ties are broken by order of appearance.
    """
    target_by_date = df.groupby("Date")["Target"]
    zero_based = target_by_date.rank(ascending=False, method="first") - 1
    df["Rank"] = zero_based.astype("int")
    return df

def fill_nan_inf(df):
    """Return a copy of ``df`` with NaN, +inf and -inf all replaced by 0.

    The input frame itself is left untouched.
    """
    without_nan = df.fillna(0)
    return without_nan.replace([np.inf, -np.inf], 0)

def check_score(df, preds, Securities_filter=None):
    """Rank the predictions per day and print the resulting competition score.

    Args:
        df: frame with 'Date', 'SecuritiesCode' and the true 'Target'. A
            'Rank' column is written to it in place.
        preds: predicted targets, one per row of ``df`` in row order.
        Securities_filter: optional collection of SecuritiesCode values whose
            prediction is replaced by that day's median prediction before
            ranking. ``None`` or an empty collection applies no filter.

    Prints:
        The score (mean / std of the daily spread return), the mean and the
        std, each rounded to 5 decimals.
    """
    # Avoid a mutable default argument; None means "no filter".
    if Securities_filter is None:
        Securities_filter = []

    tmp_preds = df[['Date', 'SecuritiesCode']].copy()
    tmp_preds['Target'] = preds

    # Rank filter: overwrite the prediction of every filtered security with the
    # median prediction of its date, which moves it toward the middle of that
    # day's ranking.
    tmp_preds['target_median'] = tmp_preds.groupby("Date")["Target"].transform('median')
    is_filtered = tmp_preds['SecuritiesCode'].isin(Securities_filter)
    tmp_preds.loc[is_filtered, 'Target'] = tmp_preds['target_median']

    tmp_preds = add_rank(tmp_preds)
    df['Rank'] = tmp_preds['Rank']

    # Compute the per-day spread returns once and derive all three statistics
    # from that single pass instead of repeating the groupby/apply three times.
    daily_spread = df.groupby('Date').apply(calc_spread_return_per_day, 200, 2)
    spread_mean = daily_spread.mean()
    spread_std = daily_spread.std()

    score = round(spread_mean / spread_std, 5)
    score_mean = round(spread_mean, 5)
    score_std = round(spread_std, 5)
    print(f'Competition_Score:{score}, rank_score_mean:{score_mean}, rank_score_std:{score_std}')

In [None]:
# Target spread per security (max - min of Target over the whole training
# history), sorted ascending. The first 1000 securities form the low-spread
# list and the remaining ones the high-spread list.
target_by_code = train.groupby('SecuritiesCode')['Target']
spread_sorted = (target_by_code.max() - target_by_code.min()).sort_values()
list_spred_l = list(spread_sorted[:1000].index)
list_spred_h = list(spread_sorted[1000:].index)

In [None]:
# Train only on the low-target-spread securities (list_spred_l) and validate
# on the high-target-spread securities (list_spred_h).

features = ['High', 'Low', 'Open', 'Close', 'Volume']
train = fill_nan_inf(train)

# 'metric': 'None' switches off the built-in metrics; only the custom
# feval_pearsonr is evaluated and used for early stopping.
params_lgb = {
    'learning_rate': 0.001,
    'metric': 'None',
    'objective': 'regression',
    'boosting': 'gbdt',
    'verbosity': 0,
    'n_jobs': -1,
    'force_col_wise': True,
}

low_spread_rows = train[train['SecuritiesCode'].isin(list_spred_l)]
high_spread_rows = train[train['SecuritiesCode'].isin(list_spred_h)]

tr_dataset = lgb.Dataset(low_spread_rows[features], low_spread_rows["Target"], feature_name=features)
vl_dataset = lgb.Dataset(high_spread_rows[features], high_spread_rows["Target"], feature_name=features)

training_callbacks = [
    lgb.early_stopping(stopping_rounds=300, verbose=True),
    lgb.log_evaluation(period=100),
]

model = lgb.train(
    params=params_lgb,
    train_set=tr_dataset,
    valid_sets=vl_dataset,
    num_boost_round=3000,
    feval=feval_pearsonr,
    callbacks=training_callbacks,
)

#Early stopping, best iteration is:
#[683]	training's pearsonr: 0.0683874	valid_1's pearsonr: 0.0094303

In [None]:
# Score the model on the supplemental price file, ranking with and without
# the securities lists built from the target spread.
# NOTE(review): this same file was concatenated into `train` earlier in the
# notebook, so these scores are not measured on unseen data.

test = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv",
    parse_dates=["Date"],
).drop(columns=['RowId', 'ExpectedDividend', 'AdjustmentFactor', 'SupervisionFlag'])
test = fill_nan_inf(test)

preds = model.predict(test[features])
rmse = math.sqrt(mean_squared_error(preds, test.Target))
print(rmse)

# First with no filter, then filtering the high-spread list, then the
# low-spread list.
for filtered_codes in ([], list_spred_h, list_spred_l):
    check_score(test, preds, filtered_codes)


#0.02429290521138594
#Competition_Score:0.3891, rank_score_mean:0.32406, rank_score_std:0.83285
#Competition_Score:0.39113, rank_score_mean:0.24486, rank_score_std:0.62601
#Competition_Score:0.25724, rank_score_mean:0.27239, rank_score_std:1.05888

In [None]:
sample_submission = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv"
)

# Initialise the competition environment and get the iterator over test files.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # NaN/inf values are zeroed only in the copy handed to the model.
    model_input = fill_nan_inf(prices)[features]
    prices['Target'] = model.predict(model_input)

    # Filter applied when ranking: every security in list_spred_h gets the
    # median prediction of its date instead of its own prediction.
    prices['target_median'] = prices.groupby("Date")["Target"].transform('median')
    is_filtered = prices['SecuritiesCode'].isin(list_spred_h)
    prices.loc[is_filtered, 'Target'] = prices['target_median']

    prices = add_rank(prices)
    # Assigned by index label — assumes prices and sample_prediction share
    # the same row index and order; TODO confirm.
    sample_prediction['Rank'] = prices['Rank']
    env.predict(sample_prediction)

sample_prediction.head(5)