In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 20,12
import warnings
warnings.filterwarnings('ignore')
import time
# Configuration
# Use the fully qualified option key: the abbreviated pattern 'max_column' is
# resolved by regex and raises OptionError when it matches more than one option
# (e.g. display.max_columns and styler.render.max_columns).
pd.set_option('display.max_columns', None)
colors = sns.color_palette('Set2')
from tqdm.notebook import tqdm
tqdm.pandas()
from decimal import ROUND_HALF_UP, Decimal
from datetime import datetime, timedelta
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler
from multiprocessing import Pool, cpu_count
from dateutil.relativedelta import relativedelta, FR
# models
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier

path = '/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files'
sup_path = '/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files'

In [2]:
def merge_options(df1, df2) :
    """Attach per-date options-market aggregates to a feature frame.

    Args:
        df1 (pd.DataFrame): feature rows; must contain a "Date" column that is
            merge-compatible with parsed datetimes, plus any other columns
            (e.g. "SecuritiesCode"), which are passed through unchanged.
        df2 (pd.DataFrame): options rows with the columns "Date", "Putcall",
            "TradingVolume", "WholeDayVolume", "ImpliedVolatility",
            "BaseVolatility" and "InterestRate". It is NOT modified.
    Returns:
        pd.DataFrame: df1 left-joined with the options aggregates and indexed
        by "Date" (all other columns, including "SecuritiesCode", stay columns).

    Notes:
        The volume totals are sums over every row of df2 (not per date), so
        the ratio columns are constant within one call. Only rows with
        Putcall == 1 contribute to the per-date means of
        "Relative Volatility" and "InterestRate".
    """
    # Work on a copy so the caller's options frame is left untouched.
    options = df2.copy()
    # Plain column assignment guarantees a datetime64 column; assigning through
    # .loc[:, "Date"] may keep the original object dtype on newer pandas.
    options["Date"] = pd.to_datetime(options["Date"], format="%Y-%m-%d")
    options["Relative Volatility"] = options["ImpliedVolatility"] / options["BaseVolatility"]

    # Totals over all option rows passed in (puts and calls together).
    total_volume = np.float64(options["TradingVolume"].sum())
    total_day_volume = np.float64(options["WholeDayVolume"].sum())

    # Explicit copy: new columns are added to the filtered frame below.
    puts = options.loc[options["Putcall"] == 1].copy()
    put_volume = np.float64(puts["TradingVolume"].sum())
    call_volume = total_volume - put_volume
    put_day_volume = np.float64(puts["WholeDayVolume"].sum())
    call_day_volume = total_day_volume - put_day_volume

    # numpy floats divide by zero into inf/nan instead of raising, which matches
    # the Series-based arithmetic this replaces.
    puts["Put_ratio"] = put_volume / total_volume
    puts["Call_ratio"] = call_volume / total_volume
    puts["Put_ratio_day"] = put_day_volume / total_day_volume
    puts["Cal_ratio_day"] = call_day_volume / total_day_volume
    puts["Call_call_ratio"] = call_day_volume / call_volume
    puts["Put_put_ratio"] = put_day_volume / put_volume

    per_date = puts[["Date","Relative Volatility","InterestRate","Put_ratio","Call_ratio","Put_ratio_day","Cal_ratio_day","Call_call_ratio","Put_put_ratio"]]
    per_date = per_date.groupby("Date").mean().reset_index()

    merged = pd.merge(df1, per_date, on = "Date", how="left")
    # Only "Date" becomes the index. The previous call passed "SecuritiesCode"
    # positionally, where it was read as the `drop` flag (and is rejected
    # outright by pandas >= 2.0, where that argument is keyword-only).
    return merged.set_index("Date")

In [3]:
def get_label(price, code):
    """Extract the label column for a single security.

    Args:
        price (pd.DataFrame): price data holding "SecuritiesCode" and "Target"
        code (int): security code to select
    Returns:
        pd.DataFrame: the rows of `price` for `code`, reduced to the columns
        "SecuritiesCode" and "label" (a copy of "Target")
    """
    is_code = price["SecuritiesCode"] == code
    selected = price.loc[is_code].copy()
    # the label is simply the Target value of the same row
    selected.loc[:, "label"] = selected["Target"]
    wanted = ["SecuritiesCode", "label"]
    return selected.loc[:, wanted]

def get_features(price, codes, features):
    """Collect the NaN-free feature rows of the requested codes.

    Note: a later cell of this notebook defines another `get_features`
    taking a single argument; once that cell has run it replaces this one.

    Args:
        price (pd.DataFrame): loaded price data (not used by this function)
        codes (array): target codes
        features (pd.DataFrame): features, with a "SecuritiesCode" column
    Returns:
        test_X (pd.DataFrame): concatenated per-code feature rows
    """
    # per-code slices with incomplete rows removed, in the order of `codes`
    per_code = (
        features[features["SecuritiesCode"] == code].dropna()
        for code in tqdm(codes)
    )
    # codes that end up with no rows are skipped
    tests_X = [rows for rows in per_code if rows.shape[0] > 0]
    return pd.concat(tests_X)

def get_features_and_label(price, codes, features,TRAIN_END,TEST_START):
    """Build train/test feature and label sets, code by code.

    Args:
        price (pd.DataFrame): loaded price data (source of the labels)
        codes (array): target codes
        features (pd.DataFrame): features, with a "SecuritiesCode" column
        TRAIN_END: upper bound used to slice the training rows (`[:TRAIN_END]`)
        TEST_START: lower bound used to slice the test rows (`[TEST_START:]`)
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.Series): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.Series): label for test_X
    """
    collected = {"train_X": [], "test_X": [], "train_y": [], "test_y": []}

    for code in tqdm(codes):
        code_feats = features[features["SecuritiesCode"] == code].dropna()
        code_labels = get_label(price, code).dropna()

        # nothing to learn from when either side is empty
        if code_feats.shape[0] == 0 or code_labels.shape[0] == 0:
            continue

        # keep only the index values present on both sides
        code_labels = code_labels.loc[code_labels.index.isin(code_feats.index)]
        code_feats = code_feats.loc[code_feats.index.isin(code_labels.index)]

        assert (code_labels.loc[:, "SecuritiesCode"] == code_feats.loc[:, "SecuritiesCode"]).all()
        code_labels = code_labels.loc[:, "label"]

        # slice both sides with the same bounds
        split = {
            "train_X": code_feats[: TRAIN_END],
            "test_X": code_feats[TEST_START:],
            "train_y": code_labels[: TRAIN_END],
            "test_y": code_labels[TEST_START:],
        }

        assert len(split["train_X"]) == len(split["train_y"])
        assert len(split["test_X"]) == len(split["test_y"])

        for key, part in split.items():
            collected[key].append(part)

    # stitch the per-code pieces back together
    train_X = pd.concat(collected["train_X"])
    test_X = pd.concat(collected["test_X"])
    train_y = pd.concat(collected["train_y"])
    test_y = pd.concat(collected["test_y"])

    return train_X, train_y, test_X, test_y

In [4]:
def adjust_price(price):
    """Add split-adjusted close and volume columns to a stock price frame.

    Args:
        price (pd.DataFrame): stock prices with the columns "Date",
            "SecuritiesCode", "Close", "Volume" and "AdjustmentFactor".
            The frame passed in is not modified.
    Returns:
        pd.DataFrame: the rows sorted by SecuritiesCode then Date, indexed by
        "Date" ("SecuritiesCode" stays a column), with the added columns
        "CumulativeAdjustmentFactor", "AdjustedClose",
        "CumAdjustmentFactorVol" and "AdjustedVolume".
    """
    # assign() builds a new frame, so the caller's frame keeps its raw columns,
    # and it avoids the `.loc[:, "Date"] = ...` form, which may keep the old
    # object dtype on newer pandas.
    price = price.assign(
        Date=pd.to_datetime(price["Date"], format="%Y-%m-%d"),
        SecuritiesCode=price["SecuritiesCode"].astype(int),
    )

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose / AdjustedVolume
                for a single SecuritiesCode, sorted by Date ascending
        """
        # newest first, so the cumulative product runs from the latest date back
        df = df.sort_values("Date", ascending=False)
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # adjusted close, rounded half-up to one decimal place
        df["AdjustedClose"] = (df["CumulativeAdjustmentFactor"] * df["Close"]).map(
            lambda x: float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        )
        # back to chronological order
        df = df.sort_values("Date")
        # the volume factor is accumulated oldest-to-newest
        df["CumAdjustmentFactorVol"] = df["AdjustmentFactor"].cumprod()

        # rows whose own AdjustmentFactor != 1 keep the raw Volume; all other
        # rows use Volume * cumulative factor truncated to an integer.
        adjusted_volume = df["Volume"].where(
            df["AdjustmentFactor"] != 1,
            (df["Volume"] * df["CumAdjustmentFactorVol"]).astype("int64"),
        )
        # float dtype so that zeros can be replaced by NaN without relying on
        # an implicit int -> float upcast during assignment
        df["AdjustedVolume"] = adjusted_volume.astype("float64")

        # treat 0 as missing and carry the last valid value forward
        df["AdjustedClose"] = df["AdjustedClose"].mask(df["AdjustedClose"] == 0).ffill()
        df["AdjustedVolume"] = df["AdjustedVolume"].mask(df["AdjustedVolume"] == 0).ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate the groups explicitly instead of groupby(...).apply(...): every
    # group then still carries its "SecuritiesCode" column regardless of how the
    # installed pandas treats grouping columns inside apply().
    adjusted = [generate_adjusted_close(group) for _, group in price.groupby("SecuritiesCode")]
    if adjusted:
        price = pd.concat(adjusted).reset_index(drop=True)

    # Only "Date" becomes the index. The previous call passed "SecuritiesCode"
    # positionally, where it was read as the `drop` flag (and is rejected
    # outright by pandas >= 2.0, where that argument is keyword-only).
    return price.set_index("Date")

In [5]:
# technical indicators
def RSI(series, period):
    """Relative Strength Index of `series`.

    Args:
        series (pd.Series): price series
        period (int): look-back length
    Returns:
        pd.Series: RSI values in [0, 100], indexed by the labels of `series`
        starting `period` rows after the first one. An empty float Series is
        returned when there are fewer than `period` price changes available
        (the seed average cannot be formed; this used to raise IndexError).
    """
    delta = series.diff().dropna()
    if len(delta) < period:
        return pd.Series(index=delta.index[:0], dtype="float64")

    # split the changes into gains and (positive) losses
    gains = delta.where(delta > 0, 0.0)
    losses = (-delta).where(delta < 0, 0.0)

    # Seed: the element at position period-1 becomes the plain average of the
    # first `period` gains/losses. Positional access keeps this correct even
    # when index labels repeat.
    gains.iloc[period - 1] = gains.iloc[:period].mean()
    losses.iloc[period - 1] = losses.iloc[:period].mean()
    gains = gains.iloc[period - 1:]
    losses = losses.iloc[period - 1:]

    # exponential smoothing with alpha = 1 / period (com = period - 1)
    rs = gains.ewm(com=period-1, adjust=False).mean() / \
         losses.ewm(com=period-1, adjust=False).mean()
    return 100 - 100 / (1 + rs)

def bbands(price, length, numsd=2):
    """Bollinger bands: rolling mean plus/minus `numsd` rolling standard deviations.

    Returns the tuple (average, upper band, lower band), each rounded to 3 decimals.
    """
    window = price.rolling(window = length, center = False)
    middle = window.mean()
    offset = window.std() * numsd
    upper = middle + offset
    lower = middle - offset
    return middle.round(3), upper.round(3), lower.round(3)

def abands(df,length):
    """Acceleration bands over a rolling window of `length` rows.

    Side effects: writes the columns "AB_Middle_Band{length}_day", "aupband",
    "AB_Upper_Band{length}_day", "adownband" and "AB_Lower_Band{length}_day"
    into `df` (in that order when they do not exist yet).

    Args:
        df (pd.DataFrame): frame with "High", "Low" and "AdjustedClose" columns
        length (int): rolling window size
    Returns:
        tuple of pd.Series: (middle band, upper band, lower band)
    """
    # middle band: rolling mean of the adjusted close
    df[f"AB_Middle_Band{length}_day"] = df['AdjustedClose'].rolling(window = length, center=False).mean()
    # High * ( 1 + 4 * (High - Low) / (High + Low))
    df['aupband'] = df['High'] * (1 + 4 * (df['High']-df['Low'])/(df['High']+df['Low']))
    df[f"AB_Upper_Band{length}_day"] = df['aupband'].rolling(window=length, center=False).mean()
    # Low *(1 - 4 * (High - Low)/ (High + Low))
    df['adownband'] = df['Low'] * (1 - 4 * (df['High']-df['Low'])/(df['High']+df['Low']))
    df[f"AB_Lower_Band{length}_day"] = df['adownband'].rolling(window=length, center=False).mean()
    return df[f"AB_Middle_Band{length}_day"],df[f"AB_Upper_Band{length}_day"] ,df[f"AB_Lower_Band{length}_day"]
    
def STOK(df, n):
    """Stochastic-oscillator style %K and %D over `n` rows.

    Side effects: writes the columns "STOK{n}_days" and "STOD{n}_days" into `df`.

    Args:
        df (pd.DataFrame): frame with "High", "Low" and "AdjustedClose" columns
        n (int): rolling window size (used for both %K and the %D average)
    Returns:
        tuple of pd.Series: (%K, %D)
    """
    # NOTE(review): the numerator subtracts the rolling MEAN of Low, while the
    # denominator uses rolling max(High) - rolling min(Low). The textbook %K
    # subtracts the rolling minimum of Low - confirm whether mean is intended.
    df[f'STOK{n}_days'] = ((df['AdjustedClose'] - df['Low'].rolling(window=n, center=False).mean()) / (df['High'].rolling(window=n, center=False).max() - df['Low'].rolling(window=n, center=False).min())) * 100
    # %D: rolling mean of %K over the same window
    df[f'STOD{n}_days'] = df[f'STOK{n}_days'].rolling(window = n, center=False).mean()
    return df[f'STOK{n}_days'],df[f'STOD{n}_days']

def CCI(df, n, constant):
    """Commodity-channel style index of the typical price.

    Computes (TP - rolling mean of TP) / (constant * rolling std of TP) with
    TP = (High + Low + AdjustedClose) / 3 and a window of `n` rows.
    """
    typical_price = (df['High'] + df['Low'] + df['AdjustedClose']) / 3
    window = typical_price.rolling(window=n, center=False)
    deviation = typical_price - window.mean()
    scale = constant * window.std()
    return pd.Series(deviation / scale)

def psar(df, iaf = 0.02, maxaf = 0.2):
    """Parabolic SAR computed row by row from High / Low / AdjustedClose.

    Side effects (callers in this notebook rely on both):
        * `df.reset_index(inplace=True)` - the existing index is moved into a
          column and replaced by a 0..n-1 range index;
        * the result is stored in `df['psar']`.

    Args:
        df (pd.DataFrame): frame with "High", "Low" and "AdjustedClose" columns
        iaf (float): initial acceleration factor, also the increment
        maxaf (float): upper bound of the acceleration factor
    Returns:
        pd.Series: the `df['psar']` column. The function previously returned
        None, so `x = psar(df)` produced nothing usable.

    Notes:
        The SAR values are built in a separate array. The earlier version
        wrote into a slice of `df['AdjustedClose']`, which can overwrite the
        price column itself depending on pandas' view/copy behaviour. Models
        fitted on features produced by that version should be retrained.
    """
    df.reset_index(inplace = True)
    length = len(df)
    high = df['High'].to_numpy(dtype=float)
    low = df['Low'].to_numpy(dtype=float)
    # independent working buffer, seeded with the adjusted close
    sar = df['AdjustedClose'].to_numpy(dtype=float).copy()

    if length > 0:
        bull = True
        af = iaf
        hp = high[0]  # highest high of the current up-trend
        lp = low[0]   # lowest low of the current down-trend
        # the first two rows keep the adjusted close as their value
        for i in range(2, length):
            extreme = hp if bull else lp
            sar[i] = sar[i - 1] + af * (extreme - sar[i - 1])

            reverse = False
            if bull:
                if low[i] < sar[i]:
                    # price fell through the SAR: switch to a down-trend
                    bull = False
                    reverse = True
                    sar[i] = hp
                    lp = low[i]
                    af = iaf
            else:
                if high[i] > sar[i]:
                    # price rose through the SAR: switch to an up-trend
                    bull = True
                    reverse = True
                    sar[i] = lp
                    hp = high[i]
                    af = iaf

            if not reverse:
                if bull:
                    if high[i] > hp:
                        hp = high[i]
                        af = min(af + iaf, maxaf)
                    # the SAR may not sit above the lows of the two prior rows
                    if low[i - 1] < sar[i]:
                        sar[i] = low[i - 1]
                    if low[i - 2] < sar[i]:
                        sar[i] = low[i - 2]
                else:
                    if low[i] < lp:
                        lp = low[i]
                        af = min(af + iaf, maxaf)
                    # the SAR may not sit below the highs of the two prior rows
                    if high[i - 1] > sar[i]:
                        sar[i] = high[i - 1]
                    if high[i - 2] > sar[i]:
                        sar[i] = high[i - 2]

    df['psar'] = sar
    return df['psar']
    
#Keltner Channel  
def KELCH(df, n):
    """Keltner-channel style bands as rolling means over `n` rows.

    Returns the three Series in the order (middle, lower, upper), named
    'KelChM_<n>', 'KelChD_<n>' and 'KelChU_<n>'.
    """
    high, low, close = df['High'], df['Low'], df['AdjustedClose']

    def _rolling_third(raw, prefix):
        # one third of the weighted H/L/C combination, averaged over the window
        return pd.Series((raw / 3).rolling(window =n, center=False).mean(), name = prefix + str(n))

    middle = _rolling_third(high + low + close, 'KelChM_')
    upper = _rolling_third(4 * high - 2 * low + close, 'KelChU_')
    lower = _rolling_third(-2 * high + 4 * low + close, 'KelChD_')
    return middle, lower, upper

def MFI(df,n):
    """Money-flow style index over a rolling window of `n` rows.

    Side effects: writes the helper columns "tp", "rmf", "pmf", "nmf", "mfr"
    and the result column "Money_Flow_Index" into `df`.

    Args:
        df (pd.DataFrame): frame with "High", "Low", "AdjustedClose" and
            "AdjustedVolume" columns
        n (int): rolling window size
    Returns:
        pd.Series: the "Money_Flow_Index" column
    """
    # typical price
    df['tp'] = (df['High']+df['Low']+df['AdjustedClose'])/3
    #raw money flow
    # NOTE(review): 'rmf' is computed here but not used below - the positive and
    # negative flows are built from 'tp' (price only, no volume). The textbook
    # MFI sums the raw money flow instead; confirm which one is intended.
    df['rmf'] = df['tp'] * df['AdjustedVolume']
    
    # positive and negative money flow
    df['pmf'] = np.where(df['tp'] > df['tp'].shift(1), df['tp'], 0)
    df['nmf'] = np.where(df['tp'] < df['tp'].shift(1), df['tp'], 0)

    # money flow ratio
    df['mfr'] = df['pmf'].rolling(window=n,center=False).sum()/df['nmf'].rolling(window=n,center=False).sum()
    df['Money_Flow_Index'] = 100 - 100 / (1 + df['mfr'])
    return df['Money_Flow_Index'] 

def ichimoku(df):
    """Ichimoku spans derived from High / Low / AdjustedClose.

    Side effects: writes the columns "turning_line", "standard_line",
    "ichimoku_span1", "ichimoku_span2" and "chikou_span" into `df`.

    Args:
        df (pd.DataFrame): frame with "High", "Low" and "AdjustedClose" columns
    Returns:
        tuple of pd.Series: (ichimoku_span1, ichimoku_span2, chikou_span)
    """
    # Turning Line
    period9_high = df['High'].rolling(window=9,center=False).max()
    period9_low = df['Low'].rolling(window=9,center=False).min()
    df['turning_line'] = (period9_high + period9_low) / 2
    
    # Standard Line
    period26_high = df['High'].rolling(window=26,center=False).max()
    period26_low = df['Low'].rolling(window=26,center=False).min()
    df['standard_line'] = (period26_high + period26_low) / 2
    
    # Leading Span 1
    df['ichimoku_span1'] = ((df['turning_line'] + df['standard_line']) / 2).shift(26)
    
    # Leading Span 2
    period52_high = df['High'].rolling(window=52,center=False).max()
    period52_low = df['Low'].rolling(window=52,center=False).min()
    df['ichimoku_span2'] = ((period52_high + period52_low) / 2).shift(26)
    
    # The most current closing price plotted 22 time periods behind (optional)
    # NOTE(review): shift(-22) puts on each row the AdjustedClose found 22 rows
    # LATER, and leaves the last 22 rows NaN. If rows are in chronological order
    # this is a forward-looking value - confirm it is acceptable as a model input.
    df['chikou_span'] = df['AdjustedClose'].shift(-22) # 22 according to investopedia
    
    return  df['ichimoku_span1'] , df['ichimoku_span2'], df['chikou_span']

def KAMA(price, n, pow1=2, pow2=30):
    """Kaufman adaptive moving average.

    Args:
        price (pd.Series): price series (any index)
        n (int): look-back length of the efficiency ratio
        pow1 (int): fast smoothing period
        pow2 (int): slow smoothing period
    Returns:
        np.ndarray: one value per row of `price`. Rows whose smoothing constant
        is NaN (the first `n` rows, or windows with no price movement) are NaN;
        because each value builds on the previous one, a NaN that occurs after
        the start propagates to all following rows.
    """
    abs_step = abs(price - price.shift(1))

    # efficiency ratio: net move over n rows / sum of absolute single-row moves
    er_num = abs(price - price.shift(n))
    er_den = abs_step.rolling(window=n,center=False).sum()
    er = er_num / er_den

    # smoothing constant between the slow and the fast EMA constants, squared
    sc = ( er*(2.0/(pow1+1)-2.0/(pow2+1.0))+2/(pow2+1.0) ) ** 2.0

    # Work on plain arrays: `sc[i]` / `price[i]` on a Series are label lookups,
    # which break (or read the wrong row) for any index other than 0..n-1.
    sc_values = sc.to_numpy(dtype=float)
    price_values = price.to_numpy(dtype=float)

    answer = np.zeros(sc_values.size)
    started = False
    for i in range(answer.size):
        if np.isnan(sc_values[i]):
            answer[i] = np.nan
        elif not started:
            # first row with a valid smoothing constant seeds the average
            answer[i] = price_values[i]
            started = True
        else:
            answer[i] = answer[i-1] + sc_values[i] * (price_values[i] - answer[i-1])
    return answer


In [6]:
def get_features_for_predict(price, code):
    """Build the technical-indicator feature frame of one security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price (needs the
            columns "SecuritiesCode", "Target", "Open", "High", "Low",
            "AdjustedClose" and "AdjustedVolume")
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): one row per input row of `code`, with
        NaN and +/-inf replaced by 0. psar() resets the index, so the former
        index ends up as a regular column and the result has a range index.

    Notes:
        The column ORDER of the result matters: callers in this notebook select
        the model inputs positionally (`columns[5:]`), so statements that create
        columns must not be reordered.
    """
    close_col = "AdjustedClose"
    feats = price.loc[price["SecuritiesCode"] == code, ["SecuritiesCode","Target","Open","High","Low", close_col,"AdjustedVolume"]].copy()
    
    # feature engineering
    # 1day price momentum
    feats['Momentum_1D'] = (feats[close_col] - feats[close_col].shift(1))
    
    #Price Volume Trend
    # NOTE(review): this subtracts the previous close from the momentum before
    # multiplying by volume, then differences the result; the usual PVT is
    # volume * (momentum / previous close), accumulated. Left as is because the
    # notebook loads a model file that was fitted elsewhere - confirm and retrain
    # before changing the formula.
    feats["PVT"] = (feats["Momentum_1D"] - feats[close_col].shift(1)) * feats["AdjustedVolume"]
    feats["PVT"] = (feats["PVT"] - feats["PVT"].shift(1))
    
    for period in [5,10, 21, 63]:
        # return features
        feats[f"feature_return_{period}days"] = feats[close_col].pct_change(period)
        
        # historical volatility
        feats[f"feature_volatility_{period}days"] = (np.log(feats[close_col]).diff().rolling(period).std())
        
        # difference to moving-average
        feats[f"feature_MAgap_{period}days"] = feats[close_col].rolling(period).mean() / feats[close_col]
        
        # RSI
        feats[f"feature_RSI_{period}days"] = RSI(feats[close_col], period)
        
        #EMA
        feats[f"EMA{period}days"] = feats[close_col].ewm(span=period,adjust=False).mean()
        
        #BollingerBands
        feats[f"bollb{period}_mid"],feats[f"bollb{period}_up"],feats[f"bollb{period}_low"] = bbands(feats[close_col],length = period,numsd=2)
        
        #Acceleration Bands
        feats[f"AB_Middle_Band{period}_day"],feats[f"AB_Upper_Band{period}_day"] ,feats[f"AB_Lower_Band{period}_day"] = abands(feats,period)
        
        #Stochastic Oscillator
        feats[f'STOK{period}_days'],feats[f'STOD{period}_days'] = STOK(feats,period)
        
        #CCI
        feats[f'CCI{period}_days'] = CCI(feats, period, 0.015)
        
        #Keltner Channel
        feats[f'KelChM{period}'],feats[f'KelChD{period}'] ,feats[f'KelChU{period}'] = KELCH(feats,period)
        
        # money flow index
        feats[f'Money_Flow_Index{period}'] = MFI(feats,period)
        
        #KAMA
        feats[f'KAMA{period}_days'] = KAMA(feats[close_col],period)
        
        
    #Parabolic SAR
    # psar() resets the index of `feats` in place and stores its result in
    # feats["psar"] itself. Only overwrite that column when psar() actually
    # returns the values: assigning a None return would wipe the column and
    # turn the feature into a constant 0 after the fillna below.
    psar_values = psar(feats)
    if psar_values is not None:
        feats["psar"] = psar_values
    
    #Ichimoku span
    feats['ichimoku_span1'] , feats['ichimoku_span2'], feats['chikou_span'] = ichimoku(feats)
        
        
    # filling data for nan and inf
    feats = feats.fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)
    # drop AdjustedClose column
    #feats = feats.drop([close_col], axis=1)
    # helper columns written by abands / MFI / ichimoku that are not features
    columns2Drop = ['Momentum_1D','aupband', 'adownband','tp','rmf','pmf','nmf','mfr','Money_Flow_Index','turning_line','standard_line']
    feats = feats.drop(labels = columns2Drop, axis=1)
    #feats.set_index('Date',inplace = True)
    
    return feats

In [7]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predicted results with "Date", "Rank" and "Target"
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _daily_spread(day, portfolio_size, toprank_weight_ratio):
        """Spread return of one day: weighted top-ranked targets minus weighted bottom-ranked targets."""
        ranks = day['Rank']
        # ranks are expected to run from 0 to len - 1
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        weights = np.linspace(start = toprank_weight_ratio, stop = 1, num=portfolio_size)
        scale = weights.mean()

        best_first = day.sort_values(by='Rank')['Target'][:portfolio_size]
        worst_first = day.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size]

        long_leg = (best_first * weights).sum() / scale
        short_leg = (worst_first * weights).sum() / scale
        return long_leg - short_leg

    daily = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily.mean() / daily.std()

def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        
        weights = np.linspace(start = toprank_weight_ratio, stop = 1, num=portfolio_size)
        
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        
        return purchase - short

In [8]:
def get_features(df_price):
    """Compute price features for every code and merge the options aggregates.

    Reads the train and supplemental options.csv files from `path` / `sup_path`.

    Args:
        df_price (pd.DataFrame): adjusted stock prices
    Returns:
        final_feat (pd.DataFrame): features merged with the options data
        feat_cols (list): names of the model input columns, i.e. every column
            of `final_feat` from the sixth one onwards
    """
    codes = sorted(df_price["SecuritiesCode"].unique())

    print("Calc price features")
    feature_df = pd.concat([get_features_for_predict(df_price, code) for code in tqdm(codes)])

    print("Read options")
    df_option = pd.concat([
        pd.read_csv(f'{path}/options.csv',na_values='-'),
        pd.read_csv(f"{sup_path}/options.csv",na_values='-'),
    ])

    print("Merge options")
    final_feat = merge_options(feature_df,df_option)

    # the first five columns are not model inputs
    feat_cols = list(final_feat.columns)[5:]

    return final_feat, feat_cols

def train_model(df_price):
    """Fit a LightGBM regressor on the engineered features and save its booster.

    The booster is written to the module-level `filename_model` path.

    Args:
        df_price (pd.DataFrame): adjusted stock prices
    Returns:
        lgb.LGBMRegressor: the fitted model
    """
    final_feat, feat_cols = get_features(df_price)

    print("Get splits")
    # Rows up to TRAIN_END are used for fitting, rows from TEST_START on for
    # evaluation; the dates in between are left out as a gap so label
    # information from the evaluation period does not leak into training.
    TRAIN_END, TEST_START = '2022-01-31', '2022-03-01'

    codes = sorted(df_price["SecuritiesCode"].unique())
    train_X, train_y, test_X, test_y = get_features_and_label(df_price,codes,final_feat,TRAIN_END,TEST_START)

    lgbm_params = dict(
        n_estimators=801,
        num_leaves=3921,
        max_bin=47,
        objective='huber',
        metric='mae',
        boosting_type='gbdt',
        max_depth=4,
        learning_rate=0.7805291762864555,
        feature_fraction=0.4,
        lambda_l1=0.1,
        lambda_l2=1,
        seed=46,
        early_stopping_rounds=100,
    )
    pred_model = lgb.LGBMRegressor(**lgbm_params)

    print("Train LGB")
    # the held-out split doubles as the evaluation set
    evaluation = [(test_X[feat_cols], test_y)]
    pred_model.fit(train_X[feat_cols], train_y, eval_set=evaluation, verbose=10)

    pred_model.booster_.save_model(filename_model)
    return pred_model

In [9]:
# load stock price data
# Both the train and the supplemental stock_prices.csv are read and stacked
# into one frame (df_sup_price stays available as a module-level name).
df_price = pd.read_csv(f"{path}/stock_prices.csv")
df_sup_price = pd.read_csv(f"{sup_path}/stock_prices.csv")
df_price = pd.concat([df_price,df_sup_price])
# generate AdjustedClose
# adjust_price() also adds AdjustedVolume and returns the frame indexed by Date.
df_price = adjust_price(df_price)

In [10]:
# Load a previously saved LightGBM booster instead of training in this run.
# `filename_model` is also the path train_model() writes to when it is used.
filename_model = '../input/final-model-train/model.txt'
pred_model = lgb.Booster(model_file=filename_model)
# Alternative: fit from scratch. Note that train_model() returns an
# LGBMRegressor, whereas the line above yields a raw Booster.
#pred_model = train_model(df_price)

In [11]:
"""result = test_X[["SecuritiesCode"]].copy()

# predict
result.loc[:, "predict"] = pred_model.predict(test_X[feat_cols])

# actual result
result.loc[:, "Target"] = test_y.values

result = result.sort_values(["Date", "predict"], ascending=[True, False])
result = result.groupby("Date").apply(set_rank)

# feature importance
feature_importance_df = pd.DataFrame()
feature_importance_df['features'] = feat_cols
feature_importance_df['importance'] = pred_model.booster_.feature_importance(importance_type="gain")

sns.barplot(x='importance', y='features', data=feature_importance_df.sort_values(by=['importance'], ascending=False))
calc_spread_return_sharpe(result, portfolio_size = 200)
# we will show daily spread return of the model.
df_result = result.groupby('Date').apply(_calc_spread_return_per_day, 200, 2)
df_result.plot()
# a cumulative spread return of the mode
df_result.cumsum().plot()"""

'result = test_X[["SecuritiesCode"]].copy()\n\n# predict\nresult.loc[:, "predict"] = pred_model.predict(test_X[feat_cols])\n\n# actual result\nresult.loc[:, "Target"] = test_y.values\n\nresult = result.sort_values(["Date", "predict"], ascending=[True, False])\nresult = result.groupby("Date").apply(set_rank)\n\n# feature importance\nfeature_importance_df = pd.DataFrame()\nfeature_importance_df[\'features\'] = feat_cols\nfeature_importance_df[\'importance\'] = pred_model.booster_.feature_importance(importance_type="gain")\n\nsns.barplot(x=\'importance\', y=\'features\', data=feature_importance_df.sort_values(by=[\'importance\'], ascending=False))\ncalc_spread_return_sharpe(result, portfolio_size = 200)\n# we will show daily spread return of the model.\ndf_result = result.groupby(\'Date\').apply(_calc_spread_return_per_day, 200, 2)\ndf_result.plot()\n# a cumulative spread return of the mode\ndf_result.cumsum().plot()'

In [12]:
def set_rank(df):
    """Rank rows by prediction, best first.

    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): rows sorted by "predict" descending, with a "Rank"
            column counting up from 0 in that order
    """
    ranked = df.sort_values("predict", ascending=False)
    # highest prediction gets Rank 0
    n_rows = len(ranked["predict"])
    ranked.loc[:, "Rank"] = np.arange(n_rows)
    return ranked

In [13]:
import jpx_tokyo_market_prediction
ix = pd.IndexSlice

# Submission loop: for every day served by the competition API, extend the
# price history, rebuild the features and submit a ranking.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()
counter = 0

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    current_date = prices['Date'].iloc[0]

    # adjust_price() returns a Date-indexed frame; bring Date back as a column
    df_price = df_price.reset_index()
    if counter == 0:
        # on the first day, drop history rows dated on/after the first served date
        df_price = df_price.loc[df_price['Date']<current_date]

    print(current_date)

    # append today's rows and recompute the adjusted columns on the full history
    df_price = pd.concat([df_price, prices])
    df_price = adjust_price(df_price)

    codes = sorted(df_price["SecuritiesCode"].unique())

    #get last 100 prices
    last_prices = df_price.sort_values('Date').groupby('SecuritiesCode').tail(100)

    # generate feature
    feature_df = pd.concat([get_features_for_predict(last_prices, code) for code in tqdm(codes)])

    #Merge options data
    # `options` only holds the rows served for this day, so after the left join
    # every row without matching options values is NaN in the options columns
    # and is removed by dropna().
    final_feat = merge_options(feature_df,options)
    final_feat = final_feat.dropna()

    # features to use: every column from the sixth one onwards
    feat_cols = list(final_feat.columns)[5:]

    feat = final_feat[feat_cols]
    final_feat["predict"] = pred_model.predict(feat,predict_disable_shape_check=True)

    # One score per security (the last remaining row of each code), looked up by
    # SecuritiesCode. The previous code sorted sample_prediction without keeping
    # the result and then copied the ranks positionally, which silently
    # misassigns ranks if the row order differs and fails if a code is missing.
    predict_by_code = final_feat.groupby("SecuritiesCode")["predict"].last()
    scores = sample_prediction["SecuritiesCode"].astype(int).map(predict_by_code)
    # securities without a prediction are placed at the bottom of the ranking
    scores = scores.fillna(scores.min()).fillna(0.0)

    # Rank 0 = highest predicted Target, consistent with set_rank() and with
    # calc_spread_return_sharpe(), which buys the lowest Rank values. The
    # previous code ranked in ascending order of the prediction, i.e. inverted.
    sample_prediction["Rank"] = scores.rank(method="first", ascending=False).astype(int) - 1

    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
2021-12-06


  0%|          | 0/2000 [00:00<?, ?it/s]

2021-12-07


  0%|          | 0/2000 [00:00<?, ?it/s]