## Introduction

This Notebook was created as an experiment for the JPX competition.  
I consider this competition to be a prediction of the rate of increase in the stock price between the next day and the following day.  
It seems extremely difficult to me and seems to depend on happenstance.  

I have attempted to use LGB to predict Target.

## Preparations

In [None]:
#===== Libraries =====

import os
import gc
import re
import numpy as np
import pandas as pd
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt

from datetime import datetime
from datetime import timedelta, date

from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

import jpx_tokyo_market_prediction

In [None]:
#===== Path =====

# Root folder of the competition dataset (relative to the notebook's working directory).
data_path = "../input/jpx-tokyo-stock-exchange-prediction/"

# Sub-folders with the training files and the supplemental files.
train_path, supplemental_path = (
    os.path.join(data_path, sub_dir)
    for sub_dir in ("train_files/", "supplemental_files/")
)

### Config

In [None]:
#===== CFG =====
   
class CFG:
    # Notebook-wide settings. They are read (and, for num_lags, rewritten) as
    # class attributes; the class is not instantiated anywhere in this notebook.

    # Earliest Date kept when the training frame is built (rows with Date >= start_day).
    start_day = "2020-9-20"
    # Number of dates grouped into one fold by create_df.
    train_days = 5
    # Exclusive upper bound of the lag range used when lag_list is empty.
    # add_rate_and_lags overwrites it with max(lag_list) when lag_list is set,
    # and remove_na_rows discards the first num_lags + 2 dates.
    num_lags = 31
    # Step between lags when lag_list is empty.
    lag_itv = 1
    #lag_list = []
    # Explicit lags of 'rate' (in rows per security) that become rate_<lag>
    # features; takes precedence over num_lags / lag_itv when non-empty.
    lag_list = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    #lag_list = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60]
    # Rolling-window lengths for the Close moving averages (ma_<n>) and the
    # deviation rates derived from them (d_ma_<n>).
    ma_list = [5, 10, 25]
    
    # Passed to lgb.Dataset as categorical_feature. When None, the submission
    # loop also drops SecuritiesCode and SectorCode33 from the feature set.
    cat_features = None
    #cat_features = ['SecuritiesCode', 'SectorCode33']

    # How many of the lowest-RMSE fold models are averaged for the predictions.
    select_model_to_pred = 5
    # When True, the securities of the first NewMarketSegment group are removed
    # from training and receive Prediction 0 (presumably the Growth market,
    # which sorts first alphabetically — confirm against stock_list.csv).
    without_growth_m = True

### Data loading

In [None]:
# Per-security metadata for the stock universe.
stock_list = pd.read_csv(os.path.join(data_path, "stock_list.csv"))
# Historical prices; the submission loop appends every served day to this frame.
stock_prices_bak = pd.read_csv(os.path.join(train_path, "stock_prices.csv"))
financials = pd.read_csv(os.path.join(train_path, "financials.csv"))

# Rename the columns at positions 5 and 7 to the names the rest of the notebook uses
# (presumably replacing '33SectorCode' / '17SectorCode' — confirm against the CSV header).
s1 = stock_list.columns.tolist()
s1[5], s1[7] = 'SectorCode33', 'SectorCode17'
stock_list.columns = s1

## EDA

EDA was performed on the following NoteBook.  
Please take a look at it, as the number of views is very small ;)  

**[JPX EDA(price volatility): Data Understanding by R](https://www.kaggle.com/code/kei96kag/jpx-eda-price-volatility-data-understanding-by-r)**

### Functions

reduce_mem_usage

In [None]:
# https://www.kaggle.com/competitions/foursquare-location-matching/discussion/321520

def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns (int16/int32/int64) are cast to the first of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. Float columns are cast to float16, then float32, on the same test and
    fall back to float64 otherwise (this is also what happens for an all-NaN
    or empty column). Columns of any other dtype are left untouched.

    NOTE: float16/float32 keep fewer significant digits, so downcast values
    may be rounded.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True, print the memory footprint after the conversion
            and the relative saving. Silent by default.

    Returns:
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # First integer width whose open range contains the data wins;
            # if none matches, the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                target = np.float16
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                target = np.float32
            else:
                target = np.float64
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard against a zero baseline (frame without data and with an empty index).
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

#

### Functions for FE

In [None]:
def make_date_num(df):
    """Add ``Date_num``: the zero-based rank of each row's ``Date`` (int16).

    Dates are numbered in sorted order, so every row sharing a date gets the
    same number. The numbering is computed with a vectorized group numbering
    instead of one ``.loc`` assignment per date, which also makes it correct
    for frames whose index is not unique and for empty frames.

    Rows whose ``Date`` is missing belong to no group and receive -1.

    Args:
        df: DataFrame with a ``Date`` column. It is modified in place.

    Returns:
        The same ``df`` object with the ``Date_num`` column added.
    """
    df['Date_num'] = df.groupby('Date').ngroup().astype(np.int16)

    return df
#
def add_Close_lag(df):
    """Add ``Close_1``: the previous row's adjusted close of the same security.

    The adjusted close is ``Close * AdjustmentFactor`` of that previous row.
    The first row of every security has no predecessor and gets NaN. Rows are
    taken in the frame's existing order.

    Args:
        df: DataFrame with ``SecuritiesCode``, ``Close`` and
            ``AdjustmentFactor`` columns. Helper columns are written to it.

    Returns:
        A DataFrame with ``Close_1`` added and the temporary column removed.
    """
    df['Close_tmp'] = df['Close'] * df['AdjustmentFactor']
    previous_adjusted = df.groupby('SecuritiesCode')['Close_tmp'].shift(1)
    df['Close_1'] = previous_adjusted

    return df.drop(columns='Close_tmp')
#
def add_psych12(df):
    """Add the 12-row psychological line (``psych12``) per security.

    ``u_or_d`` is 0 where ``Close`` is below ``Close_1`` and 1 everywhere else
    (unchanged prices and rows without ``Close_1`` therefore count as 1).
    ``psych12`` is the mean of ``u_or_d`` over the last 12 rows of the same
    security; it is NaN until 12 rows are available.

    Args:
        df: DataFrame with ``SecuritiesCode``, ``Close`` and ``Close_1``.
            It is modified in place.

    Returns:
        The same ``df`` with ``u_or_d`` and ``psych12`` added.
    """
    df['u_or_d'] = 1
    fell = (df['Close'] - df['Close_1']) < 0
    df.loc[df.index[fell], 'u_or_d'] = 0

    def _mean_of_last_12(flags):
        return flags.rolling(12).mean()

    df['psych12'] = df.groupby('SecuritiesCode')['u_or_d'].transform(_mean_of_last_12)

    return df
#
def add_moving_avg(df):
    """Add moving averages of ``Close`` and their deviation rates.

    For every window ``n`` in ``CFG.ma_list`` two columns are written:
    ``ma_<n>``, the rolling mean of ``Close`` over ``n`` rows of the same
    security, and ``d_ma_<n>``, ``(Close - ma_<n>) / Close``.

    Args:
        df: DataFrame with ``SecuritiesCode`` and ``Close``. Modified in place.

    Returns:
        The same ``df`` with the new columns.
    """
    for window in CFG.ma_list:
        mean_name, deviation_name = f'ma_{window}', f'd_ma_{window}'

        def _rolling_mean(closes, size=window):
            return closes.rolling(size).mean()

        df[mean_name] = df.groupby('SecuritiesCode')['Close'].transform(_rolling_mean)
        df[deviation_name] = (df['Close'] - df[mean_name]) / df['Close']

    return df
#
def add_ma_lags(df):
    """Add the 1-row and 2-row lags of every moving-average column.

    For each ``ma_<n>`` (``n`` in ``CFG.ma_list``) the columns ``ma_<n>_1``
    and ``ma_<n>_2`` hold the value one and two rows earlier for the same
    security. These lags are what the slope calculation reads.

    Args:
        df: DataFrame with ``SecuritiesCode`` and the ``ma_<n>`` columns.
            Modified in place.

    Returns:
        The same ``df`` with the lag columns added.
    """
    for window in CFG.ma_list:
        source_col = f'ma_{window}'
        # Lags up to two rows back are enough to fit the slope.
        for back in (1, 2):
            df[f'{source_col}_{back}'] = df.groupby('SecuritiesCode')[source_col].shift(back)
    return df
#
def add_ma_slopes(df):
    """Add ``slope_ma_<n>``: the relative slope of each moving average.

    For every ``n`` in ``CFG.ma_list`` a straight line is fitted through the
    three points (1, ma_<n>_2), (2, ma_<n>_1), (3, ma_<n>) and its slope is
    divided by the latest value ``ma_<n>``. For three equally spaced x values
    the least-squares slope is exactly ``(y3 - y1) / 2``, so the fit is
    computed in closed form for all rows at once instead of calling
    ``np.polyfit`` row by row.

    Rows where any of the three values is NaN or infinite get a slope of 0.0,
    which is what the previous blanket ``except`` fallback produced for rows
    the fit could not handle.

    Args:
        df: DataFrame holding ``ma_<n>``, ``ma_<n>_1`` and ``ma_<n>_2`` for
            every configured window. Modified in place.

    Returns:
        The same ``df`` with one ``slope_ma_<n>`` column per window.
    """
    for n in CFG.ma_list:
        latest = f'ma_{n}'
        one_back = f'{latest}_1'
        two_back = f'{latest}_2'

        # Work in float64 regardless of any earlier downcasting of the columns.
        y = df[[two_back, one_back, latest]].to_numpy(dtype=np.float64)
        usable = np.isfinite(y).all(axis=1)

        # A zero latest value yields inf/nan, as plain numpy division does;
        # only the warning is silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            relative_slope = (y[:, 2] - y[:, 0]) / 2.0 / y[:, 2]

        df[f'slope_ma_{n}'] = np.where(usable, relative_slope, 0.0)

    return df
#
def add_rate_and_lags(df):
    """Add the daily return, the next-row return and the lagged returns.

    Columns written:
      * ``rate``   - ``(Close - Close_1) / Close_1``.
      * ``rate_f`` - ``rate`` of the following row of the same security
        (the value the models are trained to predict).
      * ``rate_<lag>`` - ``rate`` shifted back by ``lag`` rows per security.

    The lags come from ``CFG.lag_list`` when it is non-empty; in that case
    ``CFG.num_lags`` is overwritten with the largest lag. Otherwise the lags
    are ``np.arange(1, CFG.num_lags, CFG.lag_itv)``.

    ``reduce_mem_usage`` is applied after ``rate``/``rate_f`` are created and
    changes the frame in place, so the lag columns are derived from the
    already downcast ``rate`` column.

    Args:
        df: DataFrame with ``SecuritiesCode``, ``Close`` and ``Close_1``.
            Modified in place.

    Returns:
        The same ``df`` with the new columns.
    """
    previous_close = df['Close_1']
    df['rate'] = (df['Close'] - previous_close) / previous_close

    df['rate_f'] = df.groupby('SecuritiesCode')['rate'].shift(-1)

    reduce_mem_usage(df)

    #===== lags =====

    if len(CFG.lag_list) == 0:
        lags = np.arange(1, CFG.num_lags, CFG.lag_itv)
    else:
        lags = CFG.lag_list
        CFG.num_lags = max(lags)

    for lag in lags:
        lag_col = f'rate_{lag}'
        print(f'Create lag cols... {lag_col}')
        df[lag_col] = df.groupby('SecuritiesCode')['rate'].shift(lag)

    return df
#
def remove_na_rows(df):
    """Drop the earliest dates, whose lag features are still NaN.

    The first ``CFG.num_lags + 2`` distinct dates (in order of appearance in
    ``df``) are discarded and the index is rebuilt.

    Args:
        df: DataFrame with a ``Date`` column.

    Returns:
        A new DataFrame starting at the first retained date.

    Raises:
        IndexError: If ``df`` has no more than ``CFG.num_lags + 2`` dates.
    """
    dates_seen = df['Date'].unique()
    first_kept_date = dates_seen[CFG.num_lags + 2]
    keep = df['Date'] >= first_kept_date

    return df.loc[keep].reset_index(drop=True)
#
def create_df(folds):
    """Split the feature frame into fold-labelled training rows and the test day.

    The last date in ``folds`` becomes the test frame, because it has no
    ``rate_f`` yet. All earlier dates are numbered, counted backwards from the
    most recent one (``Date_num_r``), and grouped into folds of
    ``CFG.train_days`` dates, so fold 0 holds the most recent training dates.

    Args:
        folds: Feature DataFrame with a ``Date`` column.

    Returns:
        ``(folds, test_fds)``: the training rows with ``Date_num``,
        ``Date_num_r`` and ``fold`` columns, and the last-date rows with
        ``Date_num`` and ``fold`` (always 0).

    Raises:
        IndexError: If ``folds`` contains fewer than two dates.
    """
    all_dates = folds['Date'].unique()
    # The last day has no rate_f, so training ends on the day before it.
    last_date, last_train_date = all_dates[-1], all_dates[-2]

    test_fds = folds[folds['Date'] == last_date].reset_index(drop=True)
    folds = folds[folds['Date'] <= last_train_date].reset_index(drop=True)

    folds = make_date_num(folds)
    test_fds = make_date_num(test_fds)

    # Reverse the numbering: the most recent training date becomes 0.
    date_nums = folds['Date_num'].unique()
    newest = len(date_nums) - 1
    reversed_num = {num: newest - pos for pos, num in enumerate(date_nums)}
    folds['Date_num_r'] = folds['Date_num'].map(reversed_num)

    folds['fold'] = (folds['Date_num_r'] / CFG.train_days).astype(np.int16)
    test_fds['fold'] = 0

    return folds, test_fds

### LGB

In [None]:
def run_lgb_loop(df, fet_cols, cat_fet, params, imp_gra):
    """Train one LightGBM regressor per training fold and score it on fold 0.

    Fold 0 is always the validation set. Every other fold id that is actually
    present in ``df`` is used, in ascending order, to train one model on
    ``rate_f``. The ids are read from the data rather than assumed to be
    ``1..k``, so a frame whose folds were filtered and are no longer
    contiguous still trains on every remaining fold and never on an empty one.

    Args:
        df: Training frame with ``fold``, ``rate_f`` and the feature columns.
        fet_cols: Names of the feature columns.
        cat_fet: Value forwarded as ``categorical_feature`` to ``lgb.Dataset``.
        params: LightGBM parameter dict.
        imp_gra: When truthy, plot the gain importance of every model.

    Returns:
        ``(score_dict_sorted, models)``: a list of ``(model_index, rmse)``
        tuples sorted by ascending RMSE, where ``model_index`` is the
        zero-based position in ``models``, and the trained boosters in
        training order.
    """
    # The validation split does not depend on the training fold: build it once.
    valid_fold = df.query('fold == 0').reset_index(drop=True)
    X_valid = valid_fold.loc[:, fet_cols]
    y_valid = valid_fold['rate_f'].values
    # Rows without a label cannot be scored.
    mask_na = ~np.isnan(y_valid)

    train_fold_ids = sorted(int(fold_id) for fold_id in df['fold'].unique() if fold_id != 0)

    scores = []
    models = []

    for n in train_fold_ids:
        train_fold = df.query('fold == @n').reset_index(drop=True)

        X_train = train_fold.loc[:, fet_cols]
        y_train = train_fold["rate_f"].values

        train_data = lgb.Dataset(X_train, label=y_train,
                                 categorical_feature=cat_fet,
                                 free_raw_data=False)

        valid_data = lgb.Dataset(X_valid, y_valid,
                                 categorical_feature=cat_fet,
                                 free_raw_data=False)

        # NOTE(review): `verbose_eval` is kept as in the original call; newer
        # LightGBM releases may expect a logging callback instead — confirm
        # against the installed version.
        model_reg = lgb.train(params,
                              train_data,
                              valid_sets=[train_data, valid_data],
                              verbose_eval=250)

        pred = model_reg.predict(X_valid)

        score = np.sqrt(mean_squared_error(y_valid[mask_na], pred[mask_na]))
        print(n, score)
        scores.append(score)
        models.append(model_reg)

        if imp_gra:
            i_title = f'fold-{n}: score--> {score:.4f}'
            lgb.plot_importance(model_reg, figsize=(8, 4), max_num_features=20,
                                importance_type='gain', title=i_title)

    # Keys are zero-based positions in `models`, not fold ids.
    score_dict_sorted = sorted(enumerate(scores), key=lambda item: item[1])

    return score_dict_sorted, models

## Predictions

### Features
I attempted to forecast in the following way.  
* The target of the forecast was the stock price appreciation rate for the next day (variable: rate_f).  

**The idea was to predict this value using phases of similar price movements in the past.**

The feature is as follows.  
Numerical values were assumed to be expressible as percentages.  

* Number of days in the training period (number of days in 1fold)
* SecuritiesCode
* 33SectorCode
* Stock price appreciation rate (rate) and its lag
* Psychological-12 
* Moving average deviation rate of stock price (5 days/10 days/25 days)
* Slope rate of the moving average: dropped partway through the experiments because of the calculation time.

valid is the fold including the last day of train (fold0), and other folds were evaluated by LGB.  
The average value of the prediction using multiple folds with small errors was used as the next day's rate of increase (Prediction1).

Once the next day's rate of increase has been calculated (Prediction1), the features are updated to estimate the rate of increase for the day after that (Prediction2). I treated this value as the Target.

以下のような方法で予測を試みました。  
* 予測対象は翌日の株価上昇率(変数：rate_f)としました。   

**この値を過去の同じような値動きの局面を使用して予測することを考えました。**

featureは以下です。  
数値は、パーセンテージで表現できるものとしました。  

* training期間の日数（1foldの日数）
* 株式Code
* 33業種Code
* 株価上昇率(rate)とそのlag
* Psychological12 
* 株価の移動平均乖離率(5日/10日/25日)
* 移動平均の傾き率：計算時間がかかるので途中から不採用

validはtrainの最終日を含むfold(fold0)とし、それ以外のfoldについてLGBで評価しました。  
誤差の小さいfoldを複数用いたpredictionの平均値を翌日の上昇率(Prediction1)としました。

翌日の上昇率が算出されたら(Prediction1)、Featureを更新して、さらに翌日の上昇率を推定します(Prediction2)。これがTargetと考えました。

In [None]:
# Create the competition API environment and the iterator that serves the test
# data; the submission loop consumes it as
# (prices, options, financials, trades, secondary_prices, sample_prediction) tuples
# and returns each ranking through env.predict().
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

### Prediction1

First, the rate of the return for the next day is predicted.  
The 'rate_f' is the predicted rate of return for the next day.

In [None]:
def prediction_1(df):
    """Predict the next day's return (``rate_f``) for the test rows.

    The models selected in the global ``model_num`` (indices into the global
    ``lgb_models``) each predict from the global ``feature_cols``; their
    predictions are averaged.

    Args:
        df: Test frame containing every column in ``feature_cols``.

    Returns:
        A new DataFrame with the feature columns plus ``rate_f``, the averaged
        predicted return for the next day. ``df`` itself is not modified.

    Raises:
        ValueError: If ``model_num`` selects no model.
    """
    #===== Prediction1 =====

    if len(model_num) == 0:
        raise ValueError("prediction_1 needs at least one selected model in model_num")

    # Explicit copy: the new column must not be written into a view of `df`.
    test_pred1 = df.loc[:, feature_cols].copy()

    predictions1 = [lgb_models[i].predict(test_pred1) for i in model_num]

    # This is the predicted return for the next day.
    test_pred1['rate_f'] = np.mean(predictions1, axis=0)

    return test_pred1

### Prediction2

The next day's closing price is calculated from the Prediction1 result.  
The features are calculated again and the prediction is made using the Prediction1 model.

Prediction1の結果から翌日の終値を算出します。  
再びFeatureを計算して、Prediction1のモデルで予測をおこないます。

In [None]:
def prediction_2(df):
    """Predict the return of the day after next from the Prediction1 result.

    Steps:
      1. Take the last date of the global ``train_saved`` and create a
         synthetic "next day" row per security whose ``Close`` is the adjusted
         close grown by the predicted ``rate_f`` of ``df``.
      2. Append these rows to ``train_saved`` and recompute the price-based
         features (``Close_1``, ``psych12``, moving averages and their lags).
      3. Shift the return columns of ``df`` by one position: ``rate_f``
         becomes ``rate``, ``rate`` becomes the next return column, and so on.
         With a non-consecutive lag list this shift is only an approximation.
      4. Average the predictions of the selected models (global
         ``model_num`` / ``lgb_models``) on the global ``feature_cols``.

    The predicted returns are taken from the ``df`` argument (previously the
    global ``test_pred1`` was read instead). Rows of ``df`` are matched to the
    last-date rows of ``train_saved`` by position, so both must list the
    securities in the same order.

    Args:
        df: Output of ``prediction_1``: the feature columns plus ``rate_f``.
            ``rate_f`` is expected to be the last column containing "rate".

    Returns:
        A numpy array with one averaged prediction per row of ``df``.

    Raises:
        ValueError: If ``model_num`` selects no model.
    """
    #===== Prediction2 =====

    if len(model_num) == 0:
        raise ValueError("prediction_2 needs at least one selected model in model_num")

    rate_cols = [col for col in df.columns if "rate" in col]

    date_pred1 = train_saved['Date'].unique()[-1]
    test_pred2 = train_saved.query('Date == @date_pred1').reset_index(drop=True)

    # Label of the synthetic day; it only has to sort after date_pred1.
    date_pred2 = pd.to_datetime(date_pred1) + timedelta(days=1)
    test_pred2['Date'] = date_pred2

    # Projected close of the synthetic day.
    test_pred2['Close'] = round(
        test_pred2['Close'] * test_pred2['AdjustmentFactor'] * (1 + df['rate_f']), 1)
    test_pred2['AdjustmentFactor'] = 1.
    reduce_mem_usage(test_pred2)

    test_pred2 = pd.concat([train_saved, test_pred2], axis=0)
    test_pred2 = test_pred2.reset_index(drop=True)

    # Recompute the price-based features including the synthetic day.
    test_pred2 = add_Close_lag(test_pred2)
    test_pred2 = add_psych12(test_pred2)
    test_pred2 = add_moving_avg(test_pred2)
    test_pred2 = add_ma_lags(test_pred2)

    test_pred2 = test_pred2.query('Date == @date_pred2').reset_index(drop=True)

    # Shift the return columns by one: each target column takes the values of
    # the column preceding it, with rate_f moving into the first position.
    target_cols = rate_cols[:-1]
    source_cols = (['rate_f'] + target_cols)[:-1]

    test_pred2_rate = df.loc[:, source_cols]
    test_pred2_rate.columns = target_cols

    test_pred2 = pd.concat([test_pred2, test_pred2_rate], axis=1)
    test_pred2 = test_pred2.loc[:, feature_cols]

    predictions2 = [lgb_models[i].predict(test_pred2) for i in model_num]

    prediction2 = np.mean(predictions2, axis=0)

    return prediction2

## Summary of results

### Overall

I varied the parameters and evaluated the LGB based on 2021 data.  
**Overall, the LB scores tended to be negative.**  
The variation in the valid score suggests that I have not been able to extract the most promising features.  

The main results are as follows. The data used for the forecast is for the year 2021.  

* Including 'SecuritiesCode' and 'SectorCode33' in the features often results in negative LB scores.
* In many cases, the score improves slightly when data from the Growth market is removed.
* When the training period (train_days) was set to 5/10/15/20 days, the score was positive only in the 5-day case.

パラメータを変化させて、2021年のデータをもとにLGBで評価をおこないました。  
**全体的にLBスコアは負の値が多い結果となりました。**  
validスコアの変化から、有力な特徴量が抽出できていないと思われます。  

主な結果は以下になります。予測に使用したデータは2021年のものです。  

* 'SecuritiesCode', 'SectorCode33'を特徴量に含めると、LBスコアはマイナスになるケースが多い。
* Growth市場のデータを抜くとややスコアが向上するケースが多い。
* training期間（train_days)を5/10/15/20日としたところ、スコアがプラスになったのは5日だけであった。

### LB scores

A summary of the LB scores is shown below.  

LB was positive in the case of 5 Training days.  
Scores are often better when 'SecuritiesCode' and 'SectorCode33' are not included in the features.
* train_days = 5
* LB = 0.049 ~ 0.262 

The result is negative when the number of training days is 10 or more.  
The absolute value is higher than in the case of 5 days, so multiplying by -1 and submitting will result in a better score.  
The absolute value of the score is higher when the 'SecuritiesCode' and 'SectorCode33' features are included.
* train_days = 10
* LB = -0.1 ~ -0.3 (approximately)

In other words, individual stocks are likely to move in the opposite direction from a similar phase in 2021.

LBスコアのまとめは以下のようになります。  

LBがプラスになったのはTraining日数が5日のケースでした。  
'SecuritiesCode', 'SectorCode33'を特徴量に含めない方がスコアは良くなることが多い。
* train_days = 5
* LB = 0.049 ~ 0.221 

Training日数が10日またはそれ以上の場合、結果はマイナスの値になりました。  
5日の場合より絶対値が大きいため、-1をかけてsubmitするとスコアは良くなります。。。  
(予測に逆張りせよと)  
'SecuritiesCode', 'SectorCode33'特徴量に含めた方がスコアの絶対値は大きくなります。
* train_days = 10
* LB = -0.1 ~ -0.3 (おおよそ)

つまり、個々の銘柄は2021年の同じような局面とは逆の動きをしやすいと思われます。


## Prediction loop for submission

In [None]:
# Submission loop: for every day served by the API the whole pipeline is rerun
# (rebuild features, retrain the fold models, predict two steps ahead, rank).
cc = 1  # 1-based pass counter; importance plots are drawn on the first pass only
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:

    #===== Build the training data =====

    stock_prices = stock_prices_bak.copy()

    # Append the newly served day and keep the extended history for the next
    # pass, so the price history grows by one day per iteration.
    stock_prices = pd.concat([stock_prices, prices])
    stock_prices_bak = stock_prices.copy()

    stock_prices = stock_prices.merge(
        stock_list[['SecuritiesCode', 'SectorCode33', 'NewMarketSegment']],
        how='left', on='SecuritiesCode')
    stock_prices['SectorCode33'] = stock_prices['SectorCode33'].astype(np.int16)
    stock_prices['Date'] = pd.to_datetime(stock_prices['Date'], format="%Y-%m-%d")

    price_features = ['Date',
                      'SecuritiesCode',
                      'Open', 'High', 'Low', 'Close', 'Volume',
                      'AdjustmentFactor', 'SectorCode33', 'NewMarketSegment']

    train = stock_prices.loc[:, price_features]
    train = train[train['Date'] >= CFG.start_day].reset_index(drop=True)

    # One row per market segment with the array of security codes in it.
    train_seg = train.groupby('NewMarketSegment')['SecuritiesCode'].unique().reset_index()
    train_seg['n_codes'] = train_seg['SecuritiesCode'].apply(lambda x: len(x))
    train = train.drop('NewMarketSegment', axis=1)

    if CFG.without_growth_m:
        # Row 0 is the first segment in sorted order.
        # NOTE(review): presumably the Growth market — confirm against stock_list.csv.
        mask = ~train['SecuritiesCode'].isin(train_seg['SecuritiesCode'][0])
        train = train[mask].reset_index(drop=True)

    #===== Create features =====

    train = add_Close_lag(train)
    train = add_psych12(train)      # psychological line (12 rows)
    train = add_moving_avg(train)   # moving averages
    train = add_ma_lags(train)      # lags of the moving averages

    train = add_rate_and_lags(train)  # returns and their lags
    train = remove_na_rows(train)
    #train = add_ma_slopes(train)  # slopes of the moving averages

    # Base columns of the most recent 30 dates, kept for the feature
    # engineering inside prediction_2.
    base_cols = ['Date', 'SecuritiesCode', 'Close', 'AdjustmentFactor', 'SectorCode33']
    date_tmp = train['Date'].unique()[-30]
    train_saved = train.query('Date >= @date_tmp')
    train_saved = train_saved.loc[:, base_cols].reset_index(drop=True)

    train_folds, test_folds = create_df(train)

    # Keep only the folds that contain every security on every one of their days.
    df_n = train_folds.groupby('fold').size()
    growth_num = 0
    if CFG.without_growth_m:
        growth_num = len(train_seg.iloc[0, 1])
    # 2000 is the assumed size of the stock universe per day.
    fold_mem = CFG.train_days * (2000 - growth_num)
    fold_true = df_n[df_n == fold_mem].index
    mask = train_folds['fold'].isin(fold_true)
    train_folds = train_folds[mask].reset_index(drop=True)
    print("train_folds", fold_true)

    del(train)
    gc.collect()

    #===== LGB =====

    s1 = train_folds.columns.tolist()

    # Everything that is not a model input: raw prices, labels, bookkeeping.
    drop_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'AdjustmentFactor',
                 'Close_1', 'rate_f', 'Date_num', 'Date_num_r',
                 'fold', 'u_or_d']
    if CFG.cat_features is None:
        drop_cols.extend(['SecuritiesCode', 'SectorCode33'])

    # The raw moving averages and their lags (names starting with "ma") are
    # dropped; the deviation rates d_ma_<n> remain.
    drop_cols2 = [s for s in s1 if re.search(r'^ma', s)]

    train_tmp = train_folds.drop(drop_cols, axis=1).iloc[0:1, :]
    feature_cols = train_tmp.drop(drop_cols2, axis=1).columns

    cat_features = CFG.cat_features

    lgb_params = {
            "objective": "regression",
            "metric" :"rmse",
            "force_col_wise" : True,
            "learning_rate" : 0.1,
            'feature_fraction' : 0.8,
            "lambda_l2" : 0.1,
            'random_state': 11,
            'verbosity': 1,
            'num_iterations' : 500,
    }

    IMP_GRA = cc == 1

    lgb_scores, lgb_models = run_lgb_loop(train_folds, feature_cols, cat_features, lgb_params, IMP_GRA)

    #===== Prediction-1 =====

    # Positions (in lgb_models) of the best-scoring models.
    model_num = [i[0] for i in lgb_scores[0:CFG.select_model_to_pred]]
    test_pred1 = prediction_1(test_folds)

    #===== Prediction-2 =====

    target = prediction_2(test_pred1)

    df_pred = pd.DataFrame()
    df_pred['SecuritiesCode'] = sample_prediction['SecuritiesCode']

    if CFG.without_growth_m:
        # Excluded securities get a neutral prediction of 0. The remaining rows
        # are filled by position, so `target` must follow the order of
        # sample_prediction.
        mask = ~df_pred['SecuritiesCode'].isin(train_seg.iloc[0, 1])
        df_pred.loc[df_pred[mask].index, "Prediction"] = target
        df_pred['Prediction'] = df_pred['Prediction'].fillna(0)
    else:
        df_pred['Prediction'] = target
        #df_pred['Prediction'] = prediction1

    # Rank 0 goes to the highest predicted return. The rank range follows the
    # number of rows served instead of a fixed 2000.
    sample_prediction["Prediction"] = df_pred['Prediction']
    sample_prediction = sample_prediction.sort_values(by="Prediction", ascending=False)
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)
    # Only the submitted columns are selected, which leaves "Prediction" out.
    submission = sample_prediction[["Date", "SecuritiesCode", "Rank"]].copy()
    env.predict(submission)

    display(submission)

    del(stock_prices, train_folds, test_folds)
    cc += 1
    gc.collect()

## What's next?

Unfortunately, the model seems to be unstable.  

I am considering the following
* Increase training data. Go further back in time.
* Search for other features?