# 参考

In [None]:
# https://zenn.dev/mst8823/articles/da505dcf45474f

# Pipeline

## Config

In [None]:
class Config:
    """Experiment-wide settings, read as class attributes (never instantiated)."""

    # --- Experiment identity / run mode ---
    name = "baseline-lgbm001"
    only_inference = False

    # --- Cross-validation and reproducibility ---
    n_fold = 5
    seed = 42

    # --- Google Colab environment ---
    upload_from_colab = True
    api_path = "/content/drive/MyDrive/kaggle/kaggle.json"
    drive_path = "/content/drive/MyDrive/kaggle/JPXTokyoStock"

    # --- Kaggle kernel environment ---
    kaggle_dataset_path = None

## Library

In [None]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import pytz
import sys
import re

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, mean_squared_error

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from tensorflow.keras import backend as K
from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

import pickle
import glob

# import shap
import xgboost
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns

warnings.filterwarnings("ignore")

## Utils

In [None]:
class Logger:
    """Small wrapper that writes timestamped messages to the console and a log file.

    Reference: https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """

    def __init__(self, path):
        """Attach console and file handlers to the logger named ``path``.

        Args:
            path: Directory that receives ``Experiment.log``; it is also used
                as the name of the underlying ``logging`` logger.
        """
        self.general_logger = logging.getLogger(path)
        # Build the handlers only when the logger has none yet. Creating a
        # FileHandler opens the file immediately, so constructing one on every
        # instantiation (e.g. when a notebook cell is re-run) would leak an
        # open handle that is never attached or closed.
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log'))
            )
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current timestamp."""
        self.general_logger.info(f'[{self.now_string()}] - {message}')

    @staticmethod
    def now_string():
        """Return the current Asia/Tokyo time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now(pytz.timezone('Asia/Tokyo')).strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random``, NumPy and TensorFlow.

    Reference: https://qiita.com/kaggle_grandmaster-arai-san/items/d59b2fb7142ec7e270a5

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Compute the Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predicted results with ``Date``, ``Rank`` and ``Target`` columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _daily_spread(day_df, size, ratio):
        """Return the weighted top-``size`` return minus the weighted bottom-``size`` return for one date."""
        ranks = day_df['Rank']
        # Ranks must start at 0 and end at (number of rows - 1).
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        # Linearly decaying weights: ``ratio`` for the first pick down to 1 for the last.
        weights = np.linspace(start=ratio, stop=1, num=size)
        weight_mean = weights.mean()

        best_first = day_df.sort_values(by='Rank')['Target'][:size]
        worst_first = day_df.sort_values(by='Rank', ascending=False)['Target'][:size]

        purchase = (best_first * weights).sum() / weight_mean
        short = (worst_first * weights).sum() / weight_mean
        return purchase - short

    daily_spreads = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spreads.mean() / daily_spreads.std()

## SetUp

### 環境ごとのセットアップ

In [None]:
# True when the `google.colab` module is already loaded in this interpreter;
# used below to choose between the Colab and the Kaggle setup paths.
COLAB = "google.colab" in sys.modules

In [None]:
# -------------------------------colab 環境の場合-------------------------------
if COLAB:
    print("-------------------------------This environment is Google Colab-------------------------------")
    
    # mount
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive') 

    # my-modules のPath設定
    import sys
    sys.path.append('/content/drive/MyDrive/Colab Notebooks/my-modules')

    # use kaggle api (need kaggle token)
    f = open(Config.api_path, 'r')
    json_data = json.load(f) 
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]
    
    # set dirs
    DRIVE = Config.drive_path
    EXP = (Config.name if Config.name is not None 
           else get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
    INPUT = os.path.join(DRIVE, "input")
    OUTPUT = os.path.join(DRIVE, "output")
    SUBMISSION = os.path.join(DRIVE, "submission")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP) 
    EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
    EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

    # make dirs
    for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

    if not os.path.isfile(os.path.join(INPUT, "jpx-tokyo-stock-exchange-prediction.zip")):
        # download dataset
        # kaggle をインストール
        # アクセスパーミッションのため、以下を打ち込みます。
        ! chmod 600 /root/.kaggle/kaggle.json
        ! pip install kaggle
        ! kaggle competitions download -c jpx-tokyo-stock-exchange-prediction -p $INPUT
        # 上記でdownloadしてきたZIPファイルを解凍
        ! apt-get install p7zip-full -y
        ! 7za x os.path.join(INPUT, "jpx-tokyo-stock-exchange-prediction.zip")
    else:
        print('DS for competition has been already installed.') 
    
    # utils
    logger = Logger(OUTPUT_EXP)

# -------------------------------kaggle 環境の場合-------------------------------
else:
    print("-------------------------------This environment is Kaggle Kernel-------------------------------")
    
    # set dirs
    INPUT = "../input/jpx-tokyo-stock-exchange-prediction"
    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")
    
    # copy dirs
    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        shutil.copytree(KD_MODEL, EXP_MODEL)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    # utils
    logger = Logger(EXP)

# utils
warnings.filterwarnings("ignore")
sns.set(style='whitegrid')
seed_everything(seed=Config.seed)

### input dataの読み込み

In [None]:
# Load data: read every CSV found directly inside each sub-folder of INPUT.
def _read_csvs(folder):
    """Return one DataFrame per ``*.csv`` file matched directly inside ``folder``."""
    pattern = os.path.join(folder, '*.csv')
    return [pd.read_csv(csv_path) for csv_path in glob.glob(pattern, recursive=True)]


input_files = os.listdir(INPUT)
input_folder_paths = list(
    filter(os.path.isdir, (os.path.join(INPUT, name) for name in input_files))
)

# NOTE(review): folders are assigned to variables by their position in the
# os.listdir() result, whose order is arbitrary -- verify that each variable
# really receives the folder its name suggests.
data_specifications_ = _read_csvs(input_folder_paths[0])
example_test_files_ = _read_csvs(input_folder_paths[1])
jpx_tokyo_market_prediction_ = _read_csvs(input_folder_paths[2])
supplemental_files_ = _read_csvs(input_folder_paths[3])
train_files_ = _read_csvs(input_folder_paths[4])

In [None]:
# Select the training table by its position in `train_files_`.
# NOTE(review): the index differs between Colab (3) and Kaggle (4) and depends
# on file listing order; presumably both pick the same CSV -- confirm.
raw_train = train_files_[3 if COLAB else 4]

In [None]:
# if Config.debug:
#     train = train.sample(100).reset_index(drop=True)

# # cv split
# train["fold"] = -1
# for i_fold, lst in enumerate(
#     KFold(
#         n_splits=Config.n_fold, 
#         shuffle=True,
#         random_state=Config.seed).split(
#             X=train, 
#             y=train[Config.target_col]
#             )):
    
#     if i_fold in Config.trn_fold:
#         train.loc[lst[1].tolist(), "fold"] = i_fold

### Handle Nulls

In [None]:
# Work on a copy so that `raw_train` itself is left untouched.
prices = raw_train.copy()
display(prices)
# Drop the column that is mostly NaN, then discard every row that still has a NaN.
prices = prices.drop(columns="ExpectedDividend").dropna()
# Per-column count of remaining nulls.
display(prices.isnull().sum())

### Cross Validation Split

In [None]:
def setup_cv(df, splits=5, seed=42):
    """Return a shuffled copy of ``df`` with a stratified ``fold`` column.

    ``Target`` is cut into equal-width bins (bin count from Sturges' formula)
    and the bin ids are used as strata for ``StratifiedKFold``.

    Args:
        df: DataFrame containing a ``Target`` column. It is not modified.
        splits: Number of folds.
        seed: Random seed used to shuffle the rows.

    Returns:
        A new DataFrame with shuffled rows, a fresh 0..n-1 index and a
        ``fold`` column holding the validation fold id of each row.
    """
    # Shuffle first: `sample` returns a new frame, so the caller's DataFrame
    # never receives the helper columns.
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    shuffled['fold'] = -1

    # Number of bins by Sturges' formula.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    bins = pd.cut(shuffled["Target"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=splits, shuffle=False)
    # The split yields positional indices, which equal the labels after reset_index.
    for fold, (_, valid_idx) in enumerate(kf.split(X=shuffled, y=bins.values)):
        shuffled.loc[valid_idx, 'fold'] = fold

    return shuffled

In [None]:
# Add the CV `fold` column using Config.n_fold folds, then show the result.
prices = setup_cv(prices, splits=Config.n_fold)
prices

### Ordinal Encode Securities Code

In [None]:
# Fit an ordinal encoder on the training SecuritiesCode values and replace the
# column with the encoded values; `enc` stays available as a module-level name.
enc = OrdinalEncoder()
prices["SecuritiesCode"] = enc.fit_transform(prices[["SecuritiesCode"]])
prices

In [None]:
"""
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
scaler.fit(prices.values)

def scale_dataset(df):
    scaled_features=scaler.transform(df.values)
    scaled_features_df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
    return scaled_features_df
"""

In [None]:
# Order the rows by Date (the CV shuffle left them in random order), then show the result.
prices = prices.sort_values(by='Date')
prices

In [None]:
def make_lgb_ds(X, y):
    """Wrap features ``X`` and labels ``y`` in a ``lgb.Dataset``.

    Feature names are left on ``'auto'``.
    """
    dataset = lgb.Dataset(X, label=y, feature_name='auto')
    return dataset

# Train Models

### LGBM

In [None]:
def train_lgbm(df, folds, params):
    """Train one LightGBM model per cross-validation fold.

    For each fold id the rows with that id form the validation set and all
    other rows the training set. Each model is pickled under ``EXP_MODEL``
    (unless a file with the same name already exists) and its out-of-fold
    RMSE is written to the experiment log.

    Args:
        df: DataFrame with the feature columns, ``Target`` and ``fold``.
        folds: Number of folds; fold ids ``0 .. folds - 1`` are used.
        params: Parameter dict forwarded to ``lgb.train``.

    Returns:
        List of trained models, one per fold, in fold order.
    """
    feature_cols = ["SecuritiesCode", "Open", "High", "Low", "Close"]
    target_cols = ["Target"]
    models = []

    for fold in range(folds):
        # Prepare inputs and outputs for the train / valid split.
        train_part = df[df.fold != fold]
        valid_part = df[df.fold == fold]
        X_train, y_train = train_part[feature_cols], train_part[target_cols]
        X_valid, y_valid = valid_part[feature_cols], valid_part[target_cols]

        # Store each split in a LightGBM dataset.
        train_ds = make_lgb_ds(X_train, y_train)
        valid_ds = make_lgb_ds(X_valid, y_valid)

        # Early stopping goes through the callback API: the
        # `early_stopping_rounds` keyword was removed from lgb.train in LightGBM 4.
        model = lgb.train(params=params,
                          train_set=train_ds,
                          num_boost_round=10000,
                          valid_sets=[valid_ds],
                          callbacks=[lgb.early_stopping(stopping_rounds=20)])

        # Out-of-fold prediction and its RMSE.
        oof_preds = model.predict(X_valid)
        oof_score = np.sqrt(mean_squared_error(y_valid, oof_preds))
        models.append(model)

        # Save the fold's model as a pickle file; an existing file is not overwritten.
        model_name = f"{Config.name}-seed{Config.seed}-fold{fold}"
        model_path = os.path.join(EXP_MODEL, model_name)
        if not os.path.isfile(model_path):
            # The context manager closes the file even if pickling fails.
            with open(model_path, 'wb') as model_file:
                pickle.dump(model, model_file)
            print(f"{model_name} has been saved.")
        else:
            print('No model saved.')

        # Log the model name and score (RMSE) for this fold. The message uses a
        # real newline; the original contained a literal "¥n" plus the
        # indentation of the continued source line.
        logger.info(
            f"model_name:{Config.name}-seed:{Config.seed}-fold:{fold}"
            f"\n-X_cols:{X_train.columns.values}-y:{y_train.columns.values} >>>>> RMSE={oof_score}"
        )
        print(f'fold_{fold} has finished.')
        print('-----------------------------')

    return models

### Run - prints rmse for each fold

In [None]:
%%time
# LightGBM settings for an RMSE-evaluated regression with gradient-boosted trees.
lgb_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    num_leaves=64,
    min_data_in_leaf=20,
    max_depth=7,
    verbose=0,
)
# Train one model per fold; the per-fold RMSE is logged inside train_lgbm.
lgbm_models = train_lgbm(prices, folds=Config.n_fold, params=lgb_params)

# Make Predictions & Submit

In [None]:
def pred_to_submission(pred):
    """Convert per-security predictions into a ranked submission table.

    The row with the highest ``Prediction`` gets rank 0, the next rank 1, and
    so on. The input DataFrame is not modified.

    Args:
        pred: DataFrame with ``Date``, ``SecuritiesCode`` and ``Prediction``
            columns; any number of rows. An existing ``Rank`` column is ignored.

    Returns:
        DataFrame with columns ``Date``, ``SecuritiesCode`` and ``Rank``,
        sorted by ``SecuritiesCode`` in ascending order.
    """
    ranked = pred.sort_values(by="Prediction", ascending=False)
    # Size the ranks from the data instead of assuming exactly 2000 rows, and
    # assign with [] so the column is created even when "Rank" is absent.
    ranked["Rank"] = np.arange(len(ranked))
    ranked = ranked.sort_values(by="SecuritiesCode", ascending=True)
    return ranked[["Date", "SecuritiesCode", "Rank"]]

In [None]:
# Inference runs only outside Colab, through the competition's time-series API.
if not COLAB:
    import jpx_tokyo_market_prediction as JTMP
    env = JTMP.make_env()
    iter_test = env.iter_test()

    feature_cols = ["SecuritiesCode", "Open", "High", "Low", "Close"]
    # Reuse the code -> ordinal mapping learned on the training data. Re-fitting
    # the encoder on each test batch would number the codes differently from
    # training whenever the batch contains a different set of securities.
    code_to_ordinal = {code: float(i) for i, code in enumerate(enc.categories_[0])}

    for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
        # Codes that were not seen during training become NaN (treated as a
        # missing value by LightGBM) instead of raising.
        prices["SecuritiesCode"] = prices["SecuritiesCode"].map(code_to_ordinal).astype("float64")

        X_test = prices[feature_cols]
        # Average the predictions of all fold models.
        lgbm_preds = np.mean([model.predict(X_test) for model in lgbm_models], axis=0)

        # NOTE(review): assumes `sample_prediction` rows are in the same order
        # as `prices` rows -- confirm against the API.
        sample_prediction["Prediction"] = lgbm_preds
        submission = pred_to_submission(sample_prediction)
        print(submission)

        env.predict(submission)

## Others

In [None]:
# folderのディレクトリ構造可視化ツール
import pathlib
import glob
import os

def tree(path, layer=0, is_last=False, indent_current='　'):
    """Print the directory structure below ``path`` as an indented tree.

    Directories are shown as ``<name>``, files by their plain name. Entries of
    each directory are listed in sorted order.

    Args:
        path: Directory to display (str or path-like). A relative path is
            resolved to an absolute one; trailing slashes are ignored.
        layer: Recursion depth; 0 for the root call.
        is_last: Whether this directory is the last entry of its parent.
        indent_current: Prefix printed in front of this directory's branch marker.
    """
    path = str(path)
    if not pathlib.Path(path).is_absolute():
        path = str(pathlib.Path(path).resolve())
    # Strip trailing separators so the directory name is never empty.
    path = path.rstrip('/') or '/'

    # Show the current directory.
    current = os.path.basename(path)
    if layer == 0:
        print('<' + current + '>')
    else:
        branch = '└' if is_last else '├'
        print('{indent}{branch}<{dirname}>'.format(indent=indent_current, branch=branch, dirname=current))

    # Collect the entries one level down. The path is escaped so glob
    # metacharacters in directory names are taken literally, and the result is
    # sorted because glob returns matches in arbitrary order.
    pattern = os.path.join(glob.escape(path), '*')
    paths = sorted(p for p in glob.glob(pattern) if os.path.isdir(p) or os.path.isfile(p))
    last_index = len(paths) - 1

    # The indentation of the children is the same for every entry.
    indent_lower = indent_current
    if layer != 0:
        indent_lower += '　　' if is_last else '│　'

    # Display recursively.
    for i, p in enumerate(paths):
        entry_is_last = i == last_index
        if os.path.isfile(p):
            branch = '└' if entry_is_last else '├'
            print('{indent}{branch}{filename}'.format(indent=indent_lower, branch=branch, filename=os.path.basename(p)))
        elif os.path.isdir(p):
            tree(p, layer=layer + 1, is_last=entry_is_last, indent_current=indent_lower)

In [None]:
# Print the layout of the project folder on Google Drive (same path as Config.drive_path).
tree('/content/drive/MyDrive/kaggle/JPXTokyoStock')