In [1]:
import os
from pathlib import Path
from decimal import ROUND_HALF_UP, Decimal

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [2]:
# I/O helpers
# Root directory for the files this notebook writes (CSV exports, saved models).
BASE_PATH = Path('/kaggle/working')

def adjusting_price(price, key: str):
    """Add an adjustment-factor-corrected version of one price column.

    For every ``SecuritiesCode`` the ``AdjustmentFactor`` column is multiplied
    cumulatively from the newest row back to the oldest, and ``price[key]`` is
    multiplied by that cumulative factor.

    Args:
        price (pd.DataFrame): stock prices; must contain ``SecuritiesCode``,
            ``Date``, ``AdjustmentFactor`` and the column named by ``key``.
        key (str): name of the price column to adjust (e.g. ``"Close"``).
    Returns:
        pd.DataFrame: a new frame sorted by ``SecuritiesCode`` then ``Date``
        with a fresh 0..n-1 index and two extra columns,
        ``CumulativeAdjustmentFactor{key}`` and ``Adjusted{key}``.
    """
    factor_col = f"CumulativeAdjustmentFactor{key}"
    adjusted_col = f"Adjusted{key}"

    def generate_adjusted(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with the adjusted column for a single SecuritiesCode
        """
        # newest row first, so cumprod accumulates the factors backwards in time
        df = df.sort_values("Date", ascending=False)
        df[factor_col] = df["AdjustmentFactor"].cumprod()
        # round half-up to one decimal place; going through str() avoids binary float artefacts
        df[adjusted_col] = (df[factor_col] * df[key]).map(
            lambda x: float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        )
        # back to chronological order
        df = df.sort_values("Date")
        # a value of 0 is treated as missing and replaced by the previous valid value
        df.loc[df[adjusted_col] == 0, adjusted_col] = np.nan
        df[adjusted_col] = df[adjusted_col].ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate over the groups explicitly instead of using groupby.apply: apply's handling
    # of the grouping column differs between pandas versions, and with
    # reset_index(drop=True) the SecuritiesCode column could be lost.
    pieces = [generate_adjusted(group) for _, group in price.groupby("SecuritiesCode")]
    if not pieces:
        # no groups at all (empty input): still emit the two new columns
        pieces = [generate_adjusted(price.iloc[:0])]
    return pd.concat(pieces).reset_index(drop=True)

def adjusting_volume(price, key = "Volume"):
    """Add an adjustment-factor-corrected version of the volume column.

    For every ``SecuritiesCode`` the ``AdjustmentFactor`` column is multiplied
    cumulatively from the newest row back to the oldest, and ``price[key]`` is
    divided by that cumulative factor (prices are multiplied by it in
    ``adjusting_price``; volume uses the inverse).

    Args:
        price (pd.DataFrame): stock prices; must contain ``SecuritiesCode``,
            ``Date``, ``AdjustmentFactor`` and the column named by ``key``.
        key (str): name of the volume column to adjust. Defaults to ``"Volume"``.
    Returns:
        pd.DataFrame: a new frame sorted by ``SecuritiesCode`` then ``Date``
        with a fresh 0..n-1 index and two extra columns,
        ``CumulativeAdjustmentFactor{key}`` and ``Adjusted{key}``.
    """
    factor_col = f"CumulativeAdjustmentFactor{key}"
    adjusted_col = f"Adjusted{key}"

    def generate_adjusted(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with the adjusted column for a single SecuritiesCode
        """
        # newest row first, so cumprod accumulates the factors backwards in time
        df = df.sort_values("Date", ascending=False)
        df[factor_col] = df["AdjustmentFactor"].cumprod()
        # round half-up to one decimal place; going through str() avoids binary float artefacts
        df[adjusted_col] = (df[key] / df[factor_col]).map(
            lambda x: float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))
        )
        # back to chronological order
        df = df.sort_values("Date")
        # a value of 0 is treated as missing and replaced by the previous valid value
        df.loc[df[adjusted_col] == 0, adjusted_col] = np.nan
        df[adjusted_col] = df[adjusted_col].ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate over the groups explicitly instead of using groupby.apply: apply's handling
    # of the grouping column differs between pandas versions, and with
    # reset_index(drop=True) the SecuritiesCode column could be lost.
    pieces = [generate_adjusted(group) for _, group in price.groupby("SecuritiesCode")]
    if not pieces:
        # no groups at all (empty input): still emit the two new columns
        pieces = [generate_adjusted(price.iloc[:0])]
    return pd.concat(pieces).reset_index(drop=True)

def read_prices(dir_name: str, securities_code: int = None):
    """Load ``stock_prices.csv`` from one directory of the competition input data.

    Important: the dataset of 2020/10/1 is lost because of a system failure in JPX, see:
    https://www.jpx.co.jp/corporate/news/news-releases/0060/20201019-01.html

    Args:
        dir_name (str): sub-directory of ``../input/jpx-tokyo-stock-exchange-prediction``
            that holds ``stock_prices.csv`` (e.g. ``"train_files"``).
        securities_code (int, optional): when given, keep only the rows of this code.
    Returns:
        pd.DataFrame: price rows whose ``Open`` is not missing, with ``Date``
        parsed to datetime.
    """
    base_path = Path(f'../input/jpx-tokyo-stock-exchange-prediction/{dir_name}')
    df = pd.read_csv(base_path / 'stock_prices.csv')
    # Plain column assignment replaces the column, so it really becomes datetime64.
    # `df.loc[:, "Date"] = ...` writes into the existing column on recent pandas and
    # can leave it with object dtype.
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df = df[df['Open'].notna()]
    # compare against None explicitly rather than relying on the code being truthy
    if securities_code is not None:
        df = df[df["SecuritiesCode"] == securities_code]
    return df

def read_stock_list(securities_code: int = None, only_universe: bool = True):
    """Load ``stock_list.csv`` from the competition input data.

    Args:
        securities_code (int, optional): when given, keep only the row(s) of this code.
        only_universe (bool): when True, keep only rows whose ``Universe0`` flag is set.
    Returns:
        pd.DataFrame: the stock list with ``EffectiveDate`` parsed (``%Y%m%d``) to datetime.
    """
    df = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv')
    # Plain column assignment replaces the column, so it really becomes datetime64.
    # `df.loc[:, col] = ...` writes into the existing column on recent pandas and
    # may not change its dtype.
    df["EffectiveDate"] = pd.to_datetime(df["EffectiveDate"], format="%Y%m%d")
    if only_universe:
        df = df[df['Universe0']]
    # compare against None explicitly rather than relying on the code being truthy
    if securities_code is not None:
        df = df[df["SecuritiesCode"] == securities_code]
    return df

def read_train_data_by_price(securities_code: int = None, with_supplemental: bool = True):
    """Build the training frame: prices left-joined with the stock list, plus adjusted columns.

    The price data is the base table; the stock list is attached to it with a
    left join on ``SecuritiesCode``.

    Args:
        securities_code (int, optional): restrict both prices and stock list to this code.
        with_supplemental (bool): when True, append the rows of ``supplemental_files``
            after those of ``train_files``.
    Returns:
        pd.DataFrame: merged prices with ``Adjusted{Close,Open,High,Low,Volume}`` and the
        matching ``CumulativeAdjustmentFactor*`` columns.
    """
    def merge_data(prices, stock_list):
        # If the stock list carries its own 'Close' column, rename it so it cannot
        # collide with the price 'Close' during the merge.
        renamed_list = stock_list.rename(columns={'Close': 'Close_x'})
        return prices.merge(renamed_list, on='SecuritiesCode', how="left")

    # The stock list is identical for both price sources, so read it only once.
    stock_list = read_stock_list(securities_code=securities_code)

    # origin
    df = merge_data(
        prices=read_prices(dir_name="train_files", securities_code=securities_code),
        stock_list=stock_list,
    )

    # supplemental
    if with_supplemental:
        supplemental_df = merge_data(
            prices=read_prices(dir_name="supplemental_files", securities_code=securities_code),
            stock_list=stock_list,
        )
        df = pd.concat([df, supplemental_df]).reset_index(drop=True)

    for price_key in ("Close", "Open", "High", "Low"):
        df = adjusting_price(df, price_key)
    df = adjusting_volume(df)
    return df

def write_df(df, filename):
    """Save ``df`` as ``<filename>.csv`` under ``BASE_PATH``, without the index column."""
    target = BASE_PATH / f'{filename}.csv'
    df.to_csv(target, index=False)

In [None]:
# Build the training frame with the default arguments: all securities, train_files
# plus supplemental_files, with the Adjusted* columns added.
train_df = read_train_data_by_price()
# Bare last expression: shows the frame as the cell output.
train_df

# Feature

In [4]:
def cal_moving_average(key:str, periods):
    """Create a transformer that adds rolling means of ``key`` and the gap to them.

    Args:
        key (str): column to average.
        periods (iterable of int): rolling window lengths.
    Returns:
        callable: ``f(df) -> df``. For every window it adds, in place,
        ``MovingAverage{key}{period}`` (rolling mean with ``min_periods=1``) and
        ``MovingAverage{key}{period}GapPercent`` (``key`` as a percentage of that mean),
        then returns the same frame.
    """
    def add_moving_averages(df):
        for window in periods:
            mean_col = f"MovingAverage{key}{window}"
            gap_col = f"{mean_col}GapPercent"
            rolling_mean = df[key].rolling(window, min_periods=1).mean()
            df[mean_col] = rolling_mean
            df[gap_col] = (df[key] / df[mean_col]) * 100.0
        return df
    return add_moving_averages

def cal_changing_ration(key:str, periods):
    """Create a transformer that adds percentage changes of ``key`` over several lags.

    Args:
        key (str): column to compare against its own past values.
        periods (iterable of int): lags, in rows, passed to ``pct_change``.
    Returns:
        callable: ``f(df) -> df``. For every lag it adds, in place,
        ``ChangingRatio{key}{period}`` (``pct_change(period) * 100``) and returns
        the same frame.
    """
    def add_changing_ratios(df):
        for lag in periods:
            ratio_col = f"ChangingRatio{key}{lag}"
            fractional_change = df[key].pct_change(lag)
            df[ratio_col] = fractional_change * 100
        return df
    return add_changing_ratios

def cal_historical_vix(key: str, periods):
    """Create a transformer that adds rolling volatility of the log-differences of ``key``.

    Args:
        key (str): column whose log is differenced row to row.
        periods (iterable of int): rolling window lengths for the standard deviation.
    Returns:
        callable: ``f(df) -> df``. For every window it adds, in place,
        ``HistoricalVIX{key}{period}`` (rolling std of ``log(key).diff()``) and
        returns the same frame.
    """
    def add_historical_vix(df):
        for window in periods:
            vix_col = f"HistoricalVIX{key}{window}"
            log_diff = np.log(df[key]).diff()
            df[vix_col] = log_diff.rolling(window).std()
        return df
    return add_historical_vix

def add_columns_per_code(df, functions):
    """Run a chain of column-adding functions separately on each security.

    Args:
        df (pd.DataFrame): input with ``SecuritiesCode`` and ``Date`` columns.
        functions (iterable of callable): each takes a frame and returns a frame;
            they are applied in order to every per-security frame.
    Returns:
        pd.DataFrame: all per-security results concatenated, sorted by
        ``SecuritiesCode`` then ``Date``, with a fresh 0..n-1 index.
    """
    def run_pipeline(group):
        for f in functions:
            group = f(group)
        return group

    df = df.sort_values(["SecuritiesCode", "Date"])
    # The feature functions add columns in place. Mutating the group inside
    # groupby.apply is not supported by pandas, and apply's handling of the grouping
    # column differs between versions, so each group is copied and processed
    # explicitly, then the results are concatenated.
    pieces = [run_pipeline(group.copy()) for _, group in df.groupby("SecuritiesCode")]
    if not pieces:
        # no groups at all (empty input): still emit the feature columns
        pieces = [run_pipeline(df.iloc[:0].copy())]
    return pd.concat(pieces).reset_index(drop=True)

def add_columns_per_day(base_df):
    """Add two same-row price-range features, both expressed relative to ``Close``.

    ``diff_rate1`` is ``(Close - Open) / Close`` and ``diff_rate2`` is
    ``(High - Low) / Close``. The columns are added to ``base_df`` in place and
    the same frame is returned.
    """
    feature_specs = (
        ('diff_rate1', 'Close', 'Open'),
        ('diff_rate2', 'High', 'Low'),
    )
    for feature_name, upper, lower in feature_specs:
        base_df[feature_name] = (base_df[upper] - base_df[lower]) / base_df['Close']
    return base_df

def generate_features(df):
    """Add all engineered feature columns to a copy of ``df``.

    Per security, moving averages, percentage changes and rolling log-return
    volatility are computed for each adjusted column over windows of 5, 25 and
    75 rows; the same-row ``diff_rate1`` / ``diff_rate2`` features are added afterwards.

    Args:
        df (pd.DataFrame): prices with the ``Adjusted*`` columns plus
            ``SecuritiesCode``, ``Date``, ``Open``, ``High``, ``Low`` and ``Close``.
    Returns:
        tuple: ``(feature_df, add_column_names)`` where ``add_column_names`` lists the
        columns that were not in ``df``, in the order they appear in ``feature_df``.
    """
    base_df = df.copy()
    prev_column_names = set(base_df.columns)
    periods = [5, 25, 75]
    adjusted_keys = [
        "AdjustedClose",
        "AdjustedOpen",
        "AdjustedHigh",
        "AdjustedLow",
        "AdjustedVolume",
    ]
    # same order as before: all moving averages, then all changing ratios, then all VIX
    functions = [
        factory(key, periods)
        for factory in (cal_moving_average, cal_changing_ration, cal_historical_vix)
        for key in adjusted_keys
    ]

    base_df = add_columns_per_code(base_df, functions)
    base_df = add_columns_per_day(base_df)

    # Keep the frame's column order. A plain set difference yields a hash-dependent
    # order that can change from one kernel run to the next.
    add_column_names = [col for col in base_df.columns if col not in prev_column_names]
    return base_df, add_column_names

def select_features(feature_df, add_column_names, is_train):
    """Reduce the feature frame to id, feature and label columns and clean missing values.

    Args:
        feature_df (pd.DataFrame): output of feature generation; it is not modified.
        add_column_names (iterable of str): names of the numerical feature columns.
        is_train (bool): True drops every row containing a NaN. False keeps all rows
            and replaces NaN and +/-inf in the numerical features with 0.
    Returns:
        tuple: ``(feature_df, feat_cols, label_col)``. ``feat_cols`` is the sorted
        numerical columns followed by the categorical ones, and ``label_col`` is
        ``['Target']``.
    """
    base_cols = ['RowId', 'Date', 'SecuritiesCode']
    numerical_cols = sorted(add_column_names)
    categorical_cols = ['NewMarketSegment', '33SectorCode', '17SectorCode']
    label_col = ['Target']
    feat_cols = numerical_cols + categorical_cols

    if is_train:
        selected_cols = base_cols + feat_cols + label_col
    else:
        # the label is optional when not training; keep it only when it is there
        selected_cols = base_cols + feat_cols + [c for c in label_col if c in feature_df.columns]

    # astype returns a new frame, so the later assignments never write into a slice
    # of the caller's data
    feature_df = feature_df[selected_cols].astype({col: 'category' for col in categorical_cols})
    if is_train:
        feature_df = feature_df.dropna()
    else:
        feature_df[numerical_cols] = (
            feature_df[numerical_cols].fillna(0).replace([np.inf, -np.inf], 0)
        )
    return feature_df, feat_cols, label_col

def preprocessor(base_df, is_train=True):
    """Run feature generation followed by feature selection.

    Args:
        base_df (pd.DataFrame): input frame; a copy is processed.
        is_train (bool): forwarded to ``select_features``.
    Returns:
        tuple: ``(feature_df, feat_cols, label_col)`` as returned by ``select_features``.
    """
    # Feature generation
    generated_df, new_column_names = generate_features(base_df.copy())

    # Feature selection
    selected_df, feat_cols, label_col = select_features(generated_df, new_column_names, is_train)
    return selected_df, feat_cols, label_col

feature_df, feat_cols, label_col = preprocessor(train_df)

# Overwrite feat_cols based on the model's results
feat_cols = [
    '33SectorCode',
    'ChangingRatioAdjustedVolume25',
    'diff_rate2',
    'MovingAverageAdjustedHigh5GapPercent',
    'MovingAverageAdjustedOpen5GapPercent',
    'HistoricalVIXAdjustedLow5',
    'MovingAverageAdjustedClose5GapPercent',
    'HistoricalVIXAdjustedOpen5',
    'MovingAverageAdjustedLow25GapPercent',
    'ChangingRatioAdjustedVolume5',
    'HistoricalVIXAdjustedOpen75',
    'HistoricalVIXAdjustedVolume5',
    'MovingAverageAdjustedVolume25GapPercent',
    'MovingAverageAdjustedVolume5',
    'diff_rate1',
    'ChangingRatioAdjustedHigh5',
    'ChangingRatioAdjustedOpen25',
    'HistoricalVIXAdjustedOpen25',
    'MovingAverageAdjustedClose25GapPercent',
    'MovingAverageAdjustedVolume75GapPercent',
    'ChangingRatioAdjustedLow25',
    'ChangingRatioAdjustedLow5',
    'HistoricalVIXAdjustedHigh75',
    'MovingAverageAdjustedLow5GapPercent',
    'ChangingRatioAdjustedClose75',
    'MovingAverageAdjustedClose5',
    'MovingAverageAdjustedClose75',
    'MovingAverageAdjustedClose75GapPercent',
    'HistoricalVIXAdjustedVolume75',
]
feat_cols

['33SectorCode',
 'ChangingRatioAdjustedVolume25',
 'diff_rate2',
 'MovingAverageAdjustedHigh5GapPercent',
 'MovingAverageAdjustedOpen5GapPercent',
 'HistoricalVIXAdjustedLow5',
 'MovingAverageAdjustedClose5GapPercent',
 'HistoricalVIXAdjustedOpen5',
 'MovingAverageAdjustedLow25GapPercent',
 'ChangingRatioAdjustedVolume5',
 'HistoricalVIXAdjustedOpen75',
 'HistoricalVIXAdjustedVolume5',
 'MovingAverageAdjustedVolume25GapPercent',
 'MovingAverageAdjustedVolume5',
 'diff_rate1',
 'ChangingRatioAdjustedHigh5',
 'ChangingRatioAdjustedOpen25',
 'HistoricalVIXAdjustedOpen25',
 'MovingAverageAdjustedClose25GapPercent',
 'MovingAverageAdjustedVolume75GapPercent',
 'ChangingRatioAdjustedLow25',
 'ChangingRatioAdjustedLow5',
 'HistoricalVIXAdjustedHigh75',
 'MovingAverageAdjustedLow5GapPercent',
 'ChangingRatioAdjustedClose75',
 'MovingAverageAdjustedClose5',
 'MovingAverageAdjustedClose75',
 'MovingAverageAdjustedClose75GapPercent',
 'HistoricalVIXAdjustedVolume75']

# Learning

In [5]:
# Sort the predictions in descending order within each date and number them.
# In other words: derive the submission column from the predicted target.
def add_rank(df, col_name="pred"):
    """Add a zero-based integer ``Rank`` column to ``df`` in place and return it.

    Within every ``Date`` the largest ``col_name`` value gets rank 0; ties are
    ranked in order of appearance (``method="first"``).
    """
    one_based_rank = df.groupby("Date")[col_name].rank(ascending=False, method="first")
    df["Rank"] = one_based_rank - 1
    df["Rank"] = df["Rank"].astype("int")
    return df

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Compute the Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predicted results with ``Date``, ``Rank`` and ``Target`` columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """Spread return of one day: weighted top-ranked targets minus weighted bottom-ranked targets.

        Args:
            df (pd.DataFrame): predicted results of a single day
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        ranks = df['Rank']
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        # linearly decreasing weights from toprank_weight_ratio down to 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

        def weighted_leg(ascending):
            # the first `portfolio_size` targets from one end of the ranking
            leg_targets = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (leg_targets * weights).sum() / weights.mean()

        return weighted_leg(True) - weighted_leg(False)

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

# Compute the score from the evaluation data frame and the predictions made for it.
def evaluator(df, pred):
    """Attach ``pred`` to ``df`` as column ``pred``, rank it per date and return the Sharpe score.

    ``df`` is modified in place: it gains the ``pred`` and ``Rank`` columns.
    """
    df["pred"] = pred
    ranked_df = add_rank(df)
    return calc_spread_return_sharpe(ranked_df)

# NOTE: both imports bind the same alias. The second statement rebinds `lgb`, so every
# `lgb.*` call below resolves to `optuna.integration.lightgbm`, not to plain `lightgbm`.
import lightgbm as lgb
import optuna.integration.lightgbm as lgb

# Function that runs the training
def trainer(feature_df, feat_cols, label_col, fold_params, seed=2022):
    """Train and evaluate one model per time-based fold.

    For every fold the model is fitted on the rows dated ``[param[0], param[1])``
    and validated on the rows dated ``[param[1], param[2])``. Each fold plots the
    RMSE history and the feature importances, writes ``importance_{i}.csv`` (through
    ``write_df``) and saves the model as ``model_{i}.txt`` under ``BASE_PATH``.
    The per-fold scores and their mean are printed at the end.

    Args:
        feature_df (pd.DataFrame): features with a ``Date`` column plus every column
            named in ``feat_cols`` and ``label_col``.
        feat_cols (list of str): feature columns fed to the model.
        label_col (list of str): label column(s).
        fold_params (iterable): one ``(train start, train end = valid start, valid end)``
            triple per fold, compared against ``Date``.
        seed (int): value passed to LightGBM as ``seed``.
    Returns:
        list: the trained models, one per fold, in fold order.
    """
    # The hyper-parameters are identical for every fold; each fold gets its own copy.
    base_params = {
        'task': 'train',                   # training task
        'boosting_type': 'gbdt',           # GBDT
        'objective': 'regression',         # regression
        'metric': 'rmse',                  # loss (error) metric
        'learning_rate': 0.01,             # learning rate
        'lambda_l1': 0.5,                  # L1 regularisation coefficient
        'lambda_l2': 0.5,                  # L2 regularisation coefficient
        'num_leaves': 10,                  # maximum number of leaves
        'feature_fraction': 0.5,           # fraction of columns sampled at random
        'bagging_fraction': 0.5,           # fraction of rows sampled at random
        'bagging_freq': 5,                 # bagging frequency
        'min_child_samples': 10,           # minimum number of samples per leaf
        'seed': seed                       # random seed
    }

    scores = []
    models = []
    for i, param in enumerate(fold_params):
        ################################
        # Data preparation
        ################################
        train_start, valid_start, valid_end = param[0], param[1], param[2]
        train = feature_df[(train_start <= feature_df['Date']) & (feature_df['Date'] < valid_start)]
        valid = feature_df[(valid_start <= feature_df['Date']) & (feature_df['Date'] < valid_end)]

        X_train = train[feat_cols]
        y_train = train[label_col]
        X_valid = valid[feat_cols]
        y_valid = valid[label_col]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        ################################
        # Training
        ################################
        lgb_results = {}
        model = lgb.train(
            dict(base_params),                 # hyper-parameters
            lgb_train,                         # training data
            valid_sets=[lgb_train, lgb_valid], # evaluation data
            valid_names=['Train', 'Valid'],    # names of the evaluation sets
            num_boost_round=2000,              # maximum number of boosting rounds
            early_stopping_rounds=100,         # early-stopping patience
            evals_result=lgb_results,          # receives the training history
            verbose_eval=100,                  # logging interval
        )

        ################################
        # Plot the results
        ################################
        fig = plt.figure(figsize=(10, 4))

        # loss history (the metric is RMSE, so label the axis accordingly)
        plt.subplot(1, 2, 1)
        loss_train = lgb_results['Train']['rmse']
        loss_valid = lgb_results['Valid']['rmse']
        plt.xlabel('Iteration')
        plt.ylabel('rmse')
        plt.plot(loss_train, label='train loss')
        plt.plot(loss_valid, label='valid loss')
        plt.legend()

        # feature importance
        plt.subplot(1, 2, 2)
        importance = pd.DataFrame({'feature': feat_cols, 'importance': model.feature_importance()})
        write_df(importance, f"importance_{i}")
        sns.barplot(x='importance', y='feature', data=importance.sort_values('importance', ascending=False))

        plt.tight_layout()
        plt.show()
        # release the figure so repeated folds do not accumulate open figures
        plt.close(fig)

        ################################
        # Evaluation
        ################################
        # inference on the validation rows
        pred = model.predict(X_valid, num_iteration=model.best_iteration)
        # competition score of those predictions
        score = evaluator(valid, pred)

        scores.append(score)
        models.append(model)
        # Save the model. The path is built with Path `/`; formatting
        # f'{BASE_PATH} / model_{i}.txt' would put a literal " / " into the file name.
        model.save_model(str(BASE_PATH / f'model_{i}.txt'))

    print("CV_SCORES:", scores)
    print("CV_SCORE:", np.mean(scores))

    return models

In [6]:
# Data before 2020-12-23 does not cover all 2000 securities codes, so only data from that date onward is used.
# (training start date, training end date = validation start date, validation end date)
fold_params = [
    ('2020-12-23', '2021-11-01', '2021-12-01'),
    ('2021-01-23', '2021-12-01', '2022-01-01'),
    ('2021-02-23', '2022-01-01', '2022-02-01'),
]
models = trainer(feature_df, feat_cols, label_col, fold_params)

[32m[I 2022-04-21 22:37:41,754][0m A new study created in memory with name: no-name-4b4a3694-c921-4962-af7a-b5a864aab57a[0m
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.024645:  14%|#4        | 1/7 [00:04<00:28,  4.69s/it][32m[I 2022-04-21 22:37:46,454][0m Trial 0 finished with value: 0.02464463239883677 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.02464463239883677.[0m
feature_fraction, val_score: 0.024645:  14%|#4        | 1/7 [00:04<00:28,  4.69s/it]

[100]	Train's rmse: 0.0214215	Valid's rmse: 0.0246646
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214585	Valid's rmse: 0.0246446
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.024645:  29%|##8       | 2/7 [00:08<00:21,  4.22s/it][32m[I 2022-04-21 22:37:50,343][0m Trial 1 finished with value: 0.024644802843216466 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.02464463239883677.[0m
feature_fraction, val_score: 0.024645:  29%|##8       | 2/7 [00:08<00:21,  4.22s/it]

[100]	Train's rmse: 0.0214213	Valid's rmse: 0.0246646
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214585	Valid's rmse: 0.0246448
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.024645:  43%|####2     | 3/7 [00:12<00:15,  3.94s/it][32m[I 2022-04-21 22:37:53,957][0m Trial 2 finished with value: 0.02464463239883677 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.02464463239883677.[0m
feature_fraction, val_score: 0.024645:  43%|####2     | 3/7 [00:12<00:15,  3.94s/it]

[100]	Train's rmse: 0.021422	Valid's rmse: 0.0246653
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214585	Valid's rmse: 0.0246446
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.024645:  57%|#####7    | 4/7 [00:15<00:11,  3.78s/it][32m[I 2022-04-21 22:37:57,493][0m Trial 3 finished with value: 0.02464486665625294 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.02464463239883677.[0m
feature_fraction, val_score: 0.024645:  57%|#####7    | 4/7 [00:15<00:11,  3.78s/it]

[100]	Train's rmse: 0.0214247	Valid's rmse: 0.0246661
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214586	Valid's rmse: 0.0246449
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.024645:  71%|#######1  | 5/7 [00:19<00:07,  3.70s/it][32m[I 2022-04-21 22:38:01,058][0m Trial 4 finished with value: 0.024644682109811286 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.02464463239883677.[0m
feature_fraction, val_score: 0.024645:  71%|#######1  | 5/7 [00:19<00:07,  3.70s/it]

[100]	Train's rmse: 0.0214234	Valid's rmse: 0.0246646
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214586	Valid's rmse: 0.0246447
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.024645:  86%|########5 | 6/7 [00:23<00:03,  3.81s/it][32m[I 2022-04-21 22:38:05,086][0m Trial 5 finished with value: 0.024644677040188058 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.02464463239883677.[0m
feature_fraction, val_score: 0.024645:  86%|########5 | 6/7 [00:23<00:03,  3.81s/it]

[100]	Train's rmse: 0.021421	Valid's rmse: 0.024666
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214585	Valid's rmse: 0.0246447
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.024645: 100%|##########| 7/7 [00:26<00:00,  3.69s/it][32m[I 2022-04-21 22:38:08,535][0m Trial 6 finished with value: 0.024644820662975717 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.02464463239883677.[0m
feature_fraction, val_score: 0.024645: 100%|##########| 7/7 [00:26<00:00,  3.83s/it]


[100]	Train's rmse: 0.0214265	Valid's rmse: 0.0246637
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214586	Valid's rmse: 0.0246448


num_leaves, val_score: 0.024645:   0%|          | 0/20 [00:00<?, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:   5%|5         | 1/20 [00:04<01:19,  4.20s/it][32m[I 2022-04-21 22:38:12,737][0m Trial 7 finished with value: 0.02464464975344699 and parameters: {'num_leaves': 16}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:   5%|5         | 1/20 [00:04<01:19,  4.20s/it]

[100]	Train's rmse: 0.021409	Valid's rmse: 0.0246651
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214584	Valid's rmse: 0.0246446
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  10%|#         | 2/20 [00:10<01:35,  5.30s/it][32m[I 2022-04-21 22:38:18,805][0m Trial 8 finished with value: 0.024644705773761863 and parameters: {'num_leaves': 31}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  10%|#         | 2/20 [00:10<01:35,  5.30s/it]

[100]	Train's rmse: 0.0213859	Valid's rmse: 0.024666
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214582	Valid's rmse: 0.0246447
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  15%|#5        | 3/20 [00:19<01:56,  6.88s/it][32m[I 2022-04-21 22:38:27,567][0m Trial 9 finished with value: 0.024644957193690985 and parameters: {'num_leaves': 186}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  15%|#5        | 3/20 [00:19<01:56,  6.88s/it]

[100]	Train's rmse: 0.021268	Valid's rmse: 0.024674
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214568	Valid's rmse: 0.024645
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  20%|##        | 4/20 [00:27<02:02,  7.67s/it][32m[I 2022-04-21 22:38:36,439][0m Trial 10 finished with value: 0.024645013329043588 and parameters: {'num_leaves': 204}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  20%|##        | 4/20 [00:27<02:02,  7.67s/it]

[100]	Train's rmse: 0.0212584	Valid's rmse: 0.0246728
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214567	Valid's rmse: 0.024645
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  25%|##5       | 5/20 [00:33<01:45,  7.02s/it][32m[I 2022-04-21 22:38:42,322][0m Trial 11 finished with value: 0.024644786031698444 and parameters: {'num_leaves': 47}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  25%|##5       | 5/20 [00:33<01:45,  7.02s/it]

[100]	Train's rmse: 0.0213673	Valid's rmse: 0.0246688
Early stopping, best iteration is:
[1]	Train's rmse: 0.021458	Valid's rmse: 0.0246448
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  30%|###       | 6/20 [00:42<01:46,  7.62s/it][32m[I 2022-04-21 22:38:51,105][0m Trial 12 finished with value: 0.024644946580947322 and parameters: {'num_leaves': 121}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  30%|###       | 6/20 [00:42<01:46,  7.62s/it]

[100]	Train's rmse: 0.0213069	Valid's rmse: 0.0246735
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214572	Valid's rmse: 0.0246449
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  35%|###5      | 7/20 [00:51<01:44,  8.05s/it][32m[I 2022-04-21 22:39:00,051][0m Trial 13 finished with value: 0.024644991860950625 and parameters: {'num_leaves': 170}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  35%|###5      | 7/20 [00:51<01:44,  8.05s/it]

[100]	Train's rmse: 0.0212774	Valid's rmse: 0.0246749
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214569	Valid's rmse: 0.024645
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  40%|####      | 8/20 [00:59<01:37,  8.16s/it][32m[I 2022-04-21 22:39:08,449][0m Trial 14 finished with value: 0.024644943960414577 and parameters: {'num_leaves': 148}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  40%|####      | 8/20 [00:59<01:37,  8.16s/it]

[100]	Train's rmse: 0.0212901	Valid's rmse: 0.0246741
Early stopping, best iteration is:
[1]	Train's rmse: 0.021457	Valid's rmse: 0.0246449
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  45%|####5     | 9/20 [01:07<01:28,  8.08s/it][32m[I 2022-04-21 22:39:16,332][0m Trial 15 finished with value: 0.02464494891213401 and parameters: {'num_leaves': 138}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  45%|####5     | 9/20 [01:07<01:28,  8.08s/it]

[100]	Train's rmse: 0.0212966	Valid's rmse: 0.0246745
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214571	Valid's rmse: 0.0246449
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  50%|#####     | 10/20 [01:15<01:18,  7.86s/it][32m[I 2022-04-21 22:39:23,703][0m Trial 16 finished with value: 0.024644938537409913 and parameters: {'num_leaves': 68}. Best is trial 7 with value: 0.02464464975344699.[0m
num_leaves, val_score: 0.024645:  50%|#####     | 10/20 [01:15<01:18,  7.86s/it]

[100]	Train's rmse: 0.0213471	Valid's rmse: 0.0246707
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214577	Valid's rmse: 0.0246449
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  55%|#####5    | 11/20 [01:18<00:58,  6.48s/it][32m[I 2022-04-21 22:39:27,046][0m Trial 17 finished with value: 0.024644641398044835 and parameters: {'num_leaves': 6}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  55%|#####5    | 11/20 [01:18<00:58,  6.48s/it]

[100]	Train's rmse: 0.0214322	Valid's rmse: 0.0246658
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214587	Valid's rmse: 0.0246446
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  60%|######    | 12/20 [01:21<00:44,  5.55s/it][32m[I 2022-04-21 22:39:30,471][0m Trial 18 finished with value: 0.02464464516211728 and parameters: {'num_leaves': 7}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  60%|######    | 12/20 [01:21<00:44,  5.55s/it]

[100]	Train's rmse: 0.0214291	Valid's rmse: 0.0246648
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214586	Valid's rmse: 0.0246446
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  65%|######5   | 13/20 [01:31<00:47,  6.84s/it][32m[I 2022-04-21 22:39:40,296][0m Trial 19 finished with value: 0.02464507101412355 and parameters: {'num_leaves': 253}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  65%|######5   | 13/20 [01:31<00:47,  6.84s/it]

[100]	Train's rmse: 0.021235	Valid's rmse: 0.0246737
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214565	Valid's rmse: 0.0246451
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  70%|#######   | 14/20 [01:38<00:41,  6.87s/it][32m[I 2022-04-21 22:39:47,220][0m Trial 20 finished with value: 0.02464483272949999 and parameters: {'num_leaves': 87}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  70%|#######   | 14/20 [01:38<00:41,  6.87s/it]

[100]	Train's rmse: 0.0213315	Valid's rmse: 0.0246726
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214575	Valid's rmse: 0.0246448
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  75%|#######5  | 15/20 [01:42<00:29,  5.80s/it][32m[I 2022-04-21 22:39:50,545][0m Trial 21 finished with value: 0.024644641398044835 and parameters: {'num_leaves': 6}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  75%|#######5  | 15/20 [01:42<00:29,  5.80s/it]

[100]	Train's rmse: 0.0214322	Valid's rmse: 0.0246658
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214587	Valid's rmse: 0.0246446
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  80%|########  | 16/20 [01:49<00:25,  6.46s/it][32m[I 2022-04-21 22:39:58,535][0m Trial 22 finished with value: 0.024644868569703618 and parameters: {'num_leaves': 92}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  80%|########  | 16/20 [01:49<00:25,  6.46s/it]

[100]	Train's rmse: 0.0213276	Valid's rmse: 0.0246736
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214574	Valid's rmse: 0.0246449
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  85%|########5 | 17/20 [01:55<00:18,  6.31s/it][32m[I 2022-04-21 22:40:04,498][0m Trial 23 finished with value: 0.02464484465134234 and parameters: {'num_leaves': 54}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  85%|########5 | 17/20 [01:55<00:18,  6.31s/it]

[100]	Train's rmse: 0.0213602	Valid's rmse: 0.0246698
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214579	Valid's rmse: 0.0246448
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


num_leaves, val_score: 0.024645:  90%|######### | 18/20 [01:59<00:10,  5.49s/it][32m[I 2022-04-21 22:40:08,075][0m Trial 24 finished with value: 0.024644641398044835 and parameters: {'num_leaves': 6}. Best is trial 17 with value: 0.024644641398044835.[0m
num_leaves, val_score: 0.024645:  90%|######### | 18/20 [01:59<00:10,  5.49s/it]

[100]	Train's rmse: 0.0214322	Valid's rmse: 0.0246658
Early stopping, best iteration is:
[1]	Train's rmse: 0.0214587	Valid's rmse: 0.0246446
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7173
[LightGBM] [Info] Number of data points in the train set: 416216, number of used features: 29
[LightGBM] [Info] Start training from score 0.000742
Training until validation scores don't improve for 100 rounds


KeyboardInterrupt: 