This notebook shows a simple end-to-end flow for getting started with the competition. I appreciate the Kaggle community.
I referred to the following notebooks.

(Reference)  
**Introduction to financial concepts and data**  
https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data  

**LGB Starter**  
https://www.kaggle.com/manels/lgb-starter/notebook

# Agenda

1. Import modules  
2. Common settings
3. Function Definition
4. Preprocessing  
  4-1. Book parquet data processing  
  4-2. Trade parquet data processing  
  4-3. Merge book and trade data  
  4-4. Train data preprocessing  
  4-5. Test data preprocessing  
5. Training  
  5-1. Training function1 - Light GBM  
  5-2. Cross Validation  
6. Evaluation
7. Prediction  
8. Submission


# 1. Import modules

In [None]:
import os
import sys
import time
import glob
from pathlib import Path

import pandas as pd
import numpy as np

# Parallel processing
from joblib import Parallel
from joblib import delayed

# Preprocess
from sklearn import preprocessing
from sklearn import model_selection

# Evaluation
from sklearn.metrics import r2_score

# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
#import lightgbm as lgb
import optuna.integration.lightgbm as lgb


# Others
import warnings
warnings.simplefilter("ignore")


# 2. Common Settings

In [None]:
# Root directory of the competition dataset; the CSV and parquet paths used
# below are all built relative to it.
data_path = Path('../input/optiver-realized-volatility-prediction')

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [None]:
# Name of the column holding the objective variable.
target = 'target'

# Submission settings: output file name and the identifier column written
# next to the prediction.
submit_file = 'submission.csv'
Id_column = 'row_id'

# 3. Functions Definition  

In [None]:
#　Log Return
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

# Realized Volatility
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

In [None]:
# WAP calculation
def wap_calculation1(df):
    """Level-1 weighted average price, each price weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    weighted_sum = bid_px * ask_sz + ask_px * bid_sz
    return weighted_sum / (bid_sz + ask_sz)

def wap_calculation2(df):
    """Level-2 weighted average price, each price weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    weighted_sum = bid_px * ask_sz + ask_px * bid_sz
    return weighted_sum / (bid_sz + ask_sz)
def wap_calculation3(df):
    """Level-1 weighted average price, each price weighted by its own side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    weighted_sum = bid_px * bid_sz + ask_px * ask_sz
    return weighted_sum / (bid_sz + ask_sz)

def wap_calculation4(df):
    """Level-2 weighted average price, each price weighted by its own side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    weighted_sum = bid_px * bid_sz + ask_px * ask_sz
    return weighted_sum / (bid_sz + ask_sz)

In [None]:
# my params
# ask_price1 - bid_price1
# ask_price2 - bid_price2
# ask_price2 - bid_price1
# ask_price1 - bid_price2
def price_ask1_bid1_diff(df):
    """Return the level-1 ask price minus the level-1 bid price."""
    ask, bid = df['ask_price1'], df['bid_price1']
    return ask - bid
def price_ask2_bid2_diff(df):
    """Return the level-2 ask price minus the level-2 bid price."""
    ask, bid = df['ask_price2'], df['bid_price2']
    return ask - bid
def price_ask2_bid1_diff(df):
    """Return the level-2 ask price minus the level-1 bid price."""
    ask, bid = df['ask_price2'], df['bid_price1']
    return ask - bid
def price_ask1_bid2_diff(df):
    """Return the level-1 ask price minus the level-2 bid price."""
    ask, bid = df['ask_price1'], df['bid_price2']
    return ask - bid
def price_wap1_wap2_diff(df):
    """Return the 'wap1' column minus the 'wap2' column."""
    first, second = df['wap1'], df['wap2']
    return first - second
def std_per_mean(df):
    """Return ``np.std(df)`` divided by ``np.mean(df)`` (coefficient of variation)."""
    spread = np.std(df)
    center = np.mean(df)
    return spread / center


In [None]:
# RMSPE
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of y_pred relative to y_true."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# 4. Preprocessing dataset

## 4-1. Book parquet data processing

In [None]:
def book_preprocessing(stock_id : int, data_type = 'train'):
    """Build realized-volatility features from one stock's order-book data.

    Reads ``book_{data_type}.parquet/stock_id={stock_id}/`` under the global
    ``data_path``, derives four weighted average prices (WAP) per row, turns
    each into log returns within its ``time_id`` and aggregates the log
    returns into one realized volatility per ``(stock_id, time_id)``.

    Parameters
    ----------
    stock_id : int
        Stock whose parquet partition is read.
    data_type : str, default 'train'
        Dataset name inserted into the parquet path (e.g. 'train', 'test').

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``time_id`` and ``realized_volatility1`` ..
        ``realized_volatility4``; one row per ``(stock_id, time_id)``.
    """
    # read data
    df = pd.read_parquet(data_path / f'book_{data_type}.parquet/stock_id={stock_id}/')

    # Log returns depend on row order, so fix it explicitly instead of
    # relying on the order stored in the file (trade_preprocessing does the
    # same for the trade data).
    df = df.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # set stock_id
    df['stock_id'] = stock_id

    # WAP calculation
    df['wap1'] = wap_calculation1(df)
    df['wap2'] = wap_calculation2(df)
    df['wap3'] = wap_calculation3(df)
    df['wap4'] = wap_calculation4(df)

    # Log return within each time_id. transform() always returns a result
    # aligned to df's index; groupby().apply() may prepend the group key to
    # the index depending on the pandas version, which breaks the assignment.
    for level in (1, 2, 3, 4):
        df[f'log_return{level}'] = (
            df.groupby(['time_id'])[f'wap{level}'].transform(log_return).fillna(0)
        )

    # NOTE(review): the row-level features below are computed but are not part
    # of the returned frame yet; they only become useful once an aggregation
    # over them is added.
    # Calculate wap balance
    df['wap_balance12'] = abs(df['wap1'] - df['wap2'])
    df['wap_balance34'] = abs(df['wap3'] - df['wap4'])
    # Calculate spread
    df['price_spread1'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Realized volatility of each log-return series per stock_id and time_id
    feat_to_calc_rv = ['log_return1', 'log_return2', 'log_return3', 'log_return4']
    return_values = (
        df.groupby(['stock_id', 'time_id'])[feat_to_calc_rv]
        .agg(realized_volatility)
        .reset_index()
        .rename(
            columns={
                'log_return1': 'realized_volatility1',
                'log_return2': 'realized_volatility2',
                'log_return3': 'realized_volatility3',
                'log_return4': 'realized_volatility4',
            }
        )
    )

    return return_values

Check data content of one sample with book_preprocessing function  
e.g. stock_id = 97

In [None]:
# Run the book pipeline for a single stock (stock_id = 97) and display the result.
df_book = book_preprocessing(97, 'train')
df_book

## 4-2. Trade parquet data processing

In [None]:
def trade_preprocessing(stock_id : int, data_type = 'train'):
    """Build a realized-volatility feature from one stock's trade data.

    Reads ``trade_{data_type}.parquet/stock_id={stock_id}/`` under the global
    ``data_path``, computes log returns of ``price`` within each ``time_id``
    and aggregates them into one realized volatility per
    ``(stock_id, time_id)``.

    Parameters
    ----------
    stock_id : int
        Stock whose parquet partition is read.
    data_type : str, default 'train'
        Dataset name inserted into the parquet path (e.g. 'train', 'test').

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``time_id`` and ``trade_log_return1`` (which,
        despite its name, holds the realized volatility of the trade price).
    """
    # read data
    df = pd.read_parquet(data_path / f'trade_{data_type}.parquet/stock_id={stock_id}/')

    # Log returns depend on row order, so sort chronologically first.
    df = df.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # set stock_id
    df['stock_id'] = stock_id

    # Log return within each time_id. transform() always returns a result
    # aligned to df's index; groupby().apply() may prepend the group key to
    # the index depending on the pandas version, which breaks the assignment.
    df['trade_log_return1'] = (
        df.groupby(by=['time_id'])['price'].transform(log_return).fillna(0)
    )

    # Realized volatility per stock_id and time_id
    df = (
        df.groupby(['stock_id', 'time_id'])[['trade_log_return1']]
        .agg(realized_volatility)
        .reset_index()
    )

    # TODO: also build features from the other trade columns (size, order_count).

    return df

Check data content of one sample with trade_preprocessing function
e.g. stock_id = 0

In [None]:
# Run the trade pipeline for a single stock (stock_id = 0) and show the first rows.
df_trade = trade_preprocessing(0,'train')
df_trade.head()

## 4-3. Merge book and trade data  
Merge the two datasets produced by the book_preprocessing and trade_preprocessing functions.

In [None]:
def get_stock_stat(stock_id : int, data_type = 'train'):
    """Return the merged book and trade statistics for one stock.

    The book statistics are the left side of the merge on
    ``['stock_id', 'time_id']``; every missing value in the merged frame is
    replaced with -999.
    """
    book_features = book_preprocessing(stock_id, data_type)
    trade_features = trade_preprocessing(stock_id, data_type)

    merged = book_features.merge(
        trade_features,
        on=['stock_id', 'time_id'],
        how='left',
    )
    return merged.fillna(-999)

In [None]:
def get_dataSet(stock_ids : list, data_type = 'train'):
    """Compute get_stock_stat for every stock in parallel and stack the results.

    Parameters
    ----------
    stock_ids : list
        Stock ids to process.
    data_type : str, default 'train'
        Forwarded to get_stock_stat.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-stock frames with a fresh index.
    """
    # One delayed get_stock_stat call per stock, run with n_jobs=-1.
    jobs = (delayed(get_stock_stat)(stock_id, data_type) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)

    # Stack vertically (axis=0 is the default) and renumber the rows.
    return pd.concat(per_stock_frames, ignore_index=True)

## 4-4. Train data preprocessing

In [None]:
# Load the training labels and build row_id as "<stock_id>-<time_id>".
train=pd.read_csv(data_path / 'train.csv')
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
display(train.head())
print('train data shape:', train.shape)

In [None]:
# def miff_max_min(x):
#     return max(x) - min(x)

# def add_my_param(data):

#     '''
#     データを追加する
#     '''
#     data_ = data.copy()
#     # targetはない場合がある
#     try:
#         data_ = data_.drop(['target'],axis=1)
#     except Exception as e:
#         pass

# #     # 特定のデータの最大値を計算
# #     # 最大値
# #     data_ = data_.merge(
# #         data.groupby('stock_id').max(),
# #         on='stock_id',
# #         suffixes=['', '_max'],
# #         how='left'
# #     )

# #     # 最小値
# #     data_ = data_.merge(
# #         data.groupby('stock_id').min(),
# #         on='stock_id',
# #         suffixes=['', '_min'],
# #         how='left'
# #     )

#     # 標準偏差
#     data_ = data_.merge(
#         data.groupby('stock_id').std(),
#         on='stock_id',
#         suffixes=['', '_std'],
#         how='left'
#     )
    
# #     # 最大 - 最小
# #     data_ = data_.merge(
# #         data.groupby('stock_id').min() - data.groupby('stock_id').min(),
# #         on='stock_id',
# #         suffixes=['', '_diffmaxmin'],
# #         how='left'
# #     )

# #     # 中央値
# #     data_ = data_.merge(
# #         data.groupby('stock_id').median(),
# #         on='stock_id',
# #         suffixes=['', '_median'],
# #         how='left'
# #     )
    
# #     # 平均値
# #     data_ = data_.merge(
# #         data.groupby('stock_id').mean(),
# #         on='stock_id',
# #         suffixes=['', '_mean'],
# #         how='left'
# #     )
    
#     # skew
#     data_ = data_.merge(
#         data.groupby('stock_id').skew(),
#         on='stock_id',
#         suffixes=['', '_skew'],
#         how='left'
#     )
    
#     # sem
#     data_ = data_.merge(
#         data.groupby('stock_id').sem(),
#         on='stock_id',
#         suffixes=['', '_sem'],
#         how='left'
#     )    
#     return data_

In [None]:
# # データセット取得
# train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), data_type = 'train')

# # パラメータ追加
# # train_stock_stat_df = add_my_param(train_stock_stat_df)

# # Merge train with train_stock_stat_df
# train = pd.merge(train, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')

# train

In [None]:
# Save the preprocessed frame as a pickle once, then load and reuse it.
# train.to_pickle('train.pkl')
# NOTE(review): this replaces the train frame loaded from train.csv above; the
# pickle presumably already contains the merged features — confirm.
# Unpickling can execute arbitrary code, so only load pickles from a trusted source.
train = pd.read_pickle('../input/from-ver-37/train.pkl')
train

# **4-4-1.LOFO importance**

In [None]:
# you need to activate internet connection!
'''!pip install git+https://github.com/aerdem4/lofo-importance

import pandas as pd
from sklearn.model_selection import KFold
from lofo import LOFOImportance, Dataset, plot_importance
%matplotlib inline

target="target"

sample_df = train.sample(frac=0.01, random_state=0)
#sample_df.sort_values("AvSigVersion", inplace=True)

# define the binary target and the features
cv = KFold(n_splits=4, shuffle=False, random_state=0)
#target = "HasDetections"
features = [col for col in train.columns if col != target]
#features = [col for col in train.columns]


# define the binary target and the features
dataset = Dataset(df=sample_df, target="target", features=[col for col in sample_df.columns if col != target])

# get the mean and standard deviation of the importances in pandas format
lofo = LOFOImportance(dataset, cv=cv, scoring="neg_mean_absolute_error")
importance_df = lofo.get_importance()

# plot the means and standard deviations of the importances
plot_importance(importance_df, figsize=(12, 40))'''

## 4-5. Test data Preprocessing

In [None]:
# Load the test data.
test = pd.read_csv(data_path /'test.csv')
# Combine stock_id and time_id into the row_id required in the submission.
test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
# Inspect the result.
test

In [None]:
# Build the book/trade statistics (WAP, log returns, realized volatility)
# for every stock that appears in the test set.
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), data_type = 'test')

# Additional features such as min / max / std would be added here (currently disabled).
# test_stock_stat_df = add_my_param(test_stock_stat_df)

# Merge the test rows with the statistics used for inference.
# NOTE(review): rows without statistics are filled with 0 here, whereas
# get_stock_stat fills missing trade statistics with -999 — confirm this matches
# how the training frame was built.
test = pd.merge(test, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left').fillna(0)

# Inspect the result.
test

# 5.Training

## 5-1. Training function1 - Light GBM  

In [None]:
# Parameters of Light GBM
# first try
# params_lgbm = {
#         'task': 'train',
#         'boosting_type': 'gbdt',
#         'learning_rate': 0.01,
#         'objective': 'regression',
#         'metric': 'None',
#         'max_depth': -1,
#         'n_jobs': -1,
#         'feature_fraction': 0.7,
#         'bagging_fraction': 0.7,
#         'lambda_l2': 1,
#         'verbose': -1
#         #'bagging_freq': 5
# }

In [None]:
# Define loss function for lightGBM training
def feval_RMSPE(preds, train_data):
    """Custom LightGBM evaluation function reporting RMSPE.

    Returns the tuple ``(name, value, is_higher_better)`` with the value
    rounded to 5 decimals and ``is_higher_better`` set to False.
    """
    y_true = train_data.get_label()
    score = round(rmspe(y_true=y_true, y_pred=preds), 5)
    return 'RMSPE', score, False

The following function trains a LightGBM model. If you would like to try another model, define another function and call it instead.

In [None]:
# training function
def light_gbm(X_train, y_train, X_val ,y_val,cats, _pred_name, n_rounds, val_index):
    """Train LightGBM on one CV fold, score it and predict the test set.

    Reads the globals ``params_lgbm``, ``test`` and ``features_columns``.

    Parameters
    ----------
    X_train, y_train : training features and targets of the fold.
    X_val, y_val : validation features and targets of the fold.
    cats : list of str
        Column names passed to LightGBM as categorical features.
    _pred_name : str
        Unused; kept for interface compatibility.
    n_rounds : int
        Number of boosting rounds passed to ``lgb.train``.
    val_index : array-like
        Unused; kept for interface compatibility (validation predictions are
        made on ``X_val``).

    Returns
    -------
    tuple
        ``(score, test_preds, model, preds_val)``: validation RMSPE rounded
        to 5 decimals, test predictions clipped to ``[0, 1e10]``, the trained
        model and the validation predictions.
    """
    print(cats)

    # Create datasets. Weighting each row by 1 / y**2 turns the squared error
    # into a squared *percentage* error, in line with the RMSPE score.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cats, weight=1/np.power(y_train,2))
    val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cats, weight=1/np.power(y_val,2))

    # training
    model = lgb.train(params_lgbm,
                      train_data,
                      n_rounds,
                      valid_sets=val_data,
                      feval=feval_RMSPE,
                      verbose_eval=10,
                     )

    # Predict on the validation data that was passed in. The previous code
    # re-selected rows from the global train frame via val_index, so any
    # caller supplying a different X_val was scored on the wrong rows.
    preds_val = model.predict(X_val)

    # RMSPE calculation
    score = round(rmspe(y_true = y_val, y_pred = preds_val),5)

    # Prediction w/ test data
    test_preds = model.predict(test[features_columns]).clip(0,1e10)

    # delete dataset
    del train_data, val_data

    return score, test_preds, model, preds_val

In [None]:
# training function
import xgboost as xgb
from sklearn import preprocessing
def my_xgboost(X_train, y_train, X_val ,y_val,cats, _pred_name, n_rounds, val_index):
    """Train XGBoost on one CV fold, score it and predict the test set.

    Reads the globals ``test`` and ``features_columns``.

    Parameters
    ----------
    X_train, y_train : training features and targets of the fold.
    X_val, y_val : validation features and targets of the fold.
    cats : list of str
        Only printed; every column is handed to XGBoost as-is.
    _pred_name : str
        Unused; kept for interface compatibility.
    n_rounds : int
        Maximum number of boosting rounds.
    val_index : array-like
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(score, test_preds, model, preds_val)``: validation RMSPE rounded
        to 5 decimals, test predictions clipped to ``[0, 1e10]``, the trained
        booster and the validation predictions.
    """
    print(X_train.columns)
    print(cats)

    # Create datasets; the watchlist is evaluated during training and drives
    # early stopping.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    # training
    model = xgb.train(
        {
            'objective': 'reg:squarederror',
            # NOTE(review): 'silent' is deprecated in recent XGBoost releases in
            # favour of 'verbosity' — confirm against the installed version.
            'silent': 1,
            'random_state': 1234,
            # metric reported on the watchlist (RMSE)
            'eval_metric': 'rmse',
            # numeric hyperparameters are passed as numbers, not strings
            'max_depth': 8,
            'eta': 0.2,
        },
        dtrain,
        n_rounds,
        early_stopping_rounds=500,
        evals=watchlist,
    )

    # Use the same tree limit for validation and test predictions so the
    # reported score describes the model that produces the test output.
    best_limit = model.best_ntree_limit

    # Prediction w/ validation data
    preds_val = model.predict(dvalid, ntree_limit=best_limit)

    # RMSPE calculation
    score = round(rmspe(y_true = y_val, y_pred = preds_val),5)

    # Prediction w/ test data
    dtest = xgb.DMatrix(test[features_columns])
    test_preds = model.predict(dtest, ntree_limit=best_limit).clip(0,1e10)
    print('pred results:{}'.format(test_preds))

    # delete dataset
    del dtrain, dvalid, watchlist, dtest

    return score, test_preds, model, preds_val

In [None]:
# Display the test frame that the training functions predict on.
test

## 5-1-1
### prepare data to train

In [None]:
# Columns handed to the training functions as categorical features.
cats = ['stock_id']

# Columns that must not be used as model inputs (identifier and label).
drop_feat = ['row_id', 'target']

# Features to train on: every column of train except the excluded ones.
# list.remove raises ValueError if an expected column is missing from train.
features_columns = train.columns.values.tolist()
for excluded in drop_feat:
    features_columns.remove(excluded)

print(f'Train dataset columns : {len(features_columns)} features')

## 5-1-2
### optuna

In [None]:

# 2021/09/16 by optuna
# params_lgbm= {
#     'objective': 'mean_squared_error',
#      'metric': 'l1',
#      'verbosity': -1,
#      'boosting_type': 'gbdt',
#      'feature_pre_filter': False,
#      'lambda_l1': 2.5812494450187865e-05,
#      'lambda_l2': 0.0005754010268853543,
#      'num_leaves': 234,
#      'feature_fraction': 0.42,
#      'bagging_fraction': 0.9352921232220405,
#      'bagging_freq': 7,
#      'min_child_samples': 5,
#      'num_iterations': 200,
#      'early_stopping_round': 50
# }

# # 2021/09/17 by optuna more params add_myparam
# params_lgbm = {
#     'objective': 'mean_squared_error',
#     'metric': 'l1',
#     'verbosity': -1,
#     'boosting_type': 'gbdt',
#     'feature_pre_filter': False,
#     'lambda_l1': 0.0,
#     'lambda_l2': 0.0,
#     'num_leaves': 105,
#     'feature_fraction': 1.0,
#     'bagging_fraction': 1.0,
#     'bagging_freq': 0,
#     'min_child_samples': 50,
#     'num_iterations': 200,
#     'early_stopping_round': 50
# }

# 2021/09/17 by optuna more params add_myparam
# params_lgbm = {
#     'objective': 'mean_squared_error',
#     'metric': 'l1',
#     'verbosity': -1,
#     'boosting_type': 'gbdt',
#     'feature_pre_filter': False,
#     'lambda_l1': 0.0,
#     'lambda_l2': 0.0,
#     'num_leaves': 177,
#     'feature_fraction': 0.8,
#     'bagging_fraction': 1.0,
#     'bagging_freq': 0,
#     'min_child_samples': 20,
#     'num_iterations': 200,
#     'early_stopping_round': 50
# }

# 2021/09/17 by optuna more params with std_per_mean
# params_lgbm = {
#     'objective': 'mean_squared_error',
#     'metric': 'l1',
#     'verbosity': -1,
#     'boosting_type': 'gbdt',
#     'feature_pre_filter': False,
#     'lambda_l1': 1.535303758262475e-07,
#     'lambda_l2': 0.0066570427899383285,
#     'num_leaves': 256,
#     'feature_fraction': 0.41600000000000004,
#     'bagging_fraction': 1.0,
#     'bagging_freq': 0,
#     'min_child_samples': 20,
#     'num_iterations': 200,
#     'early_stopping_round': 50
# }

# # Ver24
# {'objective': 'mean_squared_error',
#  'metric': 'l1',
#  'verbosity': -1,
#  'boosting_type': 'gbdt',
#  'feature_pre_filter': False,
#  'lambda_l1': 0.0,
#  'lambda_l2': 0.0,
#  'num_leaves': 179,
#  'feature_fraction': 1.0,
#  'bagging_fraction': 1.0,
#  'bagging_freq': 0,
#  'min_child_samples': 20,
#  'num_iterations': 200,
#  'early_stopping_round': 50}

# ver25
# params_lgbm = {
#     'objective': 'mean_squared_error',
#     'metric': 'l1',
#     'verbosity': -1,
#     'boosting_type': 'gbdt',
#     'feature_pre_filter': False,
#     'lambda_l1': 0.00013301130015106456,
#     'lambda_l2': 1.083683899969528,
#     'num_leaves': 203,
#     'feature_fraction': 0.8999999999999999,
#     'bagging_fraction': 0.697954364796923,
#     'bagging_freq': 4,
#     'min_child_samples': 20,
#     'num_iterations': 200,
#     'early_stopping_round': 50
# }

# ver37
# LightGBM parameters read by light_gbm(); do_cross_validation_lightgbm()
# writes its regularisation settings into this dict before each CV run.
# NOTE(review): `lgb` is imported from optuna.integration.lightgbm, so
# lgb.train may retune some of these values — confirm.
params_lgbm = {
    'objective': 'mean_squared_error',
    'metric': 'l1',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'feature_pre_filter': False,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'num_leaves': 19,
    'feature_fraction': 1.0,
    'bagging_fraction': 0.6511720136604726,
    'bagging_freq': 1,
    'min_child_samples': 20,
    # NOTE(review): n_rounds is also passed to lgb.train as the round count;
    # check which of the two iteration settings takes effect.
    'num_iterations': 200,
    'early_stopping_round': 50
}

In [None]:
'''
optuna for lightgbm
'''
# params = {
#     'objective': 'mean_squared_error',
#     'metric': 'mae',
#     "verbosity": -1,
#     "boosting_type": "gbdt",
# }

# best_params, history = {}, []

# lgb_train = lgb.Dataset(train[features_columns], train[target])
# lgb_eval = lgb.Dataset(test[features_columns], test[target], reference=lgb_train)

# # LightGBM学習
# gbm = lgb.train(params,
#                 lgb_train,
#                 num_boost_round=200,
#                 valid_sets=[lgb_train, lgb_eval],
#                 early_stopping_rounds=50
#                )

# best_params = gbm.params
# params_lgbm = best_params
# params_lgbm

In [None]:
# '''
# optuna for xgboost
# '''
# import optuna
# import xgboost as xgb

# def objective_wrap(_train, _labels):
#     def objective(trial):

#         params = {
#             "max_depth": trial.suggest_int("max_depth", 6, 9),
#             "min_child_weight": 1,
#             "eta": trial.suggest_loguniform("eta", 0.01, 1.0),
#             "tree_method": "exact",
#             "eval_metric": "rmse",
#             "predictor": "cpu_predictor"  
#         }

#     #     a = train[features_columns]
#     #     b = train[target].values
#         dtrain = xgb.DMatrix(_train, _labels)

#         cv_results = xgb.cv(
#             params,
#             dtrain,
#             num_boost_round=1000,
#             seed=0,
#             nfold=5, # CVの分割数
#             metrics={"rmse"},
#             early_stopping_rounds=5
#         )

#         return cv_results["test-rmse-mean"].min()
#     return objective


# study = optuna.create_study()
# study.optimize(
#     objective_wrap(
#         train[features_columns],
#         train[target].values
#     ),
#     n_trials = 40
# )

## 5-2. Cross Validation

In [None]:
def do_cross_validation_xgb(n_folds, n_rounds, pred_name, reg_alpha=0, reg_lambda=0):
    """Run shuffled K-fold cross validation with my_xgboost.

    Reads the globals ``train``, ``features_columns``, ``target`` and
    ``cats``.

    Parameters
    ----------
    n_folds : int
        Number of CV folds.
    n_rounds : int
        Maximum number of boosting rounds per fold.
    pred_name : str
        Model label used in the log output and forwarded to my_xgboost.
    reg_alpha, reg_lambda : float, default 0
        Accepted for signature parity with do_cross_validation_lightgbm but
        not forwarded to XGBoost.

    Returns
    -------
    tuple
        ``(pred_result, scores_folds, test_preds)``: per-fold test
        predictions, per-fold validation RMSPE scores, and the test
        predictions of the last fold.
    """
    scores_folds = []
    pred_result = []
    test_preds = None

    # K-fold ensemble training
    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state = 42)

    # --- Cross Validation ---
    # NOTE: kf.split yields positions that are used as .loc labels below, so
    # train is assumed to carry a default 0..n-1 index.
    for cv_trial, (train_index, val_index) in enumerate(kf.split(range(len(train))), start=1):

        print(f'CV trial : {cv_trial} /{n_folds}')

        # Divide dataset into train and validation data for this fold
        X_train = train.loc[train_index, features_columns]
        y_train = train.loc[train_index, target].values
        X_val = train.loc[val_index, features_columns]
        y_val = train.loc[val_index, target].values

        # train with XGBoost; my_xgboost already returns the test predictions
        # of this fold's model, so they are not computed a second time here
        rmspe_score, test_preds, model, preds_val = my_xgboost(X_train, y_train, X_val ,y_val,cats, pred_name, n_rounds, val_index)

        # record score and test predictions of this fold
        scores_folds.append(rmspe_score)
        pred_result.append(test_preds)

        # Each validation Summary
        print(f'Fold-{cv_trial} train score. Model-{pred_name} RMSPE: {rmspe_score}')
        print('test pred results:{}'.format(test_preds))

    return pred_result, scores_folds, test_preds

In [None]:
def do_cross_validation_lightgbm(n_folds, n_rounds, pred_name, reg_alpha=0, reg_lambda=0):
    """Run shuffled K-fold cross validation with light_gbm.

    Reads the globals ``train``, ``features_columns``, ``target`` and
    ``cats``, and updates the global ``params_lgbm`` in place.

    Parameters
    ----------
    n_folds : int
        Number of CV folds.
    n_rounds : int
        Number of boosting rounds passed to light_gbm.
    pred_name : str
        Model label used in the log output and forwarded to light_gbm.
    reg_alpha : float, default 0
        L1 regularisation strength.
    reg_lambda : float, default 0
        L2 regularisation strength.

    Returns
    -------
    tuple
        ``(pred_result, scores_folds, test_preds)``: per-fold test
        predictions, per-fold validation RMSPE scores, and the test
        predictions of the last fold.
    """
    # K-fold ensemble training
    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state = 42)

    scores_folds = []
    pred_result = []
    test_preds = None

    # LightGBM documents reg_alpha / reg_lambda as aliases of lambda_l1 /
    # lambda_l2. params_lgbm already defines the canonical keys, so write to
    # those instead of adding alias keys that conflict with them.
    params_lgbm['lambda_l1'] = reg_alpha
    params_lgbm['lambda_l2'] = reg_lambda

    # Row labels for the importance table: the features the model is trained on.
    feature_index = pd.Index(features_columns)

    # --- Cross Validation ---
    # NOTE: kf.split yields positions that are used as .loc labels below, so
    # train is assumed to carry a default 0..n-1 index.
    for cv_trial, (train_index, val_index) in enumerate(kf.split(range(len(train))), start=1):

        print(f'CV trial : {cv_trial} /{n_folds}')

        # Divide dataset into train and validation data for this fold
        X_train = train.loc[train_index, features_columns]
        y_train = train.loc[train_index, target].values
        X_val = train.loc[val_index, features_columns]
        y_val = train.loc[val_index, target].values

        # train with Light GBM; light_gbm already returns the test predictions
        # of this fold's model, so they are not computed a second time here
        rmspe_score, test_preds, model, preds_val = light_gbm(X_train, y_train, X_val ,y_val,cats, pred_name, n_rounds, val_index)

        # record score and test predictions of this fold
        scores_folds.append(rmspe_score)
        pred_result.append(test_preds)

        # Each validation Summary
        print(f'Fold-{cv_trial} train score. Model-{pred_name} RMSPE: {rmspe_score}')
        print('test pred results:{}'.format(test_preds))

        # Feature importance output
        importance = model.feature_importance()
        print(importance)
        print(feature_index)
        display(pd.DataFrame(importance, index=feature_index, columns=['importance']))

    return pred_result, scores_folds, test_preds

# 6. Evaluation

In [None]:
# Show every row of the feature-importance tables displayed during CV.
# The fully qualified option name is used because the bare pattern "max_rows"
# can match more than one registered option in newer pandas releases, which
# makes set_option raise an OptionError.
pd.set_option("display.max_rows", None)

# Result containers keyed by model name; each value collects one entry per
# (reg_alpha, reg_lambda) combination.
scores_folds = {}
train_pred_result = {}
test_pred_result = {}

# for xgboost
model_name_xgb1 = 'xgb1'
pred_name_xgb1 = f'pred_{model_name_xgb1}'

# for lightgbm
model_name_lgb1 = 'lgb1'
pred_name_lgb1 = f'pred_{model_name_lgb1}'

for model_name in (model_name_xgb1, model_name_lgb1):
    scores_folds[model_name] = []
    train_pred_result[model_name] = []
    test_pred_result[model_name] = []

# Search loop over the regularisation parameters (overfitting control)
reg_alphas =  [0.1]
reg_lambdas = [0]
cv_count = 4
n_rounds = 10000
for reg_alpha in reg_alphas:
    for reg_lambda in reg_lambdas:
        # cross validation
        # xgb
        pr, sf, tr = do_cross_validation_xgb(cv_count, n_rounds, pred_name_xgb1, reg_alpha, reg_lambda)
        scores_folds[model_name_xgb1].append(sf)
        train_pred_result[model_name_xgb1].append(pr)
        test_pred_result[model_name_xgb1].append(tr)

        # lgb (same fold count as xgb so the two models are comparable)
        pr, sf, tr = do_cross_validation_lightgbm(cv_count, n_rounds, pred_name_lgb1, reg_alpha, reg_lambda)
        scores_folds[model_name_lgb1].append(sf)
        train_pred_result[model_name_lgb1].append(pr)
        test_pred_result[model_name_lgb1].append(tr)

# Restore the display limits: undo the unlimited row display set above and
# keep the notebook-wide column limit.
pd.reset_option("display.max_rows")
pd.options.display.max_columns = 50

In [None]:
# Ensemble the models

# Print all results
print('='*8+'\n'+'print all result')
print(train_pred_result)
print(scores_folds)

# Average the predictions and the RMSPE scores.
# Despite its name, train_pred_result[model][0] holds one *test* prediction
# array per CV fold. Each model's prediction is the mean over its folds and
# the ensemble is the mean over models. Previously only the last fold's
# prediction of each model reached the submission.
# Only the first (reg_alpha, reg_lambda) combination (index 0) is used.
per_model_preds = []
per_model_scores = []
for model_name in train_pred_result:
    fold_test_preds = np.asarray(train_pred_result[model_name][0])
    per_model_preds.append(fold_test_preds.mean(axis=0))
    per_model_scores.append(np.mean(scores_folds[model_name][0]))

test_last_result = np.mean(per_model_preds, axis=0)
last_score = float(np.mean(per_model_scores))
count = len(per_model_preds)

# Kept under its historical name; it has always held the fold-averaged test
# predictions rather than anything computed on the training rows.
train_last_result = pd.Series(test_last_result)

# Display
print('='*8+'\n'+'print mean result')
print('train last result:{}'.format(train_last_result))
print('test last result:{}'.format(test_last_result))
print('last score:{}'.format(last_score))

# Store the ensembled prediction in the target column (assigned by position).
test[target]=test_last_result

display(test[[Id_column, target]].head(2))

In [None]:
# Display the test frame including the predicted target column.
test

# 7. Submission

In [None]:
# Write the row_id and predicted target columns to the submission file, without the index.
test[[Id_column,target]].to_csv(submit_file, index = False)