# Homebrew Kernel
Things to check
- Effects of clustering (stock_id)
- Effects of tau
- Effects of KNN + tau

This kernel is an English translation of my original Japanese kernel.
If you prefer the Japanese version, please see here.  
[https://www.kaggle.com/satoshimts/tau-vs-no-tau-vs-knn-tau]

# Parameters used

This is a very detailed part that should be tuned every time if possible, but I'll leave it at that.


seed = 29  
params = {  
    'learning_rate': 0.1,          
    'lambda_l1': 2,  
    'lambda_l2': 7,  
    'num_leaves': 800,  
    'min_sum_hessian_in_leaf': 20,  
    'feature_fraction': 0.8,  
    'feature_fraction_bynode': 0.8,  
    'bagging_fraction': 0.9,  
    'bagging_freq': 42,  
    'min_data_in_leaf': 700,  
    'max_depth': 4,  
    'seed': seed,  
    'feature_fraction_seed': seed,  
    'bagging_seed': seed,  
    'drop_seed': seed,  
    'data_random_seed': seed,  
    'objective': 'rmse',  
    'boosting': 'gbdt',  
    'verbosity': -1,  
    'n_jobs': -1,  
}   

# Baseline


In [None]:
# Root directory of the competition data. Paths are built from it by plain
# string concatenation, so the trailing slash is required.
data_dir = '../input/optiver-realized-volatility-prediction/'

In [None]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
from sklearn.cluster import KMeans
warnings.filterwarnings('ignore')
pd.set_option('max_columns', 300)

In [None]:
# Function to calculate first WAP
def calc_wap1(df):
    """Return the level-1 weighted average price for every row of ``df``.

    Each side's price is weighted by the size quoted on the opposite side,
    and the sum is divided by the combined level-1 size.
    """
    bid_leg = df['bid_price1'] * df['ask_size1']
    ask_leg = df['ask_price1'] * df['bid_size1']
    depth = df['bid_size1'] + df['ask_size1']
    return (bid_leg + ask_leg) / depth

# Function to calculate second WAP
def calc_wap2(df):
    """Return the level-2 weighted average price for every row of ``df``.

    Same formula as the level-1 version, applied to the ``*2`` price and
    size columns.
    """
    bid_leg = df['bid_price2'] * df['ask_size2']
    ask_leg = df['ask_price2'] * df['bid_size2']
    depth = df['bid_size2'] + df['ask_size2']
    return (bid_leg + ask_leg) / depth

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_values = np.log(series)
    return log_values.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Return the square root of the sum of the squared values of ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

# Function to count unique elements of a series
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct = np.unique(series)
    return distinct.shape[0]

In [None]:
# Function to read our base train and test set
def read_train_test():
    """Load ``train.csv`` and ``test.csv`` and add a ``row_id`` key to both.

    ``row_id`` is ``"<stock_id>-<time_id>"`` and is the key used to join the
    book and trade features onto these frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    frames = []
    for csv_name in ('/train.csv', '/test.csv'):
        frame = pd.read_csv(data_dir + csv_name)
        # Create a key to merge with book and trade data
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
        frames.append(frame)
    train, test = frames
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

In [None]:
# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for a single stock.

    Parameters
    ----------
    file_path : str
        Path of the book parquet partition. The stock id is read from the
        text that follows the first ``=`` in the path.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` holding the aggregated features for the whole
        bucket and for the windows starting at 450, 300 and 150 seconds, plus
        a ``row_id`` column of the form ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate log returns inside each time_id.
    # transform() always returns a result aligned to df's index. With
    # groupby().apply() recent pandas versions prepend the group key to the
    # index, which makes the column assignment fail.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'price_spread2':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std],
        "bid_ask_spread":[np.sum, np.mean, np.std],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the two-level column names ("<column>_<stat>"; the key becomes "time_id_")
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Whole-bucket stats first, then the shorter windows in the order
    # 450, 300, 150 so the resulting column order stays stable.
    # (Author's note: presumably 100-second steps did not give good results.)
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    for window in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window, add_suffix = True)
        window_key = 'time_id__' + str(window)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of the key is redundant after the merge
        df_feature.drop([window_key], axis = 1, inplace = True)

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature


In [None]:
# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for a single stock.

    Parameters
    ----------
    file_path : str
        Path of the trade parquet partition. The stock id is read from the
        text that follows the first ``=`` in the path.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with every feature column prefixed by
        ``trade_``, plus a ``row_id`` column of the form
        ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    # transform() always returns a result aligned to df's index. With
    # groupby().apply() recent pandas versions prepend the group key to the
    # index, which makes the column assignment fail.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, realized_volatility, np.mean, np.std, np.max, np.min],
        'order_count':[np.mean,np.sum,np.max],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the two-level column names ("<column>_<stat>"; the key becomes "time_id_")
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Author's note: these hand-made features are reported to affect the
    # score considerably; why they help is not understood yet.
    def tendency(price, vol):
        # Price change relative to the price after the move, in percent
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        # Weight every relative move by the size of the trade that produced it
        power = np.sum(val*vol[1:])
        return(power)

    lis = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # time_id. sort=False visits the groups in order of first appearance.
    for n_time_id, df_id in df.groupby('time_id', sort = False):
        price = df_id['price'].values
        size = df_id['size'].values
        price_mean = np.mean(price)
        price_diff = np.diff(price)

        lis.append({
            'time_id': n_time_id,
            # Size-weighted sum of relative price moves
            'tendency': tendency(price, size),
            # Number of trades priced above / below the mean price
            'f_max': np.sum(price > price_mean),
            'f_min': np.sum(price < price_mean),
            # Number of upward / downward price moves
            'df_max': np.sum(price_diff > 0),
            'df_min': np.sum(price_diff < 0),
            # Median absolute deviation from the mean price
            'abs_diff': np.median(np.abs(price - price_mean)),
            # Mean of the squared prices
            'energy': np.mean(price**2),
            # Interquartile range of the price
            'iqr_p': np.percentile(price,75) - np.percentile(price,25),
            # Same three statistics for the trade size; note that energy_v is
            # a sum, whereas the price version is a mean
            'abs_diff_v': np.median(np.abs(size - np.mean(size))),
            'energy_v': np.sum(size**2),
            'iqr_p_v': np.percentile(size,75) - np.percentile(size,25),
        })

    df_lr = pd.DataFrame(lis)

    # Whole-bucket stats, then the hand-made features, then the shorter
    # windows in the order 450, 300, 150 so the column order stays stable.
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    df_feature.drop(['time_id'], axis = 1, inplace = True)
    for window in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window, add_suffix = True)
        window_key = 'time_id__' + str(window)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of the key is redundant after the merge
        df_feature.drop([window_key], axis = 1, inplace = True)

    # Prefix everything so the names cannot clash with the book features
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature


In [None]:
# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time summaries of the volatility columns.

    For every realized-volatility column the mean, std, max and min are
    computed once grouped by ``stock_id`` (suffix ``_stock``) and once grouped
    by ``time_id`` (suffix ``_time``), then left-joined back onto ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra summary columns; the input frame is not modified.
    """
    # Get realized volatility columns
    vol_cols = ['log_return1_realized_volatility', 'log_return2_realized_volatility', 'log_return1_realized_volatility_450', 'log_return2_realized_volatility_450', 
                'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300', 'log_return1_realized_volatility_150', 'log_return2_realized_volatility_150', 
                'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_450', 'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_150']
    stat_names = ['mean', 'std', 'max', 'min']

    def _group_stats(key, tag):
        # Aggregate, flatten the two-level column names, then tag them
        stats = df.groupby([key])[vol_cols].agg(stat_names).reset_index()
        stats.columns = ['_'.join(parts) for parts in stats.columns]
        return stats.add_suffix('_' + tag)

    per_stock = _group_stats('stock_id', 'stock')
    per_time = _group_stats('time_id', 'time')

    # Merge with original dataframe
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    # The suffixed copies of the join keys are redundant
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)
    
# Funtion to make preprocessing function in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for every stock id in parallel.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids to process; each one is appended to the parquet path.
    is_train : bool, default True
        Selects the ``*_train.parquet`` directories when true, otherwise the
        ``*_test.parquet`` ones.

    Returns
    -------
    pandas.DataFrame
        The per-stock feature frames stacked with a fresh index.
    """
    if is_train:
        book_prefix = "book_train.parquet/stock_id="
        trade_prefix = "trade_train.parquet/stock_id="
    else:
        book_prefix = "book_test.parquet/stock_id="
        trade_prefix = "trade_test.parquet/stock_id="

    # Worker executed once per stock id
    def for_joblib(stock_id):
        book_features = book_preprocessor(data_dir + book_prefix + str(stock_id))
        trade_features = trade_preprocessor(data_dir + trade_prefix + str(stock_id))
        # Book rows are kept even when no trade features exist for them
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    jobs = (delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    frames = Parallel(n_jobs = -1, verbose = 1)(jobs)
    # Concatenate all the dataframes that return from Parallel
    return pd.concat(frames, ignore_index = True)

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` vs ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation function passed to ``lgb.train`` as ``feval``.

    Returns the metric name, the RMSPE of ``y_pred`` against the labels of
    ``lgb_train``, and ``False`` (lower is better).
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

In [None]:
# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` vs ``y_true``.

    This cell repeats the definition given earlier in the notebook.
    """
    pct_error = np.divide(y_true - y_pred, y_true)
    return np.sqrt(np.mean(np.square(pct_error)))

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation function passed to ``lgb.train`` as ``feval``.

    Returns ``('RMSPE', value, False)``, where ``value`` is the RMSPE of
    ``y_pred`` against the dataset's labels and ``False`` means lower is
    better. This cell repeats the definition given earlier in the notebook.
    """
    labels = lgb_train.get_label()
    metric_value = rmspe(labels, y_pred)
    return 'RMSPE', metric_value, False

In [None]:
# Feature-importance analysis
def calc_model_importance(model, feature_names=None, importance_type='gain'):
    """Return the model's feature importances as a one-column DataFrame.

    Parameters
    ----------
    model : object
        Anything exposing ``feature_importance(importance_type=...)``.
    feature_names : list, optional
        Labels used as the index; a default integer index is used when None.
    importance_type : str, default 'gain'
        Forwarded unchanged to ``model.feature_importance``.

    Returns
    -------
    pandas.DataFrame
        Column ``importance``, sorted in ascending order.
    """
    scores = model.feature_importance(importance_type=importance_type)
    table = pd.DataFrame(scores, index=feature_names, columns=['importance'])
    return table.sort_values('importance')

In [None]:
def calc_mean_importance(importance_df_list):
    """Average the ``importance`` column of several tables, feature by feature.

    The tables are aligned on their index before averaging. Each per-fold
    table is sorted by importance, so the same feature sits on a different
    row in different folds; a positional mean would mix unrelated features.

    Parameters
    ----------
    importance_df_list : list of pandas.DataFrame
        Non-empty list of frames with an ``importance`` column, indexed by
        feature.

    Returns
    -------
    pandas.DataFrame
        A copy of the first frame (same row order) whose ``importance``
        column holds the per-feature mean over all frames.
    """
    mean_df = importance_df_list[0].copy()
    # Reorder every table to the first table's feature order before stacking
    aligned = [
        df['importance'].reindex(mean_df.index).to_numpy()
        for df in importance_df_list
    ]
    mean_df['importance'] = np.mean(np.array(aligned), axis=0)

    return mean_df

In [None]:
# For plotting (and optionally saving) the importance chart
import matplotlib.pyplot as plt
def plot_importance(importance_df, title='',
                    save_filepath=None, figsize=(8, 12)):
    """Draw ``importance_df`` as a horizontal bar chart.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Frame to plot with ``DataFrame.plot.barh``.
    title : str, default ''
        Chart title; nothing is set when empty.
    save_filepath : str, optional
        When given, the figure is written there instead of being shown.
    figsize : tuple, default (8, 12)
        Figure size forwarded to ``plt.subplots``.
    """
    _fig, axis = plt.subplots(figsize=figsize)
    importance_df.plot.barh(ax=axis)
    if title:
        plt.title(title)
    plt.tight_layout()
    if save_filepath is not None:
        plt.savefig(save_filepath)
    else:
        plt.show()
    # Close the figure so repeated calls do not accumulate open figures
    plt.close()

In [None]:
# Read train and test
train, test = read_train_test()

# Get unique stock ids 
train_stock_ids = train['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train = True)
# Left-join the engineered features onto the base rows through row_id
train = train.merge(train_, on = ['row_id'], how = 'left')

# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
# Left-join the engineered features onto the base rows through row_id
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
# (each call aggregates only over the frame it is given)
train = get_time_stock(train)
test = get_time_stock(test)

In [None]:
train.shape

In [None]:
# Cache the baseline feature tables; later sections reload these files
# instead of recomputing the features.
pd.to_pickle(train,'train(307)_notau_noKNN.pkl')
pd.to_pickle(test,'test(307)_notau_noKNN.pkl')

# About the clustering index

In this clustering, each time_id is treated as one observation: the target values of every stock_id are correlated with those of every other stock_id, and the stock_ids are then clustered on the rows of that correlation matrix.  
In other words, stock_ids that end up in the same cluster have similarly behaving target variables.  
Because the targets behave similarly, new features can be obtained by averaging the indicators over the stocks of each cluster and attaching those per-cluster averages to every row of the same time_id.  

No tau features are used in this model.

In [None]:
# Start the clustering experiment from the cached baseline features
train = pd.read_pickle('./train(307)_notau_noKNN.pkl')
test = pd.read_pickle('test(307)_notau_noKNN.pkl')
# Displayed as the cell output
train.shape

In [None]:
test.shape

In [None]:
# making agg features

# Table of target values: one row per time_id, one column per stock_id
train_p = pd.read_csv(data_dir + '/train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

# Pairwise correlation between the stocks' target columns
corr = train_p.corr()

# Index of the correlation matrix, i.e. the stock_ids
ids = corr.index

# TODO: justify the choice of 7 clusters (e.g. with a silhouette plot)
# Each stock is one sample, described by its row of the correlation matrix
kmeans = KMeans(n_clusters=7, random_state=0).fit(corr.values)
# print(kmeans.labels_)


# l[n] holds the stock_ids that were assigned to cluster n
l = []
for n in range(7):
    # The boolean mask zeroes the non-members. ids+1 keeps stock_id 0 from
    # being filtered out by x > 0, and the -1 restores the original ids.
    l.append ( [ (x-1) for x in ( (ids+1)*(kmeans.labels_ == n)) if x > 0] )
    

# Per-cluster feature tables for train (mat) and test (matTest)
mat = []
matTest = []

# Running cluster number, used to build the cluster label below
n = 0
for ind in l:
    print(ind)
    # Take every row whose stock_id belongs to the current cluster
    newDf = train.loc[train['stock_id'].isin(ind) ]
    # Average per time_id, i.e. over the different stocks of the same cluster
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    
    #
    # NOTE: from here on stock_id no longer identifies a stock.
    # It is overwritten with the cluster label "<n>c1".
    #
    newDf.loc[:,'stock_id'] = str(n)+'c1'
    mat.append ( newDf )
    
    # Same aggregation for the test rows of this cluster
    newDf = test.loc[test['stock_id'].isin(ind) ]    
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    newDf.loc[:,'stock_id'] = str(n)+'c1'
    matTest.append ( newDf )
    
    n+=1
    
# Stack the clusters; time_id comes back as a regular column
mat1 = pd.concat(mat).reset_index()
# Drop the per-cluster mean of target from the train-side table
mat1.drop(columns=['target'],inplace=True)

mat2 = pd.concat(matTest).reset_index()

In [None]:
# Rebind the intermediate containers and the fitted KMeans object to empty
# lists so the objects they referenced can be released
matTest = []
mat = []
kmeans = []

In [None]:
# Append the train-side cluster rows of time_id 5 to the test-side table.
# (Author's note: "why was this done?")
# NOTE(review): presumably this guarantees that every cluster label is present
# before the pivot even when test covers only a few stocks; confirm.
mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])

In [None]:
# Handy idiom worth remembering: pivot to wide form, then flatten the
# two-level column names into "<feature>_<cluster label>".

# One row per time_id, one column per (feature, cluster label) pair
mat1 = mat1.pivot(index='time_id', columns='stock_id')
mat1.columns = ["_".join(x) for x in mat1.columns.ravel()]
mat1.reset_index(inplace=True)

# Same reshaping for the test-side table
mat2 = mat2.pivot(index='time_id', columns='stock_id')
mat2.columns = ["_".join(x) for x in mat2.columns.ravel()]
mat2.reset_index(inplace=True)

In [None]:
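# Pick out only the cluster-derived features that will actually be used (10/314)
# Clusters "2.5" are excluded because they contain too few stocks (note: the list below uses clusters 0-4)
# It is unclear why these particular features were chosen; further experiments are needed
# The reason log_return2 was dropped is unknown (probably because it is too highly correlated)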

# Columns taken from the pivoted cluster tables: the join key plus, for each
# base feature below, its value in clusters 0 to 4 ("<feature>_<n>c1").
nnn = ['time_id'] + [
    f'{feature}_{cluster}c1'
    for feature in (
        'log_return1_realized_volatility',
        'total_volume_mean',
        'trade_size_mean',
        'trade_order_count_mean',
        'price_spread_mean',
        'bid_spread_mean',
        'ask_spread_mean',
        'volume_imbalance_mean',
        'bid_ask_spread_mean',
    )
    for cluster in range(5)
]

In [None]:
# Attach the selected cluster features to train; every row of the same
# time_id receives the same values
train = pd.merge(train,mat1[nnn],how='left',on='time_id')

In [None]:
train.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Attach the selected cluster features to test, keyed by time_id
test = pd.merge(test,mat2[nnn],how='left',on='time_id')

In [None]:
train.shape

In [None]:
test.shape

In [None]:
test.head()

In [None]:
# Split features and target
# (row_id and time_id are identifiers and are not used as features)
x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']
x_test = test.drop(['row_id', 'time_id'], axis = 1)
# Transform stock id to a numeric value
x['stock_id'] = x['stock_id'].astype(int)
x_test['stock_id'] = x_test['stock_id'].astype(int)

# Create out of folds array
oof_predictions = np.zeros(x.shape[0])
# Create test array to store predictions
test_predictions = np.zeros(x_test.shape[0])

In [None]:
# Single seed shared by every seed entry of the parameter dict below
seed = 29
# LightGBM training parameters (same values as listed at the top of the notebook)
params = {
    'learning_rate': 0.1,        
    'lambda_l1': 2,
    'lambda_l2': 7,
    # NOTE(review): with max_depth 4 a tree can have at most 2**4 = 16 leaves,
    # so num_leaves = 800 is presumably never the binding limit; confirm.
    'num_leaves': 800,
    'min_sum_hessian_in_leaf': 20,
    'feature_fraction': 0.8,
    'feature_fraction_bynode': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 42,
    'min_data_in_leaf': 700,
    'max_depth': 4,
    'seed': seed,
    'feature_fraction_seed': seed,
    'bagging_seed': seed,
    # NOTE(review): drop_seed presumably only matters for dart boosting,
    # while 'boosting' is 'gbdt' here; confirm.
    'drop_seed': seed,
    'data_random_seed': seed,
    'objective': 'rmse',
    'boosting': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
} 

In [None]:
from sklearn.model_selection import GroupKFold
# Per-fold feature-importance tables
gain_importance_list = []
split_importance_list = []
# Rows of the same time_id always land in the same fold
group = train['time_id']
kf = GroupKFold(n_splits=5)
# Number of folds, taken from the splitter so the test average below cannot
# drift out of sync with n_splits
n_folds = kf.get_n_splits()
# Iterate through each fold
for fold, (trn_ind, val_ind) in enumerate(kf.split(x, groups=group)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
    # Root mean squared percentage error weights
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)
    train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights)
    val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights)
    model = lgb.train(params = params, 
                      train_set = train_dataset, 
                      categorical_feature = ['stock_id'],
                      valid_sets = [train_dataset, val_dataset], 
                      num_boost_round = 5000, 
                      early_stopping_rounds = 30, 
                      verbose_eval = 100,
                      feval = feval_rmspe)

    # Each row is predicted by the one model that did not train on it, so
    # the array ends up holding out-of-fold predictions for all of train
    # and a single RMSPE can be computed over everything.
    oof_predictions[val_ind] = model.predict(x_val)
    # Predict the test set; the fold models are averaged
    test_predictions += model.predict(x_test) / n_folds

    feature_names = x_train.columns.values.tolist()
    gain_importance_df = calc_model_importance(
        model, feature_names=feature_names, importance_type='gain')
    gain_importance_list.append(gain_importance_df)

    split_importance_df = calc_model_importance(
        model, feature_names=feature_names, importance_type='split')
    split_importance_list.append(split_importance_df)

rmspe_score = rmspe(y, oof_predictions)
print(f'Our out of folds RMSPE is {rmspe_score}')

In [None]:
# Average the per-fold gain importances and save them with the feature names
# as a regular column
mean_gain_df = calc_mean_importance(gain_importance_list)
mean_gain_df = mean_gain_df.reset_index().rename(columns={'index': 'feature_names'})
mean_gain_df.to_csv('gain_importance_mean groupkfold 352　KNN notau.csv', index=False)

In [None]:
# Write the fold-averaged test predictions as the submission file
test['target'] = test_predictions
test[['row_id', 'target']].to_csv('submission.csv',index = False)

## Discussion
CV : 0.22013  
No clustering CV : 0.2253  

By adding the clustering index, the cv was significantly improved.
However, when I looked at the feature importance, it didn't seem to have that much of an effect.  
I wonder if it just happens to match the results of tuning, or if it goes up as well...

~So, what happens if we cluster by time_id?  
If you see a correlation between time_id 1 and 2, and you give a statistical measure, the image is  
Nothing time_id  
Moving time_id  
and the actual evaluation will be nice and coherent, right?  
It's going to be a lot of code to write, but it's worth it~.   

I can't do clustering because target is a per time_id indicator: ....  
I don't see any features other than stock_id that would make sense for clustering, so I can’t.

# About TAU

tau1 : train['size_tau'] = np.sqrt( 1/ train['trade_seconds_in_bucket_count_unique'] )
As for feature_importance, it is a simple statistical measure and has no effect on the LGBM model.  
The feature_importance does not work at all.  


tau2 : 
train['size_tau2'] = np.sqrt( 1/ train['trade_order_count_sum'] )  
train['size_tau2_150'] = np.sqrt( 0.75/ train['trade_order_count_sum'] )  
As for the "size_tau2_sum", I think it will change because it is an indicator per unit of time.  
The value of the total number of trades after 150 seconds is converted to 600 seconds, so it should be the estimated total number of trades.  
If this is different from the actual total number of trades, it should be an indicator that the volatility has moved too much and is different from the estimate.  

Why do we need to care about the number of trades...?  
I think you can just use log_return or something...

## Without clustering indicator feature

In [None]:
# Restart from the cached baseline features, i.e. without the cluster columns
train = pd.read_pickle('./train(307)_notau_noKNN.pkl')
test = pd.read_pickle('./test(307)_notau_noKNN.pkl')

In [None]:
# These features are not about price movement; they are built from the total
# order count of the bucket.
# (Author's note: this aligns the units and may make the tendency visible.)
# NOTE(review): all four size_tau2* columns divide by the same
# trade_order_count_sum column, so each is a constant multiple of size_tau2,
# and so are the differences below.

train['size_tau2'] = np.sqrt( 1/ train['trade_order_count_sum'] )
test['size_tau2'] = np.sqrt( 1/ test['trade_order_count_sum'] )
train['size_tau2_450'] = np.sqrt( 0.25/ train['trade_order_count_sum'] )
test['size_tau2_450'] = np.sqrt( 0.25/ test['trade_order_count_sum'] )
train['size_tau2_300'] = np.sqrt( 0.5/ train['trade_order_count_sum'] )
test['size_tau2_300'] = np.sqrt( 0.5/ test['trade_order_count_sum'] )
train['size_tau2_150'] = np.sqrt( 0.75/ train['trade_order_count_sum'] )
test['size_tau2_150'] = np.sqrt( 0.75/ test['trade_order_count_sum'] )

# delta tau
# Differences between each windowed tau and the whole-bucket tau
train['size_tau2_600-450'] = train['size_tau2_450'] - train['size_tau2']
test['size_tau2_600-450'] = test['size_tau2_450'] - test['size_tau2']
train['size_tau2_600-300'] = train['size_tau2_300'] - train['size_tau2']
test['size_tau2_600-300'] = test['size_tau2_300'] - test['size_tau2']
train['size_tau2_600-150'] = train['size_tau2_150'] - train['size_tau2']
test['size_tau2_600-150'] = test['size_tau2_150'] - test['size_tau2']

# Differences between the windowed taus themselves
train['size_tau2_450-300'] = train['size_tau2_300'] - train['size_tau2_450']
test['size_tau2_450-300'] = test['size_tau2_300'] - test['size_tau2_450']
train['size_tau2_450-150'] = train['size_tau2_150'] - train['size_tau2_450']
test['size_tau2_450-150'] = test['size_tau2_150'] - test['size_tau2_450']
train['size_tau2_300-150'] = train['size_tau2_150'] - train['size_tau2_300']
test['size_tau2_300-150'] = test['size_tau2_150'] - test['size_tau2_300']

In [None]:
# Cache the feature tables that now include the tau columns
pd.to_pickle(train, 'train(317)_tau_noKNN.pkl')
pd.to_pickle(test, 'test(317)_tau_noKNN.pkl')

In [None]:
train.shape

In [None]:
# Split features and target
# (row_id and time_id are identifiers and are not used as features)
x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']
x_test = test.drop(['row_id', 'time_id'], axis = 1)
# Transform stock id to a numeric value
x['stock_id'] = x['stock_id'].astype(int)
x_test['stock_id'] = x_test['stock_id'].astype(int)

# Create out of folds array
oof_predictions = np.zeros(x.shape[0])
# Create test array to store predictions
test_predictions = np.zeros(x_test.shape[0])

In [None]:
x.shape

In [None]:
# Single seed shared by every seed entry of the parameter dict below
seed = 29
# LightGBM training parameters (same values as listed at the top of the notebook)
params = {
    'learning_rate': 0.1,        
    'lambda_l1': 2,
    'lambda_l2': 7,
    # NOTE(review): with max_depth 4 a tree can have at most 2**4 = 16 leaves,
    # so num_leaves = 800 is presumably never the binding limit; confirm.
    'num_leaves': 800,
    'min_sum_hessian_in_leaf': 20,
    'feature_fraction': 0.8,
    'feature_fraction_bynode': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 42,
    'min_data_in_leaf': 700,
    'max_depth': 4,
    'seed': seed,
    'feature_fraction_seed': seed,
    'bagging_seed': seed,
    # NOTE(review): drop_seed presumably only matters for dart boosting,
    # while 'boosting' is 'gbdt' here; confirm.
    'drop_seed': seed,
    'data_random_seed': seed,
    'objective': 'rmse',
    'boosting': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
} 

In [None]:
# NOTE: oof, models and scores are initialised here but not used by the loop below
oof = pd.DataFrame()                 # out-of-fold result
models = []                          # models
scores = 0.0                         # validation score

# Per-fold feature-importance tables
gain_importance_list = []
split_importance_list = []

from sklearn.model_selection import GroupKFold
# Rows of the same time_id always land in the same fold
group = train['time_id']
kf = GroupKFold(n_splits=5)
# Number of folds, taken from the splitter so the test average below cannot
# drift out of sync with n_splits
n_folds = kf.get_n_splits()
# Iterate through each fold
for fold, (trn_ind, val_ind) in enumerate(kf.split(x, groups=group)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
    # Root mean squared percentage error weights
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)
    train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights)
    val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights)
    model = lgb.train(params = params, 
                      train_set = train_dataset, 
                      categorical_feature = ['stock_id'],
                      valid_sets = [train_dataset, val_dataset], 
                      num_boost_round = 5000, 
                      early_stopping_rounds = 30, 
                      verbose_eval = 100,
                      feval = feval_rmspe)

    # Each row is predicted by the one model that did not train on it, so
    # the array ends up holding out-of-fold predictions for all of train
    # and a single RMSPE can be computed over everything.
    oof_predictions[val_ind] = model.predict(x_val)
    # Predict the test set; the fold models are averaged
    test_predictions += model.predict(x_test) / n_folds

    feature_names = x_train.columns.values.tolist()
    gain_importance_df = calc_model_importance(
        model, feature_names=feature_names, importance_type='gain')
    gain_importance_list.append(gain_importance_df)

    split_importance_df = calc_model_importance(
        model, feature_names=feature_names, importance_type='split')
    split_importance_list.append(split_importance_df)

rmspe_score = rmspe(y, oof_predictions)
print(f'Our out of folds RMSPE is {rmspe_score}')

In [None]:
# Display the (rows, columns) shape of the training table.
train.shape

In [None]:
# Combine the per-fold gain importances (calc_mean_importance is defined
# elsewhere in the notebook), expose the index as a 'feature_names' column
# and write the table to CSV.
mean_gain_df = (
    calc_mean_importance(gain_importance_list)
    .reset_index()
    .rename(columns={'index': 'feature_names'})
)
mean_gain_df.to_csv('gain_importance_mean kfold 317 tau.csv', index=False)

## Discussion
CV : 0.22507  
No tau : 0.2253  

No effect ....  
Looking at the feature importance, the tau features show no effect at all. 

## KNN + TAU

In [None]:
# Display the (rows, columns) shape of the training table.
train.shape

In [None]:
# making agg features

# Table of the target with one row per time_id and one column per stock_id
train_p = pd.read_csv(data_dir + '/train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

# Pairwise correlation of the target between stocks, computed across
# time_ids (each time_id is one observation).
corr = train_p.corr()

# Index of the correlation matrix, i.e. the stock_ids
ids = corr.index

# Cluster the stocks, using each stock's row of the correlation matrix as
# its feature vector.
# TODO: the choice of 7 clusters still needs checking (e.g. silhouette plot).
kmeans = KMeans(n_clusters=7, random_state=0).fit(corr.values)
# print(kmeans.labels_)


# l[k] lists the stock_ids assigned to cluster k. Selecting with a boolean
# mask keeps every member whatever the numeric value of its id, and the
# number of clusters is read back from the fitted model instead of being
# repeated as a literal.
l = [ids[kmeans.labels_ == n].tolist() for n in range(kmeans.n_clusters)]


mat = []
matTest = []

for n, ind in enumerate(l):
    print(ind)
    cluster_label = str(n) + 'c1'

    # Pull every row (all time_ids) whose stock_id belongs to this cluster
    newDf = train.loc[train['stock_id'].isin(ind)]
    # Average per time_id, i.e. the mean over the different stocks that
    # share this cluster
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)

    # CAUTION: from here on stock_id no longer identifies a stock.
    # It is overwritten with the cluster id followed by the suffix 'c1'.
    newDf.loc[:, 'stock_id'] = cluster_label
    mat.append(newDf)

    # Same aggregation for the test table
    newDf = test.loc[test['stock_id'].isin(ind)]
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    newDf.loc[:, 'stock_id'] = cluster_label
    matTest.append(newDf)

# Long tables: one row per (time_id, cluster)
mat1 = pd.concat(mat).reset_index()
mat1.drop(columns=['target'], inplace=True)

mat2 = pd.concat(matTest).reset_index()

In [None]:
# Rebind the intermediates of the previous cell to empty lists.
matTest = []
mat = []
kmeans = []

# Append the train-side cluster rows of time_id 5 to the test aggregates.
# NOTE(review): presumably done so that every cluster column exists after
# the pivot even when the test table covers only a few stocks -- confirm.
mat2 = pd.concat([mat2, mat1.loc[mat1.time_id == 5]])


def _to_wide(long_frame):
    """Pivot to one row per time_id with '<feature>_<stock_id label>' columns."""
    wide = long_frame.pivot(index='time_id', columns='stock_id')
    # The pivot yields two-level column labels; join the levels with "_".
    # (A handy idiom to remember.)
    wide.columns = ["_".join(pair) for pair in wide.columns]
    return wide.reset_index()


mat1 = _to_wide(mat1)
mat2 = _to_wide(mat2)

In [None]:
# Keep only the cluster-aggregate columns that are actually used (10/314).
# "2.5" are left out because those clusters are too small.
# NOTE(review): the list below covers clusters 0-4, i.e. it omits clusters
# 5 and 6 rather than 2 and 5 -- confirm which was intended.
# It is unclear why exactly these features were chosen; further experiments
# are needed.
# The reason log_return2 was dropped is unknown (probably because it would be
# highly correlated).

# Feature stems, in the order their columns appear in `nnn`.
_cluster_feature_stems = [
    'log_return1_realized_volatility',
    'total_volume_mean',
    'trade_size_mean',
    'trade_order_count_mean',
    'price_spread_mean',
    'bid_spread_mean',
    'ask_spread_mean',
    'volume_imbalance_mean',
    'bid_ask_spread_mean',
    'size_tau2',
    'size_tau2_450',
    'size_tau2_300',
    'size_tau2_150',
    'size_tau2_600-450',
    'size_tau2_600-300',
    'size_tau2_600-150',
    'size_tau2_450-300',
    'size_tau2_450-150',
    'size_tau2_300-150',
]

# Cluster ids whose aggregates are kept.
_kept_clusters = range(5)

# 'time_id' (the merge key) followed by '<stem>_<cluster>c1' for every stem
# and every kept cluster, stems varying slowest.
nnn = ['time_id'] + [
    f'{stem}_{cluster}c1'
    for stem in _cluster_feature_stems
    for cluster in _kept_clusters
]

In [None]:
# Attach the selected cluster aggregates to train and test with a left join
# on time_id.
train = train.merge(mat1[nnn], how='left', on='time_id')
test = test.merge(mat2[nnn], how='left', on='time_id')

In [None]:
# Display the (rows, columns) shape of the training table.
train.shape

In [None]:
# Persist the feature tables as they stand at this point of the notebook
# (file names are tagged "tau_KNN").
for frame, pickle_name in ((train, 'train(412)_tau_KNN.pkl'),
                           (test, 'test(412)_tau_KNN.pkl')):
    pd.to_pickle(frame, pickle_name)

In [None]:
# Separate the target from the model inputs; row_id and time_id are
# dropped from the inputs as well.
y = train['target']
x = train.drop(columns=['row_id', 'target', 'time_id'])
x_test = test.drop(columns=['row_id', 'time_id'])

# Cast stock_id to an integer in both feature tables.
for features in (x, x_test):
    features['stock_id'] = features['stock_id'].astype(int)

# Zero-initialised buffers: one out-of-fold prediction per training row and
# one accumulated prediction per test row.
oof_predictions = np.zeros(len(x))
test_predictions = np.zeros(len(x_test))

In [None]:
# 5-fold GroupKFold training of LightGBM on the current feature set, using
# the `params` dict defined in an earlier cell.
# Fills `oof_predictions` (one prediction per training row, made by the model
# that did not see that row) and accumulates the fold-averaged
# `test_predictions`; both buffers are created in an earlier cell.
# NOTE: re-running this cell without re-running the buffer cell keeps adding
# to `test_predictions`.
oof = pd.DataFrame()                 # out-of-fold result (not used in this cell)
models = []                          # models (not used in this cell)
scores = 0.0                         # validation score (not used in this cell)

# Per-fold feature importance tables
gain_importance_list = []
split_importance_list = []

from sklearn.model_selection import GroupKFold
# Group by time_id so that all rows of one time_id fall into the same fold.
group = train['time_id']
kf = GroupKFold(n_splits=5)
# Iterate through each fold
for fold, (trn_ind, val_ind) in enumerate(kf.split(x, groups=group)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
    # Root mean squared percentage error weights: weighting the squared error
    # by 1 / y**2 gives ((y - pred) / y)**2 per row.
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)
    train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights)
    val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights)
    # feval_rmspe is defined elsewhere in the notebook.
    model = lgb.train(params = params, 
                      train_set = train_dataset, 
                      categorical_feature = ['stock_id'],
                      valid_sets = [train_dataset, val_dataset], 
                      num_boost_round = 5000, 
                      early_stopping_rounds = 30, 
                      verbose_eval = 100,
                      feval = feval_rmspe)

    # Writing each fold's validation predictions into the shared array means
    # that, after the last fold, every training row has an out-of-fold
    # prediction and the RMSPE can be computed over the whole data set.
    # (Worth remembering.)
    oof_predictions[val_ind] = model.predict(x_val)
    # Predict the test set; divide by the actual number of folds so the
    # average stays correct if n_splits is changed above.
    test_predictions += model.predict(x_test) / kf.get_n_splits()

    # Record gain- and split-based importances of this fold's model
    # (calc_model_importance is defined elsewhere in the notebook).
    feature_names = x_train.columns.values.tolist()
    gain_importance_df = calc_model_importance(
        model, feature_names=feature_names, importance_type='gain')
    gain_importance_list.append(gain_importance_df)

    split_importance_df = calc_model_importance(
        model, feature_names=feature_names, importance_type='split')
    split_importance_list.append(split_importance_df)

# Score all out-of-fold predictions at once.
rmspe_score = rmspe(y, oof_predictions)
print(f'Our out of folds RMSPE is {rmspe_score}')

In [None]:
# Combine the per-fold gain importances (calc_mean_importance is defined
# elsewhere in the notebook), expose the index as a 'feature_names' column
# and write the table to CSV.
mean_gain_df = (
    calc_mean_importance(gain_importance_list)
    .reset_index()
    .rename(columns={'index': 'feature_names'})
)
mean_gain_df.to_csv('gain_importance_mean kfold 412 tau KNN.csv', index=False)

In [None]:
# Show the 15 rows with the highest 'importance' among the features whose
# name contains 'tau'.
(
    mean_gain_df
    .set_index('feature_names')
    .filter(like='tau', axis=0)
    .sort_values('importance', ascending=False)
    .head(15)
)

## Discussion

CV : 0.21742  
LB : 0.21906  
No clustering tau : CV : 0.22057    

Small, but effective?  
Looking at feature importance, size_tau2 might be good?  
600-450 is also a little higher, so it might be working.  

Probably because the slope of the last 600 seconds is related to the volatility of the next 600 seconds.
