# Acknowledgements

Stock Embedding - FFNN - upgrade & 3D
https://www.kaggle.com/vbmokin/stock-embedding-ffnn-upgrade-3d

Stock Embedding - FFNN - My features
https://www.kaggle.com/alexioslyon/stock-embedding-ffnn-my-features

lgbm baseline
https://www.kaggle.com/alexioslyon/lgbm-baseline

I have reorganised all of the code for my own convenience when modifying it, but the ideas are mostly based on the notebooks by the Kagglers above. Thank you all; you inspired me a lot.

In [None]:
from IPython.core.display import display, HTML

import glob
import os
import gc
from functools import reduce
from joblib import Parallel, delayed

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib
from numpy.random import seed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans

import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.backend import sigmoid
from keras.utils.generic_utils import get_custom_objects
from keras.layers import Activation

import lightgbm as lgb
import warnings
# Silence library warnings for the whole notebook run.
warnings.filterwarnings('ignore')
# Use the fully-qualified option name: the short form 'max_columns' relies on
# pattern matching and is ambiguous when more than one option ends in it.
pd.set_option('display.max_columns', 300)

#https://bignerdranch.com/blog/implementing-swish-activation-function-in-keras/
def swish(x, beta = 1):
    """Swish activation: x multiplied by sigmoid(beta * x)."""
    gate = sigmoid(beta * x)
    return x * gate
# Register the activation so layers can refer to it by the name 'swish'.
get_custom_objects().update({'swish': Activation(swish)})

In [None]:
# data preparation
# Function to calculate first WAP
def calc_wap1(df):
    """Level-1 weighted average price: each price is weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    numerator = bid_px * ask_sz + ask_px * bid_sz
    return numerator / (bid_sz + ask_sz)

# Function to calculate second WAP
def calc_wap2(df):
    """Level-2 weighted average price: each price is weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    numerator = bid_px * ask_sz + ask_px * bid_sz
    return numerator / (bid_sz + ask_sz)

# Function to calculate third WAP
def calc_wap3(df):
    """Level-1 weighted average price: each price is weighted by its own side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    numerator = bid_px * bid_sz + ask_px * ask_sz
    return numerator / (bid_sz + ask_sz)

# Function to calculate fourth WAP
def calc_wap4(df):
    """Level-2 weighted average price: each price is weighted by its own side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    numerator = bid_px * bid_sz + ask_px * ask_sz
    return numerator / (bid_sz + ask_sz)

# Function to calculate the log of the return
# log(x / y) = log(x) - log(y), so differencing the logs gives the log return
def log_return(series):
    """Return the first difference of the natural log of `series` (first element is NaN)."""
    log_values = np.log(series)
    return log_values.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Square root of the sum of squared values of `series`."""
    squared = series**2
    total = np.sum(squared)
    return np.sqrt(total)

# Function to count unique elements of a series
def count_unique(series):
    """Number of distinct values in `series`."""
    distinct = np.unique(series)
    return len(distinct)

#Function to calculate the range of the series
def spread(series):
    """Difference between the largest and smallest value of `series`."""
    highest = np.max(series)
    lowest = np.min(series)
    return highest - lowest

def sum_square(series):
    """Sum of the squared values of `series`."""
    squared = series ** 2
    return np.sum(squared)

def imr(series):
    """Interquartile range (75th minus 25th percentile) of `series.values`."""
    upper, lower = np.percentile(series.values, [75, 25])
    return upper - lower

#function to calculate tendency
def tendency(price, vol):
    """Sum of percentage price changes, each weighted by the volume at the later observation.

    The change between consecutive prices is divided by the later price and
    multiplied by 100 before weighting.
    """
    price_steps = np.diff(price)
    pct_change = (price_steps/price[1:])*100
    weighted = pct_change*vol[1:]
    return np.sum(weighted)

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Root mean squared percentage error; errors are taken relative to `y_true`."""
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

# Function to calculate the root mean squared percentage error for NN model
def root_mean_squared_per_error(y_true, y_pred):
    """Keras-backend RMSPE, used as the loss when compiling the NN model."""
    relative_error = (y_true - y_pred) / y_true
    return K.sqrt(K.mean(K.square(relative_error)))

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM eval: returns (metric name, RMSPE value, is_higher_better=False)."""
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path, stock_id, seconds_in_bucket_groups, book_feature_dict, book_feature_dict_time):
    """Build per-time_id order-book features for a single stock.

    Args:
        file_path: parquet location handed to ``pd.read_parquet``.
        stock_id: value written into the ``stock_id`` column of the result.
        seconds_in_bucket_groups: window start offsets; each window keeps the
            rows with ``seconds_in_bucket >= offset``.
        book_feature_dict: column -> aggregations applied to the window whose
            offset is 0 (falsy).
        book_feature_dict_time: column -> aggregations applied to every
            window with a non-zero offset.

    Returns:
        DataFrame keyed by ``time_id`` (rows come from the first window; later
        windows are left-joined onto it) with columns named
        ``<column>_<aggregation>_<offset>`` plus ``stock_id``.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)
    # Fixed: this previously called calc_wap3, making wap4 a copy of wap3.
    df['wap4'] = calc_wap4(df)
    # Calculate log returns within each time_id
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    df['log_return3'] = df.groupby(['time_id'])['wap3'].apply(log_return)
    df['log_return4'] = df.groupby(['time_id'])['wap4'].apply(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spreads (price spreads are relative to the mid price)
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket):
        # The full window (offset 0) gets the complete aggregation set;
        # later windows only get the reduced "time" set.
        agg_spec = book_feature_dict_time if seconds_in_bucket else book_feature_dict
        window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
        df_feature = window.groupby(['time_id']).agg(agg_spec).reset_index()
        # Flatten the (column, aggregation) MultiIndex and append the offset;
        # the group key has an empty second level and keeps its plain name.
        df_feature.columns = ['_'.join(col) + '_' + str(seconds_in_bucket) if col[1] else col[0] for col in df_feature.columns]
        return df_feature

    # Get and merge the stats for different windows
    df_features = [get_stats_window(seconds_in_bucket = second) for second in seconds_in_bucket_groups]
    df_feature = reduce(lambda left, right: pd.merge(left, right, on='time_id', how='left'), df_features)
    df_feature['stock_id'] = stock_id
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path, stock_id, seconds_in_bucket_groups, trade_feature_dict, trade_feature_dict_time, seconds_in_bucket_groups2):
    """Build per-time_id trade features for a single stock.

    Args:
        file_path: parquet location handed to ``pd.read_parquet``.
        stock_id: value written into the ``stock_id`` column of the result.
        seconds_in_bucket_groups: accepted for signature compatibility; not
            used by this function.
        trade_feature_dict: accepted for signature compatibility; not used by
            this function.
        trade_feature_dict_time: column -> aggregations applied to every window.
        seconds_in_bucket_groups2: window start offsets; each window keeps the
            rows with ``seconds_in_bucket >= offset``. The ``size_tau2_d_400``
            feature reads ``size_tau2_0``, so 0 must come before 400.

    Returns:
        DataFrame keyed by ``time_id`` with the windowed aggregates, the
        ``size_tau*`` features, the per-time_id price/size shape features and
        ``stock_id``.
    """
    df = pd.read_parquet(file_path)
    df['trade_log_return'] = df.groupby('time_id')['price'].apply(log_return)
    df['amount'] = df['price'] * df['size']

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(trade_feature_dict_time).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) + '_' + str(seconds_in_bucket) if col[1] else col[0] for col in df_feature.columns]
        return df_feature

    # Get and merge the stats for different windows
    df_features = [get_stats_window(seconds_in_bucket = second) for second in seconds_in_bucket_groups2]
    df_feature = reduce(lambda left, right: pd.merge(left, right, on='time_id', how='left'), df_features)
    # add size_tau for each bucket
    for second in seconds_in_bucket_groups2:
        df_feature['size_tau_'+str(second)] = np.sqrt(1/ df_feature['seconds_in_bucket_count_unique_'+str(second)])
        df_feature['size_tau2_'+str(second)] = np.sqrt(((600 - second) / 600) / df_feature['order_count_sum_'+str(second)])
        if second == 400:
            df_feature['size_tau2_d_' + str(second)] = df_feature['size_tau2_'+ str(second)] - df_feature['size_tau2_' + '0']

    # create additional features: one row of price/size shape statistics per time_id
    def additional_features(df):
        lis = []
        # groupby visits every time_id once instead of re-filtering the
        # whole frame for each id.
        for n_time_id, df_id in df.groupby('time_id', sort=False):
            price = df_id['price'].values
            size = df_id['size'].values
            price_mean = np.mean(price)
            price_steps = np.diff(price)
            tendencyV = tendency(price, size)
            f_max = np.sum(price > price_mean)
            f_min = np.sum(price < price_mean)
            df_max = np.sum(price_steps > 0)
            df_min = np.sum(price_steps < 0)
            # price shape statistics
            abs_diff = np.median(np.abs(price - price_mean))
            energy = np.mean(price**2)
            iqr_p = np.percentile(price, 75) - np.percentile(price, 25)
            # volume shape statistics
            abs_diff_v = np.median(np.abs(size - np.mean(size)))
            energy_v = np.sum(size**2)
            iqr_p_v = np.percentile(size, 75) - np.percentile(size, 25)
            lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                       'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})
        return pd.DataFrame(lis)

    # Fixed: the additional features used to be merged into the raw `df`,
    # which is discarded, so they never reached the returned frame.
    df_feature = df_feature.merge(additional_features(df), on='time_id', how='left')
    df_feature['stock_id'] = stock_id
    return df_feature

# Function to get group stats for the stock_id and time_id
def get_time_stock(df, seconds_in_bucket_groups, vol_groups):
    """Append per-stock and per-time_id statistics of the realized-volatility columns.

    The volatility columns are ``<ret>_realized_volatility_<second>`` for the
    returns log_return1, log_return2 and trade_log_return. Each aggregation in
    `vol_groups` yields one new column with suffix ``_stock`` (grouped by
    stock_id) and one with suffix ``_time`` (grouped by time_id).
    """
    return_names = ('log_return1', 'log_return2', 'trade_log_return')
    vol_cols = [f'{name}_realized_volatility_{sec}'
                for name in return_names
                for sec in seconds_in_bucket_groups]

    def _grouped_stats(key, suffix):
        # Aggregate by `key`, then flatten the (column, aggregation) names;
        # the key column has an empty second level and keeps its plain name.
        stats = df.groupby([key])[vol_cols].agg(vol_groups).reset_index()
        stats.columns = ['_'.join(pair) + suffix if pair[1] else pair[0] for pair in stats.columns]
        return stats

    per_stock = _grouped_stats('stock_id', '_stock')
    per_time = _grouped_stats('time_id', '_time')

    # Merge with original dataframe
    enriched = df.merge(per_stock, how = 'left', on = ['stock_id'])
    return enriched.merge(per_time, how = 'left', on = ['time_id'])

#quantile_transform
def quantile_transform(train, test, quantiles_no):
    """Map every feature column to a normal distribution, fitting on train only.

    Columns time_id, target, row_id and stock_id are left untouched (all four
    must be present in `train`). Infinite values are replaced by NaN first.
    Both frames are modified in place and also returned. The transformer seed
    is read from the module-level `seed_no`.
    """
    excluded = ('time_id', 'target', 'row_id', 'stock_id')
    feature_cols = list(train)
    for name in excluded:
        feature_cols.remove(name)

    for frame in (train, test):
        frame.replace([np.inf, -np.inf], np.nan, inplace=True)

    for name in feature_cols:
        transformer = QuantileTransformer(random_state=seed_no, n_quantiles=quantiles_no, output_distribution='normal')
        train[name] = transformer.fit_transform(train[[name]])
        test[name] = transformer.transform(test[[name]])
    return train, test

#make aggregate features by Kmeans
def aggregate_features(train, test, selected_features, selected_second_buckets, selected_clusters, clusters_no = 7):
    """Add per-time_id feature means of KMeans stock clusters to train and test.

    Stocks are clustered on the correlation matrix of their targets (read from
    ``PATH + 'train.csv'``). For every cluster the features of its member
    stocks are averaged per time_id, and the selected
    ``<feature>_<second>_<cluster>c1`` columns are left-joined onto both frames.

    Args:
        train, test: feature frames containing ``stock_id`` and ``time_id``;
            `train` also contains ``target``.
        selected_features: feature name stems to keep.
        selected_second_buckets: window offsets appended to the stems.
        selected_clusters: cluster indices to keep (each below `clusters_no`).
        clusters_no: number of KMeans clusters.

    Returns:
        (train, test) with the aggregate columns merged in.
    """
    train_p = pd.read_csv(PATH + 'train.csv')
    train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')
    corr = train_p.corr()
    ids = corr.index
    kmeans = KMeans(n_clusters=clusters_no, random_state=0).fit(corr.values)
    # Fixed: the loop was hard-coded to 7 clusters and ignored `clusters_no`.
    clusters = [ids[kmeans.labels_ == n].tolist() for n in range(clusters_no)]

    mat = []
    matTest = []
    for n, ind in enumerate(clusters):
        label = str(n) + 'c1'

        newDf = train.loc[train['stock_id'].isin(ind)]
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        newDf.loc[:, 'stock_id'] = label
        mat.append(newDf)

        newDf = test.loc[test['stock_id'].isin(ind)]
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        newDf.loc[:, 'stock_id'] = label
        matTest.append(newDf)

    mat1 = pd.concat(mat).reset_index()
    mat1.drop(columns=['target'], inplace=True)
    mat2 = pd.concat(matTest).reset_index()
    # Release the intermediates before the pivots below.
    del mat, matTest, kmeans

    # NOTE(review): train rows for time_id 5 are appended to the test
    # aggregates, presumably so that every cluster column exists after the
    # pivot even when the test set covers only a few stocks; confirm.
    mat2 = pd.concat([mat2, mat1.loc[mat1.time_id == 5]])

    mat1 = mat1.pivot(index='time_id', columns='stock_id')
    mat1.columns = ["_".join(x) for x in mat1.columns]
    mat1.reset_index(inplace=True)

    mat2 = mat2.pivot(index='time_id', columns='stock_id')
    mat2.columns = ["_".join(x) for x in mat2.columns]
    mat2.reset_index(inplace=True)

    selected_features_groups = ['time_id'] + [f'{feature}_{second}_{cluster}c1' for feature in selected_features for second in selected_second_buckets
                         for cluster in selected_clusters]

    train = pd.merge(train, mat1[selected_features_groups], how='left', on='time_id')
    test = pd.merge(test, mat2[selected_features_groups], how='left', on='time_id')
    del mat1, mat2
    gc.collect()
    return train, test
    
# Function to run the book/trade preprocessing in parallel (for each stock id)
def preprocessor(list_stock_ids, seconds_in_bucket_groups, book_feature_dict, trade_feature_dict, is_train = True):
    """Preprocess book and trade data for every stock id and stack the results.

    Besides its arguments, this reads the module-level names PATH,
    book_feature_dict_time, trade_feature_dict_time and
    seconds_in_bucket_groups2.
    """
    df_name = 'train' if is_train else 'test'

    def _process_stock(stock_id):
        book_path = PATH + f'book_{df_name}.parquet/stock_id=' + str(stock_id)
        trade_path = PATH + f'trade_{df_name}.parquet/stock_id=' + str(stock_id)
        book_features = book_preprocessor(book_path, stock_id, seconds_in_bucket_groups, book_feature_dict, book_feature_dict_time)
        trade_features = trade_preprocessor(trade_path, stock_id, seconds_in_bucket_groups, trade_feature_dict, trade_feature_dict_time, seconds_in_bucket_groups2)
        # Book rows are the base; trade features are joined where available.
        return pd.merge(book_features, trade_features, on = ['stock_id', 'time_id'], how = 'left')

    # One joblib task per stock id, using all available cores
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(delayed(_process_stock)(stock_id) for stock_id in list_stock_ids)
    # Concatenate all the dataframes returned from Parallel
    return pd.concat(per_stock_frames, ignore_index = True)

# Function to read our base train and test set
def read_data(PATH, is_train = True):
    """Read ``train.csv`` or ``test.csv`` from `PATH` and add a ``row_id`` key.

    ``row_id`` is ``"<stock_id>-<time_id>"``. The row count is printed.
    """
    if is_train:
        df_name = 'train'
    else:
        df_name = 'test'
    df = pd.read_csv(PATH + f'{df_name}.csv')
    # Create a key to merge with book and trade data
    stock_part = df['stock_id'].astype(str)
    time_part = df['time_id'].astype(str)
    df['row_id'] = stock_part + '-' + time_part
    print(f'Our {df_name}ing set has {df.shape[0]} rows')
    return df

def prepare_data(is_debug = False, add_aggregate_features = False):
    """Load train/test, build the book/trade features and add group statistics.

    In debug mode only a small fixed set of stocks is processed (seven stocks
    when aggregate features are requested, otherwise stock 0) and the test set
    is restricted to stock 0. Reads the module-level configuration (PATH,
    feature dictionaries, window lists, vol_groups).
    """
    train = read_data(PATH, True)
    test = read_data(PATH, False)

    stock_list = [1, 0, 3, 27, 2, 81, 8] if add_aggregate_features else [0]

    if is_debug:
        train = train[train['stock_id'].isin(stock_list)]
        test = test[test['stock_id'] == 0]
        train_stock_ids = stock_list
        test_stock_ids = [0]
    else:
        train_stock_ids = train['stock_id'].unique()
        test_stock_ids = test['stock_id'].unique()

    train_features = preprocessor(train_stock_ids, seconds_in_bucket_groups, book_feature_dict, trade_feature_dict, is_train = True)
    test_features = preprocessor(test_stock_ids, seconds_in_bucket_groups, book_feature_dict, trade_feature_dict, is_train = False)

    merge_keys = ['stock_id', 'time_id']
    train = train.merge(train_features, on = merge_keys, how = 'left')
    test = test.merge(test_features, on = merge_keys, how = 'left')

    train = get_time_stock(train, seconds_in_bucket_groups2, vol_groups)
    test = get_time_stock(test, seconds_in_bucket_groups2, vol_groups)
    return train, test

# Training model and making predictions

In [None]:
## NN model
def NN_model(train, test, nfolds, no_quantile, hidden_units, dropout_rate, stock_embedding_size, callbacks, seed_no, learning_rate, cat_variables, aggregate_parameters, 
             is_quan_trans = False, add_aggregate_features = False):
    """Train a stock-embedding feed-forward network with time_id-based folds.

    Args:
        train, test: feature frames; `train` contains ``target``. When
            `is_quan_trans` is set they are transformed in place by
            ``quantile_transform``; missing values are also filled in place.
        nfolds: number of cross-validation folds.
        no_quantile: ``n_quantiles`` for the quantile transform.
        hidden_units: sizes of the dense hidden layers.
        dropout_rate: dropout applied after every hidden layer.
        stock_embedding_size: width of the stock_id embedding.
        callbacks: Keras callbacks passed to ``model.fit``.
        seed_no: not used inside this function (the quantile transform reads
            the module-level ``seed_no``).
        learning_rate: Adam learning rate.
        cat_variables: categorical input columns; the model is built with a
            single ``stock_id`` input.
        aggregate_parameters: (selected_features, selected_second_buckets,
            selected_clusters) for ``aggregate_features``.
        is_quan_trans: apply the quantile transform first.
        add_aggregate_features: merge the cluster aggregate features.

    Returns:
        (train_scores_folds, val_scores_folds, out-of-fold RMSPE,
        fold-averaged test predictions).
    """
    # kfold based on the knn++ algorithm
    def k_fold(nfolds):
        # Returns, per fold, the time_ids assigned to it. Each fold starts
        # from a random time_id and then repeatedly draws another one with
        # probability proportional to its accumulated squared distance (in
        # min-max scaled target space) from the rows that fold picked before.
        out_train = pd.read_csv(PATH + 'train.csv')
        out_train = out_train.pivot(index='time_id', columns='stock_id', values='target')
        out_train = out_train.fillna(out_train.mean())
        index, totDist, values = [], [], []
        # matrix of scaled targets: one row per time_id, one column per stock
        mat = MinMaxScaler(feature_range=(-1, 1)).fit_transform(out_train.values)
        # adds the original row position in the last column
        mat = np.c_[mat,np.arange(mat.shape[0])]
        nind = int(mat.shape[0]/nfolds) # number of individuals per fold
        lineNumber = np.random.choice(np.array(mat.shape[0]), size=nfolds, replace=False)
        # descending order so that deleting a seed row does not shift the
        # positions of the seeds still to be removed
        lineNumber = np.sort(lineNumber)[::-1]
        for n in range(nfolds):
            totDist.append(np.zeros(mat.shape[0]-nfolds))
            values.append([lineNumber[n]])
        s=[]
        for n in range(nfolds):
            s.append(mat[lineNumber[n],:])
            mat = np.delete(mat, obj=lineNumber[n], axis=0)
        for n in range(nind-1):
            luck = np.random.uniform(0,1,nfolds)
            for cycle in range(nfolds):
                # distance of every remaining row to this fold's last pick
                s[cycle] = np.matlib.repmat(s[cycle], mat.shape[0], 1)
                sumDist = np.sum((mat[:,:-1] - s[cycle][:,:-1])**2 , axis=1)
                totDist[cycle] += sumDist
                # probabilities
                f = totDist[cycle]/np.sum(totDist[cycle]) # normalizing the totdist
                # inverse-CDF sampling: first row whose cumulative probability exceeds the draw
                j = 0
                kn = 0
                for val in f:
                    j += val
                    if (j > luck[cycle]): # the row was selected
                        break
                    kn +=1
                lineNumber[cycle] = kn
                # the chosen row is no longer available to any fold
                for n_iter in range(nfolds):
                    totDist[n_iter] = np.delete(totDist[n_iter],obj=lineNumber[cycle], axis=0)
                    j= 0
                s[cycle] = mat[lineNumber[cycle],:]
                values[cycle].append(int(mat[lineNumber[cycle],-1]))
                mat = np.delete(mat, obj=lineNumber[cycle], axis=0)
        # translate row positions into time_id labels
        for n_mod in range(nfolds):
            values[n_mod] = out_train.index[values[n_mod]]
        return values

    def base_model(no_of_col, hidden_units, stock_embedding_size):
        # Two inputs per instance: the stock id (embedded) and the numeric features
        stock_id_input = keras.Input(shape=(1,), name='stock_id')
        num_input = keras.Input(shape=(no_of_col,), name='num_data')
        # embedding, flattening and concatenating
        stock_embedded = keras.layers.Embedding(max(train['stock_id'])+1, stock_embedding_size, 
                                               input_length=1, name='stock_embedding')(stock_id_input)
        stock_flattened = keras.layers.Flatten()(stock_embedded)
        out = keras.layers.Concatenate()([stock_flattened, num_input])
        # Add one or more hidden layers
        for n_hidden in hidden_units:
            out = keras.layers.Dense(n_hidden, activation='swish')(out)
            out = keras.layers.Dropout(dropout_rate)(out)
        # A single linear output: the predicted target
        out = keras.layers.Dense(1, activation='linear', name='prediction')(out)
        model = keras.Model(inputs = [stock_id_input, num_input],outputs = out)
        return model

    values = k_fold(nfolds)
    if is_quan_trans:
        # Fixed: this line used to build a tuple expression instead of
        # assigning the transformed frames.
        train, test = quantile_transform(train, test, quantiles_no = no_quantile)
    if add_aggregate_features:
        selected_features, selected_second_buckets, selected_clusters = aggregate_parameters
        train, test = aggregate_features(train, test, selected_features, selected_second_buckets, selected_clusters)

    train_scores_folds, val_scores_folds = [], []
    features_to_consider = list(train)
    if 'time_id' not in cat_variables:
        features_to_consider.remove('time_id')
    features_to_consider.remove('target')
    features_to_consider.remove('row_id')

    # Fill gaps with the train column means (0 where a column is entirely NaN)
    train[features_to_consider] = train[features_to_consider].fillna(train[features_to_consider].mean()).fillna(0)
    test[features_to_consider] = test[features_to_consider].fillna(train[features_to_consider].mean()).fillna(0)
    y = train['target']

    # every column except time_id, target, row_id and stock_id is a numeric input
    no_of_col = len(list(train)) - 4
    # Create out of folds array
    oof_predictions = np.zeros(train.shape[0])

    test_predictions = np.zeros(test.shape[0])
    for n_count in range(nfolds):
        print('CV {}/{}'.format(n_count + 1, nfolds))
        # time_ids of all folds except the held-out one
        indexes = np.arange(nfolds).astype(int)
        indexes = np.delete(indexes,obj=n_count, axis=0)
        indexes = np.r_[tuple([values[indexes[i]] for i in range(nfolds - 1)])]

        X_train = train.loc[train.time_id.isin(indexes), features_to_consider]
        y_train = train.loc[train.time_id.isin(indexes), 'target']
        X_val = train.loc[train.time_id.isin(values[n_count]), features_to_consider]
        y_val = train.loc[train.time_id.isin(values[n_count]), 'target']

        model = base_model(no_of_col, hidden_units, stock_embedding_size)
        model.compile(
            keras.optimizers.Adam(learning_rate=learning_rate),
            loss=root_mean_squared_per_error
        )
        # Temporarily drop the categorical columns so that
        # features_to_consider lists the numeric inputs only
        for cat in cat_variables:
            features_to_consider.remove(cat)
        num_data = X_train[features_to_consider]
        scaler = MinMaxScaler(feature_range=(-1, 1))
        num_data = scaler.fit_transform(num_data.values)
        cat_data = X_train[cat_variables]
        target =  y_train
        num_data_val = X_val[features_to_consider]
        num_data_val = scaler.transform(num_data_val.values)
        cat_data_val = X_val[cat_variables]
        # Fixed: use the `callbacks` argument instead of the module-level
        # `es` / `plateau` objects.
        model.fit([cat_data, num_data], 
                  target,               
                  batch_size=1024,
                  epochs=1000,
                  validation_data=([cat_data_val, num_data_val], y_val),
                  callbacks=callbacks,
                  validation_batch_size=len(y_val),
                  shuffle=True,
                  verbose = 0)
        #training scores
        train_preds = model.predict([cat_data, num_data]).reshape(1,-1)[0]
        train_score = rmspe(y_true = y_train, y_pred = train_preds)
        train_scores_folds.append(train_score)
        #validating scores
        val_preds = model.predict([cat_data_val, num_data_val]).reshape(1,-1)[0]
        oof_predictions[train.time_id.isin(values[n_count])] = val_preds
        val_score = rmspe(y_true = y_val, y_pred = val_preds)
        val_scores_folds.append(val_score)
        print(f'Fold {n_count + 1}: trainscore: {train_score} valscore: {val_score}')
        #test: predictions are clipped at 0 and averaged over the folds
        tt =scaler.transform(test[features_to_consider].values)
        test_predictions += model.predict([test['stock_id'], tt]).reshape(1,-1)[0].clip(0,1e10) / nfolds
        # restore the categorical columns for the next fold
        for cat in cat_variables:
            features_to_consider.append(cat)
    rmspe_val_score = rmspe(y, oof_predictions)
    print(f'Training RMSPE: {np.mean(train_scores_folds)} Out of folds RMSPE is {rmspe_val_score}')
    return  train_scores_folds, val_scores_folds, rmspe_val_score, test_predictions

In [None]:
#lightgbm model
def lightgbm_model(train, test, nfolds, no_quantile, params, seed_no, aggregate_parameters, is_quan_trans = False, add_aggregate_features = False):
    """Train LightGBM with shuffled K-fold CV and average the test predictions.

    Rows are weighted by 1 / target**2 and RMSPE is reported through
    ``feval_rmspe``. Returns (train_scores_folds, val_scores_folds,
    out-of-fold RMSPE, fold-averaged test predictions) and plots the feature
    importance of the last fold's model.
    """
    if is_quan_trans:
        train, test = quantile_transform(train, test, quantiles_no = no_quantile)
    if add_aggregate_features:
        selected_features, selected_second_buckets, selected_clusters = aggregate_parameters
        train, test = aggregate_features(train, test, selected_features, selected_second_buckets, selected_clusters)

    # Split features and target
    y = train['target']
    x = train.drop(columns = ['row_id', 'target', 'time_id'])
    x_test = test.drop(columns = ['row_id', 'time_id'])
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    x_test['stock_id'] = x_test['stock_id'].astype(int)

    # Out-of-fold and test prediction buffers
    oof_predictions = np.zeros(x.shape[0])
    test_predictions = np.zeros(x_test.shape[0])
    train_scores_folds = []
    val_scores_folds = []

    def _weighted_dataset(features, labels):
        # Root mean squared percentage error weights
        return lgb.Dataset(features, labels, weight = 1 / np.square(labels), categorical_feature = ['stock_id'])

    splitter = KFold(n_splits = nfolds, random_state = seed_no, shuffle = True)
    for fold, (trn_ind, val_ind) in enumerate(splitter.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, y_train = x.iloc[trn_ind], y.iloc[trn_ind]
        x_val, y_val = x.iloc[val_ind], y.iloc[val_ind]

        train_dataset = _weighted_dataset(x_train, y_train)
        val_dataset = _weighted_dataset(x_val, y_val)
        model = lgb.train(params = params,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          num_boost_round = 1000,
                          early_stopping_rounds = 100,
                          verbose_eval = 250,
                          feval = feval_rmspe)

        # Score the training part of the fold
        train_preds = model.predict(x_train)
        train_score = rmspe(y_true = y_train, y_pred = train_preds)
        train_scores_folds.append(train_score)

        # Score the held-out part and store it in the out of folds array
        val_preds = model.predict(x_val)
        val_score = rmspe(y_true = y_val, y_pred = val_preds)
        val_scores_folds.append(val_score)
        oof_predictions[val_ind] = val_preds
        print(f'Fold {fold + 1}: trainscore: {train_score} valscore: {val_score}')

        # Predict the test set
        test_predictions += model.predict(x_test) / nfolds

    #plot importance (model of the last fold)
    plt.figure(figsize=(12,6))
    lgb.plot_importance(model, max_num_features=20)
    plt.title("Feature importance")
    plt.show()

    rmspe_val_score = rmspe(y, oof_predictions)
    print(f'Training RMSPE: {np.mean(train_scores_folds)} Out of folds RMSPE is {rmspe_val_score}')
    return train_scores_folds, val_scores_folds, rmspe_val_score, test_predictions

In [None]:
# Parameters for feature engineering.
# These module-level names are read directly (not via arguments) by
# preprocessor, prepare_data, aggregate_features and NN_model.
# Root directory of the competition data; paths are built by string
# concatenation, so the trailing slash is required.
PATH = '../input/optiver-realized-volatility-prediction/'
# Book aggregations for the full window (offset 0)
book_feature_dict = {
        'wap1': [np.sum, np.std],
        'wap2': [np.sum, np.std],
        'wap3': [np.sum, np.std],
        'wap4': [np.sum, np.std],
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
        'wap_balance': [np.sum, np.max],
        'price_spread':[np.sum, np.max],
        'price_spread2':[np.sum, np.max],
        'bid_spread':[np.sum, np.max],
        'ask_spread':[np.sum, np.max],
        'total_volume':[np.sum, np.max],
        'volume_imbalance':[np.sum, np.max],
        "bid_ask_spread":[np.sum, np.max],
}
# Book aggregations for windows with a non-zero offset
book_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
    }

# Trade aggregations (passed to trade_preprocessor, which does not use this dict)
trade_feature_dict = {
    'trade_log_return':[realized_volatility],
    'seconds_in_bucket':[count_unique],
    'size':[np.sum, spread],
    'order_count':[np.sum, spread],
    'amount':[np.sum, spread],
}
# Trade aggregations actually applied to every trade window
trade_feature_dict_time = {
    'trade_log_return':[realized_volatility],
    'seconds_in_bucket':[count_unique],
    'size':[np.sum],
    'order_count':[np.sum],
}

# Aggregations applied per stock_id and per time_id in get_time_stock
vol_groups = ['mean', 'std', 'min', 'max']

# Window start offsets (seconds_in_bucket >= offset) for the book features
seconds_in_bucket_groups = [0, 100, 200, 300, 400, 500]

# Window start offsets for the trade features and for get_time_stock
seconds_in_bucket_groups2 = [0, 200, 300, 400]

# Feature name stems whose cluster aggregates are merged by aggregate_features
selected_features = ['log_return1_realized_volatility',
                     'total_volume_sum', 'size_sum', 'order_count_sum', 'price_spread_sum',
                    'bid_spread_sum', 'ask_spread_sum', 'volume_imbalance_sum', 'bid_ask_spread_sum', 'size_tau2']
#                      , 'trade_log_return_realized_volatility', 'seconds_in_bucket_count_unique', 'size_sum', 'order_count_mean']

# Window offsets of the selected features
selected_second_buckets = [0]

# KMeans cluster indices whose aggregates are kept
selected_clusters = [0, 1, 3, 4, 6]

# Unpacked in this order by NN_model and lightgbm_model
aggregate_parameters = [selected_features, selected_second_buckets, selected_clusters]

is_debug = False
is_quan_trans = True
add_aggregate_features = True
# Build the full train/test feature frames
train, test = prepare_data(is_debug = is_debug, add_aggregate_features = add_aggregate_features)

In [None]:
# NN run configuration
nfolds = 10
# seed_no is also read as a global by quantile_transform
seed_no = 1111
no_quantile = 500
hidden_units = (256,128,64)
stock_embedding_size = 24

# Seed numpy and TensorFlow
seed(seed_no)
tf.random.set_seed(seed_no)

# Stop after 100 epochs without val_loss improvement and restore the best weights
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=100, verbose=0,
    mode='min', restore_best_weights=True)

# Multiply the learning rate by 0.2 after 7 epochs without val_loss improvement
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=7, verbose=0,
    mode='min')

learning_rate = 0.005

dropout_rate = 0

callbacks = [es, plateau]

cat_variables = ['stock_id']

# Train the NN; returns per-fold scores, the out-of-fold RMSPE and test predictions
train_scores_folds, scores_folds, rmspe_score, test_predictions = NN_model(train, test, nfolds, no_quantile, 
                     hidden_units, dropout_rate, stock_embedding_size, callbacks, seed_no, learning_rate, cat_variables, aggregate_parameters,
                    is_quan_trans = is_quan_trans, add_aggregate_features = add_aggregate_features)

In [None]:
# Alternative LightGBM parameter set; defined but not passed to
# lightgbm_model below (params0 is used instead).
params = {
    'learning_rate': 0.05,        
    'lambda_l1': 2,
    'lambda_l2': 7,
    'num_leaves': 800,
    'min_sum_hessian_in_leaf': 20,
    'feature_fraction': 0.8,
    'feature_fraction_bynode': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 42,
    'min_data_in_leaf': 700,
    'max_depth': 4,
    'seed': seed_no,
    'feature_fraction_seed': seed_no,
    'bagging_seed': seed_no,
    'drop_seed': seed_no,
    'data_random_seed': seed_no,
    'objective': 'rmse',
    'boosting': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
}  
# Parameter set used for the LightGBM run below
params0 = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'max_bin':100,
    'min_data_in_leaf':500,
    'learning_rate': 0.05,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.5,
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    'categorical_column':[0],
    'seed':seed_no,
    'feature_fraction_seed': seed_no,
    'bagging_seed': seed_no,
    'drop_seed': seed_no,
    'data_random_seed': seed_no,
    'n_jobs':-1,
    'verbose': -1}

# NOTE(review): is_quan_trans is False here, but quantile_transform modifies
# its inputs in place, so if the NN cell ran first with is_quan_trans = True
# the global `train`/`test` passed here are presumably already transformed.
train_scores_folds2, val_scores_folds2, rmspe_val_score2, test_predictions2 = lightgbm_model(train, test, nfolds, no_quantile, params0, seed_no,aggregate_parameters,
                                                                            is_quan_trans = False, add_aggregate_features = True)

In [None]:
def predict(test_predictions, test_predictions2, weight = False, rmspe_val_score = None, rmspe_val_score2 = None):
    """Blend two sets of predictions into a single prediction.

    Parameters
    ----------
    test_predictions, test_predictions2 :
        Predictions of the two models; anything supporting ``+``, ``*`` and
        ``/`` (scalars, numpy arrays, pandas Series).
    weight : bool
        ``False`` (default) returns the plain average of the two inputs.
        ``True`` returns an average weighted by the inverse of each model's
        validation score, so the model with the lower error counts more.
    rmspe_val_score, rmspe_val_score2 :
        Validation scores of the first and second model. Only read when
        ``weight`` is true, in which case both are required.

    Returns
    -------
    The blended predictions, with the same shape as the inputs.

    Raises
    ------
    TypeError
        If ``weight`` is true and either score is ``None``.
    """
    if not weight:
        return (test_predictions + test_predictions2) / 2

    # Fail with an actionable message instead of the opaque
    # "unsupported operand type(s) for /: 'int' and 'NoneType'".
    if rmspe_val_score is None or rmspe_val_score2 is None:
        raise TypeError(
            "predict(weight=True) requires both rmspe_val_score and "
            "rmspe_val_score2 to be provided"
        )

    # Inverse-error weights: a smaller validation error gives a larger weight.
    inv_score = 1 / rmspe_val_score
    inv_score2 = 1 / rmspe_val_score2
    return (inv_score * test_predictions + inv_score2 * test_predictions2) / (inv_score + inv_score2)
# Final blend of the two models' test predictions. With weight=False this is
# the plain average, so the two RMSPE scores are passed along but not used;
# flip weight to True to switch to inverse-error weighting.
test['target'] = predict(
    test_predictions,
    test_predictions2,
    weight=False,
    rmspe_val_score=rmspe_score,
    rmspe_val_score2=rmspe_val_score2,
)

In [None]:
# Write the submission file: only the row_id and target columns, without the
# DataFrame index.
test.to_csv('submission.csv', columns=['row_id', 'target'], index=False)

In [None]:
'''
Log: Add aggregate features in KNN only, [0, 200, 300 ,400], no_quantile = 500, add trade spread
Trial20:Training KNN:0.2001754207356333,  lgbm:0.18897182891288467   KNN:0.2112148779306615,  lgbm:0.20338421694021552,  Public Score: 0.20023
Log: Modified based on new kernel, selected_clusters = [0, 1, 2, 3, 4]
Trial24:Training KNN:0.20202140031573315, lgbm:0.17893973303582508   KNN:0.2102177518533855,  lgbm:0.19493261465281048,  Public Score: 0.19969
Log: selected_clusters = [0, 1, 2, 3, 4, 5, 6]
Trial25:Training KNN:0.20056520864481855, lgbm:0.17862442156084907   KNN:0.21080988078150428, lgbm:0.19457209713111165,  Public Score: 0.19986
Log: add wap3 and wap4 and related log return
Trial26:Training KNN:0.20181446860339824, lgbm:0.17980917806218497   KNN:0.20995577500704568, lgbm:0.1949107281631906,   Public Score: 0.19904
Log: add wap3 and wap4 and related log return, selected_clusters = [0, 1, 3, 4, 6], no spread
Trial27:Training KNN:0.20221566413103273, lgbm:0.17897978725266786   KNN:0.21066830921449906, lgbm:0.19478845606184658,  Public Score: 0.19863
log: size_tau2_d only 400
Trial29:Training KNN:0.2018674377866964,  lgbm:0.17918047277983357   KNN:0.2101467689834155,  lgbm:0.19501864463388588,  Public Score: 0.19880
'''

'''
Fast Trial
Log: Add aggregate features in KNN only, [0, 200, 300 ,400], no_quantile = 500, add trade spread
Trial20:Training KNN:0.21477605084969267,  lgbm:0.2112767456735686   KNN:0.23647947146162088, lgbm:0.232755165366543,    Public Score: 0.20023
Log: Modified based on new kernel, selected_clusters = [0, 1, 2, 3, 4]
        Training KNN:0.21734438390621938,  lgbm:0.17682403414225137  KNN:0.23179466773975121, lgbm:0.22290430972939718,  Public Score: 0.19969
Log: selected_clusters = [0, 1, 2, 3, 4, 5, 6]
        Training KNN:0.21528366042125913,  lgbm:0.17623016138257902  KNN:0.23120720093921826, lgbm:0.22185254969975954
Log: seconds_in_bucket_groups = [0, 100, 200, 300, 400, 500] - > [0, 200, 300, 400]
        Training KNN:0.2149028630710367,   lgbm:0.1755718158339396   KNN:0.2332897817467045,  lgbm:0.2218928427951056
Log: add wap3 and wap4 and related log return
        Training KNN:0.2154105796076588,   lgbm:0.1746446745025871   KNN:0.23236514724791815, lgbm:0.22147884880810506
Log: change book max to spread
        Training KNN:0.21754382246940768,  lgbm:0.1787300836034825   KNN:0.23278519803416653, lgbm:0.2249218172831521
Log: lgbm add_aggregate_features = False, add wap3 and wap4 and related log return
        Training KNN:0.2181941671317213,   lgbm:0.18681903260555696  KNN:0.23353375861830414, lgbm:0.22792100702016171
Log: add wap3 and wap4 and related log return, selected_clusters = [0, 1, 3, 4, 6]
Trial27:Training KNN:0.21642869439818804,  lgbm:0.1742123982976963   KNN:0.2318034023014281,  lgbm:0.2226242440767707,   Public Score: 0.19863
log: get time stock add logreturn3 and 4 (reversed)
        Training KNN:0.21836248916538498,  lgbm:0.17416614843171563  KNN:0.23257533999906102, lgbm:0.22303301056885588
log: selected_features add ['log_return2_realized_volatility', 'trade_return_realized_volatility'] (reversed)
        Training KNN:0.21906564563645076,  lgbm:0.17243249914765277  KNN:0.23425487932426461, lgbm:0.222073366135514
log: selected_clusters = [0, 1, 2, 3, 4, 5, 6]
        Training KNN:0.2176840245308933,   lgbm:0.17674364914662366  KNN:0.23292574600788693, lgbm:0.2225863285558057
log: no_quantile 500 -> 2000
        Training KNN:0.2159818277245494,   lgbm:0.17857807833774592  KNN:0.23232437011450735, lgbm:0.22285963617022936
log: book feature delete logreturn3 and 4
        Training KNN:0.216430252174648,    lgbm:0.1787822359316717   KNN:0.23318504064283288, lgbm:0.2229118038742619
log: trade size and amount replace min max by spread (same as min max)
        Training KNN:0.21642869439818804,  lgbm:0.1742123982976963   KNN:0.2318034023014281,  lgbm:0.2226242440767707,   Public Score: 0.19863
log: order_count max -> spread (same as max)
        Training KNN:0.21642869439818804,  lgbm:0.1742123982976963   KNN:0.2318034023014281,  lgbm:0.2226242440767707,   Public Score: 0.19863
log: trade_feature_dict_time add amount sum (reversed)
        Training KNN:0.21841185989276046,  lgbm:0.1779921805085564   KNN:0.23424890396258807, lgbm:0.2234330406640632
log: vol_groups min max -> spread (reversed)
        Training KNN:0.218744304158386,    lgbm:0.17953000379961834  KNN:0.2334280726091211,  lgbm:0.2237829243514391
log: vol_groups add spread
        Training KNN:0.21701558180105923,  lgbm:0.17572965238162686  KNN:0.23157669125800778, lgbm:0.22317242219920877
log: size_tau2_d only 400
        Training KNN:0.2176399078888894,   lgbm:0.17892425974989284  KNN:0.22989670347607916, lgbm:0.22360151758606328,  Public Score: 0.19880      
log: book_feature_dict_time add 'volume_imbalance':[np.sum] (reversed)
        Training KNN:0.21901899204719757,  lgbm:0.17634415875366674  KNN:0.23469634182000773, lgbm:0.2228152130397425  
log: 'volume_imbalance' add std
        Training KNN:0.2185474348107188,   lgbm:0.17465596873417216  KNN:0.23380060015120852, lgbm:0.22252130655681454 
log: early stopping, 50 -> 100
        Training KNN:0.2176399078888894,   lgbm:0.171026447690265    KNN:0.22989670347607916, lgbm:0.22298375912867413 
hidden_units = (256,128,64)
        Training KNN:0.21648116527683733,  lgbm:0.17892425974989284  KNN:0.22947559570636517, lgbm:0.22360151758606328   
'''
Training RMSPE: 0.21648116527683733 Out of folds RMSPE is 0.22947559570636517