## Stacked LGBM
* KFold split scheme using time_id
* Hyperopt search example (disabled in this notebook: it only runs when `opt` is set to `True`)
* Using different models for different clusters

In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
import lightgbm as lgb

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib


# Output folder for submissions (not referenced again in the visible notebook).
path_submissions = '/'

# Name of the column to predict.
target_name = 'target'
# Placeholder dict for per-fold scores (never filled in the visible notebook).
scores_folds = {}

In [None]:
# Run switches:
#   training - build the train/test feature tables (guards the feature cell).
#   opt      - run the hyperopt parameter search (guards the hyperopt cell).
training = True
opt= False

In [None]:
# Root directory of the competition data; `preprocessor` appends the
# book/trade parquet sub-paths to it.
data_dir = '../input/optiver-realized-volatility-prediction/'

def calc_wap1(df):
    """Return the level-1 weighted average price.

    Each side's price is weighted by the size on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap2(df):
    """Return the level-2 weighted average price.

    Each side's price is weighted by the size on the opposite side:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap3(df):
    """Return the level-1 price average where each side is weighted by its own size.

    (bid_price1 * bid_size1 + ask_price1 * ask_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    same_side_weighted = bid_px * bid_sz + ask_px * ask_sz
    return same_side_weighted / (bid_sz + ask_sz)

def calc_wap4(df):
    """Return the level-2 price average where each side is weighted by its own size.

    (bid_price2 * bid_size2 + ask_price2 * ask_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    same_side_weighted = bid_px * bid_sz + ask_px * ask_sz
    return same_side_weighted / (bid_sz + ask_sz)

def encode_mean(column, df):
    """Return the absolute relative deviation of `column` from its time_id mean.

    For every row: |value - mean(time_id group)| / mean(time_id group).
    The returned Series keeps the name of `column`.
    """
    group_mean = df.groupby('time_id')[column].transform('mean')
    deviation = df[column] - group_mean
    relative = deviation / group_mean
    return np.abs(relative)
    
def log_return(series):
    """Return the log returns of `series`: log(x[t]) - log(x[t-1]).

    Uses log(x / y) = log(x) - log(y); the first element has no predecessor
    and is therefore NaN.
    """
    log_prices = np.log(series)
    return log_prices.diff()

def realized_volatility(series):
    """Return the realized volatility: square root of the sum of squared values."""
    squared = series**2
    total = np.sum(squared)
    return np.sqrt(total)

def count_unique(series):
    """Return the number of distinct values in `series`."""
    distinct = np.unique(series)
    return distinct.size

def read_train_test():
    """Load the base train and test CSV files and add the `row_id` merge key.

    Returns:
        (train, test) DataFrames, each with an extra `row_id` column of the
        form '<stock_id>-<time_id>' used to join book and trade features.
    """
    train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    # Same key construction for both frames.
    for frame in (train, test):
        stock_part = frame['stock_id'].astype(str)
        time_part = frame['time_id'].astype(str)
        frame['row_id'] = stock_part + '-' + time_part
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

def book_preprocessor(file_path):
    """Build per-time_id order-book features for a single stock.

    Args:
        file_path: path of the stock's book parquet partition. The stock id is
            taken from the text following the first '=' in the path.

    Returns:
        DataFrame with one row per time_id present in the file, a `row_id`
        column ('<stock_id>-<time_id>') and the aggregated features. Features
        computed on a late window (seconds_in_bucket >= N) carry the suffix
        '_N'.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    wap_functions = (calc_wap1, calc_wap2, calc_wap3, calc_wap4)
    for level, wap_function in enumerate(wap_functions, start=1):
        df[f'wap{level}'] = wap_function(df)
    # Calculate log returns within each time_id.
    # `transform` keeps the original row index, so the result can be assigned
    # back directly; `apply` returns a group-keyed index on pandas >= 2.0.
    for level in range(1, 5):
        df[f'log_return{level}'] = df.groupby('time_id')[f'wap{level}'].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Aggregations applied to the whole bucket
    create_feature_dict = {
        'wap1': [np.sum, np.std],
        'wap2': [np.sum, np.std],
        'wap3': [np.sum, np.std],
        'wap4': [np.sum, np.std],
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
        'wap_balance': [np.sum, np.max],
        'price_spread':[np.sum, np.max],
        'price_spread2':[np.sum, np.max],
        'bid_spread':[np.sum, np.max],
        'ask_spread':[np.sum, np.max],
        'total_volume':[np.sum, np.max],
        'volume_imbalance':[np.sum, np.max],
        "bid_ask_spread":[np.sum,  np.max],
    }
    # Aggregations applied to the late windows only
    create_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, function) header; the key becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats, then left-merge every late window in the same order
    # as before (500, 400, 300, 200, 100) so the column order is unchanged.
    windows = (500, 400, 300, 200, 100)
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    for window in windows:
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket = window, add_suffix = True)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = f'time_id__{window}')
    # Drop unnecessary time_ids
    df_feature.drop([f'time_id__{window}' for window in windows], axis = 1, inplace = True)

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature

def trade_preprocessor(file_path):
    """Build per-time_id trade features for a single stock.

    Args:
        file_path: path of the stock's trade parquet partition. The stock id
            is taken from the text following the first '=' in the path.

    Returns:
        DataFrame with one row per time_id present in the file, a `row_id`
        column ('<stock_id>-<time_id>') and the aggregated features, all
        prefixed with 'trade_'. Features computed on a late window
        (seconds_in_bucket >= N) carry the suffix '_N'.
    """
    df = pd.read_parquet(file_path)
    # `transform` keeps the original row index, so the result can be assigned
    # back directly; `apply` returns a group-keyed index on pandas >= 2.0.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount']=df['price']*df['size']
    # Aggregations applied to the whole bucket
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.max, np.min],
        'order_count':[np.sum,np.max],
        'amount':[np.sum,np.max,np.min],
    }
    # Aggregations applied to the late windows only
    create_feature_dict_time = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum],
    }
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, function) header; the key becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get the stats for the full bucket and for each late window
    windows = (500, 400, 300, 200, 100)
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    window_features = {
        window: get_stats_window(create_feature_dict_time, seconds_in_bucket = window, add_suffix = True)
        for window in windows
    }

    def tendency(price, vol):
        # Size-weighted sum of percentage price changes between consecutive trades
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)

    # Hand-written per-time_id statistics. Iterating the groupby visits each
    # row once instead of re-filtering the whole frame for every time_id.
    lis = []
    for n_time_id, df_id in df.groupby('time_id', sort=False):
        prices = df_id['price'].values
        sizes = df_id['size'].values
        mean_price = np.mean(prices)
        mean_size = np.mean(sizes)
        price_steps = np.diff(prices)

        tendencyV = tendency(prices, sizes)
        # Number of trades above / below the mean price
        f_max = np.sum(prices > mean_price)
        f_min = np.sum(prices < mean_price)
        # Number of up-ticks / down-ticks
        df_max =  np.sum(price_steps > 0)
        df_min =  np.sum(price_steps < 0)
        # Dispersion of the price
        abs_diff = np.median(np.abs(prices - mean_price))
        energy = np.mean(prices**2)
        iqr_p = np.percentile(prices,75) - np.percentile(prices,25)

        # Same dispersion statistics on the trade size (energy_v is a sum, not a mean)
        abs_diff_v = np.median(np.abs(sizes - mean_size))
        energy_v = np.sum(sizes**2)
        iqr_p_v = np.percentile(sizes,75) - np.percentile(sizes,25)

        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')

    # Merge all windows, in the same order as before so the column order is unchanged
    for window in windows:
        df_feature = df_feature.merge(window_features[window], how = 'left', left_on = 'time_id_', right_on = f'time_id__{window}')
    # Drop unnecessary time_ids
    df_feature.drop(['time_id__500','time_id__400', 'time_id__300', 'time_id__200','time_id','time_id__100'], axis = 1, inplace = True)

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

def get_time_stock(df):
    """Attach per-stock and per-time_id statistics of the volatility features.

    For each volatility column the mean, std, max and min are computed once
    grouped by stock_id (suffix '_stock') and once grouped by time_id
    (suffix '_time'), then left-merged back onto `df`.

    Returns:
        A new DataFrame: the input columns followed by the '_stock' and
        '_time' statistic columns.
    """
    vol_cols = ['log_return1_realized_volatility', 'log_return2_realized_volatility', 'log_return1_realized_volatility_400', 'log_return2_realized_volatility_400', 
                'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300', 'log_return1_realized_volatility_200', 'log_return2_realized_volatility_200', 
                'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_400', 'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_200']
    statistics = ['mean', 'std', 'max', 'min']

    def _group_stats(key, suffix):
        # Aggregate per `key`, flatten the two-level header and tag the columns.
        grouped = df.groupby([key])[vol_cols].agg(statistics).reset_index()
        grouped.columns = ['_'.join(col) for col in grouped.columns]
        return grouped.add_suffix('_' + suffix)

    # Statistics over all time_ids of a stock
    per_stock = _group_stats('stock_id', 'stock')
    # Statistics over all stocks of a time_id
    per_time = _group_stats('time_id', 'time')

    # Merge with original dataframe, then drop the duplicated key columns
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    merged.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    return merged
    
def preprocessor(list_stock_ids, is_train = True):
    """Run the book and trade preprocessing for every stock id in parallel.

    Args:
        list_stock_ids: iterable of stock ids to process.
        is_train: read the train parquet folders when True, the test ones
            otherwise.

    Returns:
        One DataFrame with the book features of all stocks, left-joined with
        their trade features on `row_id`.
    """
    if is_train:
        book_folder = "book_train.parquet/stock_id="
        trade_folder = "trade_train.parquet/stock_id="
    else:
        book_folder = "book_test.parquet/stock_id="
        trade_folder = "trade_test.parquet/stock_id="

    # Work done for a single stock id inside a joblib worker
    def for_joblib(stock_id):
        file_path_book = data_dir + book_folder + str(stock_id)
        file_path_trade = data_dir + trade_folder + str(stock_id)
        # Book features are the left side: a stock bucket without trades keeps its row
        book_features = book_preprocessor(file_path_book)
        trade_features = trade_preprocessor(file_path_trade)
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # One task per stock id, using all available cores
    tasks = (delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    # Concatenate all the dataframes that return from Parallel
    return pd.concat(per_stock_frames, ignore_index = True)

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of `y_pred` against `y_true`."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Returns the (name, value, is_higher_better) tuple; the last element is
    False because a lower RMSPE is better.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

In [None]:
# Build the full train/test feature tables. Runs only when `training` is True;
# it defines the globals `train`, `test` and the cluster list `l` used later.
if training:
    train, test = read_train_test()

    # Get unique stock ids 
    train_stock_ids = train['stock_id'].unique()
    # Preprocess them using Parallel and our single stock id functions
    train_ = preprocessor(train_stock_ids, is_train = True)
    train = train.merge(train_, on = ['row_id'], how = 'left')

    # Get unique stock ids 
    test_stock_ids = test['stock_id'].unique()
    # Preprocess them using Parallel and our single stock id functions
    test_ = preprocessor(test_stock_ids, is_train = False)
    test = test.merge(test_, on = ['row_id'], how = 'left')
    
    # Columns for which the relative deviation from the time_id mean is added
    columns_to_encode = ['wap1_sum', 'wap2_sum', 'wap3_sum', 'wap4_sum', 'log_return1_realized_volatility', 'log_return2_realized_volatility',
                         'log_return3_realized_volatility', 'log_return4_realized_volatility', 'wap_balance_sum', 'price_spread_sum',
                         'price_spread2_sum', 'bid_spread_sum', 'ask_spread_sum', 'total_volume_sum', 
                         'volume_imbalance_sum',  'bid_ask_spread_sum', 'trade_log_return_realized_volatility',
                         'trade_seconds_in_bucket_count_unique', 'trade_size_sum', 'trade_order_count_sum',
                         'trade_amount_sum', 'trade_tendency', 'trade_f_max','trade_df_max', 'trade_abs_diff',
                         'trade_energy', 'trade_iqr_p', 'trade_abs_diff_v', 'trade_energy_v', 'trade_iqr_p_v']
    
    # One encode_mean task per column; each returns a Series named after its column
    df_aux = Parallel(n_jobs = -1, verbose = 1)(delayed(encode_mean)(column, train) for column in columns_to_encode)
    # Append the encoded columns as '<column>_timeid_encoded'
    train = pd.concat([train]+[x.rename(x.name + '_timeid_encoded') for x in df_aux], axis=1)
    del(df_aux)
    
    # Same encoding for the test set
    df_aux = Parallel(n_jobs = -1, verbose = 1)(delayed(encode_mean)(column, test) for column in columns_to_encode)
    # Append the encoded columns as '<column>_timeid_encoded'
    test = pd.concat([test]+[x.rename(x.name + '_timeid_encoded') for x in df_aux], axis=1)
    del(df_aux)
    
    # Get group stats of time_id and stock_id
    train = get_time_stock(train)
    test = get_time_stock(test)
    # size_tau*: sqrt(1 / number of distinct seconds with a trade), for the
    # full bucket and for the late windows starting at 400 / 300 / 200 seconds
    train['size_tau'] = np.sqrt( 1/ train['trade_seconds_in_bucket_count_unique'] )
    test['size_tau'] = np.sqrt( 1/ test['trade_seconds_in_bucket_count_unique'] )
    train['size_tau_400'] = np.sqrt( 1/ train['trade_seconds_in_bucket_count_unique_400'] )
    test['size_tau_400'] = np.sqrt( 1/ test['trade_seconds_in_bucket_count_unique_400'] )
    train['size_tau_300'] = np.sqrt( 1/ train['trade_seconds_in_bucket_count_unique_300'] )
    test['size_tau_300'] = np.sqrt( 1/ test['trade_seconds_in_bucket_count_unique_300'] )
    train['size_tau_200'] = np.sqrt( 1/ train['trade_seconds_in_bucket_count_unique_200'] )
    test['size_tau_200'] = np.sqrt( 1/ test['trade_seconds_in_bucket_count_unique_200'] )
    # size_tau2*: sqrt(c / total order count). NOTE(review): the _400/_300/_200
    # variants all use the full-bucket 'trade_order_count_sum' with a constant
    # factor (0.33 / 0.5 / 0.66), not the windowed order counts - confirm intent.
    train['size_tau2'] = np.sqrt( 1/ train['trade_order_count_sum'] )
    test['size_tau2'] = np.sqrt( 1/ test['trade_order_count_sum'] )
    train['size_tau2_400'] = np.sqrt( 0.33/ train['trade_order_count_sum'] )
    test['size_tau2_400'] = np.sqrt( 0.33/ test['trade_order_count_sum'] )
    train['size_tau2_300'] = np.sqrt( 0.5/ train['trade_order_count_sum'] )
    test['size_tau2_300'] = np.sqrt( 0.5/ test['trade_order_count_sum'] )
    train['size_tau2_200'] = np.sqrt( 0.66/ train['trade_order_count_sum'] )
    test['size_tau2_200'] = np.sqrt( 0.66/ test['trade_order_count_sum'] )
    train['size_tau2_d'] = train['size_tau2_400'] - train['size_tau2']
    test['size_tau2_d'] = test['size_tau2_400'] - test['size_tau2']

    
    # making agg features

    # Re-read the raw targets and lay them out as a time_id x stock_id matrix
    train_p = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

    # Kendall correlation between the target series of every pair of stocks
    corr = train_p.corr(method='kendall')

    # Row labels of the correlation matrix, i.e. the stock ids
    ids = corr.index

    # Cluster the stocks into 5 groups using their correlation rows as features
    kmeans = KMeans(n_clusters=5, random_state=0).fit(corr.values)
    print(kmeans.labels_) 

    # l[n] = list of stock ids assigned to cluster n. The ids are shifted by +1
    # before being multiplied by the membership mask so that stock id 0 is not
    # removed by the `x > 0` filter, then shifted back by -1.
    l = []
    for n in range(5):
        l.append ( [ (x-1) for x in ( (ids+1)*(kmeans.labels_ == n)) if x > 0] )

    

    # Per-cluster frames averaged over the cluster's stocks, one row per time_id
    mat = []
    matTest = []

    n = 0
    for ind in l:
        print(ind)
        # Mean of every column over the stocks of this cluster, per time_id.
        # NOTE(review): the frame still holds the string column 'row_id'; how
        # agg(np.nanmean) treats it depends on the pandas version - confirm.
        newDf = train.loc[train['stock_id'].isin(ind) ]
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        # Replace the (averaged) stock_id by the cluster label '<n>c1'
        newDf.loc[:,'stock_id'] = str(n)+'c1'
        mat.append ( newDf )

        newDf = test.loc[test['stock_id'].isin(ind) ]    
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        newDf.loc[:,'stock_id'] = str(n)+'c1'
        matTest.append ( newDf )

        n+=1

    mat1 = pd.concat(mat).reset_index()
    mat1.drop(columns=['target'],inplace=True)
    mat2 = pd.concat(matTest).reset_index()
    # Append the train cluster rows of time_id 5 to the test aggregates.
    # NOTE(review): presumably so that the pivot below yields a column for
    # every cluster even when the test set lacks some of them - confirm.
    mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])
    # Pivot to one row per time_id with columns '<feature>_<cluster label>'
    mat1 = mat1.pivot(index='time_id', columns='stock_id')
    mat1.columns = ["_".join(x) for x in mat1.columns.ravel()]
    mat1.reset_index(inplace=True)
    mat2 = mat2.pivot(index='time_id', columns='stock_id')
    mat2.columns = ["_".join(x) for x in mat2.columns.ravel()]
    mat2.reset_index(inplace=True)
    # Cluster-level columns that are merged back onto every row by time_id
    nnn = ['time_id',
         'log_return1_realized_volatility_0c1',
         'log_return1_realized_volatility_1c1',     
         'log_return1_realized_volatility_3c1',
         'log_return1_realized_volatility_4c1',     
         'log_return1_realized_volatility_2c1',
         'total_volume_sum_0c1',
         'total_volume_sum_1c1', 
         'total_volume_sum_3c1',
         'total_volume_sum_4c1', 
         'total_volume_sum_2c1',
         'trade_size_sum_0c1',
         'trade_size_sum_1c1', 
         'trade_size_sum_3c1',
         'trade_size_sum_4c1', 
         'trade_size_sum_2c1',
         'trade_order_count_sum_0c1',
         'trade_order_count_sum_1c1',
         'trade_order_count_sum_3c1',
         'trade_order_count_sum_4c1',
         'trade_order_count_sum_2c1',      
         'price_spread_sum_0c1',
         'price_spread_sum_1c1',
         'price_spread_sum_3c1',
         'price_spread_sum_4c1',
         'price_spread_sum_2c1',   
         'bid_spread_sum_0c1',
         'bid_spread_sum_1c1',
         'bid_spread_sum_3c1',
         'bid_spread_sum_4c1',
         'bid_spread_sum_2c1',       
         'ask_spread_sum_0c1',
         'ask_spread_sum_1c1',
         'ask_spread_sum_3c1',
         'ask_spread_sum_4c1',
         'ask_spread_sum_2c1',   
         'volume_imbalance_sum_0c1',
         'volume_imbalance_sum_1c1',
         'volume_imbalance_sum_3c1',
         'volume_imbalance_sum_4c1',
         'volume_imbalance_sum_2c1',       
         'bid_ask_spread_sum_0c1',
         'bid_ask_spread_sum_1c1',
         'bid_ask_spread_sum_3c1',
         'bid_ask_spread_sum_4c1',
         'bid_ask_spread_sum_2c1',
         'size_tau2_0c1',
         'size_tau2_1c1',
         'size_tau2_3c1',
         'size_tau2_4c1',
         'size_tau2_2c1'] 
    train = pd.merge(train,mat1[nnn],how='left',on='time_id')
    test = pd.merge(test,mat2[nnn],how='left',on='time_id')

In [None]:
# First version of the list of feature columns to discard.
# NOTE(review): the next cell assigns `drop_columns` again before it is used,
# so when the cells run in order this list has no effect.
drop_columns = ['wap1_sum',
 'wap2_sum',
 'wap3_sum',
 'wap4_sum',
 'bid_spread_sum',
 'ask_spread_sum',
 'ask_spread_amax',
 'total_volume_sum',
 'total_volume_amax',
 'volume_imbalance_sum',
 'volume_imbalance_amax',
 'bid_ask_spread_sum',
 'trade_seconds_in_bucket_count_unique',
 'trade_size_sum',
 'trade_size_amax',
 'trade_size_amin',
 'trade_order_count_sum',
 'trade_order_count_amax',
 'trade_amount_sum',
 'trade_amount_amax',
 'trade_amount_amin',
 'trade_tendency',
 'trade_f_max',
 'trade_f_min',
 'trade_df_max',
 'trade_df_min',
 'trade_abs_diff',
 'trade_energy',
 'trade_iqr_p',
 'trade_abs_diff_v',
 'trade_energy_v',
 'trade_iqr_p_v',
 'trade_seconds_in_bucket_count_unique_500',
 'trade_size_sum_500',
 'trade_order_count_sum_500',
 'trade_seconds_in_bucket_count_unique_400',
 'trade_size_sum_400',
 'trade_order_count_sum_400',
 'trade_seconds_in_bucket_count_unique_300',
 'trade_size_sum_300',
 'trade_order_count_sum_300',
 'trade_seconds_in_bucket_count_unique_200',
 'trade_size_sum_200',
 'trade_order_count_sum_200',
 'trade_seconds_in_bucket_count_unique_100',
 'trade_size_sum_100',
 'trade_order_count_sum_100',
 'wap1_sum_timeid_encoded',
 'wap2_sum_timeid_encoded',
 'wap3_sum_timeid_encoded',
 'wap4_sum_timeid_encoded',
 'log_return1_realized_volatility_timeid_encoded',
 'log_return2_realized_volatility_timeid_encoded',
 'log_return3_realized_volatility_timeid_encoded',
 'log_return4_realized_volatility_timeid_encoded',
 'wap_balance_sum_timeid_encoded',
 'price_spread_sum_timeid_encoded',
 'price_spread2_sum_timeid_encoded',
 'bid_spread_sum_timeid_encoded',
 'ask_spread_sum_timeid_encoded',
 'total_volume_sum_timeid_encoded',
 'volume_imbalance_sum_timeid_encoded',
 'bid_ask_spread_sum_timeid_encoded',
 'trade_log_return_realized_volatility_timeid_encoded',
 'trade_seconds_in_bucket_count_unique_timeid_encoded',
 'trade_size_sum_timeid_encoded',
 'trade_order_count_sum_timeid_encoded',
 'trade_amount_sum_timeid_encoded',
 'trade_tendency_timeid_encoded',
 'trade_f_max_timeid_encoded',
 'trade_df_max_timeid_encoded',
 'trade_abs_diff_timeid_encoded',
 'trade_energy_timeid_encoded',
 'trade_iqr_p_timeid_encoded',
 'trade_abs_diff_v_timeid_encoded',
 'trade_energy_v_timeid_encoded',
 'trade_iqr_p_v_timeid_encoded',
 'size_tau',
 'size_tau_400',
 'size_tau_300',
 'size_tau_200',
 'size_tau2',
 'size_tau2_400',
 'size_tau2_300',
 'size_tau2_200',
 'size_tau2_d']

In [None]:
# Feature columns removed from train/test before modelling.
# NOTE(review): the '*_6c1' entries name a cluster index 6, but the clustering
# cell only builds labels '0c1'..'4c1' (5 clusters); those names never match a
# column and are skipped because the drop is called with errors='ignore'.
drop_columns =['wap1_sum',
 'wap2_sum',
 'wap2_std',
 'wap3_sum',
 'wap4_sum',
 'bid_spread_sum',
 'ask_spread_sum',
 'ask_spread_amax',
 'total_volume_sum',
 'total_volume_amax',
 'volume_imbalance_sum',
 'volume_imbalance_amax',
 'bid_ask_spread_sum',
 'trade_seconds_in_bucket_count_unique',
 'trade_size_sum',
 'trade_size_amax',
 'trade_size_amin',
 'trade_order_count_sum',
 'trade_order_count_amax',
 'trade_amount_sum',
 'trade_amount_amax',
 'trade_amount_amin',
 'trade_tendency',
 'trade_f_max',
 'trade_f_min',
 'trade_df_max',
 'trade_df_min',
 'trade_abs_diff',
 'trade_energy',
 'trade_iqr_p',
 'trade_abs_diff_v',
 'trade_energy_v',
 'trade_iqr_p_v',
 'trade_seconds_in_bucket_count_unique_500',
 'trade_size_sum_500',
 'trade_order_count_sum_500',
 'trade_seconds_in_bucket_count_unique_400',
 'trade_size_sum_400',
 'trade_order_count_sum_400',
 'trade_seconds_in_bucket_count_unique_300',
 'trade_size_sum_300',
 'trade_order_count_sum_300',
 'trade_seconds_in_bucket_count_unique_200',
 'trade_size_sum_200',
 'trade_order_count_sum_200',
 'trade_seconds_in_bucket_count_unique_100',
 'trade_size_sum_100',
 'trade_order_count_sum_100',
 'wap1_sum_timeid_encoded',
 'wap2_sum_timeid_encoded',
 'wap3_sum_timeid_encoded',
 'wap4_sum_timeid_encoded',
 'log_return1_realized_volatility_timeid_encoded',
 'log_return2_realized_volatility_timeid_encoded',
 'log_return3_realized_volatility_timeid_encoded',
 'log_return4_realized_volatility_timeid_encoded',
 'wap_balance_sum_timeid_encoded',
 'price_spread_sum_timeid_encoded',
 'price_spread2_sum_timeid_encoded',
 'bid_spread_sum_timeid_encoded',
 'ask_spread_sum_timeid_encoded',
 'total_volume_sum_timeid_encoded',
 'volume_imbalance_sum_timeid_encoded',
 'bid_ask_spread_sum_timeid_encoded',
 'trade_log_return_realized_volatility_timeid_encoded',
 'trade_seconds_in_bucket_count_unique_timeid_encoded',
 'trade_size_sum_timeid_encoded',
 'trade_order_count_sum_timeid_encoded',
 'trade_amount_sum_timeid_encoded',
 'trade_tendency_timeid_encoded',
 'trade_f_max_timeid_encoded',
 'trade_df_max_timeid_encoded',
 'trade_abs_diff_timeid_encoded',
 'trade_energy_timeid_encoded',
 'trade_iqr_p_timeid_encoded',
 'trade_abs_diff_v_timeid_encoded',
 'trade_energy_v_timeid_encoded',
 'trade_iqr_p_v_timeid_encoded',
 'trade_log_return_realized_volatility_400_min_stock',
 'size_tau',
 'size_tau_400',
 'size_tau_300',
 'size_tau_200',
 'size_tau2',
 'size_tau2_400',
 'size_tau2_300',
 'size_tau2_200',
 'size_tau2_d',
 'total_volume_sum_0c1',
 'total_volume_sum_1c1',
 'total_volume_sum_3c1',
 'total_volume_sum_4c1',
 'total_volume_sum_6c1',
 'trade_size_sum_0c1',
 'trade_size_sum_1c1',
 'trade_order_count_sum_0c1',
 'trade_order_count_sum_1c1',
 'volume_imbalance_sum_0c1',
 'volume_imbalance_sum_1c1',
 'volume_imbalance_sum_3c1',
 'volume_imbalance_sum_4c1',
 'volume_imbalance_sum_6c1',
 'size_tau2_0c1',
 'size_tau2_1c1',
 'size_tau2_4c1',
 'size_tau2_6c1']

In [None]:
# Remove the discarded features in place; names that are not present in a
# frame are skipped instead of raising.
train.drop(columns=drop_columns, inplace=True, errors='ignore')
test.drop(columns=drop_columns, inplace=True, errors='ignore')

In [None]:
# Names of the remaining feature columns (identifiers and target excluded).
colNames = [
    col
    for col in train.columns
    if col not in ("stock_id", "time_id", "target", "row_id")
]

In [None]:
from sklearn.model_selection import KFold


# Feature matrix and target taken from the training table. `stock_id` is kept
# in X; only the identifiers row_id/time_id and the target are removed.
X = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']

# Matching feature matrix for the test table (only built when `training`).
if training:
    X_test=test.copy()
    X_test.drop(['time_id','row_id'], axis=1,inplace=True)


# Seed used by the parameter dict built in train_and_optimize_lgb.
seed0=2021
# params0 = {
#     'objective': 'rmse',
#     'boosting_type': 'gbdt',
#     'max_depth': -1,
#     'max_bin':100,
#     'min_data_in_leaf':500,
#     'learning_rate': 0.05,
#     'subsample': 0.72,
#     'subsample_freq': 4,
#     'feature_fraction': 0.5,
#     'lambda_l1': 0.5,
#     'lambda_l2': 1.0,
#     'categorical_column':[0],
#     'seed':seed0,
#     'feature_fraction_seed': seed0,
#     'bagging_seed': seed0,
#     'drop_seed': seed0,
#     'data_random_seed': seed0,
#     'n_jobs':-1,
#     'verbose': -1}
# seed1=42
# params1 = {
#         'learning_rate': 0.1,        
#         'lambda_l1': 2,
#         'lambda_l2': 7,
#         'num_leaves': 800,
#         'min_sum_hessian_in_leaf': 20,
#         'feature_fraction': 0.8,
#         'feature_fraction_bynode': 0.8,
#         'bagging_fraction': 0.9,
#         'bagging_freq': 42,
#         'min_data_in_leaf': 700,
#         'max_depth': 4,
#         'categorical_column':[0],
#         'seed': seed1,
#         'feature_fraction_seed': seed1,
#         'bagging_seed': seed1,
#         'drop_seed': seed1,
#         'data_random_seed': seed1,
#         'objective': 'rmse',
#         'boosting': 'gbdt',
#         'verbosity': -1,
#         'n_jobs':-1,
#     }


# LightGBM parameters passed to train_and_evaluate_lgb.
# NOTE(review): the fractional values look like the output of a hyperopt run -
# confirm their origin. No 'objective' or seed key is set here, unlike the
# dict built inside train_and_optimize_lgb.
params = {'feature_fraction': 0.5031513938037354,
          'lambda_l1': 6.342730329515877, 
          'lambda_l2': 0.2008158055008158, 
          'learning_rate': 0.09837361881954568, 
          'max_bin': 100, 
          'max_depth': 4, 
          'min_data_in_leaf': 400, 
          'subsample': 0.8,
          'num_leaves': 20,
          'subsample_freq': 1,
         'verbose':-1}

In [None]:
# Standardise every column of X and apply the same (train-fitted) scaling to
# the matching column of X_test. Every column of X is processed, including
# `stock_id`.
# NOTE(review): the training functions below read their data from `train` and
# `test`, not from X / X_test, so this scaling does not appear to reach the
# models - confirm.
for col in X.columns:
    scaler = StandardScaler().fit(X[col].values.reshape(-1, 1))
    X[col] = scaler.transform(X[col].values.reshape(-1, 1))
    X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

In [None]:
# Notebook inspection: display the per-cluster stock id lists built in the
# feature cell (only defined when that cell ran with `training` True).
l

In [None]:
# Root mean squared percentage error (redefined here; same formula as above)
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of `y_pred` against `y_true`."""
    pct_error = (y_true - y_pred) / y_true
    squared = np.square(pct_error)
    return np.sqrt(np.mean(squared))

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function used for early stopping on RMSPE.

    Returns the (name, value, is_higher_better) tuple; the last element is
    False because a lower RMSPE is better.
    """
    metric_value = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', metric_value, False

def train_and_optimize_lgb(p):
    """Hyperopt objective: cross-validated RMSPE of LightGBM for parameters `p`.

    Reads the module-level `train`, `X`, `y` and `seed0`. The unique time_ids
    are split into 5 consecutive chunks; each chunk is used once as the
    validation fold, so a time_id never appears on both sides of a split.

    Args:
        p: dict with the keys max_depth, max_bin, min_data_in_leaf,
            learning_rate, subsample, subsample_freq, feature_fraction,
            lambda_l1 and lambda_l2.

    Returns:
        RMSPE of the out-of-fold predictions over the whole training set.
    """
    # Hyperparameters (just basic)
    print(p)
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'max_depth': p['max_depth'],
        'max_bin':p['max_bin'],
        'min_data_in_leaf': p['min_data_in_leaf'],
        'learning_rate': p['learning_rate'],
        'subsample': p['subsample'],
        'subsample_freq': p['subsample_freq'],
        'feature_fraction': p['feature_fraction'],
        'lambda_l1': p['lambda_l1'],
        'lambda_l2': p['lambda_l2'],
        # NOTE(review): `features` below excludes stock_id, so column 0 is the
        # first remaining feature rather than stock_id - confirm this is the
        # column meant to be treated as categorical.
        'categorical_column':[0],
        'seed':seed0,
        'feature_fraction_seed': seed0,
        'bagging_seed': seed0,
        'drop_seed': seed0,
        'data_random_seed': seed0,
        'n_jobs':-1,
        'verbose': -1}
    # Column names come from X; the values themselves are read from `train`
    features = [col for col in X.columns if col not in {"time_id", "target", "row_id", "stock_id"}]
    # Create out of folds array
    oof_predictions = np.zeros(X.shape[0])
    time_ids_split = np.array_split(train.time_id.unique(),5)

    # Iterate through each fold
    for fold, fold_time_ids in enumerate(time_ids_split):
        print(f'Training fold {fold + 1}')

        val_ind = train.time_id.isin(fold_time_ids)
        trn_ind = ~val_ind
        x_train, x_val = train.loc[trn_ind], train.loc[val_ind]
        y_train, y_val = y.loc[trn_ind], y.loc[val_ind]

        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train[features], y_train, weight = train_weights)
        val_dataset = lgb.Dataset(x_val[features], y_val, weight = val_weights)

        model = lgb.train(params = params,
                          num_boost_round=1200,
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, val_dataset], 
                          verbose_eval = 300,
                          early_stopping_rounds=50,
                          feval = feval_rmspe)

        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val[features])
        # Running score; folds not predicted yet still hold zeros at this point
        print(rmspe(y, oof_predictions))
    return rmspe(y, oof_predictions)


def train_and_evaluate_lgb(train, test, params, clusters=None):
    """Train one set of fold models per stock cluster and predict the test set.

    For every cluster, the training rows of its stocks are split into 5 folds
    by time_id; one LightGBM model is trained per fold and the test
    predictions of the cluster's stocks are the average over the fold models.

    Args:
        train: training table with stock_id, time_id, target and the features.
        test: test table with stock_id and the same feature columns.
        params: LightGBM parameter dict.
        clusters: list of stock id lists, one per cluster. Defaults to the
            module-level list `l` built by the clustering cell.

    Returns:
        numpy array of predictions aligned with the rows of `test`. Rows whose
        stock_id belongs to no cluster keep the value 0.
    """
    if clusters is None:
        clusters = l

    features = [col for col in train.columns if col not in {"time_id", "target", "row_id", "stock_id"}]
    # array_split (unlike split) also works when the number of unique time_ids
    # is not divisible by 5; the chunks are identical when it is.
    time_ids_split = np.array_split(train.time_id.unique(),5)
    n_folds = len(time_ids_split)

    test_predictions = np.zeros(test.shape[0])

    for cluster_stock_ids in clusters:
        print(f'{cluster_stock_ids} stock')
        test_mask = test.stock_id.isin(cluster_stock_ids)
        X_test_stock = test[test_mask]
        X_stock = train[train.stock_id.isin(cluster_stock_ids)]
        for fold, fold_time_ids in enumerate(time_ids_split):
            val_ind = X_stock.time_id.isin(fold_time_ids)
            trn_ind = ~val_ind

            x_train, x_val = X_stock.loc[trn_ind, features], X_stock.loc[val_ind, features]
            y_train, y_val = X_stock.loc[trn_ind, 'target'], X_stock.loc[val_ind, 'target']

            # Root mean squared percentage error weights
            train_weights = 1 / np.square(y_train)
            val_weights = 1 / np.square(y_val)
            train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights)
            val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights)

            model = lgb.train(params = params,
                              num_boost_round=1200,
                              train_set = train_dataset, 
                              valid_sets = [train_dataset, val_dataset], 
                              verbose_eval = 250,
                              early_stopping_rounds=50,
                              feval = feval_rmspe)

            # Report the validation score of this fold
            rmspe_fold = rmspe(y_val, model.predict(x_val))
            print(f'For fold {fold}, stock{cluster_stock_ids}: {rmspe_fold}')
            # Accumulate the fold-averaged prediction for this cluster's test rows
            if not X_test_stock.empty:
                test_predictions[test_mask] += model.predict(X_test_stock[features])/n_folds

    return test_predictions

In [None]:
# Scratch value; not referenced anywhere else in the visible notebook.
a = [[1,2,3,4],[4,5,6,7,]]

In [None]:
# Optional hyperparameter search with hyperopt's TPE algorithm; skipped unless
# `opt` is True. Each evaluation calls train_and_optimize_lgb, which runs a
# full 5-fold training.
if opt:
    # Search space: uniform ranges; scope.int casts the sampled value to an
    # integer for the parameters that must be integers.
    param_space = {
        'max_depth': scope.int(hp.uniform('max_depth', 4, 20)),
        'max_bin': scope.int(hp.uniform('max_bin', 40, 400)),
        'min_data_in_leaf': scope.int(hp.uniform('min_data_in_leaf', 100, 2000)),
        'learning_rate': hp.uniform('learning_rate',0.01,0.1),
        'subsample': hp.uniform('subsample', 0.3, 0.9),
        'subsample_freq': scope.int(hp.uniform('subsample_freq',1,30)),
        'feature_fraction': hp.uniform('feature_fraction',0.5, 0.9),
        'lambda_l1': hp.uniform('lambda_l1',0.1,10),
        'lambda_l2': hp.uniform('lambda_l2',0.1,10)
    }
    
    # Keeps the history of evaluated parameter sets and their scores
    trials = Trials()

    # Minimise the out-of-fold RMSPE over 100 evaluations
    hopt = fmin(fn = train_and_optimize_lgb, 
                space = param_space, 
                algo = tpe.suggest, 
                max_evals = 100, 
                trials = trials
               )
    print(hopt)

In [None]:
# Train the per-cluster fold models and collect the averaged test predictions.
# Needs `train`, `test` and the cluster list `l` from the feature cell.
predictions_lgb= train_and_evaluate_lgb(train, test,params)

In [None]:
# Write the submission file: one row per test row_id with its prediction.
target_name='target'
# Rebuild the '<stock_id>-<time_id>' key and attach the predictions, which are
# aligned with the rows of `test`.
test["row_id"] = test["stock_id"].astype(str) + "-" + test["time_id"].astype(str) 
test[target_name] = predictions_lgb


# Preview the first rows in the notebook, then save without the index column.
display(test[['row_id', target_name]].head(3))
test[['row_id', target_name]].to_csv('submission.csv',index = False)