# Setup

## Links

* https://www.kaggle.com/tommy1028/lightgbm-starter-with-feature-engineering-idea (some feature engineering ideas... with lightgbm/crossvalidation for prediction)
* https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data/ (official tutorial notebook)
* https://www.kaggle.com/lucasmorin/feature-engineering-aggregation-functions (feature engineering, lists many functions)
* https://www.kaggle.com/ragnar123/optiver-realized-volatility-lgbm-baseline (group features by seconds in bucket)
* https://www.kaggle.com/endremoen/feature-engineering-optimal-clustering (overview of clustering methods)
* https://www.kaggle.com/hiromasatabuchi/baseline-simple-flow-with-lightgbm (good starting point)
* https://www.kaggle.com/monolith0456/2xlgbm-fnn-ensemble (0.19... score)
* https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/263308 (note groupkfold instead of kfold for better CV error, point 2)

In [None]:
import psutil
# Report the share of system memory currently in use.
# The value is taken from index 2 (the third field) of psutil.virtual_memory().
print('RAM memory % used:', psutil.virtual_memory()[2])

In [None]:
import pandas as pd
import numpy as np
from numpy.random import default_rng

import os, glob
from pathlib import Path
from pprint import pprint

from time import perf_counter as p_f
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold, KFold

from scipy import stats

import matplotlib.pyplot as plt

from lightgbm import LGBMRegressor
import lightgbm as lgb

## Set path to data

In [None]:
# Root directory of the competition data.
# The trailing slash matters: some readers append a file name directly
# (e.g. data_path + 'train.csv'), others build f"{data_path}/..." paths.
data_path = "../input/optiver-realized-volatility-prediction/"

## Read example timeseries

In [None]:
# Load the order-book data of a single stock as an example time series.
time_series = pd.read_parquet(data_path + '/book_train.parquet/stock_id=0')

time_series.head()

# Keep only the rows with time_id < 100. The explicit copy makes the result an
# independent frame, so the column assignment and the in-place drop below do
# not operate on a slice of the loaded data (avoids SettingWithCopyWarning).
time_series = time_series[time_series['time_id']<100].copy()

# Build a "<stock_id>-<time_id>" identifier (stock id is 0 here) ...
time_series['row_id'] = [f"0-{t_id}" for t_id in time_series['time_id']]

# ... and drop the raw time_id, which is now encoded in row_id.
time_series.drop(['time_id'], axis=1, inplace=True)

time_series.info()

# Define functions

## Calculate different statistics

In [None]:
def calc_wap1(box):
    """Return the level-1 weighted average price of `box`.

    Each price is weighted by the size quoted on the opposite side of the
    book. `box` must provide 'bid_price1', 'ask_price1', 'bid_size1' and
    'ask_size1' (scalars or element-wise compatible columns).
    """
    bid_px, ask_px = box['bid_price1'], box['ask_price1']
    bid_sz, ask_sz = box['bid_size1'], box['ask_size1']

    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap2(box):
    """Return the level-2 weighted average price of `box`.

    Each price is weighted by the size quoted on the opposite side of the
    book. `box` must provide 'bid_price2', 'ask_price2', 'bid_size2' and
    'ask_size2' (scalars or element-wise compatible columns).
    """
    bid_px, ask_px = box['bid_price2'], box['ask_price2']
    bid_sz, ask_sz = box['bid_size2'], box['ask_size2']

    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap3(box):
    """Return the level-1 size-weighted mean of bid and ask price.

    Unlike calc_wap1, each price is weighted by the size on its OWN side of
    the book. `box` must provide 'bid_price1', 'ask_price1', 'bid_size1' and
    'ask_size1' (scalars or element-wise compatible columns).
    """
    bid_px, ask_px = box['bid_price1'], box['ask_price1']
    bid_sz, ask_sz = box['bid_size1'], box['ask_size1']

    same_side_weighted = bid_px * bid_sz + ask_px * ask_sz
    return same_side_weighted / (bid_sz + ask_sz)

def calc_wap4(box):
    """Return the level-2 size-weighted mean of bid and ask price.

    Unlike calc_wap2, each price is weighted by the size on its OWN side of
    the book. `box` must provide 'bid_price2', 'ask_price2', 'bid_size2' and
    'ask_size2' (scalars or element-wise compatible columns).
    """
    bid_px, ask_px = box['bid_price2'], box['ask_price2']
    bid_sz, ask_sz = box['bid_size2'], box['ask_size2']

    same_side_weighted = bid_px * bid_sz + ask_px * ask_sz
    return same_side_weighted / (bid_sz + ask_sz)

# log return
def log_return(list_stock_prices, fillna = True):
    """Return the successive differences of the log prices.

    list_stock_prices: pandas Series of prices.
    fillna: when true, missing values in the result (such as the first entry,
        which has no predecessor) are replaced by the mean of the returns.
    """
    returns = np.log(list_stock_prices).diff()
    if not fillna:
        return returns
    return returns.fillna(np.mean(returns))

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return**2
    return np.sqrt(np.sum(squared_returns))

def n_unique(series):
    """Return the number of distinct values in `series`."""
    distinct_values = np.unique(series)
    return distinct_values.size

#f_max = np.sum(df_id['price'].values > np.mean(df_id['price'].values))
def n_above_mean(x):
    """Return how many entries of `x` are strictly greater than its mean."""
    above = np.where(x > np.mean(x))
    return above[0].size

#f_min = np.sum(df_id['price'].values < np.mean(df_id['price'].values))
def n_below_mean(x):
    """Return how many entries of `x` are strictly smaller than its mean."""
    below = np.where(x < np.mean(x))
    return below[0].size

#df_max =  np.sum(np.diff(df_id['price'].values) > 0)
def n_pos_diff(x):
    """Return the number of consecutive steps in `x` that go up."""
    steps = np.diff(x)
    return (steps > 0).sum()

#df_min =  np.sum(np.diff(df_id['price'].values) < 0)
def n_neg_diff(x):
    """Return the number of consecutive steps in `x` that go down."""
    steps = np.diff(x)
    return (steps < 0).sum()

#abs_diff = np.median(np.abs( df_id['price'].values - np.mean(df_id['price'].values)))
#abs_diff_v = np.median(np.abs( df_id['size'].values - np.mean(df_id['size'].values)))  
def med_abs_diff(x):
    """Return the median absolute deviation of `x` from its mean."""
    deviations = np.abs(x - np.mean(x))
    return np.median(deviations)

#iqr_p_v = np.percentile(df_id['size'].values,75) - np.percentile(df_id['size'].values,25)
#iqr_p = np.percentile(df_id['price'].values,75) - np.percentile(df_id['price'].values,25)
def iqpr(x):
    """Return the inter-quartile range (75th minus 25th percentile) of `x`."""
    lower_quartile, upper_quartile = np.percentile(x, [25, 75])
    return upper_quartile - lower_quartile

#energy = np.mean(df_id['price'].values**2)
#energy_v = np.sum(df_id['size'].values**2)
def mean_square(x):
    """Return the mean of the squared entries of `x`."""
    squared = x**2
    return np.mean(squared)

### Weighted averages

In [None]:
# weights - exponential decay
def calcExpWeights(t, halflife = 200):
    """Return exponentially decaying weights for the times `t`.

    The age of each sample is measured backwards from t0 = 600, the end of the
    time box (start of the following one). A sample at t = 600 gets weight 1
    and the weight halves for every `halflife` units of age.
    """
    age = 600 - np.array(t)
    # convert the half-life into the e-folding time of the decay
    decay_time = halflife / np.log(2)
    return np.exp(-age / decay_time)

def weighted_average(pd_series,
                     data_col, w_col,
                     new_var_name,
                     exp_w=False,
                     sqrt=False):
    """Return the weighted mean of one column as a one-element Series.

    pd_series: frame (or mapping) holding both columns.
    data_col: name of the column to average.
    w_col: name of the column providing the weights.
    new_var_name: label of the single entry in the returned Series.
    exp_w: when true, `w_col` is passed through calcExpWeights first.
    sqrt: when true, the square root of the weighted mean is returned.
    """
    values = pd_series[data_col]
    weights = calcExpWeights(pd_series[w_col]) if exp_w else pd_series[w_col]

    result = np.sum(values * weights) / np.sum(weights)
    if sqrt:
        result = result**0.5

    return pd.Series({new_var_name: result})


# calcExpWeights(np.arange(10), halflife=5)

### Gradient

In [None]:
from scipy import stats

# assume x-data is seconds_in_bucket column
def calcGradientR2(pd_series,
                   y_col,
                   new_var_name = 'var1'):
    """Fit a straight line of `y_col` against 'seconds_in_bucket'.

    pd_series: frame holding a 'seconds_in_bucket' column and `y_col`.
    y_col: name of the column used as the dependent variable.
    new_var_name: prefix of the two labels in the result.

    Returns a Series with '<new_var_name>_gradient' (slope of the fit) and
    '<new_var_name>_R2' (squared correlation coefficient). The result is always
    a Series with these two labels, so that results of different groups can be
    stacked into one table. When no line can be fitted (fewer than two points,
    or all x values identical) both entries are NaN.
    """
    gradient_key = f"{new_var_name}_gradient"
    r2_key = f"{new_var_name}_R2"

    x = np.asarray(pd_series['seconds_in_bucket'], dtype=float)
    y = np.asarray(pd_series[y_col], dtype=float)

    # A regression needs at least two points at distinct x positions;
    # linregress is not called on degenerate input.
    if x.size < 2 or x.min() == x.max():
        return pd.Series({gradient_key: np.nan, r2_key: np.nan})

    fit = stats.linregress(x, y)

    return pd.Series({gradient_key: fit.slope, r2_key: fit.rvalue**2})

### Fourier amplitudes

In [None]:
from scipy.interpolate import interp1d
from scipy import stats
from scipy.fft import rfft, irfft, rfftfreq

def getft_amplitudes(pd_series, y_col, component_list=None, aggf=(np.mean,)):
    """Return Fourier amplitudes of the detrended, normalised `y_col` signal.

    pd_series: frame holding 'seconds_in_bucket' and `y_col` for one time box.
    y_col: name of the column to analyse.
    component_list: indices of the Fourier components to report individually;
        None reports every component, an empty list reports none.
    aggf: functions applied to the full amplitude array; each result is stored
        under '<y_col>_fft_<function name>'.

    Returns a Series with one '<y_col>_fft<i>' entry per requested component
    followed by one entry per function in `aggf`. A signal that is flat after
    detrending has no oscillating part and yields amplitudes of 0.
    """
    x = pd_series['seconds_in_bucket'].values
    y = pd_series[y_col].values

    # Extend the series so that it always covers the whole box 0-599s: add a
    # point at the start/end carrying the value of the closest known point.
    # This makes the Fourier coefficients comparable across all time series.
    if x.min() != 0:
        x = np.concatenate(([0], x), axis=0)
        y = np.concatenate(([y[0]], y), axis=0)

    if x.max() != 599:
        x = np.concatenate((x, [599]), axis=0)
        y = np.concatenate((y, [y[-1]]), axis=0)

    # Interpolate linearly onto a regular grid with two samples per second.
    duration = int(x.max())
    N = duration * 2 + 1  # total number of samples
    x_ip = np.linspace(0, duration, num=N, endpoint=True)
    y_ip = interp1d(x, y, kind="linear")(x_ip)

    # Remove the linear trend (intercept and gradient).
    fit = stats.linregress(x_ip, y_ip)
    y_detrend = y_ip - (x_ip * fit.slope + fit.intercept)

    # Normalise the signal to a peak-to-peak range of 1. A signal that is flat
    # up to rounding noise (relative to its own magnitude) is set to zero
    # instead of being divided by a (near-)zero range, which would produce
    # NaN amplitudes or blow rounding noise up to full scale.
    span = y_detrend.max() - y_detrend.min()
    flat_tolerance = 1e-9 * np.max(np.abs(y_ip))
    if span <= flat_tolerance:
        y_detrend = np.zeros_like(y_detrend)
    else:
        y_detrend = y_detrend / span

    yf = rfft(y_detrend)

    # https://www.mathworks.com/matlabcentral/answers/162846-amplitude-of-signal-after-fft-operation
    amp = np.abs(yf) / N

    if component_list is None:
        component_list = range(len(yf))

    res = {f'{y_col}_fft{i}': amp[i] for i in component_list}

    for my_f in aggf:
        res[f'{y_col}_fft_{my_f.__name__}'] = my_f(amp)

    return pd.Series(res)
    

#ex0 = pd.read_parquet(f"{data_path}/book_train.parquet/stock_id=11")
#ex0_0 = ex0[ex0['time_id']==5]
#res = (ex0.groupby('time_id')).apply(getft_amplitudes, 'bid_price1', [], [np.mean, np.max, np.argmax])
#res

## Pandas helper functions

This function converts a pandas MultiIndex (created when applying several functions to columns with `agg`) into a flat list of column names.

In [None]:
def flattenIndices(columns, suffix=None):
    """Return flat string names for the entries of `columns`.

    A (variable, function) tuple becomes '<variable>_<function>'; any other
    entry is converted to its string form. When `suffix` is given,
    '_<suffix>' is appended to every name.
    """
    def _flat_name(entry):
        if type(entry) is tuple:
            varname, fname = entry
            name = f"{varname}_{fname}"
        else:
            name = f"{entry}"
        return name if suffix is None else f"{name}_{suffix}"

    return [_flat_name(entry) for entry in np.ravel(columns)]

This function replaces the NaN values in each column listed in `col_list` with the mean of that column.

Means are calculated for the different stock-ids (averaged over time-id's).

In [None]:
def replaceNaWithMean(df, col_list):
    """Fill missing values with the per-stock mean of the same column.

    df: frame with a 'stock_id' column; it is modified in place.
    col_list: names of the columns whose missing values are replaced.

    Prints how many cells of `df` are missing, then, for every column in
    `col_list`, replaces missing values by the mean of that column over the
    rows with the same stock_id. Values stay missing when a stock has no
    non-missing value in a column. Returns `df`.
    """
    num_na = int(df.isnull().sum().sum())
    num_total = num_na + int(df.count().sum())
    # an empty frame has no cells; report 0% instead of dividing by zero
    pct_na = 100 * num_na / num_total if num_total else 0.0

    print("Number of na's:", num_na, "of", num_total, ". Percentage (%) of na's=", pct_na)

    for col in col_list:
        # transform() returns the group means aligned with df's own index, so
        # the fill does not depend on how apply() labels its result.
        stock_means = df.groupby(['stock_id'], sort=False)[col].transform('mean')
        df[col] = df[col].fillna(stock_means)

    return df

## Aggregate values with same time id's

In [None]:
def aggregates_per_time_id(df_book_data, var_function_dict,
                           # seconds_in_bucket is between 0 and 599 (inclusive) in original data
                           box_time_windows=((0,599), (0,99), (100,199), (200,299), (300,399), (400,499), (500,599))):
    """Aggregate columns per time_id, separately for several time windows.

    df_book_data: frame with 'time_id' and 'seconds_in_bucket' columns plus
        the columns named in `var_function_dict`.
    var_function_dict: mapping column -> list of aggregation functions, as
        accepted by groupby(...).agg.
    box_time_windows: iterable of (min_s, max_s) pairs; rows with
        min_s <= seconds_in_bucket <= max_s belong to a window.

    Returns one row per time_id with a 'time_id' column and, for every window,
    the aggregated columns named '<column>_<function>_t<min_s>-<max_s>'.

    Note on NaN's: the windows are combined with an outer merge, so a time_id
    without any row inside a window keeps its row and gets NaN in that
    window's columns.
    """
    df_aggregates_time_id = pd.DataFrame()
    df_aggregates_time_id['time_id'] = pd.Series(dtype=int)

    seconds = df_book_data['seconds_in_bucket']

    for min_s, max_s in box_time_windows:
        in_window = (seconds >= min_s) & (seconds <= max_s)

        grouped = df_book_data[in_window].groupby(['time_id']).agg(var_function_dict)

        res = grouped.reset_index(drop=True)
        res.columns = flattenIndices(res.columns, suffix=f"t{min_s}-{max_s}")

        # The group index already holds the time_id of every row; using it
        # avoids a second groupby and keeps the key as an integer.
        res['time_id'] = grouped.index.values

        df_aggregates_time_id = pd.merge(df_aggregates_time_id, res, how='outer', on='time_id').reset_index(drop=True)

    return df_aggregates_time_id

#df_trade_data = pd.read_parquet(f"{data_path}/trade_train.parquet/stock_id=103")
#f_dict = {'size' : np.sum,
#         'order_count': np.sum}
#res = aggregates_per_time_id(df_trade_data, f_dict,  box_time_windows=[(0,299),(300,599)])

#idx = df_trade_data['time_id']==9664
#df_trade_data = df_trade_data[idx]



#idx = (df_trade_data['seconds_in_bucket'] >= 0) & (df_trade_data['seconds_in_bucket'] <= 300)
#df_trade_data[idx].groupby(['time_id']).agg(f_dict)
#res.tail()
#df_trade_data

## Define aggregates functions for book

In [None]:
def calcBookAggregatesForStock(si, file_path):
    """Compute per-time_id order-book features for one stock.

    si: stock id; stored in the 'stock_id' column and used in 'row_id'.
    file_path: parquet file/directory with the order-book data of the stock.

    Returns a DataFrame with one row per time_id. All feature columns carry
    the prefix 'book_'; in addition there are the columns 'time_id',
    'stock_id' (as string) and 'row_id' ('<si>-<time_id>').
    """
    bdata = pd.read_parquet(file_path)

    # calculate statistics that will be used for aggregation
    # and add them to the table
    wap_functions = (calc_wap1, calc_wap2, calc_wap3, calc_wap4)
    for level, wap_function in enumerate(wap_functions, start=1):
        bdata[f'wap{level}'] = wap_function(bdata)

    # Calculate price/bid spread
    bdata['price_spread1'] = (bdata['ask_price1'] - bdata['bid_price1']) / ((bdata['ask_price1'] + bdata['bid_price1']) / 2)
    bdata['price_spread2'] = (bdata['ask_price2'] - bdata['bid_price2']) / ((bdata['ask_price2'] + bdata['bid_price2']) / 2)
    bdata['bid_spread'] = bdata['bid_price1'] - bdata['bid_price2']
    bdata['ask_spread'] = bdata['ask_price1'] - bdata['ask_price2']
    bdata["bid_ask_spread"] = abs(bdata['bid_spread'] - bdata['ask_spread'])

    # volume
    bdata['total_volume'] = (bdata['ask_size1'] + bdata['ask_size2']) + (bdata['bid_size1'] + bdata['bid_size2'])
    bdata['volume_imbalance'] = abs((bdata['ask_size1'] + bdata['ask_size2']) - (bdata['bid_size1'] + bdata['bid_size2']))
    bdata['volume1'] = bdata['bid_size1'] + bdata['ask_size1']
    bdata['volume2'] = bdata['bid_size2'] + bdata['ask_size2']

    # Log returns must be computed within each time_id, never across them.
    # transform() returns the values aligned with bdata's own index, so the
    # assignment is independent of how apply() labels its result.
    for level in range(1, 5):
        log_ret = bdata.groupby(['time_id'])[f'wap{level}'].transform(log_return)
        bdata[f'log_return{level}'] = log_ret
        bdata[f'sqlog_return{level}'] = log_ret * log_ret

    # wap balance
    bdata['wap_balance'] = abs(bdata['wap1'] - bdata['wap2'])

    # defines which aggregates to calculate
    aggf_dict = { # [function_argument, [functions]]
        'wap1': [np.sum, np.std],
        'wap2': [np.sum, np.std],
        'wap3': [np.sum, np.std],
        'wap4': [np.sum, np.std],
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
        'sqlog_return1': [np.min, np.max],
        'sqlog_return2': [np.min, np.max],
        'sqlog_return3': [np.min, np.max],
        'sqlog_return4': [np.min, np.max],
        'wap_balance': [np.sum, np.max],
        'price_spread1':[np.sum, np.max],
        'price_spread2':[np.sum, np.max],
        'bid_spread':[np.sum, np.max],
        'ask_spread':[np.sum, np.max],
        'total_volume':[np.sum, np.max],
        'volume_imbalance':[np.sum, np.max],
        "bid_ask_spread":[np.sum,  np.max] }

    # functions to calculate for the different time windows
    aggf_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility]
    }

    # aggregates over whole time window
    res1 = aggregates_per_time_id(bdata, aggf_dict,
                                  box_time_windows=[(0,599)])

    # aggregates over smaller time windows
    res2 = aggregates_per_time_id(bdata, aggf_dict_time,
                                  box_time_windows=[(0,99), (100,199), (200,299), (300,399), (400,499), (500,599)])

    resAll = pd.merge(res1,res2, how='left', on=['time_id'])

    #
    # aggregates requiring more than one argument
    #
    per_time_id = bdata.groupby('time_id')

    def add_group_features(res, func, *args, **kwargs):
        # apply func to every time_id group and append its columns to res
        features = per_time_id.apply(func, *args, **kwargs)
        return pd.merge(res, features, on=["time_id"], how="left")

    # Gradient of the squared log-returns over time.
    # Note: realized volatility = root of the sum of squared log-returns.
    for level in range(1, 5):
        col_name = f'sqlog_return{level}'
        resAll = add_group_features(resAll, calcGradientR2, col_name, col_name)

    # fft of the original features, of the squared log-returns and of the wap
    fft_columns = ['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',
                   'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
                   'sqlog_return1', 'sqlog_return2',
                   'wap1', 'wap2']
    for col_name in fft_columns:
        resAll = add_group_features(resAll, getft_amplitudes, col_name, [1,2,3], [np.mean, np.max, np.argmax])

    # weighted averages: (data column, weight column, feature name, exp_w, sqrt)
    weighted_features = [
        # volume-weighted wap1/2
        ('wap1', 'volume1', 'vw_wap1', False, False),
        ('wap2', 'volume2', 'vw_wap2', False, False),
        # sqrt of volume-weighted sqlog-return 1/2
        ('sqlog_return1', 'volume1', 'rvw_sqlog_return1', False, True),
        ('sqlog_return2', 'volume2', 'rvw_sqlog_return2', False, True),
        # sqrt of time-weighted sqlog-return 1/2
        ('sqlog_return1', 'seconds_in_bucket', 'rtw_sqlog_return1', True, True),
        ('sqlog_return2', 'seconds_in_bucket', 'rtw_sqlog_return2', True, True),
        # time-weighted total_volume
        ('total_volume', 'seconds_in_bucket', 'tw_total_volume', True, False),
        # time-weighted volume_imbalance
        ('volume_imbalance', 'seconds_in_bucket', 'tw_volume_imbalance', True, False),
        # time-weighted price spread 1/2
        ('price_spread1', 'seconds_in_bucket', 'tw_price_spread1', True, False),
        ('price_spread2', 'seconds_in_bucket', 'tw_price_spread2', True, False),
    ]
    for data_col, w_col, feature_name, use_exp_w, take_sqrt in weighted_features:
        resAll = add_group_features(resAll, weighted_average, data_col, w_col, feature_name,
                                    exp_w=use_exp_w, sqrt=take_sqrt)

    # add book prefix to all columns except time_id
    resAll.columns = [(f"book_{e}" if e != "time_id" else e) for e in resAll.columns]

    resAll['stock_id'] = f"{si}"
    # format time_id without decimals, whether it is stored as int or float
    resAll['row_id'] = [f"{si}-{t_id:.0f}" for t_id in resAll['time_id']]

    return pd.DataFrame(resAll)

#stats1 = calcBookAggregatesForStock(0, f"{data_path}/book_train.parquet/stock_id=0")
#stats1.head()

## Define aggregates functions for trade

In [None]:
def calcTradeAggregatesForStock(si, file_path):
    """Compute per-time_id trade features for one stock.

    si: stock id; stored in the 'stock_id' column and used in 'row_id'.
    file_path: parquet file/directory with the trade data of the stock.

    Returns a DataFrame with one row per time_id. All feature columns carry
    the prefix 'trade_'; in addition there are the columns 'time_id',
    'stock_id' (as string) and 'row_id' ('<si>-<time_id>').
    """
    bdata = pd.read_parquet(file_path)

    # Log returns must be computed within each time_id, never across them.
    # transform() returns the values aligned with bdata's own index, so the
    # assignment is independent of how apply() labels its result.
    bdata['log_return'] = bdata.groupby(['time_id'])['price'].transform(log_return)
    bdata['amount'] = bdata['price']*bdata['size']
    bdata['sqlog_return'] = bdata['log_return']*bdata['log_return']

    # Dict for aggregations
    aggf_dict = {
        'log_return':[realized_volatility],
        'sqlog_return': [np.sum, np.max],
        'seconds_in_bucket':[n_unique],
        'size':[np.sum, np.max, med_abs_diff, iqpr],
        'order_count':[np.sum,np.max],
        'amount':[np.sum,np.max],
        'price':[np.sum, n_above_mean, n_below_mean, n_pos_diff, n_neg_diff, med_abs_diff, iqpr]
    }

    aggf_dict_time = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[n_unique],
        'size':[np.sum],
        'order_count':[np.sum],
    }

    per_time_id = bdata.groupby('time_id')

    def add_group_features(res, func, *args, **kwargs):
        # apply func to every time_id group and append its columns to res
        features = per_time_id.apply(func, *args, **kwargs)
        return pd.merge(res, features, on=["time_id"], how="left")

    # aggregates over whole time window
    res1 = aggregates_per_time_id(bdata, aggf_dict,
                                  box_time_windows=[(0,599)])

    # aggregates over smaller time windows
    res2 = aggregates_per_time_id(bdata, aggf_dict_time,
                                  box_time_windows=[(0,99), (100,199), (200,299), (300,399), (400,499), (500,599)])

    # regress on square log-return
    res2 = add_group_features(res2, calcGradientR2, 'sqlog_return', 'sqlog_return')

    resAll = pd.merge(res1,res2, how='left', on=['time_id'])

    #
    # functions requiring more than one argument
    #

    # fft of the original features, of the squared log-return and of amount;
    # the empty component list means only the aggregates over all amplitudes
    # are kept
    fft_columns = ['price', 'size', 'order_count', 'sqlog_return', 'amount']
    for col_name in fft_columns:
        resAll = add_group_features(resAll, getft_amplitudes, col_name, [], [np.mean, np.max, np.argmax])

    # weighted averages: (data column, weight column, feature name, exp_w, sqrt)
    weighted_features = [
        # volume-weighted average price
        ('price', 'size', 'vw_price', False, False),
        # sqrt of volume-weighted sqlog-return
        ('sqlog_return', 'size', 'rvw_sqlogreturn', False, True),
        # sqrt of time-weighted sqlog-return
        ('sqlog_return', 'seconds_in_bucket', 'rtw_sqlogreturn', True, True),
        # time-weighted size
        ('size', 'seconds_in_bucket', 'tw_size', True, False),
        # time-weighted order_count
        ('order_count', 'seconds_in_bucket', 'tw_order_count', True, False),
    ]
    for data_col, w_col, feature_name, use_exp_w, take_sqrt in weighted_features:
        resAll = add_group_features(resAll, weighted_average, data_col, w_col, feature_name,
                                    exp_w=use_exp_w, sqrt=take_sqrt)

    # add trade prefix to all columns except time_id
    resAll.columns = [(f"trade_{e}" if e != "time_id" else e) for e in resAll.columns]

    resAll['stock_id'] = f"{si}"
    # format time_id without decimals, whether it is stored as int or float
    resAll['row_id'] = [f"{si}-{t_id:.0f}" for t_id in resAll['time_id']]

    return pd.DataFrame(resAll)


#stats2 = calcTradeAggregatesForStock(0, f"{data_path}/trade_train.parquet/stock_id=0")
#stats2.head()

## Aggregate group statistics over time and stock id's

In [None]:
def calcTimeStockAgg(df_data):
    """Append per-stock and per-time mean/std of selected feature columns.

    The selected columns are those whose name contains one of the patterns
    listed below. For each of them the mean and standard deviation over all
    rows with the same stock_id, and over all rows with the same time_id, are
    added as '<column>_<mean|std>_for_stock_id' and
    '<column>_<mean|std>_for_time_id'. Returns a new frame.
    """
    patterns = ("volatility_t0-599", "volatility1", "wap1_sum_t0-599",
                'volume1_sum_t0-599', 'price_spread1_sum_t0-599')
    vol_cols = [c for c in df_data.columns if any(p in c for p in patterns)]

    def _group_stats(key):
        # mean/std per group, flattened to '<column>_<stat>_for_<key>' names;
        # the group key itself ends up as '<key>__for_<key>'
        table = df_data.groupby([key])[vol_cols].agg(['mean', 'std']).reset_index()
        table.columns = ['_'.join(col) for col in table.columns]
        return table.add_suffix(f'_for_{key}')

    per_stock = _group_stats('stock_id')
    per_time = _group_stats('time_id')

    # attach both tables to the original rows, then remove the join keys
    merged = pd.merge(df_data, per_stock, how='left',
                      left_on=['stock_id'], right_on=['stock_id__for_stock_id'])
    merged = pd.merge(merged, per_time, how='left',
                      left_on=['time_id'], right_on=['time_id__for_time_id'])

    return merged.drop(['stock_id__for_stock_id', 'time_id__for_time_id'], axis=1)

## Collect, and parallelize all computations for the pre-processing

In [None]:
from joblib import Parallel, delayed

def calcAggregatesParallell(stock_idx, is_train=True, is_book=True, n_jobs=-1):
    """Compute the aggregates of several stocks in parallel and stack them.

    stock_idx: iterable of stock ids to process.
    is_train: selects the 'train' (True) or 'test' (False) parquet files.
    is_book: selects the book (True) or trade (False) data and the matching
        aggregation function.
    n_jobs: passed on to joblib.Parallel.

    Returns the concatenated per-stock results with integer 'time_id' and
    'stock_id' columns and a fresh row index. Reads the module-level
    `data_path`.
    """
    if is_book:
        datalib, calcAggF = "book", calcBookAggregatesForStock
    else:
        datalib, calcAggF = "trade", calcTradeAggregatesForStock
    dataset = "train" if is_train else "test"

    # one delayed call per stock
    to_do = [
        delayed(calcAggF)(si, f"{data_path}/{datalib}_{dataset}.parquet/stock_id={si}")
        for si in stock_idx
    ]
    p_res = Parallel(n_jobs=n_jobs, verbose=3)(to_do)

    df_boxagg = pd.concat(p_res, axis=0)
    for id_col in ('time_id', 'stock_id'):
        df_boxagg[id_col] = df_boxagg[id_col].astype(int)

    return df_boxagg.reset_index(drop=True)


# assumes data_path is set before
def calcAllAggregatesParallell(train_stock_idx, test_stock_idx):
    """Build the full feature tables for the training and the test set.

    Book and trade aggregates are computed per stock, joined on stock_id,
    time_id and row_id, and extended with the statistics over stock_id's and
    time_id's. The target values from train.csv are joined onto the training
    table. Assumes the module-level `data_path` is set.

    Returns the tuple (training table, test table).
    """
    join_keys = ['stock_id', 'time_id', 'row_id']

    def _with_int_ids(frame):
        # same datatype for the id columns so that the joins work
        frame['time_id'] = frame['time_id'].astype(int)
        frame['stock_id'] = frame['stock_id'].astype(int)
        frame.reset_index(inplace=True, drop=True)
        return frame

    print("Calculating stats for book training data...")
    book_train = calcAggregatesParallell(train_stock_idx, is_train=True, is_book=True)

    print("Calculating stats for trade training data...")
    trade_train = calcAggregatesParallell(train_stock_idx, is_train=True, is_book=False)
    bt_agg_train = book_train.merge(trade_train, on=join_keys, how='left')

    print("Calculating stats for book test data...")
    book_test = calcAggregatesParallell(test_stock_idx, is_train=False, is_book=True)

    print("Calculating stats for trade test data...")
    trade_test = calcAggregatesParallell(test_stock_idx, is_train=False, is_book=False)
    bt_agg_test = book_test.merge(trade_test, on=join_keys, how='left')

    print("Calculating stats over stock_id's/time_id's... (not parallelized)")
    bt_agg_train = _with_int_ids(calcTimeStockAgg(bt_agg_train))
    bt_agg_test = _with_int_ids(calcTimeStockAgg(bt_agg_test))

    print("Loading target value from training data and adding to full table... (not parallelized)")
    # add target values from training data
    # (note: no target value in test to add)
    train = _with_int_ids(pd.read_csv(data_path + 'train.csv'))

    bt_agg_train = bt_agg_train.merge(train, on=['stock_id', 'time_id'], how='left')

    print(bt_agg_train.head())

    return (bt_agg_train, bt_agg_test)

## Calculate root-mean-square-percentage error

$rmspe = \sqrt{\frac{1}{n} \Sigma_{data} \left( \frac{y_{true} - y_{pred}}{y_{true}} \right)^2}$

$rmspe = \sqrt{ \frac{1}{n} \Sigma_{data} \left(1 - \frac{y_{pred}}{y_{true}} \right)^2}$

In [None]:
# root mean square percentage error
def rmspe(y_true, y_pred):
    """Return the root-mean-square percentage error of `y_pred`.

    The error of every prediction is taken relative to the true value.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    relative_error = (actual - predicted) / actual
    return np.sqrt(np.mean(np.square(relative_error)))

# Calculate statistics (pre-processing)
## Find all stock id's in data

In [None]:
def _stock_ids(parquet_path):
    # stock_id is read as a category; cat.categories holds the unique values
    return pd.read_parquet(parquet_path, columns = ['stock_id'])['stock_id'].cat.categories


def _time_ids(parquet_path):
    # time_id is read as an int; np.unique gives the unique values
    return np.unique(pd.read_parquet(parquet_path, columns = ['time_id'])['time_id'])


book_train_path = f"{data_path}/book_train.parquet"
book_test_path = f"{data_path}/book_test.parquet"

trade_train_path = f"{data_path}/trade_train.parquet"
trade_test_path = f"{data_path}/trade_test.parquet"

# find all stock id's
btrain_sidx, btest_sidx = _stock_ids(book_train_path), _stock_ids(book_test_path)
ttrain_sidx, ttest_sidx = _stock_ids(trade_train_path), _stock_ids(trade_test_path)

# find all time id's
btrain_tidx, btest_tidx = _time_ids(book_train_path), _time_ids(book_test_path)
ttrain_tidx, ttest_tidx = _time_ids(trade_train_path), _time_ids(trade_test_path)

In [None]:
# Book and trade data should cover the same stock id's and time id's.
# Each entry: (book index, trade index, warning if they differ, label of the count)
_index_checks = (
    (btrain_sidx, ttrain_sidx,
     "Warning, stock indices are different in trade and book data for training set.",
     "Number of stock indices in book/trade training set:"),
    (btest_sidx, ttest_sidx,
     "Warning, stock indices are different in trade and book data for test set.",
     "Number of stock indices in book/trade test set:"),
    (btrain_tidx, ttrain_tidx,
     "Warning, time indices are different in trade and book data for training set.",
     "Number of time indices in book/trade training set:"),
    (btest_tidx, ttest_tidx,
     "Warning, time indices are different in trade and book data for test set.",
     "Number of time indices in book/trade test set:"),
)

for _book_idx, _trade_idx, _warning, _count_label in _index_checks:
    if np.array_equal(_book_idx, _trade_idx):
        print(_count_label, len(_trade_idx))
    else:
        print(_warning)

In [None]:
# Report the share of system memory in use after collecting the stock/time id's.
# The value is taken from index 2 (the third field) of psutil.virtual_memory().
print('RAM memory % used:', psutil.virtual_memory()[2])

### Load example data

## Calculate statistics for book/trade, training/test set

In [None]:
# Start time of the pre-processing (p_f is time.perf_counter).
t0 = p_f()

# Compute all features; the slice btrain_sidx[:] selects every training stock.
(all_agg_train, all_agg_test) = calcAllAggregatesParallell(btrain_sidx[:], btest_sidx) # returns (train, test) data

# Elapsed time: seconds converted to hours.
print("Time for pre-processing of train/test, book/trade data (h): ", (p_f()-t0)/3600 ) # 

In [None]:
# Report the share of system memory in use after the pre-processing.
# The value is taken from index 2 (the third field) of psutil.virtual_memory().
print('RAM memory % used:', psutil.virtual_memory()[2])

In [None]:
# Preview the first rows of the aggregated training table.
all_agg_train.head()

## Clustering

In [None]:
from sklearn.cluster import KMeans

def calcCorrClusterDict(df, n_clusters=6):
    """Cluster stocks by how their targets correlate with every other stock.

    Parameters
    ----------
    df : DataFrame with 'time_id', 'stock_id' and 'target' columns.
    n_clusters : number of k-means clusters.

    Returns
    -------
    dict mapping each stock id to its k-means cluster label.
    """
    # One row per time_id, one column per stock_id, cell = target.
    target_by_time = df[['time_id', 'stock_id', 'target']].pivot(
        index='time_id', columns='stock_id', values='target')

    # Stock-by-stock correlation matrix; each row is the feature vector of one stock.
    stock_corr = target_by_time.corr()

    model = KMeans(n_clusters=n_clusters, random_state=111, max_iter=500)
    model.fit(stock_corr.values)
    labels = model.predict(stock_corr.values)

    # Rows of the correlation matrix are labelled by stock id, in label order.
    return dict(zip(stock_corr.index, labels))


def calcMargClusterDict(df, n_clusters=6):
    """Cluster stocks on their per-stock ("for_stock_id") aggregate features.

    Parameters
    ----------
    df : DataFrame with a 'stock_id' column and columns whose name contains
        "for_stock_id" (values aggregated over all time ids of a stock).
    n_clusters : number of k-means clusters.

    Returns
    -------
    dict mapping each stock id to its k-means cluster label.
    """
    selected = [name for name in df.columns if "for_stock_id" in name]
    selected.append('stock_id')

    # Rows sharing a stock id carry the same values in these columns, so the
    # group mean simply collapses them to one row per stock.
    per_stock = df[selected].groupby(['stock_id']).mean()

    model = KMeans(n_clusters=n_clusters, random_state=111, max_iter=300)
    model.fit(per_stock.values)
    labels = model.predict(per_stock.values)

    return dict(zip(per_stock.index, labels))

In [None]:
# Add features describing which k-means cluster each stock belongs to.
# Both clusterings are derived from the training data only and then applied
# to the training and the test frame alike.
c_cluster_dict = calcCorrClusterDict(all_agg_train)
m_cluster_dict = calcMargClusterDict(all_agg_train)

for _frame in (all_agg_train, all_agg_test):
    _stock_ids = _frame['stock_id']
    _frame['stock_corr_cluster_id'] = _stock_ids.map(c_cluster_dict)
    _frame['stock_marg_cluster_id'] = _stock_ids.map(m_cluster_dict)

In [None]:
#!pip install umap-learn

In [None]:
import umap

def calcUMAPCoordsDict(df, n_components = 2, n_clusters=5, plot=True):
    """Embed stocks with UMAP and cluster the embedding with k-means.

    Parameters
    ----------
    df : DataFrame with a 'stock_id' column and columns whose name contains
        "for_stock_id" (values aggregated over all time ids of a stock).
    n_components : dimensionality of the UMAP embedding.
    n_clusters : number of k-means clusters fitted on the embedding.
    plot : when true, scatter the first two embedding axes coloured by cluster.

    Returns
    -------
    dict of dicts: keys 0 .. n_components-1 map stock id -> embedding
    coordinate along that axis; key n_components maps stock id -> cluster label.
    """
    selected = [name for name in df.columns if "for_stock_id" in name]
    selected.append('stock_id')

    # Rows sharing a stock id carry the same values in these columns, so the
    # group mean simply collapses them to one row per stock.
    per_stock = df[selected].groupby(['stock_id']).mean()
    stock_ids = per_stock.index

    reducer = umap.UMAP(n_components=n_components, n_neighbors=4, min_dist=0.05, random_state=111)
    embedding = reducer.fit_transform(per_stock)

    model = KMeans(n_clusters=n_clusters, random_state=111, max_iter=500)
    model.fit(embedding)
    cluster_labels = model.predict(embedding)

    if plot:
        axes = plt.figure().add_subplot(111)
        axes.scatter(embedding[:, 0], embedding[:, 1], c=cluster_labels)

    # One lookup table per embedding axis ...
    si_embedding_dict = {
        axis: dict(zip(stock_ids, embedding[:, axis]))
        for axis in range(n_components)
    }
    # ... followed by the cluster labels under the next free integer key.
    si_embedding_dict[n_components] = dict(zip(stock_ids, cluster_labels))

    return si_embedding_dict

In [None]:
# Fit the UMAP embedding on the training data, then attach the two embedding
# coordinates (keys 0 and 1) and the cluster label (key 2) to both frames.
si_embedding_dict = calcUMAPCoordsDict(all_agg_train, plot=True)

for _frame in (all_agg_train, all_agg_test):
    for _key, _column in ((0, 'stock_umap_coord0'),
                          (1, 'stock_umap_coord1'),
                          (2, 'stock_umap_cluster')):
        _frame[_column] = _frame['stock_id'].map(si_embedding_dict[_key])

In [None]:
# Report the share of virtual memory in use ('percent' is the 3rd field of the result).
print('RAM memory % used:', psutil.virtual_memory().percent)

## Save result from preprocessing
Loading is commented out for now; saved files will be deleted when the notebook is reset on Kaggle.

In [None]:
import pickle

# Persist the (train, test) aggregate frames as one pickled tuple.
with open('aggf.data', 'wb') as out_file:
    pickle.dump((all_agg_train, all_agg_test), out_file)

# To restore a previous run instead of recomputing:
# with open('aggf.data', 'rb') as in_file:
#     (all_agg_train, all_agg_test) = pickle.load(in_file)

In [None]:
# Preview the first rows of the training data, now including the cluster/UMAP columns.
all_agg_train.head()

In [None]:
# Preview the first rows of the test data, now including the cluster/UMAP columns.
all_agg_test.head()

### Handle missing values

Assume zero is a reasonable value for all NAs. NA here (always? check this) means no data/trades. Then order size can be set to 0 since there are no orders, sum of price = 0 since there are no trades, standard deviation of price = 0 since there are no data points/deviations...

Note:

LightGBM is able to accept data with NAs and will treat them as a separate value. This could be better than setting NAs to zero.

In [None]:
# Disabled alternative: replace NAs with the mean for variables such as mean
# price that do not depend on the number of orders. The column selection below
# is kept inside a bare string so that it is not executed.

"""
cols_mean = [e for e in all_agg_train.columns if not
             (("size" in e) or ("count" in e) or ("unique" in e) or ("volume") in e
             or ("n_above" in e) or ("n_below" in e) or ("n_pos" in e) or ("n_neg" in e))]
"""
# Note: this would set variables such as n_above_mean to their averages (over all
# time ids for the respective stocks) within their time frame, while setting
# size/order to the true value of zero.
#all_agg_train = replaceNaWithMean(all_agg_train, cols_mean)
#all_agg_test = replaceNaWithMean(all_agg_test, cols_mean)

# Active strategy: zero is treated as a reasonable value for (all?) NAs, i.e. order
# size is 0 since there are no orders, sum of price is 0 since there are no trades,
# standard deviation of price is 0 since there are no data points/deviations...
all_agg_train = all_agg_train.fillna(0)
all_agg_test = all_agg_test.fillna(0)

## Downcast variables to save memory

Does it increase % of available RAM?

In [None]:
# #!pip install pandas-downcast

# import pdcast as pdc
# import gc

# all_agg_train = pdc.downcast(all_agg_train)
# all_agg_test = pdc.downcast(all_agg_test)

# gc.collect()

In [None]:
# Report the share of virtual memory in use ('percent' is the 3rd field of the
# result), then print the column/dtype/memory summary of the training frame.
print('RAM memory % used:', psutil.virtual_memory().percent)
all_agg_train.info()

# Find collinearities and remove highly correlated variables

Removed to reduce computation time/complexity of code.

Removing correlated variables seemed to result in worse results.

## Function definitions
### Calculate VIF
Regresses one variable against the others. Variance inflation factor (VIF) is

$VIF = 1/(1-r^2)$

where $r^2$ is R-squared value from linear regression.

### Function for removing features that are highly correlated
Remove variables with high Pearson-coefficients (absolute value).

Approach: Find the pair of variables with highest absolute correlation, of these two, remove the one with highest mean absolute correlation to other variables.

### Extract independent/dependent variables for calculating correlations

In [None]:
# Report the share of virtual memory in use ('percent' is the 3rd field of the result).
print('RAM memory % used:', psutil.virtual_memory().percent)

## Remove variables with high internal correlations

# New train/valid split, keep selected columns

## train_valid_groupsplit


In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold, train_test_split

def train_valid_groupsplit(X_data, y_data, group_idx, random_state=111, ratio=4):
    """Split features/targets into a train and a validation part, keeping groups intact.

    Parameters
    ----------
    X_data, y_data : features and targets, indexable with ``.iloc``.
    group_idx : group label of every row; a group never straddles the split.
    random_state : seed for the row shuffle applied before splitting.
    ratio : integer num_train / num_valid ratio; ``ratio + 1`` group folds are
        built and the first one supplies the split.

    Returns
    -------
    tuple ``(x_train, x_valid, y_train, y_valid)`` taken from the shuffled data.
    """
    splitter = GroupKFold(n_splits=ratio + 1)

    # GroupKFold itself has no randomness, so the rows are shuffled up front.
    X_perm, y_perm, groups_perm = shuffle(X_data, y_data, group_idx, random_state=random_state)

    # Only the first fold is needed.
    train_pos, valid_pos = next(iter(splitter.split(X_perm, groups=groups_perm)))

    return (X_perm.iloc[train_pos], X_perm.iloc[valid_pos],
            y_perm.iloc[train_pos], y_perm.iloc[valid_pos])

#(x_tt, x_tv, y_tt, y_tv) = train_valid_groupsplit(all_agg_train.drop(["time_id", 'target', 'row_id'], axis=1), all_agg_train['target'], all_agg_train['time_id'])

### Extract features, groupid's (time indices), and create train/valid set for model training

In [None]:
from sklearn.model_selection import train_test_split

# Separate the model inputs from the target. 'time_id' stays in the frames for
# now because it is used as the group label for cross-validation later on.
y_data = all_agg_train['target']
X_data = all_agg_train.drop(columns=['target', 'row_id'])
X_test = all_agg_test.drop(columns=['row_id'])

# Optional single train/valid split (disabled):
# x_tt, x_tv, y_tt, y_tv = train_valid_groupsplit(X_data, y_data, X_data['time_id'])
# time_idxs_tt = x_tt['time_id']
# time_idxs_tv = x_tv['time_id']

# x_tt = x_tt.drop(['time_id'], axis=1)
# x_tv = x_tv.drop(['time_id'], axis=1)

# Columns the boosting models should treat as categorical.
categorical_features = [
    'stock_umap_cluster',
    'stock_corr_cluster_id',
    'stock_marg_cluster_id',
    'stock_id',
]

# Train models
## Function definitions
### Parameter sets

In [None]:
# LightGBM parameter set 0: hand-picked baseline with shallow trees, run on CPU.
# NOTE(review): no 'bagging_freq' is given, so 'bagging_fraction' may have no
# effect - confirm against the LightGBM parameter documentation.
lgb_paras0 = {
    # task / runtime
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'device': "cpu",
    'n_jobs': -1,
    'verbose': -1,
    # boosting schedule
    'num_iterations': 2000,
    'learning_rate': 0.05,
    # tree shape
    'max_depth': 3,
    'num_leaves': 40,
    'min_data_in_leaf': 70,
    # row / column sampling
    'feature_fraction': 0.6,
    'bagging_fraction': 0.3,
}

# LightGBM parameter set 1.
# NOTE(review): no 'bagging_freq' is given, so 'bagging_fraction' may have no
# effect - confirm against the LightGBM parameter documentation.
lgb_paras1 = {
    # task / runtime
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'verbose': -1,
    # boosting schedule
    'num_iterations': 2000,
    'learning_rate': 0.03146706,
    # tree shape
    'max_depth': 4,
    'num_leaves': 43,
    'min_data_in_leaf': 78,
    # row / column sampling
    'feature_fraction': 0.66163467,
    'bagging_fraction': 0.21498006,
}

# LightGBM parameter set 2.
# The three sampled values below were originally wrapped in one-element lists;
# they are plain scalars here, which is the form LightGBM documents for these
# parameters. The numeric values are unchanged.
# NOTE(review): no 'bagging_freq' is given, so 'bagging_fraction' may have no
# effect - confirm against the LightGBM parameter documentation.
lgb_paras2 = {
    'num_iterations': 2000,
    'bagging_fraction': 0.12037843,
    'boosting_type': 'gbdt',
    'feature_fraction': 0.59048959,
    'learning_rate': 0.05567519,
    'max_depth': 5,
    'min_data_in_leaf': 79,
    'n_jobs': -1,
    'num_leaves': 35,
    'objective': 'rmse',
    'verbose': -1,
}


# LightGBM parameter set 3: row subsampling every 4th iteration plus L1/L2
# regularisation and large leaves.
lgb_paras3 = {
    # task / runtime
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'verbose': -1,
    # boosting schedule
    'num_iterations': 2000,
    'learning_rate': 0.05,
    # tree shape
    'max_depth': 5,
    'num_leaves': 45,
    'max_bin': 100,
    'min_data_in_leaf': 500,
    # row / column sampling
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.6,
    # regularisation
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
}


# LightGBM parameter set 4: like set 3 but shallower, with a smaller learning
# rate and much stronger L1/L2 regularisation.
lgb_paras4 = {
    # task / runtime
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'verbose': -1,
    # boosting schedule
    'num_iterations': 2000,
    'learning_rate': 0.01,
    # tree shape
    'max_depth': 3,
    'num_leaves': 45,
    'max_bin': 100,
    'min_data_in_leaf': 500,
    # row / column sampling
    'subsample': 0.8,
    'subsample_freq': 3,
    'feature_fraction': 0.6,
    # regularisation
    'lambda_l1': 5.0,
    'lambda_l2': 20.0,
}

### LightGBM ensemble

In [None]:
from lightgbm import LGBMRegressor

import warnings
# Silence every warning for the rest of the session (keeps training logs short,
# but also hides deprecation notices from the libraries in use).
warnings.filterwarnings('ignore')

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Parameters
    ----------
    y_pred : predictions for the rows of ``lgb_train``.
    lgb_train : dataset object whose ``get_label()`` returns the true targets.

    Returns
    -------
    tuple ``('RMSPE', score, False)``; the trailing flag presumably follows
    LightGBM's (name, value, is_higher_better) convention - lower is better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

class lgbEnsemble():
    """Group-k-fold ensemble of LightGBM boosters.

    ``fit`` trains one booster per fold and records out-of-fold (OOF)
    predictions; ``predict`` averages the predictions of all fold boosters.

    Attributes set by ``fit``
    -------------------------
    estimators_ : list with one trained booster per fold.
    oof_predictions_ : array of OOF predictions, one per input row, in the
        SAME row order as the ``X_data`` passed to ``fit``.
    oof_rmspe_ : RMSPE of the OOF predictions.
    """

    def __init__(self, n_folds = 5, lgb_paras=None):
        """Store the fold count and the LightGBM parameter dict (default: empty)."""
        self.n_folds_ = n_folds
        # A fresh dict per instance instead of one shared mutable default.
        self.lgb_paras_ = {} if lgb_paras is None else lgb_paras

    def fit(self, X_data, y_data, group_idx, n_early_stopping=50, verbose_freq=50, categorical_features=None):
        """Train one booster per group fold.

        Parameters
        ----------
        X_data, y_data : features and targets, indexable with ``.iloc``.
        group_idx : group label per row; a group never straddles a fold.
        n_early_stopping : early-stopping patience passed to ``lgb.train``.
        verbose_freq : logging period passed to ``lgb.train``.
        categorical_features : columns to treat as categorical (default: none).

        Returns
        -------
        self
        """
        if categorical_features is None:
            categorical_features = []

        n_rows = X_data.shape[0]
        kfold = GroupKFold(n_splits = self.n_folds_)
        self.oof_predictions_ = np.zeros(n_rows)
        self.estimators_ = []
        # Not used within this class; kept because it was part of the fitted state.
        self.weights = np.zeros(self.n_folds_)

        # Shuffle the rows and, with the very same permutation, an index array:
        # order[j] is the original position of shuffled row j. This lets the OOF
        # predictions be written back in the caller's row order.
        X_data, y_data, group_idx, order = shuffle(
            X_data, y_data, group_idx, np.arange(n_rows), random_state=111)

        for t_idx, v_idx in kfold.split(X_data, groups=group_idx):
            x_tt, x_tv = X_data.iloc[t_idx], X_data.iloc[v_idx]
            y_tt, y_tv = y_data.iloc[t_idx], y_data.iloc[v_idx]

            # Weighting squared errors by 1/y^2 turns the rmse objective into rmspe.
            train_weights = 1 / np.square(y_tt)
            val_weights = 1 / np.square(y_tv)

            train_dataset = lgb.Dataset(x_tt, y_tt, weight = train_weights, categorical_feature = categorical_features)
            val_dataset = lgb.Dataset(x_tv, y_tv, weight = val_weights, categorical_feature = categorical_features)
            model = lgb.train(params = self.lgb_paras_,
                              train_set = train_dataset,
                              valid_sets = [train_dataset, val_dataset],
                              early_stopping_rounds = n_early_stopping,
                              verbose_eval = verbose_freq,
                              feval = feval_rmspe)

            self.estimators_.append(model)
            # v_idx addresses shuffled rows; map back to original positions.
            self.oof_predictions_[order[v_idx]] = model.predict(x_tv)

        # y_data is in shuffled order here, so read the predictions in that order too.
        my_score = rmspe(y_data, self.oof_predictions_[order])
        self.oof_rmspe_ = my_score
        print(f'Out-of-folds error is {my_score}')

        return self

    def predict(self, X_data):
        """Return the mean prediction of all fold boosters for ``X_data``."""
        predictions = np.zeros(len(X_data))

        n_e = len(self.estimators_)

        for est in self.estimators_:
            predictions += est.predict(X_data)/n_e

        return predictions

### Catboost Ensemble

In [None]:
from catboost import CatBoostRegressor, Pool

class cbrEnsemble():
    """Group-k-fold ensemble of CatBoost regressors.

    ``fit`` trains one regressor per fold and records out-of-fold (OOF)
    predictions; ``predict`` averages the predictions of all fold models.

    Attributes set by ``fit``
    -------------------------
    estimators_ : list with one fitted regressor per fold.
    oof_predictions_ : array of OOF predictions, one per input row, in the
        SAME row order as the ``X_data`` passed to ``fit``.
    oof_rmspe_ : RMSPE of the OOF predictions.
    """

    def __init__(self, n_folds = 5, cbr_paras=None):
        """Store the fold count and the CatBoost keyword arguments (default: none)."""
        self.n_folds_ = n_folds
        # A fresh dict per instance instead of one shared mutable default.
        self.cbr_paras_ = {} if cbr_paras is None else cbr_paras

    def fit(self, X_data, y_data, group_idx, n_early_stopping=50, verbose_freq=50, categorical_features=None):
        """Train one regressor per group fold.

        Parameters
        ----------
        X_data, y_data : features and targets, indexable with ``.iloc``.
        group_idx : group label per row; a group never straddles a fold.
        n_early_stopping : early-stopping patience passed to the model's ``fit``.
        verbose_freq : logging period passed to the model's ``fit``.
        categorical_features : columns to treat as categorical (default: none).

        Returns
        -------
        self
        """
        if categorical_features is None:
            categorical_features = []

        n_rows = X_data.shape[0]
        kfold = GroupKFold(n_splits = self.n_folds_)
        self.oof_predictions_ = np.zeros(n_rows)
        self.estimators_ = []
        # Not used within this class; kept because it was part of the fitted state.
        self.weights = np.zeros(self.n_folds_)

        # Shuffle the rows and, with the very same permutation, an index array:
        # order[j] is the original position of shuffled row j. This lets the OOF
        # predictions be written back in the caller's row order.
        X_data, y_data, group_idx, order = shuffle(
            X_data, y_data, group_idx, np.arange(n_rows), random_state=111)

        for t_idx, v_idx in kfold.split(X_data, groups=group_idx):
            x_tt, x_tv = X_data.iloc[t_idx], X_data.iloc[v_idx]
            y_tt, y_tv = y_data.iloc[t_idx], y_data.iloc[v_idx]

            # Weighting squared errors by 1/y^2 makes the loss a relative error.
            train_weights = 1 / np.square(y_tt)
            val_weights = 1 / np.square(y_tv)

            train_dataset = Pool(x_tt, y_tt, weight=train_weights, cat_features = categorical_features)
            val_dataset = Pool(x_tv, y_tv, weight = val_weights, cat_features = categorical_features)

            model = CatBoostRegressor(**self.cbr_paras_)
            model = model.fit(train_dataset,
                              eval_set = val_dataset,
                              early_stopping_rounds = n_early_stopping,
                              verbose_eval = verbose_freq)

            self.estimators_.append(model)
            # v_idx addresses shuffled rows; map back to original positions.
            self.oof_predictions_[order[v_idx]] = model.predict(x_tv)

        # y_data is in shuffled order here, so read the predictions in that order too.
        my_score = rmspe(y_data, self.oof_predictions_[order])
        self.oof_rmspe_ = my_score
        print(f'Out-of-folds error is {my_score}')

        return self

    def predict(self, X_data):
        """Return the mean prediction of all fold regressors for ``X_data``."""
        predictions = np.zeros(len(X_data))

        n_e = len(self.estimators_)

        for est in self.estimators_:
            predictions += est.predict(X_data)/n_e

        return predictions

### LinearRegression ensemble

In [None]:
from sklearn.linear_model import LinearRegression

class lrStockEnsemble():
    """One weighted linear regression per stock id.

    ``fit`` trains a separate ``LinearRegression`` on the rows of each stock;
    ``predict`` routes every row to the model of its stock.
    """

    def __init__(self):
        return

    def fit(self, X_data, y_data):
        """Fit one model per distinct value of ``X_data['stock_id']``.

        Parameters
        ----------
        X_data : DataFrame of features including a 'stock_id' column (the
            column is also used as a regressor).
        y_data : Series of targets sharing the index of ``X_data``.

        Returns
        -------
        self
        """
        self.ensemble_dict_ = {}

        for si in np.unique(X_data['stock_id']):
            rows = (X_data['stock_id'] == si)

            X = X_data.loc[rows]
            y = y_data.loc[rows]

            # Weighting squared errors by 1/y^2 makes the fit minimise relative error.
            train_weights = 1 / np.square(y)

            self.ensemble_dict_[si] = LinearRegression().fit(
                X, y, sample_weight=train_weights)

        return self

    def predict(self, X_data):
        """Predict every row with the model fitted for its stock id.

        Rows are predicted in one batch per stock rather than one model call
        per row.

        Parameters
        ----------
        X_data : DataFrame with the same columns as used in ``fit``.

        Returns
        -------
        Series of predictions indexed like ``X_data``.

        Raises
        ------
        KeyError : if a stock id was not seen during ``fit``.
        """
        predictions = np.full(len(X_data), np.nan)
        stock_ids = X_data['stock_id'].to_numpy()

        for si in np.unique(stock_ids):
            rows = (stock_ids == si)
            predictions[rows] = self.ensemble_dict_[si].predict(X_data.loc[rows])

        return pd.Series(predictions, index=X_data.index)
    

#lr_se = lrStockEnsemble()

#lr_se.fit(x_tt, y_tt)
#rmspe(y_tv, lr_se.predict(x_tv))

In [None]:
from sklearn.linear_model import LinearRegression

class lrEnsemble():
    """Group-k-fold ensemble of weighted linear regressions.

    ``fit`` trains one ``LinearRegression`` per fold and records out-of-fold
    (OOF) predictions; ``predict`` averages the predictions of all fold models.

    Attributes set by ``fit``
    -------------------------
    estimators_ : list with one fitted model per fold.
    oof_predictions_ : array of OOF predictions, one per input row, in the
        SAME row order as the ``X_data`` passed to ``fit``.
    oof_rmspe_ : RMSPE of the OOF predictions.
    """

    def __init__(self, n_folds = 5):
        """Store the number of group folds."""
        self.n_folds_ = n_folds

    def fit(self, X_data, y_data, group_idx):
        """Train one regression per group fold.

        Parameters
        ----------
        X_data, y_data : features and targets, indexable with ``.iloc``.
        group_idx : group label per row; a group never straddles a fold.

        Returns
        -------
        self
        """
        n_rows = X_data.shape[0]
        kfold = GroupKFold(n_splits = self.n_folds_)
        self.oof_predictions_ = np.zeros(n_rows)
        self.estimators_ = []

        # Shuffle the rows and, with the very same permutation, an index array:
        # order[j] is the original position of shuffled row j. This lets the OOF
        # predictions be written back in the caller's row order.
        X_data, y_data, group_idx, order = shuffle(
            X_data, y_data, group_idx, np.arange(n_rows), random_state=111)

        for t_idx, v_idx in kfold.split(X_data, groups=group_idx):
            x_tt, x_tv = X_data.iloc[t_idx], X_data.iloc[v_idx]
            y_tt = y_data.iloc[t_idx]

            # Weighting squared errors by 1/y^2 makes the fit minimise relative error.
            train_weights = 1 / np.square(y_tt)

            model = LinearRegression().fit(x_tt, y_tt, sample_weight=train_weights)

            self.estimators_.append(model)
            # v_idx addresses shuffled rows; map back to original positions.
            self.oof_predictions_[order[v_idx]] = model.predict(x_tv)

        # y_data is in shuffled order here, so read the predictions in that order too.
        my_score = rmspe(y_data, self.oof_predictions_[order])
        self.oof_rmspe_ = my_score
        print(f'Out-of-folds error is {my_score}')

        return self

    def predict(self, X_data):
        """Return the mean prediction of all fold models for ``X_data``."""
        predictions = np.zeros(len(X_data))

        n_e = len(self.estimators_)

        for est in self.estimators_:
            predictions += est.predict(X_data)/n_e

        return predictions

### Model stack

In [None]:
class optModelStack():
    """Two-level stack: LightGBM, CatBoost and linear ensembles feed a LightGBM meta-model.

    Level 0 holds one ``lgbEnsemble`` per LightGBM parameter set, one
    ``cbrEnsemble`` per CatBoost parameter set and a single ``lrEnsemble``.
    Their out-of-fold predictions are appended as extra columns to the
    original features, and a further ``lgbEnsemble`` (the meta-model) is
    trained on the widened frame.
    """

    def __init__(self, lgb_ens_parasets,
                 cbr_ens_parasets,
                 lgb_meta_paras,
                 n_folds=5):
        """Create the (unfitted) level-0 ensembles and the meta-model.

        Parameters
        ----------
        lgb_ens_parasets : iterable of LightGBM parameter dicts, one ensemble each.
        cbr_ens_parasets : iterable of CatBoost parameter dicts, one ensemble each.
        lgb_meta_paras : LightGBM parameter dict for the meta-model.
        n_folds : number of group folds used by every ensemble in the stack.
        """
        # Column names of the level-0 prediction features, per model family.
        self.feature_names_ = {'lgb':[], 'cbr':[], 'lr':[]}
        self.cbr_ensembles_ = []
        self.lgb_ensembles_ = []

        for i, paras in enumerate(lgb_ens_parasets):
            self.lgb_ensembles_.append(lgbEnsemble(n_folds=n_folds, lgb_paras = paras))
            self.feature_names_['lgb'].append(f'lgb_f{i}')

        for i, paras in enumerate(cbr_ens_parasets):
            self.cbr_ensembles_.append(cbrEnsemble(n_folds=n_folds, cbr_paras = paras))
            self.feature_names_['cbr'].append(f'cbr_f{i}')

        # The linear ensemble uses the same fold count as the other level-0 models.
        self.lr_ensembles_ = [lrEnsemble(n_folds=n_folds)]
        self.feature_names_['lr'].append('f0')

        self.meta_model_ = lgbEnsemble(n_folds=n_folds, lgb_paras = lgb_meta_paras)

        self.n_folds=n_folds

    def fit(self, X_data, y_data, group_idx, **extra_kwargs):
        """Fit all level-0 ensembles, then the meta-model on features + OOF predictions.

        Parameters
        ----------
        X_data, y_data : features and targets.
        group_idx : group label per row, forwarded to every ensemble.
        **extra_kwargs : forwarded to the ``fit`` of the LightGBM and CatBoost
            level-0 ensembles only; the linear ensemble and the meta-model
            are fitted with their default options.

        Returns
        -------
        self

        Notes
        -----
        The OOF columns are joined to ``X_data`` purely by position, so this
        relies on each ensemble's ``oof_predictions_`` being in the same row
        order as ``X_data``.
        """
        self.oof_predictions_ = pd.DataFrame()

        # (family label, ensembles, keyword arguments for their fit)
        level0 = (
            ('lgb', self.lgb_ensembles_, extra_kwargs),
            ('cbr', self.cbr_ensembles_, extra_kwargs),
            ('lr', self.lr_ensembles_, {}),
        )
        for family, ensembles, fit_kwargs in level0:
            for i, ens in enumerate(ensembles):
                ens.fit(X_data, y_data, group_idx, **fit_kwargs)
                col_name = self.feature_names_[family][i]
                self.oof_predictions_[col_name] = ens.oof_predictions_
                print(family, i, "error:", ens.oof_rmspe_)

        # Both frames carry a fresh 0..n-1 index here, so concat aligns by position.
        meta_features = pd.concat(
            [X_data.reset_index(drop=True), self.oof_predictions_], axis=1)

        self.meta_model_.fit(meta_features, y_data, group_idx)

        return self

    def predict_L0(self, X_data):
        """Return a DataFrame with one prediction column per level-0 ensemble."""
        predictions = pd.DataFrame()

        level0 = (
            ('lgb', self.lgb_ensembles_),
            ('cbr', self.cbr_ensembles_),
            ('lr', self.lr_ensembles_),
        )
        for family, ensembles in level0:
            for i, ens in enumerate(ensembles):
                predictions[self.feature_names_[family][i]] = ens.predict(X_data)

        return predictions

    def predict(self, X_data):
        """Predict with the meta-model on ``X_data`` widened by the level-0 predictions."""
        res = self.predict_L0(X_data)
        X_data = pd.concat([X_data.reset_index(drop=True), res], axis=1)

        return self.meta_model_.predict(X_data)

In [None]:
def evaluate_lr(X, y, group_idx, n_folds=5):
    """Return the mean group-k-fold RMSPE of a weighted linear regression.

    Parameters
    ----------
    X, y : features and targets, indexable with ``.iloc``.
    group_idx : group label per row; a group never straddles a fold.
    n_folds : number of group folds.

    Returns
    -------
    Average validation RMSPE over the folds.
    """
    # Shuffle BEFORE building the splitter. split() yields row positions for
    # the exact objects it is given, so it has to see the shuffled data;
    # otherwise the positions are applied to differently ordered rows and the
    # groups leak between the train and validation parts.
    X, y, group_idx = shuffle(X, y, group_idx, random_state=111)

    error = 0
    for t_i, v_i in GroupKFold(n_splits=n_folds).split(X, y, groups=group_idx):
        # 1/y^2 sample weights make the fit minimise relative error.
        model = LinearRegression().fit(X.iloc[t_i], y.iloc[t_i], sample_weight = 1/np.square(y.iloc[t_i]))
        error += rmspe(y.iloc[v_i], model.predict(X.iloc[v_i]))/n_folds

    return error


def evaluate_knnr(X, y, group_idx, n_folds=5):
    """Return the mean group-k-fold RMSPE of a 10-nearest-neighbour regressor.

    Parameters
    ----------
    X, y : features and targets, indexable with ``.iloc``.
    group_idx : group label per row; a group never straddles a fold.
    n_folds : number of group folds.

    Returns
    -------
    Average validation RMSPE over the folds.
    """
    # Imported locally so the function does not depend on a notebook-level
    # import that may not have been executed.
    from sklearn.neighbors import KNeighborsRegressor

    # Shuffle BEFORE building the splitter. split() yields row positions for
    # the exact objects it is given, so it has to see the shuffled data;
    # otherwise the positions are applied to differently ordered rows and the
    # groups leak between the train and validation parts.
    X, y, group_idx = shuffle(X, y, group_idx, random_state=111)

    error = 0
    for t_i, v_i in GroupKFold(n_splits=n_folds).split(X, y, groups=group_idx):
        model = KNeighborsRegressor(n_neighbors=10, weights='distance', metric='euclidean').fit(X.iloc[t_i], y.iloc[t_i])
        error += rmspe(y.iloc[v_i], model.predict(X.iloc[v_i]))/n_folds
    return error

### Train ensemble

Train models for each stock id and calculate average oof-errors

In [None]:
# Report the share of virtual memory in use before training starts.
print('RAM memory % used:', psutil.virtual_memory().percent)

# CatBoost parameter set 0: RMSE loss, 600 iterations of depth-6 trees on CPU.
cbr_paras0 = {
    'loss_function': 'RMSE',
    'task_type': 'CPU',
    'iterations': 600,
    'learning_rate': 0.05,
    'depth': 6,
    'min_data_in_leaf': 3,
}

In [None]:
# Level-0 learners: three LightGBM parameter sets and one CatBoost set.
lgb_paralist = [lgb_paras0, lgb_paras1, lgb_paras2]
cbr_paralist = [cbr_paras0]

# lgb_paras3 configures the LightGBM meta-model; every ensemble uses 7 folds.
model_stack = optModelStack(
    lgb_ens_parasets=lgb_paralist,
    cbr_ens_parasets=cbr_paralist,
    lgb_meta_paras=lgb_paras3,
    n_folds=7,
)

# 'time_id' serves as the group label only and is removed from the features.
model_stack.fit(
    X_data.drop(columns=['time_id']),
    y_data,
    X_data['time_id'],
    verbose_freq=0,
    categorical_features=categorical_features,
)

## Train lightgbm model
### Feature importances
#### Calculate feature importances for model

Removed to reduce complexity of code, seems to be best to keep all...

## Check % RAM used

In [None]:
# Report the share of virtual memory in use ('percent' is the 3rd field of the result).
print('RAM memory % used:', psutil.virtual_memory().percent)

## Train model ensemble

Notes:

* baseline (folds=3, Na's replaced with zero)

Out-of-folds RMSPE is 0.22493430359860675

Error on valid set is: 0.22943458641036366

* Na's kept (folds=5)

Out-of-folds RMSPE is 0.2259242998405427

Error on valid set is: 0.23030996419701785

* add gradient features (folds=5)

Out-of-folds RMSPE is 0.2252442009993441

Error on valid set is: 0.2292844069587652

* add gradient features+ vw/tw-features (folds=5)

Out-of-folds RMSPE is 0.22541343505501404
Error on valid set is: 0.2289696589191299

# Predict on test data and submit
## Load test data

In [None]:
# not needed?
#test = pd.read_csv(data_path + 'test.csv')
#test['time_id'] = test['time_id'] .astype(int)
#test['stock_id'] = test['stock_id'].astype(int)
#test.reset_index(inplace=True, drop=True)

## Calculate predictions

In [None]:
# Predict with the fitted stack; as during training, 'time_id' is not a feature.
test_pred = model_stack.predict(X_test.drop(columns=['time_id']))

# Store the predictions next to the row ids and preview the result.
all_agg_test['target'] = test_pred
all_agg_test.head()

## Save result to file

In [None]:
# The submission file holds exactly two columns: the row id and the predicted target.
final_result = all_agg_test.loc[:, ['row_id', 'target']]
final_result.to_csv('submission.csv', index=False)
final_result.head()