In [None]:
import pandas as pd
import numpy as np
import gc

from joblib import Parallel, delayed
import pickle

from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.cluster import KMeans

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import statsmodels.api as sm

# Notebook-level configuration.
# NOTE(review): path_submissions, target_name and scores_folds are not
# referenced anywhere in the visible part of this notebook — confirm before removing.
path_submissions = '/'

target_name = 'target'
scores_folds = {}

# data directory (Kaggle competition input; all CSV/parquet reads are relative to it)
data_dir = '../input/optiver-realized-volatility-prediction/'

In [None]:
def gradient(y_true, y_pred):
    """First derivative, w.r.t. ``y_pred``, of ``((y_true - y_pred) / y_true) ** 2``."""
    residual = y_true - y_pred
    squared_target = y_true ** 2
    return -2 * residual / squared_target

def hessian(y_true, y_pred):
    """Second derivative, w.r.t. ``y_pred``, of the squared percentage error.

    The value does not depend on ``y_pred``; the argument is accepted only so
    the signature mirrors :func:`gradient`.
    """
    squared_target = y_true ** 2
    return 2 / squared_target

def SPELoss(y_true, y_pred):
    """Squared-percentage-error objective: returns the ``(grad, hess)`` pair.

    Passed as the ``objective`` callable of ``LGBMRegressor`` in ``train_lgbm``.
    """
    return gradient(y_true, y_pred), hessian(y_true, y_pred)

def eval_rmspe(y_true, y_pred):
    """Evaluation-metric callable returning ``('RMSPE', value, False)``.

    The third element is ``False``, i.e. the metric is declared as
    lower-is-better in the ``(name, value, is_higher_better)`` tuple.
    """
    pct_error = (y_true - y_pred) / y_true
    score = np.sqrt(np.mean(np.square(pct_error), axis=0))
    return 'RMSPE', score, False

def rmspe(y_true, y_pred):
    """Root mean squared percentage error between ``y_true`` and ``y_pred``."""
    pct_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(pct_error))
    return np.sqrt(mean_squared)

def calc_wap1(df):
    """Level-1 weighted average price: each price is weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    return (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)

def calc_wap2(df):
    """Level-2 weighted average price: each price is weighted by the opposite side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    return (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)

def calc_wap3(df):
    """Level-1 size-weighted price: each price is weighted by its own side's size."""
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    return (bid_px * bid_sz + ask_px * ask_sz) / (bid_sz + ask_sz)

def calc_wap4(df):
    """Level-2 size-weighted price: each price is weighted by its own side's size."""
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    return (bid_px * bid_sz + ask_px * ask_sz) / (bid_sz + ask_sz)

def calculate_double_depth_bid_size(df):
    """Total bid size over the first two book levels."""
    level1, level2 = df['bid_size1'], df['bid_size2']
    return level1 + level2
    
def calculate_double_depth_ask_size(df):
    """Total ask size over the first two book levels."""
    level1, level2 = df['ask_size1'], df['ask_size2']
    return level1 + level2

def calculate_double_depth_bid(df):
    """Size-weighted bid price over two levels, plus the combined bid size.

    Returns ``(price, size)`` where ``price`` is a NumPy array and ``size`` is
    the pandas Series produced by ``calculate_double_depth_bid_size``.
    """
    total_size = calculate_double_depth_bid_size(df)
    notional = df['bid_price1'] * df['bid_size1'] + df['bid_price2'] * df['bid_size2']
    return notional.values / total_size.values, total_size

def calculate_double_depth_ask(df):
    """Size-weighted ask price over two levels, plus the combined ask size.

    Returns ``(price, size)`` where ``price`` is a NumPy array and ``size`` is
    the pandas Series produced by ``calculate_double_depth_ask_size``.
    """
    total_size = calculate_double_depth_ask_size(df)
    notional = df['ask_price1'] * df['ask_size1'] + df['ask_price2'] * df['ask_size2']
    return notional.values / total_size.values, total_size

def calculate_double_depth_wap(df):
    """Weighted average price built from the two-level aggregated bid and ask.

    Each side's aggregated price is weighted by the opposite side's aggregated
    size. Returns a NumPy array.
    """
    bid_price, bid_size = calculate_double_depth_bid(df)
    ask_price, ask_size = calculate_double_depth_ask(df)
    total_size = bid_size + ask_size
    # ndarray * Series yields a Series, hence the .values below.
    cross_weighted = bid_price * ask_size + ask_price * bid_size
    return cross_weighted.values / total_size.values

def calculate_spread(df):
    """Relative level-1 spread: ``ask_price1 / bid_price1 - 1``."""
    ask_over_bid = df['ask_price1'] / df['bid_price1']
    return ask_over_bid - 1

def calculate_double_depth_spread(df):
    """Relative spread between the two-level aggregated ask and bid prices."""
    bid_price, _ = calculate_double_depth_bid(df)
    ask_price, _ = calculate_double_depth_ask(df)
    return ask_price / bid_price - 1

def calculate_order_book_imbalance(df):
    """Level-1 size imbalance: ``(bid - ask) / (bid + ask)``."""
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    return (bid_sz - ask_sz) / (bid_sz + ask_sz)

def calculate_double_depth_order_book_imbalance(df):
    """Two-level size imbalance: ``(bid - ask) / (bid + ask)`` on combined sizes."""
    bid_sz = calculate_double_depth_bid_size(df)
    ask_sz = calculate_double_depth_ask_size(df)
    return (bid_sz - ask_sz) / (bid_sz + ask_sz)

def log_return(series):
    """Difference of consecutive natural logs of ``series`` (first element is NaN)."""
    logged = np.log(series)
    return logged.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Square root of the sum of squared values in ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def count_unique(series):
    """Number of distinct values in ``series`` (as found by ``np.unique``)."""
    distinct = np.unique(series)
    return distinct.size

def read_train():
    """Load ``train.csv`` from ``data_dir`` and add a ``'<stock_id>-<time_id>'`` ``row_id`` column."""
    frame = pd.read_csv(data_dir + 'train.csv')
    stock_part = frame['stock_id'].astype(str)
    time_part = frame['time_id'].astype(str)
    frame['row_id'] = stock_part + '-' + time_part
    return frame

def read_test():
    """Load ``test.csv`` from ``data_dir`` and add a ``'<stock_id>-<time_id>'`` ``row_id`` column."""
    frame = pd.read_csv(data_dir + 'test.csv')
    stock_part = frame['stock_id'].astype(str)
    time_part = frame['time_id'].astype(str)
    frame['row_id'] = stock_part + '-' + time_part
    return frame
    
def calculate_volume_per_trade(df):
    """Traded ``size`` divided by ``order_count``, row by row."""
    traded, orders = df['size'], df['order_count']
    return traded / orders


# Function to get group stats for different windows (seconds in bucket)
def get_stats_window(df, fe_dict, seconds_in_bucket, add_suffix = False):
    """Aggregate ``df`` per ``time_id`` over rows at or after ``seconds_in_bucket``.

    ``fe_dict`` maps column name -> list of aggregations. The resulting
    two-level column labels are flattened with ``'_'`` (so the key column
    becomes ``'time_id_'``). With ``add_suffix=True`` every column, including
    the key, additionally gets ``'_<seconds_in_bucket>'`` appended.
    """
    in_window = df['seconds_in_bucket'] >= seconds_in_bucket
    aggregated = df[in_window].groupby(['time_id']).agg(fe_dict)
    df_feature = aggregated.reset_index()
    # Flatten (column, stat) pairs into "column_stat"
    df_feature.columns = list(map('_'.join, df_feature.columns))
    if not add_suffix:
        return df_feature
    # Tag columns with the window start so different windows can be merged
    return df_feature.add_suffix('_' + str(seconds_in_bucket))

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for one stock.

    Parameters
    ----------
    file_path : str
        Path of the stock's book parquet partition. The stock id is taken from
        the text after the first ``'='`` in the path (``.../stock_id=<id>``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with flattened ``<column>_<stat>`` features,
        realized-volatility features for the windows starting at 500, 400,
        300, 200 and 100 seconds (suffix ``_<start>``), and a ``row_id``
        column of the form ``'<stock_id>-<time_id>'``.

    Notes
    -----
    The order of the produced columns is kept exactly as before, since
    models consuming these features may rely on column position.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)
    df['wap4'] = calc_wap4(df)
    df['wap_avg'] = calculate_double_depth_wap(df)
    # Calculate log returns (within each time_id, so no return spans two buckets)
    for wap_col, return_col in [
        ('wap1', 'log_return1'),
        ('wap2', 'log_return2'),
        ('wap3', 'log_return3'),
        ('wap4', 'log_return4'),
        ('wap_avg', 'log_return_avg'),
    ]:
        df[return_col] = df.groupby(['time_id'])[wap_col].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['bid_ask_spread'] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    df['spread'] = calculate_spread(df)
    df['spread_2'] = calculate_double_depth_spread(df)
    df['ob_imb'] = calculate_order_book_imbalance(df)
    df['ob_imb_2'] = calculate_double_depth_order_book_imbalance(df)

    # Aggregation recipes. The aggregations receive one Series per group, so the
    # Series reductions are used for kurt/skew: the unbound DataFrame methods
    # raise TypeError on a Series in recent pandas. The resulting column labels
    # ('kurt', 'skew') are the same. The NumPy callables are kept as-is because
    # the generated column labels derive from the callable's name.
    wap_stats = [np.sum, np.std]
    return_stats = [np.mean, np.median, np.max, np.min, realized_volatility, pd.Series.kurt, pd.Series.skew]
    level_stats = ['last', np.mean, np.median, np.sum, np.max, np.min, 'std', pd.Series.kurt, pd.Series.skew]

    # Dict for aggregations (insertion order defines the output column order)
    create_feature_dict = {}
    for col in ('wap1', 'wap2', 'wap3', 'wap4', 'wap_avg'):
        create_feature_dict[col] = list(wap_stats)
    for col in ('log_return1', 'log_return2', 'log_return3', 'log_return4', 'log_return_avg'):
        create_feature_dict[col] = list(return_stats)
    for col in (
        'wap_balance', 'price_spread', 'price_spread2', 'bid_spread', 'ask_spread',
        'total_volume', 'volume_imbalance', 'bid_ask_spread',
        'spread', 'spread_2', 'ob_imb', 'ob_imb_2',
    ):
        create_feature_dict[col] = list(level_stats)

    # NOTE(review): 'wap_avg' here is the price level, not 'log_return_avg';
    # presumably the latter was intended. Left unchanged because it alters the
    # feature values/names that already-trained models were fitted on — confirm.
    create_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
        'wap_avg': [realized_volatility],
    }

    # Full-bucket stats first, then one left-merge per trailing window
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket=0, add_suffix=False)
    for window_start in (500, 400, 300, 200, 100):
        window_key = f'time_id__{window_start}'
        df_window = get_stats_window(df, create_feature_dict_time, seconds_in_bucket=window_start, add_suffix=True)
        df_feature = df_feature.merge(df_window, how='left', left_on='time_id_', right_on=window_key)
        # Drop the window's duplicate key column
        df_feature = df_feature.drop(columns=[window_key])

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns=['time_id_'])
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for one stock.

    Parameters
    ----------
    file_path : str
        Path of the stock's trade parquet partition. The stock id is taken from
        the text after the first ``'='`` in the path (``.../stock_id=<id>``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``; every feature column is prefixed with
        ``'trade_'`` and a ``row_id`` column (``'<stock_id>-<time_id>'``) is
        added for merging with the book features.

    Notes
    -----
    The order of the produced columns is kept exactly as before, since
    models consuming these features may rely on column position.
    """
    df = pd.read_parquet(file_path)
    # transform() returns a result aligned to df's own index. groupby.apply()
    # prepends the group key to the index on pandas >= 2.0, which makes the
    # column assignment fail. This also matches book_preprocessor.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount'] = df['price'] * df['size']
    df['seconds_between_trades'] = df.groupby('time_id')['seconds_in_bucket'].diff()
    df['volume_per_trade'] = calculate_volume_per_trade(df)

    # Aggregations receive one Series per group, so the Series reductions are
    # used for kurt/skew (the unbound DataFrame methods raise TypeError on a
    # Series in recent pandas). The column labels ('kurt', 'skew') are the same.
    return_stats = [np.mean, np.median, np.max, np.min, realized_volatility, pd.Series.kurt, pd.Series.skew]
    dist_stats = ['last', np.mean, np.median, np.sum, np.max, np.min, 'std', pd.Series.kurt, pd.Series.skew]

    # Dict for aggregations (insertion order defines the output column order)
    create_feature_dict = {
        'price': [np.sum, np.std],
        'log_return': return_stats,
        'seconds_in_bucket': [count_unique],
        'size': list(dist_stats),
        'order_count': dist_stats[1:],  # same stats without 'last'
        'seconds_between_trades': list(dist_stats),
    }
    create_feature_dict_time = {
        'log_return':   [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': [np.sum],
        'order_count':  [np.sum],
    }

    # Full-bucket stats
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket=0, add_suffix=False)

    def tendency(price, vol):
        # Sum of percentage price changes, each weighted by the size of the later trade
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)

    # Hand-rolled per-time_id descriptors. A single groupby pass replaces the
    # repeated boolean-mask scan of the whole frame; sort=False keeps the
    # first-appearance order of df['time_id'].unique().
    lis = []
    for n_time_id, df_id in df.groupby('time_id', sort=False):
        price = df_id['price'].values
        size = df_id['size'].values
        mean_price = np.mean(price)
        mean_size = np.mean(size)
        price_steps = np.diff(price)

        lis.append({
            'time_id': n_time_id,
            'tendency': tendency(price, size),
            'f_max': np.sum(price > mean_price),    # trades above the mean price
            'f_min': np.sum(price < mean_price),    # trades below the mean price
            'df_max': np.sum(price_steps > 0),      # up-ticks
            'df_min': np.sum(price_steps < 0),      # down-ticks
            'abs_diff': np.median(np.abs(price - mean_price)),
            'energy': np.mean(price**2),
            'iqr_p': np.percentile(price, 75) - np.percentile(price, 25),
            # vol vars
            'abs_diff_v': np.median(np.abs(size - mean_size)),
            'energy_v': np.sum(size**2),
            'iqr_p_v': np.percentile(size, 75) - np.percentile(size, 25),
        })
    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    df_feature = df_feature.drop(columns=['time_id'])

    # One left-merge per trailing window, dropping the window's duplicate key column
    for window_start in (500, 400, 300, 200, 100):
        window_key = f'time_id__{window_start}'
        df_window = get_stats_window(df, create_feature_dict_time, seconds_in_bucket=window_start, add_suffix=True)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        df_feature = df_feature.drop(columns=[window_key])

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns=['trade_time_id_'])
    return df_feature

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time_id summary statistics of volatility features.

    For each of the realized-volatility columns listed in ``vol_cols`` the
    mean, std, max, min, kurtosis and skewness are computed once per
    ``stock_id`` (columns suffixed ``_stock``) and once per ``time_id``
    (columns suffixed ``_time``), then left-merged back onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``time_id`` and every column in ``vol_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the ``_stock`` and then the
        ``_time`` statistic columns. The input frame is not modified.
    """
    vol_cols = [
        'log_return1_realized_volatility', 
        'log_return2_realized_volatility', 
        'log_return1_realized_volatility_400', 
        'log_return2_realized_volatility_400', 
        'log_return1_realized_volatility_300', 
        'log_return2_realized_volatility_300', 
        'log_return1_realized_volatility_200', 
        'log_return2_realized_volatility_200', 
        'trade_log_return_realized_volatility', 
        'trade_log_return_realized_volatility_400', 
        'trade_log_return_realized_volatility_300', 
        'trade_log_return_realized_volatility_200'
    ]
    # Each aggregation is applied to one Series per group, so the Series
    # reductions are used for kurt/skew (the unbound DataFrame methods raise
    # TypeError on a Series in recent pandas). Column labels stay 'kurt'/'skew'.
    stats = ['mean', 'std', 'max', 'min', pd.Series.kurt, pd.Series.skew]

    def _group_stats(frame, key, suffix):
        # Aggregate vol_cols per `key`, flatten "column_stat", tag with the suffix
        grouped = frame.groupby([key])[vol_cols].agg(stats).reset_index()
        grouped.columns = ['_'.join(col) for col in grouped.columns]
        return grouped.add_suffix('_' + suffix)

    # Group by the stock id
    df_stock_id = _group_stats(df, 'stock_id', 'stock')
    # Group by the time id
    df_time_id = _group_stats(df, 'time_id', 'time')

    # Merge with original dataframe
    df = df.merge(df_stock_id, how='left', left_on=['stock_id'], right_on=['stock_id__stock'])
    df = df.merge(df_time_id, how='left', left_on=['time_id'], right_on=['time_id__time'])
    return df.drop(['stock_id__stock', 'time_id__time'], axis = 1)
    
# Function to run the per-stock preprocessing in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train=True):
    """Build book + trade features for every stock id and stack them.

    ``is_train`` selects the ``*_train.parquet`` or ``*_test.parquet``
    partitions under ``data_dir``. Each stock is processed by a joblib worker
    (``n_jobs=-1``); the per-stock frames are concatenated with a fresh index.
    """

    def _features_for(stock_id):
        # Pick the partition prefixes for the requested split
        if is_train:
            book_prefix = data_dir + "book_train.parquet/stock_id="
            trade_prefix = data_dir + "trade_train.parquet/stock_id="
        else:
            book_prefix = data_dir + "book_test.parquet/stock_id="
            trade_prefix = data_dir + "trade_test.parquet/stock_id="

        book_features = book_preprocessor(book_prefix + str(stock_id))
        trade_features = trade_preprocessor(trade_prefix + str(stock_id))
        # Book rows are the base; trade features are attached where available
        return pd.merge(book_features, trade_features, on='row_id', how='left')

    jobs = (delayed(_features_for)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs=-1, verbose=1)(jobs)
    return pd.concat(per_stock_frames, ignore_index=True)

In [None]:
# NOTE(review): `pickle` is already imported in the first cell; this re-import is redundant.
import pickle
# Inference-only run: the training-set feature build below is commented out and
# only the test set is processed.
# train = read_train()
test = read_test()

# Training-set feature build (disabled). The last line loads a cached copy instead.
# train_stock_ids = train['stock_id'].unique()
# # Preprocess them using Parallel and our single stock id functions
# train_ = preprocessor(train_stock_ids, is_train=True)
# train = train.merge(train_, on=['row_id'], how='left')
# pickle.dump(train, open('train.p', 'wb+'))
# train = pickle.load(open('../input/too-many-features/train.p', 'rb'))

# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train=False)
# Left merge keeps every test row even if no book/trade features were produced for it
test = test.merge(test_, on=['row_id'], how='left')

In [None]:
# replace by order sum (tau)
# size_tau*: sqrt(1 / number of distinct seconds with a trade), for the full
# bucket and for the windows starting at 400 / 300 / 200 seconds.
# (The train-set equivalents are commented out in this inference-only run.)
# train['size_tau'] = np.sqrt(1 / train['trade_seconds_in_bucket_count_unique'])
# train['size_tau_400'] = np.sqrt(1 / train['trade_seconds_in_bucket_count_unique_400'])
# train['size_tau_300'] = np.sqrt(1 / train['trade_seconds_in_bucket_count_unique_300'])
# train['size_tau_200'] = np.sqrt(1 / train['trade_seconds_in_bucket_count_unique_200'])

test['size_tau'] = np.sqrt(1 / test['trade_seconds_in_bucket_count_unique'])
test['size_tau_400'] = np.sqrt(1 / test['trade_seconds_in_bucket_count_unique_400'])
test['size_tau_300'] = np.sqrt(1 / test['trade_seconds_in_bucket_count_unique_300'])
test['size_tau_200'] = np.sqrt(1 / test['trade_seconds_in_bucket_count_unique_200'])

# size_tau2*: same idea based on the order count. Note that the _400/_300/_200
# variants all use the FULL-bucket 'trade_order_count_sum' scaled by a constant
# (0.33 / 0.5 / 0.66), not the per-window sums.
# train['size_tau2'] = np.sqrt(1 / train['trade_order_count_sum'])
# train['size_tau2_400'] = np.sqrt(0.33 / train['trade_order_count_sum'])
# train['size_tau2_300'] = np.sqrt(0.5 / train['trade_order_count_sum'])
# train['size_tau2_200'] = np.sqrt(0.66 / train['trade_order_count_sum'])

test['size_tau2'] = np.sqrt(1 / test['trade_order_count_sum'])
test['size_tau2_400'] = np.sqrt(0.33 / test['trade_order_count_sum'])
test['size_tau2_300'] = np.sqrt(0.5 / test['trade_order_count_sum'])
test['size_tau2_200'] = np.sqrt(0.66 / test['trade_order_count_sum'])

# Difference between the 400-window variant and the full-bucket value
# train['size_tau2_d'] = train['size_tau2_400'] - train['size_tau2']
test['size_tau2_d'] = test['size_tau2_400'] - test['size_tau2']

In [None]:
# Add per-stock and per-time_id aggregates of the volatility features.
# These statistics are computed from the rows of the frame passed in, i.e. here
# from the test set only.
# train = get_time_stock(train)
test = get_time_stock(test)

In [None]:
# Cluster stocks by the correlation of their targets across time_ids, then add
# per-cluster averages of selected features as extra columns on `test`.
train_p = pd.read_csv(data_dir + 'train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

corr = train_p.corr()
ids = corr.index
n_clusters=5
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(corr.values)
print(kmeans.labels_)

# Stock ids belonging to each cluster
l = [list(ids[kmeans.labels_ == n]) for n in range(n_clusters)]


mat = []
matTest = []

n = 0
for ind in l:
#     print(ind)
#     newDf = train.loc[train['stock_id'].isin(ind)]
#     newDf = newDf.groupby(['time_id']).agg(np.nanmean)
#     newDf.loc[:, 'stock_id'] = str(n) + 'c1'
#     mat.append(newDf)

    newDf = test.loc[test['stock_id'].isin(ind)]
    # Mean of the numeric columns only: the string row_id column cannot be
    # averaged and makes agg(np.nanmean) raise on recent pandas.
    newDf = newDf.groupby(['time_id']).mean(numeric_only=True)
    # Replace the averaged stock_id with the cluster label (plain column
    # assignment avoids an in-place numeric -> string dtype change)
    newDf['stock_id'] = str(n) + 'c1'
    matTest.append(newDf)

    n+=1

# mat1 = pd.concat(mat).reset_index()
# mat1.drop(columns=['target'], inplace=True)

mat2 = pd.concat(matTest).reset_index()

# mat1 = mat1.pivot(index='time_id', columns='stock_id')
# mat1.columns = ["_".join(x) for x in mat1.columns.ravel()]
# mat1.reset_index(inplace=True)

# One row per time_id, one column per (feature, cluster) pair.
mat2 = mat2.pivot(index='time_id', columns='stock_id')
mat2.columns = ["_".join(x) for x in mat2.columns]
mat2.reset_index(inplace=True)

cluster_feature_cols = ['time_id']
for i in range(n_clusters):
    cluster_feature_cols += [
        f'log_return1_realized_volatility_{i}c1',
        f'total_volume_sum_{i}c1',
        f'trade_size_sum_{i}c1',
        f'trade_order_count_sum_{i}c1',
        f'price_spread_sum_{i}c1',
        f'bid_spread_sum_{i}c1',
        f'ask_spread_sum_{i}c1',
        f'volume_imbalance_sum_{i}c1',
        f'bid_ask_spread_sum_{i}c1',
        f'size_tau2_{i}c1',
    ]

# Clusters with no stock in `test` produce no columns in the pivot. reindex()
# adds them as all-NaN so the merged frame always has the full column set.
# This replaces the earlier approach of appending copied rows with placeholder
# time_ids, which duplicated every (time_id, stock_id) pair once mat2 had more
# than a handful of rows and therefore made pivot() raise.
# NOTE(review): a misspelled feature name now yields an all-NaN column instead
# of a KeyError.
mat2 = mat2.reindex(columns=cluster_feature_cols)

# train = pd.merge(train, mat1[cluster_feature_cols], how='left', on='time_id')
test = pd.merge(test, mat2, how='left', on='time_id')

del mat2
gc.collect()

In [None]:
# Model input columns: everything except identifiers and the label.
# `no_stock_feature_cols` keeps `stock_id`; `feature_cols` drops it as well.
no_stock_feature_cols = [
    col for col in test.columns
    if col not in ("time_id", "target", "row_id")
]
feature_cols = [col for col in no_stock_feature_cols if col != "stock_id"]
stocks = pd.unique(test['stock_id'])

In [None]:
def train_lgbm(X_train, y_train, X_val, y_val, n_estimators=10000):
    """Fit an ``LGBMRegressor`` with the ``SPELoss`` objective and RMSPE early stopping.

    Training stops early after 20 rounds without improvement of ``eval_rmspe``
    on ``(X_val, y_val)``. Returns the fitted estimator.

    NOTE(review): the ``early_stopping_rounds`` and ``verbose`` keyword
    arguments of ``fit`` were removed in LightGBM 4.0 — confirm the pinned
    LightGBM version before upgrading.
    """
    hyperparams = dict(
        learning_rate=0.05,
        n_jobs=-1,
        objective=SPELoss,
        n_estimators=n_estimators,
        max_depth=4,
        boosting_type='gbdt',
        importance_type='gain',
        subsample=0.9,
        subsample_freq=4,
        subsample_for_bin=50000,
        num_leaves=100,
        min_data_in_leaf=10,
    )
    regressor = LGBMRegressor(**hyperparams)
    validation_sets = [(X_val, y_val)]
    return regressor.fit(
        X=X_train,
        y=y_train,
        eval_set=validation_sets,
        early_stopping_rounds=20,
        eval_metric=eval_rmspe,
        verbose=1000,
    )

In [None]:
# Disabled (kept as a string literal, never executed): fits one model on the
# whole training set, ranks rows by absolute percentage error and removes the
# 30000 worst-predicted rows from `train`. Requires `train`, which is not
# loaded in this inference-only run.
"""
fitted_all = train_lgbm(
    train[feature_cols], 
    train['target'],
    train[feature_cols], 
    train['target'],
    n_estimators=500
)

predictions_all = pd.Series(fitted_all.predict(train[feature_cols])).to_frame('predicted')
predictions_all['actual'] = train['target']
predictions_all['err'] = ((predictions_all['actual'] - predictions_all['predicted']) / predictions_all['actual']).abs()
predictions_all['time_id'] = train['time_id']
predictions_all['stock_id'] = train['stock_id']
worst_samples = predictions_all.sort_values('err').iloc[-30000:]
worst_samples['stock_time_id'] = worst_samples['stock_id'].astype(str) + worst_samples['time_id'].astype(str)

#adjust train
train['stock_time_id'] = train['stock_id'].astype(str) + train['time_id'].astype(str)
train = train.loc[~train['stock_time_id'].isin(worst_samples['stock_time_id'])]
train = train.drop('stock_time_id', axis=1)
"""

In [None]:
# Disabled (kept as a string literal, never executed): 3-fold KFold training of
# one LightGBM model per stock, collecting the fitted models in `stock_models`
# and printing per-stock and per-fold out-of-fold RMSPE. The models used for
# inference are loaded from a pickle in the next cell instead.
"""
n = 0
stock_models = {
    stock: [] for stock in stocks
}
combined_model = []
combined_model_no_stock = []

for train_inds, val_inds in KFold(n_splits=3, random_state=2021, shuffle=True).split(train):
# for train_inds, val_inds in GroupShuffleSplit(test_size=.20, n_splits=5, random_state=7).split(train, groups=train['time_id']):
# for train_inds, val_inds in GroupShuffleSplit(test_size=.50, n_splits=20, random_state=7).split(train, groups=train['time_id']):
    
    print(f'Group Train-Val Fold {n+1}')
    X_train = train.iloc[train_inds].copy()
    X_val = train.iloc[val_inds].copy()

    # Get group stats of time_id and stock_id
    
    # X_train = get_time_stock(X_train)
    # X_val = get_time_stock(X_val)
    
    all_oof_predictions = []
    for stock in stocks:
        print(f'Stock {stock}')
        X_train_stock = X_train.loc[X_train['stock_id']==stock]
        X_val_stock = X_val.loc[X_val['stock_id']==stock]
        
        fitted = train_lgbm(
            X_train_stock[no_stock_feature_cols], 
            X_train_stock['target'],
            X_val_stock[no_stock_feature_cols], 
            X_val_stock['target']
        )
        stock_models[stock].append(fitted)
        oof_predictions = pd.Series(fitted.predict(X_val_stock[no_stock_feature_cols])).to_frame('predicted')
        oof_predictions['actual'] = X_val_stock['target'].values
        stock_rmspe = rmspe(oof_predictions['actual'], oof_predictions['predicted'])
        print(f'Stock {stock} RMSPE {stock_rmspe}')
        oof_predictions['rmspe'] = stock_rmspe
        oof_predictions['stock_id'] = X_val_stock['stock_id'].values
        oof_predictions['time_id'] = X_val_stock['time_id'].values
        oof_predictions['hist_wap_vol'] = X_val_stock['log_return1_realized_volatility'].values
        oof_predictions['hist_price_vol'] = X_val_stock['trade_log_return_realized_volatility'].values
        all_oof_predictions.append(oof_predictions)
    combined_oof_predictions = pd.concat(all_oof_predictions)
    final_fold_rmspe = rmspe(combined_oof_predictions['actual'], combined_oof_predictions['predicted'])
    print(f'Final Fold - INDIE STOCKS - {n+1} RMSPE {final_fold_rmspe}')
    n += 1
"""


In [None]:
# pickle.dump(stock_models, open('stock_models.p', 'wb+'))
# Load the per-stock model lists trained in an earlier run and re-save them
# next to the notebook output. Context managers close both file handles.
# NOTE(security): pickle.load can execute arbitrary code; only load model files
# from a source you trust.
with open('../input/k/eliosgut/too-many-features/stock_models.p', 'rb') as models_in:
    stock_models = pickle.load(models_in)
with open('stock_models.p', 'wb+') as models_out:
    pickle.dump(stock_models, models_out)

In [None]:
# Disabled (kept as a string literal, never executed): averages the per-fold
# predictions of each stock's models on the training set and prints the
# resulting RMSPE. Requires `train`, which is not loaded in this run.
"""
train_all = [train.loc[train['stock_id']==stock] for stock in stocks]
target_full_per_stock = pd.concat([
    pd.DataFrame(
        [model.predict(stock_df[no_stock_feature_cols]) for model in stock_models[stock]]+ [stock_df['stock_id'].values] + [stock_df['time_id'].values]
    ).T for stock_df, stock in zip(train_all, stocks)
]).reset_index(drop=True)
target_full_per_stock.columns = [f'per_stock_{i+1}' for i in range(len(stock_models[stocks[0]]))] + ['stock_id', 'time_id']

stacked_ensemble = target_full_per_stock.merge(train[['stock_id', 'time_id', 'target']], on=['stock_id', 'time_id'], how='inner')
stacked_model_pred = pd.Series(stacked_ensemble.drop(['stock_id', 'target', 'time_id'], axis=1).values.mean(axis=1)).to_frame('predicted')
stacked_model_pred['target'] = stacked_ensemble['target']

last_rmspe = rmspe(stacked_model_pred['target'], stacked_model_pred['predicted'])
print(f'FINAL STACKED VALIDATION RMSPE: {last_rmspe}')
"""

In [None]:
# Predict every test row with the models of its own stock and average the folds.
test_stocks = test['stock_id'].unique()
test_all = [test.loc[test['stock_id']==stock] for stock in test_stocks]

# One frame per stock: a `per_stock_<k>` column per fold model, plus the keys.
per_stock_frames = []
for stock_df, stock in zip(test_all, test_stocks):
    fold_predictions = pd.DataFrame({
        f'per_stock_{i+1}': model.predict(stock_df[no_stock_feature_cols])
        for i, model in enumerate(stock_models[stock])
    })
    fold_predictions['stock_id'] = stock_df['stock_id'].values
    fold_predictions['time_id'] = stock_df['time_id'].values
    per_stock_frames.append(fold_predictions)
target_full_per_stock = pd.concat(per_stock_frames).reset_index(drop=True)

# Align predictions to `test` by key. A left merge from `test` returns rows in
# `test`'s own order, so the positional assignment below is correct even when
# the rows of one stock are not contiguous in `test`.
stacked_ensemble = test[['stock_id', 'time_id']].merge(
    target_full_per_stock, on=['stock_id', 'time_id'], how='left'
)[list(target_full_per_stock.columns)]
stacked_model_pred = stacked_ensemble.drop(['stock_id', 'time_id'], axis=1).mean(axis=1)
test['target'] = stacked_model_pred.values

# test[['row_id', 'target']].to_csv('submission.csv',index = False)