In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc
import math

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib

#import warnings
#warnings.filterwarnings('ignore')

# Notebook-level settings. None of these three names is read by the cells shown in
# this part of the notebook.
path_submissions = '/'

# presumably the name of the label column in train.csv - TODO confirm against the data
target_name = 'target'
# Starts out empty; nothing in the visible cells adds entries to it.
scores_folds = {}
%load_ext line_profiler

In [None]:
def compare_dataframe(df_orig, df_comp, only_error_columns = True, need_columns=[], skip_columns=[]):
    """Compare two DataFrames column by column using a numeric tolerance.

    Every column of ``df_orig`` (except those in ``skip_columns``) is compared
    against the column of the same name in ``df_comp`` with ``np.isclose``
    (NaN is treated as equal to NaN).

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Reference frame; its columns drive the comparison.
    df_comp : pandas.DataFrame
        Frame being checked against the reference.
    only_error_columns : bool, default True
        When true, fully matching columns are left out of the returned frame
        unless they are listed in ``need_columns``.
    need_columns : list, default []
        Columns to include in the returned frame even when they match.
        The default list is only read, never modified.
    skip_columns : list, default []
        Columns to ignore completely (e.g. string keys).
        The default list is only read, never modified.

    Returns
    -------
    tuple
        ``(report, errors)`` where ``report`` holds, for each reported column,
        ``o_<col>`` (original), ``n_<col>`` (compared) and ``c_<col>`` (boolean
        match mask), and ``errors`` is a list of ``[column, mismatch_count]``.
    """
    goods = []
    errors = []
    df = pd.DataFrame()
    for column in df_orig.columns:
        if column in skip_columns:
            continue
        # A missing column is reported as such instead of relying on a KeyError.
        if column not in df_comp.columns:
            print(f'Not found column: {column}')
            continue
        try:
            # np (the notebook's numpy alias) is used rather than the bare `numpy`
            # name, which was only bound as a side effect of `import numpy.matlib`.
            is_good = np.isclose(df_orig[column], df_comp[column], equal_nan=True)
        except (TypeError, ValueError) as inst:
            # Non-numeric data or mismatched lengths: keep going (best effort),
            # but do not mislabel the problem as a missing column.
            print(f'Could not compare column: {column}', inst)
            continue
        errors_sum = int((~is_good).sum())
        if errors_sum == 0:
            goods.append(column)
            if only_error_columns and column not in need_columns:
                continue
        df['o_'+column] = df_orig[column]
        df['n_'+column] = df_comp[column]
        df['c_'+column] = is_good
        if errors_sum == 0:
            print(column, errors_sum)
        else:
            errors.append([column, errors_sum])
            print('Bad column:', column, errors_sum)
    print('Goods columns: ', goods)
    # The deep copy also consolidates the frame that was built one column at a time.
    return (df.copy(deep=True),errors)

In [None]:
# data directory
# NOTE(review): no code in the visible cells reads data_dir; read_train_test() below
# spells out the same '../input/...' prefix literally.
data_dir = '../input/optiver-realized-volatility-prediction/'

# Function to calculate first WAP
def calc_wap1(df):
    """Level-1 weighted average price with cross weighting.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-1 size.
    """
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    return cross_weighted / (bid_size + ask_size)

# Function to calculate second WAP
def calc_wap2(df):
    """Level-2 weighted average price with cross weighting.

    Same formula as the level-1 version, applied to the ``*2`` columns: bid
    price times ask size plus ask price times bid size, over the total size.
    """
    bid_size = df['bid_size2']
    ask_size = df['ask_size2']
    cross_weighted = df['bid_price2'] * ask_size + df['ask_price2'] * bid_size
    return cross_weighted / (bid_size + ask_size)

def calc_wap3(df):
    """Level-1 weighted average price with same-side weighting.

    Each price is weighted by the size quoted on its own side (bid price by
    bid size, ask price by ask size), divided by the combined level-1 size.
    """
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    same_side_weighted = df['bid_price1'] * bid_size + df['ask_price1'] * ask_size
    return same_side_weighted / (bid_size + ask_size)

def calc_wap4(df):
    """Level-2 weighted average price with same-side weighting.

    Bid price times bid size plus ask price times ask size, using the ``*2``
    columns, divided by the combined level-2 size.
    """
    bid_size = df['bid_size2']
    ask_size = df['ask_size2']
    same_side_weighted = df['bid_price2'] * bid_size + df['ask_price2'] * ask_size
    return same_side_weighted / (bid_size + ask_size)

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result has no predecessor and is therefore NaN.
    """
    log_values = np.log(series)
    return log_values.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Square root of the sum of squared values of ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

# Function to count unique elements of a series
def count_unique(series):
    """Number of distinct values found in ``series``."""
    distinct_values = np.unique(series)
    return len(distinct_values)

def tendency(price, vol):
    """Sum of step-to-step percentage price changes, each weighted by ``vol``.

    Every change is expressed as a percentage of the price it arrives at and
    multiplied by the ``vol`` entry at that same position; the first element
    of both inputs only serves as the starting point.
    """
    step_change = np.diff(price)
    pct_change = (step_change / price[1:]) * 100
    weighted = pct_change * vol[1:]
    return np.sum(weighted)

# Function to read our base train and test set
def read_train_test():
    """Load train.csv and test.csv and add a ``row_id`` key to both.

    ``row_id`` is ``"<stock_id>-<time_id>"``; the feature builders in this
    notebook produce the same key so their output can be merged on it.
    Prints the number of training rows and returns ``(train, test)``.
    """
    train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    # Same key construction for both frames.
    for frame in (train, test):
        stock_part = frame['stock_id'].astype(str)
        time_part = frame['time_id'].astype(str)
        frame['row_id'] = stock_part + '-' + time_part
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

In [None]:
# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-time_id order-book features for one stock (pandas implementation).

    Reads the parquet data at ``file_path``, derives four WAP series, their
    log returns inside each ``time_id`` and several spread / volume columns,
    then aggregates them per ``time_id`` over the full bucket and over the
    trailing windows ``seconds_in_bucket >= 500, 400, 300, 200, 100``.

    Parameters
    ----------
    file_path : str
        Location of the book parquet data. The piece of the path that follows
        the first ``'='`` is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Feature columns are named
        ``<column>_<aggregation>`` (plus ``_<window>`` for the windowed ones),
        and ``row_id`` is ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)
    df['wap4'] = calc_wap4(df)
    # Calculate log returns.
    # transform() always hands back a result aligned with df's own index. The
    # previous groupby(...).apply(log_return) only did so on older pandas; newer
    # releases prepend the group key to the index and the assignment fails.
    for wap_col, return_col in (('wap1', 'log_return1'), ('wap2', 'log_return2'),
                                ('wap3', 'log_return3'), ('wap4', 'log_return4')):
        df[return_col] = df.groupby(['time_id'])[wap_col].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations over the whole bucket
    create_feature_dict = {
        'wap1': [np.sum, np.std],
        'wap2': [np.sum, np.std],
        'wap3': [np.sum, np.std],
        'wap4': [np.sum, np.std],
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
        'wap_balance': [np.sum, np.max],
        'price_spread':[np.sum, np.max],
        'price_spread2':[np.sum, np.max],
        'bid_spread':[np.sum, np.max],
        'ask_spread':[np.sum, np.max],
        'total_volume':[np.sum, np.max],
        'volume_imbalance':[np.sum, np.max],
        "bid_ask_spread":[np.sum,  np.max],
    }
    # Dict for aggregations over the trailing windows
    create_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then group by time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, aggregation) pairs into single names;
        # the time_id key becomes 'time_id_' because its second level is empty
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats first, then one left merge per trailing window.
    # The window order fixes the column order of the result.
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    for window_start in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time,seconds_in_bucket = window_start, add_suffix = True)
        window_key = f'time_id__{window_start}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # Drop the unnecessary per-window copy of time_id
        df_feature = df_feature.drop(columns = [window_key])

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns = ['time_id_'])
    return df_feature

In [None]:
# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-time_id trade features for one stock (pandas implementation).

    Reads the parquet data at ``file_path``, computes the log return of the
    trade price inside each ``time_id`` and the traded amount, aggregates per
    ``time_id`` over the full bucket and over the trailing windows
    ``seconds_in_bucket >= 500, 400, 300, 200, 100``, and adds a set of
    per-``time_id`` price / size statistics.

    Parameters
    ----------
    file_path : str
        Location of the trade parquet data. The piece of the path that follows
        the first ``'='`` is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``; every feature column carries the ``trade_``
        prefix and ``row_id`` is ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    # transform() always hands back a result aligned with df's own index. The
    # previous groupby(...).apply(log_return) only did so on older pandas; newer
    # releases prepend the group key to the index and the assignment fails.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount']=df['price']*df['size']
    # Dict for aggregations over the whole bucket
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.max, np.min],
        'order_count':[np.sum,np.max],
        'amount':[np.sum,np.max,np.min],
    }
    # Dict for aggregations over the trailing windows
    create_feature_dict_time = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum],
    }
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then group by time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, aggregation) pairs into single names;
        # the time_id key becomes 'time_id_' because its second level is empty
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Stats over the whole bucket
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)

    # Per-time_id price / size statistics. Iterating the groupby visits each
    # time_id's rows once instead of re-filtering the whole frame per id.
    lis = []
    for n_time_id, df_id in df.groupby('time_id', sort = False):
        price = df_id['price'].values
        size = df_id['size'].values
        mean_price = np.mean(price)
        price_steps = np.diff(price)

        tendencyV = tendency(price, size)
        # Number of trades priced above / below the mean price
        f_max = np.sum(price > mean_price)
        f_min = np.sum(price < mean_price)
        # Number of upward / downward price steps
        df_max = np.sum(price_steps > 0)
        df_min = np.sum(price_steps < 0)
        # Price dispersion
        abs_diff = np.median(np.abs(price - mean_price))
        energy = np.mean(price**2)
        iqr_p = np.percentile(price,75) - np.percentile(price,25)
        # Same dispersion measures on size (energy_v is a sum, not a mean)
        abs_diff_v = np.median(np.abs(size - np.mean(size)))
        energy_v = np.sum(size**2)
        iqr_p_v = np.percentile(size,75) - np.percentile(size,25)

        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    # The merge keeps both key columns; 'time_id_' stays as the working key
    df_feature = df_feature.drop(columns = ['time_id'])

    # One left merge per trailing window; the window order fixes the column order
    for window_start in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time,seconds_in_bucket = window_start, add_suffix = True)
        window_key = f'time_id__{window_start}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # Drop the unnecessary per-window copy of time_id
        df_feature = df_feature.drop(columns = [window_key])

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns = ['trade_time_id_'])
    return df_feature

In [None]:
# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Attach per-stock and per-time_id summary stats of the volatility columns.

    For each volatility column the mean, std, max and min are computed once
    grouped by ``stock_id`` (columns suffixed ``_stock``) and once grouped by
    ``time_id`` (columns suffixed ``_time``). Both tables are left-merged onto
    ``df`` and the merged frame is returned; the input frame is not modified.
    """
    vol_cols = [
        'log_return1_realized_volatility',
        'log_return2_realized_volatility',
        'log_return1_realized_volatility_400',
        'log_return2_realized_volatility_400',
        'log_return1_realized_volatility_300',
        'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_200',
        'log_return2_realized_volatility_200',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_400',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_200',
    ]
    stat_names = ['mean', 'std', 'max', 'min']

    def _summarize(group_key, tag):
        # Aggregate, flatten the two-level column names, then tag every column
        summary = df.groupby([group_key])[vol_cols].agg(stat_names).reset_index()
        summary.columns = ['_'.join(col) for col in summary.columns]
        return summary.add_suffix('_' + tag)

    df_stock_id = _summarize('stock_id', 'stock')
    df_time_id = _summarize('time_id', 'time')

    # Merge with original dataframe: stock-level stats first, then time-level
    merged = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    # The suffixed key copies brought in by the merges are not needed
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)
    

In [None]:
def get_np_base_data_with_limits(np_df, time_id_index):
    """Find the row positions where the id column of ``np_df`` changes value.

    Returns ``(limits, id_column)``: ``limits`` holds the index of the first
    row of every run after the first one, which is the form ``np.split``
    expects, and ``id_column`` is column ``time_id_index`` of ``np_df``.
    Only changes between neighbouring rows are detected, so rows sharing an
    id have to be contiguous.
    """
    id_column = np_df[:, time_id_index]
    # True wherever a row's id differs from the id of the row before it
    changed = np.diff(id_column).astype(bool)
    # Candidate positions 1..len(changed); keep those where a change happened
    positions = np.arange(1, len(changed) + 1)
    return (positions[changed], id_column)

def get_np_com_data_with_limits(df, time_id_index):
    """Convert ``df`` to a NumPy array and locate its id-change positions.

    Returns ``(values, limits, id_column)`` where ``values`` is a copy of the
    frame's data and the other two come from get_np_base_data_with_limits.
    """
    values = np.array(df.values)
    boundaries, id_column = get_np_base_data_with_limits(values, time_id_index)
    return (values, boundaries, id_column)

def get_np_bucket_with_limits(df, time_id_index, p_bucket_filter):
    """Array form of ``df`` with the rows selected by ``p_bucket_filter`` removed.

    Returns ``(values, limits, unique_ids)`` computed on the remaining rows:
    the filtered data, the id-change positions inside it, and the distinct
    values of its id column.
    """
    kept_rows = np.delete(np.array(df.to_numpy()), p_bucket_filter, axis = 0)
    boundaries, id_column = get_np_base_data_with_limits(kept_rows, time_id_index)
    return (kept_rows, boundaries, np.unique(id_column))

def get_np_data_with_limits(df, time_id_index):
    """Return ``(values, limits)`` for ``df``, leaving out the id column itself."""
    # The third element (the raw id column) is not needed by callers of this variant
    return get_np_com_data_with_limits(df, time_id_index)[:2]

def get_np_uniq_data_with_limits(df, time_id_index):
    """Return ``(values, limits, unique_ids)`` for ``df``.

    Same as get_np_com_data_with_limits, except that the id column is reduced
    to its distinct values.
    """
    values, boundaries, id_column = get_np_com_data_with_limits(df, time_id_index)
    return (values, boundaries, np.unique(id_column))

def calc_np_wap(wap_index, np_df, limits):
    """Log returns of one column of ``np_df``, computed separately per batch.

    Parameters
    ----------
    wap_index : int
        Position of the price column inside ``np_df``.
    np_df : numpy.ndarray
        2-D array of rows.
    limits : sequence of int
        Split positions handed to ``np.split``; each resulting piece is one batch.

    Returns
    -------
    list
        One entry per input row: NaN for the first row of every batch (it has
        no predecessor), followed by the differences of the log values.
    """
    rv_list = []
    for wap_part in np.split(np_df[:, wap_index], limits):
        # An empty batch contributes no rows, so it must not contribute a NaN
        # either; this keeps len(result) equal to the number of input rows.
        if len(wap_part) == 0:
            continue
        # np.nan is the spelling that exists in every NumPy release;
        # the np.NaN alias is gone in NumPy 2.0.
        rv_list.append(np.nan)
        rv_list.extend(np.diff(np.log(wap_part)))
    return rv_list

def calc_aggr_nan_feature(feature_index, np_df, limits, aggr_func):
    """Apply ``aggr_func`` to each batch of one column, skipping a leading NaN.

    The column is cut at ``limits`` with ``np.split``. When the first value of
    a batch is NaN, that value is left out before aggregating. Returns the
    list of aggregated values, one per batch.
    """
    results = []
    for batch in np.split(np_df[:, feature_index], limits):
        # Start at 1 when the batch opens with NaN, otherwise at 0
        first_used = 1 if math.isnan(batch[0]) else 0
        results.append(aggr_func(batch[first_used:]))
    return results

def create_base_features(p_df_feature, p_np_df, p_limits, p_df_columns, p_create_feature_dict, p_seconds_in_bucket = -1):
    """Add one aggregated column to ``p_df_feature`` per (column, function) pair.

    ``p_create_feature_dict`` maps a source column name to the functions to
    apply to it. Each result is stored as ``<column>_<function __name__>``,
    with ``_<p_seconds_in_bucket>`` appended when that value is not negative.
    ``p_df_columns`` is the list of column names used to find each source
    column's position in ``p_np_df``. The frame is modified in place and
    also returned.
    """
    window_suffix = '_' + str(p_seconds_in_bucket) if p_seconds_in_bucket >= 0 else ''
    for source_column, aggregators in p_create_feature_dict.items():
        column_pos = p_df_columns.index(source_column)
        for aggregator in aggregators:
            out_name = source_column + '_' + aggregator.__name__ + window_suffix
            p_df_feature[out_name] = calc_aggr_nan_feature(column_pos, p_np_df, p_limits, aggregator)
    return p_df_feature

In [None]:
def calc_log_return(df):
    """Add ``log_return1`` .. ``log_return4`` to ``df`` from ``wap1`` .. ``wap4``.

    ``df`` is first stably sorted by ``time_id`` in place, so the caller's
    frame is reordered; the same object is returned. The returns are computed
    per ``time_id`` from an array snapshot taken right after sorting.
    """
    df.sort_values(['time_id'], kind='stable', inplace=True)
    columns = df.columns.tolist()
    np_df, limits = get_np_data_with_limits(df, columns.index('time_id'))
    wap_to_return = (
        ('wap1', 'log_return1'),
        ('wap2', 'log_return2'),
        ('wap3', 'log_return3'),
        ('wap4', 'log_return4'),
    )
    # Column positions refer to the snapshot, taken before any new column is added
    for wap_col, return_col in wap_to_return:
        df[return_col] = calc_np_wap(columns.index(wap_col), np_df, limits)
    return df

def get_book_common_features(file_path):
    """Read one book parquet file and add the row-level derived columns.

    Adds the four WAP columns, their log returns (which also sorts the frame
    by ``time_id``), the WAP balance, the relative spreads of both levels,
    the bid / ask level gaps and the volume totals. Returns the enriched frame.
    """
    df = pd.read_parquet(file_path)

    # Calculate Wap
    for wap_col, wap_func in (('wap1', calc_wap1), ('wap2', calc_wap2),
                              ('wap3', calc_wap3), ('wap4', calc_wap4)):
        df[wap_col] = wap_func(df)

    # Calculate log returns
    df = calc_log_return(df)

    # Calculate wap balance
    df['wap_balance'] = (df['wap1'] - df['wap2']).abs()

    # Calculate spread: (ask - bid) relative to the mid price, for both levels
    ask1, bid1 = df['ask_price1'], df['bid_price1']
    ask2, bid2 = df['ask_price2'], df['bid_price2']
    df['price_spread'] = (ask1 - bid1) / ((ask1 + bid1) / 2)
    df['price_spread2'] = (ask2 - bid2) / ((ask2 + bid2) / 2)
    df['bid_spread'] = bid1 - bid2
    df['ask_spread'] = ask1 - ask2
    df["bid_ask_spread"] = (df['bid_spread'] - df['ask_spread']).abs()

    # Volume on each side, summed over both levels
    ask_depth = df['ask_size1'] + df['ask_size2']
    bid_depth = df['bid_size1'] + df['bid_size2']
    df['total_volume'] = ask_depth + bid_depth
    df['volume_imbalance'] = (ask_depth - bid_depth).abs()

    return df

def get_base_features(df):
    """Aggregate the book columns per ``time_id`` over the whole bucket.

    Returns a frame with a ``time_id_`` key column and one column per
    (source column, aggregation) pair produced by create_base_features.
    """
    # Dict for aggregations; insertion order decides the output column order
    create_feature_dict = {}
    for wap_col in ('wap1', 'wap2', 'wap3', 'wap4'):
        # NOTE(review): np.std is applied here directly to NumPy slices, where its
        # default is ddof=0; pandas' groupby.agg may route np.std to its own std
        # (ddof=1) - confirm whether the *_std columns are meant to match
        # book_preprocessor.
        create_feature_dict[wap_col] = [np.sum, np.std]
    for return_col in ('log_return1', 'log_return2', 'log_return3', 'log_return4'):
        create_feature_dict[return_col] = [realized_volatility]
    for level_col in ('wap_balance', 'price_spread', 'price_spread2', 'bid_spread',
                      'ask_spread', 'total_volume', 'volume_imbalance', "bid_ask_spread"):
        create_feature_dict[level_col] = [np.sum, np.max]

    df_columns = df.columns.tolist()
    np_df, limits, uniq_time_id = get_np_uniq_data_with_limits(df, df_columns.index('time_id'))
    # The trailing underscore matches the key name the window merges join on
    df_feature = pd.DataFrame({'time_id_': uniq_time_id.astype(int)})
    return create_base_features(df_feature, np_df, limits, df_columns, create_feature_dict)

def get_book_time_features(df, df_feature):
    """Add trailing-window realized volatilities of the book log returns.

    For each window start 500, 400, 300, 200, 100 the rows with
    ``seconds_in_bucket`` below the start are removed, the four log-return
    columns are aggregated per ``time_id`` and the result is left-merged onto
    ``df_feature`` via its ``time_id_`` column. Returns the merged frame.
    """
    create_feature_dict_time = {
        return_col: [realized_volatility]
        for return_col in ('log_return1', 'log_return2', 'log_return3', 'log_return4')
    }
    df_columns = df.columns.tolist()
    time_id_pos = df_columns.index('time_id')
    for sec_in_bucket in (500, 400, 300, 200, 100):
        window_key = f'time_id__{sec_in_bucket}'
        # Rows before the window start are the ones to delete
        before_window = df['seconds_in_bucket'].to_numpy() < sec_in_bucket
        np_df, limits, uniq_time_id = get_np_bucket_with_limits(df, time_id_pos, before_window)
        df_feature_time = pd.DataFrame({window_key: uniq_time_id.astype(int)})
        df_feature_time = create_base_features(df_feature_time, np_df, limits, df_columns,
                                               create_feature_dict_time, sec_in_bucket)
        merged = df_feature.merge(df_feature_time, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of the key is not needed after the merge
        df_feature = merged.drop(columns = [window_key])
    return df_feature

def book_preprocessor_new(file_path):
    """NumPy-based book feature builder for one stock's parquet file.

    Chains get_book_common_features, get_base_features and
    get_book_time_features, then replaces the ``time_id_`` key with a
    ``row_id`` of the form ``"<stock_id>-<time_id>"``. The stock id is the
    piece of ``file_path`` that follows the first ``'='``.
    """
    book = get_book_common_features(file_path)
    features = get_book_time_features(book, get_base_features(book))

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]

    def _to_row_id(time_id):
        return f'{stock_id}-{time_id}'

    features['row_id'] = features['time_id_'].apply(_to_row_id)
    return features.drop(columns = ['time_id_'])

In [None]:
def get_trade_base_features(df):
    """Aggregate the trade columns per ``time_id`` over the whole bucket.

    Returns a frame with a ``time_id_`` key column and one column per
    (source column, aggregation) pair produced by create_base_features.
    """
    # Dict for aggregations; insertion order decides the output column order
    create_feature_dict = dict([
        ('log_return', [realized_volatility]),
        ('seconds_in_bucket', [count_unique]),
        ('size', [np.sum, np.max, np.min]),
        ('order_count', [np.sum, np.max]),
        ('amount', [np.sum, np.max, np.min]),
    ])
    df_columns = df.columns.tolist()
    np_df, limits, uniq_time_id = get_np_uniq_data_with_limits(df, df_columns.index('time_id'))
    # The trailing underscore matches the key name the window merges join on
    df_feature = pd.DataFrame({'time_id_': uniq_time_id.astype(int)})
    return create_base_features(df_feature, np_df, limits, df_columns, create_feature_dict)

def get_trade_time_features(df, df_feature):
    """Add trailing-window aggregates of the trade columns.

    For each window start 500, 400, 300, 200, 100 the rows with
    ``seconds_in_bucket`` below the start are removed, the configured columns
    are aggregated per ``time_id`` and the result is left-merged onto
    ``df_feature`` via its ``time_id_`` column. Returns the merged frame.
    """
    create_feature_dict_time = dict([
        ('log_return', [realized_volatility]),
        ('seconds_in_bucket', [count_unique]),
        ('size', [np.sum]),
        ('order_count', [np.sum]),
    ])
    df_columns = df.columns.tolist()
    time_id_pos = df_columns.index('time_id')
    for sec_in_bucket in (500, 400, 300, 200, 100):
        window_key = f'time_id__{sec_in_bucket}'
        # Rows before the window start are the ones to delete
        before_window = df['seconds_in_bucket'].to_numpy() < sec_in_bucket
        np_df, limits, uniq_time_id = get_np_bucket_with_limits(df, time_id_pos, before_window)
        df_feature_time = pd.DataFrame({window_key: uniq_time_id.astype(int)})
        df_feature_time = create_base_features(df_feature_time, np_df, limits, df_columns,
                                               create_feature_dict_time, sec_in_bucket)
        merged = df_feature.merge(df_feature_time, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own copy of the key is not needed after the merge
        df_feature = merged.drop(columns = [window_key])
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor_new(file_path):
    """NumPy-based trade feature builder for one stock's parquet file.

    Counterpart of trade_preprocessor: computes the per-``time_id`` log return
    of the trade price and the traded amount, the full-bucket and
    trailing-window aggregates, and a set of per-``time_id`` price / size
    statistics.

    Parameters
    ----------
    file_path : str
        Location of the trade parquet data. The piece of the path that follows
        the first ``'='`` is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``; every feature column carries the ``trade_``
        prefix and ``row_id`` is ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)

    # Stable sort keeps the row order inside each time_id; the limit helpers only
    # detect id changes between neighbouring rows, so equal ids must be contiguous.
    df.sort_values(['time_id'],kind='stable',inplace=True)
    columns = df.columns.tolist()
    np_df, limits = get_np_data_with_limits(df, columns.index('time_id'))
    df['log_return'] = calc_np_wap(columns.index('price'), np_df, limits)

    df['amount'] = df['price'] * df['size']

    df_feature = get_trade_base_features(df)
    # Get the stats for different windows
    df_feature = get_trade_time_features(df, df_feature)

    # Fresh array snapshot of the frame, split into one chunk per time_id.
    # Price and size slices taken from it share the dtype of that 2-D array.
    df_columns = df.columns.tolist()
    np_df, limits, uniq_time_id = get_np_uniq_data_with_limits(df, df_columns.index('time_id'))

    lis = []
    wap_batchs = np.split(np_df, limits)
    num_price = df_columns.index('price')
    num_size = df_columns.index('size')
    for n_time_id, np_id in zip(uniq_time_id, wap_batchs):
        np_price = np_id[:,num_price]
        np_size = np_id[:,num_size]

        tendencyV = tendency(np_price, np_size)

        # Number of trades priced above / below the mean price
        mean_price = np.mean(np_price)
        f_max = np.sum(np_price > mean_price)
        f_min = np.sum(np_price < mean_price)

        # Number of upward / downward price steps
        diff_price = np.diff(np_price)
        df_max =  np.sum(diff_price > 0)
        df_min =  np.sum(diff_price < 0)

        # Price dispersion
        abs_diff = np.median(np.abs( np_price - mean_price))
        energy = np.mean(np_price**2)
        iqr_p = np.percentile(np_price,75) - np.percentile(np_price,25)

        # Same dispersion measures on size (energy_v is a sum, not a mean)
        abs_diff_v = np.median(np.abs( np_size - np.mean(np_size)))
        energy_v = np.sum(np_size**2)
        iqr_p_v = np.percentile(np_size,75) - np.percentile(np_size,25)

        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

    df_lr = pd.DataFrame(lis)
    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    # The merge keeps the right-hand key as well. Drop it so the result has the
    # same columns as trade_preprocessor and no stray 'trade_time_id' feature.
    df_feature = df_feature.drop(columns = ['time_id'])

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns = ['trade_time_id_'])
    return df_feature

In [None]:
#train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

In [None]:
# Pick one trade parquet entry to check on. glob.glob returns matches in arbitrary
# order, so which stock ends up at index 0 can differ between environments.
list_trade_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')
trade_file_path = list_trade_file_train[0]

# Run both implementations on the same file and diff the outputs column by column.
# row_id is a string key, so it is kept out of the numeric comparison.
tr_new = trade_preprocessor_new(trade_file_path)
tr_old = trade_preprocessor(trade_file_path)
tr_c,tr_e = compare_dataframe(tr_old,tr_new,skip_columns=['row_id'])

In [None]:
tr_e

In [None]:
tr_c[tr_c['c_trade_tendency']==False]

In [None]:
tr_c[tr_c['c_trade_f_max']==False]

In [None]:
tr_c[tr_c['c_trade_f_min']==False]

In [None]:
tr_c[tr_c['c_trade_abs_diff']==False].head(5)

In [None]:
# Pick one book parquet entry to check on. glob.glob returns matches in arbitrary
# order, so which stock ends up at index 0 can differ between environments.
list_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
book_file_path = list_book_file_train[0]

# Run both implementations on the same file and diff the outputs column by column.
# row_id is a string key, so it is kept out of the numeric comparison.
bk_new = book_preprocessor_new(book_file_path)
bk_old = book_preprocessor(book_file_path)
bk_c,bk_e = compare_dataframe(bk_old,bk_new,skip_columns=['row_id'])

In [None]:
bk_c.head(5)

In [None]:
%time book_preprocessor(book_file_path)
#%lprun -f book_preprocessor book_preprocessor(book_file_path)

In [None]:
%time book_preprocessor_new(book_file_path)
#%lprun -f book_preprocessor_new book_preprocessor_new(book_file_path)

In [None]:
%time trade_preprocessor(trade_file_path)
#%lprun -f trade_preprocessor trade_preprocessor(trade_file_path)

In [None]:
%time trade_preprocessor_new(trade_file_path)
#%lprun -f trade_preprocessor_new trade_preprocessor_new(trade_file_path)