# Feature extraction using GPU Accelerated library RAPIDS.

RAPIDS is a collection of open-source software libraries that run exclusively on GPUs. It accelerates a range of machine learning algorithms to provide faster processing. In this notebook I use `cudf`, a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. It has a pandas-like API, so the code is easy to follow while running roughly 18-20 times faster!
The notebook provides feature-generation techniques that produce 80 variables which can help improve the performance of the models.


In [None]:
import os
import glob
import time

import pandas as pd
import numpy as np

import cudf

from tqdm import tqdm
from joblib import Parallel,delayed

# Paths of all entries under the training order-book parquet directory; the list
# is later handed to rapids_features_per_stock, which processes one path at a time.
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

In [None]:
# Check for NVIDIA GPU availability
!nvidia-smi

## Helper function:

* **wap1**, **wap2**, **wap3**, **wap4**: compute four variants of the weighted average price (WAP) from the level-1 and level-2 order-book prices and sizes.
* **compute_rv**: Function that computes the different variables using bid, ask price and size as well as the wap values.
* **rapids_compute_features**: compute features for each time ID.
* **rapids_features_per_stock**: compute features for each stock ID.

In [None]:
def wap1(df_book_data):
    """Level-1 weighted average price with cross weighting.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-1 size.
    """
    bid_size = df_book_data['bid_size1']
    ask_size = df_book_data['ask_size1']
    numerator = df_book_data['bid_price1'] * ask_size + df_book_data['ask_price1'] * bid_size
    return numerator / (bid_size + ask_size)

def wap2(df_book_data):
    """Level-2 weighted average price with cross weighting.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-2 size.
    """
    bid_size = df_book_data['bid_size2']
    ask_size = df_book_data['ask_size2']
    numerator = df_book_data['bid_price2'] * ask_size + df_book_data['ask_price2'] * bid_size
    return numerator / (bid_size + ask_size)

def wap3(df):
    """Level-1 weighted average price with same-side weighting.

    Each price is weighted by the size on its own side of the book; the sum
    is divided by the combined level-1 size.
    """
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    numerator = df['bid_price1'] * bid_size + df['ask_price1'] * ask_size
    return numerator / (bid_size + ask_size)

def wap4(df):
    """Level-2 weighted average price with same-side weighting.

    Each price is weighted by the size on its own side of the book; the sum
    is divided by the combined level-2 size.
    """
    bid_size = df['bid_size2']
    ask_size = df['ask_size2']
    numerator = df['bid_price2'] * bid_size + df['ask_price2'] * ask_size
    return numerator / (bid_size + ask_size)



# Column layout consumed by compute_rv: the grouping key first, then the five
# WAP series, then the remaining per-row book features.
columns = ['time_id','wap1','wap2','wap3','wap4','wap12','ask_volume','bid_volume','total_volume','wap_balance','price_spread1','price_spread2',
    'bid_spread','ask_spread','bid_ask_spread','weighted_spread1','weighted_spread2','weighted_total_spread','volume_imbalance']

def compute_rv(df_book_data,cols=columns):
    """Aggregate the book features of one stock into one row per time_id.

    Parameters
    ----------
    df_book_data : DataFrame
        Frame containing every column named in ``cols``. Both cudf frames
        (whose ``.values`` is a device array exposing ``.get()``) and frames
        whose ``.values`` is already a NumPy array are accepted.
    cols : list of str
        Column selection. The first entry is the grouping key (time_id), the
        next five are the WAP series (wap1, wap2, wap3, wap4, wap12) and the
        rest are plain features.

    Returns
    -------
    rv : numpy.ndarray, shape (n_time_ids, 81) for the default ``cols``
        * columns 0-4: realized volatility of the five WAP series, i.e. the
          square root of the sum of squared log-returns inside the time_id;
        * columns 5 onward: for every feature in ``cols[1:]``, in order, the
          four statistics ``max, mean, max - min, sum`` (feature ``j`` lands
          in columns ``5 + 4*j`` to ``8 + 4*j``);
        * any remaining columns (77-80 with the default ``cols``) stay zero.
    time_ids : numpy.ndarray
        Sorted unique time_id values; row ``i`` of ``rv`` belongs to
        ``time_ids[i]``.
    """
    n_wap = 5          # leading feature columns that get a realized volatility
    n_stats = 4        # max, mean, range, sum

    values = df_book_data[cols].values
    # A cudf frame returns a device array that has to be copied to the host;
    # a NumPy-backed frame is usable as is.
    if hasattr(values, 'get'):
        values = values.get()

    n_features = values.shape[1] - 1
    stats_end = n_wap + n_stats * n_features
    # 81 is the historical width that callers index into; only widen it when a
    # custom column list would not fit.
    n_out = max(81, stats_end)

    time_ids = values[:, 0]
    if time_ids.shape[0] == 0:
        # Nothing to aggregate: np.split would otherwise yield one empty group
        # whose max()/min() raise.
        return np.zeros((0, n_out)), np.unique(time_ids)

    if np.any(time_ids[1:] < time_ids[:-1]):
        # np.split below needs each time_id to form one contiguous, ascending
        # run. A stable sort restores that while keeping the row order inside
        # each time_id, which the log-returns depend on.
        order = np.argsort(time_ids, kind='stable')
        values = values[order]
        time_ids = values[:, 0]

    unique_ids, first_rows = np.unique(time_ids, return_index=True)
    groups = np.split(values[:, 1:], first_rows[1:])

    rv = np.zeros((len(groups), n_out))
    for i, group in enumerate(groups):
        col_max = group.max(axis=0)
        col_min = group.min(axis=0)
        # Shape (n_features, 4); flattening row-major yields
        # max, mean, range, sum for feature 0, then feature 1, and so on.
        stats = np.stack(
            [col_max, group.mean(axis=0), col_max - col_min, group.sum(axis=0)],
            axis=1,
        )
        rv[i, n_wap:stats_end] = stats.ravel()

        # Realized volatility of each WAP series within this time_id.
        log_returns = np.diff(np.log(group[:, :n_wap]), axis=0)
        rv[i, :n_wap] = np.sqrt(np.sum(log_returns ** 2, axis=0))

    return rv, unique_ids


In [None]:
def rapids_compute_features(file_path, pred_col_name):
    """Build the per-time_id order-book feature table for one stock file.

    Parameters
    ----------
    file_path : str
        Path of the parquet data to read. The text after the first ``=`` in
        the path is used as the stock id.
    pred_col_name : str
        Prefix of the five realized-volatility columns; they are named
        ``<prefix>1``, ``<prefix>2``, ``<prefix>3``, ``<prefix>4`` and
        ``<prefix>12``.

    Returns
    -------
    cudf.DataFrame
        One row per time_id with the columns ``time_id``, the 77 aggregated
        features produced by ``compute_rv``, ``row_id``
        (``"<stock_id>-<time_id>"``) and ``fft_rv1`` / ``fft_rv2``.
    """
    # Read a specific stock file.
    df_book_data = cudf.read_parquet(file_path)

    # Weighted average prices and their sum.
    df_book_data['wap1'] = wap1(df_book_data)
    df_book_data['wap2'] = wap2(df_book_data)
    df_book_data['wap3'] = wap3(df_book_data)
    df_book_data['wap4'] = wap4(df_book_data)
    df_book_data['wap12'] = df_book_data['wap1'] + df_book_data['wap2']

    # Volumes over both book levels.
    df_book_data['ask_volume'] = df_book_data['ask_size1']+df_book_data['ask_size2']
    df_book_data['bid_volume'] = df_book_data['bid_size1']+df_book_data['bid_size2']
    df_book_data['total_volume'] = df_book_data['ask_volume']+df_book_data['bid_volume']

    # Calculate wap balance
    df_book_data['wap_balance'] = abs(df_book_data['wap1'] - df_book_data['wap2'])
    # Calculate spread (relative to the mid price of the level)
    df_book_data['price_spread1'] = (df_book_data['ask_price1'] - df_book_data['bid_price1']) / ((df_book_data['ask_price1'] + df_book_data['bid_price1']) / 2)
    df_book_data['price_spread2'] = (df_book_data['ask_price2'] - df_book_data['bid_price2']) / ((df_book_data['ask_price2'] + df_book_data['bid_price2']) / 2)

    df_book_data['bid_spread'] = df_book_data['bid_price1'] - df_book_data['bid_price2']
    df_book_data['ask_spread'] = df_book_data['ask_price1'] - df_book_data['ask_price2']
    df_book_data["bid_ask_spread"] = abs(df_book_data['bid_spread'] - df_book_data['ask_spread'])

    df_book_data["weighted_spread1"] = df_book_data['ask_price1']* df_book_data['ask_size1'] - df_book_data['bid_price1']* df_book_data['bid_size1']
    df_book_data["weighted_spread2"] = df_book_data['ask_price2']* df_book_data['ask_size2'] - df_book_data['bid_price2']* df_book_data['bid_size2']
    df_book_data["weighted_total_spread"] = df_book_data["weighted_spread1"] + df_book_data["weighted_spread2"]

    df_book_data['volume_imbalance'] = abs((df_book_data['ask_size1'] + df_book_data['ask_size2']) - (df_book_data['bid_size1'] + df_book_data['bid_size2']))

    rv,time_id = compute_rv(df_book_data)

    # Output column names, in the same order as the columns of `rv`:
    # five realized volatilities, then max/mean/range/sum for every feature.
    wap_tags = ('1', '2', '3', '4', '12')
    stat_names = ('max', 'mean', 'range', 'sum')
    feature_prefixes = (
        'ask_volume', 'bid_volume', 'total_volume', 'wap_balance',
        'price_spread1', 'price_spread2', 'bid_spread', 'ask_spread',
        'bid_ask_spread', 'w_spread1', 'w_spread2', 'wt_spread', 'vol_imb',
    )
    cols = [pred_col_name + tag for tag in wap_tags]
    # The WAP statistics historically have no underscore before the statistic.
    cols += ['wap' + tag + stat for tag in wap_tags for stat in stat_names]
    cols += [prefix + '_' + stat for prefix in feature_prefixes for stat in stat_names]

    df_realized_vol_per_stock = cudf.DataFrame()
    df_realized_vol_per_stock['time_id']=time_id

    for col_idx, col_name in enumerate(cols):
        df_realized_vol_per_stock[col_name] = rv[:,col_idx]

    stock_id = file_path.split('=')[1]

    # Compute an ID for the combination of stock and time IDs.
    df_realized_vol_per_stock['row_id'] = stock_id+'-'+df_realized_vol_per_stock['time_id'].astype(int).astype(str)

    # Magnitude of the FFT taken over the sequence of time_ids of this stock,
    # for the wap1 and wap2 realized volatilities.
    df_realized_vol_per_stock['fft_rv1'] = np.abs(np.fft.fft((rv[:,0])))
    df_realized_vol_per_stock['fft_rv2'] = np.abs(np.fft.fft((rv[:,1])))

    return df_realized_vol_per_stock








def rapids_features_per_stock(list_file,prediction_column_name):
    """Compute the book features of every file and stack them into one frame.

    Parameters
    ----------
    list_file : iterable of str
        Paths passed one by one to ``rapids_compute_features``.
    prediction_column_name : str
        Forwarded as ``pred_col_name`` to ``rapids_compute_features``.

    Returns
    -------
    cudf.DataFrame
        Row-wise concatenation of the per-file results, or an empty frame
        when ``list_file`` is empty.
    """
    # Collect first and concatenate once: concatenating inside the loop copies
    # the accumulated frame again on every iteration.
    per_stock_frames = [
        rapids_compute_features(file, prediction_column_name)
        for file in tqdm(list_file)
    ]
    if not per_stock_frames:
        return cudf.DataFrame()
    return cudf.concat(per_stock_frames)

## Generate training data features:

In [None]:
# Load the competition's train.csv into a cudf DataFrame.
# NOTE(review): `train` is not referenced by any other cell shown here.
train = cudf.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

In [None]:
# Build the book features for every training file; the %time magic reports how
# long the statement takes. Realized-volatility columns get the prefix 'rv'.
%time df_past_realized_train = rapids_features_per_stock(list_file=list_order_book_file_train, prediction_column_name='rv')

In [None]:
# Persist the training book features as CSV, without the index column.
df_past_realized_train.to_csv('df_past_realized_train.csv',index=False)

## Create test data features:

In [None]:
# Same book-feature pipeline as for training, applied to the test order-book files.
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
df_naive_pred_test = rapids_features_per_stock(list_file=list_order_book_file_test,prediction_column_name='rv')


In [None]:
# Persist the test book features as CSV, without the index column.
df_naive_pred_test.to_csv('test.csv',index=False)

# Trade data:

In [None]:
def log_return(series):
    """Return the element-to-element difference of the natural log of *series*.

    The logged object must offer a ``.diff()`` method (a pandas Series does).
    """
    logged = np.log(series)
    return logged.diff()

# Realized volatility: root of the summed squares
def realized_volatility(series):
    """Return the square root of the sum of the squared entries of *series*."""
    squared = series ** 2
    # np.sum is kept (rather than a method call) so the reduction dispatches
    # exactly as before for whatever container is passed in.
    total = np.sum(squared)
    return np.sqrt(total)

# Number of distinct values in a series
def count_unique(series):
    """Return how many distinct values *series* contains."""
    distinct = np.unique(series)
    return len(distinct)
def trade_preprocessor(file_path):
    """Build the per-time_id trade feature table for one stock file.

    Parameters
    ----------
    file_path : str
        Path of the trade parquet data. The text after the first ``=`` in the
        path is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per time_id. Every feature column carries the prefix
        ``trade_``; ``row_id`` holds ``"<stock_id>-<time_id>"``. Features are
        aggregated over the whole bucket and over the tails starting at
        second 500, 400, 300, 200 and 100 (suffix ``_<seconds>``), plus a set
        of hand-written price/size statistics per time_id.
    """
    df = pd.read_parquet(file_path)
    # transform always returns a result aligned with df's own index.
    # groupby.apply may prepend the group key to the index (pandas >= 2.0),
    # which makes the column assignment fail.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount']=df['price']*df['size']

    # Dict for aggregations over the full bucket
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.max, np.min],
        'order_count':[np.sum,np.max],
        'amount':[np.sum,np.max,np.min],
    }
    # Reduced set of aggregations used for the tail windows
    create_feature_dict_time = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then group by time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, aggregation) pairs; the key becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get the stats for the full bucket and for each tail window
    windows = (500, 400, 300, 200, 100)
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    window_features = {
        window: get_stats_window(create_feature_dict_time,seconds_in_bucket = window, add_suffix = True)
        for window in windows
    }

    def tendency(price, vol):
        # Sum of percentage price changes, each weighted by the traded size
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)

    lis = []
    # One pass over the groups instead of one boolean-mask scan per time_id;
    # sort=False keeps the order in which the time_ids first appear.
    for n_time_id, df_id in df.groupby('time_id', sort=False):
        prices = df_id['price'].values
        sizes = df_id['size'].values
        mean_price = np.mean(prices)
        mean_size = np.mean(sizes)
        price_diff = np.diff(prices)

        tendencyV = tendency(prices, sizes)
        f_max = np.sum(prices > mean_price)
        f_min = np.sum(prices < mean_price)
        df_max =  np.sum(price_diff > 0)
        df_min =  np.sum(price_diff < 0)
        # price dispersion
        abs_diff = np.median(np.abs(prices - mean_price))
        energy = np.mean(prices**2)
        iqr_p = np.percentile(prices,75) - np.percentile(prices,25)

        # vol vars
        abs_diff_v = np.median(np.abs(sizes - mean_size))
        # Widen before squaring so that a narrow integer storage type cannot
        # overflow (NOTE(review): the on-disk dtype of `size` is not visible
        # here; for 64-bit data this is a no-op).
        wide_sizes = sizes.astype(np.result_type(sizes.dtype, np.int64))
        energy_v = np.sum(wide_sizes**2)
        iqr_p_v = np.percentile(sizes,75) - np.percentile(sizes,25)

        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')

    # Merge all windows onto the full-bucket features
    for window in windows:
        df_feature = df_feature.merge(window_features[window], how = 'left',
                                      left_on = 'time_id_', right_on = 'time_id__' + str(window))
    # Drop the duplicated key columns brought in by the merges
    df_feature.drop(['time_id'] + ['time_id__' + str(window) for window in windows], axis = 1, inplace = True)

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature


# Function that runs the trade preprocessing in parallel (one task per stock file)
def preprocessor(list_stock_files):
    """Run ``trade_preprocessor`` on every file and stack the results.

    Each path in *list_stock_files* becomes one joblib task (all cores,
    ``verbose = 1``); the returned frames are concatenated with a fresh index.
    """
    # One delayed call per stock file, dispatched by the parallel runner
    tasks = (delayed(trade_preprocessor)(stock_file) for stock_file in list_stock_files)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    # Stack the per-stock frames into a single dataframe
    return pd.concat(per_stock_frames, ignore_index = True)


In [None]:
# Build the trade features for the training and the test set. Each glob result
# is the list of paths handed to preprocessor.
list_trade_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')
train_ = preprocessor(list_trade_train)
list_trade_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*')
test_ = preprocessor(list_trade_test)