## version 1 datasets

Copied from: https://www.kaggle.com/lucasmorin/tf-keras-nn-with-stock-embedding


## First version dataset creation

In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib
from sklearn.cluster import KMeans
from scipy import stats
from tqdm import tqdm
import copy

# Path prefix for submissions (going by the name; not used in this section).
path_submissions = '/'

# Name of the label column.
target_name = 'target'

# Starts empty; nothing in this section writes to it.
scores_folds = dict()

## Train and test datasets

In [None]:
# Root directory of the input data. The trailing slash matters: file paths
# are built from it by plain string concatenation.
data_dir = "../input/optiver-realized-volatility-prediction/"
# First-level weighted average price (WAP)
def calc_wap1(df):
    """Return the level-1 WAP, weighting each side's price by the opposite side's size.

    Reads the columns ``bid_price1``, ``ask_price1``, ``bid_size1`` and
    ``ask_size1`` of ``df`` and returns the element-wise result.
    """
    cross_weighted = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    total_size = df['bid_size1'] + df['ask_size1']
    return cross_weighted / total_size

# Second-level weighted average price (WAP)
def calc_wap2(df):
    """Return the level-2 WAP, weighting each side's price by the opposite side's size.

    Reads the columns ``bid_price2``, ``ask_price2``, ``bid_size2`` and
    ``ask_size2`` of ``df`` and returns the element-wise result.
    """
    cross_weighted = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size2'] + df['ask_size2']
    return cross_weighted / total_size

def calc_wap3(df):
    """Return the level-1 price average where each side is weighted by its OWN size.

    Unlike ``calc_wap1``, bid prices are weighted by bid sizes and ask prices
    by ask sizes.
    """
    same_side_weighted = df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1']
    total_size = df['bid_size1'] + df['ask_size1']
    return same_side_weighted / total_size

def calc_wap4(df):
    """Return the level-2 price average where each side is weighted by its OWN size.

    Unlike ``calc_wap2``, bid prices are weighted by bid sizes and ask prices
    by ask sizes.
    """
    same_side_weighted = df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']
    total_size = df['bid_size2'] + df['ask_size2']
    return same_side_weighted / total_size


# Log return of consecutive observations: log(x_t) - log(x_{t-1}),
# which equals log(x_t / x_{t-1}).
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_values = np.log(series)
    return log_values.diff()

# Realized volatility: square root of the sum of squared values
def realized_volatility(series):
    """Return sqrt(sum(series ** 2))."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def realized_volatility_10_90(series):
    """Return sqrt of the sum of squares of the values between the 10% and 90% ranks.

    The values are sorted ascending, NaNs are discarded, and only the
    elements at positions ``[int(n*0.10), int(n*0.90))`` of the remaining
    ``n`` values contribute.

    NaNs are removed before the cut positions are computed, as in
    ``realized_volatility_90`` and ``realized_volatility_25``. Otherwise the
    NaN placeholders (np.sort puts them last) would inflate ``n`` and could
    fall inside the selected slice, turning the result into NaN.
    """
    ordered = np.sort(series)
    ordered = ordered[~np.isnan(ordered)]
    count = len(ordered)
    middle = ordered[int(count * 0.10):int(count * 0.90)]
    return np.sqrt(np.sum(middle ** 2))

def realized_volatility_90(series):
    """Return sqrt of the sum of squares of the top 10% of values by rank.

    The values are sorted ascending, NaNs are discarded, and the elements
    from position ``int(n*0.90)`` onward of the remaining ``n`` values are used.
    """
    ordered = np.sort(series)
    # Use the file-wide ``np`` alias; the bare name ``numpy`` is only bound
    # as a side effect of ``import numpy.matlib``.
    ordered = ordered[~np.isnan(ordered)]
    count = len(ordered)
    upper_tail = ordered[int(count * 0.90):]
    return np.sqrt(np.sum(upper_tail ** 2))

def realized_volatility_10(series):
    """Return sqrt of the sum of squares of the bottom 10% of values by rank.

    The values are sorted ascending, NaNs are discarded, and the first
    ``int(n*0.10)`` of the remaining ``n`` values are used.

    NaNs are removed before the cut position is computed, as in
    ``realized_volatility_90`` and ``realized_volatility_25``, so missing
    values do not inflate ``n`` and widen the selected slice.
    """
    ordered = np.sort(series)
    ordered = ordered[~np.isnan(ordered)]
    count = len(ordered)
    lower_tail = ordered[:int(count * 0.10)]
    return np.sqrt(np.sum(lower_tail ** 2))

def realized_volatility_qt(series):
    """Return sqrt of the sum of squares of the quantile-transformed values.

    NaNs are dropped first. ``series`` must expose ``.values`` (for example
    a pandas Series). The transformer is configured with
    ``output_distribution='normal'`` and a fixed ``random_state``.
    """
    # Use the file-wide ``np`` alias; the bare name ``numpy`` is only bound
    # as a side effect of ``import numpy.matlib``.
    finite = series[~np.isnan(series)].values.reshape(1, -1)
    # NOTE(review): reshape(1, -1) passes the data as ONE sample with one
    # feature per observation. scikit-learn transformers work column-wise, so
    # reshape(-1, 1) may have been intended - confirm before relying on this.
    qt = QuantileTransformer(random_state=21, n_quantiles=2000, output_distribution='normal')
    transformed = qt.fit_transform(finite)
    return np.sqrt(np.sum(transformed ** 2))

def realized_volatility_25(series):
    """Return sqrt of the sum of squares of the bottom 25% of values by rank.

    The values are sorted ascending, NaNs are discarded, and the first
    ``int(n*0.25)`` of the remaining ``n`` values are used.
    """
    ordered = np.sort(series)
    # Use the file-wide ``np`` alias; the bare name ``numpy`` is only bound
    # as a side effect of ``import numpy.matlib``.
    ordered = ordered[~np.isnan(ordered)]
    count = len(ordered)
    lower_quarter = ordered[:int(count * 0.25)]
    return np.sqrt(np.sum(lower_quarter ** 2))

# Number of distinct values in a series
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct = np.unique(series)
    return distinct.size

def time_return(series):
    """Return the absolute difference between consecutive elements of ``series``.

    The first element of the result is NaN because it has no predecessor.
    """
    deltas = series.diff()
    return np.abs(deltas)

def kurtosis2(x):
    """Return ``scipy.stats.kurtosis`` of ``x`` with NaN values omitted."""
    result = stats.kurtosis(x, nan_policy='omit')
    return result

# Interquartile range
def iqr_p_v(x):
    """Return the 75th percentile of ``x`` minus its 25th percentile."""
    upper_quartile = np.percentile(x, 75)
    lower_quartile = np.percentile(x, 25)
    return upper_quartile - lower_quartile

# Signal energy: mean of the squared values
def energy(series):
    """Return the mean of ``series`` squared element-wise."""
    squared = series ** 2
    return np.mean(squared)

def rfil(x):
    """Return the squared log ratio of the last value of ``x`` to its first value.

    ``x`` must support positional access through ``.iat`` (for example a
    pandas Series). When the first value is NaN, the second value is used as
    the starting point instead.
    """
    # Use the file-wide ``np`` alias; the bare name ``numpy`` is only bound
    # as a side effect of ``import numpy.matlib``.
    start = x.iat[1] if np.isnan(x.iat[0]) else x.iat[0]
    return np.log(x.iat[-1] / start) ** 2
    
def rmnl(x):
    """Return the log of the squared ratio between the maximum and minimum of ``x``."""
    highest = np.max(x)
    lowest = np.min(x)
    squared_ratio = (highest / lowest) ** 2
    return np.log(squared_ratio)

# Adverse return volatility
def adverse_volatility(series):
    """Return sqrt of the sum of squares of the strictly negative values.

    NaN values never compare as negative, so they are ignored. The result
    is 0.0 when ``series`` has no negative value.
    """
    values = np.asarray(series)
    # A boolean mask replaces the per-element Python loop; it selects the
    # same elements in the same order.
    negatives = values[values < 0]
    return np.sqrt(np.sum(negatives ** 2))

def outliers(series):
    """Return the fraction of elements whose absolute value exceeds mean + 3*std.

    The mean and standard deviation come from ``np.mean`` / ``np.std`` applied
    to ``series`` as given. The threshold is compared against ``abs(x)``, so it
    is not symmetric around the mean. The denominator is ``len(series)``, which
    includes any NaN entries. NaN entries never count as outliers.

    Returns NaN for an empty input instead of raising a division error.
    """
    total = len(series)
    if total == 0:
        return np.nan
    std = np.std(series)
    mean = np.mean(series)
    threshold = mean + 3 * std
    exceed_count = np.count_nonzero(np.abs(np.asarray(series)) > threshold)
    # The original called np.float, an alias removed in NumPy 1.24;
    # the builtin float does the same job.
    return exceed_count / float(total)

def realized_volatility_abs(series):
    """Return the sum of the absolute values of ``series``."""
    magnitudes = np.abs(series)
    return np.sum(magnitudes)


# https://www.kaggle.com/cldavies/single-value-baseline
def single_prediction(series):
    """Return sum(1/|x|) / sum(1/x**2) over the non-NaN, non-zero values of ``series``.

    ``series`` must support boolean-mask indexing (for example a pandas
    Series or a NumPy array).
    """
    # Use the file-wide ``np`` alias; the bare name ``numpy`` is only bound
    # as a side effect of ``import numpy.matlib``.
    finite = series[~np.isnan(series)]
    # Zeros are dropped because their reciprocal is undefined.
    nonzero = finite[finite != 0]
    inverse_target = 1 / np.abs(nonzero)
    return np.sum(inverse_target) / np.sum(np.square(inverse_target))


# Function to read our base train and test set
def read_train_test():
    """Load ``train.csv`` and ``test.csv`` from ``data_dir`` and add a ``row_id`` key.

    ``row_id`` is ``"<stock_id>-<time_id>"``, the key the book and trade
    feature frames are merged on. The number of training rows is printed.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    # Build the paths from the shared data_dir constant so the input location
    # is defined in one place, not repeated as a literal here.
    train = pd.read_csv(data_dir + 'train.csv')
    test = pd.read_csv(data_dir + 'test.csv')
    # Create a key to merge with book and trade data
    for frame in (train, test):
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-time_id order-book features for one stock.

    Args:
        file_path: Path of the stock's book parquet data. It must contain
            ``stock_id=<id>``; the text after the first ``=`` becomes the
            stock id in ``row_id``.

    Returns:
        A DataFrame with one row per ``time_id``. It holds the aggregated
        features over the whole bucket, the same aggregates restricted to
        ``seconds_in_bucket >= 450 / 300 / 150`` (suffixed ``_450``, ``_300``,
        ``_150``), and a ``row_id`` column of the form ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)
    df['wap4'] = calc_wap4(df)
    # Calculate log returns within each time_id.
    # transform() always returns a result aligned to df's index. With apply(),
    # pandas >= 2.0 prepends the group key to the index and the column
    # assignment fails.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return)
    df['log_return3'] = df.groupby(['time_id'])['wap3'].transform(log_return)
    df['log_return4'] = df.groupby(['time_id'])['wap4'].transform(log_return)

    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread (relative to the mid price of each level)
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Aggregations computed over the whole bucket
    create_feature_dict = {
        'wap1': [np.sum, np.std, rfil, rmnl],
        'wap2': [np.sum, np.std],
        'wap3': [np.sum, np.std],
        'wap4': [np.sum, np.std],
        'log_return1': [single_prediction, realized_volatility, outliers, np.std, kurtosis2, realized_volatility_25],
        'log_return2': [realized_volatility_abs],
        'log_return3': [adverse_volatility],
        'log_return4': [outliers],
        'wap_balance': [np.sum, np.max, np.std],
        'price_spread': [np.sum, np.max, np.std],
        'price_spread2': [np.sum, np.max, np.std, stats.median_abs_deviation],
        'bid_spread': [np.sum, np.max, np.std],
        'ask_spread': [np.sum, np.max, np.std],
        'total_volume': [np.sum, np.max, np.std],
        'volume_imbalance': [np.sum, np.max, np.std],
        "bid_ask_spread": [np.sum, np.max, np.std],
    }

    # Smaller set of aggregations recomputed on the late-bucket windows
    create_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility_abs],
        'log_return3': [adverse_volatility],
        'log_return4': [outliers],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict, seconds_in_bucket, add_suffix=False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, aggregation) pairs; 'time_id' becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get the stats for the full bucket, then left-merge each late window onto it
    df_feature = get_stats_window(create_feature_dict, seconds_in_bucket=0, add_suffix=False)
    windows = (450, 300, 150)
    for window in windows:
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket=window, add_suffix=True)
        df_feature = df_feature.merge(df_window, how='left', left_on='time_id_', right_on=f'time_id__{window}')
    # Drop unnecessary time_ids
    df_feature.drop([f'time_id__{window}' for window in windows], axis=1, inplace=True)

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis=1, inplace=True)
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-time_id trade features for one stock.

    Args:
        file_path: Path of the stock's trade parquet data. It must contain
            ``stock_id=<id>``; the text after the first ``=`` becomes the
            stock id in ``row_id``.

    Returns:
        A DataFrame with one row per ``time_id``. Every feature column is
        prefixed with ``trade_``; columns computed on rows with
        ``seconds_in_bucket >= 450 / 300 / 150`` also carry a ``_450``,
        ``_300`` or ``_150`` suffix. ``row_id`` is ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    # transform() always returns a result aligned to df's index. With apply(),
    # pandas >= 2.0 prepends the group key to the index and the column
    # assignment fails.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['time_return'] = df.groupby('time_id')['seconds_in_bucket'].transform(time_return)
    df['amount'] = df['price'] * df['size']

    # Aggregations computed over the whole bucket
    create_feature_dict = {
        'price': [iqr_p_v, rfil, rmnl],
        'log_return': [realized_volatility, kurtosis2],
        'seconds_in_bucket': [count_unique],
        'size': [np.sum, np.std, np.max, np.min, energy, iqr_p_v],
        'order_count': [np.sum, np.max],
        'time_return': [np.sum, np.max],
        'amount': [np.sum, np.max, np.min],
    }

    # Smaller set of aggregations recomputed on the late-bucket windows
    create_feature_dict_time = {
        'log_return': [realized_volatility, outliers],
        'seconds_in_bucket': [count_unique],
        'size': [np.sum],
        'order_count': [np.sum],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict, seconds_in_bucket, add_suffix=False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, aggregation) pairs; 'time_id' becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get the stats for the full bucket, then left-merge each late window onto it
    df_feature = get_stats_window(create_feature_dict, seconds_in_bucket=0, add_suffix=False)
    windows = (450, 300, 150)
    for window in windows:
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket=window, add_suffix=True)
        df_feature = df_feature.merge(df_window, how='left', left_on='time_id_', right_on=f'time_id__{window}')
    # Drop unnecessary time_ids
    df_feature.drop([f'time_id__{window}' for window in windows], axis=1, inplace=True)

    # Prefix everything so trade features cannot collide with book features
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis=1, inplace=True)
    return df_feature

# Add per-stock_id and per-time_id summary statistics of the volatility features
def get_time_stock(df):
    """Return ``df`` with group statistics of selected columns merged in.

    For every column in ``vol_cols`` the mean, std, max and min are computed
    once per ``stock_id`` (new columns end in ``_stock``) and once per
    ``time_id`` (new columns end in ``_time``), then left-merged onto ``df``.
    The input frame is not modified; a new frame is returned.
    """
    # Columns to summarise
    vol_cols = [
        'log_return1_realized_volatility', 'log_return2_realized_volatility_abs',
        'log_return1_realized_volatility_450', 'log_return2_realized_volatility_abs_450',
        'log_return1_realized_volatility_300', 'log_return2_realized_volatility_abs_300',
        'log_return1_realized_volatility_150', 'log_return2_realized_volatility_abs_150',
        'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_450',
        'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_150',
        'wap1_rfil', 'wap1_rmnl', 'log_return1_realized_volatility_25', 'log_return1_outliers',
    ]

    def _group_stats(key, tag):
        # Aggregate by `key`, flatten the two-level column names, and tag them.
        # The key column ends up named '<key>__<tag>'.
        grouped = df.groupby([key])[vol_cols].agg(['mean', 'std', 'max', 'min']).reset_index()
        grouped.columns = ['_'.join(parts) for parts in grouped.columns]
        return grouped.add_suffix('_' + tag)

    per_stock = _group_stats('stock_id', 'stock')
    per_time = _group_stats('time_id', 'time')

    # Merge with original dataframe: stock-level first, then time-level
    merged = df.merge(per_stock, how='left', left_on=['stock_id'], right_on=['stock_id__stock'])
    merged = merged.merge(per_time, how='left', left_on=['time_id'], right_on=['time_id__time'])
    # The suffixed key columns duplicate stock_id / time_id
    return merged.drop(columns=['stock_id__stock', 'time_id__time'])
    
# Function to run the per-stock preprocessing in parallel (one task per stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Build the merged book + trade feature frame for every stock id.

    Args:
        list_stock_ids: Iterable of stock ids to process.
        is_train: Read from the ``*_train.parquet`` folders when True,
            otherwise from the ``*_test.parquet`` folders.

    Returns:
        One DataFrame with the per-stock results concatenated and the index reset.
    """
    # Pick the partition folders once; the stock id is appended per task
    if is_train:
        book_prefix = "book_train.parquet/stock_id="
        trade_prefix = "trade_train.parquet/stock_id="
    else:
        book_prefix = "book_test.parquet/stock_id="
        trade_prefix = "trade_test.parquet/stock_id="

    # Work done for a single stock id
    def for_joblib(stock_id):
        file_path_book = data_dir + book_prefix + str(stock_id)
        file_path_trade = data_dir + trade_prefix + str(stock_id)
        book_features = book_preprocessor(file_path_book)
        trade_features = trade_preprocessor(file_path_trade)
        # Left merge: keep every book row even when there is no matching trade row
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Fan the stock ids out over all available cores
    tasks = (delayed(for_joblib)(stock_id) for stock_id in tqdm(list_stock_ids))
    frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    # Concatenate all the dataframes returned by Parallel
    return pd.concat(frames, ignore_index = True)

# Root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Evaluation callback reporting the root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Return ``('RMSPE', score, False)`` for the given predictions.

    The true labels are read from ``lgb_train.get_label()``. The final
    ``False`` is returned as-is; presumably it is the "higher is better"
    flag of the LightGBM feval protocol - confirm against the caller.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

In [None]:
# Read train and test (both frames get a 'row_id' key of the form "<stock_id>-<time_id>")
train, test = read_train_test()
# Get unique stock ids 
train_stock_ids = train['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train = True)
# Left merge on row_id: every row of train is kept even when no features were built for it
train = train.merge(train_, on = ['row_id'], how = 'left')

In [None]:
# Add the per-stock_id and per-time_id mean/std/max/min of the volatility columns
train = get_time_stock(train)

In [None]:
# replace by order sum (tau)
# size_tau2: square root of the reciprocal of the summed trade order count
train['size_tau2'] = np.sqrt( 1/ train['trade_order_count_sum'] )
# rrr: realized volatility of the lowest quarter of log returns divided by the
# realized volatility of all log returns
train['rrr'] = train['log_return1_realized_volatility_25']/train['log_return1_realized_volatility']

In [None]:
# Save the engineered training frame as a pickle (relative path, so it lands in the working directory)
train.to_pickle('dataset_baseline_train_best_v3.pkl')