In [None]:
import os
import glob
from joblib import Parallel, delayed
import joblib
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
from scipy.stats.mstats import winsorize
import pickle

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Use the fully-qualified option key: the bare 'max_columns' pattern is ambiguous in newer
# pandas releases (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 300)

In [None]:
# Root directory of the input data. The CSV and parquet paths used below are built by plain
# string concatenation onto this value, so the trailing slash is required.
data_dir = '../input/optiver-realized-volatility-prediction/'

In [None]:
def calculate_wap(df):
    """Return a single size-weighted price computed across both book levels.

    For each level the bid price is weighted by the ask size and the ask price
    by the bid size; the two weighted sums are added and divided by the total
    size quoted on both levels.
    """
    weighted_level1 = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    weighted_level2 = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']
    combined = weighted_level1 + weighted_level2
    return combined / total_size

def calculate_wap1(df):
    """Return the plain average of the level-1 and level-2 weighted prices.

    Each level's price weights the bid price by the ask size and the ask price
    by the bid size, normalised by that level's own total size.
    """
    level1_price = (
        (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1'])
        / (df['bid_size1'] + df['ask_size1'])
    )
    level2_price = (
        (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2'])
        / (df['bid_size2'] + df['ask_size2'])
    )
    return (level1_price + level2_price) / 2

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    """Return the first difference of the natural log of `series` (first element is NaN)."""
    log_values = np.log(series)
    return log_values.diff()

def pct_return(series):
    """Return the fractional change between consecutive elements of `series`."""
    relative_change = series.pct_change()
    return relative_change

def mdd(series):
    """Return the maximum drawdown: the lowest value of series / running-max - 1."""
    running_peak = series.expanding().max()
    drawdown = series / running_peak - 1
    worst = drawdown.min()
    return worst

def mdu(series):
    """Return the maximum draw-up: the highest value of series / running-min - 1."""
    running_trough = series.expanding().min()
    drawup = series / running_trough - 1
    best = drawup.max()
    return best

def tick_price(series):
    """Return 0.01 divided by the mean per-tick move size observed in `series`.

    The smallest strictly positive absolute change is taken as one tick; every
    absolute change is expressed as a rounded number of such ticks, and the
    NaN-ignoring mean of (change / ticks) is used as the per-tick move size.
    """
    abs_moves = series.diff().abs()
    # Only strictly positive moves can define the tick; others become NaN and are skipped
    positive_moves = abs_moves.where(abs_moves > 0)
    smallest_move = np.nanmin(positive_moves)
    ticks_per_move = (abs_moves / smallest_move).round()
    mean_move_per_tick = np.nanmean(abs_moves / ticks_per_move)
    return 0.01 / mean_move_per_tick

# Calculate the realized volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values of `series`."""
    squared = series**2
    total = np.sum(squared)
    return np.sqrt(total)

# Function to count unique elements of a series
def count_unique(series):
    """Return how many distinct values `series` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size

def count_level(series):
    """Count the positions whose value differs from the next element.

    The last position always counts, because its shifted neighbour is missing
    and therefore never compares equal.
    """
    differs_from_next = series.shift(-1) != series
    return int(differs_from_next.sum())

def _ewma_vola(ReturnSeries, Lambda):
    """Shared implementation of the exponentially weighted volatility measures.

    The squared deviations from the sample mean are weighted by Lambda**k, where
    k is the number of steps before the last observation (the last observation
    gets weight 1). The weighted sum is NOT normalised by the sum of weights;
    its square root is returned.
    """
    SampleSize = len(ReturnSeries)
    Average = ReturnSeries.mean()

    # Exponents run from SampleSize-1 down to 0, so older observations decay more
    exponents = np.arange(SampleSize - 1, -1, -1)
    vecLambda = np.power(Lambda, exponents)

    sxxewm = (np.power(ReturnSeries - Average, 2) * vecLambda).sum()
    return np.sqrt(sxxewm)


def ewma_vola_999(ReturnSeries, Lambda=0.999):
    """Exponentially weighted volatility of `ReturnSeries` with default decay 0.999.

    Parameters
    ----------
    ReturnSeries : pandas.Series
        Values whose weighted dispersion around their mean is measured.
    Lambda : float, default 0.999
        Decay factor applied per step back from the last observation.

    Returns
    -------
    float
        Square root of the (un-normalised) weighted sum of squared deviations.
    """
    return _ewma_vola(ReturnSeries, Lambda)


def ewma_vola_99(ReturnSeries, Lambda=0.99):
    """Exponentially weighted volatility of `ReturnSeries` with default decay 0.99.

    Identical to `ewma_vola_999` except for the default value of `Lambda`.
    """
    return _ewma_vola(ReturnSeries, Lambda)

In [None]:
# Function to read our base train and test set
def read_train_test():
    """Read train.csv and test.csv from `data_dir` and add a `row_id` column to each.

    `row_id` is "<stock_id>-<time_id>" and is the key later used to merge the
    per-stock book and trade features. Returns the tuple (train, test).
    """
    loaded = []
    for csv_name in ('train.csv', 'test.csv'):
        frame = pd.read_csv(data_dir + csv_name)
        # Key shared with the book/trade feature frames
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
        loaded.append(frame)
    train, test = loaded
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Function to get group stats for different windows (seconds in bucket)
def get_stats_window(df, create_feature_dict, seconds_in_bucket, add_suffix = False):
    """Aggregate per `time_id` over rows with seconds_in_bucket >= `seconds_in_bucket`.

    `create_feature_dict` maps column name -> list of aggregations. The resulting
    two-level column index is flattened to "<column>_<aggregation>" (the group key
    becomes "time_id_"). When `add_suffix` is true, "_<seconds_in_bucket>" is
    appended to every column name, including the key.
    """
    in_window = df['seconds_in_bucket'] >= seconds_in_bucket
    aggregated = df.loc[in_window].groupby(['time_id']).agg(create_feature_dict)
    df_feature = aggregated.reset_index()
    # Flatten (column, aggregation) pairs into single names
    df_feature.columns = ['_'.join(name_parts) for name_parts in df_feature.columns]
    if not add_suffix:
        return df_feature
    return df_feature.add_suffix('_' + str(seconds_in_bucket))

def get_stats_window2(df, create_feature_dict, seconds_in_bucket_s, seconds_in_bucket_e, add_suffix = False):
    """Aggregate per `time_id` over rows with start <= seconds_in_bucket < end.

    Works like `get_stats_window` but with a half-open window
    [`seconds_in_bucket_s`, `seconds_in_bucket_e`). When `add_suffix` is true,
    "_<start>_<end>" is appended to every column name, including the key.
    """
    seconds = df['seconds_in_bucket']
    in_window = (seconds >= seconds_in_bucket_s) & (seconds < seconds_in_bucket_e)
    aggregated = df.loc[in_window].groupby(['time_id']).agg(create_feature_dict)
    df_feature = aggregated.reset_index()
    # Flatten (column, aggregation) pairs into single names
    df_feature.columns = ['_'.join(name_parts) for name_parts in df_feature.columns]
    if not add_suffix:
        return df_feature
    return df_feature.add_suffix('_' + str(seconds_in_bucket_s) + '_' + str(seconds_in_bucket_e))

def make_peak_features(x):
    """Summarise the local maxima of `x` found with a minimum spacing of 60 samples.

    Returns a Series with the number of peaks, the mean and standard deviation
    of the peak values, and the spread between the highest and lowest peak
    (0 when no peak is found).
    """
    from scipy.signal import find_peaks

    peak_positions, _ = find_peaks(x, distance=60)
    peak_values = x[peak_positions]

    if len(peak_positions) > 0:
        max_diff = max(peak_values) - min(peak_values)
    else:
        max_diff = 0

    summary = [len(peak_positions), np.mean(peak_values), np.std(peak_values), max_diff]
    labels = ['peaks_n', 'peaks_mean', 'peaks_std', 'peaks_max_diff']
    return pd.Series(summary, index=labels)
# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-time_id order-book features for one stock.

    Parameters
    ----------
    file_path : str
        Path of the stock's book parquet partition. It must contain
        "stock_id=<id>"; the text after the first '=' is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per time_id with a `row_id` column ("<stock_id>-<time_id>") and
        the aggregated features over the full bucket plus the trailing windows
        starting at 550, 500, 450, 400, 300, 200, 150, 100, 50, 25 and 10 seconds
        (window columns carry a "_<start>" suffix).
    """
    df = pd.read_parquet(file_path)

    # Weighted average price per book level (bid price weighted by ask size and vice versa)
    df['wap1'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    df['wap2'] = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])

    # Log returns within each time_id. `transform` (rather than `apply`) guarantees the
    # result is indexed like `df`; with pandas >= 2.0 `apply` prepends the group key to
    # the index and the column assignment fails.
    df['log_return1'] = df.groupby('time_id')['wap1'].transform(log_return)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

    # Distance between the two level prices
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Spreads
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = (df['bid_price1'] - df['bid_price2']) / (df['bid_price1'] + df['bid_price2'])
    df['ask_spread'] = (df['ask_price2'] - df['ask_price1']) / (df['ask_price1'] + df['ask_price2'])

    # Quoted size, level 1 only and both levels
    df['total_volume_1'] = df['ask_size1'] + df['bid_size1']
    df['total_volume_2'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])

    # Absolute ask/bid size imbalance, level 1 only and both levels
    df['volume_imbalance_1'] = abs((df['ask_size1'] - df['bid_size1']))
    df['volume_imbalance_2'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Aggregations per column; insertion order fixes the output column order
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
    }
    for column in (
        'log_return1', 'log_return2', 'wap_balance',
        'price_spread', 'bid_spread', 'ask_spread',
        'total_volume_1', 'total_volume_2',
        'volume_imbalance_1', 'volume_imbalance_2',
    ):
        create_feature_dict[column] = [np.sum, realized_volatility, np.mean, np.std]

    # Full-bucket stats first, then left-join each trailing window (same order as before)
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    for window_start in (550, 500, 450, 400, 300, 200, 150, 100, 50, 25, 10):
        df_window = get_stats_window(df, create_feature_dict, seconds_in_bucket = window_start, add_suffix = True)
        df_feature = df_feature.merge(
            df_window, how = 'left', left_on = 'time_id_', right_on = f'time_id__{window_start}'
        )

    # Drop the duplicated per-window key columns
    window_keys = df_feature.filter(like='time_id__').columns.tolist()
    df_feature = df_feature.drop(columns = window_keys)

    # Create row_id so we can merge with the train/test tables
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = stock_id + '-' + df_feature['time_id_'].astype(str)
    df_feature = df_feature.drop(columns = ['time_id_'])

    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-time_id trade features for one stock.

    Parameters
    ----------
    file_path : str
        Path of the stock's trade parquet partition. It must contain
        "stock_id=<id>"; the text after the first '=' is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per time_id with a `row_id` column ("<stock_id>-<time_id>") and
        "trade_"-prefixed aggregates over the full bucket plus the trailing
        windows starting at 550, 500, 300, 50, 25 and 10 seconds (window columns
        carry a "_<start>" suffix).
    """
    df = pd.read_parquet(file_path)

    # Log returns of the trade price within each time_id. `transform` (rather than
    # `apply`) guarantees the result is indexed like `df`; with pandas >= 2.0 `apply`
    # prepends the group key to the index and the column assignment fails.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility, np.mean, np.std],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.mean, np.std , realized_volatility],
        'order_count':[np.sum, np.mean, np.std, realized_volatility],
    }

    # Full-bucket stats first, then left-join each trailing window (same order as before)
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    for window_start in (550, 500, 300, 50, 25, 10):
        df_window = get_stats_window(df, create_feature_dict, seconds_in_bucket = window_start, add_suffix = True)
        df_feature = df_feature.merge(
            df_window, how = 'left', left_on = 'time_id_', right_on = f'time_id__{window_start}'
        )

    # Drop the duplicated per-window key columns
    window_keys = df_feature.filter(like='time_id__').columns.tolist()
    df_feature = df_feature.drop(columns = window_keys)

    # Prefix everything so the names cannot clash with the book features
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = stock_id + '-' + df_feature['trade_time_id_'].astype(str)
    df_feature = df_feature.drop(columns = ['trade_time_id_'])

    return df_feature

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-time_id aggregates of every realized-volatility column.

    Every column whose name contains 'realized_volatility' is aggregated across
    all rows sharing a `time_id` (mean, std, max, min). The aggregates are joined
    back onto `df` as "<column>_<aggregation>_time" columns and the merged frame
    is returned; the input frame is not modified.
    """
    vol_cols = [name for name in df.columns if 'realized_volatility' in name]

    # Group by the time id
    per_time = df.groupby(['time_id'])[vol_cols].agg(['mean', 'std', 'max', 'min']).reset_index()
    # Flatten (column, aggregation) pairs and tag them as time-level features
    per_time.columns = ['_'.join(name_parts) for name_parts in per_time.columns]
    per_time = per_time.add_suffix('_time')

    # Merge with the original dataframe and discard the duplicated key
    merged = df.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    return merged.drop(columns = ['time_id__time'])
    
# Function to run the preprocessing in parallel (one task per stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Compute book + trade features for every stock id and stack the results.

    For each stock id the book and trade partitions (train or test, depending on
    `is_train`) are preprocessed and left-joined on `row_id`; the per-stock frames
    are produced through joblib's Parallel with n_jobs=-1 and concatenated with a
    fresh index.
    """
    from tqdm import tqdm

    # Pick the train or test partitions once, up front
    if is_train:
        book_prefix = data_dir + "book_train.parquet/stock_id="
        trade_prefix = data_dir + "trade_train.parquet/stock_id="
    else:
        book_prefix = data_dir + "book_test.parquet/stock_id="
        trade_prefix = data_dir + "trade_test.parquet/stock_id="

    def for_joblib(stock_id):
        # Book features are the left side, so rows without trades are kept
        book_features = book_preprocessor(book_prefix + str(stock_id))
        trade_features = trade_preprocessor(trade_prefix + str(stock_id))
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    tasks = (delayed(for_joblib)(stock_id) for stock_id in tqdm(list_stock_ids))
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    return pd.concat(per_stock_frames, ignore_index = True)

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of `y_pred` relative to `y_true`."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Evaluate RMSPE of `y_pred` against the labels stored in `lgb_train`.

    Returns the tuple ('RMSPE', score, False).
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    # NOTE(review): the trailing False is presumably LightGBM's "is_higher_better" flag — confirm
    return 'RMSPE', score, False

In [None]:
# Directory holding the saved models and the pickled feature list
MODEL_DIR = '../input/lgbmv115'

# features = test.drop(columns=['time_id', 'row_id']).columns

# Despite the .txt extension this file is a pickle; `features` is later used to select the
# model input columns from `test`.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted MODEL_DIR.
with open(f"{MODEL_DIR}/lgbm_features.txt", "rb") as fp:   # Unpickling
    features = pickle.load(fp)

In [None]:
# Load the base train/test tables, each with its row_id merge key
train, test = read_train_test()

# Get unique stock ids
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
# Left-join the engineered features onto the test rows, so every test row is kept
test = test.merge(test_, on = ['row_id'], how = 'left')

In [None]:
# for col in test.drop(columns=['time_id', 'row_id', 'stock_id']).columns:
    
#     test[col] = winsorize(test[col], limits=[0, 0.01])

In [None]:
# Append the per-time_id aggregates of the realized-volatility columns.
# NOTE: this cell rebinds `test`; re-running it on its own would aggregate again over the
# "*_time" columns it has just added, because their names also contain 'realized_volatility'.
test = get_time_stock(test)

In [None]:
# target_encode = train.groupby('stock_id')['target'].agg(['mean'])
# # target_encode.columns = ['_'.join(col) for col in target_encode.columns]
# target_encode = target_encode.add_suffix('_' + 'stock')
# target_encode = target_encode.reset_index()

In [None]:
# Display the loaded feature collection as a plain list for inspection
list(features)

In [None]:
# Average the predictions of every saved model found in MODEL_DIR.
# Sorting makes the load order (and the printed log) deterministic across runs.
files = sorted(glob.glob(f'{MODEL_DIR}/*lgbm*.pkl'))
assert len(files) > 0, f'no model files matching *lgbm*.pkl found in {MODEL_DIR}'

# The model input is the same for every model, so build it once outside the loop
X_test = test[features].fillna(0) # .merge(target_encode, how='left', on='stock_id')

y_preds = np.zeros(len(test))
for f in files:
    print(f)
    # NOTE(review): joblib.load unpickles the file — only load models from a trusted MODEL_DIR
    model = joblib.load(f)
    y_preds += model.predict(X_test)
# Divide by the number of models rather than relying on a leaked loop counter
y_preds /= len(files)

test['target'] = y_preds

In [None]:
# Keep only the two columns written to the submission file, then display them
sub = test[['row_id', 'target']]
sub

In [None]:
# Write the submission to the working directory without the DataFrame index
sub.to_csv('submission.csv',index = False)