In [None]:
import glob

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import kurtosis, skew
import os
from multiprocessing import Pool
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler,QuantileTransformer
from tqdm import tqdm
import lightgbm as lgbm
import warnings
from numpy.random import seed
from numpy import matlib
seed(42)
import tensorflow as tf
tf.random.set_seed(42)
from tensorflow import keras
from keras import backend as K

In [None]:
# Number of worker processes given to the multiprocessing pools in
# get_overall / get_trade_overall.
N_JOBS = 4 # os.cpu_count()
# Root directory holding the book/trade parquet folders and the train/test CSVs.
DATA_DIR = '/kaggle/input/optiver-realized-volatility-prediction'
# Default `group_number` of FeatureExtraction.get_stock_groups, which forwards
# it to chunker() as the chunk size.
NUM_GROUPS = 14

In [None]:
import numpy as np


def rmspe(y_true, y_pred):
    """Root mean squared percentage error.

    Every error ``y_true - y_pred`` is divided by its own ``y_true`` before
    being squared, averaged and square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)


def log_return(list_stock_prices):
    """Return the consecutive differences of the natural log of the prices.

    ``.diff()`` is called on the result of ``np.log``, so the input has to be
    an object that supports it (for example a pandas Series).
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


def realized_volatility(series_log_return):
    """Return the square root of the sum of the squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)


def realized_volatility_log_return(list_stock_prices):
    """Realized volatility computed straight from a price series.

    Takes the log of the prices, differences consecutive values, and returns
    the square root of the sum of the squared differences.
    """
    log_prices = np.log(list_stock_prices)
    squared_returns = log_prices.diff() ** 2
    return np.sqrt(np.sum(squared_returns))


def chunker(seq, size):
    """Split ``seq`` into consecutive slices of at most ``size`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    chunks = []
    for start in range(0, len(seq), size):
        chunks.append(seq[start:start + size])
    return chunks

In [None]:
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size


class FeatureExtraction:
    """Builds per-``time_id`` features from order-book and trade parquet files.

    ``get_stock_stat`` (order book) and ``trade_stock_stat`` (trades) each read
    one parquet path, aggregate it per ``time_id`` over all rows and over the
    trailing windows described by ``LAST_SECONDS``, and return one row per
    ``time_id`` with a ``stock_id`` column parsed from the path.
    """

    # Second from which the trailing-window lengths are subtracted.
    BUCKET_SECONDS = 600
    # Trailing-window lengths. A window keeps rows with
    # seconds_in_bucket >= BUCKET_SECONDS - length, and its columns are
    # suffixed with that start second (450, 300, 150).
    LAST_SECONDS = (150, 300, 450)

    def __init__(self):
        pass

    def calc_wap(self, df):
        """Level-1 weighted average price.

        Each price is weighted by the size on the opposite side:
        (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
        """
        wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (
                    df['bid_size1'] + df['ask_size1'])
        return wap

    def calc_wap2(self, df):
        """Level-2 weighted average price, same formula as ``calc_wap``."""
        wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (
                    df['bid_size2'] + df['ask_size2'])
        return wap

    def calc_wap3(self, df):
        """Level-2 average price with each price weighted by its own side's size.

        Not called by the feature builders in this class.
        """
        wap = (df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']) / (
                    df['bid_size2'] + df['ask_size2'])
        return wap

    def target_encoding(self):
        """Placeholder; does nothing."""
        pass

    @staticmethod
    def _stock_id_from_path(path):
        """Parse the integer that follows the first '=' in ``path``."""
        return int(path.split('=')[1])

    @staticmethod
    def _aggregate_by_time_id(df, agg_spec):
        """Aggregate ``df`` per ``time_id`` and flatten headers to 'column_stat'."""
        df_feature = df.groupby('time_id').agg(agg_spec).reset_index()
        # After reset_index the key column is ('time_id', ''), which joins to
        # 'time_id_'; callers rename it back once all merges are done.
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        return df_feature

    def _aggregate_with_windows(self, df, agg_spec):
        """Full aggregation plus one left-merged, suffixed set per trailing window."""
        df_feature = self._aggregate_by_time_id(df, agg_spec)
        for window_length in self.LAST_SECONDS:
            start_second = self.BUCKET_SECONDS - window_length
            df_window = self._aggregate_by_time_id(
                df[df['seconds_in_bucket'] >= start_second], agg_spec
            ).add_suffix(f'_{start_second}')
            # Left merge: a time_id with no rows in the window gets NaN features.
            df_feature = df_feature.merge(
                df_window, how='left', left_on='time_id_', right_on=f'time_id__{start_second}')
            df_feature = df_feature.drop(columns=[f'time_id__{start_second}'])
        return df_feature

    def trade_stock_stat(self, path):
        """Build trade features for the parquet file at ``path``.

        Returns a frame with ``time_id``, ``stock_id`` and ``trade_``-prefixed
        aggregates (full range and every trailing window).
        """
        df = pd.read_parquet(path)
        # transform keeps the original row index, so the assignment aligns on
        # every pandas version; apply() prepends the group key on pandas >= 2.0
        # and the column assignment then fails.
        df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
        aggregate_dictionary = {
            'log_return': [realized_volatility, "std"],
            'seconds_in_bucket': [count_unique],
            'size': ["sum"],
            'order_count': ["sum", "mean"]
        }

        df_feature = self._aggregate_with_windows(df, aggregate_dictionary)
        df_feature = df_feature.add_prefix('trade_')
        df_feature['stock_id'] = self._stock_id_from_path(path)
        df_feature = df_feature.rename(columns={'trade_time_id_': 'time_id'})
        return df_feature

    def get_stock_stat(self, path):
        """Build order-book features for the parquet file at ``path``.

        Returns a frame with ``time_id``, ``stock_id`` and the aggregates of the
        derived book columns (full range and every trailing window).
        """
        df = pd.read_parquet(path)
        df['wap'] = self.calc_wap(df)
        df['wap2'] = self.calc_wap2(df)

        # See trade_stock_stat for why transform is used instead of apply.
        df['log_return'] = df.groupby('time_id')['wap'].transform(log_return)
        df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

        df['wap_balance'] = abs(df['wap'] - df['wap2'])

        df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / (df['ask_price1'] + df['bid_price1'])
        df['bid_spread'] = df['bid_price1'] - df['bid_price2']
        df['ask_spread'] = df['ask_price1'] - df['ask_price2']
        df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
        df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

        df['bid_ask_spread'] = df['ask_price1'] / df['bid_price1']

        # String names give the same column names ('..._mean', '..._std') as the
        # numpy callables did and pin the pandas implementations (sample std),
        # which is what pandas substituted for np.mean / np.std anyway.
        create_feature_dict = {
            'log_return': [realized_volatility, 'std'],
            'log_return2': [realized_volatility],
            'wap_balance': ['mean', 'std'],
            'price_spread': ['mean', 'std'],
            'bid_spread': ['mean', 'std'],
            'ask_spread': ['mean', 'std'],
            'volume_imbalance': ['mean', 'std'],
            'total_volume': ['mean', 'std'],
            'wap': ['mean', 'std'],
            'bid_ask_spread': ['mean', 'std'],
        }

        df_feature = self._aggregate_with_windows(df, create_feature_dict)
        df_feature['stock_id'] = self._stock_id_from_path(path)
        df_feature = df_feature.rename(columns={'time_id_': 'time_id'})
        return df_feature

    def get_stock_groups(self, df, group_number=NUM_GROUPS):
        """Chunk stock ids, ordered by ascending mean ``target``.

        ``group_number`` is handed to ``chunker`` as the chunk size, i.e. it is
        the number of stocks per group, not the number of groups.
        """
        # Average only the target column instead of every column of the frame.
        stock_groups = df.groupby("stock_id")["target"].mean().reset_index().sort_values(by="target")
        stock_groups = chunker(list(stock_groups["stock_id"]), group_number)
        return stock_groups


# Shared module-level instance: its bound methods are passed to Pool.map in
# get_overall / get_trade_overall, and combined() calls get_stock_groups on it.
feature_extraction = FeatureExtraction()


def get_overall(book):
    """Compute order-book features for every path in ``book``.

    Each path is handed to ``feature_extraction.get_stock_stat`` in a pool of
    ``N_JOBS`` worker processes and the resulting frames are stacked row-wise.
    """
    with Pool(processes=N_JOBS) as workers:
        per_stock_frames = workers.map(feature_extraction.get_stock_stat, book)
    return pd.concat(per_stock_frames, axis=0)


def get_trade_overall(book):
    """Compute trade features for every path in ``book``.

    Runs ``feature_extraction.trade_stock_stat`` over the paths with ``N_JOBS``
    worker processes, then concatenates the per-path frames along the rows.
    """
    with Pool(processes=N_JOBS) as workers:
        per_stock_frames = workers.map(feature_extraction.trade_stock_stat, book)
    stacked = pd.concat(per_stock_frames, axis=0)
    return stacked


def merge_frames(dataset='train'):
    """Build book and trade features for ``dataset`` and join them.

    Paths are globbed (and sorted) from ``book_{dataset}.parquet`` and
    ``trade_{dataset}.parquet`` under ``DATA_DIR``. Trade features are
    left-joined onto book features on ``stock_id`` and ``time_id``.
    """
    book_paths = sorted(glob.glob(f'{DATA_DIR}/book_{dataset}.parquet/*'))
    trade_paths = sorted(glob.glob(f'{DATA_DIR}/trade_{dataset}.parquet/*'))

    book_features = get_overall(book_paths)
    trade_features = get_trade_overall(trade_paths)
    return book_features.merge(trade_features, on=["stock_id", "time_id"], how="left")


def get_time_stock(df, mapping_group):
    """Append cross-sectional statistics of the volatility-style columns.

    For every selected column (name contains "return", "provide" or "/", or
    equals 'seconds_in_bucket_count_unique') two families of features are added:

    * ``{col}_{stat}_by_time``: stat over all rows sharing the row's ``time_id``.
    * ``group_{col}_{stat}``: stat over rows sharing both the row's ``time_id``
      and its stock group (``mapping_group[stock_id]``).

    ``stat`` is one of mean, std (pandas sample std), max, min.

    Args:
        df: frame with ``stock_id``, ``time_id`` and the feature columns.
        mapping_group: mapping from ``stock_id`` to a group label.

    Returns:
        A new frame: the original columns, then the ``*_by_time`` columns, then
        the ``group_*`` columns. The input frame is not modified.
    """
    stats = ['mean', 'std', 'max', 'min']
    vol_cols = [
        col for col in df.columns
        if "return" in col or 'provide' in col
        or 'seconds_in_bucket_count_unique' == col or '/' in col
    ]

    # Statistics across all stocks of a time_id.
    df_time_id = df.groupby(['time_id'])[vol_cols].agg(stats).reset_index()
    df_time_id.columns = ['_'.join(col) for col in df_time_id.columns]
    df_time_id = df_time_id.add_suffix('_by_time')

    df = df.merge(df_time_id, how='left', left_on=['time_id'], right_on=['time_id__by_time'])
    df = df.drop(columns=['time_id__by_time'])

    # Vectorised "<group>_<time_id>" key. Building it as a string keeps rows
    # whose stock is missing from mapping_group in a bucket of their own
    # ("nan_<time_id>") instead of dropping them from the grouping.
    group_time = df['stock_id'].map(mapping_group).astype(str) + '_' + df['time_id'].astype(str)
    grouped = df.groupby(group_time)

    # Build every group feature first and attach them in a single concat; the
    # insertion order (column, then mean/std/max/min) fixes the column order.
    group_features = {
        f'group_{col}_{stat}': grouped[col].transform(stat)
        for col in vol_cols
        for stat in stats
    }
    df = pd.concat([df, pd.DataFrame(group_features, index=df.index)], axis=1)
    return df


def combined(dataset, mapping_group=None):
    """Load ``{dataset}.csv`` and attach every engineered feature.

    Args:
        dataset: name of the split; selects ``{dataset}.csv`` and the matching
            book/trade parquet folders under ``DATA_DIR``.
        mapping_group: stock_id -> group mapping. Ignored and rebuilt from the
            targets when ``dataset == "train"``; required for any other split.

    Returns:
        ``(df, mapping_group)``: the feature frame (forward-filled) and the
        mapping that was used.

    Raises:
        ValueError: if ``dataset`` is not "train" and no mapping is supplied.
    """
    # Fail before the expensive feature extraction rather than deep inside
    # get_time_stock, where a missing mapping surfaces as an opaque TypeError.
    if dataset != "train" and mapping_group is None:
        raise ValueError(
            "mapping_group is required when dataset is not 'train'; "
            "pass the mapping returned by combined(dataset='train')."
        )

    print(f"Loading & extract features for {dataset}")
    df = pd.read_csv(f'{DATA_DIR}/{dataset}.csv')
    book_trade_df = merge_frames(dataset=dataset)
    df = df.merge(book_trade_df, on=["stock_id", "time_id"], how="left")
    if dataset == "train":
        stock_groups = feature_extraction.get_stock_groups(df)
        mapping_group = {g: i for i, gs in enumerate(stock_groups) for g in gs}

    df = get_time_stock(df, mapping_group)
    # Forward-fill remaining gaps from the previous row; ffill() replaces the
    # deprecated fillna(method='ffill') form with identical results.
    df = df.ffill()

    return df, mapping_group


# Train features are built first: combined() derives the stock -> group mapping
# from the train targets and returns it alongside the frame.
data_train, mapping_group = combined(dataset="train")
# feature_list = list(data_train.drop(columns=["stock_id", "target"]).columns)
# The test frame reuses the mapping derived from the train split.
data_test, _ = combined(dataset="test", mapping_group=mapping_group)

In [None]:
# Distinct stock ids in the training frame; the training cell fits models per id.
stock_ids = data_train['stock_id'].unique()

In [None]:
 # Exploratory display of the training rows whose stock_id is 1; nothing is assigned.
 data_train[data_train.stock_id ==1].reset_index(drop=True)

In [None]:
# Fit one Ridge model per (stock, CV fold); models[stock_id] is the list of the
# five fold models for that stock.
models = {}
from sklearn.linear_model import Ridge, LinearRegression
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
for stock_id in stock_ids:
    print(f'stock_id: {stock_id}')
    cv_models = []
    tiny_data = data_train[data_train.stock_id == stock_id].reset_index(drop=True)
    # The prediction cell fills missing test features with 0; apply the same
    # rule here so training and inference agree and Ridge never receives NaN.
    X = tiny_data.drop(columns=['stock_id', 'time_id', 'target']).fillna(0)
    y = tiny_data['target'].values
    for fold, (train_index, valid_index) in enumerate(kfold.split(X)):
        print(f'fold : {fold+1}')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        # Weighting each squared error by 1 / y^2 turns the least-squares loss
        # into a squared percentage error, the quantity rmspe() reports.
        train_weights = 1 / np.square(y_train)
        model = Ridge()
        model.fit(X_train, y_train, sample_weight=train_weights)
        y_pred = model.predict(X_valid)
        print('RMSPE: ', rmspe(y_valid, y_pred))
        cv_models.append(model)
    models[stock_id] = cv_models

In [None]:
# Predict every test stock with the average of its fold models.
test_stock_ids = data_test['stock_id'].unique()
data_test.fillna(0, inplace=True)
# Start from a float column: the predictions written below are floats, and
# assigning them into an integer column is an incompatible-dtype assignment.
data_test['target'] = 0.0
for stock_id in test_stock_ids:
    stock_mask = data_test.stock_id == stock_id
    tiny_data = data_test[stock_mask].copy()
    X_test = tiny_data.drop(columns=['stock_id', 'time_id', 'row_id', 'target'])
    # One prediction vector per fold model, averaged element-wise across folds.
    y_preds = [model.predict(X_test) for model in models[stock_id]]
    y_preds = np.mean(y_preds, 0)
    data_test.loc[stock_mask, 'target'] = y_preds

In [None]:
# Write the row_id / target columns of the test frame to submission.csv (no index column).
data_test[['row_id', 'target']].to_csv("submission.csv", index=False)
