In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
_SECONDS_PER_BUCKET = 600  # the time grid covers seconds_in_bucket 0..599 for every time_id


def _logHighLow(values):
    """Return log(max / min) of ``values`` (the log high-low range)."""
    return np.log(np.max(values) / np.min(values))


def _aggregateBookData(df, interval):
    """Aggregate per-second features into ``interval``-second slots, one wide row per time_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-second frame holding ``time_id``, ``seconds_in_bucket``, ``wap1``,
        ``wap2``, ``ask1_bid1_spread``, ``bid_spread``, ``ask_spread`` and ``price``.
    interval : int
        Slot length in seconds; slot number is ``seconds_in_bucket // interval``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns named
        ``<feature>_<slot>_<interval>s_wide`` plus a plain ``time_id`` column.
    """
    df = df.assign(interval=df['seconds_in_bucket'] // interval)

    # String aggregators ('mean') are used instead of np.mean: passing the numpy
    # callable to groupby.agg is deprecated in recent pandas releases.
    df_agg = df.groupby(['time_id', 'interval']).agg(
        wap1_log_high_low=pd.NamedAgg(column='wap1', aggfunc=_logHighLow),
        wap2_log_high_low=pd.NamedAgg(column='wap2', aggfunc=_logHighLow),
        ask1_bid1_spread_avg=pd.NamedAgg(column='ask1_bid1_spread', aggfunc='mean'),
        bid_spread_avg=pd.NamedAgg(column='bid_spread', aggfunc='mean'),
        ask_spread_avg=pd.NamedAgg(column='ask_spread', aggfunc='mean'),
        price_avg=pd.NamedAgg(column='price', aggfunc='mean'),
    ).reset_index()

    df_wide = pd.pivot_table(
        df_agg,
        values=['wap1_log_high_low', 'wap2_log_high_low', 'ask1_bid1_spread_avg',
                'price_avg', 'bid_spread_avg', 'ask_spread_avg'],
        index='time_id',
        columns='interval',
    ).reset_index().fillna(0)

    # Flatten the (feature, slot) column MultiIndex; ('time_id', '') becomes
    # 'time_id_', which is why the rename below looks for a double underscore.
    df_wide.columns = ['_'.join(str(e) for e in col) for col in df_wide.columns]
    return df_wide.add_suffix(f'_{interval}s_wide').rename(
        columns={f'time_id__{interval}s_wide': 'time_id'})


def _buildDataForSingleStockId(stock_id_folder, bookFolder, tradeFolder, interval):
    """Build the wide feature frame for one ``stock_id=<id>`` partition folder.

    Reads the book and trade parquet data for the stock, expands the book to a
    full per-second grid, derives price features, joins trade prices and
    aggregates everything with ``_aggregateBookData``.

    Returns a DataFrame with one row per time_id, keyed by
    ``row_id = '<stock_id>-<time_id>'`` (the ``time_id`` column itself is dropped).
    """
    keys = ['time_id', 'seconds_in_bucket']

    # os.path.join works whether or not the folder argument ends with a separator.
    bookData = pd.read_parquet(os.path.join(bookFolder, stock_id_folder))
    tradeData = pd.read_parquet(os.path.join(tradeFolder, stock_id_folder))

    # Every time_id present in the book crossed with every second 0..599.
    allTimes = pd.MultiIndex.from_product(
        [bookData['time_id'].unique(), range(_SECONDS_PER_BUCKET)], names=keys
    ).to_frame(index=False)

    # Ensure all seconds are accounted for (missing seconds appear as NaN rows).
    bookData = (
        bookData.merge(allTimes, on=keys, how='outer')
        .sort_values(by=keys)
        .reset_index(drop=True)
    )

    # Carry the last known book state forward inside each time_id, then back-fill
    # the seconds that precede the first observation of that time_id.
    value_cols = [col for col in bookData.columns if col not in keys]
    bookData[value_cols] = bookData.groupby('time_id')[value_cols].ffill()
    bookData[value_cols] = bookData.groupby('time_id')[value_cols].bfill()

    # Weighted average price for book levels 1 and 2.
    bookData['wap1'] = (
        bookData['bid_price1'] * bookData['ask_size1'] + bookData['ask_price1'] * bookData['bid_size1']
    ) / (bookData['ask_size1'] + bookData['bid_size1'])
    bookData['wap2'] = (
        bookData['bid_price2'] * bookData['ask_size2'] + bookData['ask_price2'] * bookData['bid_size2']
    ) / (bookData['ask_size2'] + bookData['bid_size2'])
    bookData['ask1_bid1_spread'] = bookData['ask_price1'] / bookData['bid_price1'] - 1
    # Normalised distance between level 1 and level 2 on each side of the book.
    bookData['bid_spread'] = (bookData['bid_price1'] - bookData['bid_price2']) / (bookData['bid_price1'] + bookData['bid_price2'])
    bookData['ask_spread'] = (bookData['ask_price2'] - bookData['ask_price1']) / (bookData['ask_price1'] + bookData['ask_price2'])

    # Seconds without a trade get price 0, so the per-slot price mean is > 0
    # only when at least one trade fell into that slot.
    merged = pd.merge(bookData, tradeData[keys + ['price']], on=keys, how='left').fillna(0)

    finalBookData = _aggregateBookData(merged, interval)
    stock_id = stock_id_folder.split('=', 1)[1]
    finalBookData['row_id'] = stock_id + '-' + finalBookData['time_id'].astype(str)

    return finalBookData.drop(columns='time_id').fillna(0)


def buildData(bookFolder, tradeFolder, interval=10):
    """Build one wide feature row per (stock_id, time_id) from book and trade data.

    The folders passed in are the ones that are read: the function does not
    consult any notebook-level globals, so it can be pointed at train or test
    data alike.

    Parameters
    ----------
    bookFolder : str
        Folder containing one ``stock_id=<id>`` sub-folder of order-book
        parquet data per stock (trailing separator optional).
    tradeFolder : str
        Folder with the matching ``stock_id=<id>`` trade parquet sub-folders.
    interval : int, default 10
        Length in seconds of the slots the per-second features are aggregated into.

    Returns
    -------
    pandas.DataFrame
        Concatenation of every stock's features, with a ``row_id`` column of
        the form ``'<stock_id>-<time_id>'`` and NaNs replaced by 0.

    Raises
    ------
    ValueError
        If ``bookFolder`` contains no ``stock_id=*`` entries.
    """
    # Sorted so the row order (and any seeded split made from it) does not
    # depend on the filesystem's listing order.
    stock_folders = sorted(
        name for name in os.listdir(bookFolder) if name.startswith('stock_id=')
    )
    if not stock_folders:
        raise ValueError(f"No 'stock_id=*' folders found in {bookFolder!r}")

    results = [
        _buildDataForSingleStockId(folder, bookFolder, tradeFolder, interval)
        for folder in stock_folders
    ]
    return pd.concat(results, ignore_index=True).fillna(0)

In [None]:
# Relative locations of the partitioned order-book and trade training data.
bookTrainFolder, tradeTrainFolder = 'book_train.parquet/', 'trade_train.parquet/'

# Report how many entries the book folder holds before building features.
print(f"Number of files in booktrain: {len(os.listdir(bookTrainFolder))}")

# Wide per-(stock_id, time_id) feature frame for the training data.
trainData1 = buildData(bookFolder=bookTrainFolder, tradeFolder=tradeTrainFolder)

In [None]:
# Read train.csv; it is joined onto the features by stock_id / time_id below.
train_csv = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Recover the integer keys encoded in 'row_id' ("<stock_id>-<time_id>") with a
# single vectorized split instead of a per-row Python apply/map/zip round trip.
# A row_id that does not have exactly two parts raises here instead of passing silently.
trainData1[['stock_id', 'time_id']] = (
    trainData1['row_id'].str.split('-', expand=True).astype('int64')
)

# Left join keeps every feature row; rows with no match in train_csv get NaN
# in the columns that come from train_csv.
result = (
    pd.merge(trainData1, train_csv, on=['stock_id', 'time_id'], how='left')
    .drop(columns=['row_id'])
    .astype({'stock_id': int, 'time_id': int})
)
result.head()

In [None]:
# Columns whose name starts with 'price' are turned into 0/1-style flags:
# any strictly positive value becomes 1, everything else (including NaN) is kept.
columns_to_replace = [col for col in trainData1.columns if col.startswith('price')]

# Vectorized replacement for the deprecated element-wise DataFrame.applymap;
# the flagged columns consistently stay floating point.
result[columns_to_replace] = result[columns_to_replace].mask(result[columns_to_replace] > 0, 1)

In [None]:

# Three-way split: hold out 20% of the rows first, then halve that hold-out
# into validation and test parts. Both splits use the same fixed seed.
train_data, temp_data = train_test_split(
    result,
    test_size=0.2,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

# Persist each part as its own parquet file in the working directory.
train_data.to_parquet('train.parquet')
val_data.to_parquet('val.parquet')
test_data.to_parquet('test.parquet')