In [24]:
import os
from datetime import datetime
import modin.pandas as pd
from collections import defaultdict
import re

# Directory the notebook kernel is running in.
BASE_PATH = os.getcwd()
# "data" directory that sits next to BASE_PATH (i.e. <parent of cwd>/data/).
# Built with os.path so the separator matches the platform; the empty last
# component keeps the trailing separator, because later cells append relative
# file names to DATA_PATH with plain string concatenation.
DATA_PATH = os.path.join(os.path.dirname(BASE_PATH), 'data', '')

In [9]:
# Load the dbeq-basic mbp-10 parquet for 2023-12-14, one frame per ticker.
GOOG_FILE_PATH = DATA_PATH + "GOOG/dbeq-basic-20231214.mbp-10.parquet"
goog_df = pd.read_parquet(GOOG_FILE_PATH)

# This path used to be a verbatim copy of GOOG_FILE_PATH, so googl_df was just
# a second copy of the GOOG data.
# NOTE(review): assumes the GOOGL file lives under DATA_PATH/GOOGL/ with the
# same file name as the GOOG one -- confirm against the data directory.
GOOGL_FILE_PATH = DATA_PATH + "GOOGL/dbeq-basic-20231214.mbp-10.parquet"
googl_df = pd.read_parquet(GOOGL_FILE_PATH)



In [None]:
# Column names of interest: event metadata first, then the level-00
# bid/ask price, size and count fields.
COLS = [
    'ts_recv',
    'ts_event',
    'action',
    'side',
    'depth',
    'price',
    'size',
    'ts_in_delta',
    'bid_px_00',
    'ask_px_00',
    'bid_sz_00',
    'ask_sz_00',
    'bid_ct_00',
    'ask_ct_00',
]

# Merge based on ts_recv.

In [22]:
def time_is_after(timestamp, hour: str):
    """Return True when the clock time of ``timestamp`` is strictly after ``hour``.

    Only the time of day is compared; the date part is validated but otherwise
    ignored. The comparison uses the wall-clock time exactly as it appears in
    ``str(timestamp)`` -- any trailing UTC offset is not applied.

    Args:
        timestamp: Any object whose ``str()`` starts with
            ``YYYY-MM-DD HH:MM:SS``, optionally followed by fractional seconds
            of any precision (e.g. nanoseconds).
        hour: Cut-off time of day formatted as ``HH:MM:SS``.

    Returns:
        bool: ``True`` if the timestamp's time of day is later than ``hour``,
        ``False`` if it is earlier or equal.

    Raises:
        ValueError: If ``timestamp`` does not start with the expected layout,
            or if ``hour`` is not a valid ``HH:MM:SS`` string.
    """
    # Date and clock time are mandatory; the fractional part is optional and
    # may carry any number of digits. Whatever follows (e.g. an offset) is
    # ignored.
    pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})(?:\.(\d+))?'
    match = re.match(pattern, str(timestamp))
    if not match:
        raise ValueError("Invalid timestamp format")

    whole_seconds, fraction = match.groups()
    # datetime resolves microseconds only: truncate extra digits and
    # right-pad short fractions so ".5" means 500000 microseconds.
    microseconds = int((fraction or '')[:6].ljust(6, '0'))
    dt_time = (
        datetime.strptime(whole_seconds, '%Y-%m-%d %H:%M:%S')
        .replace(microsecond=microseconds)
        .time()
    )
    time_obj = datetime.strptime(hour, '%H:%M:%S').time()

    # "After" means strictly later than the cut-off.
    return dt_time > time_obj

In [None]:
def process_single(df, start_minutes, end_minutes):
    """Keep only the rows whose ``ts_recv`` falls inside a time-of-day window.

    The window is expressed in minutes since midnight and is inclusive on both
    ends at minute granularity (a row at ``end_minutes`` plus 59 seconds is
    still kept). The minute of day is read from ``ts_recv`` as-is: no timezone
    conversion happens here, so the caller must convert the column to the
    timezone the window is expressed in beforehand.

    Args:
        df: Frame with a datetime-like ``ts_recv`` column.
        start_minutes: First minute of the day to keep (e.g. 570 for 09:30).
        end_minutes: Last minute of the day to keep (e.g. 960 for 16:00).

    Returns:
        A new frame containing only the rows inside the window; ``df`` itself
        is not modified.
    """
    # Previously the filter was commented out and the window arguments were
    # silently ignored, so every row was returned.
    minute_of_day = df["ts_recv"].dt.hour * 60 + df["ts_recv"].dt.minute
    in_window = (minute_of_day >= start_minutes) & (minute_of_day <= end_minutes)

    return df[in_window]

In [None]:
def merge_pairs(dfA: pd.DataFrame, dfB: pd.DataFrame, start_time: str, end_time: str) -> pd.DataFrame:
    """Inner-join two frames on ``ts_recv`` after converting them to US/Eastern.

    Both frames get their ``ts_recv`` and ``ts_event`` columns converted to
    the ``US/Eastern`` timezone, are passed through ``process_single`` with
    the requested time-of-day window, and are then joined on exact
    ``ts_recv`` equality.

    The conversion is done on copies, so the caller's frames are left
    untouched (they used to be overwritten in place).

    Args:
        dfA: First frame; needs tz-aware ``ts_recv`` and ``ts_event`` columns
            (``tz_convert`` raises ``TypeError`` on tz-naive data).
        dfB: Second frame, same requirements as ``dfA``.
        start_time: Window start as ``"HH:MM"``; further ``:``-separated
            fields such as seconds are ignored.
        end_time: Window end, same format as ``start_time``.

    Returns:
        The inner join of both processed frames on ``ts_recv``. Column names
        present in both inputs get the default merge suffixes ``_x`` / ``_y``.
    """
    def _minutes_since_midnight(clock: str) -> int:
        # Only the hour and minute fields are used.
        fields = clock.split(":")
        return int(fields[0]) * 60 + int(fields[1])

    def _to_eastern(frame):
        # assign() returns a new frame, so the caller's data is not mutated.
        return frame.assign(
            ts_recv=frame['ts_recv'].dt.tz_convert('US/Eastern'),
            ts_event=frame['ts_event'].dt.tz_convert('US/Eastern'),
        )

    start_minutes = _minutes_since_midnight(start_time)
    end_minutes = _minutes_since_midnight(end_time)

    eastern_a = process_single(_to_eastern(dfA), start_minutes, end_minutes)
    eastern_b = process_single(_to_eastern(dfB), start_minutes, end_minutes)

    return pd.merge(eastern_a, eastern_b, on='ts_recv', how='inner')

In [None]:
def merge(time_unit: str):
    """Build a time-bucketed, side-by-side frame of GOOG and GOOGL data.

    For each ticker the parquet file under ``base_path`` is read with cudf,
    rows with ``price`` >= 500 are dropped, ``ts_recv`` is cast to
    millisecond resolution and rounded up to the next multiple of
    ``time_unit``, and the remaining columns are averaged per bucket. The two
    bucketed frames are then inner-joined on ``ts_recv``.

    NOTE(review): ``base_path`` and ``cudf`` are not defined in the cells
    visible here -- confirm they are provided elsewhere in the notebook.
    NOTE(review): the 500 price cut-off presumably removes bad or outlier
    prints -- confirm the intent.

    Args:
        time_unit: Frequency string handed to ``.dt.ceil`` to define the
            bucket width.

    Returns:
        The joined frame sorted by ``ts_recv``, with ``ts_recv`` as a regular
        column and overlapping column names suffixed ``_goog`` / ``_googl``.
    """
    def _load_bucketed(parquet_path):
        # Read one ticker, drop the high-priced rows, then average per bucket.
        quotes = cudf.read_parquet(parquet_path)
        quotes = quotes[quotes['price'] < 500]
        quotes['ts_recv'] = quotes['ts_recv'].astype('datetime64[ms]').dt.ceil(time_unit)
        return quotes.groupby('ts_recv').mean()

    goog_bucketed = _load_bucketed(f"{base_path}/GOOG.parquet")
    googl_bucketed = _load_bucketed(f"{base_path}/GOOGL.parquet")

    joined = cudf.merge(
        goog_bucketed,
        googl_bucketed,
        on='ts_recv',
        how='inner',
        suffixes=('_goog', '_googl'),
    )

    # groupby left ts_recv as the index; restore it as a column before sorting.
    return joined.reset_index().sort_values(by='ts_recv')