In [12]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Step 1: Load and preprocess event-level data
df = pd.read_csv("first_25000_rows.csv")
df["ts_event"] = pd.to_datetime(df["ts_event"])
df = df.sort_values("ts_event")
df["action"] = df["action"].str.upper()
df["side"] = df["side"].str.upper()

# Step 2: Define signed ∆q (Section 2.1)
def delta_q(row, prev_price, prev_size):
    if pd.isna(prev_price):
        return 0
    if row["side"] == "B":
        if row["price"] > prev_price:
            return row["size"]
        elif row["price"] == prev_price:
            return row["size"] - prev_size
        else:
            return -prev_size
    else:  # side == "A"
        if row["price"] > prev_price:
            return -prev_size
        elif row["price"] == prev_price:
            return row["size"] - prev_size
        else:
            return row["size"]

# Step 3: Compute ∆q for each (symbol, depth, side)
df["delta_q"] = 0.0
for (symbol, depth, side), group in df.groupby(["symbol", "depth", "side"]):
    group = group.sort_values("ts_event")
    prev_price = group["price"].shift(1)
    prev_size = group["size"].shift(1)
    df.loc[group.index, "delta_q"] = group.apply(
        lambda row: delta_q(row, prev_price.loc[row.name], prev_size.loc[row.name]), axis=1
    )

# Step 4: Aggregate OFI features
ofi_features = df.groupby(["ts_event", "symbol"])

# Constructed OFI: sum over all ∆q (all levels)
ofi_df = ofi_features["delta_q"].sum().reset_index(name="OFI_constructed")

# Best-Level OFI (depth == 0 only)
ofi_best = df[df["depth"] == 0].groupby(["ts_event", "symbol"])["delta_q"].sum().reset_index(name="OFI_best")

# Multi-Level OFI: sum over the given order book depth
ofi_multi = df[df["depth"] <= 10].groupby(["ts_event", "symbol"])["delta_q"].sum().reset_index(name="OFI_multi")

# Merge into single table
ofi_all = ofi_df.merge(ofi_best, on=["ts_event", "symbol"], how="left") \
                .merge(ofi_multi, on=["ts_event", "symbol"], how="left")

# Integrated OFI via PCA (on level-wise ∆q)
ofi_matrix = df.pivot_table(index="ts_event", columns="depth", values="delta_q", aggfunc="sum").fillna(0)
pca = PCA(n_components=1)
ofi_all["OFI_integrated"] = pca.fit_transform(ofi_matrix)

# Cross-Asset OFI: sum of OFI_multi across symbols (if multiple symbols exist)
if ofi_all["symbol"].nunique() > 1:
    ofi_all["OFI_cross"] = ofi_all.groupby("ts_event")["OFI_multi"].transform("sum")
else:
    ofi_all["OFI_cross"] = np.nan

ofi_all

Unnamed: 0,ts_event,symbol,OFI_constructed,OFI_best,OFI_multi,OFI_integrated,OFI_cross
0,2024-10-21 11:54:29.221064336+00:00,AAPL,0.0,,0.0,1.675065,
1,2024-10-21 11:54:29.223769812+00:00,AAPL,0.0,0.0,0.0,1.675065,
2,2024-10-21 11:54:29.225030400+00:00,AAPL,1.0,1.0,1.0,1.675720,
3,2024-10-21 11:54:29.712434212+00:00,AAPL,0.0,,0.0,1.675065,
4,2024-10-21 11:54:29.764673165+00:00,AAPL,0.0,,0.0,1.675065,
...,...,...,...,...,...,...,...
4805,2024-10-21 13:04:16.583527688+00:00,AAPL,0.0,,0.0,1.675065,
4806,2024-10-21 13:04:17.976461017+00:00,AAPL,-5.0,,-5.0,1.635200,
4807,2024-10-21 13:04:20.085638629+00:00,AAPL,0.0,,0.0,1.675065,
4808,2024-10-21 13:04:20.085651109+00:00,AAPL,-200.0,,-200.0,1.470591,
