1. Import Packages and load the cleaned dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from functools import reduce

df = pd.read_csv("../Data/cleaned_first_25000_rows.csv", parse_dates=['ts_event'])

COMPUTE ORDER FLOW IMBALANCE VALUES 

In [2]:
def compute_ofi(row):
    if row['side'] == 'B' and row['action'] == 'A':
        return row['size']
    elif row['side'] == 'B' and row['action'] == 'C':
        return -row['size']
    elif row['side'] == 'S' and row['action'] == 'A':
        return -row['size']
    elif row['side'] == 'S' and row['action'] == 'C':
        return row['size']
    return 0

df['ofi'] = df.apply(compute_ofi, axis=1)
df.set_index('ts_event', inplace=True)

COMPUTE BEST LEVEL ORDER FLOW IMBALANCE

In [3]:
ofi_best = (
    df[df['depth'] == 0]
    .groupby(['symbol', pd.Grouper(freq='1Min')])['ofi']
    .sum()
    .reset_index()
    .rename(columns={'ts_event': 'timestamp', 'ofi': 'ofi_best'})
)

COMPUTE MULTI-LEVEL ORDER FLOW IMBALANCE

In [4]:
ofi_levels = []
for lvl in range(10):
    lvl_ofi = (
        df[df['depth'] == lvl]
        .groupby(['symbol', pd.Grouper(freq='1Min')])['ofi']
        .sum()
        .reset_index()
        .rename(columns={'ts_event': 'timestamp', 'ofi': f'ofi_lvl_{lvl}'})
    )
    ofi_levels.append(lvl_ofi)

1. Merge all OFI features

In [5]:
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=['symbol', 'timestamp'], how='outer'),
    [ofi_best] + ofi_levels
)
df_merged = df_merged.sort_values(['symbol', 'timestamp']).reset_index(drop=True)

COMPUTE INTEGRATED ORDER FLOW IMBALANCE VIA PCA

In [6]:
def apply_pca(group):
    level_cols = [f'ofi_lvl_{i}' for i in range(10)]
    levels = group[level_cols].fillna(0)
    pca = PCA(n_components=1)
    group['ofi_integrated'] = pca.fit_transform(levels)
    return group

df_merged = df_merged.groupby('symbol', group_keys=False).apply(apply_pca)

  df_merged = df_merged.groupby('symbol', group_keys=False).apply(apply_pca)


COMPUTE CROSS-ASSET ORDER FORM IMBALANCE

In [7]:
# For each timestamp, compute mean integrated OFI across all symbols
def compute_cross_asset(group):
    group['ofi_cross_asset'] = group['ofi_integrated'].mean()
    return group

df_merged = df_merged.groupby('timestamp', group_keys=False).apply(compute_cross_asset)


  df_merged = df_merged.groupby('timestamp', group_keys=False).apply(compute_cross_asset)


FINAL FEATURE SET

In [8]:
df_merged.to_csv("../Data/ofi_all_features.csv", index=False)
print("Saved final OFI feature dataset to: ../Data/ofi_all_features.csv")

Saved final OFI feature dataset to: ../Data/ofi_all_features.csv


In [9]:
df_merged.head()

Unnamed: 0,symbol,timestamp,ofi_best,ofi_lvl_0,ofi_lvl_1,ofi_lvl_2,ofi_lvl_3,ofi_lvl_4,ofi_lvl_5,ofi_lvl_6,ofi_lvl_7,ofi_lvl_8,ofi_lvl_9,ofi_integrated,ofi_cross_asset
0,AAPL,2024-10-21 11:54:00+00:00,5,5,798.0,0,-1.0,0.0,0.0,0.0,,,,-580.094433,-580.094433
1,AAPL,2024-10-21 11:55:00+00:00,-516,-516,1173.0,1257,-57.0,0.0,,0.0,,,,676.386697,676.386697
2,AAPL,2024-10-21 11:56:00+00:00,-1,-1,0.0,1400,0.0,0.0,,,0.0,,,729.79527,729.79527
3,AAPL,2024-10-21 11:57:00+00:00,201,201,373.0,800,229.0,0.0,0.0,0.0,,,,84.142324,84.142324
4,AAPL,2024-10-21 11:58:00+00:00,162,162,-540.0,366,35.0,-26.0,0.0,0.0,0.0,,,-285.806836,-285.806836
