In [4]:
import polars as pl
from polars_order_book import calculate_bbo


In [38]:
# Number of times the 12-event cycle below is repeated.
n = 1

# One cycle of order-book updates: three bid levels and three ask levels are
# added (positive qty), then the same six levels receive the opposite
# (negative) qty. Per `expected_values` below, a negative qty is expected to
# remove the level, so the book is empty again at the end of each cycle.
market_data = pl.DataFrame(
    {
        'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] * n,
        'price': [1, 2, 3, 6, 5, 4, 3, 1, 2, 5, 4, 6] * n,
        'qty': [1, 2, 3, 6, 5, 4, -3, -1, -2, -5, -4, -6] * n,
        'is_bid': [True, True, True, False, False, False, True, True, True, False, False, False] * n,
    },
    schema={
        'id': pl.Int8,
        'price': pl.Int64,
        'qty': pl.Int64,
        'is_bid': pl.Boolean,
    },
)

# `calculate_bbo` yields a struct column; unnest it into top-level columns
# (the assertion cell reads best_bid / best_ask / best_bid_qty / best_ask_qty).
market_data = market_data.with_columns(
    bbo=calculate_bbo('price', 'qty', 'is_bid')
).unnest('bbo')

# Hand-derived best bid / best ask (price and qty) after each event of ONE cycle.
expected_values = {
    'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'best_bid': [1, 2, 3, 3, 3, 3, 2, 2, None, None, None, None],
    'best_ask': [None, None, None, 6, 5, 4, 4, 4, 4, 4, 6, None],
    'best_bid_qty': [1, 2, 3, 3, 3, 3, 2, 2, None, None, None, None],
    'best_ask_qty': [None, None, None, 6, 5, 4, 4, 4, 4, 4, 6, None],
}

# Tile the single-cycle expectation `n` times, exactly like the inputs above,
# instead of replicating it through a join on 'id': the tiled frame has a
# well-defined row order, whereas an inner join's output order is not something
# to rely on. Dtypes are copied from the computed frame so only values are compared.
expected = pl.DataFrame(
    {name: values * n for name, values in expected_values.items()},
    schema={
        name: dtype
        for name, dtype in market_data.schema.items()
        if name in expected_values
    },
)

In [40]:
# Check the computed BBO columns against the hand-derived expectation.
# `assert_frame_equal` is imported from its public location, `polars.testing`,
# rather than the internal `polars.testing.asserts` submodule.
from polars.testing import assert_frame_equal

actual_bbo = market_data.select('id', 'best_bid', 'best_ask', 'best_bid_qty', 'best_ask_qty')

# Column order is not part of the contract being tested, only names and values.
assert_frame_equal(actual_bbo, expected, check_column_order=False)

AssertionError: DataFrames are different (value mismatch for column 'best_ask')
[left]:  [None, None, None, 6, 5, 4, 4, 4, 4, 4, 6, None]
[right]: [None, None, None, 6, 5, 4, 5, 4, 4, 4, 6, None]

In [3]:
%%timeit 
market_data.with_columns(
    bbo=calculate_bbo('price', 'qty', 'is_bid')
).unnest('bbo')

44.2 s ± 122 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Benchmark input for a single order book: 12 * n rows when n is even.
n = 10_000_000

# The data repeats a 24-row pattern made of two 12-row cycles, so it is sized
# by the number of *pairs* of cycles. Every column is derived from this one
# count, which keeps the column lengths equal even when `n` is odd (sizing
# 'is_bid' by `n` and the other columns by `n // 2` only agrees for even `n`).
n_cycle_pairs = n // 2

# One 12-row cycle of price levels and sides (True = bid, False = ask).
cycle_prices = [1, 2, 3, 6, 5, 4, 3, 1, 2, 5, 4, 6]
cycle_sides = [True, True, True, False, False, False, True, True, True, False, False, False]

# First cycle uses qty == price (positive); the second cycle repeats the same
# levels with the qty negated, so the two cycles net out to zero per level.
pair_qtys = cycle_prices + [-qty for qty in cycle_prices]

market_data = pl.DataFrame(
    {
        'price': (cycle_prices * 2) * n_cycle_pairs,
        'qty': pair_qtys * n_cycle_pairs,
        'is_bid': (cycle_sides * 2) * n_cycle_pairs,
    },
    schema={
        'price': pl.Int64,
        'qty': pl.Int64,
        'is_bid': pl.Boolean,
    },
)

In [28]:
%timeit market_data.with_columns(bbo=calculate_bbo('price', 'qty', 'is_bid')).unnest('bbo')

3.9 s ± 32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# Benchmark input for several order books: 12 * n rows, tagged by symbol.
n = 10_000_000

# Each symbol label is repeated 12 times so that one full 12-row cycle of
# updates belongs to a single symbol: 5 symbols * 12 rows = 60 labels.
# Built with a comprehension rather than `sum(list_of_lists, [])`, which
# re-copies the accumulated list on every addition.
symbols = [symbol for symbol in ['A', 'B', 'C', 'D', 'E'] for _ in range(12)]

market_data = pl.DataFrame(
    {
        'price': [1, 2, 3, 6, 5, 4, 3, 1, 2, 5, 4, 6] * n,
        'qty': [1, 2, 3, 6, 5, 4, -3, -1, -2, -5, -4, -6] * n,
        'is_bid': [True, True, True, False, False, False, True, True, True, False, False, False] * n,
        # 60 labels per repetition, so n // 5 repetitions give 12 * n labels
        # (exact only when n is a multiple of 5).
        'symbol': symbols * (n // 5),
    },
    schema={
        'price': pl.Int64,
        'qty': pl.Int64,
        'is_bid': pl.Boolean,
        'symbol': pl.Utf8,
    },
)

In [5]:
%%timeit
market_data.group_by('symbol').agg(
    calculate_bbo('price', 'qty', 'is_bid').alias('bbo')
).explode('bbo').unnest('bbo')

46.8 s ± 326 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
