Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test(streaming): test hdr approach for approx_percentile #11931

Merged
merged 10 commits into from
Aug 30, 2023
289 changes: 289 additions & 0 deletions e2e_test/streaming/aggregate/hdr_approx_percentile.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# --------
# Overview
# --------
#
# This is a demo test to show how to use hdr_histogram (http://hdrhistogram.org/)
# to compute `approx_percentile` in the RisingWave stream engine.
# It supports non-append-only streams, i.e. streams with updates and deletes are supported.
#
# The way it works is that hdr_histogram stores N significant digits of each value,
# thereby reducing the number of buckets needed to store the distribution.
#
# ----------------------
# Implementation details
# ----------------------
#
# First we create the histogram itself, where each value is encoded as a triple:
# 1. The sign of the value (1 or -1)
# 2. The exponent of the value (the power of 10, truncated to an integer)
# 3. The significand of the value (the digits after the decimal point, truncated to $PRECISION digits)
#
# With the following parameters:
# precision=2 (significand is 2 digits)
# value=123456
#
# The exponent will be 5 (log10(123456) = 5.0915... ~ 5)
# The significand will be 23 (123456 / 10^5 - 1 = 0.23... ).
# Then the histogram will store the triple (1, 5, 23)
#
# NOTE(kwannoel): We don't have to use base-10 for the exponent and significand.
# But RisingWave currently only supports log10, ln.
# So we use base-10 for convenience, and because it's more intuitive for the examples here.
#
# The bucket is computed as:
# (sign*(1.0+mantissa/pow(10.0, 2))*pow(10.0,exponent))::int
# (1 * (1.0 + 23 / 10^2) * 10^5) = 123000
#
# Next, we use window sum aggregation to compute the cumulative frequency of each bucket.
# | Bucket | 4 | 311 | 400 | 521 |
# | Counts | 3 | 6 | 7 | 8 |
# | C. Frequency | 3 | 9 | 16 | 24 |
#
# Finally we can compute the approximate percentile
# 1. Compute the sum of all frequencies. (24)
# 2. Select a percentile (90%)
# 3. Compute the frequency corresponding to the percentile (24 * 0.9 = 21.6)
# 4. Find the first bucket with the cumulative frequency >= 21.6 (521)
#
# Step 3 and 4 are required for us to use dynamic filter,
# since we don't support complex expressions in the dynamic filter.
# In the future when our optimizer can simplify the dynamic filter, we can avoid it.
#
# ----------
# Test Notes
# ----------
#
# We will test the following parameters:
# - 90 percentile.
# - 50 percentile.
# - Deletes.
# - Compare with percentile_disc. The significand should match.

statement ok
CREATE TABLE input (value BIGINT);

statement ok
INSERT INTO input SELECT * FROM generate_series(1,10);

statement ok
INSERT INTO input SELECT * FROM generate_series(1000, 1100);

statement ok
INSERT INTO input SELECT * FROM generate_series(10000, 11000);

statement ok
INSERT INTO input SELECT * FROM generate_series(100000, 110000);

statement ok
flush;

# Here `pow(10.0, 4)` is responsible for setting the precision of the histogram.
# "2" is the precision.
statement ok
CREATE MATERIALIZED VIEW hdr_histogram AS
SELECT
CASE WHEN value<0 THEN -1 ELSE 1 END AS sign,
trunc(log10(value))::int AS exponent,
trunc(
pow(10.0, 4)
* (value / pow(10.0, trunc(log10(value))::int) - 1.0))::int AS mantissa,
count(*) AS frequency,
1 as dummy
FROM input
GROUP BY sign, exponent, mantissa;

statement ok
CREATE MATERIALIZED VIEW hdr_sum AS
SELECT
sum(frequency) AS sum_frequency
FROM hdr_histogram;

# Here `pow(10.0, 4)` is responsible for recovering the position of the mantissa, with the precision.
# "2" is the precision.
statement ok
CREATE MATERIALIZED VIEW hdr_distribution AS
SELECT
(sign*(1.0+mantissa/pow(10.0, 4))*pow(10.0,exponent))::int AS bucket,
(sum(frequency)
OVER (PARTITION BY NULL
ORDER BY (sign, exponent, mantissa)
ROWS UNBOUNDED PRECEDING)
) AS cumulative_frequency
FROM hdr_histogram g
GROUP BY sign, exponent, mantissa, frequency
ORDER BY cumulative_frequency;

statement ok
CREATE INDEX hdr_distribution_idx ON hdr_distribution (cumulative_frequency);

########## 90-percentile ##########

statement ok
CREATE MATERIALIZED VIEW frequency_at_90 AS
SELECT sum_frequency * 0.9 AS scaled_sum_freq FROM hdr_sum;

statement ok
CREATE MATERIALIZED VIEW approx_percentile_90_percent AS
SELECT bucket AS approximate_percentile
FROM hdr_distribution x, frequency_at_90 y
WHERE x.cumulative_frequency >= y.scaled_sum_freq
ORDER BY cumulative_frequency
LIMIT 1;

query I
SELECT * FROM approx_percentile_90_percent;
----
108880

query I
SELECT percentile_disc(0.9) WITHIN GROUP (ORDER BY value) FROM input;
----
108889

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_90_percent, (SELECT percentile_disc(0.9) WITHIN GROUP (ORDER BY value) FROM input);
----
t

statement ok
DROP MATERIALIZED VIEW approx_percentile_90_percent;

statement ok
DROP MATERIALIZED VIEW frequency_at_90;

########## 50-percentile ##########

statement ok
CREATE MATERIALIZED VIEW frequency_at_50 AS
SELECT sum_frequency * 0.5 AS scaled_sum_freq FROM hdr_sum;

statement ok
CREATE MATERIALIZED VIEW approx_percentile_50_percent AS
SELECT bucket AS approximate_percentile
FROM hdr_distribution x, frequency_at_50 y
WHERE x.cumulative_frequency >= y.scaled_sum_freq
ORDER BY cumulative_frequency
LIMIT 1;

query I
SELECT * FROM approx_percentile_50_percent;
----
104440

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
104444

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t

### Delete > 100000

statement ok
DELETE FROM input WHERE value > 100000;

statement ok
flush;

query I
SELECT * FROM approx_percentile_50_percent;
----
10445

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
10445

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t

### Delete > 10000

statement ok
DELETE FROM input WHERE value > 10000;

statement ok
flush;

query I
SELECT * FROM approx_percentile_50_percent;
----
1045

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
1045

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t


### Delete > 100

statement ok
DELETE FROM input WHERE value > 100;

statement ok
flush;

query I
SELECT * FROM approx_percentile_50_percent;
----
5

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
5

query B
SELECT
percentile_disc = (approximate_percentile::int)
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t

### Cleanup 50-percentile

statement ok
DROP MATERIALIZED VIEW approx_percentile_50_percent;

statement ok
DROP MATERIALIZED VIEW frequency_at_50;

########## Cleanup ##########

statement ok
DELETE FROM input WHERE value < 1400;

statement ok
DROP INDEX hdr_distribution_idx;

statement ok
DROP MATERIALIZED VIEW hdr_distribution;

statement ok
DROP MATERIALIZED VIEW hdr_sum;

statement ok
DROP MATERIALIZED VIEW hdr_histogram;

statement ok
DROP TABLE input;
Loading