Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into siyuan/cdc-backfill-f…
Browse files Browse the repository at this point in the history
…ramework-single-table
  • Loading branch information
StrikeW committed Aug 30, 2023
2 parents 040582b + d1fe176 commit 05d3de8
Show file tree
Hide file tree
Showing 11 changed files with 494 additions and 11 deletions.
42 changes: 36 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

289 changes: 289 additions & 0 deletions e2e_test/streaming/aggregate/hdr_approx_percentile.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# --------
# Overview
# --------
#
# This is a demo test to show how to use hdr_histogram (http://hdrhistogram.org/)
# to compute `approx_percentile` in the RisingWave stream engine.
# It supports non-append-only streams, i.e. streams with updates and deletes are supported.
#
# The way it works is that hdr_histogram stores N significant digits of each value,
# thereby reducing the number of buckets needed to store the distribution.
#
# ----------------------
# Implementation details
# ----------------------
#
# First we create the histogram itself, where each value is encoded as a triple:
# 1. The sign of the value (1 or -1)
# 2. The exponent of the value (the power of 10, truncated to an integer)
# 3. The significand of the value (the digits after the decimal point, truncated to $PRECISION digits)
#
# With the following parameters:
# precision=2 (significand is 2 digits)
# value=123456
#
# The exponent will be 5 (log10(123456) = 5.0915... ~ 5)
# The significand will be 23 (123456 / 10^5 - 1 = 0.23... ).
# Then the histogram will store the triple (1, 5, 23)
#
# NOTE(kwannoel): We don't have to use base-10 for the exponent and significand.
# But RisingWave currently only supports log10, ln.
# So we use base-10 for convenience, and because it's more intuitive for the examples here.
#
# The bucket is computed as:
# (sign*(1.0+mantissa/pow(10.0, 2))*pow(10.0,exponent))::int
# (1 * (1.0 + 23 / 10^2) * 10^5) = 123000
#
# Next, we use window sum aggregation to compute the cumulative frequency of each bucket.
# | Bucket | 4 | 311 | 400 | 521 |
# | Counts | 3 | 6 | 7 | 8 |
# | C. Frequency | 3 | 9 | 16 | 24 |
#
# Finally we can compute the approximate percentile
# 1. Compute the sum of all frequencies. (24)
# 2. Select a percentile (90%)
# 3. Compute the frequency corresponding to the percentile (24 * 0.9 = 21.6)
# 4. Find the first bucket with the cumulative frequency >= 21.6 (521)
#
# Step 3 and 4 are required for us to use dynamic filter,
# since we don't support complex expressions in the dynamic filter.
# In the future when our optimizer can simplify the dynamic filter, we can avoid it.
#
# ----------
# Test Notes
# ----------
#
# We will test the following parameters:
# - 90 percentile.
# - 50 percentile.
# - Deletes.
# - Compare with percentile_disc. The significand should match.

statement ok
CREATE TABLE input (value BIGINT);

statement ok
INSERT INTO input SELECT * FROM generate_series(1,10);

statement ok
INSERT INTO input SELECT * FROM generate_series(1000, 1100);

statement ok
INSERT INTO input SELECT * FROM generate_series(10000, 11000);

statement ok
INSERT INTO input SELECT * FROM generate_series(100000, 110000);

statement ok
flush;

# Here `pow(10.0, 4)` is responsible for setting the precision of the histogram.
# "2" is the precision.
statement ok
CREATE MATERIALIZED VIEW hdr_histogram AS
SELECT
CASE WHEN value<0 THEN -1 ELSE 1 END AS sign,
trunc(log10(value))::int AS exponent,
trunc(
pow(10.0, 4)
* (value / pow(10.0, trunc(log10(value))::int) - 1.0))::int AS mantissa,
count(*) AS frequency,
1 as dummy
FROM input
GROUP BY sign, exponent, mantissa;

statement ok
CREATE MATERIALIZED VIEW hdr_sum AS
SELECT
sum(frequency) AS sum_frequency
FROM hdr_histogram;

# Here `pow(10.0, 4)` is responsible for recovering the position of the mantissa, with the precision.
# "2" is the precision.
statement ok
CREATE MATERIALIZED VIEW hdr_distribution AS
SELECT
(sign*(1.0+mantissa/pow(10.0, 4))*pow(10.0,exponent))::int AS bucket,
(sum(frequency)
OVER (PARTITION BY NULL
ORDER BY (sign, exponent, mantissa)
ROWS UNBOUNDED PRECEDING)
) AS cumulative_frequency
FROM hdr_histogram g
GROUP BY sign, exponent, mantissa, frequency
ORDER BY cumulative_frequency;

statement ok
CREATE INDEX hdr_distribution_idx ON hdr_distribution (cumulative_frequency);

########## 90-percentile ##########

statement ok
CREATE MATERIALIZED VIEW frequency_at_90 AS
SELECT sum_frequency * 0.9 AS scaled_sum_freq FROM hdr_sum;

statement ok
CREATE MATERIALIZED VIEW approx_percentile_90_percent AS
SELECT bucket AS approximate_percentile
FROM hdr_distribution x, frequency_at_90 y
WHERE x.cumulative_frequency >= y.scaled_sum_freq
ORDER BY cumulative_frequency
LIMIT 1;

query I
SELECT * FROM approx_percentile_90_percent;
----
108880

query I
SELECT percentile_disc(0.9) WITHIN GROUP (ORDER BY value) FROM input;
----
108889

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_90_percent, (SELECT percentile_disc(0.9) WITHIN GROUP (ORDER BY value) FROM input);
----
t

statement ok
DROP MATERIALIZED VIEW approx_percentile_90_percent;

statement ok
DROP MATERIALIZED VIEW frequency_at_90;

########## 50-percentile ##########

statement ok
CREATE MATERIALIZED VIEW frequency_at_50 AS
SELECT sum_frequency * 0.5 AS scaled_sum_freq FROM hdr_sum;

statement ok
CREATE MATERIALIZED VIEW approx_percentile_50_percent AS
SELECT bucket AS approximate_percentile
FROM hdr_distribution x, frequency_at_50 y
WHERE x.cumulative_frequency >= y.scaled_sum_freq
ORDER BY cumulative_frequency
LIMIT 1;

query I
SELECT * FROM approx_percentile_50_percent;
----
104440

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
104444

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t

### Delete > 100000

statement ok
DELETE FROM input WHERE value > 100000;

statement ok
flush;

query I
SELECT * FROM approx_percentile_50_percent;
----
10445

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
10445

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t

### Delete > 10000

statement ok
DELETE FROM input WHERE value > 10000;

statement ok
flush;

query I
SELECT * FROM approx_percentile_50_percent;
----
1045

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
1045

query B
SELECT
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t


### Delete > 100

statement ok
DELETE FROM input WHERE value > 100;

statement ok
flush;

query I
SELECT * FROM approx_percentile_50_percent;
----
5

query I
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input;
----
5

query B
SELECT
percentile_disc = (approximate_percentile::int)
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input);
----
t

### Cleanup 50-percentile

statement ok
DROP MATERIALIZED VIEW approx_percentile_50_percent;

statement ok
DROP MATERIALIZED VIEW frequency_at_50;

########## Cleanup ##########

statement ok
DELETE FROM input WHERE value < 1400;

statement ok
DROP INDEX hdr_distribution_idx;

statement ok
DROP MATERIALIZED VIEW hdr_distribution;

statement ok
DROP MATERIALIZED VIEW hdr_sum;

statement ok
DROP MATERIALIZED VIEW hdr_histogram;

statement ok
DROP TABLE input;
Loading

0 comments on commit 05d3de8

Please sign in to comment.