-
Notifications
You must be signed in to change notification settings - Fork 536
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/main' into siyuan/cdc-backfill-f…
…ramework-single-table
- Loading branch information
Showing
11 changed files
with
494 additions
and
11 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,289 @@ | ||
# -------- | ||
# Overview | ||
# -------- | ||
# | ||
# This is a demo test to show how to use hdr_histogram (http://hdrhistogram.org/) | ||
# to compute `approx_percentile` in the RisingWave stream engine. | ||
# It supports non-append-only streams, i.e. streams with updates and deletes are supported. | ||
# | ||
# The way it works is that hdr_histogram stores N significant digits of each value, | ||
# thereby reducing the number of buckets needed to store the distribution. | ||
# | ||
# ---------------------- | ||
# Implementation details | ||
# ---------------------- | ||
# | ||
# First we create the histogram itself, where each value is encoded as a triple: | ||
# 1. The sign of the value (1 or -1) | ||
# 2. The exponent of the value (the power of 10, truncated to an integer) | ||
# 3. The significand of the value (the digits after the decimal point, truncated to $PRECISION digits) | ||
# | ||
# With the following parameters: | ||
# precision=2 (significand is 2 digits) | ||
# value=123456 | ||
# | ||
# The exponent will be 5 (log10(123456) = 5.0915... ~ 5) | ||
# The significand will be 23 (123456 / 10^5 - 1 = 0.23... ). | ||
# Then the histogram will store the triple (1, 5, 23) | ||
# | ||
# NOTE(kwannoel): We don't have to use base-10 for the exponent and significand. | ||
# But RisingWave currently only supports log10, ln. | ||
# So we use base-10 for convenience, and because it's more intuitive for the examples here. | ||
# | ||
# The bucket is computed as: | ||
# (sign*(1.0+mantissa/pow(10.0, 2))*pow(10.0,exponent))::int | ||
# (1 * (1.0 + 23 / 10^2) * 10^5) = 123000 | ||
# | ||
# Next, we use window sum aggregation to compute the cumulative frequency of each bucket. | ||
# | Bucket | 4 | 311 | 400 | 521 | | ||
# | Counts | 3 | 6 | 7 | 8 | | ||
# | C. Frequency | 3 | 9 | 16 | 24 | | ||
# | ||
# Finally we can compute the approximate percentile | ||
# 1. Compute the sum of all frequencies. (24) | ||
# 2. Select a percentile (90%) | ||
# 3. Compute the frequency corresponding to the percentile (24 * 0.9 = 21.6) | ||
# 4. Find the first bucket with the cumulative frequency >= 21.6 (521) | ||
# | ||
# Step 3 and 4 are required for us to use dynamic filter, | ||
# since we don't support complex expressions in the dynamic filter. | ||
# In the future when our optimizer can simplify the dynamic filter, we can avoid it. | ||
# | ||
# ---------- | ||
# Test Notes | ||
# ---------- | ||
# | ||
# We will test the following parameters: | ||
# - 90 percentile. | ||
# - 50 percentile. | ||
# - Deletes. | ||
# - Compare with percentile_disc. The significand should match. | ||
|
||
statement ok | ||
CREATE TABLE input (value BIGINT); | ||
|
||
statement ok | ||
INSERT INTO input SELECT * FROM generate_series(1,10); | ||
|
||
statement ok | ||
INSERT INTO input SELECT * FROM generate_series(1000, 1100); | ||
|
||
statement ok | ||
INSERT INTO input SELECT * FROM generate_series(10000, 11000); | ||
|
||
statement ok | ||
INSERT INTO input SELECT * FROM generate_series(100000, 110000); | ||
|
||
statement ok | ||
flush; | ||
|
||
# Here `pow(10.0, 4)` is responsible for setting the precision of the histogram. | ||
# "2" is the precision. | ||
statement ok | ||
CREATE MATERIALIZED VIEW hdr_histogram AS | ||
SELECT | ||
CASE WHEN value<0 THEN -1 ELSE 1 END AS sign, | ||
trunc(log10(value))::int AS exponent, | ||
trunc( | ||
pow(10.0, 4) | ||
* (value / pow(10.0, trunc(log10(value))::int) - 1.0))::int AS mantissa, | ||
count(*) AS frequency, | ||
1 as dummy | ||
FROM input | ||
GROUP BY sign, exponent, mantissa; | ||
|
||
statement ok | ||
CREATE MATERIALIZED VIEW hdr_sum AS | ||
SELECT | ||
sum(frequency) AS sum_frequency | ||
FROM hdr_histogram; | ||
|
||
# Here `pow(10.0, 4)` is responsible for recovering the position of the mantissa, with the precision. | ||
# "2" is the precision. | ||
statement ok | ||
CREATE MATERIALIZED VIEW hdr_distribution AS | ||
SELECT | ||
(sign*(1.0+mantissa/pow(10.0, 4))*pow(10.0,exponent))::int AS bucket, | ||
(sum(frequency) | ||
OVER (PARTITION BY NULL | ||
ORDER BY (sign, exponent, mantissa) | ||
ROWS UNBOUNDED PRECEDING) | ||
) AS cumulative_frequency | ||
FROM hdr_histogram g | ||
GROUP BY sign, exponent, mantissa, frequency | ||
ORDER BY cumulative_frequency; | ||
|
||
statement ok | ||
CREATE INDEX hdr_distribution_idx ON hdr_distribution (cumulative_frequency); | ||
|
||
########## 90-percentile ########## | ||
|
||
statement ok | ||
CREATE MATERIALIZED VIEW frequency_at_90 AS | ||
SELECT sum_frequency * 0.9 AS scaled_sum_freq FROM hdr_sum; | ||
|
||
statement ok | ||
CREATE MATERIALIZED VIEW approx_percentile_90_percent AS | ||
SELECT bucket AS approximate_percentile | ||
FROM hdr_distribution x, frequency_at_90 y | ||
WHERE x.cumulative_frequency >= y.scaled_sum_freq | ||
ORDER BY cumulative_frequency | ||
LIMIT 1; | ||
|
||
query I | ||
SELECT * FROM approx_percentile_90_percent; | ||
---- | ||
108880 | ||
|
||
query I | ||
SELECT percentile_disc(0.9) WITHIN GROUP (ORDER BY value) FROM input; | ||
---- | ||
108889 | ||
|
||
query B | ||
SELECT | ||
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001 | ||
FROM approx_percentile_90_percent, (SELECT percentile_disc(0.9) WITHIN GROUP (ORDER BY value) FROM input); | ||
---- | ||
t | ||
|
||
statement ok | ||
DROP MATERIALIZED VIEW approx_percentile_90_percent; | ||
|
||
statement ok | ||
DROP MATERIALIZED VIEW frequency_at_90; | ||
|
||
########## 50-percentile ########## | ||
|
||
statement ok | ||
CREATE MATERIALIZED VIEW frequency_at_50 AS | ||
SELECT sum_frequency * 0.5 AS scaled_sum_freq FROM hdr_sum; | ||
|
||
statement ok | ||
CREATE MATERIALIZED VIEW approx_percentile_50_percent AS | ||
SELECT bucket AS approximate_percentile | ||
FROM hdr_distribution x, frequency_at_50 y | ||
WHERE x.cumulative_frequency >= y.scaled_sum_freq | ||
ORDER BY cumulative_frequency | ||
LIMIT 1; | ||
|
||
query I | ||
SELECT * FROM approx_percentile_50_percent; | ||
---- | ||
104440 | ||
|
||
query I | ||
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input; | ||
---- | ||
104444 | ||
|
||
query B | ||
SELECT | ||
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001 | ||
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input); | ||
---- | ||
t | ||
|
||
### Delete > 100000 | ||
|
||
statement ok | ||
DELETE FROM input WHERE value > 100000; | ||
|
||
statement ok | ||
flush; | ||
|
||
query I | ||
SELECT * FROM approx_percentile_50_percent; | ||
---- | ||
10445 | ||
|
||
query I | ||
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input; | ||
---- | ||
10445 | ||
|
||
query B | ||
SELECT | ||
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001 | ||
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input); | ||
---- | ||
t | ||
|
||
### Delete > 10000 | ||
|
||
statement ok | ||
DELETE FROM input WHERE value > 10000; | ||
|
||
statement ok | ||
flush; | ||
|
||
query I | ||
SELECT * FROM approx_percentile_50_percent; | ||
---- | ||
1045 | ||
|
||
query I | ||
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input; | ||
---- | ||
1045 | ||
|
||
query B | ||
SELECT | ||
(abs(percentile_disc - approximate_percentile) / percentile_disc) <= 0.0001 | ||
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input); | ||
---- | ||
t | ||
|
||
|
||
### Delete > 100 | ||
|
||
statement ok | ||
DELETE FROM input WHERE value > 100; | ||
|
||
statement ok | ||
flush; | ||
|
||
query I | ||
SELECT * FROM approx_percentile_50_percent; | ||
---- | ||
5 | ||
|
||
query I | ||
SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input; | ||
---- | ||
5 | ||
|
||
query B | ||
SELECT | ||
percentile_disc = (approximate_percentile::int) | ||
FROM approx_percentile_50_percent, (SELECT percentile_disc(0.5) WITHIN GROUP (ORDER BY value) FROM input); | ||
---- | ||
t | ||
|
||
### Cleanup 50-percentile | ||
|
||
statement ok | ||
DROP MATERIALIZED VIEW approx_percentile_50_percent; | ||
|
||
statement ok | ||
DROP MATERIALIZED VIEW frequency_at_50; | ||
|
||
########## Cleanup ########## | ||
|
||
statement ok | ||
DELETE FROM input WHERE value < 1400; | ||
|
||
statement ok | ||
DROP INDEX hdr_distribution_idx; | ||
|
||
statement ok | ||
DROP MATERIALIZED VIEW hdr_distribution; | ||
|
||
statement ok | ||
DROP MATERIALIZED VIEW hdr_sum; | ||
|
||
statement ok | ||
DROP MATERIALIZED VIEW hdr_histogram; | ||
|
||
statement ok | ||
DROP TABLE input; |
Oops, something went wrong.