diff --git a/aqo.c b/aqo.c index a80d0a0f..6d125666 100644 --- a/aqo.c +++ b/aqo.c @@ -34,6 +34,7 @@ void _PG_init(void); /* Strategy of determining feature space for new queries. */ int aqo_mode = AQO_MODE_CONTROLLED; bool force_collect_stat; +bool aqo_predict_with_few_neighbors; /* * Show special info in EXPLAIN mode. @@ -71,7 +72,7 @@ int auto_tuning_infinite_loop = 8; /* Machine learning parameters */ /* The number of nearest neighbors which will be chosen for ML-operations */ -int aqo_k = 3; +int aqo_k; double log_selectivity_lower_bound = -30; /* @@ -293,6 +294,29 @@ _PG_init(void) NULL ); + DefineCustomIntVariable("aqo.min_neighbors_for_predicting", /* GUC backing aqo_k: boot value 3, allowed range 1..INT_MAX/1000 */ + "Set how many neighbors the cardinality prediction will be calculated from", + NULL, + &aqo_k, + 3, + 1, INT_MAX / 1000, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("aqo.predict_with_few_neighbors", /* GUC backing aqo_predict_with_few_neighbors: boot value true */ + "Allow predictions when fewer neighbors were found than aqo.min_neighbors_for_predicting.", + NULL, + &aqo_predict_with_few_neighbors, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = aqo_init_shmem; prev_planner_hook = planner_hook; diff --git a/aqo.h b/aqo.h index 0a373147..9418646c 100644 --- a/aqo.h +++ b/aqo.h @@ -217,6 +217,7 @@ extern double auto_tuning_convergence_error; /* Machine learning parameters */ extern int aqo_k; +extern bool aqo_predict_with_few_neighbors; extern double log_selectivity_lower_bound; /* Parameters for current query */ diff --git a/cardinality_estimation.c b/cardinality_estimation.c index aca17f1e..f93e0905 100644 --- a/cardinality_estimation.c +++ b/cardinality_estimation.c @@ -93,7 +93,7 @@ predict_for_relation(List *clauses, List *selectivities, List *relsigns, */ /* Try to search in surrounding feature spaces for the same node */ - if (!load_aqo_data(query_context.fspace_hash, *fss, data, NULL, use_wide_search)) + if (!load_aqo_data(query_context.fspace_hash, *fss, data, NULL, 
use_wide_search, features)) result = -1; else { diff --git a/expected/look_a_like.out b/expected/look_a_like.out index ecd73fb4..faa9b0fd 100644 --- a/expected/look_a_like.out +++ b/expected/look_a_like.out @@ -2,14 +2,19 @@ CREATE EXTENSION aqo; SET aqo.join_threshold = 0; SET aqo.mode = 'learn'; SET aqo.show_details = 'on'; +set aqo.show_hash = 'off'; +SET aqo.min_neighbors_for_predicting = 1; +SET enable_nestloop = 'off'; +SET enable_mergejoin = 'off'; SET enable_material = 'off'; DROP TABLE IF EXISTS a,b CASCADE; NOTICE: table "a" does not exist, skipping NOTICE: table "b" does not exist, skipping -CREATE TABLE a (x int); -INSERT INTO a (x) SELECT mod(ival,10) FROM generate_series(1,1000) As ival; -CREATE TABLE b (y int); -INSERT INTO b (y) SELECT mod(ival + 1,10) FROM generate_series(1,1000) As ival; +-- Create tables with correlated datas in columns +CREATE TABLE a (x1 int, x2 int, x3 int); +INSERT INTO a (x1, x2, x3) SELECT mod(ival,10), mod(ival,10), mod(ival,10) FROM generate_series(1,1000) As ival; +CREATE TABLE b (y1 int, y2 int, y3 int); +INSERT INTO b (y1, y2, y3) SELECT mod(ival + 1,10), mod(ival + 1,10), mod(ival + 1,10) FROM generate_series(1,1000) As ival; -- -- Returns string-by-string explain of a query. Made for removing some strings -- from the explain output. 
@@ -25,207 +30,528 @@ $$ LANGUAGE PLPGSQL; -- in the next queries with the same fss_hash SELECT str AS result FROM expln(' -SELECT x FROM A where x = 5;') AS str; - result ------------------------------------------------- - Seq Scan on public.a (actual rows=100 loops=1) - AQO not used - Output: x - Filter: (a.x = 5) - Rows Removed by Filter: 900 - Using aqo: true - AQO mode: LEARN - JOINS: 0 -(8 rows) - -SELECT str AS result -FROM expln(' -SELECT x FROM A,B WHERE x = 5 AND A.x = B.y;') AS str -; -- Find cardinality for SCAN A(x=5) from a neighbour class, created by the +SELECT x1,y1 FROM A,B WHERE x1 = 5 AND x2 = 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; result -------------------------------------------------------- Nested Loop (actual rows=10000 loops=1) AQO not used - Output: a.x - -> Seq Scan on public.b (actual rows=100 loops=1) + Output: a.x1, b.y1 + -> Seq Scan on public.a (actual rows=100 loops=1) AQO not used - Output: b.y - Filter: (b.y = 5) + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 = 5) AND (a.x2 = 5)) Rows Removed by Filter: 900 - -> Seq Scan on public.a (actual rows=100 loops=100) - AQO: rows=100, error=0% - Output: a.x - Filter: (a.x = 5) + -> Seq Scan on public.b (actual rows=100 loops=100) + AQO not used + Output: b.y1, b.y2, b.y3 + Filter: (b.y1 = 5) Rows Removed by Filter: 900 Using aqo: true AQO mode: LEARN JOINS: 0 (16 rows) --- query, executed above. SELECT str AS result FROM expln(' -SELECT x, sum(x) FROM A,B WHERE y = 5 AND A.x = B.y group by(x);') AS str -; -- Find the JOIN cardinality from a neighbour class. 
- result --------------------------------------------------------------- - GroupAggregate (actual rows=1 loops=1) +SELECT x1,y1 FROM A LEFT JOIN b ON A.x1 = B.y1 WHERE x1 = 5 AND x2 = 5;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------ + Hash Left Join (actual rows=10000 loops=1) AQO not used - Output: a.x, sum(a.x) - Group Key: a.x - -> Nested Loop (actual rows=10000 loops=1) - AQO: rows=10000, error=0% - Output: a.x - -> Seq Scan on public.a (actual rows=100 loops=1) - AQO: rows=100, error=0% - Output: a.x - Filter: (a.x = 5) - Rows Removed by Filter: 900 - -> Seq Scan on public.b (actual rows=100 loops=100) + Output: a.x1, b.y1 + Hash Cond: (a.x1 = b.y1) + -> Seq Scan on public.a (actual rows=100 loops=1) + AQO: rows=100, error=0% + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 = 5) AND (a.x2 = 5)) + Rows Removed by Filter: 900 + -> Hash (actual rows=100 loops=1) + Output: b.y1 + -> Seq Scan on public.b (actual rows=100 loops=1) AQO: rows=100, error=0% - Output: b.y - Filter: (b.y = 5) + Output: b.y1 + Filter: (b.y1 = 5) Rows Removed by Filter: 900 Using aqo: true AQO mode: LEARN - JOINS: 1 -(20 rows) + JOINS: 0 +(19 rows) --- cardinality 100 in the first Seq Scan on a SELECT str AS result FROM expln(' -SELECT x, sum(x) FROM A WHERE x = 5 group by(x);') AS str; - result ------------------------------------------------------- - GroupAggregate (actual rows=1 loops=1) +SELECT x1,y1 FROM A,B WHERE x1 < 5 AND x2 < 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------ + Hash Join (actual rows=50000 loops=1) AQO not used - Output: x, sum(x) - Group Key: a.x - -> Seq Scan on public.a (actual rows=100 loops=1) - AQO: rows=100, error=0% - Output: x - Filter: (a.x = 5) - Rows Removed by Filter: 900 + 
Output: a.x1, b.y1 + Hash Cond: (b.y1 = a.x1) + -> Seq Scan on public.b (actual rows=1000 loops=1) + AQO not used + Output: b.y1, b.y2, b.y3 + -> Hash (actual rows=500 loops=1) + Output: a.x1 + -> Seq Scan on public.a (actual rows=500 loops=1) + AQO not used + Output: a.x1 + Filter: ((a.x1 < 5) AND (a.x2 < 5)) + Rows Removed by Filter: 500 Using aqo: true AQO mode: LEARN JOINS: 0 -(12 rows) +(17 rows) + +--query contains nodes that have already been predicted +SELECT str AS result +FROM expln(' +SELECT x1,y1 FROM A,B WHERE x1 < 10 AND x2 < 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------ + Hash Join (actual rows=50000 loops=1) + AQO: rows=50000, error=0% + Output: a.x1, b.y1 + Hash Cond: (b.y1 = a.x1) + -> Seq Scan on public.b (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + -> Hash (actual rows=500 loops=1) + Output: a.x1 + -> Seq Scan on public.a (actual rows=500 loops=1) + AQO: rows=500, error=0% + Output: a.x1 + Filter: ((a.x1 < 10) AND (a.x2 < 5)) + Rows Removed by Filter: 500 + Using aqo: true + AQO mode: LEARN + JOINS: 0 +(17 rows) --- no one predicted rows. 
we use knowledge cardinalities of the query --- in the next queries with the same fss_hash SELECT str AS result FROM expln(' -SELECT x FROM A where x < 10 group by(x);') AS str -WHERE str NOT LIKE '%Memory%'; - result -------------------------------------------------------- - HashAggregate (actual rows=10 loops=1) +SELECT x1,y1 FROM A,B WHERE x1 > 2 AND x2 > 2 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------ + Hash Join (actual rows=70000 loops=1) AQO not used - Output: x - Group Key: a.x - -> Seq Scan on public.a (actual rows=1000 loops=1) - AQO not used - Output: x - Filter: (a.x < 10) + Output: a.x1, b.y1 + Hash Cond: (b.y1 = a.x1) + -> Seq Scan on public.b (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + -> Hash (actual rows=700 loops=1) + Output: a.x1 + -> Seq Scan on public.a (actual rows=700 loops=1) + AQO not used + Output: a.x1 + Filter: ((a.x1 > 2) AND (a.x2 > 2)) + Rows Removed by Filter: 300 Using aqo: true AQO mode: LEARN JOINS: 0 -(11 rows) +(17 rows) --- cardinality 1000 in Seq Scan on a SELECT str AS result FROM expln(' -SELECT x,y FROM A,B WHERE x < 10 AND A.x = B.y;') AS str -WHERE str NOT LIKE '%Memory%'; - result -------------------------------------------------------------- - Merge Join (actual rows=100000 loops=1) +SELECT x1,y1 FROM A,B WHERE x1 > 5 AND x2 > 5 AND x3 < 10 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------- + Hash Join (actual rows=40000 loops=1) AQO not used - Output: a.x, b.y - Merge Cond: (a.x = b.y) - -> Sort (actual rows=1000 loops=1) - Output: a.x - Sort Key: a.x - -> Seq Scan on public.a (actual rows=1000 loops=1) - AQO: rows=1000, error=0% - Output: a.x - Filter: (a.x < 10) - -> Sort 
(actual rows=99901 loops=1) - Output: b.y - Sort Key: b.y - -> Seq Scan on public.b (actual rows=1000 loops=1) + Output: a.x1, b.y1 + Hash Cond: (b.y1 = a.x1) + -> Seq Scan on public.b (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + -> Hash (actual rows=400 loops=1) + Output: a.x1 + -> Seq Scan on public.a (actual rows=400 loops=1) + AQO not used + Output: a.x1 + Filter: ((a.x1 > 5) AND (a.x2 > 5) AND (a.x3 < 10)) + Rows Removed by Filter: 600 + Using aqo: true + AQO mode: LEARN + JOINS: 0 +(17 rows) + +SELECT str AS result +FROM expln(' +SELECT x1,y1 FROM A,B WHERE x1 < 5 AND x2 < 5 AND x3 < 10 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------- + Hash Join (actual rows=50000 loops=1) + AQO not used + Output: a.x1, b.y1 + Hash Cond: (b.y1 = a.x1) + -> Seq Scan on public.b (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + -> Hash (actual rows=500 loops=1) + Output: a.x1 + -> Seq Scan on public.a (actual rows=500 loops=1) AQO not used - Output: b.y + Output: a.x1 + Filter: ((a.x1 < 5) AND (a.x2 < 5) AND (a.x3 < 10)) + Rows Removed by Filter: 500 + Using aqo: true + AQO mode: LEARN + JOINS: 0 +(17 rows) + +--query contains nodes that have already been predicted +SELECT str AS result +FROM expln(' +SELECT x1,y1 FROM A,B WHERE x1 < 5 AND x2 < 4 AND x3 < 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------ + Hash Join (actual rows=40000 loops=1) + AQO: rows=50000, error=20% + Output: a.x1, b.y1 + Hash Cond: (b.y1 = a.x1) + -> Seq Scan on public.b (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + -> Hash (actual rows=400 loops=1) + Output: a.x1 + -> Seq Scan on public.a 
(actual rows=400 loops=1) + AQO: rows=500, error=20% + Output: a.x1 + Filter: ((a.x1 < 5) AND (a.x2 < 4) AND (a.x3 < 5)) + Rows Removed by Filter: 600 Using aqo: true AQO mode: LEARN JOINS: 0 -(20 rows) +(17 rows) --- cardinality 100 in Seq Scan on a and Seq Scan on b SELECT str AS result FROM expln(' -SELECT x FROM A,B where x < 10 and y > 10 group by(x);') AS str -WHERE str NOT LIKE '%Memory%'; - result ----------------------------------------------------------- - HashAggregate (actual rows=0 loops=1) +SELECT x1 FROM A,B WHERE x1 < 4 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +--------------------------------------------------------------------- + Group (actual rows=2 loops=1) AQO not used - Output: a.x - Group Key: a.x - -> Nested Loop (actual rows=0 loops=1) + Output: a.x1 + Group Key: a.x1 + -> Sort (actual rows=200000 loops=1) AQO not used - Output: a.x - -> Seq Scan on public.b (actual rows=0 loops=1) + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=200000 loops=1) AQO not used - Output: b.y - Filter: (b.y > 10) - Rows Removed by Filter: 1000 - -> Seq Scan on public.a (never executed) - AQO: rows=1000 - Output: a.x - Filter: (a.x < 10) + Output: a.x1 + -> Seq Scan on public.a (actual rows=200 loops=1) + AQO not used + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 < 4) AND (a.x3 > 1)) + Rows Removed by Filter: 800 + -> Seq Scan on public.b (actual rows=1000 loops=200) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 Using aqo: true AQO mode: LEARN JOINS: 1 -(19 rows) +(22 rows) --- --- TODO: --- Not executed case. What could we do better here? 
--- +--query contains nodes that have already been predicted SELECT str AS result FROM expln(' -SELECT x,y FROM A,B WHERE x < 10 and y > 10 AND A.x = B.y;') AS str -WHERE str NOT LIKE '%Memory%' -; - result ----------------------------------------------------------- - Hash Join (actual rows=0 loops=1) +SELECT x1 FROM A,B WHERE x1 < 4 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +--------------------------------------------------------------------- + Group (actual rows=2 loops=1) AQO not used - Output: a.x, b.y - Hash Cond: (a.x = b.y) - -> Seq Scan on public.a (actual rows=1 loops=1) - AQO: rows=1000, error=100% - Output: a.x - Filter: (a.x < 10) - -> Hash (actual rows=0 loops=1) - Output: b.y - -> Seq Scan on public.b (actual rows=0 loops=1) - AQO: rows=1, error=100% - Output: b.y - Filter: (b.y > 10) - Rows Removed by Filter: 1000 + Output: a.x1 + Group Key: a.x1 + -> Sort (actual rows=200000 loops=1) + AQO not used + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=200000 loops=1) + AQO: rows=200000, error=0% + Output: a.x1 + -> Seq Scan on public.a (actual rows=200 loops=1) + AQO: rows=200, error=0% + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 < 4) AND (a.x3 > 1)) + Rows Removed by Filter: 800 + -> Seq Scan on public.b (actual rows=1000 loops=200) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 Using aqo: true AQO mode: LEARN - JOINS: 0 -(18 rows) - -RESET enable_material; -DROP TABLE a,b CASCADE; -SELECT true FROM aqo_reset(); - bool ------- - t + JOINS: 1 +(22 rows) + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 4 AND x3 > 2 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +--------------------------------------------------------------------- + Group (actual rows=1 loops=1) + AQO not used + Output: a.x1 + Group Key: a.x1 + -> Sort (actual 
rows=100000 loops=1) + AQO not used + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=100000 loops=1) + AQO: rows=200000, error=50% + Output: a.x1 + -> Seq Scan on public.a (actual rows=100 loops=1) + AQO: rows=200, error=50% + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 < 4) AND (a.x3 > 2)) + Rows Removed by Filter: 900 + -> Seq Scan on public.b (actual rows=1000 loops=100) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(22 rows) + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 3 AND x2 < 5 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------------ + Group (actual rows=1 loops=1) + AQO not used + Output: a.x1 + Group Key: a.x1 + -> Sort (actual rows=100000 loops=1) + AQO not used + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=100000 loops=1) + AQO not used + Output: a.x1 + -> Seq Scan on public.a (actual rows=100 loops=1) + AQO not used + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 < 3) AND (a.x2 < 5) AND (a.x3 > 1)) + Rows Removed by Filter: 900 + -> Seq Scan on public.b (actual rows=1000 loops=100) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(22 rows) + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 > 1 AND x2 < 4 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------------ + Group (actual rows=2 loops=1) + AQO not used + Output: a.x1 + Group Key: a.x1 + -> Sort (actual rows=200000 loops=1) + AQO not used + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=200000 loops=1) + AQO not used + Output: a.x1 + -> Seq Scan on public.a (actual rows=200 loops=1) + AQO not used + 
Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 > 1) AND (a.x2 < 4) AND (a.x3 > 1)) + Rows Removed by Filter: 800 + -> Seq Scan on public.b (actual rows=1000 loops=200) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(22 rows) + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 > 1 AND x2 < 4 AND x3 < 5 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------------ + Group (actual rows=2 loops=1) + AQO not used + Output: a.x1 + Group Key: a.x1 + -> Sort (actual rows=200000 loops=1) + AQO not used + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=200000 loops=1) + AQO not used + Output: a.x1 + -> Seq Scan on public.a (actual rows=200 loops=1) + AQO not used + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 > 1) AND (a.x2 < 4) AND (a.x3 < 5)) + Rows Removed by Filter: 800 + -> Seq Scan on public.b (actual rows=1000 loops=200) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(22 rows) + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 4 AND x2 < 5 AND x3 > 1 and y1 > 2 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------------ + Group (actual rows=2 loops=1) + AQO not used + Output: a.x1 + Group Key: a.x1 + -> Sort (actual rows=140000 loops=1) + AQO not used + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=140000 loops=1) + AQO not used + Output: a.x1 + -> Seq Scan on public.a (actual rows=200 loops=1) + AQO: rows=100, error=-100% + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 < 4) AND (a.x2 < 5) AND (a.x3 > 1)) + Rows Removed by Filter: 800 + -> Seq Scan on public.b (actual rows=700 loops=200) + AQO not used + Output: b.y1, b.y2, 
b.y3 + Filter: (b.y1 > 2) + Rows Removed by Filter: 300 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(24 rows) + +--query contains nodes that have already been predicted +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 3 AND x2 < 4 AND x3 > 1 and y1 > 2 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------------ + Group (actual rows=1 loops=1) + AQO not used + Output: a.x1 + Group Key: a.x1 + -> Sort (actual rows=70000 loops=1) + AQO not used + Output: a.x1 + Sort Key: a.x1 + -> Nested Loop (actual rows=70000 loops=1) + AQO: rows=140000, error=50% + Output: a.x1 + -> Seq Scan on public.a (actual rows=100 loops=1) + AQO: rows=200, error=50% + Output: a.x1, a.x2, a.x3 + Filter: ((a.x1 < 3) AND (a.x2 < 4) AND (a.x3 > 1)) + Rows Removed by Filter: 900 + -> Seq Scan on public.b (actual rows=700 loops=100) + AQO: rows=700, error=0% + Output: b.y1, b.y2, b.y3 + Filter: (b.y1 > 2) + Rows Removed by Filter: 300 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(24 rows) + +CREATE TABLE c (z1 int, z2 int, z3 int); +INSERT INTO c (z1, z2, z3) SELECT mod(ival + 1,10), mod(ival + 1,10), mod(ival + 1,10) FROM generate_series(1,1000) As ival; +SELECT str AS result +FROM expln(' +SELECT * FROM (a LEFT JOIN b ON a.x1 = b.y1) sc WHERE +not exists (SELECT z1 FROM c WHERE sc.x1=c.z1 );') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------- + Hash Left Join (actual rows=0 loops=1) + AQO not used + Output: a.x1, a.x2, a.x3, b.y1, b.y2, b.y3 + Hash Cond: (a.x1 = b.y1) + -> Hash Anti Join (actual rows=0 loops=1) + AQO not used + Output: a.x1, a.x2, a.x3 + Hash Cond: (a.x1 = c.z1) + -> Seq Scan on public.a (actual rows=1000 loops=1) + AQO not used + Output: a.x1, a.x2, a.x3 + -> Hash (actual 
rows=1000 loops=1) + Output: c.z1 + -> Seq Scan on public.c (actual rows=1000 loops=1) + AQO not used + Output: c.z1 + -> Hash (never executed) + Output: b.y1, b.y2, b.y3 + -> Seq Scan on public.b (never executed) + AQO: rows=1000 + Output: b.y1, b.y2, b.y3 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(24 rows) + +SELECT str AS result +FROM expln(' +SELECT * FROM (A LEFT JOIN B ON A.x1 = B.y1) sc left join C on sc.x1=C.z1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + result +------------------------------------------------------------------------- + Hash Right Join (actual rows=10000000 loops=1) + AQO: rows=1, error=-999999900% + Output: a.x1, a.x2, a.x3, b.y1, b.y2, b.y3, c.z1, c.z2, c.z3 + Hash Cond: (b.y1 = a.x1) + -> Seq Scan on public.b (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: b.y1, b.y2, b.y3 + -> Hash (actual rows=100000 loops=1) + Output: a.x1, a.x2, a.x3, c.z1, c.z2, c.z3 + -> Hash Left Join (actual rows=100000 loops=1) + AQO: rows=1, error=-9999900% + Output: a.x1, a.x2, a.x3, c.z1, c.z2, c.z3 + Hash Cond: (a.x1 = c.z1) + -> Seq Scan on public.a (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: a.x1, a.x2, a.x3 + -> Hash (actual rows=1000 loops=1) + Output: c.z1, c.z2, c.z3 + -> Seq Scan on public.c (actual rows=1000 loops=1) + AQO: rows=1000, error=0% + Output: c.z1, c.z2, c.z3 + Using aqo: true + AQO mode: LEARN + JOINS: 1 +(24 rows) + +SELECT 1 FROM aqo_reset(); + ?column? 
+---------- + 1 (1 row) +DROP TABLE a; +DROP TABLE b; +DROP TABLE c; +DROP FUNCTION expln; DROP EXTENSION aqo CASCADE; diff --git a/machine_learning.c b/machine_learning.c index 7138db38..d4f5cbee 100644 --- a/machine_learning.c +++ b/machine_learning.c @@ -74,7 +74,7 @@ fs_distance(double *a, double *b, int len) res += (a[i] - b[i]) * (a[i] - b[i]); } if (len != 0) - res = sqrt(res / len); + res = sqrt(res); return res; } @@ -148,6 +148,9 @@ OkNNr_predict(OkNNrdata *data, double *features) Assert(data != NULL); + if (!aqo_predict_with_few_neighbors && data->rows < aqo_k) + return -1.; + for (i = 0; i < data->rows; ++i) distances[i] = fs_distance(data->matrix[i], features, data->cols); diff --git a/sql/look_a_like.sql b/sql/look_a_like.sql index be71feff..9705bf1a 100644 --- a/sql/look_a_like.sql +++ b/sql/look_a_like.sql @@ -2,15 +2,21 @@ CREATE EXTENSION aqo; SET aqo.join_threshold = 0; SET aqo.mode = 'learn'; SET aqo.show_details = 'on'; - +set aqo.show_hash = 'off'; +SET aqo.min_neighbors_for_predicting = 1; +SET enable_nestloop = 'off'; +SET enable_mergejoin = 'off'; SET enable_material = 'off'; DROP TABLE IF EXISTS a,b CASCADE; -CREATE TABLE a (x int); -INSERT INTO a (x) SELECT mod(ival,10) FROM generate_series(1,1000) As ival; -CREATE TABLE b (y int); -INSERT INTO b (y) SELECT mod(ival + 1,10) FROM generate_series(1,1000) As ival; +-- Create tables with correlated datas in columns +CREATE TABLE a (x1 int, x2 int, x3 int); +INSERT INTO a (x1, x2, x3) SELECT mod(ival,10), mod(ival,10), mod(ival,10) FROM generate_series(1,1000) As ival; + +CREATE TABLE b (y1 int, y2 int, y3 int); +INSERT INTO b (y1, y2, y3) SELECT mod(ival + 1,10), mod(ival + 1,10), mod(ival + 1,10) FROM generate_series(1,1000) As ival; + -- -- Returns string-by-string explain of a query. Made for removing some strings @@ -26,55 +32,112 @@ $$ LANGUAGE PLPGSQL; -- no one predicted rows. 
we use knowledge cardinalities of the query -- in the next queries with the same fss_hash + SELECT str AS result FROM expln(' -SELECT x FROM A where x = 5;') AS str; +SELECT x1,y1 FROM A,B WHERE x1 = 5 AND x2 = 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; SELECT str AS result FROM expln(' -SELECT x FROM A,B WHERE x = 5 AND A.x = B.y;') AS str -; -- Find cardinality for SCAN A(x=5) from a neighbour class, created by the --- query, executed above. +SELECT x1,y1 FROM A LEFT JOIN b ON A.x1 = B.y1 WHERE x1 = 5 AND x2 = 5;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; SELECT str AS result FROM expln(' -SELECT x, sum(x) FROM A,B WHERE y = 5 AND A.x = B.y group by(x);') AS str -; -- Find the JOIN cardinality from a neighbour class. +SELECT x1,y1 FROM A,B WHERE x1 < 5 AND x2 < 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +--query contains nodes that have already been predicted --- cardinality 100 in the first Seq Scan on a SELECT str AS result FROM expln(' -SELECT x, sum(x) FROM A WHERE x = 5 group by(x);') AS str; +SELECT x1,y1 FROM A,B WHERE x1 < 10 AND x2 < 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; --- no one predicted rows. 
we use knowledge cardinalities of the query --- in the next queries with the same fss_hash SELECT str AS result FROM expln(' -SELECT x FROM A where x < 10 group by(x);') AS str -WHERE str NOT LIKE '%Memory%'; --- cardinality 1000 in Seq Scan on a +SELECT x1,y1 FROM A,B WHERE x1 > 2 AND x2 > 2 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + SELECT str AS result FROM expln(' -SELECT x,y FROM A,B WHERE x < 10 AND A.x = B.y;') AS str -WHERE str NOT LIKE '%Memory%'; +SELECT x1,y1 FROM A,B WHERE x1 > 5 AND x2 > 5 AND x3 < 10 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; --- cardinality 100 in Seq Scan on a and Seq Scan on b SELECT str AS result FROM expln(' -SELECT x FROM A,B where x < 10 and y > 10 group by(x);') AS str -WHERE str NOT LIKE '%Memory%'; +SELECT x1,y1 FROM A,B WHERE x1 < 5 AND x2 < 5 AND x3 < 10 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +--query contains nodes that have already been predicted + +SELECT str AS result +FROM expln(' +SELECT x1,y1 FROM A,B WHERE x1 < 5 AND x2 < 4 AND x3 < 5 AND A.x1 = B.y1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 4 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +--query contains nodes that have already been predicted + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 4 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 4 AND x3 > 2 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query 
Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 3 AND x2 < 5 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 > 1 AND x2 < 4 AND x3 > 1 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 > 1 AND x2 < 4 AND x3 < 5 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 4 AND x2 < 5 AND x3 > 1 and y1 > 2 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +--query contains nodes that have already been predicted + +SELECT str AS result +FROM expln(' +SELECT x1 FROM A,B WHERE x1 < 3 AND x2 < 4 AND x3 > 1 and y1 > 2 GROUP BY(x1);') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; + +CREATE TABLE c (z1 int, z2 int, z3 int); +INSERT INTO c (z1, z2, z3) SELECT mod(ival + 1,10), mod(ival + 1,10), mod(ival + 1,10) FROM generate_series(1,1000) As ival; + +SELECT str AS result +FROM expln(' +SELECT * FROM (a LEFT JOIN b ON a.x1 = b.y1) sc WHERE +not exists (SELECT z1 FROM c WHERE sc.x1=c.z1 );') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; --- --- TODO: --- Not executed case. What could we do better here? 
--- SELECT str AS result FROM expln(' -SELECT x,y FROM A,B WHERE x < 10 and y > 10 AND A.x = B.y;') AS str -WHERE str NOT LIKE '%Memory%' -; +SELECT * FROM (A LEFT JOIN B ON A.x1 = B.y1) sc left join C on sc.x1=C.z1;') AS str +WHERE str NOT LIKE 'Query Identifier%' and str NOT LIKE '%Memory%' and str NOT LIKE '%Sort Method%'; -RESET enable_material; -DROP TABLE a,b CASCADE; -SELECT true FROM aqo_reset(); +SELECT 1 FROM aqo_reset(); +DROP TABLE a; +DROP TABLE b; +DROP TABLE c; +DROP FUNCTION expln; DROP EXTENSION aqo CASCADE; diff --git a/storage.c b/storage.c index fcbe5569..32446d6c 100644 --- a/storage.c +++ b/storage.c @@ -90,6 +90,8 @@ static bool _aqo_stat_remove(uint64 queryid); static bool _aqo_queries_remove(uint64 queryid); static bool _aqo_qtexts_remove(uint64 queryid); static bool _aqo_data_remove(data_key *key); +static bool neirest_neighbor(double **matrix, int old_rows, double *neighbor, int cols); +static double fs_distance(double *a, double *b, int len); PG_FUNCTION_INFO_V1(aqo_query_stat); PG_FUNCTION_INFO_V1(aqo_query_texts); @@ -108,7 +110,7 @@ PG_FUNCTION_INFO_V1(aqo_execution_time); bool load_fss_ext(uint64 fs, int fss, OkNNrdata *data, List **reloids) { - return load_aqo_data(fs, fss, data, reloids, false); + return load_aqo_data(fs, fss, data, reloids, false, NULL); } bool @@ -1409,25 +1411,73 @@ aqo_data_store(uint64 fs, int fss, OkNNrdata *data, List *reloids) return result; } +static double +fs_distance(double *a, double *b, int len) +{ + double res = 0; + int i; + + for (i = 0; i < len; ++i) + res += (a[i] - b[i]) * (a[i] - b[i]); + if (len != 0) + res = sqrt(res); + return res; +} + +bool +neirest_neighbor(double **matrix, int old_rows, double *neibour, int cols) +{ + int i; + for (i=0; icols == temp_data->cols); Assert(data->matrix); - if (data->rows > 0) - /* trivial strategy - use first suitable record and ignore others */ - return; - - memcpy(data, temp_data, sizeof(OkNNrdata)); - if (data->cols > 0) + if (features != NULL) { - int 
i; + int old_rows = data->rows; + int k = (old_rows < 0) ? 0 : old_rows; /* next free destination row; clamped in case rows is negative for an empty set (TODO confirm against the allocator) */ - for (i = 0; i < data->rows; i++) + if (data->cols > 0) { - Assert(data->matrix[i]); - memcpy(data->matrix[i], temp_data->matrix[i], data->cols * sizeof(double)); + int i; + + for (i = 0; i < temp_data->rows; i++) /* walk the candidate rows of temp_data, which is what the body indexes */ + { + if (k < aqo_K && !neirest_neighbor(data->matrix, old_rows, temp_data->matrix[i], data->cols)) /* test the candidate row itself against the rows data held on entry */ + { + memcpy(data->matrix[k], temp_data->matrix[i], data->cols * sizeof(double)); + data->rfactors[k] = temp_data->rfactors[i]; + data->targets[k] = temp_data->targets[i]; + data->rows = ++k; /* publish the appended row; otherwise it stays invisible to readers of data->rows */ + } + } + } + } + else + { + if (data->rows > 0) + /* trivial strategy - use first suitable record and ignore others */ + return; + memcpy(data, temp_data, sizeof(OkNNrdata)); + if (data->cols > 0) + { + int i; + + for (i = 0; i < data->rows; i++) + { + Assert(data->matrix[i]); + memcpy(data->matrix[i], temp_data->matrix[i], data->cols * sizeof(double)); + } } } } @@ -1503,7 +1553,7 @@ _fill_knn_data(const DataEntry *entry, List **reloids) */ bool load_aqo_data(uint64 fs, int fss, OkNNrdata *data, List **reloids, - bool wideSearch) + bool wideSearch, double *features) { DataEntry *entry; bool found; @@ -1538,7 +1588,7 @@ load_aqo_data(uint64 fs, int fss, OkNNrdata *data, List **reloids, } temp_data = _fill_knn_data(entry, reloids); - build_knn_matrix(data, temp_data); + build_knn_matrix(data, temp_data, features); } else /* Iterate across all elements of the table. XXX: Maybe slow. 
*/ @@ -1576,7 +1626,7 @@ load_aqo_data(uint64 fs, int fss, OkNNrdata *data, List **reloids, else list_free(tmp_oids); - build_knn_matrix(data, temp_data); + build_knn_matrix(data, temp_data, NULL); found = true; } } diff --git a/storage.h b/storage.h index 94891c5d..0e7745e1 100644 --- a/storage.h +++ b/storage.h @@ -101,7 +101,7 @@ extern void aqo_qtexts_load(void); extern bool aqo_data_store(uint64 fs, int fss, OkNNrdata *data, List *reloids); extern bool load_aqo_data(uint64 fs, int fss, OkNNrdata *data, List **reloids, - bool wideSearch); + bool wideSearch, double *features); extern void aqo_data_flush(void); extern void aqo_data_load(void);