Skip to content

Commit

Permalink
Use Append rather than MergeAppend for scanning ordered partitions.
Browse files Browse the repository at this point in the history
If we need ordered output from a scan of a partitioned table, but
the ordering matches the partition ordering, then we don't need to
use a MergeAppend to combine the pre-ordered per-partition scan
results: a plain Append will produce the same results.  This
both saves useless comparison work inside the MergeAppend proper,
and allows us to start returning tuples after starting up just
the first child node, not all of them.

However, all is not peaches and cream, because if some of the
child nodes have high startup costs then there will be big
discontinuities in the tuples-returned-versus-elapsed-time curve.
The planner's cost model cannot handle that (yet, anyway).
If we model the Append's startup cost as being just the first
child's startup cost, we may drastically underestimate the cost
of fetching slightly more tuples than are available from the first
child.  Since we've had bad experiences with over-optimistic choices
of "fast start" plans for ORDER BY LIMIT queries, that seems scary.
As a klugy workaround, set the startup cost estimate for an ordered
Append to be the sum of its children's startup costs (as MergeAppend
would).  This doesn't really describe reality, but it's less likely
to cause a bad plan choice than an underestimated startup cost would.
In practice, the cases where we really care about this optimization
will have child plans that are IndexScans with zero startup cost,
so that the overly conservative estimate is still just zero.

David Rowley, reviewed by Julien Rouhaud and Antonin Houska

Discussion: https://postgr.es/m/CAKJS1f-hAqhPLRk_RaSFTgYxd=Tz5hA7kQ2h4-DhJufQk8TGuw@mail.gmail.com
  • Loading branch information
tglsfdc committed Apr 5, 2019
1 parent 9f06d79 commit 959d00e
Show file tree
Hide file tree
Showing 19 changed files with 1,044 additions and 135 deletions.
13 changes: 13 additions & 0 deletions src/backend/executor/execProcnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,19 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
sortState->bound = tuples_needed;
}
}
else if (IsA(child_node, AppendState))
{
/*
* If it is an Append, we can apply the bound to any nodes that are
* children of the Append, since the Append surely need read no more
* than that many tuples from any one input.
*/
AppendState *aState = (AppendState *) child_node;
int i;

for (i = 0; i < aState->as_nplans; i++)
ExecSetTupleBound(tuples_needed, aState->appendplans[i]);
}
else if (IsA(child_node, MergeAppendState))
{
/*
Expand Down
1 change: 1 addition & 0 deletions src/backend/nodes/outfuncs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1847,6 +1847,7 @@ _outAppendPath(StringInfo str, const AppendPath *node)
WRITE_NODE_FIELD(partitioned_rels);
WRITE_NODE_FIELD(subpaths);
WRITE_INT_FIELD(first_partial_path);
WRITE_FLOAT_FIELD(limit_tuples, "%.0f");
}

static void
Expand Down
237 changes: 198 additions & 39 deletions src/backend/optimizer/path/allpaths.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "optimizer/tlist.h"
#include "parser/parse_clause.h"
#include "parser/parsetree.h"
#include "partitioning/partbounds.h"
#include "partitioning/partprune.h"
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
Expand Down Expand Up @@ -96,15 +97,16 @@ static void set_append_rel_size(PlannerInfo *root, RelOptInfo *rel,
Index rti, RangeTblEntry *rte);
static void set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
Index rti, RangeTblEntry *rte);
static void generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel,
List *live_childrels,
List *all_child_pathkeys,
List *partitioned_rels);
static void generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
List *live_childrels,
List *all_child_pathkeys,
List *partitioned_rels);
static Path *get_cheapest_parameterized_child_path(PlannerInfo *root,
RelOptInfo *rel,
Relids required_outer);
static void accumulate_append_subpath(Path *path,
List **subpaths, List **special_subpaths);
static Path *get_singleton_append_subpath(Path *path);
static void set_dummy_rel_pathlist(RelOptInfo *rel);
static void set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
Index rti, RangeTblEntry *rte);
Expand Down Expand Up @@ -1520,7 +1522,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
*/
if (subpaths_valid)
add_path(rel, (Path *) create_append_path(root, rel, subpaths, NIL,
NULL, 0, false,
NIL, NULL, 0, false,
partitioned_rels, -1));

/*
Expand Down Expand Up @@ -1562,7 +1564,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,

/* Generate a partial append path. */
appendpath = create_append_path(root, rel, NIL, partial_subpaths,
NULL, parallel_workers,
NIL, NULL, parallel_workers,
enable_parallel_append,
partitioned_rels, -1);

Expand Down Expand Up @@ -1612,19 +1614,19 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,

appendpath = create_append_path(root, rel, pa_nonpartial_subpaths,
pa_partial_subpaths,
NULL, parallel_workers, true,
NIL, NULL, parallel_workers, true,
partitioned_rels, partial_rows);
add_partial_path(rel, (Path *) appendpath);
}

/*
* Also build unparameterized MergeAppend paths based on the collected
* Also build unparameterized ordered append paths based on the collected
* list of child pathkeys.
*/
if (subpaths_valid)
generate_mergeappend_paths(root, rel, live_childrels,
all_child_pathkeys,
partitioned_rels);
generate_orderedappend_paths(root, rel, live_childrels,
all_child_pathkeys,
partitioned_rels);

/*
* Build Append paths for each parameterization seen among the child rels.
Expand Down Expand Up @@ -1674,7 +1676,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
if (subpaths_valid)
add_path(rel, (Path *)
create_append_path(root, rel, subpaths, NIL,
required_outer, 0, false,
NIL, required_outer, 0, false,
partitioned_rels, -1));
}

Expand Down Expand Up @@ -1703,26 +1705,30 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
continue;

appendpath = create_append_path(root, rel, NIL, list_make1(path),
NULL, path->parallel_workers,
true,
NIL, NULL,
path->parallel_workers, true,
partitioned_rels, partial_rows);
add_partial_path(rel, (Path *) appendpath);
}
}
}

/*
* generate_mergeappend_paths
* Generate MergeAppend paths for an append relation
* generate_orderedappend_paths
* Generate ordered append paths for an append relation
*
* Generate a path for each ordering (pathkey list) appearing in
* Usually we generate MergeAppend paths here, but there are some special
* cases where we can generate simple Append paths, because the subpaths
* can provide tuples in the required order already.
*
* We generate a path for each ordering (pathkey list) appearing in
* all_child_pathkeys.
*
* We consider both cheapest-startup and cheapest-total cases, ie, for each
* interesting ordering, collect all the cheapest startup subpaths and all the
* cheapest total paths, and build a MergeAppend path for each case.
* cheapest total paths, and build a suitable path for each case.
*
* We don't currently generate any parameterized MergeAppend paths. While
* We don't currently generate any parameterized ordered paths here. While
* it would not take much more code here to do so, it's very unclear that it
* is worth the planning cycles to investigate such paths: there's little
* use for an ordered path on the inside of a nestloop. In fact, it's likely
Expand All @@ -1731,24 +1737,80 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
* and a parameterized MergeAppend is going to be more expensive than the
* corresponding parameterized Append path. If we ever try harder to support
* parameterized mergejoin plans, it might be worth adding support for
* parameterized MergeAppends to feed such joins. (See notes in
* parameterized paths here to feed such joins. (See notes in
* optimizer/README for why that might not ever happen, though.)
*/
static void
generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel,
List *live_childrels,
List *all_child_pathkeys,
List *partitioned_rels)
generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
List *live_childrels,
List *all_child_pathkeys,
List *partitioned_rels)
{
ListCell *lcp;
List *partition_pathkeys = NIL;
List *partition_pathkeys_desc = NIL;
bool partition_pathkeys_partial = true;
bool partition_pathkeys_desc_partial = true;

/*
* Some partitioned table setups may allow us to use an Append node
* instead of a MergeAppend. This is possible in cases such as RANGE
* partitioned tables where it's guaranteed that an earlier partition must
* contain rows which come earlier in the sort order. To detect whether
* this is relevant, build pathkey descriptions of the partition ordering,
* for both forward and reverse scans.
*/
if (rel->part_scheme != NULL && IS_SIMPLE_REL(rel) &&
partitions_are_ordered(rel->boundinfo, rel->nparts))
{
partition_pathkeys = build_partition_pathkeys(root, rel,
ForwardScanDirection,
&partition_pathkeys_partial);

partition_pathkeys_desc = build_partition_pathkeys(root, rel,
BackwardScanDirection,
&partition_pathkeys_desc_partial);

/*
* You might think we should truncate_useless_pathkeys here, but
* allowing partition keys which are a subset of the query's pathkeys
* can often be useful. For example, consider a table partitioned by
* RANGE (a, b), and a query with ORDER BY a, b, c. If we have child
* paths that can produce the a, b, c ordering (perhaps via indexes on
* (a, b, c)) then it works to consider the appendrel output as
* ordered by a, b, c.
*/
}

/* Now consider each interesting sort ordering */
foreach(lcp, all_child_pathkeys)
{
List *pathkeys = (List *) lfirst(lcp);
List *startup_subpaths = NIL;
List *total_subpaths = NIL;
bool startup_neq_total = false;
ListCell *lcr;
bool match_partition_order;
bool match_partition_order_desc;

/*
* Determine if this sort ordering matches any partition pathkeys we
* have, for both ascending and descending partition order. If the
* partition pathkeys happen to be contained in pathkeys then it still
* works, as described above, providing that the partition pathkeys
* are complete and not just a prefix of the partition keys. (In such
* cases we'll be relying on the child paths to have sorted the
* lower-order columns of the required pathkeys.)
*/
match_partition_order =
pathkeys_contained_in(pathkeys, partition_pathkeys) ||
(!partition_pathkeys_partial &&
pathkeys_contained_in(partition_pathkeys, pathkeys));

match_partition_order_desc = !match_partition_order &&
(pathkeys_contained_in(pathkeys, partition_pathkeys_desc) ||
(!partition_pathkeys_desc_partial &&
pathkeys_contained_in(partition_pathkeys_desc, pathkeys)));

/* Select the child paths for this ordering... */
foreach(lcr, live_childrels)
Expand Down Expand Up @@ -1791,26 +1853,94 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel,
if (cheapest_startup != cheapest_total)
startup_neq_total = true;

accumulate_append_subpath(cheapest_startup,
&startup_subpaths, NULL);
accumulate_append_subpath(cheapest_total,
&total_subpaths, NULL);
/*
* Collect the appropriate child paths. The required logic varies
* for the Append and MergeAppend cases.
*/
if (match_partition_order)
{
/*
* We're going to make a plain Append path. We don't need
* most of what accumulate_append_subpath would do, but we do
* want to cut out child Appends or MergeAppends if they have
* just a single subpath (and hence aren't doing anything
* useful).
*/
cheapest_startup = get_singleton_append_subpath(cheapest_startup);
cheapest_total = get_singleton_append_subpath(cheapest_total);

startup_subpaths = lappend(startup_subpaths, cheapest_startup);
total_subpaths = lappend(total_subpaths, cheapest_total);
}
else if (match_partition_order_desc)
{
/*
* As above, but we need to reverse the order of the children,
* because nodeAppend.c doesn't know anything about reverse
* ordering and will scan the children in the order presented.
*/
cheapest_startup = get_singleton_append_subpath(cheapest_startup);
cheapest_total = get_singleton_append_subpath(cheapest_total);

startup_subpaths = lcons(cheapest_startup, startup_subpaths);
total_subpaths = lcons(cheapest_total, total_subpaths);
}
else
{
/*
* Otherwise, rely on accumulate_append_subpath to collect the
* child paths for the MergeAppend.
*/
accumulate_append_subpath(cheapest_startup,
&startup_subpaths, NULL);
accumulate_append_subpath(cheapest_total,
&total_subpaths, NULL);
}
}

/* ... and build the MergeAppend paths */
add_path(rel, (Path *) create_merge_append_path(root,
rel,
startup_subpaths,
pathkeys,
NULL,
partitioned_rels));
if (startup_neq_total)
/* ... and build the Append or MergeAppend paths */
if (match_partition_order || match_partition_order_desc)
{
/* We only need Append */
add_path(rel, (Path *) create_append_path(root,
rel,
startup_subpaths,
NIL,
pathkeys,
NULL,
0,
false,
partitioned_rels,
-1));
if (startup_neq_total)
add_path(rel, (Path *) create_append_path(root,
rel,
total_subpaths,
NIL,
pathkeys,
NULL,
0,
false,
partitioned_rels,
-1));
}
else
{
/* We need MergeAppend */
add_path(rel, (Path *) create_merge_append_path(root,
rel,
total_subpaths,
startup_subpaths,
pathkeys,
NULL,
partitioned_rels));
if (startup_neq_total)
add_path(rel, (Path *) create_merge_append_path(root,
rel,
total_subpaths,
pathkeys,
NULL,
partitioned_rels));
}
}
}

Expand Down Expand Up @@ -1901,7 +2031,6 @@ get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel,
* omitting a sort step, which seems fine: if the parent is to be an Append,
* its result would be unsorted anyway, while if the parent is to be a
* MergeAppend, there's no point in a separate sort on a child.
* its result would be unsorted anyway.
*
* Normally, either path is a partial path and subpaths is a list of partial
* paths, or else path is a non-partial plan and subpaths is a list of those.
Expand Down Expand Up @@ -1951,6 +2080,36 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths)
*subpaths = lappend(*subpaths, path);
}

/*
 * get_singleton_append_subpath
 *		Strip a useless single-child Append or MergeAppend wrapper.
 *
 * When 'path' is an AppendPath or MergeAppendPath whose subpaths list holds
 * exactly one entry, that sole entry is returned in its place.  In every
 * other case (a different path type, or an Append/MergeAppend with zero or
 * several subpaths) 'path' itself is handed back unchanged.
 *
 * Note: callers must not pass a parallel-aware path; this is asserted.
 */
static Path *
get_singleton_append_subpath(Path *path)
{
	Path	   *result = path;	/* default: hand back the input untouched */

	Assert(!path->parallel_aware);

	if (IsA(path, AppendPath))
	{
		AppendPath *append = (AppendPath *) path;

		/* A lone child makes the Append redundant; use the child directly */
		if (list_length(append->subpaths) == 1)
			result = (Path *) linitial(append->subpaths);
	}
	else if (IsA(path, MergeAppendPath))
	{
		MergeAppendPath *merge = (MergeAppendPath *) path;

		/* Likewise for a MergeAppend with a single input */
		if (list_length(merge->subpaths) == 1)
			result = (Path *) linitial(merge->subpaths);
	}

	return result;
}

/*
* set_dummy_rel_pathlist
* Build a dummy path for a relation that's been excluded by constraints
Expand All @@ -1975,7 +2134,7 @@ set_dummy_rel_pathlist(RelOptInfo *rel)

/* Set up the dummy path */
add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL,
rel->lateral_relids,
NIL, rel->lateral_relids,
0, false, NIL, -1));

/*
Expand Down

0 comments on commit 959d00e

Please sign in to comment.