121 changes: 121 additions & 0 deletions parallel_index_scan_testcase.sql
@@ -0,0 +1,121 @@
-- Test case: parallel B-tree index scans driven by SK_SEARCHARRAY (IN-list)
-- quals on the leading index column. Exercises how primitive index scans are
-- scheduled across parallel workers for varying numbers of array keys.
--
-- NOTE: log_btree_verbosity is a GUC provided by the patched build under
-- test; this script is not expected to run on stock PostgreSQL.
set log_btree_verbosity=1;

-- Suppress the NOTICE from dropping a table that may not exist
set client_min_messages=error;
drop table if exists parallel_index_scan;
reset client_min_messages;

-- encourage use of parallel plans
set parallel_setup_cost=1;
set parallel_tuple_cost=1;
set min_parallel_table_scan_size=1;
set max_parallel_workers_per_gather=2;
set enable_seqscan=off;
set enable_bitmapscan=off;
-- Index-only scan for this (all queries touch only indexed columns)
set enable_indexonlyscan=on;

-- unlogged: throwaway test data, no need for WAL overhead
create unlogged table parallel_index_scan
(
c1 int,
c2 text,
c3 date,
c4 varchar(20),
c5 float
);

-- 1,000,000 rows: 100,000 distinct c1 values x 10 distinct c3 dates
-- (2000-01-01 .. 2000-01-10), so each c3 value covers 100,000 rows.
insert into parallel_index_scan
select
x,
'c2_' || x,
'2000-01-01'::date + y,
'xyz',
1.1
from
generate_series(1, 100000) x,
generate_series(0, 9) y;

-- Index whose leading column carries the IN-list quals below
create index parallel_index_scan_idx on parallel_index_scan(c3, c4, c5);

vacuum analyze parallel_index_scan;

-- (Removed "show port;" here: its output differs per installation, which
-- would break comparison against a stored expected-output file.)

-- 7 array keys, but '1111-11-11' matches no rows, so only 6 groups come
-- back (hence the c3_six_rows alias). Verifies that a non-matching array
-- key is skipped cheaply rather than breaking the scan.
select count(*) as first_count, c3 c3_six_rows
from parallel_index_scan
where c3 in (
'2000-01-01',
'2000-01-03',
'2000-01-04',
'2000-01-05',
'2000-01-07',
'2000-01-09',
'1111-11-11'
)
group by c3;
EXPLAIN (ANALYZE, BUFFERS)
select count(*) as first_count, c3 c3_six_rows
from parallel_index_scan
where c3 in (
'2000-01-01',
'2000-01-03',
'2000-01-04',
'2000-01-05',
'2000-01-07',
'2000-01-09',
'1111-11-11'
)
group by c3;

-- 10 array keys covering every distinct c3 value: contiguous leaf pages,
-- so adjacent primitive scans should be combined rather than each doing a
-- fresh index descent.
select count(*) as second_count, c3 c3_ten_rows
from parallel_index_scan
where c3 in (
'2000-01-01',
'2000-01-02',
'2000-01-03',
'2000-01-04',
'2000-01-05',
'2000-01-06',
'2000-01-07',
'2000-01-08',
'2000-01-09',
'2000-01-10'
)
group by c3;
EXPLAIN (ANALYZE, BUFFERS)
select count(*) as second_count, c3 c3_ten_rows
from parallel_index_scan
where c3 in (
'2000-01-01',
'2000-01-02',
'2000-01-03',
'2000-01-04',
'2000-01-05',
'2000-01-06',
'2000-01-07',
'2000-01-08',
'2000-01-09',
'2000-01-10'
)
group by c3;

-- 5 array keys with gaps (every other c3 value): non-adjacent key ranges,
-- so distinct primitive index scans are required.
select count(*) as third_count, c3 c3_five_rows
from parallel_index_scan
where c3 in (
'2000-01-01',
'2000-01-03',
'2000-01-05',
'2000-01-07',
'2000-01-09'
)
group by c3;
EXPLAIN (ANALYZE, BUFFERS)
select count(*) as third_count, c3 c3_five_rows
from parallel_index_scan
where c3 in (
'2000-01-01',
'2000-01-03',
'2000-01-05',
'2000-01-07',
'2000-01-09'
)
group by c3;
63 changes: 36 additions & 27 deletions src/backend/access/nbtree/nbtree.c
Expand Up @@ -48,8 +48,8 @@
* BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
* to a new page; some process can start doing that.
*
* BTPARALLEL_DONE indicates that the scan is complete (including error exit).
* We reach this state once for every distinct combination of array keys.
* BTPARALLEL_DONE indicates that the primitive index scan is complete
* (including error exit). Reached once per primitive index scan.
*/
typedef enum
{
Expand All @@ -69,8 +69,8 @@ typedef struct BTParallelScanDescData
BTPS_State btps_pageStatus; /* indicates whether next page is
* available for scan. see above for
* possible states of parallel scan. */
int btps_arrayKeyCount; /* count indicating number of array scan
* keys processed by parallel scan */
int btps_numPrimScans; /* count indicating number of primitive
* index scans (used with array keys) */
slock_t btps_mutex; /* protects above variables */
ConditionVariable btps_cv; /* used to synchronize parallel scan */
} BTParallelScanDescData;
Expand Down Expand Up @@ -275,8 +275,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
/* If we have a tuple, return it ... */
if (res)
break;
/* ... otherwise see if we have more array keys to deal with */
} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
/* ... otherwise see if we need another primitive index scan */
} while (so->numArrayKeys && _bt_array_keys_remain(scan, dir));

return res;
}
Expand Down Expand Up @@ -333,8 +333,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
ntids++;
}
}
/* Now see if we have more array keys to deal with */
} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
/* Now see if we need another primitive index scan */
} while (so->numArrayKeys && _bt_array_keys_remain(scan, ForwardScanDirection));

return ntids;
}
Expand Down Expand Up @@ -364,9 +364,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
so->keyData = NULL;

so->arrayKeyData = NULL; /* assume no array keys for now */
so->arraysStarted = false;
so->numArrayKeys = 0;
so->needPrimScan = false;
so->arrayKeys = NULL;
so->orderProcs = NULL;
so->arrayContext = NULL;

so->killedItems = NULL; /* until needed */
Expand Down Expand Up @@ -406,7 +407,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
}

so->markItemIndex = -1;
so->arrayKeyCount = 0;
so->needPrimScan = false;
so->numPrimScans = 0;
so->firstPage = false;
BTScanPosUnpinIfPinned(so->markPos);
BTScanPosInvalidate(so->markPos);
Expand Down Expand Up @@ -588,7 +590,7 @@ btinitparallelscan(void *target)
SpinLockInit(&bt_target->btps_mutex);
bt_target->btps_scanPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
bt_target->btps_arrayKeyCount = 0;
bt_target->btps_numPrimScans = 0;
ConditionVariableInit(&bt_target->btps_cv);
}

Expand All @@ -614,7 +616,7 @@ btparallelrescan(IndexScanDesc scan)
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
btscan->btps_arrayKeyCount = 0;
btscan->btps_numPrimScans = 0;
SpinLockRelease(&btscan->btps_mutex);
}

Expand All @@ -625,7 +627,11 @@ btparallelrescan(IndexScanDesc scan)
*
* The return value is true if we successfully seized the scan and false
* if we did not. The latter case occurs if no pages remain for the current
* set of scankeys.
* primitive index scan.
*
* When array scan keys are in use, each worker process independently advances
* its array keys. It's crucial that each worker process never be allowed to
* scan a page from before the current scan position.
*
* If the return value is true, *pageno returns the next or current page
* of the scan (depending on the scan direction). An invalid block number
Expand Down Expand Up @@ -656,16 +662,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
SpinLockAcquire(&btscan->btps_mutex);
pageStatus = btscan->btps_pageStatus;

if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
if (so->numPrimScans < btscan->btps_numPrimScans)
{
/* Parallel scan has already advanced to a new set of scankeys. */
/* Top-level scan already moved on to next primitive index scan */
status = false;
}
else if (pageStatus == BTPARALLEL_DONE)
{
/*
* We're done with this set of scankeys. This may be the end, or
* there could be more sets to try.
* We're done with this primitive index scan. This might have
* been the final primitive index scan required, or the top-level
* index scan might require additional primitive scans.
*/
status = false;
}
Expand Down Expand Up @@ -697,9 +704,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;

Assert(!so->needPrimScan);

btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);

Expand Down Expand Up @@ -733,12 +743,11 @@ _bt_parallel_done(IndexScanDesc scan)
parallel_scan->ps_offset);

/*
* Mark the parallel scan as done for this combination of scan keys,
* unless some other process already did so. See also
* _bt_advance_array_keys.
* Mark the primitive index scan as done, unless some other process
* already did so. See also _bt_array_keys_remain.
*/
SpinLockAcquire(&btscan->btps_mutex);
if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
if (so->numPrimScans >= btscan->btps_numPrimScans &&
btscan->btps_pageStatus != BTPARALLEL_DONE)
{
btscan->btps_pageStatus = BTPARALLEL_DONE;
Expand All @@ -752,14 +761,14 @@ _bt_parallel_done(IndexScanDesc scan)
}

/*
* _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
* keys.
* _bt_parallel_next_primitive_scan() -- Advances parallel primitive scan
* counter when array keys are in use.
*
* Updates the count of array keys processed for both local and parallel
* Updates the count of primitive index scans for both local and parallel
* scans.
*/
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
_bt_parallel_next_primitive_scan(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
Expand All @@ -768,13 +777,13 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);

so->arrayKeyCount++;
so->numPrimScans++;
SpinLockAcquire(&btscan->btps_mutex);
if (btscan->btps_pageStatus == BTPARALLEL_DONE)
{
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
btscan->btps_arrayKeyCount++;
btscan->btps_numPrimScans++;
}
SpinLockRelease(&btscan->btps_mutex);
}
Expand Down
92 changes: 65 additions & 27 deletions src/backend/access/nbtree/nbtsearch.c
Expand Up @@ -893,7 +893,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
*/
if (!so->qual_ok)
{
/* Notify any other workers that we're done with this scan key. */
/* Notify any other workers that this primitive scan is done */
_bt_parallel_done(scan);
return false;
}
Expand Down Expand Up @@ -952,6 +952,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* one we use --- by definition, they are either redundant or
* contradictory.
*
* When SK_SEARCHARRAY keys are in use, _bt_tuple_before_array_keys is
* used to avoid prematurely stopping the scan when an array equality qual
* has its array keys advanced.
*
* Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
* If the index stores nulls at the end of the index we'll be starting
* from, and we have no boundary key for the column (which means the key
Expand Down Expand Up @@ -1537,9 +1541,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
BTReadPageState pstate;
int itemIndex;
bool continuescan;
int indnatts;
bool requiredMatchedByPrecheck;

/*
Expand All @@ -1560,8 +1563,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
}

continuescan = true; /* default assumption */
indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
pstate.dir = dir;
pstate.finaltup = NULL;
pstate.continuescan = true; /* default assumption */
pstate.finaltupchecked = false;

minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);

Expand Down Expand Up @@ -1609,9 +1615,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
* the last item on the page would give a more precise answer.
*
* We skip this for the first page in the scan to evade the possible
* slowdown of the point queries.
* slowdown of the point queries. Do the same with scans with array keys,
* since that makes the optimization unsafe (our search-type scan keys can
* change during any call to _bt_checkkeys whenever array keys are used).
*/
if (!so->firstPage && minoff < maxoff)
if (!so->firstPage && minoff < maxoff && !so->numArrayKeys)
{
ItemId iid;
IndexTuple itup;
Expand All @@ -1625,8 +1633,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
* set flag to true if all required keys are satisfied and false
* otherwise.
*/
(void) _bt_checkkeys(scan, itup, indnatts, dir,
&requiredMatchedByPrecheck, false);
_bt_checkkeys(scan, &pstate, itup, false, false);
requiredMatchedByPrecheck = pstate.continuescan;
pstate.continuescan = true; /* reset */
}
else
{
Expand All @@ -1636,6 +1645,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)

if (ScanDirectionIsForward(dir))
{
/* SK_SEARCHARRAY forward scans must provide high key up front */
if (so->numArrayKeys && !P_RIGHTMOST(opaque))
{
ItemId iid = PageGetItemId(page, P_HIKEY);

pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
}

/* load items[] in ascending order */
itemIndex = 0;

Expand All @@ -1659,17 +1676,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)

itup = (IndexTuple) PageGetItem(page, iid);

passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
&continuescan, requiredMatchedByPrecheck);
passes_quals = _bt_checkkeys(scan, &pstate, itup, false,
requiredMatchedByPrecheck);

/*
* If the result of prechecking required keys was true, then in
* assert-enabled builds we also recheck that the _bt_checkkeys()
* result is the same.
*/
Assert(!requiredMatchedByPrecheck ||
passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
&continuescan, false));
passes_quals == _bt_checkkeys(scan, &pstate, itup, false,
false));
if (passes_quals)
{
/* tuple passes all scan key conditions */
Expand Down Expand Up @@ -1703,7 +1720,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
}
}
/* When !continuescan, there can't be any more matches, so stop */
if (!continuescan)
if (!pstate.continuescan)
break;

offnum = OffsetNumberNext(offnum);
Expand All @@ -1720,17 +1737,23 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
* only appear on non-pivot tuples on the right sibling page are
* common.
*/
if (continuescan && !P_RIGHTMOST(opaque))
if (pstate.continuescan && !P_RIGHTMOST(opaque))
{
ItemId iid = PageGetItemId(page, P_HIKEY);
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
int truncatt;
IndexTuple itup;

truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
if (pstate.finaltup)
itup = pstate.finaltup;
else
{
ItemId iid = PageGetItemId(page, P_HIKEY);

itup = (IndexTuple) PageGetItem(page, iid);
}

_bt_checkkeys(scan, &pstate, itup, true, false);
}

if (!continuescan)
if (!pstate.continuescan)
so->currPos.moreRight = false;

Assert(itemIndex <= MaxTIDsPerBTreePage);
Expand All @@ -1740,6 +1763,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
}
else
{
/* SK_SEARCHARRAY backward scans must provide final tuple up front */
if (so->numArrayKeys && minoff < maxoff)
{
ItemId iid = PageGetItemId(page, minoff);

pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
}

/* load items[] in descending order */
itemIndex = MaxTIDsPerBTreePage;

Expand All @@ -1751,6 +1782,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
IndexTuple itup;
bool tuple_alive;
bool passes_quals;
bool finaltup = (offnum == minoff);

/*
* If the scan specifies not to return killed tuples, then we
Expand All @@ -1761,12 +1793,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
* tuple on the page, we do check the index keys, to prevent
* uselessly advancing to the page to the left. This is similar
* to the high key optimization used by forward scans.
*
* Separately, _bt_checkkeys actually requires that we call it
* with the final non-pivot tuple from the page, if there's one
* (final processed tuple, or first tuple in offset number terms).
* We must indicate which particular tuple comes last, too.
*/
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
{
Assert(offnum >= P_FIRSTDATAKEY(opaque));
if (offnum > P_FIRSTDATAKEY(opaque))
if (!finaltup)
{
Assert(offnum > minoff);
offnum = OffsetNumberPrev(offnum);
continue;
}
Expand All @@ -1778,17 +1816,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)

itup = (IndexTuple) PageGetItem(page, iid);

passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
&continuescan, requiredMatchedByPrecheck);
passes_quals = _bt_checkkeys(scan, &pstate, itup, finaltup,
requiredMatchedByPrecheck);

/*
* If the result of prechecking required keys was true, then in
* assert-enabled builds we also recheck that the _bt_checkkeys()
* result is the same.
*/
Assert(!requiredMatchedByPrecheck ||
passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
&continuescan, false));
passes_quals == _bt_checkkeys(scan, &pstate, itup,
finaltup, false));
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions */
Expand Down Expand Up @@ -1827,7 +1865,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
}
}
}
if (!continuescan)
if (!pstate.continuescan)
{
/* there can't be any more matches, so stop */
so->currPos.moreLeft = false;
Expand Down
1,472 changes: 1,401 additions & 71 deletions src/backend/access/nbtree/nbtutils.c

Large diffs are not rendered by default.

86 changes: 16 additions & 70 deletions src/backend/optimizer/path/indxpath.c
Expand Up @@ -106,8 +106,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel,
IndexOptInfo *index, IndexClauseSet *clauses,
bool useful_predicate,
ScanTypeControl scantype,
bool *skip_nonnative_saop,
bool *skip_lower_saop);
bool *skip_nonnative_saop);
static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
List *clauses, List *other_clauses);
static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
Expand Down Expand Up @@ -706,8 +705,6 @@ eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids,
* index AM supports them natively, we should just include them in simple
* index paths. If not, we should exclude them while building simple index
* paths, and then make a separate attempt to include them in bitmap paths.
* Furthermore, we should consider excluding lower-order ScalarArrayOpExpr
* quals so as to create ordered paths.
*/
static void
get_index_paths(PlannerInfo *root, RelOptInfo *rel,
Expand All @@ -716,37 +713,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
{
List *indexpaths;
bool skip_nonnative_saop = false;
bool skip_lower_saop = false;
ListCell *lc;

/*
* Build simple index paths using the clauses. Allow ScalarArrayOpExpr
* clauses only if the index AM supports them natively, and skip any such
* clauses for index columns after the first (so that we produce ordered
* paths if possible).
* clauses only if the index AM supports them natively.
*/
indexpaths = build_index_paths(root, rel,
index, clauses,
index->predOK,
ST_ANYSCAN,
&skip_nonnative_saop,
&skip_lower_saop);

/*
* If we skipped any lower-order ScalarArrayOpExprs on an index with an AM
* that supports them, then try again including those clauses. This will
* produce paths with more selectivity but no ordering.
*/
if (skip_lower_saop)
{
indexpaths = list_concat(indexpaths,
build_index_paths(root, rel,
index, clauses,
index->predOK,
ST_ANYSCAN,
&skip_nonnative_saop,
NULL));
}
&skip_nonnative_saop);

/*
* Submit all the ones that can form plain IndexScan plans to add_path. (A
Expand Down Expand Up @@ -784,7 +761,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
index, clauses,
false,
ST_BITMAPSCAN,
NULL,
NULL);
*bitindexpaths = list_concat(*bitindexpaths, indexpaths);
}
Expand Down Expand Up @@ -817,27 +793,19 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
* to true if we found any such clauses (caller must initialize the variable
* to false). If it's NULL, we do not ignore ScalarArrayOpExpr clauses.
*
* If skip_lower_saop is non-NULL, we ignore ScalarArrayOpExpr clauses for
* non-first index columns, and we set *skip_lower_saop to true if we found
* any such clauses (caller must initialize the variable to false). If it's
* NULL, we do not ignore non-first ScalarArrayOpExpr clauses, but they will
* result in considering the scan's output to be unordered.
*
* 'rel' is the index's heap relation
* 'index' is the index for which we want to generate paths
* 'clauses' is the collection of indexable clauses (IndexClause nodes)
* 'useful_predicate' indicates whether the index has a useful predicate
* 'scantype' indicates whether we need plain or bitmap scan support
* 'skip_nonnative_saop' indicates whether to accept SAOP if index AM doesn't
* 'skip_lower_saop' indicates whether to accept non-first-column SAOP
*/
static List *
build_index_paths(PlannerInfo *root, RelOptInfo *rel,
IndexOptInfo *index, IndexClauseSet *clauses,
bool useful_predicate,
ScanTypeControl scantype,
bool *skip_nonnative_saop,
bool *skip_lower_saop)
bool *skip_nonnative_saop)
{
List *result = NIL;
IndexPath *ipath;
Expand All @@ -848,7 +816,6 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
List *orderbyclausecols;
List *index_pathkeys;
List *useful_pathkeys;
bool found_lower_saop_clause;
bool pathkeys_possibly_useful;
bool index_is_ordered;
bool index_only_scan;
Expand Down Expand Up @@ -880,19 +847,11 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
* on by btree and possibly other places.) The list can be empty, if the
* index AM allows that.
*
* found_lower_saop_clause is set true if we accept a ScalarArrayOpExpr
* index clause for a non-first index column. This prevents us from
* assuming that the scan result is ordered. (Actually, the result is
* still ordered if there are equality constraints for all earlier
* columns, but it seems too expensive and non-modular for this code to be
* aware of that refinement.)
*
* We also build a Relids set showing which outer rels are required by the
* selected clauses. Any lateral_relids are included in that, but not
* otherwise accounted for.
*/
index_clauses = NIL;
found_lower_saop_clause = false;
outer_relids = bms_copy(rel->lateral_relids);
for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++)
{
Expand All @@ -903,30 +862,20 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
IndexClause *iclause = (IndexClause *) lfirst(lc);
RestrictInfo *rinfo = iclause->rinfo;

/* We might need to omit ScalarArrayOpExpr clauses */
if (IsA(rinfo->clause, ScalarArrayOpExpr))
/*
* We might need to omit ScalarArrayOpExpr clauses when index AM
* lacks native support
*/
if (!index->amsearcharray && IsA(rinfo->clause, ScalarArrayOpExpr))
{
if (!index->amsearcharray)
{
if (skip_nonnative_saop)
{
/* Ignore because not supported by index */
*skip_nonnative_saop = true;
continue;
}
/* Caller had better intend this only for bitmap scan */
Assert(scantype == ST_BITMAPSCAN);
}
if (indexcol > 0)
if (skip_nonnative_saop)
{
if (skip_lower_saop)
{
/* Caller doesn't want to lose index ordering */
*skip_lower_saop = true;
continue;
}
found_lower_saop_clause = true;
/* Ignore because not supported by index */
*skip_nonnative_saop = true;
continue;
}
/* Caller had better intend this only for bitmap scan */
Assert(scantype == ST_BITMAPSCAN);
}

/* OK to include this clause */
Expand Down Expand Up @@ -956,11 +905,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
/*
* 2. Compute pathkeys describing index's ordering, if any, then see how
* many of them are actually useful for this query. This is not relevant
* if we are only trying to build bitmap indexscans, nor if we have to
* assume the scan is unordered.
* if we are only trying to build bitmap indexscans.
*/
pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN &&
!found_lower_saop_clause &&
has_useful_pathkeys(root, rel));
index_is_ordered = (index->sortopfamily != NULL);
if (index_is_ordered && pathkeys_possibly_useful)
Expand Down Expand Up @@ -1212,7 +1159,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
index, &clauseset,
useful_predicate,
ST_BITMAPSCAN,
NULL,
NULL);
result = list_concat(result, indexpaths);
}
Expand Down
122 changes: 67 additions & 55 deletions src/backend/utils/adt/selfuncs.c
Expand Up @@ -6444,8 +6444,6 @@ genericcostestimate(PlannerInfo *root,
double numIndexTuples;
double spc_random_page_cost;
double num_sa_scans;
double num_outer_scans;
double num_scans;
double qual_op_cost;
double qual_arg_cost;
List *selectivityQuals;
Expand All @@ -6460,7 +6458,7 @@ genericcostestimate(PlannerInfo *root,

/*
* Check for ScalarArrayOpExpr index quals, and estimate the number of
* index scans that will be performed.
* primitive index scans that will be performed for caller
*/
num_sa_scans = 1;
foreach(l, indexQuals)
Expand Down Expand Up @@ -6490,19 +6488,8 @@ genericcostestimate(PlannerInfo *root,
*/
numIndexTuples = costs->numIndexTuples;
if (numIndexTuples <= 0.0)
{
numIndexTuples = indexSelectivity * index->rel->tuples;

/*
* The above calculation counts all the tuples visited across all
* scans induced by ScalarArrayOpExpr nodes. We want to consider the
* average per-indexscan number, so adjust. This is a handy place to
* round to integer, too. (If caller supplied tuple estimate, it's
* responsible for handling these considerations.)
*/
numIndexTuples = rint(numIndexTuples / num_sa_scans);
}

/*
* We can bound the number of tuples by the index size in any case. Also,
* always estimate at least one tuple is touched, even when
Expand Down Expand Up @@ -6540,27 +6527,31 @@ genericcostestimate(PlannerInfo *root,
*
* The above calculations are all per-index-scan. However, if we are in a
* nestloop inner scan, we can expect the scan to be repeated (with
* different search keys) for each row of the outer relation. Likewise,
* ScalarArrayOpExpr quals result in multiple index scans. This creates
* the potential for cache effects to reduce the number of disk page
* fetches needed. We want to estimate the average per-scan I/O cost in
* the presence of caching.
* different search keys) for each row of the outer relation. This
* creates the potential for cache effects to reduce the number of disk
* page fetches needed. We want to estimate the average per-scan I/O cost
* in the presence of caching.
*
* We use the Mackert-Lohman formula (see costsize.c for details) to
* estimate the total number of page fetches that occur. While this
* wasn't what it was designed for, it seems a reasonable model anyway.
* Note that we are counting pages not tuples anymore, so we take N = T =
* index size, as if there were one "tuple" per page.
*
* Note: we assume that there will be no repeat index page fetches across
* ScalarArrayOpExpr primitive scans from the same logical index scan.
* This is guaranteed to be true for btree indexes, but is very optimistic
* with index AMs that cannot natively execute ScalarArrayOpExpr quals.
* However, these same index AMs also accept our default pessimistic
* approach to counting num_sa_scans (btree caller caps this), so we don't
* expect the final indexTotalCost to be wildly over-optimistic.
*/
num_outer_scans = loop_count;
num_scans = num_sa_scans * num_outer_scans;

if (num_scans > 1)
if (loop_count > 1)
{
double pages_fetched;

/* total page fetches ignoring cache effects */
pages_fetched = numIndexPages * num_scans;
pages_fetched = numIndexPages * loop_count;

/* use Mackert and Lohman formula to adjust for cache effects */
pages_fetched = index_pages_fetched(pages_fetched,
Expand All @@ -6570,11 +6561,9 @@ genericcostestimate(PlannerInfo *root,

/*
* Now compute the total disk access cost, and then report a pro-rated
* share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr,
* since that's internal to the indexscan.)
* share for each outer scan
*/
indexTotalCost = (pages_fetched * spc_random_page_cost)
/ num_outer_scans;
indexTotalCost = (pages_fetched * spc_random_page_cost) / loop_count;
}
else
{
Expand All @@ -6590,10 +6579,8 @@ genericcostestimate(PlannerInfo *root,
* evaluated once at the start of the scan to reduce them to runtime keys
* to pass to the index AM (see nodeIndexscan.c). We model the per-tuple
* CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
* indexqual operator. Because we have numIndexTuples as a per-scan
* number, we have to multiply by num_sa_scans to get the correct result
* for ScalarArrayOpExpr cases. Similarly add in costs for any index
* ORDER BY expressions.
* indexqual operator. Similarly add in costs for any index ORDER BY
* expressions.
*
* Note: this neglects the possible costs of rechecking lossy operators.
* Detecting that that might be needed seems more expensive than it's
Expand All @@ -6606,7 +6593,7 @@ genericcostestimate(PlannerInfo *root,

indexStartupCost = qual_arg_cost;
indexTotalCost += qual_arg_cost;
indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost);

/*
* Generic assumption about index correlation: there isn't any.
Expand Down Expand Up @@ -6684,7 +6671,6 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
bool eqQualHere;
bool found_saop;
bool found_is_null_op;
double num_sa_scans;
ListCell *lc;

/*
Expand All @@ -6699,17 +6685,12 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
*
* For a RowCompareExpr, we consider only the first column, just as
* rowcomparesel() does.
*
* If there's a ScalarArrayOpExpr in the quals, we'll actually perform N
* index scans not one, but the ScalarArrayOpExpr's operator can be
* considered to act the same as it normally does.
*/
indexBoundQuals = NIL;
indexcol = 0;
eqQualHere = false;
found_saop = false;
found_is_null_op = false;
num_sa_scans = 1;
foreach(lc, path->indexclauses)
{
IndexClause *iclause = lfirst_node(IndexClause, lc);
Expand Down Expand Up @@ -6749,14 +6730,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
else if (IsA(clause, ScalarArrayOpExpr))
{
ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
Node *other_operand = (Node *) lsecond(saop->args);
int alength = estimate_array_length(other_operand);

clause_op = saop->opno;
found_saop = true;
/* count number of SA scans induced by indexBoundQuals only */
if (alength > 1)
num_sa_scans *= alength;
}
else if (IsA(clause, NullTest))
{
Expand Down Expand Up @@ -6816,13 +6792,6 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
JOIN_INNER,
NULL);
numIndexTuples = btreeSelectivity * index->rel->tuples;

/*
* As in genericcostestimate(), we have to adjust for any
* ScalarArrayOpExpr quals included in indexBoundQuals, and then round
* to integer.
*/
numIndexTuples = rint(numIndexTuples / num_sa_scans);
}

/*
Expand All @@ -6832,16 +6801,58 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,

genericcostestimate(root, path, loop_count, &costs);

/*
* Now compensate for btree's ability to efficiently execute scans with
* SAOP clauses.
*
* btree automatically combines individual ScalarArrayOpExpr primitive
* index scans whenever the tuples covered by the next set of array keys
* are close to tuples covered by the current set. This makes the final
* number of descents particularly difficult to estimate. However, btree
* scans never visit any single leaf page more than once. That puts a
* natural floor under the worst case number of descents.
*
* It's particularly important that we not wildly overestimate the number
* of descents needed for a clause list with several SAOPs -- the costs
* really aren't multiplicative in the way genericcostestimate expects. In
* general, most distinct combinations of SAOP keys will tend to not find
* any matching tuples. Furthermore, btree scans search for the next set
* of array keys using the next tuple in line, and so won't even need a
* direct comparison to eliminate most non-matching sets of array keys.
*
* Clamp the number of descents to the estimated number of leaf page
* visits. This is still fairly pessimistic, but tends to result in more
* accurate costing of scans with several SAOP clauses -- especially when
* each array has more than a few elements. The cost of adding additional
* array constants to a low-order SAOP column should saturate past a
* certain point (except where selectivity estimates continue to shift).
*
* Also clamp the number of descents to 1/3 the number of index pages.
* This avoids implausibly high estimates with low selectivity paths,
* where scans frequently require no more than one or two descents.
*
* XXX Ideally, we'd also account for the fact that non-boundary SAOP
* clause quals (which the B-Tree code uses "non-required" scan keys for)
* won't actually contribute to the total number of descents of the index.
* This would require pushing down more context into genericcostestimate.
*/
if (costs.num_sa_scans > 1)
{
costs.num_sa_scans = Min(costs.num_sa_scans, costs.numIndexPages);
costs.num_sa_scans = Min(costs.num_sa_scans, index->pages / 3);
costs.num_sa_scans = Max(costs.num_sa_scans, 1);
}

/*
* Add a CPU-cost component to represent the costs of initial btree
* descent. We don't charge any I/O cost for touching upper btree levels,
* since they tend to stay in cache, but we still have to do about log2(N)
* comparisons to descend a btree of N leaf tuples. We charge one
* cpu_operator_cost per comparison.
*
* If there are ScalarArrayOpExprs, charge this once per SA scan. The
* ones after the first one are not startup cost so far as the overall
* plan is concerned, so add them only to "total" cost.
* If there are ScalarArrayOpExprs, charge this once per estimated
* primitive SA scan. The ones after the first one are not startup cost
* so far as the overall plan goes, so just add them to "total" cost.
*/
if (index->tuples > 1) /* avoid computing log(0) */
{
Expand All @@ -6858,7 +6869,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
* in cases where only a single leaf page is expected to be visited. This
* cost is somewhat arbitrarily set at 50x cpu_operator_cost per page
* touched. The number of such pages is btree tree height plus one (ie,
* we charge for the leaf page too). As above, charge once per SA scan.
* we charge for the leaf page too). As above, charge once per estimated
* primitive SA scan.
*/
descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost;
costs.indexStartupCost += descentCost;
Expand Down
42 changes: 33 additions & 9 deletions src/include/access/nbtree.h
Expand Up @@ -965,7 +965,7 @@ typedef struct BTScanPosData
* moreLeft and moreRight track whether we think there may be matching
* index entries to the left and right of the current page, respectively.
* We can clear the appropriate one of these flags when _bt_checkkeys()
* returns continuescan = false.
* sets BTReadPageState.continuescan = false.
*/
bool moreLeft;
bool moreRight;
Expand Down Expand Up @@ -1043,13 +1043,13 @@ typedef struct BTScanOpaqueData

/* workspace for SK_SEARCHARRAY support */
ScanKey arrayKeyData; /* modified copy of scan->keyData */
bool arraysStarted; /* Started array keys, but have yet to "reach
* past the end" of all arrays? */
int numArrayKeys; /* number of equality-type array keys (-1 if
* there are any unsatisfiable array keys) */
int arrayKeyCount; /* count indicating number of array scan keys
* processed */
bool needPrimScan; /* Perform another primitive scan? */
BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */
FmgrInfo *orderProcs; /* ORDER procs for equality constraint keys */
int numPrimScans; /* Running tally of # primitive index scans
* (used to coordinate parallel workers) */
MemoryContext arrayContext; /* scan-lifespan context for array data */

/* info about killed items if any (killedItems is NULL if never used) */
Expand Down Expand Up @@ -1083,13 +1083,37 @@ typedef struct BTScanOpaqueData

typedef BTScanOpaqueData *BTScanOpaque;

/*
* _bt_readpage state used across _bt_checkkeys calls for a page
*
* When _bt_readpage is called during a forward scan that has one or more
* equality-type SK_SEARCHARRAY scan keys, it has an extra responsibility: to
* set up information about the final tuple from the page. This must happen
* before the first call to _bt_checkkeys. _bt_checkkeys uses the final tuple
* to manage advancement of the scan's array keys more efficiently.
*/
typedef struct BTReadPageState
{
/* Input parameters, set by _bt_readpage before the first _bt_checkkeys call */
ScanDirection dir; /* current scan direction */
IndexTuple finaltup; /* final tuple (high key for forward scans) */

/* Output parameters, set by _bt_checkkeys */
bool continuescan; /* Terminate ongoing (primitive) index scan? */

/*
 * Private _bt_checkkeys-managed state.  NOTE(review): presumably set once
 * finaltup has been compared against the scan's current array keys, so
 * that later _bt_checkkeys calls for the same page can skip a redundant
 * recheck -- confirm against the _bt_checkkeys implementation.
 */
bool finaltupchecked; /* final tuple checked against current
 * SK_SEARCHARRAY array keys? */
} BTReadPageState;

/*
* We use some private sk_flags bits in preprocessed scan keys. We're allowed
* to use bits 16-31 (see skey.h). The uppermost bits are copied from the
* index's indoption[] array entry for the index attribute.
*/
#define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */
#define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */
#define SK_BT_RDDNARRAY 0x00040000 /* redundant in array preprocessing */
#define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */
#define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
Expand Down Expand Up @@ -1160,7 +1184,7 @@ extern bool btcanreturn(Relation index, int attno);
extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
extern void _bt_parallel_next_primitive_scan(IndexScanDesc scan);

/*
* prototypes for functions in nbtdedup.c
Expand Down Expand Up @@ -1253,12 +1277,12 @@ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_array_keys(IndexScanDesc scan);
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_array_keys_remain(IndexScanDesc scan, ScanDirection dir);
extern void _bt_mark_array_keys(IndexScanDesc scan);
extern void _bt_restore_array_keys(IndexScanDesc scan);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
int tupnatts, ScanDirection dir, bool *continuescan,
extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate,
IndexTuple tuple, bool finaltup,
bool requiredMatchedByPrecheck);
extern void _bt_killitems(IndexScanDesc scan);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
Expand Down
329 changes: 329 additions & 0 deletions src/test/regress/expected/benchmark_dynamic_saop_advancement.out
@@ -0,0 +1,329 @@
set work_mem='100MB';
set effective_io_concurrency=100;
set effective_cache_size='24GB';
set maintenance_io_concurrency=100;
set random_page_cost=2.0;
set track_io_timing to off;
set enable_seqscan to off;
set client_min_messages=error;
set log_btree_verbosity=1;
set vacuum_freeze_min_age = 0;
create extension if not exists pageinspect; -- just to have it
reset client_min_messages;
---------------------------------------------------
-- ORDER BY column comes first, SAOPs after that --
---------------------------------------------------
set client_min_messages=error;
drop table if exists docs_testcase;
reset client_min_messages;
select setseed(0.12345); -- Need deterministic test case
setseed
---------

(1 row)

create unlogged table docs_testcase
(
id serial,
type text default 'pdf' not null,
status text not null,
sender_reference text not null,
sent_at timestamptz,
created_at timestamptz default '2000-01-01' not null
);
create index mini_idx on docs_testcase using btree(sent_at desc NULLS last, sender_reference, status);
insert into docs_testcase(type, status, sender_reference, sent_at)
select
('{pdf,doc,raw}'::text[]) [ceil(random() * 3)],
('{sent,draft,suspended}'::text[]) [ceil(random() * 3)],
('{Custom,Client}'::text[]) [ceil(random() * 2)] || '/' || floor(random() * 2000),
('2000-01-01'::timestamptz - interval '2 years' * random())::timestamptz
from
generate_series(1, 100000) g;
vacuum analyze docs_testcase;
-- Index scan:
set enable_bitmapscan to off;
set enable_indexonlyscan to off;
set enable_indexscan to on;
set enable_sort to off;
select * from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
id | type | status | sender_reference | sent_at | created_at
-------+------+--------+------------------+-----------------------------------+------------------------------
45274 | pdf | draft | Custom/1175 | Tue Dec 14 05:08:36.2976 1999 PST | Sat Jan 01 00:00:00 2000 PST
78376 | doc | draft | Custom/280 | Fri Dec 10 14:32:53.1744 1999 PST | Sat Jan 01 00:00:00 2000 PST
71132 | raw | sent | Custom/280 | Tue Nov 30 15:15:26.1216 1999 PST | Sat Jan 01 00:00:00 2000 PST
1169 | doc | sent | Client/362 | Sat Nov 06 12:37:00.912 1999 PST | Sat Jan 01 00:00:00 2000 PST
30989 | raw | draft | Custom/1175 | Fri Nov 05 22:13:37.8912 1999 PST | Sat Jan 01 00:00:00 2000 PST
60560 | pdf | sent | Client/362 | Wed Oct 20 02:43:51.5424 1999 PDT | Sat Jan 01 00:00:00 2000 PST
17880 | raw | draft | Custom/280 | Mon Oct 18 19:34:46.9344 1999 PDT | Sat Jan 01 00:00:00 2000 PST
28432 | raw | draft | Custom/1175 | Sat Oct 09 01:40:28.4736 1999 PDT | Sat Jan 01 00:00:00 2000 PST
64704 | doc | draft | Custom/1175 | Sat Oct 02 22:00:53.424 1999 PDT | Sat Jan 01 00:00:00 2000 PST
42981 | raw | sent | Client/362 | Sun Sep 19 13:52:16.3488 1999 PDT | Sat Jan 01 00:00:00 2000 PST
80576 | doc | sent | Client/362 | Sat Sep 11 08:07:20.0064 1999 PDT | Sat Jan 01 00:00:00 2000 PST
98082 | raw | sent | Client/362 | Tue Aug 03 19:55:01.0272 1999 PDT | Sat Jan 01 00:00:00 2000 PST
69574 | pdf | sent | Client/362 | Fri Jul 02 02:42:07.9488 1999 PDT | Sat Jan 01 00:00:00 2000 PST
56977 | pdf | draft | Custom/1175 | Tue Jun 29 11:49:05.952 1999 PDT | Sat Jan 01 00:00:00 2000 PST
16531 | doc | sent | Custom/1175 | Wed Jun 02 10:17:59.0784 1999 PDT | Sat Jan 01 00:00:00 2000 PST
98135 | doc | sent | Custom/280 | Tue Jun 01 10:18:29.664 1999 PDT | Sat Jan 01 00:00:00 2000 PST
33864 | raw | sent | Custom/280 | Mon May 17 08:14:59.6544 1999 PDT | Sat Jan 01 00:00:00 2000 PST
16260 | pdf | draft | Custom/280 | Tue May 11 05:21:10.0512 1999 PDT | Sat Jan 01 00:00:00 2000 PST
33186 | raw | sent | Custom/280 | Sat Apr 17 23:56:21.1488 1999 PDT | Sat Jan 01 00:00:00 2000 PST
94130 | raw | sent | Client/362 | Fri Apr 09 17:23:23.0208 1999 PDT | Sat Jan 01 00:00:00 2000 PST
(20 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select * from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------
Limit (actual rows=20 loops=1)
Buffers: shared hit=293
-> Index Scan using mini_idx on docs_testcase (actual rows=20 loops=1)
Index Cond: ((sender_reference = ANY ('{Custom/1175,Client/362,Custom/280}'::text[])) AND (status = ANY ('{draft,sent}'::text[])))
Buffers: shared hit=293
(5 rows)

-- Index-only scan:
set enable_bitmapscan to off;
set enable_indexonlyscan to on;
set enable_indexscan to off;
select sent_at, status, sender_reference from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
sent_at | status | sender_reference
-----------------------------------+--------+------------------
Tue Dec 14 05:08:36.2976 1999 PST | draft | Custom/1175
Fri Dec 10 14:32:53.1744 1999 PST | draft | Custom/280
Tue Nov 30 15:15:26.1216 1999 PST | sent | Custom/280
Sat Nov 06 12:37:00.912 1999 PST | sent | Client/362
Fri Nov 05 22:13:37.8912 1999 PST | draft | Custom/1175
Wed Oct 20 02:43:51.5424 1999 PDT | sent | Client/362
Mon Oct 18 19:34:46.9344 1999 PDT | draft | Custom/280
Sat Oct 09 01:40:28.4736 1999 PDT | draft | Custom/1175
Sat Oct 02 22:00:53.424 1999 PDT | draft | Custom/1175
Sun Sep 19 13:52:16.3488 1999 PDT | sent | Client/362
Sat Sep 11 08:07:20.0064 1999 PDT | sent | Client/362
Tue Aug 03 19:55:01.0272 1999 PDT | sent | Client/362
Fri Jul 02 02:42:07.9488 1999 PDT | sent | Client/362
Tue Jun 29 11:49:05.952 1999 PDT | draft | Custom/1175
Wed Jun 02 10:17:59.0784 1999 PDT | sent | Custom/1175
Tue Jun 01 10:18:29.664 1999 PDT | sent | Custom/280
Mon May 17 08:14:59.6544 1999 PDT | sent | Custom/280
Tue May 11 05:21:10.0512 1999 PDT | draft | Custom/280
Sat Apr 17 23:56:21.1488 1999 PDT | sent | Custom/280
Fri Apr 09 17:23:23.0208 1999 PDT | sent | Client/362
(20 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select sent_at, status, sender_reference from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------
Limit (actual rows=20 loops=1)
Buffers: shared hit=274
-> Index Only Scan using mini_idx on docs_testcase (actual rows=20 loops=1)
Index Cond: ((sender_reference = ANY ('{Custom/1175,Client/362,Custom/280}'::text[])) AND (status = ANY ('{draft,sent}'::text[])))
Heap Fetches: 0
Buffers: shared hit=274
(6 rows)

-------------------------------
-- James Coleman's test case --
-------------------------------
-- Per https://www.postgresql.org/message-id/flat/CAAaqYe-SsHgXKXPpjn7WCTUnB_RQSxXjpSaJd32aA%3DRquv0AgQ%40mail.gmail.com,
-- though I'm going to use my own index definition for this
set client_min_messages=error;
drop table if exists coleman;
reset client_min_messages;
select setseed(0.12345); -- Need deterministic test case
setseed
---------

(1 row)

create unlogged table coleman(
bar_fk integer,
created_at timestamptz
);
-- Original index (commented out):
-- create index index_coleman_on_bar_fk_and_created_at on coleman(bar_fk, created_at);
-- my preferred index:
create index index_coleman_on_created_and_at_bar_fk on coleman(created_at, bar_fk);
insert into coleman(bar_fk, created_at)
select i % 1000, '2000-01-01'::timestamptz -(random() * '5 years'::interval)
from generate_series(1, 500000) t(i);
VACUUM (freeze,analyze) coleman;
-- Index-only scan:
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
bar_fk | created_at
--------+-----------------------------------
1 | Wed Jan 04 15:38:31.4592 1995 PST
2 | Fri Jan 06 18:28:15.9456 1995 PST
3 | Sun Jan 08 09:06:27.8496 1995 PST
1 | Sun Jan 08 20:26:58.7616 1995 PST
3 | Mon Jan 09 15:07:25.1328 1995 PST
3 | Mon Jan 09 21:07:17.3568 1995 PST
1 | Tue Jan 10 15:17:59.136 1995 PST
3 | Wed Jan 11 12:07:41.8944 1995 PST
1 | Thu Jan 12 17:31:03.8784 1995 PST
3 | Sat Jan 14 02:17:05.6256 1995 PST
3 | Sat Jan 14 20:26:57.552 1995 PST
1 | Thu Jan 19 02:48:00.5472 1995 PST
1 | Fri Jan 20 00:58:32.592 1995 PST
1 | Sat Jan 21 07:22:49.6416 1995 PST
3 | Sun Jan 22 07:42:59.9328 1995 PST
3 | Sun Jan 22 18:39:03.168 1995 PST
2 | Wed Jan 25 06:04:53.8464 1995 PST
1 | Wed Jan 25 17:15:11.9232 1995 PST
3 | Fri Jan 27 16:32:48.9984 1995 PST
2 | Sat Jan 28 10:05:32.496 1995 PST
1 | Sat Jan 28 11:33:58.6656 1995 PST
3 | Sat Jan 28 22:29:59.8272 1995 PST
3 | Sun Jan 29 01:44:30.2208 1995 PST
3 | Mon Jan 30 08:10:22.224 1995 PST
3 | Tue Jan 31 10:25:15.6576 1995 PST
3 | Thu Feb 02 15:07:34.2912 1995 PST
2 | Thu Feb 02 21:44:04.6176 1995 PST
3 | Sat Feb 04 11:36:36.3456 1995 PST
3 | Sun Feb 05 02:13:48.4608 1995 PST
3 | Sun Feb 05 15:29:52.7136 1995 PST
2 | Mon Feb 06 13:54:01.4112 1995 PST
3 | Fri Feb 10 15:09:38.5344 1995 PST
1 | Sat Feb 11 12:26:43.7568 1995 PST
1 | Sun Feb 12 18:59:15.2736 1995 PST
3 | Tue Feb 14 06:56:52.1088 1995 PST
2 | Sun Feb 19 11:52:44.8896 1995 PST
3 | Sun Feb 19 22:52:40.5408 1995 PST
1 | Tue Feb 21 05:01:27.2352 1995 PST
2 | Wed Feb 22 00:12:02.8224 1995 PST
3 | Wed Feb 22 21:44:54.2112 1995 PST
1 | Wed Feb 22 23:27:39.3696 1995 PST
2 | Thu Feb 23 03:20:10.2048 1995 PST
1 | Thu Feb 23 08:41:58.8768 1995 PST
2 | Thu Feb 23 18:51:52.9056 1995 PST
2 | Fri Feb 24 05:17:42.864 1995 PST
1 | Fri Feb 24 23:15:51.2352 1995 PST
3 | Sat Feb 25 01:49:57.504 1995 PST
2 | Mon Feb 27 04:20:08.16 1995 PST
3 | Mon Feb 27 15:10:04.4544 1995 PST
2 | Tue Feb 28 22:13:55.1712 1995 PST
(50 rows)

-- 76 buffer hits total for parity with master:
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
QUERY PLAN
--------------------------------------------------------------------------------------------------------
Limit (actual rows=50 loops=1)
Buffers: shared hit=76
-> Index Only Scan using index_coleman_on_created_and_at_bar_fk on coleman (actual rows=50 loops=1)
Index Cond: (bar_fk = ANY ('{1,2,3}'::integer[]))
Heap Fetches: 0
Buffers: shared hit=76
(6 rows)

alter table coleman add column noise int4; -- no more index only scans
-- Index scan:
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
bar_fk | created_at | noise
--------+-----------------------------------+-------
1 | Wed Jan 04 15:38:31.4592 1995 PST |
2 | Fri Jan 06 18:28:15.9456 1995 PST |
3 | Sun Jan 08 09:06:27.8496 1995 PST |
1 | Sun Jan 08 20:26:58.7616 1995 PST |
3 | Mon Jan 09 15:07:25.1328 1995 PST |
3 | Mon Jan 09 21:07:17.3568 1995 PST |
1 | Tue Jan 10 15:17:59.136 1995 PST |
3 | Wed Jan 11 12:07:41.8944 1995 PST |
1 | Thu Jan 12 17:31:03.8784 1995 PST |
3 | Sat Jan 14 02:17:05.6256 1995 PST |
3 | Sat Jan 14 20:26:57.552 1995 PST |
1 | Thu Jan 19 02:48:00.5472 1995 PST |
1 | Fri Jan 20 00:58:32.592 1995 PST |
1 | Sat Jan 21 07:22:49.6416 1995 PST |
3 | Sun Jan 22 07:42:59.9328 1995 PST |
3 | Sun Jan 22 18:39:03.168 1995 PST |
2 | Wed Jan 25 06:04:53.8464 1995 PST |
1 | Wed Jan 25 17:15:11.9232 1995 PST |
3 | Fri Jan 27 16:32:48.9984 1995 PST |
2 | Sat Jan 28 10:05:32.496 1995 PST |
1 | Sat Jan 28 11:33:58.6656 1995 PST |
3 | Sat Jan 28 22:29:59.8272 1995 PST |
3 | Sun Jan 29 01:44:30.2208 1995 PST |
3 | Mon Jan 30 08:10:22.224 1995 PST |
3 | Tue Jan 31 10:25:15.6576 1995 PST |
3 | Thu Feb 02 15:07:34.2912 1995 PST |
2 | Thu Feb 02 21:44:04.6176 1995 PST |
3 | Sat Feb 04 11:36:36.3456 1995 PST |
3 | Sun Feb 05 02:13:48.4608 1995 PST |
3 | Sun Feb 05 15:29:52.7136 1995 PST |
2 | Mon Feb 06 13:54:01.4112 1995 PST |
3 | Fri Feb 10 15:09:38.5344 1995 PST |
1 | Sat Feb 11 12:26:43.7568 1995 PST |
1 | Sun Feb 12 18:59:15.2736 1995 PST |
3 | Tue Feb 14 06:56:52.1088 1995 PST |
2 | Sun Feb 19 11:52:44.8896 1995 PST |
3 | Sun Feb 19 22:52:40.5408 1995 PST |
1 | Tue Feb 21 05:01:27.2352 1995 PST |
2 | Wed Feb 22 00:12:02.8224 1995 PST |
3 | Wed Feb 22 21:44:54.2112 1995 PST |
1 | Wed Feb 22 23:27:39.3696 1995 PST |
2 | Thu Feb 23 03:20:10.2048 1995 PST |
1 | Thu Feb 23 08:41:58.8768 1995 PST |
2 | Thu Feb 23 18:51:52.9056 1995 PST |
2 | Fri Feb 24 05:17:42.864 1995 PST |
1 | Fri Feb 24 23:15:51.2352 1995 PST |
3 | Sat Feb 25 01:49:57.504 1995 PST |
2 | Mon Feb 27 04:20:08.16 1995 PST |
3 | Mon Feb 27 15:10:04.4544 1995 PST |
2 | Tue Feb 28 22:13:55.1712 1995 PST |
(50 rows)

-- 125 buffer hits for patch, master does 16713 hits!:
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Limit (actual rows=50 loops=1)
Buffers: shared hit=125
-> Index Scan using index_coleman_on_created_and_at_bar_fk on coleman (actual rows=50 loops=1)
Index Cond: (bar_fk = ANY ('{1,2,3}'::integer[]))
Buffers: shared hit=125
(5 rows)

@@ -0,0 +1,334 @@
set work_mem='100MB';
set effective_io_concurrency=100;
set effective_cache_size='24GB';
set maintenance_io_concurrency=100;
set random_page_cost=2.0;
set track_io_timing to off;
set enable_seqscan to off;
set client_min_messages=error;
set log_btree_verbosity=1;
ERROR: unrecognized configuration parameter "log_btree_verbosity"
set vacuum_freeze_min_age = 0;
create extension if not exists pageinspect; -- just to have it
reset client_min_messages;
---------------------------------------------------
-- ORDER BY column comes first, SAOPs after that --
---------------------------------------------------
set client_min_messages=error;
drop table if exists docs_testcase;
reset client_min_messages;
select setseed(0.12345); -- Need deterministic test case
setseed
---------

(1 row)

create unlogged table docs_testcase
(
id serial,
type text default 'pdf' not null,
status text not null,
sender_reference text not null,
sent_at timestamptz,
created_at timestamptz default '2000-01-01' not null
);
create index mini_idx on docs_testcase using btree(sent_at desc NULLS last, sender_reference, status);
insert into docs_testcase(type, status, sender_reference, sent_at)
select
('{pdf,doc,raw}'::text[]) [ceil(random() * 3)],
('{sent,draft,suspended}'::text[]) [ceil(random() * 3)],
('{Custom,Client}'::text[]) [ceil(random() * 2)] || '/' || floor(random() * 2000),
('2000-01-01'::timestamptz - interval '2 years' * random())::timestamptz
from
generate_series(1, 100000) g;
vacuum analyze docs_testcase;
-- Index scan:
set enable_bitmapscan to off;
set enable_indexonlyscan to off;
set enable_indexscan to on;
set enable_sort to off;
select * from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
id | type | status | sender_reference | sent_at | created_at
-------+------+--------+------------------+-----------------------------------+------------------------------
45274 | pdf | draft | Custom/1175 | Tue Dec 14 05:08:36.2976 1999 PST | Sat Jan 01 00:00:00 2000 PST
78376 | doc | draft | Custom/280 | Fri Dec 10 14:32:53.1744 1999 PST | Sat Jan 01 00:00:00 2000 PST
71132 | raw | sent | Custom/280 | Tue Nov 30 15:15:26.1216 1999 PST | Sat Jan 01 00:00:00 2000 PST
1169 | doc | sent | Client/362 | Sat Nov 06 12:37:00.912 1999 PST | Sat Jan 01 00:00:00 2000 PST
30989 | raw | draft | Custom/1175 | Fri Nov 05 22:13:37.8912 1999 PST | Sat Jan 01 00:00:00 2000 PST
60560 | pdf | sent | Client/362 | Wed Oct 20 02:43:51.5424 1999 PDT | Sat Jan 01 00:00:00 2000 PST
17880 | raw | draft | Custom/280 | Mon Oct 18 19:34:46.9344 1999 PDT | Sat Jan 01 00:00:00 2000 PST
28432 | raw | draft | Custom/1175 | Sat Oct 09 01:40:28.4736 1999 PDT | Sat Jan 01 00:00:00 2000 PST
64704 | doc | draft | Custom/1175 | Sat Oct 02 22:00:53.424 1999 PDT | Sat Jan 01 00:00:00 2000 PST
42981 | raw | sent | Client/362 | Sun Sep 19 13:52:16.3488 1999 PDT | Sat Jan 01 00:00:00 2000 PST
80576 | doc | sent | Client/362 | Sat Sep 11 08:07:20.0064 1999 PDT | Sat Jan 01 00:00:00 2000 PST
98082 | raw | sent | Client/362 | Tue Aug 03 19:55:01.0272 1999 PDT | Sat Jan 01 00:00:00 2000 PST
69574 | pdf | sent | Client/362 | Fri Jul 02 02:42:07.9488 1999 PDT | Sat Jan 01 00:00:00 2000 PST
56977 | pdf | draft | Custom/1175 | Tue Jun 29 11:49:05.952 1999 PDT | Sat Jan 01 00:00:00 2000 PST
16531 | doc | sent | Custom/1175 | Wed Jun 02 10:17:59.0784 1999 PDT | Sat Jan 01 00:00:00 2000 PST
98135 | doc | sent | Custom/280 | Tue Jun 01 10:18:29.664 1999 PDT | Sat Jan 01 00:00:00 2000 PST
33864 | raw | sent | Custom/280 | Mon May 17 08:14:59.6544 1999 PDT | Sat Jan 01 00:00:00 2000 PST
16260 | pdf | draft | Custom/280 | Tue May 11 05:21:10.0512 1999 PDT | Sat Jan 01 00:00:00 2000 PST
33186 | raw | sent | Custom/280 | Sat Apr 17 23:56:21.1488 1999 PDT | Sat Jan 01 00:00:00 2000 PST
94130 | raw | sent | Client/362 | Fri Apr 09 17:23:23.0208 1999 PDT | Sat Jan 01 00:00:00 2000 PST
(20 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select * from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------
Limit (actual rows=20 loops=1)
Buffers: shared hit=36613
-> Index Scan using mini_idx on docs_testcase (actual rows=20 loops=1)
Filter: ((status = ANY ('{draft,sent}'::text[])) AND (sender_reference = ANY ('{Custom/1175,Client/362,Custom/280}'::text[])))
Rows Removed by Filter: 36365
Buffers: shared hit=36613
(6 rows)

-- Index-only scan:
set enable_bitmapscan to off;
set enable_indexonlyscan to on;
set enable_indexscan to off;
select sent_at, status, sender_reference from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
sent_at | status | sender_reference
-----------------------------------+--------+------------------
Tue Dec 14 05:08:36.2976 1999 PST | draft | Custom/1175
Fri Dec 10 14:32:53.1744 1999 PST | draft | Custom/280
Tue Nov 30 15:15:26.1216 1999 PST | sent | Custom/280
Sat Nov 06 12:37:00.912 1999 PST | sent | Client/362
Fri Nov 05 22:13:37.8912 1999 PST | draft | Custom/1175
Wed Oct 20 02:43:51.5424 1999 PDT | sent | Client/362
Mon Oct 18 19:34:46.9344 1999 PDT | draft | Custom/280
Sat Oct 09 01:40:28.4736 1999 PDT | draft | Custom/1175
Sat Oct 02 22:00:53.424 1999 PDT | draft | Custom/1175
Sun Sep 19 13:52:16.3488 1999 PDT | sent | Client/362
Sat Sep 11 08:07:20.0064 1999 PDT | sent | Client/362
Tue Aug 03 19:55:01.0272 1999 PDT | sent | Client/362
Fri Jul 02 02:42:07.9488 1999 PDT | sent | Client/362
Tue Jun 29 11:49:05.952 1999 PDT | draft | Custom/1175
Wed Jun 02 10:17:59.0784 1999 PDT | sent | Custom/1175
Tue Jun 01 10:18:29.664 1999 PDT | sent | Custom/280
Mon May 17 08:14:59.6544 1999 PDT | sent | Custom/280
Tue May 11 05:21:10.0512 1999 PDT | draft | Custom/280
Sat Apr 17 23:56:21.1488 1999 PDT | sent | Custom/280
Fri Apr 09 17:23:23.0208 1999 PDT | sent | Client/362
(20 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select sent_at, status, sender_reference from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------
Limit (actual rows=20 loops=1)
Buffers: shared hit=274
-> Index Only Scan using mini_idx on docs_testcase (actual rows=20 loops=1)
Filter: ((status = ANY ('{draft,sent}'::text[])) AND (sender_reference = ANY ('{Custom/1175,Client/362,Custom/280}'::text[])))
Rows Removed by Filter: 36365
Heap Fetches: 0
Buffers: shared hit=274
(7 rows)

-------------------------------
-- James Coleman's test case --
-------------------------------
-- Per https://www.postgresql.org/message-id/flat/CAAaqYe-SsHgXKXPpjn7WCTUnB_RQSxXjpSaJd32aA%3DRquv0AgQ%40mail.gmail.com,
-- though I'm going to use my own index definition for this
set client_min_messages=error;
drop table if exists coleman;
reset client_min_messages;
select setseed(0.12345); -- Need deterministic test case
setseed
---------

(1 row)

create unlogged table coleman(
bar_fk integer,
created_at timestamptz
);
-- Original index (commented out):
-- create index index_coleman_on_bar_fk_and_created_at on coleman(bar_fk, created_at);
-- my preferred index:
create index index_coleman_on_created_and_at_bar_fk on coleman(created_at, bar_fk);
insert into coleman(bar_fk, created_at)
select i % 1000, '2000-01-01'::timestamptz -(random() * '5 years'::interval)
from generate_series(1, 500000) t(i);
VACUUM (freeze,analyze) coleman;
-- Index-only scan:
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
bar_fk | created_at
--------+-----------------------------------
1 | Wed Jan 04 15:38:31.4592 1995 PST
2 | Fri Jan 06 18:28:15.9456 1995 PST
3 | Sun Jan 08 09:06:27.8496 1995 PST
1 | Sun Jan 08 20:26:58.7616 1995 PST
3 | Mon Jan 09 15:07:25.1328 1995 PST
3 | Mon Jan 09 21:07:17.3568 1995 PST
1 | Tue Jan 10 15:17:59.136 1995 PST
3 | Wed Jan 11 12:07:41.8944 1995 PST
1 | Thu Jan 12 17:31:03.8784 1995 PST
3 | Sat Jan 14 02:17:05.6256 1995 PST
3 | Sat Jan 14 20:26:57.552 1995 PST
1 | Thu Jan 19 02:48:00.5472 1995 PST
1 | Fri Jan 20 00:58:32.592 1995 PST
1 | Sat Jan 21 07:22:49.6416 1995 PST
3 | Sun Jan 22 07:42:59.9328 1995 PST
3 | Sun Jan 22 18:39:03.168 1995 PST
2 | Wed Jan 25 06:04:53.8464 1995 PST
1 | Wed Jan 25 17:15:11.9232 1995 PST
3 | Fri Jan 27 16:32:48.9984 1995 PST
2 | Sat Jan 28 10:05:32.496 1995 PST
1 | Sat Jan 28 11:33:58.6656 1995 PST
3 | Sat Jan 28 22:29:59.8272 1995 PST
3 | Sun Jan 29 01:44:30.2208 1995 PST
3 | Mon Jan 30 08:10:22.224 1995 PST
3 | Tue Jan 31 10:25:15.6576 1995 PST
3 | Thu Feb 02 15:07:34.2912 1995 PST
2 | Thu Feb 02 21:44:04.6176 1995 PST
3 | Sat Feb 04 11:36:36.3456 1995 PST
3 | Sun Feb 05 02:13:48.4608 1995 PST
3 | Sun Feb 05 15:29:52.7136 1995 PST
2 | Mon Feb 06 13:54:01.4112 1995 PST
3 | Fri Feb 10 15:09:38.5344 1995 PST
1 | Sat Feb 11 12:26:43.7568 1995 PST
1 | Sun Feb 12 18:59:15.2736 1995 PST
3 | Tue Feb 14 06:56:52.1088 1995 PST
2 | Sun Feb 19 11:52:44.8896 1995 PST
3 | Sun Feb 19 22:52:40.5408 1995 PST
1 | Tue Feb 21 05:01:27.2352 1995 PST
2 | Wed Feb 22 00:12:02.8224 1995 PST
3 | Wed Feb 22 21:44:54.2112 1995 PST
1 | Wed Feb 22 23:27:39.3696 1995 PST
2 | Thu Feb 23 03:20:10.2048 1995 PST
1 | Thu Feb 23 08:41:58.8768 1995 PST
2 | Thu Feb 23 18:51:52.9056 1995 PST
2 | Fri Feb 24 05:17:42.864 1995 PST
1 | Fri Feb 24 23:15:51.2352 1995 PST
3 | Sat Feb 25 01:49:57.504 1995 PST
2 | Mon Feb 27 04:20:08.16 1995 PST
3 | Mon Feb 27 15:10:04.4544 1995 PST
2 | Tue Feb 28 22:13:55.1712 1995 PST
(50 rows)

-- 76 buffer hits total for parity with master:
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
QUERY PLAN
--------------------------------------------------------------------------------------------------------
Limit (actual rows=50 loops=1)
Buffers: shared hit=76
-> Index Only Scan using index_coleman_on_created_and_at_bar_fk on coleman (actual rows=50 loops=1)
Filter: (bar_fk = ANY ('{1,2,3}'::integer[]))
Rows Removed by Filter: 16597
Heap Fetches: 0
Buffers: shared hit=76
(7 rows)

alter table coleman add column noise int4; -- no more index only scans
-- Index scan:
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
bar_fk | created_at | noise
--------+-----------------------------------+-------
1 | Wed Jan 04 15:38:31.4592 1995 PST |
2 | Fri Jan 06 18:28:15.9456 1995 PST |
3 | Sun Jan 08 09:06:27.8496 1995 PST |
1 | Sun Jan 08 20:26:58.7616 1995 PST |
3 | Mon Jan 09 15:07:25.1328 1995 PST |
3 | Mon Jan 09 21:07:17.3568 1995 PST |
1 | Tue Jan 10 15:17:59.136 1995 PST |
3 | Wed Jan 11 12:07:41.8944 1995 PST |
1 | Thu Jan 12 17:31:03.8784 1995 PST |
3 | Sat Jan 14 02:17:05.6256 1995 PST |
3 | Sat Jan 14 20:26:57.552 1995 PST |
1 | Thu Jan 19 02:48:00.5472 1995 PST |
1 | Fri Jan 20 00:58:32.592 1995 PST |
1 | Sat Jan 21 07:22:49.6416 1995 PST |
3 | Sun Jan 22 07:42:59.9328 1995 PST |
3 | Sun Jan 22 18:39:03.168 1995 PST |
2 | Wed Jan 25 06:04:53.8464 1995 PST |
1 | Wed Jan 25 17:15:11.9232 1995 PST |
3 | Fri Jan 27 16:32:48.9984 1995 PST |
2 | Sat Jan 28 10:05:32.496 1995 PST |
1 | Sat Jan 28 11:33:58.6656 1995 PST |
3 | Sat Jan 28 22:29:59.8272 1995 PST |
3 | Sun Jan 29 01:44:30.2208 1995 PST |
3 | Mon Jan 30 08:10:22.224 1995 PST |
3 | Tue Jan 31 10:25:15.6576 1995 PST |
3 | Thu Feb 02 15:07:34.2912 1995 PST |
2 | Thu Feb 02 21:44:04.6176 1995 PST |
3 | Sat Feb 04 11:36:36.3456 1995 PST |
3 | Sun Feb 05 02:13:48.4608 1995 PST |
3 | Sun Feb 05 15:29:52.7136 1995 PST |
2 | Mon Feb 06 13:54:01.4112 1995 PST |
3 | Fri Feb 10 15:09:38.5344 1995 PST |
1 | Sat Feb 11 12:26:43.7568 1995 PST |
1 | Sun Feb 12 18:59:15.2736 1995 PST |
3 | Tue Feb 14 06:56:52.1088 1995 PST |
2 | Sun Feb 19 11:52:44.8896 1995 PST |
3 | Sun Feb 19 22:52:40.5408 1995 PST |
1 | Tue Feb 21 05:01:27.2352 1995 PST |
2 | Wed Feb 22 00:12:02.8224 1995 PST |
3 | Wed Feb 22 21:44:54.2112 1995 PST |
1 | Wed Feb 22 23:27:39.3696 1995 PST |
2 | Thu Feb 23 03:20:10.2048 1995 PST |
1 | Thu Feb 23 08:41:58.8768 1995 PST |
2 | Thu Feb 23 18:51:52.9056 1995 PST |
2 | Fri Feb 24 05:17:42.864 1995 PST |
1 | Fri Feb 24 23:15:51.2352 1995 PST |
3 | Sat Feb 25 01:49:57.504 1995 PST |
2 | Mon Feb 27 04:20:08.16 1995 PST |
3 | Mon Feb 27 15:10:04.4544 1995 PST |
2 | Tue Feb 28 22:13:55.1712 1995 PST |
(50 rows)

-- 125 buffer hits for patch, master does 16713 hits!:
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Limit (actual rows=50 loops=1)
Buffers: shared hit=16713
-> Index Scan using index_coleman_on_created_and_at_bar_fk on coleman (actual rows=50 loops=1)
Filter: (bar_fk = ANY ('{1,2,3}'::integer[]))
Rows Removed by Filter: 16597
Buffers: shared hit=16713
(6 rows)

61 changes: 48 additions & 13 deletions src/test/regress/expected/create_index.out
Expand Up @@ -1910,7 +1910,7 @@ SELECT count(*) FROM dupindexcols
(1 row)

--
-- Check ordering of =ANY indexqual results (bug in 9.2.0)
-- Check that index scans with =ANY indexquals return rows in index order
--
explain (costs off)
SELECT unique1 FROM tenk1
Expand All @@ -1936,12 +1936,11 @@ explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand;
QUERY PLAN
-------------------------------------------------------
QUERY PLAN
--------------------------------------------------------------------------------
Index Only Scan using tenk1_thous_tenthous on tenk1
Index Cond: (thousand < 2)
Filter: (tenthous = ANY ('{1001,3000}'::integer[]))
(3 rows)
Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
(2 rows)

SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
Expand All @@ -1952,18 +1951,35 @@ ORDER BY thousand;
1 | 1001
(2 rows)

explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;
QUERY PLAN
--------------------------------------------------------------------------------
Index Only Scan Backward using tenk1_thous_tenthous on tenk1
Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
(2 rows)

SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;
thousand | tenthous
----------+----------
1 | 1001
0 | 3000
(2 rows)

SET enable_indexonlyscan = OFF;
explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand;
QUERY PLAN
--------------------------------------------------------------------------------------
Sort
Sort Key: thousand
-> Index Scan using tenk1_thous_tenthous on tenk1
Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
(4 rows)
QUERY PLAN
--------------------------------------------------------------------------------
Index Scan using tenk1_thous_tenthous on tenk1
Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
(2 rows)

SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
Expand All @@ -1974,6 +1990,25 @@ ORDER BY thousand;
1 | 1001
(2 rows)

explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;
QUERY PLAN
--------------------------------------------------------------------------------
Index Scan Backward using tenk1_thous_tenthous on tenk1
Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
(2 rows)

SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;
thousand | tenthous
----------+----------
1 | 1001
0 | 3000
(2 rows)

RESET enable_indexonlyscan;
--
-- Check elimination of constant-NULL subexpressions
Expand Down
8,094 changes: 8,094 additions & 0 deletions src/test/regress/expected/dynamic_saop_advancement.out

Large diffs are not rendered by default.

7,913 changes: 7,913 additions & 0 deletions src/test/regress/expected/dynamic_saop_advancement_master.out

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions src/test/regress/expected/join.out
Expand Up @@ -8603,10 +8603,9 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]);
Merge Cond: (j1.id1 = j2.id1)
Join Filter: (j2.id2 = j1.id2)
-> Index Scan using j1_id1_idx on j1
-> Index Only Scan using j2_pkey on j2
-> Index Scan using j2_id1_idx on j2
Index Cond: (id1 >= ANY ('{1,5}'::integer[]))
Filter: ((id1 % 1000) = 1)
(7 rows)
(6 rows)

select * from j1
inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
Expand Down
140 changes: 140 additions & 0 deletions src/test/regress/expected/small_dynamic_saop_advancement.out
@@ -0,0 +1,140 @@
--set enable_bitmapscan to off;
--set enable_indexonlyscan to off;
--set enable_indexscan to off;
set enable_seqscan=off;
set log_btree_verbosity=2;
--set client_min_messages=debug1;
-- Index-only scan:
set enable_bitmapscan to off;
set enable_indexonlyscan to on;
set enable_indexscan to off;
-- Minimal backwards scan confusion test case:
select * from nulls_test where a in (183,307) order by a desc nulls last, b desc;
a | b
-----+---
307 | 1
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
183 | 1
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
(28 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from nulls_test where a in (183,307) order by a desc nulls last, b desc;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan Backward using nulls_test_idx on nulls_test (cost=10000000000.28..10000000002.77 rows=28 width=8) (actual rows=28 loops=1)
Index Cond: (a = ANY ('{183,307}'::integer[]))
Heap Fetches: 0
Buffers: shared hit=5
(4 rows)

-- Original NYC backwards scan confusion test case from big tests:
select * from nulls_test where a in (1,2,350,359,360) and b in (-1,-2,1) order by a desc nulls last, b desc;
a | b
-----+---
360 | 1
359 | 1
350 | 1
2 | 1
1 | 1
(5 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from nulls_test where a in (1,2,350,359,360) and b in (-1,-2,1) order by a desc nulls last, b desc; -- 4 or 5 (depending on if you count the VM or not) buffer accesses
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan Backward using nulls_test_idx on nulls_test (cost=10000000000.28..10000000002.39 rows=5 width=8) (actual rows=5 loops=1)
Index Cond: ((a = ANY ('{1,2,350,359,360}'::integer[])) AND (b = ANY ('{-1,-2,1}'::integer[])))
Heap Fetches: 0
Buffers: shared hit=5
(4 rows)

select * from multi_test where a in (182, 183, 184) and b in (1,2) order by a desc, b desc;
a | b
-----+---
184 | 1
183 | 1
182 | 1
(3 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from multi_test where a in (182, 183, 184) and b in (1,2) order by a desc, b desc;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan Backward using multi_test_idx on multi_test (cost=10000000000.28..10000000002.34 rows=3 width=8) (actual rows=3 loops=1)
Index Cond: ((a = ANY ('{182,183,184}'::integer[])) AND (b = ANY ('{1,2}'::integer[])))
Heap Fetches: 0
Buffers: shared hit=4
(4 rows)

-- Same again, but forwards scan -- this one currently gets 6 hits total,
-- which is kinda weird because high key (184,-inf) is considered ahead of scan
-- keys, whereas first non-pivot tuple on sibling page (184,*) is considered
-- before scan keys:
select * from nulls_test where a in (183,307);
a | b
-----+---
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 0
183 | 1
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 0
307 | 1
(28 rows)

EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from nulls_test where a in (183,307);
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using nulls_test_idx on nulls_test (cost=10000000000.28..10000000002.77 rows=28 width=8) (actual rows=28 loops=1)
Index Cond: (a = ANY ('{183,307}'::integer[]))
Heap Fetches: 0
Buffers: shared hit=6
(4 rows)

140 changes: 140 additions & 0 deletions src/test/regress/sql/benchmark_dynamic_saop_advancement.sql
@@ -0,0 +1,140 @@
-- Session setup for the SAOP-advancement benchmark script.
-- NOTE(review): this lives under src/test/regress/sql/; if it is ever wired
-- into pg_regress, every added comment is echoed into the expected .out file,
-- which would then need regenerating.
-- Settings below approximate a well-provisioned host and disable seqscan so
-- the queries exercise (index-only) index scans.
set work_mem='100MB';
set effective_io_concurrency=100;
set effective_cache_size='24GB';
set maintenance_io_concurrency=100;
set random_page_cost=2.0;
set track_io_timing to off;
set enable_seqscan to off;
set client_min_messages=error;
set log_btree_verbosity=1;
-- ^ log_btree_verbosity is a GUC added by the patch under test; not in stock PostgreSQL.
set vacuum_freeze_min_age = 0;
create extension if not exists pageinspect; -- just to have it
reset client_min_messages;

---------------------------------------------------
-- ORDER BY column comes first, SAOPs after that --
---------------------------------------------------
set client_min_messages=error;
drop table if exists docs_testcase;
reset client_min_messages;
select setseed(0.12345); -- Need deterministic test case
-- 100k-row unlogged table; the index below leads with the ORDER BY column
-- (sent_at) followed by the two columns used in =ANY quals, so the scan can
-- return rows in the required order without a sort.
create unlogged table docs_testcase
(
id serial,
type text default 'pdf' not null,
status text not null,
sender_reference text not null,
sent_at timestamptz,
created_at timestamptz default '2000-01-01' not null
);
create index mini_idx on docs_testcase using btree(sent_at desc NULLS last, sender_reference, status);
-- Random but seed-deterministic data: 3 types, 3 statuses, ~4000 distinct
-- sender_reference values, sent_at spread over the two years before 2000-01-01.
insert into docs_testcase(type, status, sender_reference, sent_at)
select
('{pdf,doc,raw}'::text[]) [ceil(random() * 3)],
('{sent,draft,suspended}'::text[]) [ceil(random() * 3)],
('{Custom,Client}'::text[]) [ceil(random() * 2)] || '/' || floor(random() * 2000),
('2000-01-01'::timestamptz - interval '2 years' * random())::timestamptz
from
generate_series(1, 100000) g;
vacuum analyze docs_testcase;

-- Index scan:
-- Force a plain index scan on mini_idx (bitmap/IOS disabled, sort disabled so
-- the ORDER BY must be satisfied by index order).
set enable_bitmapscan to off;
set enable_indexonlyscan to off;
set enable_indexscan to on;
set enable_sort to off;

-- Two SAOP filter columns, ordered by the index's leading column.
-- Query is run twice: once for the result set, once under EXPLAIN ANALYZE
-- to capture buffer-hit counts.
select * from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select * from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;

-- Index-only scan:
-- Same query shape as above, but selecting only indexed columns and forcing
-- an index-only scan.  enable_sort remains off from the previous section.
set enable_bitmapscan to off;
set enable_indexonlyscan to on;
set enable_indexscan to off;

select sent_at, status, sender_reference from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select sent_at, status, sender_reference from docs_testcase
where
status in ('draft', 'sent') and
sender_reference in ('Custom/1175', 'Client/362', 'Custom/280')
order by
sent_at desc NULLS last
limit 20;

-------------------------------
-- James Coleman's test case --
-------------------------------

-- Per https://www.postgresql.org/message-id/flat/CAAaqYe-SsHgXKXPpjn7WCTUnB_RQSxXjpSaJd32aA%3DRquv0AgQ%40mail.gmail.com,
-- though I'm going to use my own index definition for this

set client_min_messages=error;
drop table if exists coleman;
reset client_min_messages;
select setseed(0.12345); -- Need deterministic test case
create unlogged table coleman(
bar_fk integer,
created_at timestamptz
);

-- Original index (commented out):
-- create index index_coleman_on_bar_fk_and_created_at on coleman(bar_fk, created_at);

-- my preferred index:
-- NOTE(review): the name reads "created_and_at_bar_fk" -- looks like a typo for
-- "created_at_and_bar_fk".  Left as-is: the expected-output fixture in this
-- patch references the index by this exact name, so renaming would desync it.
create index index_coleman_on_created_and_at_bar_fk on coleman(created_at, bar_fk);

-- 500k rows; bar_fk cycles 0..999, so an IN-list of 3 values matches ~0.3%
-- of the table.  created_at is uniform over the 5 years before 2000-01-01.
insert into coleman(bar_fk, created_at)
select i % 1000, '2000-01-01'::timestamptz -(random() * '5 years'::interval)
from generate_series(1, 500000) t(i);

VACUUM (freeze,analyze) coleman;

-- Index-only scan:
-- NOTE(review): relies on GUC state carried over from the docs_testcase
-- section (enable_indexonlyscan=on, enable_indexscan=off, bitmapscan off).
-- Both index columns are selected, so an index-only scan is possible here.
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
-- 76 buffer hits total for parity with master:
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;

-- Adding a column not covered by the index defeats index-only scans, so the
-- same query must now use a plain index scan with heap fetches.
alter table coleman add column noise int4; -- no more index only scans

-- Index scan:
-- NOTE(review): enable_indexscan is still OFF at this point (set off in the
-- docs_testcase index-only-scan section and never re-enabled), so the plain
-- index scan here is only chosen via the planner's disabled-cost fallback
-- (seqscan and bitmapscan are disabled too).  Probably worth an explicit
-- "set enable_indexscan to on;" -- confirm against the captured plans before
-- changing, since the buffer-hit comments below were measured as-is.
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
-- 125 buffer hits for patch, master does 16713 hits!:
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF, COSTS OFF) -- need "costs off" here
select *
from coleman
where bar_fk in (1, 2, 3)
order by created_at
limit 50;
20 changes: 19 additions & 1 deletion src/test/regress/sql/create_index.sql
Expand Up @@ -753,7 +753,7 @@ SELECT count(*) FROM dupindexcols
WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX';

--
-- Check ordering of =ANY indexqual results (bug in 9.2.0)
-- Check that index scans with =ANY indexquals return rows in index order
--

explain (costs off)
Expand All @@ -774,6 +774,15 @@ SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand;

explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;

SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;

SET enable_indexonlyscan = OFF;

explain (costs off)
Expand All @@ -785,6 +794,15 @@ SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand;

explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;

SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand DESC, tenthous DESC;

RESET enable_indexonlyscan;

--
Expand Down
3,245 changes: 3,245 additions & 0 deletions src/test/regress/sql/dynamic_saop_advancement.sql

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/test/regress/sql/dynamic_saop_advancement_master.sql
33 changes: 33 additions & 0 deletions src/test/regress/sql/small_dynamic_saop_advancement.sql
@@ -0,0 +1,33 @@
-- Small, focused SAOP-advancement test cases against pre-existing tables
-- nulls_test and multi_test (created elsewhere -- presumably by a companion
-- setup script; TODO confirm).
-- NOTE(review): the matching expected file
-- src/test/regress/expected/small_dynamic_saop_advancement.out echoes this
-- input verbatim (and embeds raw costs/buffer counts, which are not stable
-- across platforms); it must be regenerated after any edit here, and the
-- EXPLAINs may want COSTS OFF for stability.
--set enable_bitmapscan to off;
--set enable_indexonlyscan to off;
--set enable_indexscan to off;
set enable_seqscan=off;
set log_btree_verbosity=2;
-- ^ log_btree_verbosity is a GUC added by the patch under test.
--set client_min_messages=debug1;

-- Index-only scan:
set enable_bitmapscan to off;
set enable_indexonlyscan to on;
set enable_indexscan to off;

-- Minimal backwards scan confusion test case:
select * from nulls_test where a in (183,307) order by a desc nulls last, b desc;
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from nulls_test where a in (183,307) order by a desc nulls last, b desc;

-- Original NYC backwards scan confusion test case from big tests:
select * from nulls_test where a in (1,2,350,359,360) and b in (-1,-2,1) order by a desc nulls last, b desc;
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from nulls_test where a in (1,2,350,359,360) and b in (-1,-2,1) order by a desc nulls last, b desc; -- 4 or 5 (depending on if you count the VM or not) buffer accesses

-- Two-column SAOP, backwards scan:
select * from multi_test where a in (182, 183, 184) and b in (1,2) order by a desc, b desc;
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from multi_test where a in (182, 183, 184) and b in (1,2) order by a desc, b desc;

-- Same again, but forwards scan -- this one currently gets 6 hits total,
-- which is kinda weird because high key (184,-inf) is considered ahead of scan
-- keys, whereas first non-pivot tuple on sibling page (184,*) is considered
-- before scan keys:
select * from nulls_test where a in (183,307);
EXPLAIN (ANALYZE, BUFFERS, TIMING OFF, SUMMARY OFF)
select * from nulls_test where a in (183,307);