-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
nbtsearch.c
2546 lines (2298 loc) · 76.8 KB
/
nbtsearch.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*-------------------------------------------------------------------------
*
* nbtsearch.c
* Search code for postgres btrees.
*
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/nbtree/nbtsearch.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
/*
* Prototypes for routines that are private to this file (every one of them
* is declared static).
*/
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int _bt_binsrch_posting(BTScanInsert key, Page page,
OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum, bool firstPage);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, ItemPointer heapTid,
IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum,
ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
/*
 *	_bt_drop_lock_and_maybe_pin()
 *
 * Release the lock on sp->buf and, whenever that is safe, give up the pin
 * as well.  Letting go of the pin keeps vacuum from stalling in a blocked
 * state while it tries to read a page that a cursor is sitting on.
 *
 * The pin is released only when all of the following hold: the scan uses
 * an MVCC snapshot, RelationNeedsWAL() is true for the index, and the scan
 * did not ask for index tuples (xs_want_itup is false).  When the pin is
 * released, sp->buf is reset to InvalidBuffer.
 *
 * See nbtree/README section on making concurrent TID recycling safe.
 */
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
	/* The lock goes away unconditionally; only the pin is in question */
	_bt_unlockbuf(scan->indexRelation, sp->buf);

	/* Failing any one of these tests means the pin has to be kept */
	if (!IsMVCCSnapshot(scan->xs_snapshot))
		return;
	if (!RelationNeedsWAL(scan->indexRelation))
		return;
	if (scan->xs_want_itup)
		return;

	ReleaseBuffer(sp->buf);
	sp->buf = InvalidBuffer;
}
/*
 *	_bt_search() -- Descend the tree looking for a particular scankey, or
 *		more precisely for the first leaf page it could be on.
 *
 * 'key' is an insertion-type scankey (see nbtree/README).  It is allowed to
 * omit the rightmost column(s) of the index.
 *
 * The return value is a stack of parent-page pointers; the leaf level/page
 * gets no entry of its own.  *bufP is set to the leaf-page buffer, which is
 * locked and pinned.  No locks are held on any of the parent pages.
 *
 * The leaf buffer is locked according to 'access':
 *
 *	BT_READ:  an empty index results in *bufP = InvalidBuffer and a NULL
 *			  return value.
 *	BT_WRITE: an empty root page may be created and returned, and any
 *			  incomplete splits met during the descent are finished.
 *
 * heaprel is mandatory for access = BT_WRITE callers, because a new root
 * page might have to be allocated on their behalf -- see _bt_allocbuf.
 */
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
		   int access)
{
	BTStack		parents = NULL; /* parent stack accumulated so far */
	int			lockmode = BT_READ; /* lock type used for the current page */

	Assert(access == BT_READ || access == BT_WRITE);
	/* _bt_allocbuf is reachable only in write mode, and it needs heaprel */
	Assert(access == BT_READ || heaprel != NULL);

	/* The descent begins at the root page */
	*bufP = _bt_getroot(rel, heaprel, access);

	/* With access = BT_READ an empty index has no root page at all */
	if (!BufferIsValid(*bufP))
		return NULL;

	/* Each pass through this loop moves down one level of the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber pivotoff;
		IndexTuple	pivot;
		BlockNumber downlink;
		BTStack		entry;

		/*
		 * The page we hold may have split between the moment its downlink
		 * was read (from the parent, or from the metapage) and now.  Move
		 * right to the new sibling when that is what happened.
		 *
		 * Write-mode searches let _bt_moveright finish incomplete splits as
		 * it goes.  Only the leaf page about to receive the insertion
		 * strictly needs that (internal pages with incomplete splits are
		 * also handled in _bt_getstackbuf), but finishing splits of
		 * internal pages here is a cheap opportunity.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, access == BT_WRITE,
							  parents, lockmode);

		page = BufferGetPage(*bufP);
		opaque = BTPageGetOpaque(page);

		/* Reaching the leaf level ends the descent */
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Pick the pivot tuple whose downlink leads to the child page that
		 * we are going to visit next.
		 */
		pivotoff = _bt_binsrch(rel, key, *bufP);
		pivot = (IndexTuple) PageGetItem(page, PageGetItemId(page, pivotoff));
		Assert(BTreeTupleIsPivot(pivot) || !key->heapkeyspace);
		downlink = BTreeTupleGetDownLink(pivot);

		/*
		 * Remember where the chosen pivot lives in a fresh stack entry for
		 * this page/level.  Should the caller split a page one level down,
		 * the new pivot tuple/downlink usually goes immediately after the
		 * location recorded here.
		 */
		entry = (BTStack) palloc(sizeof(*entry));
		entry->bts_blkno = BufferGetBlockNumber(*bufP);
		entry->bts_offset = pivotoff;
		entry->bts_parent = parents;

		/*
		 * Level 1 is the lowest non-leaf level, so its children must be
		 * leaves.  A write-mode search can therefore take the write lock
		 * directly on the next page.
		 */
		if (access == BT_WRITE && opaque->btpo_level == 1)
			lockmode = BT_WRITE;

		/* Trade the lock on this page for one on the child */
		*bufP = _bt_relandgetbuf(rel, *bufP, downlink, lockmode);

		/* The entry just built becomes the top of the stack */
		parents = entry;
	}

	/* Done, unless a write lock was requested but never obtained */
	if (access != BT_WRITE || lockmode != BT_READ)
		return parents;

	/*
	 * The leaf is still only read-locked.  This should only happen when the
	 * root page is itself a leaf (the only page in the index other than the
	 * metapage).  Swap the read lock for a write lock.
	 */
	_bt_unlockbuf(rel, *bufP);
	_bt_lockbuf(rel, *bufP, BT_WRITE);

	/*
	 * The leaf may have split in the window between dropping the read lock
	 * and obtaining the write lock, so move right again if necessary.
	 */
	*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, parents, BT_WRITE);

	return parents;
}
/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * A page reached by following a pointer may have changed in the meantime.
 * When that happens the page is guaranteed to have "split right": any data
 * that was on the page originally is now either still on it or strictly to
 * its right.
 *
 * The decision to move right is made by looking at the page's high key.
 * If the high key is strictly less than the scankey (or <= the scankey in
 * the key.nextkey=true case), the wrong link was followed and we have to
 * move right.
 *
 * The insertion-type scankey may omit the rightmost column(s) of the
 * index (see nbtree/README).
 *
 * With key.nextkey false (the usual case) the target is the first item
 * >= key; with key.nextkey true it is the first item strictly greater
 * than key.
 *
 * When forupdate is true, incomplete splits met along the way are finished.
 * That is required when locking a target page for insertion, since
 * inserting on a page is not allowed before its split is completed.
 * 'heaprel' and 'stack' are only used if forupdate is true.
 *
 * On entry the buffer is pinned and locked in the mode given by 'access'.
 * Whenever we move right, that buffer and lock are released and the same
 * are acquired on the right sibling.  Returns the buffer we stopped at.
 */
Buffer
_bt_moveright(Relation rel,
			  Relation heaprel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access)
{
	Page		curpage;
	BTPageOpaque curopaque;
	int32		movecmp;

	Assert(!forupdate || heaprel != NULL);

	/*
	 * nextkey = false (normal case): a scan key that is > the page's high
	 * key means the page has split, so move right.  (pg_upgrade'd
	 * !heapkeyspace indexes could have some duplicates to the right as well
	 * as the left, but that is only ever dealt with on the leaf level,
	 * after _bt_search has found an initial leaf page.)
	 *
	 * nextkey = true: move right when the scan key is >= the high key.
	 * (key.scantid cannot be set in this case.)
	 *
	 * More than one split may have happened, so keep going as far as
	 * needed.  A link that led to a dead page also forces a move right.
	 */
	if (key->nextkey)
		movecmp = 0;
	else
		movecmp = 1;

	for (;;)
	{
		curpage = BufferGetPage(buf);
		curopaque = BTPageGetOpaque(curpage);

		/* There is nowhere further right to go */
		if (P_RIGHTMOST(curopaque))
			break;

		if (forupdate && P_INCOMPLETE_SPLIT(curopaque))
		{
			/* Finish the incomplete split found on this page */
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* A read lock must be upgraded before touching the page */
			if (access == BT_READ)
			{
				_bt_unlockbuf(rel, buf);
				_bt_lockbuf(rel, buf, BT_WRITE);
			}

			/* Test the flag again now that the lock may have been swapped */
			if (P_INCOMPLETE_SPLIT(curopaque))
				_bt_finish_split(rel, heaprel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* Take the lock again in the caller's mode, then re-check */
			buf = _bt_getbuf(rel, blkno, access);
		}
		else if (P_IGNORE(curopaque) ||
				 _bt_compare(rel, key, curpage, P_HIKEY) >= movecmp)
		{
			/* Step one page to the right */
			buf = _bt_relandgetbuf(rel, buf, curopaque->btpo_next, access);
		}
		else
		{
			/* High key bounds the scan key: this is the right page */
			break;
		}
	}

	if (P_IGNORE(curopaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
/*
 *	_bt_binsrch() -- Binary search for a key within a single page.
 *
 * Internal (non-leaf) page: returns the OffsetNumber of the last key
 * < given scankey, or of the last key <= given scankey if nextkey is true.
 * _bt_compare treats the first data key of such a page as minus infinity,
 * so at least one key is < scankey and the result always points at one of
 * the keys on the page.
 *
 * Leaf page: returns the final result of the initial positioning process
 * that began with _bt_first's call to _bt_search.  What comes back is a
 * non-pivot tuple offset, which may be past the last non-pivot slot or (in
 * the case of a backward scan) before the first slot.
 *
 * Walking right is not this routine's responsibility; it only examines the
 * given page.  It has no lock or refcount side effects on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
			BTScanInsert key,
			Buffer buf)
{
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	OffsetNumber lo;
	OffsetNumber hi;
	int32		cmpval;

	/* nextkey semantics combined with a scantid would make no sense */
	Assert(!key->nextkey || key->scantid == NULL);
	/* leaf searches with a scantid belong to _bt_binsrch_insert() */
	Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

	lo = P_FIRSTDATAKEY(opaque);
	hi = PageGetMaxOffsetNumber(page);

	/*
	 * No data keys at all: hand back the first available slot.  The page
	 * may be truly empty or may hold just a high key (possible after
	 * vacuuming).  Internal pages never get here, since an internal page
	 * must have at least one child.
	 */
	if (unlikely(hi < lo))
		return lo;

	/*
	 * Search for the first key on the page >= scan key, or the first key
	 * > scan key when nextkey is true.
	 *
	 * Invariant for nextkey=false (cmpval=1): every slot before 'lo' is
	 * < scan key, every slot at or after 'hi' is >= scan key.
	 *
	 * Invariant for nextkey=true (cmpval=0): every slot before 'lo' is
	 * <= scan key, every slot at or after 'hi' is > scan key.
	 *
	 * The loop ends once lo == hi.
	 */
	hi++;						/* makes the invariant true for 'hi' */
	if (key->nextkey)
		cmpval = 0;
	else
		cmpval = 1;

	while (lo < hi)
	{
		/* lo <= mid < hi, hence mid always names a real slot */
		OffsetNumber mid = lo + ((hi - lo) / 2);

		if (_bt_compare(rel, key, page, mid) < cmpval)
			hi = mid;
		else
			lo = mid + 1;
	}

	/*
	 * Now lo == hi.
	 *
	 * Non-leaf page: return the last key < scan key (resp. <= scan key).
	 * One must exist if _bt_compare() is playing by the rules.
	 *
	 * _bt_compare() seldom sees exactly-matching pivot tuples, since a
	 * truncated -inf heap TID is usually enough to prevent that.  Even
	 * omitted scan key entries are treated as > truncated attributes.
	 *
	 * During backward scans, however, _bt_compare() interprets omitted scan
	 * key attributes as == the corresponding truncated -inf attributes,
	 * which works just like < would here.  Under this scheme < strategy
	 * backward scans always descend directly to the correct leaf page and
	 * never incur an "extra" leaf page access for a scan key sharing a
	 * prefix of values with some pivot tuple's untruncated prefix.  VACUUM
	 * relies on this guarantee when it uses a leaf page high key to
	 * "re-find" a page undergoing deletion.
	 */
	if (!P_ISLEAF(opaque))
	{
		Assert(lo > P_FIRSTDATAKEY(opaque));
		return OffsetNumberPrev(lo);
	}

	/*
	 * Leaf page.  Forward scan callers get the first non-pivot tuple
	 * >= scan key (resp. > scan key), which is 'lo' as it stands.
	 *
	 * Backward scan callers want the _last_ non-pivot tuple < scan key
	 * (resp. <= scan key), since that is the first tuple the scan returns.
	 * 'lo' is the slot immediately after it (possibly maxoff + 1), so step
	 * back by one.
	 */
	if (key->backward)
		lo = OffsetNumberPrev(lo);

	return lo;
}
/*
 *
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Works like _bt_binsrch(), but can cache the binary search bounds.  It is
 * only used during insertion, and only on the leaf page that the caller
 * appears to be about to insert on.  insertstate holds that page,
 * exclusive-locked and pinned.
 *
 * The bounds fields of insertstate are filled in so that a later call can
 * pick up the low and strict high bounds of the original search.  Callers
 * using those fields directly must cope with low and/or stricthigh not
 * being on the same page (one or both exceed maxoff for the page).  A page
 * with no items (high < low) leaves the bounds invalid.
 *
 * The caller must invalidate the bounds if it modifies the page before
 * calling here a second time.  The caller must also deal with posting list
 * tuple matches; insertstate's postingoff field tells it which existing
 * heap TID will need to be replaced by a posting list split.
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
	BTScanInsert key = insertstate->itup_key;
	Page		page = BufferGetPage(insertstate->buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	OffsetNumber lo;
	OffsetNumber hi;
	OffsetNumber stricthi;

	Assert(P_ISLEAF(opaque));
	Assert(!key->nextkey);
	Assert(insertstate->postingoff == 0);

	if (insertstate->bounds_valid)
	{
		/* Resume from an earlier binary search against the same page */
		lo = insertstate->low;
		hi = insertstate->stricthigh;
	}
	else
	{
		/* No cached bounds: search the whole page */
		lo = P_FIRSTDATAKEY(opaque);
		hi = PageGetMaxOffsetNumber(page);
	}

	/* A page without keys: the first available slot is the answer */
	if (unlikely(hi < lo))
	{
		/* Nothing here that the caller could reuse */
		insertstate->low = InvalidOffsetNumber;
		insertstate->stricthigh = InvalidOffsetNumber;
		insertstate->bounds_valid = false;
		return lo;
	}

	/*
	 * Search for the first key on the page >= scan key (nextkey is always
	 * false when inserting, so only "scan key > tuple" advances 'lo').
	 *
	 * Invariant: every slot before 'lo' is < scan key, every slot at or
	 * after 'hi' is >= scan key.  'stricthi' is > scan key and is tracked
	 * to spare the caller some search effort later.
	 *
	 * The loop ends once lo == hi.
	 */
	if (!insertstate->bounds_valid)
		hi++;					/* makes the invariant true for 'hi' */
	stricthi = hi;				/* starts out strictly higher */

	while (lo < hi)
	{
		/* lo <= mid < hi, hence mid always names a real slot */
		OffsetNumber mid = lo + ((hi - lo) / 2);
		int32		cmp = _bt_compare(rel, key, page, mid);

		if (cmp > 0)
		{
			/* scan key sorts after the tuple at mid */
			lo = mid + 1;
			continue;
		}

		hi = mid;
		if (cmp < 0)
		{
			/* tuple at mid is strictly greater than scan key */
			stricthi = mid;
			continue;
		}

		/*
		 * Exact match.  If the tuple found is a posting list whose TID
		 * range overlaps with the caller's scantid, do a posting list
		 * binary search to set postingoff for the caller, who must split
		 * the posting list when postingoff is set.  This should happen
		 * infrequently.
		 */
		if (unlikely(key->scantid != NULL))
		{
			/*
			 * postingoff should never be set more than once per leaf page
			 * binary search.  That would mean duplicate table TIDs in the
			 * index, which is never okay.
			 */
			if (insertstate->postingoff != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
										 ItemPointerGetBlockNumber(key->scantid),
										 ItemPointerGetOffsetNumber(key->scantid),
										 lo, stricthi,
										 BufferGetBlockNumber(insertstate->buf),
										 RelationGetRelationName(rel))));

			insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
		}
	}

	/*
	 * On a leaf page the search yields the first key >= scan key (at least
	 * in the !nextkey case), which could be the last slot + 1.  It doubles
	 * as the lower bound of the cached search.
	 *
	 * stricthi may be the last slot + 1 too.  The caller cannot use the
	 * bounds directly then, but they remain useful if we are called again
	 * with cached bounds (cached low will be < stricthigh in that case).
	 */
	insertstate->low = lo;
	insertstate->stricthigh = stricthi;
	insertstate->bounds_valid = true;

	return lo;
}
/*----------
 *	_bt_binsrch_posting() -- posting list binary search.
 *
 * Helper routine for _bt_binsrch_insert().
 *
 * Returns the offset into the posting list at offnum where the caller's
 * scantid belongs.  Returns 0 if the tuple is not a posting list tuple,
 * and -1 if its line pointer is marked dead.
 *----------
 */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
	ItemId		lp = PageGetItemId(page, offnum);
	IndexTuple	tuple = (IndexTuple) PageGetItem(page, lp);
	int			lo;
	int			hi;

	/*
	 * A tuple that is not a posting list means the index must be corrupt
	 * (an ordinary non-pivot tuple here implies an existing tuple whose
	 * heap TID equals the inserter's new heap TID/scantid).  Check
	 * defensively that this is a posting list tuple whose range includes
	 * the caller's scantid.
	 *
	 * (contrib/amcheck's rootdescend option also depends on this, because
	 * it relocates non-pivot tuples using _bt_binsrch_insert().)
	 */
	if (!BTreeTupleIsPosting(tuple))
		return 0;

	Assert(key->heapkeyspace && key->allequalimage);

	/*
	 * A posting list tuple with its LP_DEAD bit set is reported to the
	 * _bt_binsrch_insert() caller through the sentinel value -1.  That
	 * caller can call _bt_binsrch_insert() again once it has removed the
	 * dead item.
	 */
	if (ItemIdIsDead(lp))
		return -1;

	/* 'hi' starts one past the end of the posting list (loop invariant) */
	lo = 0;
	hi = BTreeTupleGetNPosting(tuple);
	Assert(hi >= 2);

	while (lo < hi)
	{
		int			probe = lo + ((hi - lo) / 2);
		int			cmp = ItemPointerCompare(key->scantid,
											 BTreeTupleGetPostingN(tuple, probe));

		if (cmp == 0)
			return probe;		/* scantid is already in the list */

		if (cmp > 0)
			lo = probe + 1;
		else
			hi = probe;
	}

	/* No exact match; 'lo' is where scantid would go */
	return lo;
}
/*----------
* _bt_compare() -- Compare insertion-type scankey to tuple on a page.
*
* rel: index relation; supplies the tuple descriptor and the number of
* key attributes.
* key: insertion-type scankey to compare against the tuple.
* page/offnum: location of btree item to be compared to.
*
* This routine returns:
* <0 if scankey < tuple at offnum;
* 0 if scankey == tuple at offnum;
* >0 if scankey > tuple at offnum.
*
* NULLs in the keys are treated as sortable values. Therefore
* "equality" does not necessarily mean that the item should be returned
* to the caller as a matching key. Similarly, an insertion scankey
* with its scantid set is treated as equal to a posting tuple whose TID
* range overlaps with their scantid. There generally won't be a
* matching TID in the posting tuple, which caller must handle
* themselves (e.g., by splitting the posting list tuple).
*
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
* "minus infinity": this routine will always claim it is less than the
* scankey. The actual key value stored is explicitly truncated to 0
* attributes (explicitly minus infinity) with version 3+ indexes, but
* that isn't relied upon. This allows us to implement the Lehman and
* Yao convention that the first down-link pointer is before the first
* key. See backend/access/nbtree/README for details.
*----------
*/
int32
_bt_compare(Relation rel,
BTScanInsert key,
Page page,
OffsetNumber offnum)
{
TupleDesc itupdesc = RelationGetDescr(rel);
BTPageOpaque opaque = BTPageGetOpaque(page);
IndexTuple itup;
ItemPointer heapTid;
ScanKey scankey;
int ncmpkey;
int ntupatts;
int32 result;
/* Sanity checks on the tuple's attribute count and on the scankey */
Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
Assert(key->heapkeyspace || key->scantid == NULL);
/*
* Force result ">" if target item is first data item on an internal page
* --- see NOTE above.
*/
if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
return 1;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
ntupatts = BTreeTupleGetNAtts(itup, rel);
/*
* The scan key is set up with the attribute number associated with each
* term in the key. It is important that, if the index is multi-key, the
* scan contain the first k key attributes, and that they be in order. If
* you think about how multi-key ordering works, you'll understand why
* this is.
*
* We don't test for violation of this condition here, however. The
* initial setup for the index scan had better have gotten it right (see
* _bt_first).
*/
/* Only attributes present in both the tuple and the scankey are compared */
ncmpkey = Min(ntupatts, key->keysz);
Assert(key->heapkeyspace || ncmpkey == key->keysz);
Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
scankey = key->scankeys;
/* Compare attribute by attribute; the first nonzero result decides */
for (int i = 1; i <= ncmpkey; i++)
{
Datum datum;
bool isNull;
datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
{
if (isNull)
result = 0; /* NULL "=" NULL */
else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
result = -1; /* NULL "<" NOT_NULL */
else
result = 1; /* NULL ">" NOT_NULL */
}
else if (isNull) /* key is NOT_NULL and item is NULL */
{
if (scankey->sk_flags & SK_BT_NULLS_FIRST)
result = 1; /* NOT_NULL ">" NULL */
else
result = -1; /* NOT_NULL "<" NULL */
}
else
{
/*
* The sk_func needs to be passed the index value as left arg and
* the sk_argument as right arg (they might be of different
* types). Since it is convenient for callers to think of
* _bt_compare as comparing the scankey to the index item, we have
* to flip the sign of the comparison result. (Unless it's a DESC
* column, in which case we *don't* flip the sign.)
*/
result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
scankey->sk_collation,
datum,
scankey->sk_argument));
if (!(scankey->sk_flags & SK_BT_DESC))
INVERT_COMPARE_RESULT(result);
}
/* if the keys are unequal, return the difference */
if (result != 0)
return result;
/* equal on this attribute; move on to the next scankey entry */
scankey++;
}
/*
* All non-truncated attributes (other than heap TID) were found to be
* equal. Treat truncated attributes as minus infinity when scankey has a
* key attribute value that would otherwise be compared directly.
*
* Note: it doesn't matter if ntupatts includes non-key attributes;
* scankey won't, so explicitly excluding non-key attributes isn't
* necessary.
*/
if (key->keysz > ntupatts)
return 1;
/*
* Use the heap TID attribute and scantid to try to break the tie. The
* rules are the same as any other key attribute -- only the
* representation differs.
*/
heapTid = BTreeTupleGetHeapTID(itup);
if (key->scantid == NULL)
{
/*
* Forward scans have a scankey that is considered greater than a
* truncated pivot tuple if and when the scankey has equal values for
* attributes up to and including the least significant untruncated
* attribute in tuple. Even attributes that were omitted from the
* scan key are considered greater than -inf truncated attributes.
* (See _bt_binsrch for an explanation of our backward scan behavior.)
*
* For example, if an index has the minimum two attributes (single
* user key attribute, plus heap TID attribute), and a page's high key
* is ('foo', -inf), and scankey is ('foo', <omitted>), the search
* will not descend to the page to the left. The search will descend
* right instead. The truncated attribute in pivot tuple means that
* all non-pivot tuples on the page to the left are strictly < 'foo',
* so it isn't necessary to descend left. In other words, search
* doesn't have to descend left because it isn't interested in a match
* that has a heap TID value of -inf.
*
* Note: the heap TID part of the test ensures that scankey is being
* compared to a pivot tuple with one or more truncated -inf key
* attributes. The heap TID attribute is the last key attribute in
* every index, of course, but other than that it isn't special.
*/
if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
key->heapkeyspace)
return 1;
/* All provided scankey arguments found to be equal */
return 0;
}
/*
* Treat truncated heap TID as minus infinity, since scankey has a key
* attribute value (scantid) that would otherwise be compared directly
*/
Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
if (heapTid == NULL)
return 1;
/*
* Scankey must be treated as equal to a posting list tuple if its scantid
* value falls within the range of the posting list. In all other cases
* there can only be a single heap TID value, which is compared directly
* with scantid.
*/
Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
result = ItemPointerCompare(key->scantid, heapTid);
/*
* The direct comparison is final when scantid is <= the tuple's heap TID,
* or when the tuple is not a posting list tuple.
*/
if (result <= 0 || !BTreeTupleIsPosting(itup))
return result;
else
{
/* scantid is past the first TID; compare it with the maximum TID */
result = ItemPointerCompare(key->scantid,
BTreeTupleGetMaxHeapTID(itup));
if (result > 0)
return 1;
}
/* scantid falls within the posting list's TID range: report equality */
return 0;
}
/*
* _bt_first() -- Find the first item in a scan.
*
* We need to be clever about the direction of scan, the search
* conditions, and the tree ordering. We find the first item (or,
* if backwards scan, the last item) in the tree that satisfies the
* qualifications in the scan key. On success exit, the page containing
* the current index tuple is pinned but not locked, and data about
* the matching tuple(s) on the page has been loaded into so->currPos.
* scan->xs_heaptid is set to the heap TID of the current tuple, and if
* requested, scan->xs_itup points to a copy of the index tuple.
*
* If there are no matching items in the index, we return false, with no
* pins or locks held.
*
* Note that scan->keyData[], and the so->keyData[] scankey built from it,
* are both search-type scankeys (see nbtree/README for more about this).
* Within this routine, we build a temporary insertion-type scankey to use
* in locating the scan start position.
*/
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Buffer buf;
BTStack stack;
OffsetNumber offnum;
StrategyNumber strat;
BTScanInsertData inskey;
ScanKey startKeys[INDEX_MAX_KEYS];
ScanKeyData notnullkeys[INDEX_MAX_KEYS];
int keysz = 0;
int i;
bool status;
StrategyNumber strat_total;
BTScanPosItem *currItem;
BlockNumber blkno;
Assert(!BTScanPosIsValid(so->currPos));
pgstat_count_index_scan(rel);
/*
* Examine the scan keys and eliminate any redundant keys; also mark the
* keys that must be matched to continue the scan.
*/
_bt_preprocess_keys(scan);
/*
* Quit now if _bt_preprocess_keys() discovered that the scan keys can
* never be satisfied (eg, x == 1 AND x > 2).
*/
if (!so->qual_ok)
{
/* Notify any other workers that we're done with this scan key. */
_bt_parallel_done(scan);
return false;
}
/*
* For parallel scans, get the starting page from shared state. If the
* scan has not started, proceed to find the first leaf page in the usual
* way while keeping other participating processes waiting. If the scan
* has already begun, use the page number from the shared structure.
*/
if (scan->parallel_scan != NULL)
{
status = _bt_parallel_seize(scan, &blkno);
if (!status)
return false;
else if (blkno == P_NONE)
{
_bt_parallel_done(scan);
return false;
}
else if (blkno != InvalidBlockNumber)
{
if (!_bt_parallel_readpage(scan, blkno, dir))
return false;
goto readcomplete;
}
}
/*----------
* Examine the scan keys to discover where we need to start the scan.
*
* We want to identify the keys that can be used as starting boundaries;
* these are =, >, or >= keys for a forward scan or =, <, <= keys for
* a backwards scan. We can use keys for multiple attributes so long as
* the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
* a > or < boundary or find an attribute with no boundary (which can be
* thought of as the same as "> -infinity"), we can't use keys for any
* attributes to its right, because it would break our simplistic notion
* of what initial positioning strategy to use.
*
* When the scan keys include cross-type operators, _bt_preprocess_keys
* may not be able to eliminate redundant keys; in such cases we will
* arbitrarily pick a usable one for each attribute. This is correct
* but possibly not optimal behavior. (For example, with keys like
* "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
* x=5 would be more efficient.) Since the situation only arises given
* a poorly-worded query plus an incomplete opfamily, live with it.
*
* When both equality and inequality keys appear for a single attribute
* (again, only possible when cross-type operators appear), we *must*
* select one of the equality keys for the starting point, because
* _bt_checkkeys() will stop the scan as soon as an equality qual fails.
* For example, if we have keys like "x >= 4 AND x = 10" and we elect to
* start at x=4, we will fail and stop before reaching x=10. If multiple
* equality quals survive preprocessing, however, it doesn't matter which
* one we use --- by definition, they are either redundant or
* contradictory.
*
* Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
* If the index stores nulls at the end of the index we'll be starting
* from, and we have no boundary key for the column (which means the key
* we deduced NOT NULL from is an inequality key that constrains the other
* end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
* use as a boundary key. If we didn't do this, we might find ourselves
* traversing a lot of null entries at the start of the scan.
*
* In this loop, row-comparison keys are treated the same as keys on their
* first (leftmost) columns. We'll add on lower-order columns of the row
* comparison below, if possible.
*
* The selected scan keys (at most one per index column) are remembered by
* storing their addresses into the local startKeys[] array.
*----------
*/
strat_total = BTEqualStrategyNumber;
if (so->numberOfKeys > 0)
{
AttrNumber curattr;
ScanKey chosen;
ScanKey impliesNN;
ScanKey cur;
/*
* chosen is the so-far-chosen key for the current attribute, if any.
* We don't cast the decision in stone until we reach keys for the
* next attribute.
*/
curattr = 1;
chosen = NULL;
/* Also remember any scankey that implies a NOT NULL constraint */