Skip to content

Commit

Permalink
Reduce heap operations for range tombstone keys in iterator (#10877)
Browse files Browse the repository at this point in the history
Summary:
Currently, in MergingIterator, whenever we move between the start key and the end key of a range tombstone, we pop one end from the heap and push the other end into the heap. This incurs extra downheap and upheap cost. In the likely case that a range tombstone iterator emits keys that are relatively close to each other, these keys should occupy similar positions in the ordering of all keys in the heap. This can happen when there is a burst of consecutive range tombstones and most of the keys they cover have already been dropped. This PR uses `replace_top()` when inserting new range tombstone keys, which is more efficient in these common cases.

Pull Request resolved: facebook/rocksdb#10877

Test Plan:
- existing UT
- ran all flavors of stress test through sandcastle
- benchmark:
```

TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone ./db_bench --benchmarks=fillseq,levelstats --writes_per_range_tombstone=1 --max_num_range_tombstones=1000000 --range_tombstone_width=2 --num=100000000 --writes=800000 --max_bytes_for_level_base=4194304 --disable_auto_compactions --write_buffer_size=33554432 --key_size=64

Level Files Size(MB)
--------------------
  0        8      152
  1        0        0
  2        0        0
  3        0        0
  4        0        0
  5        0        0
  6        0        0

TEST_TMPDIR=/tmp/rocksdb-rangedel-test-all-tombstone/ ./db_bench --benchmarks=readseq[-W1][-X5],levelstats --use_existing_db=true --cache_size=3221225472 --num=100000000 --reads=1000000 --disable_auto_compactions=true --avoid_flush_during_recovery=true

readseq [AVG    5 runs] : 1432116 (± 59664) ops/sec;  224.0 (± 9.3) MB/sec
readseq [MEDIAN 5 runs] : 1454886 ops/sec;  227.5 MB/sec

readseq [AVG    5 runs] : 1944425 (± 29521) ops/sec;  304.1 (± 4.6) MB/sec
readseq [MEDIAN 5 runs] : 1959430 ops/sec;  306.5 MB/sec
```

Reviewed By: ajkr

Differential Revision: D40710936

Pulled By: cbi42

fbshipit-source-id: cb782fb9cdcd26c0c3eb9443215a4ef4d2f79022
  • Loading branch information
cbi42 authored and ayulas committed Feb 26, 2023
1 parent b485d47 commit 202367d
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 14 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ Note: Paired bloom filter is recommended to use when the number of bits per key


# Rocksdb Change Log
## Unreleased
### Bug Fixes
* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).

## 7.7.4 (10/28/2022)
### Bug Fixes
* Fixed a case of calling malloc_usable_size on result of operator new[].
Expand Down
40 changes: 26 additions & 14 deletions table/merging_iterator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ class MergingIterator : public InternalIterator {
// Add range_tombstone_iters_[level] into min heap.
// Updates active_ if the end key of a range tombstone is inserted.
// @param start_key specifies which end point of the range tombstone to add.
void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true) {
void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
bool replace_top = false) {
assert(!range_tombstone_iters_.empty() &&
range_tombstone_iters_[level]->Valid());
if (start_key) {
Expand All @@ -211,13 +212,18 @@ class MergingIterator : public InternalIterator {
pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
active_.insert(level);
}
minHeap_.push(&pinned_heap_item_[level]);
if (replace_top) {
minHeap_.replace_top(&pinned_heap_item_[level]);
} else {
minHeap_.push(&pinned_heap_item_[level]);
}
}

// Add range_tombstone_iters_[level] into max heap.
// Updates active_ if the start key of a range tombstone is inserted.
// @param end_key specifies which end point of the range tombstone to add.
void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true) {
void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true,
bool replace_top = false) {
assert(!range_tombstone_iters_.empty() &&
range_tombstone_iters_[level]->Valid());
if (end_key) {
Expand All @@ -231,7 +237,11 @@ class MergingIterator : public InternalIterator {
pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
active_.insert(level);
}
maxHeap_->push(&pinned_heap_item_[level]);
if (replace_top) {
maxHeap_->replace_top(&pinned_heap_item_[level]);
} else {
maxHeap_->push(&pinned_heap_item_[level]);
}
}

// Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START
Expand All @@ -241,10 +251,9 @@ class MergingIterator : public InternalIterator {
void PopDeleteRangeStart() {
while (!minHeap_.empty() &&
minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
auto level = minHeap_.top()->level;
minHeap_.pop();
// insert end key of this range tombstone and updates active_
InsertRangeTombstoneToMinHeap(level, false /* start_key */);
InsertRangeTombstoneToMinHeap(
minHeap_.top()->level, false /* start_key */, true /* replace_top */);
}
}

Expand All @@ -255,10 +264,9 @@ class MergingIterator : public InternalIterator {
void PopDeleteRangeEnd() {
while (!maxHeap_->empty() &&
maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) {
auto level = maxHeap_->top()->level;
maxHeap_->pop();
// insert start key of this range tombstone and updates active_
InsertRangeTombstoneToMaxHeap(level, false /* end_key */);
InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
true /* replace_top */);
}
}

Expand Down Expand Up @@ -761,13 +769,15 @@ bool MergingIterator::SkipNextDeleted() {
// - range deletion end key
auto current = minHeap_.top();
if (current->type == HeapItem::DELETE_RANGE_END) {
minHeap_.pop();
active_.erase(current->level);
assert(range_tombstone_iters_[current->level] &&
range_tombstone_iters_[current->level]->Valid());
range_tombstone_iters_[current->level]->Next();
if (range_tombstone_iters_[current->level]->Valid()) {
InsertRangeTombstoneToMinHeap(current->level);
InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
true /* replace_top */);
} else {
minHeap_.pop();
}
return true /* current key deleted */;
}
Expand Down Expand Up @@ -977,13 +987,15 @@ bool MergingIterator::SkipPrevDeleted() {
// - range deletion start key
auto current = maxHeap_->top();
if (current->type == HeapItem::DELETE_RANGE_START) {
maxHeap_->pop();
active_.erase(current->level);
assert(range_tombstone_iters_[current->level] &&
range_tombstone_iters_[current->level]->Valid());
range_tombstone_iters_[current->level]->Prev();
if (range_tombstone_iters_[current->level]->Valid()) {
InsertRangeTombstoneToMaxHeap(current->level);
InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */,
true /* replace_top */);
} else {
maxHeap_->pop();
}
return true /* current key deleted */;
}
Expand Down

0 comments on commit 202367d

Please sign in to comment.