Resurrect optimization to avoid bloom filter checks during compaction
Commit 8c4b5e4 introduced an optimization which only
calculates the max purgeable timestamp when a tombstone satisfies
the grace period.

Commit 'repair: Get rid of the gc_grace_seconds' inverted the order,
probably under the assumption that getting the grace period can be
more expensive than calculating max purgeable, as repair-mode GC
has to look up repair history data in order to calculate gc_before.

This caused a significant regression on tombstone-heavy compactions,
where most tombstones are still newer than the grace period.
A compaction which used to take 5s now takes 35s: 7x slower.

The reason is simple: the max purgeable timestamp is now calculated
for every single tombstone (once for each key), even the ones that
cannot be GC'ed yet. And each calculation has to iterate through
(i.e. check the bloom filter of) every single sstable that doesn't
participate in the compaction.

Flame graph makes it very clear that bloom filter is a heavy path
without the optimization:
    45.64%    45.64%  sstable_compact  sstable_compaction_test_g
        [.] utils::filter::bloom_filter::is_present

With its resurrection, the problem is gone.

This scenario can easily happen, e.g. after a deletion burst, with
tombstones only becoming GC'able once they reach the upper tiers of
the LSM tree.

Before this patch, a compaction can be estimated to perform this
number of filter checks:
(# of keys containing *any* tombstone) * (# of uncompacting sstable
runs[1])

[1] It's the # of *runs*, as each key tends to overlap with only one
fragment of each run.

After this patch, the estimation becomes:
(# of keys containing a GC'able tombstone) * (# of uncompacting
runs).

With repair mode for tombstone GC, the assumption that retrieval
of gc_before is more expensive than calculating max purgeable is
kept; we can revisit it later. But in the default mode, which is
the "timeout" (i.e. gc_grace_seconds) one, we still benefit from
the optimization of deferring the calculation until needed.

Cherry-picked from commit 38b226f

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Fixes scylladb#14091.

Closes scylladb#13908

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
raphaelsc committed Oct 19, 2023
1 parent eaf93b3 commit 482f1f8
Showing 6 changed files with 85 additions and 4 deletions.
7 changes: 5 additions & 2 deletions compaction/compaction.cc
@@ -166,7 +166,7 @@ std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
}

static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk, uint64_t& bloom_filter_checks) {
auto timestamp = table_s.min_memtable_timestamp();
std::optional<utils::hashed_key> hk;
for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
@@ -177,6 +177,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_
hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
}
if (sst->filter_has_key(*hk)) {
bloom_filter_checks++;
timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
}
}
@@ -448,6 +449,7 @@ class compaction {
uint64_t _start_size = 0;
uint64_t _end_size = 0;
uint64_t _estimated_partitions = 0;
uint64_t _bloom_filter_checks = 0;
db::replay_position _rp;
encoding_stats_collector _stats_collector;
bool _contains_multi_fragment_runs = false;
@@ -745,6 +747,7 @@ class compaction {
.ended_at = ended_at,
.start_size = _start_size,
.end_size = _end_size,
.bloom_filter_checks = _bloom_filter_checks,
},
};

@@ -785,7 +788,7 @@ class compaction {
};
}
return [this] (const dht::decorated_key& dk) {
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk, _bloom_filter_checks);
};
}

3 changes: 3 additions & 0 deletions compaction/compaction.hh
@@ -92,12 +92,15 @@ struct compaction_stats {
uint64_t start_size = 0;
uint64_t end_size = 0;
uint64_t validation_errors = 0;
// Bloom filter checks during max purgeable calculation
uint64_t bloom_filter_checks = 0;

compaction_stats& operator+=(const compaction_stats& r) {
ended_at = std::max(ended_at, r.ended_at);
start_size += r.start_size;
end_size += r.end_size;
validation_errors += r.validation_errors;
bloom_filter_checks += r.bloom_filter_checks;
return *this;
}
friend compaction_stats operator+(const compaction_stats& l, const compaction_stats& r) {
19 changes: 17 additions & 2 deletions mutation_compactor.hh
@@ -226,12 +226,27 @@ private:
}
}

bool satisfy_grace_period(const gc_clock::time_point& deletion_time) {
return deletion_time < get_gc_before();
}

bool can_purge_tombstone(const tombstone& t, const gc_clock::time_point deletion_time) {
if (cheap_to_get_gc_before(_schema)) {
// if retrieval of grace period is cheap, can_gc() will only be
// called for tombstones that are older than grace period, in
// order to avoid unnecessary bloom filter checks when calculating
// max purgeable timestamp.
return satisfy_grace_period(deletion_time) && can_gc(t);
}
return can_gc(t) && satisfy_grace_period(deletion_time);
}

bool can_purge_tombstone(const tombstone& t) {
return can_gc(t) && t.deletion_time < get_gc_before();
return can_purge_tombstone(t, t.deletion_time);
};

bool can_purge_tombstone(const row_tombstone& t) {
return can_gc(t.tomb()) && t.max_deletion_time() < get_gc_before();
return can_purge_tombstone(t.tomb(), t.max_deletion_time());
};

gc_clock::time_point get_gc_before() {
53 changes: 53 additions & 0 deletions test/boost/sstable_compaction_test.cc
@@ -5199,3 +5199,56 @@ SEASTAR_TEST_CASE(cleanup_incremental_compaction_test) {
BOOST_REQUIRE(sstables_closed_during_cleanup >= sstables_nr / 2);
});
}

// Check that tombstone newer than grace period won't trigger bloom filter check
// against uncompacting sstable, during compaction.
SEASTAR_TEST_CASE(compaction_optimization_to_avoid_bloom_filter_checks) {
return test_env::do_with_async([] (test_env& env) {
auto builder = schema_builder("tests", "tombstone_purge")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type);
builder.set_gc_grace_seconds(10000);
auto s = builder.build();
auto tmp = tmpdir();
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)] () {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::get_highest_sstable_version(), big);
};

auto compact = [&, s] (std::vector<shared_sstable> all, std::vector<shared_sstable> c) -> compaction_result {
auto t = column_family_for_tests(env.manager(), s, tmp.path().string());
auto stop = deferred_stop(t);
t->disable_auto_compaction().get();
for (auto& sst : all) {
column_family_test(t).add_sstable(sst);
}
auto desc = sstables::compaction_descriptor(std::move(c), default_priority_class());
desc.enable_garbage_collection(t->get_sstable_set());
return compact_sstables(t.get_compaction_manager(), std::move(desc), *t, sst_gen).get0();
};

auto make_insert = [&] (partition_key key) {
mutation m(s, key);
m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), api::new_timestamp());
return m;
};
auto make_delete = [&] (partition_key key) {
mutation m(s, key);
tombstone tomb(api::new_timestamp(), gc_clock::now());
m.partition().apply(tomb);
return m;
};

auto uncompacting = make_sstable_containing(sst_gen, { make_insert(partition_key::from_exploded(*s, {to_bytes("pk1")}) )});
auto compacting = make_sstable_containing(sst_gen, { make_delete(partition_key::from_exploded(*s, {to_bytes("pk1")}) )});

auto result = compact({uncompacting, compacting}, {compacting});
BOOST_REQUIRE_EQUAL(1, result.new_sstables.size());
BOOST_REQUIRE_EQUAL(0, result.stats.bloom_filter_checks);

forward_jump_clocks(std::chrono::seconds(s->gc_grace_seconds()) + 1s);

result = compact({uncompacting, compacting}, {compacting});
BOOST_REQUIRE_EQUAL(1, result.new_sstables.size());
BOOST_REQUIRE_EQUAL(1, result.stats.bloom_filter_checks);
});
}
4 changes: 4 additions & 0 deletions tombstone_gc.cc
@@ -183,3 +183,7 @@ void validate_tombstone_gc_options(const tombstone_gc_options* options, data_dic
throw exceptions::configuration_exception("tombstone_gc option with mode = repair not supported for table with RF one or local replication strategy");
}
}

bool cheap_to_get_gc_before(const schema& s) noexcept {
return s.tombstone_gc_options().mode() != tombstone_gc_mode::repair;
}
3 changes: 3 additions & 0 deletions tombstone_gc.hh
@@ -50,3 +50,6 @@ gc_clock::time_point get_gc_before_for_key(schema_ptr s, const dht::decorated_ke
void update_repair_time(schema_ptr s, const dht::token_range& range, gc_clock::time_point repair_time);

void validate_tombstone_gc_options(const tombstone_gc_options* options, data_dictionary::database db, sstring ks_name);

// Returns true if it's cheap to retrieve gc_before, e.g. the mode will not require accessing a system table.
bool cheap_to_get_gc_before(const schema& s) noexcept;
