From da04fea71e59ca32948d38e4de9f10bbbc66f762 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Mon, 16 Oct 2023 15:44:59 -0300 Subject: [PATCH] compaction: Fix key estimation per sstable to produce efficient filters The estimation assumes that the size of other components is irrelevant when estimating the number of partitions for each output sstable. The sstables are split according to the data file size, therefore the size of other files is irrelevant for the estimation. With certain data models, like single-row partitions containing small values, the index could be even larger than data. For example, assume the index is as large as data; then the estimation would say that 2x more sstables will be generated, and as a result, each sstable is underestimated to have 2x fewer keys. Fix it by only accounting for the size of the data file. Fixes #15726. Signed-off-by: Raphael S. Carvalho Closes scylladb/scylladb#15727 --- compaction/compaction.cc | 5 ++- test/boost/sstable_compaction_test.cc | 51 +++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/compaction/compaction.cc b/compaction/compaction.cc index 23783ecf97b3..6060012ab808 100644 --- a/compaction/compaction.cc +++ b/compaction/compaction.cc @@ -439,6 +439,8 @@ class compaction { const uint32_t _sstable_level; uint64_t _start_size = 0; uint64_t _end_size = 0; + // fully expired files, which are skipped, aren't taken into account. 
+ uint64_t _compacting_data_file_size = 0; uint64_t _estimated_partitions = 0; uint64_t _bloom_filter_checks = 0; db::replay_position _rp; @@ -501,7 +503,7 @@ class compaction { virtual uint64_t partitions_per_sstable() const { // some tests use _max_sstable_size == 0 for force many one partition per sstable auto max_sstable_size = std::max(_max_sstable_size, 1); - uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size))); + uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_compacting_data_file_size) / max_sstable_size))); return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)), _table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions)); } @@ -710,6 +712,7 @@ class compaction { // for a better estimate for the number of partitions in the merged // sstable than just adding up the lengths of individual sstables. _estimated_partitions += sst->get_estimated_key_count(); + _compacting_data_file_size += sst->ondisk_data_size(); // TODO: // Note that this is not fully correct. Since we might be merging sstables that originated on // another shard (#cpu changed), we might be comparing RP:s with differing shard ids, diff --git a/test/boost/sstable_compaction_test.cc b/test/boost/sstable_compaction_test.cc index d8b58951a3e1..f25979306a48 100644 --- a/test/boost/sstable_compaction_test.cc +++ b/test/boost/sstable_compaction_test.cc @@ -5257,3 +5257,54 @@ SEASTAR_TEST_CASE(test_sstables_excluding_staging_correctness) { } }); } + +// Reproducer for https://github.com/scylladb/scylladb/issues/15726. 
+SEASTAR_TEST_CASE(produces_optimal_filter_by_estimating_correctly_partitions_per_sstable) { + return test_env::do_with_async([] (test_env& env) { + auto builder = schema_builder("tests", "test") + .with_column("id", utf8_type, column_kind::partition_key) + .with_column("value", int32_type); + builder.set_compressor_params(compression_parameters::no_compression()); + auto s = builder.build(); + auto sst_gen = env.make_sst_factory(s); + + auto compact = [&, s] (std::vector c, uint64_t max_size) -> compaction_result { + auto t = env.make_table_for_tests(s); + auto stop = deferred_stop(t); + t->disable_auto_compaction().get(); + auto desc = sstables::compaction_descriptor(std::move(c)); + desc.max_sstable_bytes = max_size; + return compact_sstables(std::move(desc), t, sst_gen).get0(); + }; + + auto make_insert = [&] (partition_key key) { + mutation m(s, key); + m.set_clustered_cell(clustering_key::make_empty(), bytes("value"), data_value(int32_t(1)), api::new_timestamp()); + return m; + }; + + const sstring shared_key_prefix = "832193982198319823hsdjahdashjdsa81923189381931829sdajidjkas812938219jdsalljdadsajk319820"; + + std::vector muts; + constexpr int keys = 200; + muts.reserve(keys); + for (auto i = 0; i < keys; i++) { + muts.push_back(make_insert(partition_key::from_exploded(*s, {to_bytes(shared_key_prefix + to_sstring(i))}))); + } + auto sst = make_sstable_containing(sst_gen, std::move(muts)); + + testlog.info("index size: {}, data_size: {}", sst->index_size(), sst->ondisk_data_size()); + + uint64_t max_sstable_size = std::ceil(double(sst->ondisk_data_size()) / 10); + auto ret = compact({sst}, max_sstable_size); + + uint64_t partitions_per_sstable = keys / ret.new_sstables.size(); + auto filter = utils::i_filter::get_filter(partitions_per_sstable, s->bloom_filter_fp_chance(), utils::filter_format::m_format); + + auto comp = ret.new_sstables.front()->get_open_info().get0(); + + // Filter for SSTable generated cannot be lower than the one expected + 
testlog.info("filter size: actual={}, expected>={}", comp.components->filter->memory_size(), filter->memory_size()); + BOOST_REQUIRE(comp.components->filter->memory_size() >= filter->memory_size()); + }); +}