diff --git a/HISTORY.md b/HISTORY.md index ad6b66c9999..f11553fe684 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,13 @@ # Rocksdb Change Log ## Unreleased +### New Features +* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287). +* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache. + +### Public API Changes +* Added `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists. + +## 8.2.0 (04/24/2023) ### Public API Changes * `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. * Add `multi_get_for_update` to C API. @@ -14,6 +22,7 @@ ### Bug Fixes * In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size. +* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet. ### New Features * Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called. @@ -21,9 +30,6 @@ * Added statistics tickers BYTES_COMPRESSED_FROM, BYTES_COMPRESSED_TO, BYTES_COMPRESSION_BYPASSED, BYTES_COMPRESSION_REJECTED, NUMBER_BLOCK_COMPRESSION_BYPASSED, and NUMBER_BLOCK_COMPRESSION_REJECTED. Disabled/deprecated histograms BYTES_COMPRESSED and BYTES_DECOMPRESSED, and ticker NUMBER_BLOCK_NOT_COMPRESSED. The new tickers offer more insight into compression ratios, rejected vs. disabled compression, etc. (#11388) * New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction. -### Bug Fixes -* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet. - ## 8.1.0 (03/18/2023) ### Behavior changes * Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive PartitionerRequest for range tombstone start keys.
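For reference, here is a minimal sketch of how the additions listed in the changelog above might be used together, assuming the public headers this change touches (`rocksdb/cache.h`, `rocksdb/memory_allocator.h`, `rocksdb/options.h`, `rocksdb/table.h`); the capacity, arena count, and checksum width below are illustrative values, not recommendations.

```cpp
#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch: opt into the features described in this release.
Options MakeOptionsSketch() {
  Options options;

  // Per key-value protection for blocks held in block cache; accepted
  // values are 0 (off), 1, 2, 4, or 8 bytes per key (see ValidateOptions()
  // in this change).
  options.block_protection_bytes_per_key = 8;

  // Optional: a jemalloc no-dump allocator with several arenas to reduce
  // allocator mutex contention (takes effect only in jemalloc builds).
  std::shared_ptr<MemoryAllocator> allocator;
  JemallocAllocatorOptions jopts;
  jopts.num_arenas = 4;
  NewJemallocNodumpAllocator(jopts, &allocator).PermitUncheckedError();

  // New-style cache construction: fill the options struct and call
  // MakeSharedCache() rather than the deprecated NewLRUCache() overloads.
  LRUCacheOptions cache_opts;
  cache_opts.capacity = size_t{512} << 20;  // 512 MiB, illustrative
  cache_opts.memory_allocator = allocator;
  std::shared_ptr<Cache> block_cache = cache_opts.MakeSharedCache();

  BlockBasedTableOptions bbto;
  bbto.block_cache = block_cache;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  return options;
}
```

The same pattern applies to `HyperClockCacheOptions` and `CompressedSecondaryCacheOptions`, which in this change gain `MakeSharedCache()` and `MakeSharedSecondaryCache()` respectively.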
diff --git a/TARGETS b/TARGETS index 2514e09a7cd..bbd4530cf3b 100644 --- a/TARGETS +++ b/TARGETS @@ -354,351 +354,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "//folly/synchronization:distributed_mutex", ], headers=None, link_whole=False, extra_test_libs=False) -cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ - "cache/cache.cc", - "cache/cache_entry_roles.cc", - "cache/cache_helpers.cc", - "cache/cache_key.cc", - "cache/cache_reservation_manager.cc", - "cache/charged_cache.cc", - "cache/clock_cache.cc", - "cache/compressed_secondary_cache.cc", - "cache/lru_cache.cc", - "cache/secondary_cache.cc", - "cache/secondary_cache_adapter.cc", - "cache/sharded_cache.cc", - "db/arena_wrapped_db_iter.cc", - "db/blob/blob_contents.cc", - "db/blob/blob_fetcher.cc", - "db/blob/blob_file_addition.cc", - "db/blob/blob_file_builder.cc", - "db/blob/blob_file_cache.cc", - "db/blob/blob_file_garbage.cc", - "db/blob/blob_file_meta.cc", - "db/blob/blob_file_reader.cc", - "db/blob/blob_garbage_meter.cc", - "db/blob/blob_log_format.cc", - "db/blob/blob_log_sequential_reader.cc", - "db/blob/blob_log_writer.cc", - "db/blob/blob_source.cc", - "db/blob/prefetch_buffer_collection.cc", - "db/builder.cc", - "db/c.cc", - "db/column_family.cc", - "db/compaction/compaction.cc", - "db/compaction/compaction_iterator.cc", - "db/compaction/compaction_job.cc", - "db/compaction/compaction_outputs.cc", - "db/compaction/compaction_picker.cc", - "db/compaction/compaction_picker_fifo.cc", - "db/compaction/compaction_picker_level.cc", - "db/compaction/compaction_picker_universal.cc", - "db/compaction/compaction_service_job.cc", - "db/compaction/compaction_state.cc", - "db/compaction/sst_partitioner.cc", - "db/compaction/subcompaction_state.cc", - "db/convenience.cc", - "db/db_filesnapshot.cc", - "db/db_impl/compacted_db_impl.cc", - "db/db_impl/db_impl.cc", - "db/db_impl/db_impl_compaction_flush.cc", - "db/db_impl/db_impl_debug.cc", - "db/db_impl/db_impl_experimental.cc", - "db/db_impl/db_impl_files.cc", - "db/db_impl/db_impl_open.cc", - "db/db_impl/db_impl_readonly.cc", - "db/db_impl/db_impl_secondary.cc", - "db/db_impl/db_impl_write.cc", - "db/db_info_dumper.cc", - "db/db_iter.cc", - "db/dbformat.cc", - "db/error_handler.cc", - "db/event_helpers.cc", - "db/experimental.cc", - "db/external_sst_file_ingestion_job.cc", - "db/file_indexer.cc", - "db/flush_job.cc", - "db/flush_scheduler.cc", - "db/forward_iterator.cc", - "db/import_column_family_job.cc", - "db/internal_stats.cc", - "db/log_reader.cc", - "db/log_writer.cc", - "db/logs_with_prep_tracker.cc", - "db/malloc_stats.cc", - "db/memtable.cc", - "db/memtable_list.cc", - "db/merge_helper.cc", - "db/merge_operator.cc", - "db/output_validator.cc", - "db/periodic_task_scheduler.cc", - "db/range_del_aggregator.cc", - "db/range_tombstone_fragmenter.cc", - "db/repair.cc", - "db/seqno_to_time_mapping.cc", - "db/snapshot_impl.cc", - "db/table_cache.cc", - "db/table_properties_collector.cc", - "db/transaction_log_impl.cc", - "db/trim_history_scheduler.cc", - "db/version_builder.cc", - "db/version_edit.cc", - "db/version_edit_handler.cc", - "db/version_set.cc", - "db/wal_edit.cc", - "db/wal_manager.cc", - "db/wide/wide_column_serialization.cc", - "db/wide/wide_columns.cc", - "db/write_batch.cc", - "db/write_batch_base.cc", - "db/write_controller.cc", - "db/write_stall_stats.cc", - "db/write_thread.cc", - "env/composite_env.cc", - "env/env.cc", - "env/env_chroot.cc", - "env/env_encryption.cc", - "env/env_posix.cc", - "env/file_system.cc", - "env/file_system_tracer.cc", - 
"env/fs_posix.cc", - "env/fs_remap.cc", - "env/io_posix.cc", - "env/mock_env.cc", - "env/unique_id_gen.cc", - "file/delete_scheduler.cc", - "file/file_prefetch_buffer.cc", - "file/file_util.cc", - "file/filename.cc", - "file/line_file_reader.cc", - "file/random_access_file_reader.cc", - "file/read_write_util.cc", - "file/readahead_raf.cc", - "file/sequence_file_reader.cc", - "file/sst_file_manager_impl.cc", - "file/writable_file_writer.cc", - "logging/auto_roll_logger.cc", - "logging/event_logger.cc", - "logging/log_buffer.cc", - "memory/arena.cc", - "memory/concurrent_arena.cc", - "memory/jemalloc_nodump_allocator.cc", - "memory/memkind_kmem_allocator.cc", - "memory/memory_allocator.cc", - "memtable/alloc_tracker.cc", - "memtable/hash_linklist_rep.cc", - "memtable/hash_skiplist_rep.cc", - "memtable/skiplistrep.cc", - "memtable/vectorrep.cc", - "memtable/write_buffer_manager.cc", - "monitoring/histogram.cc", - "monitoring/histogram_windowing.cc", - "monitoring/in_memory_stats_history.cc", - "monitoring/instrumented_mutex.cc", - "monitoring/iostats_context.cc", - "monitoring/perf_context.cc", - "monitoring/perf_level.cc", - "monitoring/persistent_stats_history.cc", - "monitoring/statistics.cc", - "monitoring/thread_status_impl.cc", - "monitoring/thread_status_updater.cc", - "monitoring/thread_status_updater_debug.cc", - "monitoring/thread_status_util.cc", - "monitoring/thread_status_util_debug.cc", - "options/cf_options.cc", - "options/configurable.cc", - "options/customizable.cc", - "options/db_options.cc", - "options/options.cc", - "options/options_helper.cc", - "options/options_parser.cc", - "port/mmap.cc", - "port/port_posix.cc", - "port/stack_trace.cc", - "port/win/env_default.cc", - "port/win/env_win.cc", - "port/win/io_win.cc", - "port/win/port_win.cc", - "port/win/win_logger.cc", - "port/win/win_thread.cc", - "table/adaptive/adaptive_table_factory.cc", - "table/block_based/binary_search_index_reader.cc", - "table/block_based/block.cc", - "table/block_based/block_based_table_builder.cc", - "table/block_based/block_based_table_factory.cc", - "table/block_based/block_based_table_iterator.cc", - "table/block_based/block_based_table_reader.cc", - "table/block_based/block_builder.cc", - "table/block_based/block_cache.cc", - "table/block_based/block_prefetcher.cc", - "table/block_based/block_prefix_index.cc", - "table/block_based/data_block_footer.cc", - "table/block_based/data_block_hash_index.cc", - "table/block_based/filter_block_reader_common.cc", - "table/block_based/filter_policy.cc", - "table/block_based/flush_block_policy.cc", - "table/block_based/full_filter_block.cc", - "table/block_based/hash_index_reader.cc", - "table/block_based/index_builder.cc", - "table/block_based/index_reader_common.cc", - "table/block_based/parsed_full_filter_block.cc", - "table/block_based/partitioned_filter_block.cc", - "table/block_based/partitioned_index_iterator.cc", - "table/block_based/partitioned_index_reader.cc", - "table/block_based/reader_common.cc", - "table/block_based/uncompression_dict_reader.cc", - "table/block_fetcher.cc", - "table/compaction_merging_iterator.cc", - "table/cuckoo/cuckoo_table_builder.cc", - "table/cuckoo/cuckoo_table_factory.cc", - "table/cuckoo/cuckoo_table_reader.cc", - "table/format.cc", - "table/get_context.cc", - "table/iterator.cc", - "table/merging_iterator.cc", - "table/meta_blocks.cc", - "table/persistent_cache_helper.cc", - "table/plain/plain_table_bloom.cc", - "table/plain/plain_table_builder.cc", - "table/plain/plain_table_factory.cc", - 
"table/plain/plain_table_index.cc", - "table/plain/plain_table_key_coding.cc", - "table/plain/plain_table_reader.cc", - "table/sst_file_dumper.cc", - "table/sst_file_reader.cc", - "table/sst_file_writer.cc", - "table/table_factory.cc", - "table/table_properties.cc", - "table/two_level_iterator.cc", - "table/unique_id.cc", - "test_util/sync_point.cc", - "test_util/sync_point_impl.cc", - "test_util/transaction_test_util.cc", - "tools/dump/db_dump_tool.cc", - "tools/io_tracer_parser_tool.cc", - "tools/ldb_cmd.cc", - "tools/ldb_tool.cc", - "tools/sst_dump_tool.cc", - "trace_replay/block_cache_tracer.cc", - "trace_replay/io_tracer.cc", - "trace_replay/trace_record.cc", - "trace_replay/trace_record_handler.cc", - "trace_replay/trace_record_result.cc", - "trace_replay/trace_replay.cc", - "util/async_file_reader.cc", - "util/build_version.cc", - "util/cleanable.cc", - "util/coding.cc", - "util/compaction_job_stats_impl.cc", - "util/comparator.cc", - "util/compression.cc", - "util/compression_context_cache.cc", - "util/concurrent_task_limiter_impl.cc", - "util/crc32c.cc", - "util/crc32c_arm64.cc", - "util/data_structure.cc", - "util/dynamic_bloom.cc", - "util/file_checksum_helper.cc", - "util/hash.cc", - "util/murmurhash.cc", - "util/random.cc", - "util/rate_limiter.cc", - "util/ribbon_config.cc", - "util/slice.cc", - "util/status.cc", - "util/stderr_logger.cc", - "util/string_util.cc", - "util/thread_local.cc", - "util/threadpool_imp.cc", - "util/xxhash.cc", - "utilities/agg_merge/agg_merge.cc", - "utilities/backup/backup_engine.cc", - "utilities/blob_db/blob_compaction_filter.cc", - "utilities/blob_db/blob_db.cc", - "utilities/blob_db/blob_db_impl.cc", - "utilities/blob_db/blob_db_impl_filesnapshot.cc", - "utilities/blob_db/blob_dump_tool.cc", - "utilities/blob_db/blob_file.cc", - "utilities/cache_dump_load.cc", - "utilities/cache_dump_load_impl.cc", - "utilities/cassandra/cassandra_compaction_filter.cc", - "utilities/cassandra/format.cc", - "utilities/cassandra/merge_operator.cc", - "utilities/checkpoint/checkpoint_impl.cc", - "utilities/compaction_filters.cc", - "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", - "utilities/convenience/info_log_finder.cc", - "utilities/counted_fs.cc", - "utilities/debug.cc", - "utilities/env_mirror.cc", - "utilities/env_timed.cc", - "utilities/fault_injection_env.cc", - "utilities/fault_injection_fs.cc", - "utilities/fault_injection_secondary_cache.cc", - "utilities/leveldb_options/leveldb_options.cc", - "utilities/memory/memory_util.cc", - "utilities/merge_operators.cc", - "utilities/merge_operators/bytesxor.cc", - "utilities/merge_operators/max.cc", - "utilities/merge_operators/put.cc", - "utilities/merge_operators/sortlist.cc", - "utilities/merge_operators/string_append/stringappend.cc", - "utilities/merge_operators/string_append/stringappend2.cc", - "utilities/merge_operators/uint64add.cc", - "utilities/object_registry.cc", - "utilities/option_change_migration/option_change_migration.cc", - "utilities/options/options_util.cc", - "utilities/persistent_cache/block_cache_tier.cc", - "utilities/persistent_cache/block_cache_tier_file.cc", - "utilities/persistent_cache/block_cache_tier_metadata.cc", - "utilities/persistent_cache/persistent_cache_tier.cc", - "utilities/persistent_cache/volatile_tier_impl.cc", - "utilities/simulator_cache/cache_simulator.cc", - "utilities/simulator_cache/sim_cache.cc", - "utilities/table_properties_collectors/compact_on_deletion_collector.cc", - "utilities/trace/file_trace_reader_writer.cc", - 
"utilities/trace/replayer_impl.cc", - "utilities/transactions/lock/lock_manager.cc", - "utilities/transactions/lock/point/point_lock_manager.cc", - "utilities/transactions/lock/point/point_lock_tracker.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", - "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", - "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", - "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", - "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", - "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", - "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", - "utilities/transactions/optimistic_transaction.cc", - "utilities/transactions/optimistic_transaction_db_impl.cc", - "utilities/transactions/pessimistic_transaction.cc", - "utilities/transactions/pessimistic_transaction_db.cc", - "utilities/transactions/snapshot_checker.cc", - "utilities/transactions/transaction_base.cc", - "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_util.cc", - "utilities/transactions/write_prepared_txn.cc", - "utilities/transactions/write_prepared_txn_db.cc", - "utilities/transactions/write_unprepared_txn.cc", - "utilities/transactions/write_unprepared_txn_db.cc", - "utilities/ttl/db_ttl_impl.cc", - "utilities/wal_filter.cc", - "utilities/write_batch_with_index/write_batch_with_index.cc", - "utilities/write_batch_with_index/write_batch_with_index_internal.cc", - ], deps=[ - "//folly/container:f14_hash", - "//folly/experimental/coro:blocking_wait", - "//folly/experimental/coro:collect", - "//folly/experimental/coro:coroutine", - "//folly/experimental/coro:task", - "//folly/synchronization:distributed_mutex", - ], headers=None, link_whole=True, extra_test_libs=False) +cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False) cpp_library_wrapper(name="rocksdb_test_lib", srcs=[ "db/db_test_util.cc", diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index d7e9b645be4..a9e7b447df3 100755 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -154,16 +154,9 @@ def generate_targets(repo_path, deps_map): # rocksdb_whole_archive_lib TARGETS.add_library( "rocksdb_whole_archive_lib", - src_mk["LIB_SOURCES"] + - # always add range_tree, it's only excluded on ppc64, which we don't use internally - src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"], + [], deps=[ - "//folly/container:f14_hash", - "//folly/experimental/coro:blocking_wait", - "//folly/experimental/coro:collect", - "//folly/experimental/coro:coroutine", - "//folly/experimental/coro:task", - "//folly/synchronization:distributed_mutex", + ":rocksdb_lib", ], headers=None, extra_external_deps="", diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 12be0babef9..80fbbe88f6d 100644 --- a/cache/clock_cache.cc 
+++ b/cache/clock_cache.cc @@ -1282,25 +1282,20 @@ size_t ClockCacheShard::GetTableAddressCount() const { // Explicit instantiation template class ClockCacheShard; -HyperClockCache::HyperClockCache( - size_t capacity, size_t estimated_value_size, int num_shard_bits, - bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy, - std::shared_ptr memory_allocator) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, - std::move(memory_allocator)) { - assert(estimated_value_size > 0 || - metadata_charge_policy != kDontChargeCacheMetadata); +HyperClockCache::HyperClockCache(const HyperClockCacheOptions& opts) + : ShardedCache(opts) { + assert(opts.estimated_entry_charge > 0 || + opts.metadata_charge_policy != kDontChargeCacheMetadata); // TODO: should not need to go through two levels of pointer indirection to // get to table entries size_t per_shard = GetPerShardCapacity(); MemoryAllocator* alloc = this->memory_allocator(); - const Cache::EvictionCallback* eviction_callback = &eviction_callback_; - InitShards([=](Shard* cs) { - HyperClockTable::Opts opts; - opts.estimated_value_size = estimated_value_size; - new (cs) Shard(per_shard, strict_capacity_limit, metadata_charge_policy, - alloc, eviction_callback, opts); + InitShards([&](Shard* cs) { + HyperClockTable::Opts table_opts; + table_opts.estimated_value_size = opts.estimated_entry_charge; + new (cs) Shard(per_shard, opts.strict_capacity_limit, + opts.metadata_charge_policy, alloc, &eviction_callback_, + table_opts); }); } @@ -1460,21 +1455,23 @@ std::shared_ptr NewClockCache( } std::shared_ptr HyperClockCacheOptions::MakeSharedCache() const { - auto my_num_shard_bits = num_shard_bits; - if (my_num_shard_bits >= 20) { + // For sanitized options + HyperClockCacheOptions opts = *this; + if (opts.num_shard_bits >= 20) { return nullptr; // The cache cannot be sharded into too many fine pieces. } - if (my_num_shard_bits < 0) { + if (opts.num_shard_bits < 0) { // Use larger shard size to reduce risk of large entries clustering // or skewing individual shards. 
constexpr size_t min_shard_size = 32U * 1024U * 1024U; - my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); + opts.num_shard_bits = + GetDefaultCacheShardBits(opts.capacity, min_shard_size); } - std::shared_ptr cache = std::make_shared( - capacity, estimated_entry_charge, my_num_shard_bits, - strict_capacity_limit, metadata_charge_policy, memory_allocator); - if (secondary_cache) { - cache = std::make_shared(cache, secondary_cache); + std::shared_ptr cache = + std::make_shared(opts); + if (opts.secondary_cache) { + cache = std::make_shared(cache, + opts.secondary_cache); } return cache; } diff --git a/cache/clock_cache.h b/cache/clock_cache.h index fc5aef6cb4d..a9515146a28 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -682,10 +682,7 @@ class HyperClockCache public: using Shard = ClockCacheShard; - HyperClockCache(size_t capacity, size_t estimated_value_size, - int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy, - std::shared_ptr memory_allocator); + explicit HyperClockCache(const HyperClockCacheOptions& opts); const char* Name() const override { return "HyperClockCache"; } diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc index affea8c54f7..2408afc0a17 100644 --- a/cache/compressed_secondary_cache.cc +++ b/cache/compressed_secondary_cache.cc @@ -17,23 +17,8 @@ namespace ROCKSDB_NAMESPACE { CompressedSecondaryCache::CompressedSecondaryCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - CompressionType compression_type, uint32_t compress_format_version, - bool enable_custom_split_merge, - const CacheEntryRoleSet& do_not_compress_roles) - : cache_options_(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator, - use_adaptive_mutex, metadata_charge_policy, - compression_type, compress_format_version, - enable_custom_split_merge, do_not_compress_roles) { - cache_ = - NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, - metadata_charge_policy, low_pri_pool_ratio); -} + const CompressedSecondaryCacheOptions& opts) + : cache_(opts.LRUCacheOptions::MakeSharedCache()), cache_options_(opts) {} CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); } @@ -311,31 +296,9 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper( } } -std::shared_ptr NewCompressedSecondaryCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - CompressionType compression_type, uint32_t compress_format_version, - bool enable_custom_split_merge, - const CacheEntryRoleSet& do_not_compress_roles) { - return std::make_shared( - capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - low_pri_pool_ratio, memory_allocator, use_adaptive_mutex, - metadata_charge_policy, compression_type, compress_format_version, - enable_custom_split_merge, do_not_compress_roles); -} - -std::shared_ptr NewCompressedSecondaryCache( - const CompressedSecondaryCacheOptions& opts) { - // The secondary_cache is disabled for this LRUCache instance. 
- assert(opts.secondary_cache == nullptr); - return NewCompressedSecondaryCache( - opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit, - opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator, - opts.use_adaptive_mutex, opts.metadata_charge_policy, - opts.compression_type, opts.compress_format_version, - opts.enable_custom_split_merge, opts.do_not_compress_roles); +std::shared_ptr +CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const { + return std::make_shared(*this); } } // namespace ROCKSDB_NAMESPACE diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h index 7b45ca8bd91..d20f2d1d7a2 100644 --- a/cache/compressed_secondary_cache.h +++ b/cache/compressed_secondary_cache.h @@ -69,18 +69,8 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle { class CompressedSecondaryCache : public SecondaryCache { public: - CompressedSecondaryCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex, - CacheMetadataChargePolicy metadata_charge_policy = - kDefaultCacheMetadataChargePolicy, - CompressionType compression_type = CompressionType::kLZ4Compression, - uint32_t compress_format_version = 2, - bool enable_custom_split_merge = false, - const CacheEntryRoleSet& do_not_compress_roles = { - CacheEntryRole::kFilterBlock}); + explicit CompressedSecondaryCache( + const CompressedSecondaryCacheOptions& opts); ~CompressedSecondaryCache() override; const char* Name() const override { return "CompressedSecondaryCache"; } diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc index 1e41fc142b4..18b51ccf8fe 100644 --- a/cache/compressed_secondary_cache_test.cc +++ b/cache/compressed_secondary_cache_test.cc @@ -626,8 +626,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test, using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk; std::unique_ptr sec_cache = - std::make_unique(1000, 0, true, 0.5, 0.0, - allocator); + std::make_unique( + CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0, + allocator)); Random rnd(301); // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split. size_t str_size{8500}; @@ -678,7 +679,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test, std::string str = str1 + str2 + str3; std::unique_ptr sec_cache = - std::make_unique(1000, 0, true, 0.5, 0.0); + std::make_unique( + CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0)); size_t charge{0}; CacheAllocationPtr value = sec_cache->MergeChunksIntoValue(chunks_head, charge); @@ -708,8 +710,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test, using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk; std::unique_ptr sec_cache = - std::make_unique(1000, 0, true, 0.5, 0.0, - allocator); + std::make_unique( + CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0, + allocator)); Random rnd(301); // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split. 
size_t str_size{8500}; diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 3b4e80ef87b..02119c81900 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -646,23 +646,15 @@ void LRUCacheShard::AppendPrintableOptions(std::string& str) const { str.append(buffer); } -LRUCache::LRUCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, double high_pri_pool_ratio, - double low_pri_pool_ratio, - std::shared_ptr allocator, - bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, - std::move(allocator)) { +LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) { size_t per_shard = GetPerShardCapacity(); MemoryAllocator* alloc = memory_allocator(); - const EvictionCallback* eviction_callback = &eviction_callback_; - InitShards([=](LRUCacheShard* cs) { - new (cs) LRUCacheShard(per_shard, strict_capacity_limit, - high_pri_pool_ratio, low_pri_pool_ratio, - use_adaptive_mutex, metadata_charge_policy, - /* max_upper_hash_bits */ 32 - num_shard_bits, alloc, - eviction_callback); + InitShards([&](LRUCacheShard* cs) { + new (cs) LRUCacheShard(per_shard, opts.strict_capacity_limit, + opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, + opts.use_adaptive_mutex, opts.metadata_charge_policy, + /* max_upper_hash_bits */ 32 - opts.num_shard_bits, + alloc, &eviction_callback_); }); } @@ -692,13 +684,7 @@ double LRUCache::GetHighPriPoolRatio() { } // namespace lru_cache -std::shared_ptr NewLRUCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - const std::shared_ptr& secondary_cache, - double low_pri_pool_ratio) { +std::shared_ptr LRUCacheOptions::MakeSharedCache() const { if (num_shard_bits >= 20) { return nullptr; // The cache cannot be sharded into too many fine pieces. 
} @@ -714,36 +700,15 @@ std::shared_ptr NewLRUCache( // Invalid high_pri_pool_ratio and low_pri_pool_ratio combination return nullptr; } - if (num_shard_bits < 0) { - num_shard_bits = GetDefaultCacheShardBits(capacity); + // For sanitized options + LRUCacheOptions opts = *this; + if (opts.num_shard_bits < 0) { + opts.num_shard_bits = GetDefaultCacheShardBits(capacity); } - std::shared_ptr cache = std::make_shared( - capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex, - metadata_charge_policy); + std::shared_ptr cache = std::make_shared(opts); if (secondary_cache) { cache = std::make_shared(cache, secondary_cache); } return cache; } - -std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { - return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, - cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, - cache_opts.metadata_charge_policy, - cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio); -} - -std::shared_ptr NewLRUCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - double low_pri_pool_ratio) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, - metadata_charge_policy, nullptr, low_pri_pool_ratio); -} } // namespace ROCKSDB_NAMESPACE diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 554907b3bea..9e6f15062f8 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -446,12 +446,7 @@ class LRUCache #endif : public ShardedCache { public: - LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex, - CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata); + explicit LRUCache(const LRUCacheOptions& opts); const char* Name() const override { return "LRUCache"; } ObjectPtr Value(Handle* handle) override; size_t GetCharge(Handle* handle) const override; diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 9ebca3ba827..f8d518067a8 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -19,14 +19,12 @@ namespace ROCKSDB_NAMESPACE { -ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, - std::shared_ptr allocator) - : Cache(std::move(allocator)), +ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts) + : Cache(opts.memory_allocator), last_id_(1), - shard_mask_((uint32_t{1} << num_shard_bits) - 1), - strict_capacity_limit_(strict_capacity_limit), - capacity_(capacity) {} + shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1), + strict_capacity_limit_(opts.strict_capacity_limit), + capacity_(opts.capacity) {} size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const { uint32_t num_shards = GetNumShards(); diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 04eaa5318ea..d689783d3c8 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -89,9 +89,7 @@ class CacheShardBase { // Portions of ShardedCache that do not depend on the template parameter class ShardedCacheBase : public Cache { public: - ShardedCacheBase(size_t capacity, int 
num_shard_bits, - bool strict_capacity_limit, - std::shared_ptr memory_allocator); + explicit ShardedCacheBase(const ShardedCacheOptions& opts); virtual ~ShardedCacheBase() = default; int GetNumShardBits() const; @@ -134,10 +132,8 @@ class ShardedCache : public ShardedCacheBase { using HashCref = typename CacheShard::HashCref; using HandleImpl = typename CacheShard::HandleImpl; - ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - std::shared_ptr allocator) - : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit, - allocator), + explicit ShardedCache(const ShardedCacheOptions& opts) + : ShardedCacheBase(opts), shards_(reinterpret_cast(port::cacheline_aligned_alloc( sizeof(CacheShard) * GetNumShards()))), destroy_shards_in_dtor_(false) {} diff --git a/db/builder.cc b/db/builder.cc index be1ec29bf0c..eadc315c9aa 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -380,7 +380,8 @@ Status BuildTable( MaxFileSizeForL0MetaPin(mutable_cf_options), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key*/ nullptr, - /*allow_unprepared_value*/ false)); + /*allow_unprepared_value*/ false, + mutable_cf_options.block_protection_bytes_per_key)); s = it->status(); if (s.ok() && paranoid_file_checks) { OutputValidator file_validator(tboptions.internal_comparator, diff --git a/db/column_family.cc b/db/column_family.cc index 24ea46ac486..0b3fe680772 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1428,6 +1428,12 @@ Status ColumnFamilyData::ValidateOptions( "Memtable per key-value checksum protection only supports 0, 1, 2, 4 " "or 8 bytes per key."); } + if (std::find(supported.begin(), supported.end(), + cf_options.block_protection_bytes_per_key) == supported.end()) { + return Status::NotSupported( + "Block per key-value checksum protection only supports 0, 1, 2, 4 " + "or 8 bytes per key."); + } return s; } diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 8a326a508f4..ed152f28c60 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -504,7 +504,9 @@ void CompactionJob::GenSubcompactionBoundaries() { FileMetaData* f = flevel->files[i].file_metadata; std::vector my_anchors; Status s = cfd->table_cache()->ApproximateKeyAnchors( - read_options, icomp, *f, my_anchors); + read_options, icomp, *f, + c->mutable_cf_options()->block_protection_bytes_per_key, + my_anchors); if (!s.ok() || my_anchors.empty()) { my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize()); } @@ -735,7 +737,9 @@ Status CompactionJob::Run() { *compact_->compaction->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); + /*allow_unprepared_value=*/false, + compact_->compaction->mutable_cf_options() + ->block_protection_bytes_per_key); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 9c5784d5e02..79f8e5fd52e 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -454,7 +454,8 @@ class CompactionJobTestBase : public testing::Test { Status s = cf_options_.table_factory->NewTableReader( read_opts, TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(), - cfd_->internal_comparator()), + cfd_->internal_comparator(), + 0 /* block_protection_bytes_per_key */), std::move(freader), file_size, &table_reader, false); ASSERT_OK(s); assert(table_reader); diff --git a/db/convenience.cc 
b/db/convenience.cc index 8ab7cbc139a..32cdfafaab2 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -64,8 +64,8 @@ Status VerifySstFileChecksum(const Options& options, const bool kImmortal = true; auto reader_options = TableReaderOptions( ioptions, options.prefix_extractor, env_options, internal_comparator, - false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */, - -1 /* level */); + options.block_protection_bytes_per_key, false /* skip_filters */, + !kImmortal, false /* force_direct_prefetch */, -1 /* level */); reader_options.largest_seqno = largest_seqno; s = ioptions.table_factory->NewTableReader( reader_options, std::move(file_reader), file_size, &table_reader, diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 1a136635339..8fa93d8d758 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -620,9 +620,9 @@ class MockCache : public LRUCache { static uint32_t low_pri_insert_count; MockCache() - : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/, - false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/, - 0.0 /*low_pri_pool_ratio*/) {} + : LRUCache(LRUCacheOptions( + size_t{1} << 25 /*capacity*/, 0 /*num_shard_bits*/, + false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/)) {} using ShardedCache::Insert; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index fcfb777316e..4e36af1e2e9 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2475,7 +2475,6 @@ std::vector DBImpl::MultiGet( // Post processing (decrement reference counts and record statistics) PERF_TIMER_GUARD(get_post_process_time); - autovector superversions_to_delete; for (auto mgd_iter : multiget_cf_data) { auto mgd = mgd_iter.second; diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index ed80760518a..adc2b36bb26 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -20,131 +20,115 @@ class DBStatisticsTest : public DBTestBase { }; TEST_F(DBStatisticsTest, CompressionStatsTest) { - CompressionType type; - - if (Snappy_Supported()) { - type = kSnappyCompression; - fprintf(stderr, "using snappy\n"); - } else if (Zlib_Supported()) { - type = kZlibCompression; - fprintf(stderr, "using zlib\n"); - } else if (BZip2_Supported()) { - type = kBZip2Compression; - fprintf(stderr, "using bzip2\n"); - } else if (LZ4_Supported()) { - type = kLZ4Compression; - fprintf(stderr, "using lz4\n"); - } else if (XPRESS_Supported()) { - type = kXpressCompression; - fprintf(stderr, "using xpress\n"); - } else if (ZSTD_Supported()) { - type = kZSTD; - fprintf(stderr, "using ZSTD\n"); - } else { - fprintf(stderr, "skipping test, compression disabled\n"); - return; - } - - Options options = CurrentOptions(); - options.compression = type; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); - BlockBasedTableOptions bbto; - bbto.enable_index_compression = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - - auto PopStat = [&](Tickers t) -> uint64_t { - return options.statistics->getAndResetTickerCount(t); - }; - - int kNumKeysWritten = 100; - double compress_to = 0.5; - // About three KVs per block - int len = static_cast(BlockBasedTableOptions().block_size / 3); - int uncomp_est = kNumKeysWritten * (len + 20); - - Random rnd(301); - std::string buf; + for (CompressionType type : GetSupportedCompressions()) { + if (type == kNoCompression) { + continue; + } + if (type == 
kBZip2Compression) { + // Weird behavior in this test + continue; + } + SCOPED_TRACE("Compression type: " + std::to_string(type)); + + Options options = CurrentOptions(); + options.compression = type; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); + BlockBasedTableOptions bbto; + bbto.enable_index_compression = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + auto PopStat = [&](Tickers t) -> uint64_t { + return options.statistics->getAndResetTickerCount(t); + }; + + int kNumKeysWritten = 100; + double compress_to = 0.5; + // About three KVs per block + int len = static_cast(BlockBasedTableOptions().block_size / 3); + int uncomp_est = kNumKeysWritten * (len + 20); + + Random rnd(301); + std::string buf; + + // Check that compressions occur and are counted when compression is turned + // on + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK( + Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf))); + } + ASSERT_OK(Flush()); + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSED_FROM), uncomp_est / 10); + EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_COMPRESSED_TO), + uncomp_est / 10); + + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); + + // And decompressions + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_DECOMPRESSED_TO), uncomp_est / 10); + EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_DECOMPRESSED_FROM), + uncomp_est / 10); - // Check that compressions occur and are counted when compression is turned on - for (int i = 0; i < kNumKeysWritten; ++i) { - ASSERT_OK( - Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf))); - } - ASSERT_OK(Flush()); - EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSED)); - EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSED_FROM), uncomp_est / 10); - EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_COMPRESSED_TO), - uncomp_est / 10); - - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); - EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); - EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); - - // And decompressions - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); - } - EXPECT_EQ(34, PopStat(NUMBER_BLOCK_DECOMPRESSED)); - EXPECT_NEAR2(uncomp_est, PopStat(BYTES_DECOMPRESSED_TO), uncomp_est / 10); - EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_DECOMPRESSED_FROM), - uncomp_est / 10); - - EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED)); - EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED)); - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); - - // Check when compression is rejected. 
- compress_to = 0.95; - DestroyAndReopen(options); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); - for (int i = 0; i < kNumKeysWritten; ++i) { - ASSERT_OK( - Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf))); - } - ASSERT_OK(Flush()); - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); - } - EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); - EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_REJECTED), - uncomp_est / 10); - - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED)); - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); - EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM)); - EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO)); - EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED)); - EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); - EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); - - // Check when compression is disabled. - options.compression = kNoCompression; - DestroyAndReopen(options); + // Check when compression is rejected. + DestroyAndReopen(options); - for (int i = 0; i < kNumKeysWritten; ++i) { - ASSERT_OK( - Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf))); - } - ASSERT_OK(Flush()); - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomBinaryString(len))); + } + ASSERT_OK(Flush()); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_REJECTED), + uncomp_est / 10); + + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); + + // Check when compression is disabled. 
+ options.compression = kNoCompression; + DestroyAndReopen(options); + + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomBinaryString(len))); + } + ASSERT_OK(Flush()); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_BYPASSED), + uncomp_est / 10); + + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); } - EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); - EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_BYPASSED), - uncomp_est / 10); - - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED)); - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); - EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); - EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM)); - EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO)); - EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED)); - EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); - EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); } TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 98bd6050a27..ca9b6fb9bbe 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -678,6 +678,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, /*block_cache_tracer*/ nullptr, diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index eddade83744..75a7c599b88 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -36,7 +36,7 @@ class ForwardLevelIterator : public InternalIterator { const ColumnFamilyData* const cfd, const ReadOptions& read_options, const std::vector& files, const std::shared_ptr& prefix_extractor, - bool allow_unprepared_value) + bool allow_unprepared_value, uint8_t block_protection_bytes_per_key) : cfd_(cfd), read_options_(read_options), files_(files), @@ -45,7 +45,8 @@ class ForwardLevelIterator : public InternalIterator { file_iter_(nullptr), pinned_iters_mgr_(nullptr), prefix_extractor_(prefix_extractor), - allow_unprepared_value_(allow_unprepared_value) { + allow_unprepared_value_(allow_unprepared_value), + block_protection_bytes_per_key_(block_protection_bytes_per_key) { status_.PermitUncheckedError(); // Allow uninitialized status through } @@ -87,7 +88,8 @@ class ForwardLevelIterator : public InternalIterator { /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + block_protection_bytes_per_key_); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -211,6 +213,7 @@ class ForwardLevelIterator : public InternalIterator { // Kept alive by 
ForwardIterator::sv_->mutable_cf_options const std::shared_ptr& prefix_extractor_; const bool allow_unprepared_value_; + const uint8_t block_protection_bytes_per_key_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, @@ -738,7 +741,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key)); } BuildLevelIterators(vstorage, sv_); current_ = nullptr; @@ -819,7 +823,8 @@ void ForwardIterator::RenewIterators() { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + svnew->mutable_cf_options.block_protection_bytes_per_key)); } for (auto* f : l0_iters_) { @@ -863,7 +868,8 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, } else { level_iters_.push_back(new ForwardLevelIterator( cfd_, read_options_, level_files, - sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_)); + sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_, + sv->mutable_cf_options.block_protection_bytes_per_key)); } } } @@ -885,7 +891,8 @@ void ForwardIterator::ResetIncompleteIterators() { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 12d2519e9e6..9a8b48dd054 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -250,6 +250,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, /*block_cache_tracer*/ nullptr, diff --git a/db/kv_checksum.h b/db/kv_checksum.h index bce507fcf98..53c02485ffa 100644 --- a/db/kv_checksum.h +++ b/db/kv_checksum.h @@ -46,6 +46,8 @@ template class ProtectionInfoKVOC; template class ProtectionInfoKVOS; +template +class ProtectionInfoKV; // Aliases for 64-bit protection infos. using ProtectionInfo64 = ProtectionInfo; @@ -64,13 +66,13 @@ class ProtectionInfo { ProtectionInfoKVO ProtectKVO(const SliceParts& key, const SliceParts& value, ValueType op_type) const; - - T GetVal() const { return val_; } + ProtectionInfoKV ProtectKV(const Slice& key, const Slice& value) const; private: friend class ProtectionInfoKVO; friend class ProtectionInfoKVOS; friend class ProtectionInfoKVOC; + friend class ProtectionInfoKV; // Each field is hashed with an independent value so we can catch fields being // swapped. 
Per the `NPHash64()` docs, using consecutive seeds is a pitfall, @@ -89,8 +91,47 @@ class ProtectionInfo { static_assert(sizeof(ProtectionInfo) == sizeof(T), ""); } + T GetVal() const { return val_; } void SetVal(T val) { val_ = val; } + void Encode(uint8_t len, char* dst) const { + assert(sizeof(val_) >= len); + switch (len) { + case 1: + dst[0] = static_cast(val_); + break; + case 2: + EncodeFixed16(dst, static_cast(val_)); + break; + case 4: + EncodeFixed32(dst, static_cast(val_)); + break; + case 8: + EncodeFixed64(dst, static_cast(val_)); + break; + default: + assert(false); + } + } + + bool Verify(uint8_t len, const char* checksum_ptr) const { + assert(sizeof(val_) >= len); + switch (len) { + case 1: + return static_cast(checksum_ptr[0]) == + static_cast(val_); + case 2: + return DecodeFixed16(checksum_ptr) == static_cast(val_); + case 4: + return DecodeFixed32(checksum_ptr) == static_cast(val_); + case 8: + return DecodeFixed64(checksum_ptr) == static_cast(val_); + default: + assert(false); + return false; + } + } + T val_ = 0; }; @@ -113,7 +154,14 @@ class ProtectionInfoKVO { void UpdateV(const SliceParts& old_value, const SliceParts& new_value); void UpdateO(ValueType old_op_type, ValueType new_op_type); - T GetVal() const { return info_.GetVal(); } + // Encode this protection info into `len` bytes and stores them in `dst`. + void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); } + // Verify this protection info against the protection info encoded by Encode() + // at the first `len` bytes of `checksum_ptr`. + // Returns true iff the verification is successful. + bool Verify(uint8_t len, const char* checksum_ptr) const { + return info_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfo; @@ -124,6 +172,7 @@ class ProtectionInfoKVO { static_assert(sizeof(ProtectionInfoKVO) == sizeof(T), ""); } + T GetVal() const { return info_.GetVal(); } void SetVal(T val) { info_.SetVal(val); } ProtectionInfo info_; @@ -154,7 +203,10 @@ class ProtectionInfoKVOC { void UpdateC(ColumnFamilyId old_column_family_id, ColumnFamilyId new_column_family_id); - T GetVal() const { return kvo_.GetVal(); } + void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return kvo_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfoKVO; @@ -163,6 +215,7 @@ class ProtectionInfoKVOC { static_assert(sizeof(ProtectionInfoKVOC) == sizeof(T), ""); } + T GetVal() const { return kvo_.GetVal(); } void SetVal(T val) { kvo_.SetVal(val); } ProtectionInfoKVO kvo_; @@ -193,7 +246,10 @@ class ProtectionInfoKVOS { void UpdateS(SequenceNumber old_sequence_number, SequenceNumber new_sequence_number); - T GetVal() const { return kvo_.GetVal(); } + void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return kvo_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfoKVO; @@ -202,11 +258,32 @@ class ProtectionInfoKVOS { static_assert(sizeof(ProtectionInfoKVOS) == sizeof(T), ""); } + T GetVal() const { return kvo_.GetVal(); } void SetVal(T val) { kvo_.SetVal(val); } ProtectionInfoKVO kvo_; }; +template +class ProtectionInfoKV { + public: + ProtectionInfoKV() = default; + + void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return info_.Verify(len, checksum_ptr); + } + + private: + friend class ProtectionInfo; + + explicit ProtectionInfoKV(T val) : 
info_(val) { + static_assert(sizeof(ProtectionInfoKV) == sizeof(T)); + } + + ProtectionInfo info_; +}; + template Status ProtectionInfo::GetStatus() const { if (val_ != 0) { @@ -244,6 +321,16 @@ ProtectionInfoKVO ProtectionInfo::ProtectKVO(const SliceParts& key, return ProtectionInfoKVO(val); } +template +ProtectionInfoKV ProtectionInfo::ProtectKV(const Slice& key, + const Slice& value) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + return ProtectionInfoKV(val); +} + template void ProtectionInfoKVO::UpdateK(const Slice& old_key, const Slice& new_key) { T val = GetVal(); @@ -394,5 +481,4 @@ void ProtectionInfoKVOS::UpdateS(SequenceNumber old_sequence_number, sizeof(new_sequence_number), ProtectionInfo::kSeedS)); SetVal(val); } - } // namespace ROCKSDB_NAMESPACE diff --git a/db/memtable.cc b/db/memtable.cc index e61ddc9db8b..f6c0cc62470 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -256,7 +256,7 @@ void MemTable::UpdateOldestKeyTime() { } Status MemTable::VerifyEntryChecksum(const char* entry, - size_t protection_bytes_per_key, + uint32_t protection_bytes_per_key, bool allow_data_in_errors) { if (protection_bytes_per_key == 0) { return Status::OK(); @@ -285,28 +285,11 @@ Status MemTable::VerifyEntryChecksum(const char* entry, Slice value = Slice(value_ptr, value_length); const char* checksum_ptr = value_ptr + value_length; - uint64_t expected = ProtectionInfo64() - .ProtectKVO(user_key, value, type) - .ProtectS(seq) - .GetVal(); - bool match = true; - switch (protection_bytes_per_key) { - case 1: - match = static_cast(checksum_ptr[0]) == - static_cast(expected); - break; - case 2: - match = DecodeFixed16(checksum_ptr) == static_cast(expected); - break; - case 4: - match = DecodeFixed32(checksum_ptr) == static_cast(expected); - break; - case 8: - match = DecodeFixed64(checksum_ptr) == expected; - break; - default: - assert(false); - } + bool match = + ProtectionInfo64() + .ProtectKVO(user_key, value, type) + .ProtectS(seq) + .Verify(static_cast(protection_bytes_per_key), checksum_ptr); if (!match) { std::string msg( "Corrupted memtable entry, per key-value checksum verification " @@ -526,7 +509,7 @@ class MemTableIterator : public InternalIterator { bool valid_; bool arena_mode_; bool value_pinned_; - size_t protection_bytes_per_key_; + uint32_t protection_bytes_per_key_; Status status_; Logger* logger_; @@ -684,28 +667,15 @@ void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, return; } - uint64_t checksum = 0; if (kv_prot_info == nullptr) { - checksum = - ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal(); + ProtectionInfo64() + .ProtectKVO(key, value, type) + .ProtectS(s) + .Encode(static_cast(moptions_.protection_bytes_per_key), + checksum_ptr); } else { - checksum = kv_prot_info->GetVal(); - } - switch (moptions_.protection_bytes_per_key) { - case 1: - checksum_ptr[0] = static_cast(checksum); - break; - case 2: - EncodeFixed16(checksum_ptr, static_cast(checksum)); - break; - case 4: - EncodeFixed32(checksum_ptr, static_cast(checksum)); - break; - case 8: - EncodeFixed64(checksum_ptr, checksum); - break; - default: - assert(false); + kv_prot_info->Encode( + static_cast(moptions_.protection_bytes_per_key), checksum_ptr); } } @@ -902,7 +872,7 @@ struct Saver { ReadCallback* callback_; bool* is_blob_index; bool allow_data_in_errors; - size_t protection_bytes_per_key; + uint32_t protection_bytes_per_key; bool 
CheckCallback(SequenceNumber _seq) { if (callback_) { return callback_->IsVisible(_seq); diff --git a/db/memtable.h b/db/memtable.h index aa2ba87ca4a..eefabcf88db 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -529,7 +529,7 @@ class MemTable { // Returns Corruption status if verification fails. static Status VerifyEntryChecksum(const char* entry, - size_t protection_bytes_per_key, + uint32_t protection_bytes_per_key, bool allow_data_in_errors = false); private: diff --git a/db/repair.cc b/db/repair.cc index b4b9d0c5ffb..633c348a5c3 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -518,8 +518,9 @@ class Repairer { if (status.ok()) { // TODO: plumb Env::IOActivity const ReadOptions read_options; - status = table_cache_->GetTableProperties(file_options_, read_options, - icmp_, t->meta, &props); + status = table_cache_->GetTableProperties( + file_options_, read_options, icmp_, t->meta, &props, + 0 /* block_protection_bytes_per_key */); } if (status.ok()) { auto s = @@ -577,7 +578,8 @@ class Repairer { /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); + /*allow_unprepared_value=*/false, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key); ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); @@ -617,7 +619,9 @@ class Repairer { ReadOptions ropts; std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( - ropts, cfd->internal_comparator(), t->meta, &r_iter); + ropts, cfd->internal_comparator(), t->meta, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key, + &r_iter); if (r_iter) { r_iter->SeekToFirst(); diff --git a/db/table_cache.cc b/db/table_cache.cc index 28206ed359e..c288ec8c7fd 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -91,7 +91,8 @@ Status TableCache::GetTableReader( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats, - HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, const std::shared_ptr& prefix_extractor, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { @@ -140,7 +141,8 @@ Status TableCache::GetTableReader( s = ioptions_.table_factory->NewTableReader( ro, TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, skip_filters, immortal_tables_, + internal_comparator, block_protection_bytes_per_key, + skip_filters, immortal_tables_, false /* force_direct_prefetch */, level, block_cache_tracer_, max_file_size_for_l0_meta_pin, db_session_id_, file_meta.fd.GetNumber(), @@ -156,6 +158,7 @@ Status TableCache::FindTable( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TypedHandle** handle, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, @@ -179,12 +182,12 @@ Status TableCache::FindTable( } std::unique_ptr table_reader; - Status s = - GetTableReader(ro, file_options, internal_comparator, file_meta, - false /* sequential mode */, 
record_read_stats, - file_read_hist, &table_reader, prefix_extractor, - skip_filters, level, prefetch_index_and_filter_in_cache, - max_file_size_for_l0_meta_pin, file_temperature); + Status s = GetTableReader(ro, file_options, internal_comparator, file_meta, + false /* sequential mode */, record_read_stats, + block_protection_bytes_per_key, file_read_hist, + &table_reader, prefix_extractor, skip_filters, + level, prefetch_index_and_filter_in_cache, + max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.stats, NO_FILE_ERRORS); @@ -212,6 +215,7 @@ InternalIterator* TableCache::NewIterator( size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key, bool allow_unprepared_value, + uint8_t block_protection_bytes_per_key, TruncatedRangeDelIterator** range_del_iter) { PERF_TIMER_GUARD(new_table_iterator_nanos); @@ -225,12 +229,13 @@ InternalIterator* TableCache::NewIterator( auto& fd = file_meta.fd; table_reader = fd.table_reader; if (table_reader == nullptr) { - s = FindTable( - options, file_options, icomparator, file_meta, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record_read_stats */, file_read_hist, skip_filters, - level, true /* prefetch_index_and_filter_in_cache */, - max_file_size_for_l0_meta_pin, file_meta.temperature); + s = FindTable(options, file_options, icomparator, file_meta, &handle, + block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record_read_stats */, file_read_hist, + skip_filters, level, + true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { table_reader = cache_.Value(handle); } @@ -308,7 +313,7 @@ InternalIterator* TableCache::NewIterator( Status TableCache::GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, std::unique_ptr* out_iter) { assert(out_iter); const FileDescriptor& fd = file_meta.fd; @@ -317,7 +322,7 @@ Status TableCache::GetRangeTombstoneIterator( TypedHandle* handle = nullptr; if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle); + &handle, block_protection_bytes_per_key); if (s.ok()) { t = cache_.Value(handle); } @@ -403,6 +408,7 @@ Status TableCache::Get( const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin) { @@ -430,7 +436,7 @@ Status TableCache::Get( assert(s.ok()); if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, prefix_extractor, + &handle, block_protection_bytes_per_key, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, true /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, @@ -513,7 +519,8 @@ Status TableCache::MultiGetFilter( const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle) { + 
MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key) { auto& fd = file_meta.fd; IterKey row_cache_key; std::string row_cache_entry_buffer; @@ -531,12 +538,13 @@ Status TableCache::MultiGetFilter( MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(), mget_range->end()); if (t == nullptr) { - s = FindTable( - options, file_options_, internal_comparator, file_meta, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, /*skip_filters=*/false, - level, true /* prefetch_index_and_filter_in_cache */, - /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); + s = FindTable(options, file_options_, internal_comparator, file_meta, + &handle, block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, + /*skip_filters=*/false, level, + true /* prefetch_index_and_filter_in_cache */, + /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); if (s.ok()) { t = cache_.Value(handle); } @@ -564,6 +572,7 @@ Status TableCache::GetTableProperties( const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, bool no_io) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? @@ -575,7 +584,8 @@ Status TableCache::GetTableProperties( TypedHandle* table_handle = nullptr; Status s = FindTable(read_options, file_options, internal_comparator, - file_meta, &table_handle, prefix_extractor, no_io); + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, no_io); if (!s.ok()) { return s; } @@ -588,12 +598,14 @@ Status TableCache::GetTableProperties( Status TableCache::ApproximateKeyAnchors( const ReadOptions& ro, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, std::vector& anchors) { + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + std::vector& anchors) { Status s; TableReader* t = file_meta.fd.table_reader; TypedHandle* handle = nullptr; if (t == nullptr) { - s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle); + s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle, + block_protection_bytes_per_key); if (s.ok()) { t = cache_.Value(handle); } @@ -610,7 +622,7 @@ Status TableCache::ApproximateKeyAnchors( size_t TableCache::GetMemoryUsageByTableReader( const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? 
@@ -620,7 +632,8 @@ size_t TableCache::GetMemoryUsageByTableReader( TypedHandle* table_handle = nullptr; Status s = FindTable(read_options, file_options, internal_comparator, - file_meta, &table_handle, prefix_extractor, true); + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, true /* no_io */); if (!s.ok()) { return 0; } @@ -639,16 +652,17 @@ uint64_t TableCache::ApproximateOffsetOf( const ReadOptions& read_options, const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = - FindTable(read_options, file_options_, internal_comparator, file_meta, - &table_handle, prefix_extractor, false /* no_io */, - !for_compaction /* record_read_stats */); + Status s = FindTable( + read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, prefix_extractor, + false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = cache_.Value(table_handle); } @@ -668,16 +682,17 @@ uint64_t TableCache::ApproximateSize( const ReadOptions& read_options, const Slice& start, const Slice& end, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = - FindTable(read_options, file_options_, internal_comparator, file_meta, - &table_handle, prefix_extractor, false /* no_io */, - !for_compaction /* record_read_stats */); + Status s = FindTable( + read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, prefix_extractor, + false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = cache_.Value(table_handle); } diff --git a/db/table_cache.h b/db/table_cache.h index 609e67498de..41201eea8a0 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -96,6 +96,7 @@ class TableCache { size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key, bool allow_unprepared_value, + uint8_t protection_bytes_per_key, TruncatedRangeDelIterator** range_del_iter = nullptr); // If a seek to internal key "k" in specified file finds an entry, @@ -112,6 +113,7 @@ class TableCache { const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1, size_t max_file_size_for_l0_meta_pin = 0); @@ -121,7 +123,7 @@ class TableCache { Status GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, std::unique_ptr* out_iter); // Call table reader's MultiGetFilter to 
use the bloom filter to filter out @@ -135,7 +137,8 @@ class TableCache { const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle); + MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -150,6 +153,7 @@ class TableCache { Status, MultiGet, const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, bool skip_range_deletions = false, int level = -1, @@ -165,6 +169,7 @@ class TableCache { const ReadOptions& ro, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TypedHandle**, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, const bool no_io = false, bool record_read_stats = true, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, @@ -183,12 +188,14 @@ class TableCache { const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, bool no_io = false); Status ApproximateKeyAnchors(const ReadOptions& ro, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, + uint8_t block_protection_bytes_per_key, std::vector& anchors); // Return total memory usage of the table reader of the file. @@ -196,7 +203,7 @@ class TableCache { size_t GetMemoryUsageByTableReader( const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated offset of a key in a file represented by fd. 
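(Editor's note, not part of the patch.) The TableCache declarations above show the pattern repeated throughout this change: each read-side entry point (NewIterator, Get, MultiGet/MultiGetFilter, GetRangeTombstoneIterator, GetTableProperties, ApproximateKeyAnchors, GetMemoryUsageByTableReader, ApproximateOffsetOf/ApproximateSize, FindTable) gains a `block_protection_bytes_per_key` argument that callers take from the column family's current MutableCFOptions. A minimal caller-side sketch; `table_cache`, `cfd`, `file_meta`, `read_options`, and `file_options` are placeholder names, not identifiers defined by this patch:

  const MutableCFOptions* moptions = cfd->GetLatestMutableCFOptions();
  TableCache::TypedHandle* handle = nullptr;
  // The new uint8_t argument follows the TypedHandle** out-parameter and
  // precedes the optional prefix_extractor, matching the updated FindTable()
  // declaration above.
  Status s = table_cache->FindTable(
      read_options, file_options, cfd->internal_comparator(), file_meta,
      &handle, moptions->block_protection_bytes_per_key,
      moptions->prefix_extractor);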
@@ -204,6 +211,7 @@ class TableCache { const ReadOptions& read_options, const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated data size between start and end keys in a file @@ -212,6 +220,7 @@ class TableCache { const ReadOptions& read_options, const Slice& start, const Slice& end, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); CacheInterface& get_cache() { return cache_; } @@ -234,8 +243,8 @@ class TableCache { const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, bool sequential_mode, - bool record_read_stats, HistogramImpl* file_read_hist, - std::unique_ptr* table_reader, + bool record_read_stats, uint8_t block_protection_bytes_per_key, + HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const std::shared_ptr& prefix_extractor = nullptr, bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h index b1ab73247ce..df8e9337f6b 100644 --- a/db/table_cache_sync_and_async.h +++ b/db/table_cache_sync_and_async.h @@ -17,6 +17,7 @@ namespace ROCKSDB_NAMESPACE { DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) (const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions, int level, TypedHandle* handle) { @@ -65,7 +66,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) if (t == nullptr) { assert(handle == nullptr); s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, prefix_extractor, + &handle, block_protection_bytes_per_key, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, true /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, diff --git a/db/version_builder.cc b/db/version_builder.cc index 64590db5cef..d87ef94494b 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1257,7 +1257,8 @@ class VersionBuilder::Rep { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key) { assert(table_cache_ != nullptr); size_t table_cache_capacity = @@ -1326,7 +1327,8 @@ class VersionBuilder::Rep { statuses[file_idx] = table_cache_->FindTable( read_options, file_options_, *(base_vstorage_->InternalComparator()), *file_meta, &handle, - prefix_extractor, false /*no_io */, true /* record_read_stats */, + block_protection_bytes_per_key, prefix_extractor, false /*no_io */, + true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, file_meta->temperature); @@ -1384,11 +1386,12 @@ Status VersionBuilder::LoadTableHandlers( InternalStats* 
internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { - return rep_->LoadTableHandlers(internal_stats, max_threads, - prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor, - max_file_size_for_l0_meta_pin, read_options); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key) { + return rep_->LoadTableHandlers( + internal_stats, max_threads, prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin, + read_options, block_protection_bytes_per_key); } uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { diff --git a/db/version_builder.h b/db/version_builder.h index 8e7dd9e6613..fb2a304a843 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -48,7 +48,8 @@ class VersionBuilder { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key); uint64_t GetMinOldestBlobFileNumber() const; private: diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 7ea176e0150..d507c4b0c86 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -566,13 +566,13 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, assert(builder_iter->second != nullptr); VersionBuilder* builder = builder_iter->second->version_builder(); assert(builder); + const MutableCFOptions* moptions = cfd->GetLatestMutableCFOptions(); Status s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, prefetch_index_and_filter_in_cache, is_initial_load, - cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), - read_options_); + moptions->prefix_extractor, MaxFileSizeForL0MetaPin(*moptions), + read_options_, moptions->block_protection_bytes_per_key); if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { s = Status::OK(); } @@ -812,16 +812,16 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( assert(builder); } + const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions(); auto* version = new Version(cfd, version_set_, version_set_->file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, + *cf_opts_ptr, io_tracer_, version_set_->current_version_number_++, epoch_number_requirement_); s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, - cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), - read_options_); + cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr), + read_options_, cf_opts_ptr->block_protection_bytes_per_key); if (!s.ok()) { delete version; if (s.IsCorruption()) { diff --git a/db/version_set.cc b/db/version_set.cc index 9f1888c78b9..cfe8d617366 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -941,7 +941,7 @@ class LevelIterator final : public InternalIterator { const std::shared_ptr& prefix_extractor, bool should_sample, HistogramImpl* file_read_hist, TableReaderCaller caller, bool skip_filters, int 
level, - RangeDelAggregator* range_del_agg, + uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg, const std::vector* compaction_boundaries = nullptr, bool allow_unprepared_value = false, @@ -964,6 +964,7 @@ class LevelIterator final : public InternalIterator { pinned_iters_mgr_(nullptr), compaction_boundaries_(compaction_boundaries), is_next_read_sequential_(false), + block_protection_bytes_per_key_(block_protection_bytes_per_key), range_tombstone_iter_(nullptr), to_return_sentinel_(false) { // Empty level is not supported. @@ -1107,7 +1108,8 @@ class LevelIterator final : public InternalIterator { nullptr /* don't need reference to table */, file_read_hist_, caller_, /*arena=*/nullptr, skip_filters_, level_, /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, - largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_); + largest_compaction_key, allow_unprepared_value_, + block_protection_bytes_per_key_, range_tombstone_iter_); } // Check if current file being fully within iterate_lower_bound. @@ -1154,6 +1156,8 @@ class LevelIterator final : public InternalIterator { bool is_next_read_sequential_; + uint8_t block_protection_bytes_per_key_; + // This is set when this level iterator is used under a merging iterator // that processes range tombstones. range_tombstone_iter_ points to where the // merging iterator stores the range tombstones iterator for this level. When @@ -1535,6 +1539,7 @@ Status Version::GetTableProperties(const ReadOptions& read_options, auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor, true /* no io */); if (s.ok()) { return s; @@ -1621,6 +1626,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, Status s = table_cache->GetRangeTombstoneIterator( read_options, cfd_->internal_comparator(), *file_meta, + cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key, &tombstone_iter); if (!s.ok()) { return s; @@ -1739,6 +1745,7 @@ size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( file_options_, read_options, cfd_->internal_comparator(), *file_level.files[i].file_metadata, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor); } } @@ -1848,6 +1855,7 @@ InternalIterator* Version::TEST_GetLevelIterator( mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, + mutable_cf_options_.block_protection_bytes_per_key, nullptr /* range_del_agg */, nullptr /* compaction_boundaries */, allow_unprepared_value, &tombstone_iter_ptr); if (read_options.ignore_range_deletions) { @@ -1946,7 +1954,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, allow_unprepared_value, - &tombstone_iter); + mutable_cf_options_.block_protection_bytes_per_key, &tombstone_iter); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(table_iter); } else { @@ -1975,8 +1983,10 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, mutable_cf_options_.prefix_extractor, should_sample_file_read(), 
cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr, - allow_unprepared_value, &tombstone_iter_ptr); + mutable_cf_options_.block_protection_bytes_per_key, + /*range_del_agg=*/nullptr, + /*compaction_boundaries=*/nullptr, allow_unprepared_value, + &tombstone_iter_ptr); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(level_iter); } else { @@ -2019,7 +2029,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false)); + /*allow_unprepared_value=*/false, + mutable_cf_options_.block_protection_bytes_per_key)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -2034,7 +2045,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - &range_del_agg)); + mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg, + nullptr, false)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); } @@ -2333,7 +2345,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, - &get_context, mutable_cf_options_.prefix_extractor, + &get_context, mutable_cf_options_.block_protection_bytes_per_key, + mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), @@ -2578,7 +2591,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, read_options, *internal_comparator(), *f->file_metadata, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle); + fp.GetHitFileLevel(), &file_range, &table_handle, + mutable_cf_options_.block_protection_bytes_per_key); skip_range_deletions = true; if (status.ok()) { skip_filters = true; @@ -2768,7 +2782,8 @@ Status Version::ProcessBatch( read_options, *internal_comparator(), *f->file_metadata, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle); + fp.GetHitFileLevel(), &file_range, &table_handle, + mutable_cf_options_.block_protection_bytes_per_key); if (status.ok()) { skip_filters = true; skip_range_deletions = true; @@ -5217,7 +5232,8 @@ Status VersionSet::ProcessManifestWrites( true /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, mutable_cf_options_ptrs[i]->prefix_extractor, - MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options); + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options, + mutable_cf_options_ptrs[i]->block_protection_bytes_per_key); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -6553,10 +6569,11 @@ uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, // "key" falls in the range for this table. 
Add the // approximate offset of "key" within the table. TableCache* table_cache = v->cfd_->table_cache(); + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( read_options, key, *f.file_metadata, caller, icmp, - v->GetMutableCFOptions().prefix_extractor); + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); } } return result; @@ -6596,9 +6613,10 @@ uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, if (table_cache == nullptr) { return 0; } + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); return table_cache->ApproximateSize( read_options, start, end, *f.file_metadata, caller, icmp, - v->GetMutableCFOptions().prefix_extractor); + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); } void VersionSet::RemoveLiveFiles( @@ -6757,6 +6775,7 @@ InternalIterator* VersionSet::MakeInputIterator( /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, /*allow_unprepared_value=*/false, + c->mutable_cf_options()->block_protection_bytes_per_key, /*range_del_iter=*/&range_tombstone_iter); range_tombstones.emplace_back(range_tombstone_iter, nullptr); } @@ -6770,8 +6789,9 @@ InternalIterator* VersionSet::MakeInputIterator( /*should_sample=*/false, /*no per level latency histogram=*/nullptr, TableReaderCaller::kCompaction, /*skip_filters=*/false, - /*level=*/static_cast(c->level(which)), range_del_agg, - c->boundaries(which), false, &tombstone_iter_ptr); + /*level=*/static_cast(c->level(which)), + c->mutable_cf_options()->block_protection_bytes_per_key, + range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr); range_tombstones.emplace_back(nullptr, tombstone_iter_ptr); } } @@ -7008,7 +7028,8 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, TableCache::TypedHandle* handle = nullptr; FileMetaData meta_copy = meta; status = table_cache->FindTable( - read_options, file_opts, *icmp, meta_copy, &handle, pe, + read_options, file_opts, *icmp, meta_copy, &handle, + cf_opts->block_protection_bytes_per_key, pe, /*no_io=*/false, /*record_read_stats=*/true, internal_stats->GetFileReadHist(level), false, level, /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, diff --git a/db/version_set_sync_and_async.h b/db/version_set_sync_and_async.h index 188c2e2f950..2507762e8c8 100644 --- a/db/version_set_sync_and_async.h +++ b/db/version_set_sync_and_async.h @@ -25,6 +25,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) StopWatchNano timer(clock_, timer_enabled /* auto_start */); s = CO_AWAIT(table_cache_->MultiGet)( read_options, *internal_comparator(), *f->file_metadata, &file_range, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters, skip_range_deletions, hit_file_level, table_handle); diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index d16fefe4cd6..8756932a7ef 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -290,6 +290,7 @@ DECLARE_bool(paranoid_file_checks); DECLARE_bool(fail_if_options_file_error); DECLARE_uint64(batch_protection_bytes_per_key); DECLARE_uint32(memtable_protection_bytes_per_key); +DECLARE_uint32(block_protection_bytes_per_key); DECLARE_uint64(user_timestamp_size); DECLARE_string(secondary_cache_uri); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 
b6ee6726901..9ce10f06cc9 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -975,6 +975,11 @@ DEFINE_uint32( "specified number of bytes per key. Currently the supported " "nonzero values are 1, 2, 4 and 8."); +DEFINE_uint32(block_protection_bytes_per_key, 0, + "If nonzero, enables integrity protection in blocks at the " + "specified number of bytes per key. Currently the supported " + "nonzero values are 1, 2, 4 and 8."); + DEFINE_string(file_checksum_impl, "none", "Name of an implementation for file_checksum_gen_factory, or " "\"none\" for null."); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 60a12b33149..710c7687b9a 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -3122,6 +3122,7 @@ void InitializeOptionsFromFlags( FLAGS_verify_sst_unique_id_in_manifest; options.memtable_protection_bytes_per_key = FLAGS_memtable_protection_bytes_per_key; + options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; // Integrated BlobDB options.enable_blob_files = FLAGS_enable_blob_files; diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index c41c5051f39..62e96af23ba 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -31,6 +31,7 @@ namespace ROCKSDB_NAMESPACE { namespace { static std::shared_ptr env_guard; static std::shared_ptr env_wrapper_guard; +static std::shared_ptr legacy_env_wrapper_guard; static std::shared_ptr dbsl_env_wrapper_guard; static std::shared_ptr fault_env_guard; @@ -99,6 +100,17 @@ int db_stress_tool(int argc, char** argv) { env_wrapper_guard = std::make_shared( raw_env, std::make_shared(raw_env->GetFileSystem())); + if (!env_opts && !FLAGS_use_io_uring) { + // If using the default Env (Posix), wrap DbStressEnvWrapper with the + // legacy EnvWrapper. This is a workaround to prevent MultiGet and scans + // from failing when IO uring is disabled. The EnvWrapper + // has a default implementation of ReadAsync that redirects to Read. + legacy_env_wrapper_guard = std::make_shared(raw_env); + env_wrapper_guard = std::make_shared( + legacy_env_wrapper_guard, + std::make_shared( + legacy_env_wrapper_guard->GetFileSystem())); + } db_stress_env = env_wrapper_guard.get(); FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc index e93b9a3f5f8..676daf574fa 100644 --- a/fuzz/sst_file_writer_fuzzer.cc +++ b/fuzz/sst_file_writer_fuzzer.cc @@ -92,7 +92,8 @@ TableReader* NewTableReader(const std::string& sst_file_path, if (s.ok()) { ImmutableOptions iopts(options, cf_ioptions); TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options, - cf_ioptions.internal_comparator); + cf_ioptions.internal_comparator, + 0 /* block_protection_bytes_per_key */); t_opt.largest_seqno = kMaxSequenceNumber; s = options.table_factory->NewTableReader(t_opt, std::move(file_reader), file_size, &table_reader, diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 5862126d0af..ff0a408958e 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -1122,6 +1122,20 @@ struct AdvancedColumnFamilyOptions { // only compatible changes are allowed. bool persist_user_defined_timestamps = true; + // Enable/disable per key-value checksum protection for in memory blocks. 
+ // + // Checksum is constructed when a block is loaded into memory and verification + // is done for each key read from the block. This is useful for detecting + // in-memory data corruption. Note that this feature has a non-trivial + // negative impact on read performance. Different values of the + // option have similar performance impact, but different memory cost and + // corruption detection probability (e.g. 1 byte gives 255/256 chance for + // detecting a corruption). + // + // Default: 0 (no protection) + // Supported values: 0, 1, 2, 4, 8. + uint8_t block_protection_bytes_per_key = 0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 387da17539b..9aadca94742 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -151,6 +151,13 @@ struct ShardedCacheOptions { metadata_charge_policy(_metadata_charge_policy) {} }; +// LRUCache - A cache using LRU eviction to stay at or below a set capacity. +// The cache is sharded to 2^num_shard_bits shards, by hash of the key. +// The total capacity is divided and evenly assigned to each shard, and each +// shard has its own LRU list for evictions. Each shard also has a mutex for +// exclusive access during operations; even read operations need exclusive +// access in order to update the LRU list. Mutex contention is usually low +// with enough shards. struct LRUCacheOptions : public ShardedCacheOptions { // Ratio of cache reserved for high-priority and low-priority entries, // respectively. (See Cache::Priority below more information on the levels.) @@ -158,7 +165,8 @@ struct LRUCacheOptions : public ShardedCacheOptions { // values cannot exceed 1. // // If high_pri_pool_ratio is greater than zero, a dedicated high-priority LRU - // list is maintained by the cache. Similarly, if low_pri_pool_ratio is + // list is maintained by the cache. A ratio of 0.5 means non-high-priority + // entries will use midpoint insertion. Similarly, if low_pri_pool_ratio is // greater than zero, a dedicated low-priority LRU list is maintained. // There is also a bottom-priority LRU list, which is always enabled and not // explicitly configurable. Entries are spilled over to the next available @@ -173,9 +181,6 @@ struct LRUCacheOptions : public ShardedCacheOptions { // otherwise, they are placed in the bottom-priority pool.) This results // in lower-priority entries without hits getting evicted from the cache // sooner. - // - // Default values: high_pri_pool_ratio = 0.5 (which is referred to as - // "midpoint insertion"), low_pri_pool_ratio = 0 double high_pri_pool_ratio = 0.5; double low_pri_pool_ratio = 0.0; @@ -199,31 +204,36 @@ struct LRUCacheOptions : public ShardedCacheOptions { high_pri_pool_ratio(_high_pri_pool_ratio), low_pri_pool_ratio(_low_pri_pool_ratio), use_adaptive_mutex(_use_adaptive_mutex) {} + + // Construct an instance of LRUCache using these options + std::shared_ptr MakeSharedCache() const; }; -// Create a new cache with a fixed size capacity. The cache is sharded -// to 2^num_shard_bits shards, by hash of the key. The total capacity -// is divided and evenly assigned to each shard. If strict_capacity_limit -// is set, insert to the cache will fail when cache is full. User can also -// set percentage of the cache reserves for high priority entries via -// high_pri_pool_pct. 
-// num_shard_bits = -1 means it is automatically determined: every shard -// will be at least 512KB and number of shard bits will not exceed 6. -extern std::shared_ptr NewLRUCache( +// DEPRECATED wrapper function +inline std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex, CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy, - double low_pri_pool_ratio = 0.0); - -extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); + double low_pri_pool_ratio = 0.0) { + return LRUCacheOptions(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, + use_adaptive_mutex, metadata_charge_policy, + low_pri_pool_ratio) + .MakeSharedCache(); +} + +// DEPRECATED wrapper function +inline std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { + return cache_opts.MakeSharedCache(); +} // EXPERIMENTAL -// Options structure for configuring a SecondaryCache instance based on -// LRUCache. The LRUCacheOptions.secondary_cache is not used and -// should not be set. +// Options structure for configuring a SecondaryCache instance with in-memory +// compression. The implementation uses LRUCache so inherits its options, +// except LRUCacheOptions.secondary_cache is not used and should not be set. struct CompressedSecondaryCacheOptions : LRUCacheOptions { // The compression method (if any) that is used to compress data. CompressionType compression_type = CompressionType::kLZ4Compression; @@ -264,11 +274,16 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions { compress_format_version(_compress_format_version), enable_custom_split_merge(_enable_custom_split_merge), do_not_compress_roles(_do_not_compress_roles) {} + + // Construct an instance of CompressedSecondaryCache using these options + std::shared_ptr MakeSharedSecondaryCache() const; + + // Avoid confusion with LRUCache + std::shared_ptr MakeSharedCache() const = delete; }; -// EXPERIMENTAL -// Create a new Secondary Cache that is implemented on top of LRUCache. -extern std::shared_ptr NewCompressedSecondaryCache( +// DEPRECATED wrapper function +inline std::shared_ptr NewCompressedSecondaryCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, double low_pri_pool_ratio = 0.0, @@ -280,10 +295,21 @@ extern std::shared_ptr NewCompressedSecondaryCache( uint32_t compress_format_version = 2, bool enable_custom_split_merge = false, const CacheEntryRoleSet& _do_not_compress_roles = { - CacheEntryRole::kFilterBlock}); - -extern std::shared_ptr NewCompressedSecondaryCache( - const CompressedSecondaryCacheOptions& opts); + CacheEntryRole::kFilterBlock}) { + return CompressedSecondaryCacheOptions( + capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator, + use_adaptive_mutex, metadata_charge_policy, compression_type, + compress_format_version, enable_custom_split_merge, + _do_not_compress_roles) + .MakeSharedSecondaryCache(); +} + +// DEPRECATED wrapper function +inline std::shared_ptr NewCompressedSecondaryCache( + const CompressedSecondaryCacheOptions& opts) { + return opts.MakeSharedSecondaryCache(); +} // HyperClockCache - A lock-free Cache alternative for RocksDB block cache // that offers much improved CPU efficiency vs. 
LRUCache under high parallel diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index 5cb799e4273..d126abfe6d6 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -55,6 +55,11 @@ struct JemallocAllocatorOptions { // Upper bound of allocation size to use tcache, if limit_tcache_size=true. // When used with block cache, it is recommended to set it to block_size. size_t tcache_size_upper_bound = 16 * 1024; + + // Number of arenas across which we spread allocation requests. Increasing + // this setting can mitigate arena mutex contention. The value must be + // positive. + size_t num_arenas = 1; }; // Generate memory allocator which allocates through Jemalloc and utilize @@ -70,7 +75,8 @@ struct JemallocAllocatorOptions { // core dump. Side benefit of using single arena would be reduction of jemalloc // metadata for some workloads. // -// To mitigate mutex contention for using one single arena, jemalloc tcache +// To mitigate mutex contention for using one single arena (see also +// `JemallocAllocatorOptions::num_arenas` above), jemalloc tcache // (thread-local cache) is enabled to cache unused allocations for future use. // The tcache normally incurs 0.5M extra memory usage per-thread. The usage // can be reduced by limiting allocation sizes to cache. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index f5ac3057c47..cfa4021035f 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -12,7 +12,7 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. #define ROCKSDB_MAJOR 8 -#define ROCKSDB_MINOR 2 +#define ROCKSDB_MINOR 3 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with diff --git a/memory/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc index cdad14576d2..d05248224d7 100644 --- a/memory/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -14,6 +14,8 @@ #include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_type.h" +#include "util/fastrange.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -35,6 +37,9 @@ static std::unordered_map jemalloc_type_info = { {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound), OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_arenas", + {offsetof(struct JemallocAllocatorOptions, num_arenas), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, }; bool JemallocNodumpAllocator::IsSupported(std::string* why) { #ifndef ROCKSDB_JEMALLOC @@ -59,11 +64,13 @@ bool JemallocNodumpAllocator::IsSupported(std::string* why) { JemallocNodumpAllocator::JemallocNodumpAllocator( JemallocAllocatorOptions& options) - : options_(options), + : options_(options) #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache), + , + tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) { +#else // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +{ #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - arena_index_(0) { RegisterOptions(&options_, &jemalloc_type_info); } @@ -75,9 +82,9 @@ JemallocNodumpAllocator::~JemallocNodumpAllocator() { for (void* tcache_index : tcache_list) { DestroyThreadSpecificCache(tcache_index); } - if (arena_index_ > 0) { + for (auto arena_index : arena_indexes_) { // Destroy 
arena. Silently ignore error. - Status s = DestroyArena(arena_index_); + Status s = DestroyArena(arena_index); assert(s.ok()); s.PermitUncheckedError(); } @@ -90,7 +97,8 @@ size_t JemallocNodumpAllocator::UsableSize(void* p, void* JemallocNodumpAllocator::Allocate(size_t size) { int tcache_flag = GetThreadSpecificCache(size); - return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); + uint32_t arena_index = GetArenaIndex(); + return mallocx(size, MALLOCX_ARENA(arena_index) | tcache_flag); } void JemallocNodumpAllocator::Deallocate(void* p) { @@ -105,45 +113,71 @@ void JemallocNodumpAllocator::Deallocate(void* p) { dallocx(p, tcache_flag); } -Status JemallocNodumpAllocator::InitializeArenas() { - // Create arena. - size_t arena_index_size = sizeof(arena_index_); - int ret = - mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0); - if (ret != 0) { - return Status::Incomplete("Failed to create jemalloc arena, error code: " + - std::to_string(ret)); +uint32_t JemallocNodumpAllocator::GetArenaIndex() const { + if (arena_indexes_.size() == 1) { + return arena_indexes_[0]; } - assert(arena_index_ != 0); - // Read existing hooks. - std::string key = "arena." + std::to_string(arena_index_) + ".extent_hooks"; - extent_hooks_t* hooks; - size_t hooks_size = sizeof(hooks); - ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); - if (ret != 0) { - return Status::Incomplete("Failed to read existing hooks, error code: " + - std::to_string(ret)); - } + static std::atomic next_seed = 0; + // Core-local may work in place of `thread_local` as we should be able to + // tolerate occasional stale reads in thread migration cases. However we need + // to make Random thread-safe and prevent cacheline bouncing. Whether this is + // worthwhile is still an open question. + thread_local Random tl_random(next_seed.fetch_add(1)); + return arena_indexes_[FastRange32(tl_random.Next(), arena_indexes_.size())]; +} - // Store existing alloc. - extent_alloc_t* original_alloc = hooks->alloc; - extent_alloc_t* expected = nullptr; - bool success = - JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( - expected, original_alloc); - if (!success && original_alloc != expected) { - return Status::Incomplete("Original alloc conflict."); - } +Status JemallocNodumpAllocator::InitializeArenas() { + assert(!init_); + init_ = true; - // Set the custom hook. - arena_hooks_.reset(new extent_hooks_t(*hooks)); - arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc; - extent_hooks_t* hooks_ptr = arena_hooks_.get(); - ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); - if (ret != 0) { - return Status::Incomplete("Failed to set custom hook, error code: " + - std::to_string(ret)); + for (size_t i = 0; i < options_.num_arenas; i++) { + // Create arena. + unsigned arena_index; + size_t arena_index_size = sizeof(arena_index); + int ret = + mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete( + "Failed to create jemalloc arena, error code: " + + std::to_string(ret)); + } + arena_indexes_.push_back(arena_index); + + // Read existing hooks. + std::string key = + "arena." + std::to_string(arena_indexes_[i]) + ".extent_hooks"; + extent_hooks_t* hooks; + size_t hooks_size = sizeof(hooks); + ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to read existing hooks, error code: " + + std::to_string(ret)); + } + + // Store existing alloc. 
+ extent_alloc_t* original_alloc = hooks->alloc; + extent_alloc_t* expected = nullptr; + bool success = + JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( + expected, original_alloc); + if (!success && original_alloc != expected) { + // This could happen if jemalloc creates new arenas with different initial + // values in their `alloc` function pointers. See `original_alloc_` API + // doc for more details. + return Status::Incomplete("Original alloc conflict."); + } + + // Set the custom hook. + per_arena_hooks_.emplace_back(); + per_arena_hooks_.back().reset(new extent_hooks_t(*hooks)); + per_arena_hooks_.back()->alloc = &JemallocNodumpAllocator::Alloc; + extent_hooks_t* hooks_ptr = per_arena_hooks_.back().get(); + ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); + if (ret != 0) { + return Status::Incomplete("Failed to set custom hook, error code: " + + std::to_string(ret)); + } } return Status::OK(); } @@ -161,6 +195,8 @@ Status JemallocNodumpAllocator::PrepareOptions( options_.tcache_size_upper_bound) { return Status::InvalidArgument( "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); + } else if (options_.num_arenas < 1) { + return Status::InvalidArgument("num_arenas must be a positive integer"); } else if (IsMutable()) { Status s = MemoryAllocator::PrepareOptions(config_options); #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR @@ -221,7 +257,7 @@ void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr, return result; } -Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) { +Status JemallocNodumpAllocator::DestroyArena(uint32_t arena_index) { assert(arena_index != 0); std::string key = "arena." + std::to_string(arena_index) + ".destroy"; int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0); diff --git a/memory/jemalloc_nodump_allocator.h b/memory/jemalloc_nodump_allocator.h index a1e1547d7b3..2bdbaeb3286 100644 --- a/memory/jemalloc_nodump_allocator.h +++ b/memory/jemalloc_nodump_allocator.h @@ -24,6 +24,10 @@ #endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX namespace ROCKSDB_NAMESPACE { + +// Allocation requests are randomly sharded across +// `JemallocAllocatorOptions::num_arenas` arenas to reduce contention on per- +// arena mutexes. class JemallocNodumpAllocator : public BaseMemoryAllocator { public: explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options); @@ -38,7 +42,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator { return IsSupported(&unused); } static bool IsSupported(std::string* why); - bool IsMutable() const { return arena_index_ == 0; } + bool IsMutable() const { return !init_; } Status PrepareOptions(const ConfigOptions& config_options) override; @@ -52,9 +56,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator { #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR Status InitializeArenas(); - friend Status NewJemallocNodumpAllocator( - JemallocAllocatorOptions& options, - std::shared_ptr* memory_allocator); + uint32_t GetArenaIndex() const; // Custom alloc hook to replace jemalloc default alloc. static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size, @@ -62,7 +64,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator { unsigned arena_ind); // Destroy arena on destruction of the allocator, or on failure. - static Status DestroyArena(unsigned arena_index); + static Status DestroyArena(uint32_t arena_index); // Destroy tcache on destruction of the allocator, or thread exit. 
static void DestroyThreadSpecificCache(void* ptr); @@ -78,17 +80,20 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator { // NewJemallocNodumpAllocator is thread-safe. // // Hack: original_alloc_ needs to be static for Alloc() to access it. - // alloc needs to be static to pass to jemalloc as function pointer. + // alloc needs to be static to pass to jemalloc as function pointer. We can + // use a single process-wide value as long as we assume that any newly created + // arena has the same original value in its `alloc` function pointer. static std::atomic original_alloc_; // Custom hooks has to outlive corresponding arena. - std::unique_ptr arena_hooks_; + std::vector> per_arena_hooks_; // Hold thread-local tcache index. ThreadLocalPtr tcache_; + + std::vector arena_indexes_; #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - // Arena index. - unsigned arena_index_; + bool init_ = false; }; } // namespace ROCKSDB_NAMESPACE diff --git a/options/cf_options.cc b/options/cf_options.cc index 2057e300a20..0fccd501434 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -488,6 +488,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, memtable_protection_bytes_per_key), OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"block_protection_bytes_per_key", + {offsetof(struct MutableCFOptions, block_protection_bytes_per_key), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {kOptNameCompOpts, OptionTypeInfo::Struct( kOptNameCompOpts, &compression_options_type_info, diff --git a/options/cf_options.h b/options/cf_options.h index d5e8da73481..37ef54c0cba 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -172,6 +172,7 @@ struct MutableCFOptions { : options.last_level_temperature), memtable_protection_bytes_per_key( options.memtable_protection_bytes_per_key), + block_protection_bytes_per_key(options.block_protection_bytes_per_key), sample_for_compression( options.sample_for_compression), // TODO: is 0 fine here? 
compression_per_level(options.compression_per_level) { @@ -222,6 +223,7 @@ struct MutableCFOptions { bottommost_compression(kDisableCompressionOption), last_level_temperature(Temperature::kUnknown), memtable_protection_bytes_per_key(0), + block_protection_bytes_per_key(0), sample_for_compression(0) {} explicit MutableCFOptions(const Options& options); @@ -312,6 +314,7 @@ struct MutableCFOptions { CompressionOptions bottommost_compression_opts; Temperature last_level_temperature; uint32_t memtable_protection_bytes_per_key; + uint8_t block_protection_bytes_per_key; uint64_t sample_for_compression; std::vector compression_per_level; diff --git a/options/options_helper.cc b/options/options_helper.cc index fc651ffdba7..abe5053d229 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -206,6 +206,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.experimental_mempurge_threshold; cf_opts->memtable_protection_bytes_per_key = moptions.memtable_protection_bytes_per_key; + cf_opts->block_protection_bytes_per_key = + moptions.block_protection_bytes_per_key; // Compaction related options cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index c772c786c9c..6357b5e9eea 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -552,7 +552,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "compaction=false;age_for_warm=1;};" "blob_cache=1M;" "memtable_protection_bytes_per_key=2;" - "persist_user_defined_timestamps=true;", + "persist_user_defined_timestamps=true;" + "block_protection_bytes_per_key=1;", new_options)); ASSERT_NE(new_options->blob_cache.get(), nullptr); diff --git a/port/stack_trace.cc b/port/stack_trace.cc index ad648d3bc1d..9e9d8b8b548 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -128,6 +128,14 @@ void PrintStackTraceLine(const char* symbol, void* frame) { #endif +const char* GetLldbScriptSelectThread(long long tid) { + // NOTE: called from a signal handler, so no heap allocation + static char script[80]; + snprintf(script, sizeof(script), + "script -l python -- lldb.process.SetSelectedThreadByID(%lld)", tid); + return script; +} + } // namespace void PrintStack(void* frames[], int num_frames) { @@ -152,9 +160,13 @@ void PrintStack(int first_frames_to_skip) { // * It doesn't appear easy to detect when ASLR is in use. // * With DEBUG_LEVEL < 2, backtrace() can skip frames that are not skipped // in GDB. + // + // LLDB also available as an option + bool lldb_stack_trace = getenv("ROCKSDB_LLDB_STACK") != nullptr; #if defined(OS_LINUX) // Default true, override with ROCKSDB_BACKTRACE_STACK=1 - bool gdb_stack_trace = getenv("ROCKSDB_BACKTRACE_STACK") == nullptr; + bool gdb_stack_trace = + !lldb_stack_trace && getenv("ROCKSDB_BACKTRACE_STACK") == nullptr; #else // Default false, override with ROCKSDB_GDB_STACK=1 bool gdb_stack_trace = getenv("ROCKSDB_GDB_STACK") != nullptr; @@ -164,53 +176,84 @@ void PrintStack(int first_frames_to_skip) { char* debug_env = getenv("ROCKSDB_DEBUG"); bool debug = debug_env != nullptr && strlen(debug_env) > 0; - if (gdb_stack_trace || debug) { + if (lldb_stack_trace || gdb_stack_trace || debug) { // Allow ouside debugger to attach, even with Yama security restrictions #ifdef PR_SET_PTRACER_ANY (void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); #endif // Try to invoke GDB, either for stack trace or debugging. 
- long long attach_id = getpid(); + long long attach_pid = getpid(); + // NOTE: we're in a signal handler, so no heap allocation + char attach_pid_str[20]; + snprintf(attach_pid_str, sizeof(attach_pid_str), "%lld", attach_pid); // `gdb -p PID` seems to always attach to main thread, but `gdb -p TID` // seems to be able to attach to a particular thread in a process, which // makes sense as the main thread TID == PID of the process. // But I haven't found that gdb capability documented anywhere, so leave // a back door to attach to main thread. + long long gdb_attach_id = attach_pid; + // Save current thread id before fork + long long attach_tid = 0; #ifdef OS_LINUX + attach_tid = gettid(); if (getenv("ROCKSDB_DEBUG_USE_PID") == nullptr) { - attach_id = gettid(); + gdb_attach_id = attach_tid; } #endif - char attach_id_str[20]; - snprintf(attach_id_str, sizeof(attach_id_str), "%lld", attach_id); + + char gdb_attach_id_str[20]; + snprintf(gdb_attach_id_str, sizeof(gdb_attach_id_str), "%lld", + gdb_attach_id); + pid_t child_pid = fork(); if (child_pid == 0) { // child process if (debug) { - fprintf(stderr, "Invoking GDB for debugging (ROCKSDB_DEBUG=%s)...\n", - debug_env); - execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-p", attach_id_str, - (char*)nullptr); - return; + if (strcmp(debug_env, "lldb") == 0) { + fprintf(stderr, "Invoking LLDB for debugging (ROCKSDB_DEBUG=%s)...\n", + debug_env); + execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str, + /*"-Q",*/ "-o", GetLldbScriptSelectThread(attach_tid), + (char*)nullptr); + return; + } else { + fprintf(stderr, "Invoking GDB for debugging (ROCKSDB_DEBUG=%s)...\n", + debug_env); + execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-p", gdb_attach_id_str, + (char*)nullptr); + return; + } } else { - fprintf(stderr, "Invoking GDB for stack trace...\n"); - - // Skip top ~4 frames here in PrintStack - // See https://stackoverflow.com/q/40991943/454544 - auto bt_in_gdb = - "frame apply level 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 " - "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 " - "42 43 44 -q frame"; // Redirect child stdout to original stderr dup2(2, 1); // No child stdin (don't use pager) close(0); - // -n : Loading config files can apparently cause failures with the - // other options here. - // -batch : non-interactive; suppress banners as much as possible - execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-n", "-batch", "-p", - attach_id_str, "-ex", bt_in_gdb, (char*)nullptr); + if (lldb_stack_trace) { + fprintf(stderr, "Invoking LLDB for stack trace...\n"); + + // Skip top ~8 frames here in PrintStack + auto bt_in_lldb = + "script -l python -- for f in lldb.thread.frames[8:]: print(f)"; + execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str, + "-b", "-Q", "-o", GetLldbScriptSelectThread(attach_tid), "-o", + bt_in_lldb, (char*)nullptr); + } else { + // gdb_stack_trace + fprintf(stderr, "Invoking GDB for stack trace...\n"); + + // Skip top ~4 frames here in PrintStack + // See https://stackoverflow.com/q/40991943/454544 + auto bt_in_gdb = + "frame apply level 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 " + "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 " + "42 43 44 -q frame"; + // -n : Loading config files can apparently cause failures with the + // other options here. 
+ // -batch : non-interactive; suppress banners as much as possible + execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-n", "-batch", "-p", + gdb_attach_id_str, "-ex", bt_in_gdb, (char*)nullptr); + } return; } } else { diff --git a/table/block_based/block.cc b/table/block_based/block.cc index b9b5d6e7e91..136275b6c89 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -30,7 +30,7 @@ namespace ROCKSDB_NAMESPACE { // Helper routine: decode the next block entry starting at "p", // storing the number of shared key bytes, non_shared key bytes, // and the length of the value in "*shared", "*non_shared", and -// "*value_length", respectively. Will not derefence past "limit". +// "*value_length", respectively. Will not dereference past "limit". // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). @@ -137,17 +137,26 @@ struct DecodeEntryV4 { return DecodeKeyV4()(p, limit, shared, non_shared); } }; + void DataBlockIter::NextImpl() { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return; +#endif bool is_shared = false; ParseNextDataKey(&is_shared); + ++cur_entry_idx_; } void MetaBlockIter::NextImpl() { bool is_shared = false; ParseNextKey(&is_shared); + ++cur_entry_idx_; } -void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } +void IndexBlockIter::NextImpl() { + ParseNextIndexKey(); + ++cur_entry_idx_; +} void IndexBlockIter::PrevImpl() { assert(Valid()); @@ -166,6 +175,7 @@ void IndexBlockIter::PrevImpl() { // Loop until end of current entry hits the start of original entry while (ParseNextIndexKey() && NextEntryOffset() < original) { } + --cur_entry_idx_; } void MetaBlockIter::PrevImpl() { @@ -187,6 +197,7 @@ void MetaBlockIter::PrevImpl() { while (ParseNextKey(&is_shared) && NextEntryOffset() < original) { } + --cur_entry_idx_; } // Similar to IndexBlockIter::PrevImpl but also caches the prev entries @@ -195,6 +206,7 @@ void DataBlockIter::PrevImpl() { assert(prev_entries_idx_ == -1 || static_cast(prev_entries_idx_) < prev_entries_.size()); + --cur_entry_idx_; // Check if we can use cached prev_entries_ if (prev_entries_idx_ > 0 && prev_entries_[prev_entries_idx_].offset == current_) { @@ -319,10 +331,10 @@ void MetaBlockIter::SeekImpl(const Slice& target) { // inclusive; AND // 2) the last key of this block has a greater user_key from seek_user_key // -// If the return value is TRUE, iter location has two possibilies: -// 1) If iter is valid, it is set to a location as if set by BinarySeek. In -// this case, it points to the first key with a larger user_key or a matching -// user_key with a seqno no greater than the seeking seqno. +// If the return value is TRUE, iter location has two possibilities: +// 1) If iter is valid, it is set to a location as if set by SeekImpl(target). +// In this case, it points to the first key with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. // 2) If the iter is invalid, it means that either all the user_key is less // than the seek_user_key, or the block ends with a matching user_key but // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno @@ -347,11 +359,11 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { // boundary key: axy@50 (we make minimal assumption about a boundary key) // Block N+1: [axy@10, ... ] // - // If seek_key = axy@60, the search will starts from Block N. + // If seek_key = axy@60, the search will start from Block N. 
// Even if the user_key is not found in the hash map, the caller still // have to continue searching the next block. // - // In this case, we pretend the key is the the last restart interval. + // In this case, we pretend the key is in the last restart interval. // The while-loop below will search the last restart interval for the // key. It will stop at the first key that is larger than the seek_key, // or to the end of the block if no one is larger. @@ -364,12 +376,15 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { assert(restart_index < num_restarts_); SeekToRestartPoint(restart_index); current_ = GetRestartPoint(restart_index); + cur_entry_idx_ = + static_cast(restart_index * block_restart_interval_) - 1; uint32_t limit = restarts_; if (restart_index + 1 < num_restarts_) { limit = GetRestartPoint(restart_index + 1); } while (current_ < limit) { + ++cur_entry_idx_; bool shared; // Here we only linear seek the target key inside the restart interval. // If a key does not exist inside a restart interval, we avoid @@ -381,14 +396,20 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { // we stop at the first potential matching user key. break; } + // If the loop exits due to CompareCurrentKey(target) >= 0, then current key + // exists, and its checksum verification will be done in UpdateKey() called + // in SeekForGet(). + // TODO(cbi): If this loop exits with current_ == restart_, per key-value + // checksum will not be verified in UpdateKey() since Valid() + // will return false. } if (current_ == restarts_) { - // Search reaches to the end of the block. There are three possibilites: - // 1) there is only one user_key match in the block (otherwise collsion). + // Search reaches to the end of the block. There are three possibilities: + // 1) there is only one user_key match in the block (otherwise collision). // the matching user_key resides in the last restart interval, and it // is the last key of the restart interval and of the block as well. - // ParseNextKey() skiped it as its [ type | seqno ] is smaller. + // ParseNextKey() skipped it as its [ type | seqno ] is smaller. // // 2) The seek_key is not found in the HashIndex Lookup(), i.e. 
kNoEntry, // AND all existing user_keys in the restart interval are smaller than @@ -424,6 +445,9 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { } void IndexBlockIter::SeekImpl(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return; +#endif TEST_SYNC_POINT("IndexBlockIter::Seek:0"); PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet @@ -478,7 +502,9 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) { FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { - SeekToLastImpl(); + if (status_.ok()) { + SeekToLastImpl(); + } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); @@ -502,7 +528,9 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) { FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { - SeekToLastImpl(); + if (status_.ok()) { + SeekToLastImpl(); + } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); @@ -517,6 +545,7 @@ void DataBlockIter::SeekToFirstImpl() { SeekToRestartPoint(0); bool is_shared = false; ParseNextDataKey(&is_shared); + cur_entry_idx_ = 0; } void MetaBlockIter::SeekToFirstImpl() { @@ -526,15 +555,20 @@ void MetaBlockIter::SeekToFirstImpl() { SeekToRestartPoint(0); bool is_shared = false; ParseNextKey(&is_shared); + cur_entry_idx_ = 0; } void IndexBlockIter::SeekToFirstImpl() { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return; +#endif if (data_ == nullptr) { // Not init yet return; } status_ = Status::OK(); SeekToRestartPoint(0); ParseNextIndexKey(); + cur_entry_idx_ = 0; } void DataBlockIter::SeekToLastImpl() { @@ -543,8 +577,10 @@ void DataBlockIter::SeekToLastImpl() { } SeekToRestartPoint(num_restarts_ - 1); bool is_shared = false; + cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) { // Keep skipping + ++cur_entry_idx_; } } @@ -554,9 +590,13 @@ void MetaBlockIter::SeekToLastImpl() { } SeekToRestartPoint(num_restarts_ - 1); bool is_shared = false; + assert(num_restarts_ >= 1); + cur_entry_idx_ = + static_cast((num_restarts_ - 1) * block_restart_interval_); while (ParseNextKey(&is_shared) && NextEntryOffset() < restarts_) { - // Keep skipping + // Will probably never reach here since restart_interval is always 1 + ++cur_entry_idx_; } } @@ -566,20 +606,12 @@ void IndexBlockIter::SeekToLastImpl() { } status_ = Status::OK(); SeekToRestartPoint(num_restarts_ - 1); + cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { - // Keep skipping + ++cur_entry_idx_; } } -template -void BlockIter::CorruptionError() { - current_ = restarts_; - restart_index_ = num_restarts_; - status_ = Status::Corruption("bad entry in block"); - raw_key_.Clear(); - value_.clear(); -} - template template bool BlockIter::ParseNextKey(bool* is_shared) { @@ -666,12 +698,12 @@ bool IndexBlockIter::ParseNextIndexKey() { // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // ... // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) -// where, k is key, v is value, and its encoding is in parenthesis. +// where, k is key, v is value, and its encoding is in parentheses. 
// The format of each key is (shared_size, non_shared_size, shared, non_shared) // The format of each value, i.e., block handle, is (offset, size) whenever the // is_shared is false, which included the first entry in each restart point. -// Otherwise the format is delta-size = block handle size - size of last block -// handle. +// Otherwise, the format is delta-size = the size of current block - the size o +// last block. void IndexBlockIter::DecodeCurrentValue(bool is_shared) { Slice v(value_.data(), data_ + restarts_ - value_.data()); // Delta encoding is used if `shared` != 0. @@ -710,6 +742,7 @@ void BlockIter::FindKeyAfterBinarySeek(const Slice& target, // to follow it up with NextImpl() to position the iterator at the restart // key. SeekToRestartPoint(index); + cur_entry_idx_ = static_cast(index * block_restart_interval_) - 1; NextImpl(); if (!skip_linear_scan) { @@ -728,6 +761,8 @@ void BlockIter::FindKeyAfterBinarySeek(const Slice& target, while (true) { NextImpl(); if (!Valid()) { + // TODO(cbi): per key-value checksum will not be verified in UpdateKey() + // since Valid() will returns false. break; } if (current_ == max_offset) { @@ -976,6 +1011,7 @@ Block::~Block() { // This sync point can be re-enabled if RocksDB can control the // initialization order of any/all static options created by the user. // TEST_SYNC_POINT("Block::~Block"); + delete[] kv_checksum_; } Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, @@ -1035,6 +1071,126 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, } } +void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp) { + protection_bytes_per_key_ = 0; + if (protection_bytes_per_key > 0 && num_restarts_ > 0) { + // NewDataIterator() is called with protection_bytes_per_key_ = 0. + // This is intended since checksum is not constructed yet. + // + // We do not know global_seqno yet, so checksum computation and + // verification all assume global_seqno = 0. + std::unique_ptr iter{NewDataIterator( + raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */, + nullptr /* stats */, true /* block_contents_pinned */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + size_t i = 0; + iter->SeekToFirst(); + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + +void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp, + bool value_is_full, + bool index_has_first_key) { + protection_bytes_per_key_ = 0; + if (num_restarts_ > 0 && protection_bytes_per_key > 0) { + // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They + // do not impact how the index block is parsed. 
During checksum + // construction/verification, we use the entire key buffer from + // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value + // checksum, and the content of this buffer do not change for different + // values of `global_seqno` or `key_includes_seq`. + std::unique_ptr iter{NewIndexIterator( + raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr, + nullptr /* Statistics */, true /* total_order_seek */, + index_has_first_key /* have_first_key */, false /* key_includes_seq */, + value_is_full, true /* block_contents_pinned */, + nullptr /* prefix_index */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + iter->SeekToFirst(); + size_t i = 0; + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->raw_value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + +void Block::InitializeMetaIndexBlockProtectionInfo( + uint8_t protection_bytes_per_key) { + protection_bytes_per_key_ = 0; + if (num_restarts_ > 0 && protection_bytes_per_key > 0) { + std::unique_ptr iter{ + NewMetaIterator(true /* block_contents_pinned */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + iter->SeekToFirst(); + size_t i = 0; + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { MetaBlockIter* iter = new MetaBlockIter(); if (size_ < 2 * sizeof(uint32_t)) { @@ -1045,7 +1201,8 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { iter->Invalidate(Status::OK()); } else { iter->Initialize(data_, restart_offset_, num_restarts_, - block_contents_pinned); + block_contents_pinned, protection_bytes_per_key_, + kv_checksum_, block_restart_interval_); } return iter; } @@ -1072,7 +1229,8 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, ret_iter->Initialize( raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, read_amp_bitmap_.get(), block_contents_pinned, - data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); + data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr, + protection_bytes_per_key_, kv_checksum_, block_restart_interval_); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ @@ -1108,8 +1266,9 @@ IndexBlockIter* Block::NewIndexIterator( total_order_seek ? 
nullptr : prefix_index; ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, prefix_index_ptr, have_first_key, - key_includes_seq, value_is_full, - block_contents_pinned); + key_includes_seq, value_is_full, block_contents_pinned, + protection_bytes_per_key_, kv_checksum_, + block_restart_interval_); } return ret_iter; @@ -1125,6 +1284,7 @@ size_t Block::ApproximateMemoryUsage() const { if (read_amp_bitmap_) { usage += read_amp_bitmap_->ApproximateMemoryUsage(); } + usage += checksum_size_; return usage; } diff --git a/table/block_based/block.h b/table/block_based/block.h index dfbca866325..68b6906fac9 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -14,6 +14,7 @@ #include #include +#include "db/kv_checksum.h" #include "db/pinned_iterators_manager.h" #include "port/malloc.h" #include "rocksdb/advanced_cache.h" @@ -240,6 +241,34 @@ class Block { // For TypedCacheInterface const Slice& ContentSlice() const { return contents_.data; } + // Initializes per key-value checksum protection. + // After this method is called, each DataBlockIterator returned + // by NewDataIterator will verify per key-value checksum for any key it read. + void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp); + + // Initializes per key-value checksum protection. + // After this method is called, each IndexBlockIterator returned + // by NewIndexIterator will verify per key-value checksum for any key it read. + // value_is_full and index_has_first_key are needed to be able to parse + // the index block content and construct checksums. + void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp, + bool value_is_full, + bool index_has_first_key); + + // Initializes per key-value checksum protection. + // After this method is called, each MetaBlockIter returned + // by NewMetaIterator will verify per key-value checksum for any key it read. + void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key); + + static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len, + const Slice& key, const Slice& value) { + ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr); + } + + const char* TEST_GetKVChecksum() const { return kv_checksum_; } + private: BlockContents contents_; const char* data_; // contents_.data.data() @@ -247,6 +276,11 @@ class Block { uint32_t restart_offset_; // Offset in data_ of restart array uint32_t num_restarts_; std::unique_ptr read_amp_bitmap_; + char* kv_checksum_{nullptr}; + uint32_t checksum_size_{0}; + // Used by block iterators to calculate current key index within a block + uint32_t block_restart_interval_{0}; + uint8_t protection_bytes_per_key_{0}; DataBlockHashIndex data_block_hash_index_; }; @@ -269,6 +303,14 @@ class Block { // `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These // "Impl" functions are responsible for positioning `raw_key_` but not // invoking `UpdateKey()`. +// +// Per key-value checksum is enabled if relevant states are passed in during +// `InitializeBase()`. The checksum verification is done in each call to +// UpdateKey() for the current key. Each subclass is responsible for keeping +// track of cur_entry_idx_, the index of the current key within the block. +// BlockIter uses this index to get the corresponding checksum for current key. 
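// ---------------------------------------------------------------------------
// Standalone sketch (not part of the diff) of the protection scheme described
// above: Block::GenerateKVChecksum() encodes ProtectionInfo64 for each
// (key, value) into protection_bytes_per_key bytes of kv_checksum_, and
// BlockIter::UpdateKey() recomputes it for the current entry and verifies it
// against the bytes stored at cur_entry_idx_. The helper name is hypothetical,
// and it assumes compilation inside the RocksDB tree since db/kv_checksum.h is
// an internal header.
#include <cstdint>

#include "db/kv_checksum.h"
#include "rocksdb/slice.h"

bool CheckOneEntry(const ROCKSDB_NAMESPACE::Slice& key,
                   const ROCKSDB_NAMESPACE::Slice& value,
                   uint8_t protection_bytes_per_key,
                   const char* stored_checksum) {
  using ROCKSDB_NAMESPACE::ProtectionInfo64;
  // Write side: what GenerateKVChecksum() stores per entry (8 bytes is the
  // largest supported width).
  char buf[8];
  ProtectionInfo64().ProtectKV(key, value).Encode(protection_bytes_per_key,
                                                  buf);
  // Read side: what UpdateKey() does before exposing the key to the caller.
  return ProtectionInfo64().ProtectKV(key, value).Verify(
      protection_bytes_per_key, stored_checksum);
}
// ---------------------------------------------------------------------------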
+// Additional checksum verification may be done in subclasses if they read keys +// other than the key being processed in UpdateKey(). template class BlockIter : public InternalIteratorBase { public: @@ -286,9 +328,16 @@ class BlockIter : public InternalIteratorBase { Cleanable::Reset(); } - bool Valid() const override { return current_ < restarts_; } + bool Valid() const override { + // When status_ is not ok, iter should be invalid. + assert(status_.ok() || current_ >= restarts_); + return current_ < restarts_; + } virtual void SeekToFirst() override final { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return; +#endif SeekToFirstImpl(); UpdateKey(); } @@ -325,6 +374,7 @@ class BlockIter : public InternalIteratorBase { } Status status() const override { return status_; } + Slice key() const override { assert(Valid()); return key_; @@ -337,10 +387,22 @@ class BlockIter : public InternalIteratorBase { (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); status_.PermitUncheckedError(); } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; + + bool TEST_Corrupt_Callback(const std::string& sync_point) { + bool corrupt = false; + TEST_SYNC_POINT_CALLBACK(sync_point, static_cast(&corrupt)); + + if (corrupt) { + CorruptionError(); + } + return corrupt; + } #endif bool IsKeyPinned() const override { @@ -377,27 +439,74 @@ class BlockIter : public InternalIteratorBase { Status status_; // Key to be exposed to users. Slice key_; + SequenceNumber global_seqno_; + + // Per key-value checksum related states + const char* kv_checksum_; + int32_t cur_entry_idx_; + uint32_t block_restart_interval_; + uint8_t protection_bytes_per_key_; + bool key_pinned_; // Whether the block data is guaranteed to outlive this iterator, and // as long as the cleanup functions are transferred to another class, // e.g. PinnableSlice, the pointer to the bytes will still be valid. bool block_contents_pinned_; - SequenceNumber global_seqno_; virtual void SeekToFirstImpl() = 0; virtual void SeekToLastImpl() = 0; virtual void SeekImpl(const Slice& target) = 0; virtual void SeekForPrevImpl(const Slice& target) = 0; virtual void NextImpl() = 0; - virtual void PrevImpl() = 0; + // Returns the restart interval of this block. + // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized. + virtual uint32_t GetRestartInterval() { + if (num_restarts_ <= 1 || data_ == nullptr) { + return 0; + } + SeekToFirstImpl(); + uint32_t end_index = GetRestartPoint(1); + uint32_t count = 1; + while (NextEntryOffset() < end_index && status_.ok()) { + assert(Valid()); + NextImpl(); + ++count; + } + return count; + } + + // Returns the number of keys in this block. + virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) { + if (num_restarts_ == 0 || data_ == nullptr) { + return 0; + } + uint32_t count = (num_restarts_ - 1) * block_restart_interval; + // Add number of keys from the last restart interval + SeekToRestartPoint(num_restarts_ - 1); + while (NextEntryOffset() < restarts_ && status_.ok()) { + NextImpl(); + ++count; + } + return count; + } + + // Stores whether the current key has a shared bytes with prev key in + // *is_shared. + // Sets raw_key_, value_ to the current parsed key and value. + // Sets restart_index_ to point to the restart interval that contains + // the current key. 
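// ---------------------------------------------------------------------------
// Worked example (hypothetical numbers, not part of the diff) of the key-count
// arithmetic used by NumberOfKeys() above: every restart interval except the
// last is assumed full, and the last interval is counted by scanning it entry
// by entry with NextImpl().
#include <cstdint>

constexpr uint32_t kRestartInterval = 16;    // block_restart_interval
constexpr uint32_t kNumRestarts = 2;         // num_restarts_
constexpr uint32_t kKeysInLastInterval = 9;  // counted by the scan loop
static_assert((kNumRestarts - 1) * kRestartInterval + kKeysInLastInterval == 25,
              "a 25-entry block: 16 keys from full intervals + 9 scanned");
// ---------------------------------------------------------------------------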
template inline bool ParseNextKey(bool* is_shared); + // protection_bytes_per_key, kv_checksum, and block_restart_interval + // are needed only for per kv checksum verification. void InitializeBase(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, bool block_contents_pinned) { + SequenceNumber global_seqno, bool block_contents_pinned, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid @@ -410,11 +519,41 @@ class BlockIter : public InternalIteratorBase { global_seqno_ = global_seqno; block_contents_pinned_ = block_contents_pinned; cache_handle_ = nullptr; + cur_entry_idx_ = -1; + protection_bytes_per_key_ = protection_bytes_per_key; + kv_checksum_ = kv_checksum; + block_restart_interval_ = block_restart_interval; + // Checksum related states are either all 0/nullptr or all non-zero. + // One exception is when num_restarts == 0, block_restart_interval can be 0 + // since we are not able to compute it. + assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) || + (protection_bytes_per_key > 0 && kv_checksum != nullptr && + (block_restart_interval > 0 || num_restarts == 1))); + } + + void CorruptionError(const std::string& error_msg = "bad entry in block") { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption(error_msg); + raw_key_.Clear(); + value_.clear(); + } + + void PerKVChecksumCorruptionError() { + std::string error_msg{ + "Corrupted block entry: per key-value checksum verification " + "failed."}; + error_msg.append(" Offset: " + std::to_string(current_) + "."); + error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + "."); + CorruptionError(error_msg); } // Must be called every time a key is found that needs to be returned to user, // and may be called when no key is found (as a no-op). Updates `key_`, // `key_buf_`, and `key_pinned_` with info about the found key. + // Per key-value checksum verification is done if available for the key to be + // returned. Iterator is invalidated with corruption status if checksum + // verification fails. 
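// ---------------------------------------------------------------------------
// Caller-side sketch (not part of the diff; the UpdateKey() implementation
// follows below): a failed per key-value verification invalidates the iterator
// with a Corruption status, so scans must consult status() once Valid() turns
// false. Shown with the public Iterator API for brevity; the helper name is
// hypothetical.
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/iterator.h"

rocksdb::Status ScanAll(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Consume it->key() / it->value(); a per key-value checksum mismatch in a
    // block ends this loop early.
  }
  return it->status();  // carries Status::Corruption on a failed verification
}
// ---------------------------------------------------------------------------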
void UpdateKey() { key_buf_.Clear(); if (!Valid()) { @@ -433,6 +572,19 @@ class BlockIter : public InternalIteratorBase { key_ = key_buf_.GetInternalKey(); key_pinned_ = false; } + TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value", + (void*)value_.data()); + TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len", + &protection_bytes_per_key_); + if (protection_bytes_per_key_ > 0) { + if (!ProtectionInfo64() + .ProtectKV(raw_key_.GetKey(), value_) + .Verify( + protection_bytes_per_key_, + kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) { + PerKVChecksumCorruptionError(); + } + } } // Returns the result of `Comparator::Compare()`, where the appropriate @@ -464,7 +616,7 @@ class BlockIter : public InternalIteratorBase { return static_cast((value_.data() + value_.size()) - data_); } - uint32_t GetRestartPoint(uint32_t index) { + uint32_t GetRestartPoint(uint32_t index) const { assert(index < num_restarts_); return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); } @@ -479,13 +631,20 @@ class BlockIter : public InternalIteratorBase { value_ = Slice(data_ + offset, 0); } - void CorruptionError(); - protected: template inline bool BinarySeek(const Slice& target, uint32_t* index, bool* is_index_key_result); + // Find the first key in restart interval `index` that is >= `target`. + // If there is no such key, iterator is positioned at the first key in + // restart interval `index + 1`. + // If is_index_key_result is true, it positions the iterator at the first key + // in this restart interval. + // Per key-value checksum verification is done for all keys scanned + // up to but not including the last key (the key that current_ points to + // when this function returns). This key's checksum is verified in + // UpdateKey(). void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, bool is_index_key_result); }; @@ -494,22 +653,17 @@ class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} - DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts, - uint32_t num_restarts, SequenceNumber global_seqno, - BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, - DataBlockHashIndex* data_block_hash_index) - : DataBlockIter() { - Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno, - read_amp_bitmap, block_contents_pinned, data_block_hash_index); - } void Initialize(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, - DataBlockHashIndex* data_block_hash_index) { + DataBlockHashIndex* data_block_hash_index, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, - block_contents_pinned); + block_contents_pinned, protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; @@ -527,7 +681,11 @@ class DataBlockIter final : public BlockIter { return value_; } + // Returns if `target` may exist. 
inline bool SeekForGet(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true; +#endif if (!data_block_hash_index_) { SeekImpl(target); UpdateKey(); @@ -599,11 +757,14 @@ class MetaBlockIter final : public BlockIter { public: MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, - bool block_contents_pinned) { + bool block_contents_pinned, uint8_t protection_bytes_per_key, + const char* kv_checksum, uint32_t block_restart_interval) { // Initializes the iterator with a BytewiseComparator and // the raw key being a user key. InitializeBase(BytewiseComparator(), data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + kDisableGlobalSequenceNumber, block_contents_pinned, + protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(true); } @@ -613,12 +774,17 @@ class MetaBlockIter final : public BlockIter { } protected: + friend Block; void SeekToFirstImpl() override; void SeekToLastImpl() override; void SeekImpl(const Slice& target) override; void SeekForPrevImpl(const Slice& target) override; void NextImpl() override; void PrevImpl() override; + // Meta index block's restart interval is always 1. See + // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval. + uint32_t GetRestartInterval() override { return 1; } + uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; } }; class IndexBlockIter final : public BlockIter { @@ -633,9 +799,13 @@ class IndexBlockIter final : public BlockIter { uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, bool have_first_key, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) { + bool value_is_full, bool block_contents_pinned, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { InitializeBase(raw_ucmp, data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + kDisableGlobalSequenceNumber, block_contents_pinned, + protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(!key_includes_seq); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; @@ -666,11 +836,17 @@ class IndexBlockIter final : public BlockIter { } } + Slice raw_value() const { + assert(Valid()); + return value_; + } + bool IsValuePinned() const override { return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); } protected: + friend Block; // IndexBlockIter follows a different contract for prefix iterator // from data iterators. 
// If prefix of the seek key `target` exists in the file, it must @@ -692,11 +868,8 @@ class IndexBlockIter final : public BlockIter { } void PrevImpl() override; - void NextImpl() override; - void SeekToFirstImpl() override; - void SeekToLastImpl() override; private: diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 2a8e44d1cb6..5121d1b43d7 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -450,7 +450,12 @@ struct BlockBasedTableBuilder::Rep { table_options, data_block)), create_context(&table_options, ioptions.stats, compression_type == kZSTD || - compression_type == kZSTDNotFinalCompression), + compression_type == kZSTDNotFinalCompression, + tbo.moptions.block_protection_bytes_per_key, + tbo.internal_comparator.user_comparator(), + !use_delta_encoding_for_index_values, + table_opt.index_type == + BlockBasedTableOptions::kBinarySearchWithFirstKey), status_ok(true), io_status_ok(true) { if (tbo.target_file_size == 0) { diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index dc852e543cd..2bca0703327 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -567,7 +567,8 @@ Status BlockBasedTableFactory::NewTableReader( return BlockBasedTable::Open( ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), - file_size, table_reader, table_reader_cache_res_mgr_, + file_size, table_reader_options.block_protection_bytes_per_key, + table_reader, table_reader_cache_res_mgr_, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, table_reader_options.largest_seqno, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 0ed42348f0c..b5144166198 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -560,6 +560,7 @@ Status BlockBasedTable::Open( const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, + uint8_t block_protection_bytes_per_key, std::unique_ptr* table_reader, std::shared_ptr table_reader_cache_res_mgr, const std::shared_ptr& prefix_extractor, @@ -645,6 +646,7 @@ Status BlockBasedTable::Open( // meta-block reads. 
rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + rep->create_context.protection_bytes_per_key = block_protection_bytes_per_key; // Read metaindex std::unique_ptr new_table( new BlockBasedTable(rep, block_cache_tracer)); @@ -671,9 +673,11 @@ Status BlockBasedTable::Open( CompressionTypeToString(kZSTD) || rep->table_properties->compression_name == CompressionTypeToString(kZSTDNotFinalCompression)); - rep->create_context = - BlockCreateContext(&rep->table_options, rep->ioptions.stats, - blocks_definitely_zstd_compressed); + rep->create_context = BlockCreateContext( + &rep->table_options, rep->ioptions.stats, + blocks_definitely_zstd_compressed, block_protection_bytes_per_key, + rep->internal_comparator.user_comparator(), rep->index_value_is_full, + rep->index_has_first_key); // Check expected unique id if provided if (expected_unique_id != kNullUniqueId64x2) { @@ -2168,6 +2172,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } s = biter.status(); + if (!s.ok()) { + break; + } } // Write the block cache access record. if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index df296a0d3d5..dafaa4ebf85 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -98,6 +98,7 @@ class BlockBasedTable : public TableReader { const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, std::unique_ptr&& file, uint64_t file_size, + uint8_t block_protection_bytes_per_key, std::unique_ptr* table_reader, std::shared_ptr table_reader_cache_res_mgr = nullptr, diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index eb1175a7d43..a6ee940d801 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -116,8 +116,9 @@ class BlockBasedTableReaderBaseTest : public testing::Test { bool prefetch_index_and_filter_in_cache = true, Status* status = nullptr) { const MutableCFOptions moptions(options_); - TableReaderOptions table_reader_options = TableReaderOptions( - ioptions, moptions.prefix_extractor, EnvOptions(), comparator); + TableReaderOptions table_reader_options = + TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), + comparator, 0 /* block_protection_bytes_per_key */); std::unique_ptr file; NewFileReader(table_name, foptions, &file); diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc index 318d30d84e3..a252899d24a 100644 --- a/table/block_based/block_cache.cc +++ b/table/block_based/block_cache.cc @@ -11,17 +11,25 @@ void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kData( std::move(block), table_options->read_amp_bytes_per_bit, statistics)); + parsed_out->get()->InitializeDataBlockProtectionInfo(protection_bytes_per_key, + raw_ucmp); } void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kIndex(std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeIndexBlockProtectionInfo( + protection_bytes_per_key, raw_ucmp, index_value_is_full, + index_has_first_key); } void BlockCreateContext::Create( std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kFilterPartitionIndex( std::move(block), 
/*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeIndexBlockProtectionInfo( + protection_bytes_per_key, raw_ucmp, index_value_is_full, + index_has_first_key); } void BlockCreateContext::Create( std::unique_ptr* parsed_out, BlockContents&& block) { @@ -32,6 +40,8 @@ void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kMetaIndex( std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeMetaIndexBlockProtectionInfo( + protection_bytes_per_key); } void BlockCreateContext::Create( diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h index ec39405fe54..00eaface370 100644 --- a/table/block_based/block_cache.h +++ b/table/block_based/block_cache.h @@ -70,14 +70,26 @@ class Block_kMetaIndex : public Block { struct BlockCreateContext : public Cache::CreateContext { BlockCreateContext() {} BlockCreateContext(const BlockBasedTableOptions* _table_options, - Statistics* _statistics, bool _using_zstd) + Statistics* _statistics, bool _using_zstd, + uint8_t _protection_bytes_per_key, + const Comparator* _raw_ucmp, + bool _index_value_is_full = false, + bool _index_has_first_key = false) : table_options(_table_options), statistics(_statistics), - using_zstd(_using_zstd) {} + using_zstd(_using_zstd), + protection_bytes_per_key(_protection_bytes_per_key), + raw_ucmp(_raw_ucmp), + index_value_is_full(_index_value_is_full), + index_has_first_key(_index_has_first_key) {} const BlockBasedTableOptions* table_options = nullptr; Statistics* statistics = nullptr; bool using_zstd = false; + uint8_t protection_bytes_per_key = 0; + const Comparator* raw_ucmp = nullptr; + bool index_value_is_full; + bool index_has_first_key; // For TypedCacheInterface template diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 83b87fe79e8..90a47ef2cc1 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -15,6 +15,7 @@ #include #include +#include "db/db_test_util.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" @@ -506,7 +507,7 @@ class IndexBlockTest void GenerateRandomIndexEntries(std::vector *separators, std::vector *block_handles, std::vector *first_keys, - const int len) { + const int len, bool zero_seqno = false) { Random rnd(42); // For each of `len` blocks, we need to generate a first and last key. @@ -514,7 +515,11 @@ void GenerateRandomIndexEntries(std::vector *separators, std::set keys; while ((int)keys.size() < len * 2) { // Keys need to be at least 8 bytes long to look like internal keys. 
- keys.insert(test::RandomKey(&rnd, 12)); + std::string new_key = test::RandomKey(&rnd, 12); + if (zero_seqno) { + AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue); + } + keys.insert(std::move(new_key)); } uint64_t offset = 0; @@ -618,6 +623,917 @@ INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, std::make_tuple(true, false), std::make_tuple(true, true))); +class BlockPerKVChecksumTest : public DBTestBase { + public: + BlockPerKVChecksumTest() + : DBTestBase("block_per_kv_checksum", /*env_do_fsync=*/false) {} + + template + void TestIterateForward(std::unique_ptr &biter, + size_t &verification_count) { + while (biter->Valid()) { + verification_count = 0; + biter->Next(); + if (biter->Valid()) { + ASSERT_GE(verification_count, 1); + } + } + } + + template + void TestIterateBackward(std::unique_ptr &biter, + size_t &verification_count) { + while (biter->Valid()) { + verification_count = 0; + biter->Prev(); + if (biter->Valid()) { + ASSERT_GE(verification_count, 1); + } + } + } + + template + void TestSeekToFirst(std::unique_ptr &biter, + size_t &verification_count) { + verification_count = 0; + biter->SeekToFirst(); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + } + + template + void TestSeekToLast(std::unique_ptr &biter, + size_t &verification_count) { + verification_count = 0; + biter->SeekToLast(); + ASSERT_GE(verification_count, 1); + TestIterateBackward(biter, verification_count); + } + + template + void TestSeekForPrev(std::unique_ptr &biter, + size_t &verification_count, std::string k) { + verification_count = 0; + biter->SeekForPrev(k); + ASSERT_GE(verification_count, 1); + TestIterateBackward(biter, verification_count); + } + + template + void TestSeek(std::unique_ptr &biter, size_t &verification_count, + std::string k) { + verification_count = 0; + biter->Seek(k); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + } + + bool VerifyChecksum(uint32_t checksum_len, const char *checksum_ptr, + const Slice &key, const Slice &val) { + if (!checksum_len) { + return checksum_ptr == nullptr; + } + return ProtectionInfo64().ProtectKV(key, val).Verify( + static_cast(checksum_len), checksum_ptr); + } +}; + +TEST_F(BlockPerKVChecksumTest, EmptyBlock) { + // Tests that empty block code path is not broken by per kv checksum. 
+ BlockBuilder builder( + 16 /* block_restart_interval */, true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch); + Slice raw_block = builder.Finish(); + BlockContents contents; + contents.data = raw_block; + + std::unique_ptr data_block; + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + protection_bytes_per_key, options.comparator}; + create_context.Create(&data_block, std::move(contents)); + std::unique_ptr biter{data_block->NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber)}; + biter->SeekToFirst(); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + Random rnd(33); + biter->SeekForGet(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->SeekToLast(); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->Seek(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->SeekForPrev(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); +} + +TEST_F(BlockPerKVChecksumTest, UnsupportedOptionValue) { + Options options = Options(); + options.block_protection_bytes_per_key = 128; + Destroy(options); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + +TEST_F(BlockPerKVChecksumTest, InitializeProtectionInfo) { + // Make sure that the checksum construction code path does not break + // when the block is itself already corrupted. + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + protection_bytes_per_key, options.comparator}; + + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr data_block; + create_context.Create(&data_block, std::move(contents)); + std::unique_ptr iter{data_block->NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr index_block; + create_context.Create(&index_block, std::move(contents)); + std::unique_ptr iter{index_block->NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr, nullptr, + true, false, true, true)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr meta_block; + create_context.Create(&meta_block, std::move(contents)); + std::unique_ptr iter{meta_block->NewMetaIterator(true)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } +} + +TEST_F(BlockPerKVChecksumTest, ApproximateMemory) { + // Tests that ApproximateMemoryUsage() includes memory used by block kv + // checksum. 
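// ---------------------------------------------------------------------------
// Back-of-the-envelope note (not part of the diff) for the 100-byte lower
// bound asserted in the test below: ApproximateMemoryUsage() now adds
// checksum_size_ = num_keys * protection_bytes_per_key, so with 20 records and
// 8 bytes per key the kv_checksum_ array alone contributes 160 extra bytes,
// comfortably above the threshold regardless of allocator rounding.
static_assert(20 * 8 > 100,
              "per key-value checksum overhead exceeds the asserted bound");
// ---------------------------------------------------------------------------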
+ const int kNumRecords = 20; + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords, 1 /* step */, + 24 /* padding_size */); + std::unique_ptr builder; + auto generate_block_content = [&]() { + builder = std::make_unique(16 /* restart_interval */); + for (int i = 0; i < kNumRecords; ++i) { + builder->Add(keys[i], values[i]); + } + Slice raw_block = builder->Finish(); + BlockContents contents; + contents.data = raw_block; + return contents; + }; + + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext with_checksum_create_context{ + &tbo, + nullptr /* statistics */, + false /* using_zstd */, + protection_bytes_per_key, + options.comparator, + true /* index_value_is_full */}; + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + 0, options.comparator, true /* index_value_is_full */}; + + { + std::unique_ptr data_block; + create_context.Create(&data_block, generate_block_content()); + size_t block_memory = data_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_data_block; + with_checksum_create_context.Create(&with_checksum_data_block, + generate_block_content()); + ASSERT_GT(with_checksum_data_block->ApproximateMemoryUsage() - block_memory, + 100); + } + + { + std::unique_ptr meta_block; + create_context.Create(&meta_block, generate_block_content()); + size_t block_memory = meta_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_meta_block; + with_checksum_create_context.Create(&with_checksum_meta_block, + generate_block_content()); + // Rough comparison to avoid flaky test due to memory allocation alignment. + ASSERT_GT(with_checksum_meta_block->ApproximateMemoryUsage() - block_memory, + 100); + } + + { + // Index block has different contents. + std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords); + auto generate_index_content = [&]() { + builder = std::make_unique(16 /* restart_interval */); + BlockHandle last_encoded_handle; + for (int i = 0; i < kNumRecords; ++i) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, false, nullptr); + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder->Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + Slice raw_block = builder->Finish(); + BlockContents contents; + contents.data = raw_block; + return contents; + }; + + std::unique_ptr index_block; + create_context.Create(&index_block, generate_index_content()); + size_t block_memory = index_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_index_block; + with_checksum_create_context.Create(&with_checksum_index_block, + generate_index_content()); + ASSERT_GT( + with_checksum_index_block->ApproximateMemoryUsage() - block_memory, + 100); + } +} + +std::string GetDataBlockIndexTypeStr( + BlockBasedTableOptions::DataBlockIndexType t) { + return t == BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch + ? 
"BinarySearch" + : "BinaryAndHash"; +} + +class DataBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface> { + public: + DataBlockKVChecksumTest() = default; + + BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const { + return std::get<0>(GetParam()); + } + uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); } + uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); } + bool GetUseDeltaEncoding() const { return std::get<3>(GetParam()); } + + std::unique_ptr GenerateDataBlock( + std::vector &keys, std::vector &values, + int num_record) { + BlockBasedTableOptions tbo; + BlockCreateContext create_context{&tbo, nullptr /* statistics */, + false /* using_zstd */, GetChecksumLen(), + Options().comparator}; + builder_ = std::make_unique( + static_cast(GetRestartInterval()), + GetUseDeltaEncoding() /* use_delta_encoding */, + false /* use_value_delta_encoding */, GetDataBlockIndexType()); + for (int i = 0; i < num_record; i++) { + builder_->Add(keys[i], values[i]); + } + Slice raw_block = builder_->Finish(); + BlockContents contents; + contents.data = raw_block; + std::unique_ptr data_block; + create_context.Create(&data_block, std::move(contents)); + return data_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P( + P, DataBlockKVChecksumTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(0, 1, 2, 4, 8) /* protection_bytes_per_key */, + ::testing::Values(1, 2, 3, 8, 16) /* restart_interval */, + ::testing::Values(false, true)) /* delta_encoding */, + [](const testing::TestParamInfo> + &args) { + std::ostringstream oss; + oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) + << "ProtectionPerKey" << std::to_string(std::get<1>(args.param)) + << "RestartInterval" << std::to_string(std::get<2>(args.param)) + << "DeltaEncode" << std::to_string(std::get<3>(args.param)); + return oss.str(); + }); + +TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) { + uint8_t protection_bytes_per_key = GetChecksumLen(); + std::vector num_restart_intervals = {1, 16}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr data_block = + GenerateDataBlock(keys, values, kNumRecords); + + const char *checksum_ptr = data_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, + checksum_ptr + i * protection_bytes_per_key, + keys[i], values[i])); + } + std::vector seqnos{kDisableGlobalSequenceNumber, 0}; + + // Could just use a boolean flag. Use a counter here just to keep open the + // possibility of checking the exact number of verifications in the future. + size_t verification_count = 0; + // The SyncPoint is placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). So verification count is incremented even with + // protection_bytes_per_key = 0. No actual checksum computation is done in + // that case (see Block::VerifyChecksum()). 
+ SyncPoint::GetInstance()->SetCallBack( + "Block::VerifyChecksum::checksum_len", + [&verification_count, protection_bytes_per_key](void *checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), + protection_bytes_per_key); + ++verification_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (const auto seqno : seqnos) { + std::unique_ptr biter{ + data_block->NewDataIterator(Options().comparator, seqno)}; + + // SeekForGet() some key that does not exist + biter->SeekForGet(keys[kNumRecords]); + TestIterateForward(biter, verification_count); + + verification_count = 0; + biter->SeekForGet(keys[kNumRecords / 2]); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + + TestSeekToFirst(biter, verification_count); + TestSeekToLast(biter, verification_count); + TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]); + TestSeek(biter, verification_count, keys[kNumRecords / 2]); + } + } +} + +class IndexBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface< + std::tuple> { + public: + IndexBlockKVChecksumTest() = default; + + BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const { + return std::get<0>(GetParam()); + } + uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); } + uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); } + bool UseValueDeltaEncoding() const { return std::get<3>(GetParam()); } + bool IncludeFirstKey() const { return std::get<4>(GetParam()); } + + std::unique_ptr GenerateIndexBlock( + std::vector &separators, + std::vector &block_handles, + std::vector &first_keys, int num_record) { + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = GetChecksumLen(); + BlockCreateContext create_context{ + &tbo, + nullptr /* statistics */, + false /* _using_zstd */, + protection_bytes_per_key, + options.comparator, + !UseValueDeltaEncoding() /* value_is_full */, + IncludeFirstKey()}; + builder_ = std::make_unique( + static_cast(GetRestartInterval()), true /* use_delta_encoding */, + UseValueDeltaEncoding() /* use_value_delta_encoding */, + GetDataBlockIndexType()); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_record; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, IncludeFirstKey(), nullptr); + if (UseValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, IncludeFirstKey(), + &last_encoded_handle); + } + + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder_->Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + // read serialized contents of the block + Slice raw_block = builder_->Finish(); + // create block reader + BlockContents contents; + contents.data = raw_block; + std::unique_ptr index_block; + + create_context.Create(&index_block, std::move(contents)); + return index_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P( + P, IndexBlockKVChecksumTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(0, 1, 2, 4, 8), ::testing::Values(1, 3, 8, 16), + ::testing::Values(true, false), ::testing::Values(true, false)), + [](const testing::TestParamInfo< + std::tuple> &args) { + std::ostringstream oss; + oss << 
GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" + << std::to_string(std::get<1>(args.param)) << "RestartInterval" + << std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode" + << std::to_string(std::get<3>(args.param)) << "IncludeFirstKey" + << std::to_string(std::get<4>(args.param)); + return oss.str(); + }); + +TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { + Options options = Options(); + uint8_t protection_bytes_per_key = GetChecksumLen(); + std::vector num_restart_intervals = {1, 16}; + std::vector seqnos{kDisableGlobalSequenceNumber, 10001}; + + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + for (const auto seqno : seqnos) { + std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords, + seqno != kDisableGlobalSequenceNumber); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr index_block = GenerateIndexBlock( + separators, block_handles, first_keys, kNumRecords); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + std::unique_ptr biter{index_block->NewIndexIterator( + options.comparator, seqno, kNullIter, kNullStats, + true /* total_order_seek */, IncludeFirstKey() /* have_first_key */, + true /* key_includes_seq */, + !UseValueDeltaEncoding() /* value_is_full */, + true /* block_contents_pinned */, nullptr /* prefix_index */)}; + biter->SeekToFirst(); + const char *checksum_ptr = index_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + // Obtaining the actual content written as value to index block is not + // trivial: delta-encoded value is only persisted when not at block + // restart point and that keys share some byte (see more in + // BlockBuilder::AddWithLastKeyImpl()). So here we just do verification + // using value from iterator unlike tests for DataBlockIter or + // MetaBlockIter. + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, checksum_ptr, + biter->key(), biter->raw_value())); + } + + size_t verification_count = 0; + // The SyncPoint is placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). To make the testing code below simpler and not + // having to differentiate 0 vs non-0 checksum_len, we do an explicit + // assert checking on checksum_len here. 
+ SyncPoint::GetInstance()->SetCallBack( + "Block::VerifyChecksum::checksum_len", + [&verification_count, protection_bytes_per_key](void *checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), + protection_bytes_per_key); + ++verification_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + TestSeekToFirst(biter, verification_count); + TestSeekToLast(biter, verification_count); + TestSeek(biter, verification_count, first_keys[kNumRecords / 2]); + } + } +} + +class MetaIndexBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface< + uint8_t /* block_protection_bytes_per_key */> { + public: + MetaIndexBlockKVChecksumTest() = default; + uint8_t GetChecksumLen() const { return GetParam(); } + uint32_t GetRestartInterval() const { return 1; } + + std::unique_ptr GenerateMetaIndexBlock( + std::vector &keys, std::vector &values, + int num_record) { + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = GetChecksumLen(); + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + protection_bytes_per_key, options.comparator}; + builder_ = + std::make_unique(static_cast(GetRestartInterval())); + // add a bunch of records to a block + for (int i = 0; i < num_record; i++) { + builder_->Add(keys[i], values[i]); + } + Slice raw_block = builder_->Finish(); + BlockContents contents; + contents.data = raw_block; + std::unique_ptr meta_block; + create_context.Create(&meta_block, std::move(contents)); + return meta_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest, + ::testing::Values(0, 1, 2, 4, 8), + [](const testing::TestParamInfo &args) { + std::ostringstream oss; + oss << "ProtBytes" << std::to_string(args.param); + return oss.str(); + }); + +TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = GetChecksumLen(); + BlockCreateContext create_context{ + &tbo, nullptr /* statistics */, false /* using_zstd */, + protection_bytes_per_key, options.comparator}; + std::vector num_restart_intervals = {1, 16}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = num_restart_interval * GetRestartInterval(); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr meta_block = + GenerateMetaIndexBlock(keys, values, kNumRecords); + const char *checksum_ptr = meta_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, + checksum_ptr + i * protection_bytes_per_key, + keys[i], values[i])); + } + + size_t verification_count = 0; + // The SyncPoint is placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). To make the testing code below simpler and not + // having to differentiate 0 vs non-0 checksum_len, we do an explicit assert + // checking on checksum_len here. 
+ SyncPoint::GetInstance()->SetCallBack( + "Block::VerifyChecksum::checksum_len", + [&verification_count, protection_bytes_per_key](void *checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), + protection_bytes_per_key); + ++verification_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + // Check that block iterator does checksum verification + std::unique_ptr biter{ + meta_block->NewMetaIterator(true /* block_contents_pinned */)}; + TestSeekToFirst(biter, verification_count); + TestSeekToLast(biter, verification_count); + TestSeek(biter, verification_count, keys[kNumRecords / 2]); + TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]); + } +} + +class DataBlockKVChecksumCorruptionTest : public DataBlockKVChecksumTest { + public: + DataBlockKVChecksumCorruptionTest() = default; + + std::unique_ptr GenerateDataBlockIter( + std::vector &keys, std::vector &values, + int num_record) { + // During Block construction, we may create block iter to initialize per kv + // checksum. Disable syncpoint that may be created for block iter methods. + SyncPoint::GetInstance()->DisableProcessing(); + block_ = GenerateDataBlock(keys, values, num_record); + std::unique_ptr biter{block_->NewDataIterator( + Options().comparator, kDisableGlobalSequenceNumber)}; + SyncPoint::GetInstance()->EnableProcessing(); + return biter; + } + + protected: + std::unique_ptr block_; +}; + +TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) { + std::vector num_restart_intervals = {1, 3}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->SetCallBack( + "BlockIter::UpdateKey::value", [](void *arg) { + char *value = static_cast(arg); + // values generated by GenerateRandomKVs are of length 100 + ++value[10]; + }); + + // Purely for reducing the number of lines of code. 
+ typedef std::unique_ptr IterPtr; + typedef void(IterAPI)(IterPtr & iter, std::string &); + + std::string seek_key = keys[kNumRecords / 2]; + auto test_seek = [&](IterAPI iter_api) { + IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords); + ASSERT_OK(biter->status()); + iter_api(biter, seek_key); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); + test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); + test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); + test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); }); + test_seek([](IterPtr &iter, std::string &k) { iter->SeekForGet(k); }); + + typedef void (DataBlockIter::*IterStepAPI)(); + auto test_step = [&](IterStepAPI iter_api, std::string &k) { + IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords); + SyncPoint::GetInstance()->DisableProcessing(); + biter->Seek(k); + ASSERT_TRUE(biter->Valid()); + ASSERT_OK(biter->status()); + SyncPoint::GetInstance()->EnableProcessing(); + std::invoke(iter_api, biter); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + if (kNumRecords > 1) { + test_step(&DataBlockIter::Prev, seek_key); + test_step(&DataBlockIter::Next, seek_key); + } + } +} + +INSTANTIATE_TEST_CASE_P( + P, DataBlockKVChecksumCorruptionTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(4, 8) /* block_protection_bytes_per_key */, + ::testing::Values(1, 3, 8, 16) /* restart_interval */, + ::testing::Values(false, true)), + [](const testing::TestParamInfo> + &args) { + std::ostringstream oss; + oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" + << std::to_string(std::get<1>(args.param)) << "RestartInterval" + << std::to_string(std::get<2>(args.param)) << "DeltaEncode" + << std::to_string(std::get<3>(args.param)); + return oss.str(); + }); + +class IndexBlockKVChecksumCorruptionTest : public IndexBlockKVChecksumTest { + public: + IndexBlockKVChecksumCorruptionTest() = default; + + std::unique_ptr GenerateIndexBlockIter( + std::vector &separators, + std::vector &block_handles, + std::vector &first_keys, int num_record, + SequenceNumber seqno) { + SyncPoint::GetInstance()->DisableProcessing(); + block_ = + GenerateIndexBlock(separators, block_handles, first_keys, num_record); + std::unique_ptr biter{block_->NewIndexIterator( + Options().comparator, seqno, nullptr, nullptr, + true /* total_order_seek */, IncludeFirstKey() /* have_first_key */, + true /* key_includes_seq */, + !UseValueDeltaEncoding() /* value_is_full */, + true /* block_contents_pinned */, nullptr /* prefix_index */)}; + SyncPoint::GetInstance()->EnableProcessing(); + return biter; + } + + protected: + std::unique_ptr block_; +}; + +INSTANTIATE_TEST_CASE_P( + P, IndexBlockKVChecksumCorruptionTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(4, 8) /* block_protection_bytes_per_key */, + ::testing::Values(1, 3, 8, 16) /* restart_interval */, + ::testing::Values(true, false), ::testing::Values(true, false)), + [](const testing::TestParamInfo< + std::tuple> &args) { + std::ostringstream oss; + oss << 
GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" + << std::to_string(std::get<1>(args.param)) << "RestartInterval" + << std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode" + << std::to_string(std::get<3>(args.param)) << "IncludeFirstKey" + << std::to_string(std::get<4>(args.param)); + return oss.str(); + }); + +TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) { + std::vector num_restart_intervals = {1, 3}; + std::vector seqnos{kDisableGlobalSequenceNumber, 10001}; + + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + for (const auto seqno : seqnos) { + std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords, + seqno != kDisableGlobalSequenceNumber); + SyncPoint::GetInstance()->SetCallBack( + "BlockIter::UpdateKey::value", [](void *arg) { + char *value = static_cast(arg); + // value can be delta-encoded with different lengths, so we corrupt + // first bytes here to be safe + ++value[0]; + }); + + typedef std::unique_ptr IterPtr; + typedef void(IterAPI)(IterPtr & iter, std::string &); + std::string seek_key = first_keys[kNumRecords / 2]; + auto test_seek = [&](IterAPI iter_api) { + std::unique_ptr biter = GenerateIndexBlockIter( + separators, block_handles, first_keys, kNumRecords, seqno); + ASSERT_OK(biter->status()); + iter_api(biter, seek_key); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); + test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); + test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); + + typedef void (IndexBlockIter::*IterStepAPI)(); + auto test_step = [&](IterStepAPI iter_api, std::string &k) { + std::unique_ptr biter = GenerateIndexBlockIter( + separators, block_handles, first_keys, kNumRecords, seqno); + SyncPoint::GetInstance()->DisableProcessing(); + biter->Seek(k); + ASSERT_TRUE(biter->Valid()); + ASSERT_OK(biter->status()); + SyncPoint::GetInstance()->EnableProcessing(); + std::invoke(iter_api, biter); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + if (kNumRecords > 1) { + test_step(&IndexBlockIter::Prev, seek_key); + test_step(&IndexBlockIter::Next, seek_key); + } + } + } +} + +class MetaIndexBlockKVChecksumCorruptionTest + : public MetaIndexBlockKVChecksumTest { + public: + MetaIndexBlockKVChecksumCorruptionTest() = default; + + std::unique_ptr GenerateMetaIndexBlockIter( + std::vector &keys, std::vector &values, + int num_record) { + SyncPoint::GetInstance()->DisableProcessing(); + block_ = GenerateMetaIndexBlock(keys, values, num_record); + std::unique_ptr biter{ + block_->NewMetaIterator(true /* block_contents_pinned */)}; + SyncPoint::GetInstance()->EnableProcessing(); + return biter; + } + + protected: + std::unique_ptr block_; +}; + +INSTANTIATE_TEST_CASE_P( + P, MetaIndexBlockKVChecksumCorruptionTest, + ::testing::Values(4, 8) /* block_protection_bytes_per_key */, + [](const testing::TestParamInfo &args) { + std::ostringstream oss; + oss << "ProtBytes" << std::to_string(args.param); + return oss.str(); + }); + +TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) { + Options options = Options(); + std::vector num_restart_intervals = {1, 3}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + 
num_restart_interval * static_cast(GetRestartInterval()); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->SetCallBack( + "BlockIter::UpdateKey::value", [](void *arg) { + char *value = static_cast(arg); + // values generated by GenerateRandomKVs are of length 100 + ++value[10]; + }); + + typedef std::unique_ptr IterPtr; + typedef void(IterAPI)(IterPtr & iter, std::string &); + typedef void (MetaBlockIter::*IterStepAPI)(); + std::string seek_key = keys[kNumRecords / 2]; + auto test_seek = [&](IterAPI iter_api) { + IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords); + ASSERT_OK(biter->status()); + iter_api(biter, seek_key); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); + test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); + test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); + test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); }); + + auto test_step = [&](IterStepAPI iter_api, const std::string &k) { + IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords); + SyncPoint::GetInstance()->DisableProcessing(); + biter->Seek(k); + ASSERT_TRUE(biter->Valid()); + ASSERT_OK(biter->status()); + SyncPoint::GetInstance()->EnableProcessing(); + std::invoke(iter_api, biter); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + if (kNumRecords > 1) { + test_step(&MetaBlockIter::Prev, seek_key); + test_step(&MetaBlockIter::Next, seek_key); + } + } +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char **argv) { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index cd2e30833dd..2841b271dea 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -581,8 +581,9 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, const bool kImmortal = true; ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, - internal_comparator, !kSkipFilters, !kImmortal, - level_), + internal_comparator, + 0 /* block_protection_bytes_per_key */, !kSkipFilters, + !kImmortal, level_), std::move(file_reader), sink->contents().size(), &table_reader)); // Search using Get() ReadOptions ro; diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index 6d983f9b74a..9696a509dce 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -266,9 +266,9 @@ class BlockFetcherTest : public testing::Test { const auto* table_options = table_factory_.GetOptions(); ASSERT_NE(table_options, nullptr); - ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, - comparator, std::move(file), file_size, - &table_reader)); + ASSERT_OK(BlockBasedTable::Open( + ro, ioptions, EnvOptions(), *table_options, comparator, std::move(file), + file_size, 0 /* block_protection_bytes_per_key */, &table_reader)); table->reset(reinterpret_cast(table_reader.release())); } diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index fa3e5b47da9..f6d6e195d68 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -165,10 +165,10 @@ Status SstFileDumper::NewTableReader( const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, const 
InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
     std::unique_ptr<TableReader>* /*table_reader*/) {
-  auto t_opt =
-      TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_,
-                         internal_comparator_, false /* skip_filters */,
-                         false /* imortal */, true /* force_direct_prefetch */);
+  auto t_opt = TableReaderOptions(
+      ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
+      0 /* block_protection_bytes_per_key */, false /* skip_filters */,
+      false /* immortal */, true /* force_direct_prefetch */);
   // Allow open file with global sequence number for backward compatibility.
   t_opt.largest_seqno = kMaxSequenceNumber;
 
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index c95c91743f0..533b7cd6ac7 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -56,7 +56,8 @@ Status SstFileReader::Open(const std::string& file_path) {
   }
   if (s.ok()) {
     TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor,
-                             r->soptions, r->ioptions.internal_comparator);
+                             r->soptions, r->ioptions.internal_comparator,
+                             r->moptions.block_protection_bytes_per_key);
     // Allow open file with global sequence number for backward compatibility.
     t_opt.largest_seqno = kMaxSequenceNumber;
     s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),
diff --git a/table/table_builder.h b/table/table_builder.h
index 1790f33b1b3..e1bb4b55747 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -37,9 +37,9 @@ struct TableReaderOptions {
       const std::shared_ptr<const SliceTransform>& _prefix_extractor,
       const EnvOptions& _env_options,
       const InternalKeyComparator& _internal_comparator,
-      bool _skip_filters = false, bool _immortal = false,
-      bool _force_direct_prefetch = false, int _level = -1,
-      BlockCacheTracer* const _block_cache_tracer = nullptr,
+      uint8_t _block_protection_bytes_per_key, bool _skip_filters = false,
+      bool _immortal = false, bool _force_direct_prefetch = false,
+      int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr,
       size_t _max_file_size_for_l0_meta_pin = 0,
       const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0,
       UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0)
@@ -56,7 +56,8 @@
         max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin),
         cur_db_session_id(_cur_db_session_id),
         cur_file_num(_cur_file_num),
-        unique_id(_unique_id) {}
+        unique_id(_unique_id),
+        block_protection_bytes_per_key(_block_protection_bytes_per_key) {}
 
   const ImmutableOptions& ioptions;
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
@@ -86,6 +87,8 @@
 
   // Known unique_id or {}, kNullUniqueId64x2 means unknown
   UniqueId64x2 unique_id;
+
+  uint8_t block_protection_bytes_per_key;
 };
 
 struct TableBuilderOptions {
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index 09146f0efb6..60c84d7bf09 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -144,7 +144,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
         new RandomAccessFileReader(std::move(raf), file_name));
     s = opts.table_factory->NewTableReader(
         TableReaderOptions(ioptions, moptions.prefix_extractor, env_options,
-                           ikc),
+                           ikc, 0 /* block_protection_bytes_per_key */),
         std::move(file_reader), file_size, &table_reader);
     if (!s.ok()) {
       fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
diff --git a/table/table_test.cc b/table/table_test.cc
index 8f0f4b1f1ac..17b8af9f131 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -444,7 +444,9 @@
class TableConstructor : public Constructor { file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, - *last_internal_comparator_, /*skip_filters*/ false, + *last_internal_comparator_, + 0 /* block_protection_bytes_per_key */, + /*skip_filters*/ false, /*immortal*/ false, false, level_, &block_cache_tracer_, moptions.write_buffer_size, "", file_num_, kNullUniqueId64x2, largest_seqno_), @@ -4795,7 +4797,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { options.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), - ikc), + ikc, 0 /* block_protection_bytes_per_key */), std::move(file_reader), ss_rw.contents().size(), &table_reader); return table_reader->NewIterator( @@ -4964,7 +4966,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(), - GetPlainInternalComparator(options2.comparator)), + GetPlainInternalComparator(options2.comparator), + 0 /* block_protection_bytes_per_key */), std::move(file_reader), sink->contents().size(), &table_reader)); ReadOptions read_options; @@ -5081,63 +5084,53 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { } TEST_P(BlockBasedTableTest, CompressionRatioThreshold) { - Options options; - if (Snappy_Supported()) { - options.compression = kSnappyCompression; - fprintf(stderr, "using snappy\n"); - } else if (Zlib_Supported()) { - options.compression = kZlibCompression; - fprintf(stderr, "using zlib\n"); - } else if (BZip2_Supported()) { - options.compression = kBZip2Compression; - fprintf(stderr, "using bzip2\n"); - } else if (LZ4_Supported()) { - options.compression = kLZ4Compression; - fprintf(stderr, "using lz4\n"); - } else if (XPRESS_Supported()) { - options.compression = kXpressCompression; - fprintf(stderr, "using xpress\n"); - } else if (ZSTD_Supported()) { - options.compression = kZSTD; - fprintf(stderr, "using ZSTD\n"); - } else { - fprintf(stderr, "skipping test, compression disabled\n"); - return; - } - - BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - int len = 10000; - Random rnd(301); - std::vector keys; - stl_wrappers::KVMap kvmap; + for (CompressionType type : GetSupportedCompressions()) { + if (type == kNoCompression) { + continue; + } + if (type == kBZip2Compression) { + // Weird behavior in this test + continue; + } + SCOPED_TRACE("Compression type: " + std::to_string(type)); - // Test the max_compressed_bytes_per_kb option - for (int threshold : {0, 1, 100, 400, 600, 900, 1024}) { - SCOPED_TRACE("threshold=" + std::to_string(threshold)); - options.compression_opts.max_compressed_bytes_per_kb = threshold; - ImmutableOptions ioptions(options); - MutableCFOptions moptions(options); + Options options; + options.compression = type; - for (double compressible_to : {0.25, 0.75}) { - SCOPED_TRACE("compressible_to=" + std::to_string(compressible_to)); - TableConstructor c(BytewiseComparator(), - true /* convert_to_internal_key_ */); - std::string buf; - c.Add("x", test::CompressibleString(&rnd, compressible_to, len, &buf)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + int len = 10000; + Random rnd(301); + std::vector keys; + stl_wrappers::KVMap kvmap; - // write an SST file - c.Finish(options, ioptions, moptions, table_options, - GetPlainInternalComparator(options.comparator), 
&keys, &kvmap); + // Test the max_compressed_bytes_per_kb option + for (int threshold : {0, 1, 100, 400, 600, 900, 1024}) { + SCOPED_TRACE("threshold=" + std::to_string(threshold)); + options.compression_opts.max_compressed_bytes_per_kb = threshold; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); - size_t table_file_size = c.TEST_GetSink()->contents().size(); - size_t approx_sst_overhead = 1000; - if (compressible_to < threshold / 1024.0) { - // Should be compressed - EXPECT_NEAR2(len * compressible_to + approx_sst_overhead, - table_file_size, len / 10); - } else { - // Should not be compressed - EXPECT_NEAR2(len + approx_sst_overhead, table_file_size, len / 10); + for (double compressible_to : {0.25, 0.75}) { + SCOPED_TRACE("compressible_to=" + std::to_string(compressible_to)); + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + std::string buf; + c.Add("x", test::CompressibleString(&rnd, compressible_to, len, &buf)); + + // write an SST file + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + size_t table_file_size = c.TEST_GetSink()->contents().size(); + size_t approx_sst_overhead = 1000; + if (compressible_to < threshold / 1024.0) { + // Should be compressed (substantial variance depending on algorithm) + EXPECT_NEAR2(len * compressible_to + approx_sst_overhead, + table_file_size, len / 8); + } else { + // Should not be compressed + EXPECT_NEAR2(len + approx_sst_overhead, table_file_size, len / 10); + } } } } diff --git a/test_util/testutil.cc b/test_util/testutil.cc index b3dfc0830ee..2500f926eef 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -76,7 +76,7 @@ extern Slice CompressibleString(Random* rnd, double compressed_fraction, int len, std::string* dst) { int raw = static_cast(len * compressed_fraction); if (raw < 1) raw = 1; - std::string raw_data = rnd->RandomString(raw); + std::string raw_data = rnd->RandomBinaryString(raw); // Duplicate the random data until we have filled "len" bytes dst->clear(); diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 19f3ca24d26..784de098c89 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -125,7 +125,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb") +declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a db_forward_no_options_refs=() # N/A at the moment diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 759f634b2e2..ea40f5fa096 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1725,6 +1725,10 @@ DEFINE_uint32( "This options determines the size of such checksums. " "Supported values: 0, 1, 2, 4, 8."); +DEFINE_uint32(block_protection_bytes_per_key, 0, + "Enable block per key-value checksum protection. 
" + "Supported values: 0, 1, 2, 4, 8."); + DEFINE_bool(build_info, false, "Print the build info via GetRocksBuildInfoAsString"); @@ -4565,6 +4569,8 @@ class Benchmark { } options.memtable_protection_bytes_per_key = FLAGS_memtable_protection_bytes_per_key; + options.block_protection_bytes_per_key = + FLAGS_block_protection_bytes_per_key; } void InitializeOptionsGeneral(Options* opts) { diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index ac478e7adfa..b57905ed412 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -37,6 +37,7 @@ "backup_one_in": 100000, "batch_protection_bytes_per_key": lambda: random.choice([0, 8]), "memtable_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]), + "block_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]), "block_size": 16384, "bloom_bits": lambda: random.choice( [random.randint(0, 19), random.lognormvariate(2.3, 1.3)] diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 3f25c22a294..015ceb90761 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -1187,6 +1187,12 @@ TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { options.statistics = statistics; Open(bdb_options, options); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted", + "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush"}}); + + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size()); std::string small_value(50, 'v'); std::map data; @@ -1196,10 +1202,15 @@ TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { ASSERT_OK(Put("key" + std::to_string(i), small_value, &data)); } ASSERT_OK(blob_db_->Flush(FlushOptions())); + uint64_t live_sst_size = 0; ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &live_sst_size)); ASSERT_TRUE(live_sst_size > 0); + + TEST_SYNC_POINT( + "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush"); + ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size()); bdb_options.max_db_size = live_sst_size + 2000; @@ -1223,6 +1234,8 @@ TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); // Verify large_key2 still exists. VerifyDB(data); + + SyncPoint::GetInstance()->DisableProcessing(); } // Test flush or compaction will trigger FIFO eviction since they update @@ -1241,6 +1254,12 @@ TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) { options.compression = kNoCompression; Open(bdb_options, options); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted", + "BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush"}}); + + SyncPoint::GetInstance()->EnableProcessing(); + std::string value(800, 'v'); ASSERT_OK(PutWithTTL("large_key", value, 60)); ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); @@ -1254,11 +1273,15 @@ TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) { } ASSERT_OK(blob_db_->Flush(FlushOptions())); + TEST_SYNC_POINT("BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush"); + // Verify large_key is deleted by FIFO eviction. 
blob_db_impl()->TEST_DeleteObsoleteFiles(); ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); VerifyDB(data); + + SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(BlobDBTest, InlineSmallValues) { @@ -1637,6 +1660,12 @@ TEST_F(BlobDBTest, FilterForFIFOEviction) { options.disable_auto_compactions = true; Open(bdb_options, options); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted", + "BlobDBTest.FilterForFIFOEviction:AfterFlush"}}); + + SyncPoint::GetInstance()->EnableProcessing(); + std::map data; std::map data_after_compact; // Insert some small values that will be inlined. @@ -1651,6 +1680,9 @@ TEST_F(BlobDBTest, FilterForFIFOEviction) { } uint64_t num_keys_to_evict = data.size() - data_after_compact.size(); ASSERT_OK(blob_db_->Flush(FlushOptions())); + + TEST_SYNC_POINT("BlobDBTest.FilterForFIFOEviction:AfterFlush"); + uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size(); ASSERT_GT(live_sst_size, 0); VerifyDB(data); @@ -1702,6 +1734,8 @@ TEST_F(BlobDBTest, FilterForFIFOEviction) { data_after_compact["large_key2"] = large_value; data_after_compact["large_key3"] = large_value; VerifyDB(data_after_compact); + + SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(BlobDBTest, GarbageCollection) { @@ -2394,4 +2428,3 @@ int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } -
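A minimal usage sketch for the new block_protection_bytes_per_key option exercised by this patch. The DB path and the chosen protection size below are illustrative; per the db_bench flag added above, the supported sizes are 0, 1, 2, 4, and 8 bytes per key-value, with 0 (the default) leaving protection off.

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::Options;
using ROCKSDB_NAMESPACE::ReadOptions;
using ROCKSDB_NAMESPACE::Status;
using ROCKSDB_NAMESPACE::WriteOptions;

int main() {
  Options options;
  options.create_if_missing = true;
  // Enable per key-value checksum protection for blocks held in block cache.
  // Larger values use more memory per cached entry in exchange for stronger
  // corruption detection.
  options.block_protection_bytes_per_key = 8;

  DB* db = nullptr;
  // Illustrative path; any writable directory works.
  Status s = DB::Open(options, "/tmp/block_protection_example", &db);
  assert(s.ok());

  s = db->Put(WriteOptions(), "key", "value");
  assert(s.ok());
  std::string value;
  s = db->Get(ReadOptions(), "key", &value);
  assert(s.ok() && value == "value");

  delete db;
  return 0;
}

With protection enabled, reads served from block cache verify the per key-value checksum as block iterators consume entries; a mismatch invalidates the iterator and surfaces as a Corruption status, which is the behavior the CorruptEntry tests above assert.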