diff --git a/HISTORY.md b/HISTORY.md
index ad6b66c9999..f11553fe684 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,13 @@
# Rocksdb Change Log
## Unreleased
+### New Features
+* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in the block cache (#11287).
+* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache.
+
+### Public API Changes
+* Added `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists.
+
+## 8.2.0 (04/24/2023)
### Public API Changes
* `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined.
* Add `multi_get_for_update` to C API.
@@ -14,6 +22,7 @@
### Bug Fixes
* In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size.
+* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet.
### New Features
* Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called.
@@ -21,9 +30,6 @@
* Added statistics tickers BYTES_COMPRESSED_FROM, BYTES_COMPRESSED_TO, BYTES_COMPRESSION_BYPASSED, BYTES_COMPRESSION_REJECTED, NUMBER_BLOCK_COMPRESSION_BYPASSED, and NUMBER_BLOCK_COMPRESSION_REJECTED. Disabled/deprecated histograms BYTES_COMPRESSED and BYTES_DECOMPRESSED, and ticker NUMBER_BLOCK_NOT_COMPRESSED. The new tickers offer more insight into compression ratios, rejected vs. disabled compression, etc. (#11388)
* New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction.
-### Bug Fixes
-* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet.
-
## 8.1.0 (03/18/2023)
### Behavior changes
* Compaction output file cutting logic now considers range tombstone start keys. For example, the SST partitioner may now receive a PartitionerRequest for range tombstone start keys.
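
A minimal usage sketch for the new `block_protection_bytes_per_key` option described above (illustrative only, not part of this patch; the DB path and the 8-byte width are arbitrary choices):

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Enable per key-value checksums for blocks held in block cache.
  // The value is the checksum width in bytes; only 0 (disabled), 1, 2, 4,
  // and 8 are accepted (see the db/column_family.cc validation below).
  options.block_protection_bytes_per_key = 8;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_example", &db);
  assert(s.ok());
  delete db;
  return 0;
}
```
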
diff --git a/TARGETS b/TARGETS
index 2514e09a7cd..bbd4530cf3b 100644
--- a/TARGETS
+++ b/TARGETS
@@ -354,351 +354,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"//folly/synchronization:distributed_mutex",
], headers=None, link_whole=False, extra_test_libs=False)
-cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
- "cache/cache.cc",
- "cache/cache_entry_roles.cc",
- "cache/cache_helpers.cc",
- "cache/cache_key.cc",
- "cache/cache_reservation_manager.cc",
- "cache/charged_cache.cc",
- "cache/clock_cache.cc",
- "cache/compressed_secondary_cache.cc",
- "cache/lru_cache.cc",
- "cache/secondary_cache.cc",
- "cache/secondary_cache_adapter.cc",
- "cache/sharded_cache.cc",
- "db/arena_wrapped_db_iter.cc",
- "db/blob/blob_contents.cc",
- "db/blob/blob_fetcher.cc",
- "db/blob/blob_file_addition.cc",
- "db/blob/blob_file_builder.cc",
- "db/blob/blob_file_cache.cc",
- "db/blob/blob_file_garbage.cc",
- "db/blob/blob_file_meta.cc",
- "db/blob/blob_file_reader.cc",
- "db/blob/blob_garbage_meter.cc",
- "db/blob/blob_log_format.cc",
- "db/blob/blob_log_sequential_reader.cc",
- "db/blob/blob_log_writer.cc",
- "db/blob/blob_source.cc",
- "db/blob/prefetch_buffer_collection.cc",
- "db/builder.cc",
- "db/c.cc",
- "db/column_family.cc",
- "db/compaction/compaction.cc",
- "db/compaction/compaction_iterator.cc",
- "db/compaction/compaction_job.cc",
- "db/compaction/compaction_outputs.cc",
- "db/compaction/compaction_picker.cc",
- "db/compaction/compaction_picker_fifo.cc",
- "db/compaction/compaction_picker_level.cc",
- "db/compaction/compaction_picker_universal.cc",
- "db/compaction/compaction_service_job.cc",
- "db/compaction/compaction_state.cc",
- "db/compaction/sst_partitioner.cc",
- "db/compaction/subcompaction_state.cc",
- "db/convenience.cc",
- "db/db_filesnapshot.cc",
- "db/db_impl/compacted_db_impl.cc",
- "db/db_impl/db_impl.cc",
- "db/db_impl/db_impl_compaction_flush.cc",
- "db/db_impl/db_impl_debug.cc",
- "db/db_impl/db_impl_experimental.cc",
- "db/db_impl/db_impl_files.cc",
- "db/db_impl/db_impl_open.cc",
- "db/db_impl/db_impl_readonly.cc",
- "db/db_impl/db_impl_secondary.cc",
- "db/db_impl/db_impl_write.cc",
- "db/db_info_dumper.cc",
- "db/db_iter.cc",
- "db/dbformat.cc",
- "db/error_handler.cc",
- "db/event_helpers.cc",
- "db/experimental.cc",
- "db/external_sst_file_ingestion_job.cc",
- "db/file_indexer.cc",
- "db/flush_job.cc",
- "db/flush_scheduler.cc",
- "db/forward_iterator.cc",
- "db/import_column_family_job.cc",
- "db/internal_stats.cc",
- "db/log_reader.cc",
- "db/log_writer.cc",
- "db/logs_with_prep_tracker.cc",
- "db/malloc_stats.cc",
- "db/memtable.cc",
- "db/memtable_list.cc",
- "db/merge_helper.cc",
- "db/merge_operator.cc",
- "db/output_validator.cc",
- "db/periodic_task_scheduler.cc",
- "db/range_del_aggregator.cc",
- "db/range_tombstone_fragmenter.cc",
- "db/repair.cc",
- "db/seqno_to_time_mapping.cc",
- "db/snapshot_impl.cc",
- "db/table_cache.cc",
- "db/table_properties_collector.cc",
- "db/transaction_log_impl.cc",
- "db/trim_history_scheduler.cc",
- "db/version_builder.cc",
- "db/version_edit.cc",
- "db/version_edit_handler.cc",
- "db/version_set.cc",
- "db/wal_edit.cc",
- "db/wal_manager.cc",
- "db/wide/wide_column_serialization.cc",
- "db/wide/wide_columns.cc",
- "db/write_batch.cc",
- "db/write_batch_base.cc",
- "db/write_controller.cc",
- "db/write_stall_stats.cc",
- "db/write_thread.cc",
- "env/composite_env.cc",
- "env/env.cc",
- "env/env_chroot.cc",
- "env/env_encryption.cc",
- "env/env_posix.cc",
- "env/file_system.cc",
- "env/file_system_tracer.cc",
- "env/fs_posix.cc",
- "env/fs_remap.cc",
- "env/io_posix.cc",
- "env/mock_env.cc",
- "env/unique_id_gen.cc",
- "file/delete_scheduler.cc",
- "file/file_prefetch_buffer.cc",
- "file/file_util.cc",
- "file/filename.cc",
- "file/line_file_reader.cc",
- "file/random_access_file_reader.cc",
- "file/read_write_util.cc",
- "file/readahead_raf.cc",
- "file/sequence_file_reader.cc",
- "file/sst_file_manager_impl.cc",
- "file/writable_file_writer.cc",
- "logging/auto_roll_logger.cc",
- "logging/event_logger.cc",
- "logging/log_buffer.cc",
- "memory/arena.cc",
- "memory/concurrent_arena.cc",
- "memory/jemalloc_nodump_allocator.cc",
- "memory/memkind_kmem_allocator.cc",
- "memory/memory_allocator.cc",
- "memtable/alloc_tracker.cc",
- "memtable/hash_linklist_rep.cc",
- "memtable/hash_skiplist_rep.cc",
- "memtable/skiplistrep.cc",
- "memtable/vectorrep.cc",
- "memtable/write_buffer_manager.cc",
- "monitoring/histogram.cc",
- "monitoring/histogram_windowing.cc",
- "monitoring/in_memory_stats_history.cc",
- "monitoring/instrumented_mutex.cc",
- "monitoring/iostats_context.cc",
- "monitoring/perf_context.cc",
- "monitoring/perf_level.cc",
- "monitoring/persistent_stats_history.cc",
- "monitoring/statistics.cc",
- "monitoring/thread_status_impl.cc",
- "monitoring/thread_status_updater.cc",
- "monitoring/thread_status_updater_debug.cc",
- "monitoring/thread_status_util.cc",
- "monitoring/thread_status_util_debug.cc",
- "options/cf_options.cc",
- "options/configurable.cc",
- "options/customizable.cc",
- "options/db_options.cc",
- "options/options.cc",
- "options/options_helper.cc",
- "options/options_parser.cc",
- "port/mmap.cc",
- "port/port_posix.cc",
- "port/stack_trace.cc",
- "port/win/env_default.cc",
- "port/win/env_win.cc",
- "port/win/io_win.cc",
- "port/win/port_win.cc",
- "port/win/win_logger.cc",
- "port/win/win_thread.cc",
- "table/adaptive/adaptive_table_factory.cc",
- "table/block_based/binary_search_index_reader.cc",
- "table/block_based/block.cc",
- "table/block_based/block_based_table_builder.cc",
- "table/block_based/block_based_table_factory.cc",
- "table/block_based/block_based_table_iterator.cc",
- "table/block_based/block_based_table_reader.cc",
- "table/block_based/block_builder.cc",
- "table/block_based/block_cache.cc",
- "table/block_based/block_prefetcher.cc",
- "table/block_based/block_prefix_index.cc",
- "table/block_based/data_block_footer.cc",
- "table/block_based/data_block_hash_index.cc",
- "table/block_based/filter_block_reader_common.cc",
- "table/block_based/filter_policy.cc",
- "table/block_based/flush_block_policy.cc",
- "table/block_based/full_filter_block.cc",
- "table/block_based/hash_index_reader.cc",
- "table/block_based/index_builder.cc",
- "table/block_based/index_reader_common.cc",
- "table/block_based/parsed_full_filter_block.cc",
- "table/block_based/partitioned_filter_block.cc",
- "table/block_based/partitioned_index_iterator.cc",
- "table/block_based/partitioned_index_reader.cc",
- "table/block_based/reader_common.cc",
- "table/block_based/uncompression_dict_reader.cc",
- "table/block_fetcher.cc",
- "table/compaction_merging_iterator.cc",
- "table/cuckoo/cuckoo_table_builder.cc",
- "table/cuckoo/cuckoo_table_factory.cc",
- "table/cuckoo/cuckoo_table_reader.cc",
- "table/format.cc",
- "table/get_context.cc",
- "table/iterator.cc",
- "table/merging_iterator.cc",
- "table/meta_blocks.cc",
- "table/persistent_cache_helper.cc",
- "table/plain/plain_table_bloom.cc",
- "table/plain/plain_table_builder.cc",
- "table/plain/plain_table_factory.cc",
- "table/plain/plain_table_index.cc",
- "table/plain/plain_table_key_coding.cc",
- "table/plain/plain_table_reader.cc",
- "table/sst_file_dumper.cc",
- "table/sst_file_reader.cc",
- "table/sst_file_writer.cc",
- "table/table_factory.cc",
- "table/table_properties.cc",
- "table/two_level_iterator.cc",
- "table/unique_id.cc",
- "test_util/sync_point.cc",
- "test_util/sync_point_impl.cc",
- "test_util/transaction_test_util.cc",
- "tools/dump/db_dump_tool.cc",
- "tools/io_tracer_parser_tool.cc",
- "tools/ldb_cmd.cc",
- "tools/ldb_tool.cc",
- "tools/sst_dump_tool.cc",
- "trace_replay/block_cache_tracer.cc",
- "trace_replay/io_tracer.cc",
- "trace_replay/trace_record.cc",
- "trace_replay/trace_record_handler.cc",
- "trace_replay/trace_record_result.cc",
- "trace_replay/trace_replay.cc",
- "util/async_file_reader.cc",
- "util/build_version.cc",
- "util/cleanable.cc",
- "util/coding.cc",
- "util/compaction_job_stats_impl.cc",
- "util/comparator.cc",
- "util/compression.cc",
- "util/compression_context_cache.cc",
- "util/concurrent_task_limiter_impl.cc",
- "util/crc32c.cc",
- "util/crc32c_arm64.cc",
- "util/data_structure.cc",
- "util/dynamic_bloom.cc",
- "util/file_checksum_helper.cc",
- "util/hash.cc",
- "util/murmurhash.cc",
- "util/random.cc",
- "util/rate_limiter.cc",
- "util/ribbon_config.cc",
- "util/slice.cc",
- "util/status.cc",
- "util/stderr_logger.cc",
- "util/string_util.cc",
- "util/thread_local.cc",
- "util/threadpool_imp.cc",
- "util/xxhash.cc",
- "utilities/agg_merge/agg_merge.cc",
- "utilities/backup/backup_engine.cc",
- "utilities/blob_db/blob_compaction_filter.cc",
- "utilities/blob_db/blob_db.cc",
- "utilities/blob_db/blob_db_impl.cc",
- "utilities/blob_db/blob_db_impl_filesnapshot.cc",
- "utilities/blob_db/blob_dump_tool.cc",
- "utilities/blob_db/blob_file.cc",
- "utilities/cache_dump_load.cc",
- "utilities/cache_dump_load_impl.cc",
- "utilities/cassandra/cassandra_compaction_filter.cc",
- "utilities/cassandra/format.cc",
- "utilities/cassandra/merge_operator.cc",
- "utilities/checkpoint/checkpoint_impl.cc",
- "utilities/compaction_filters.cc",
- "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc",
- "utilities/convenience/info_log_finder.cc",
- "utilities/counted_fs.cc",
- "utilities/debug.cc",
- "utilities/env_mirror.cc",
- "utilities/env_timed.cc",
- "utilities/fault_injection_env.cc",
- "utilities/fault_injection_fs.cc",
- "utilities/fault_injection_secondary_cache.cc",
- "utilities/leveldb_options/leveldb_options.cc",
- "utilities/memory/memory_util.cc",
- "utilities/merge_operators.cc",
- "utilities/merge_operators/bytesxor.cc",
- "utilities/merge_operators/max.cc",
- "utilities/merge_operators/put.cc",
- "utilities/merge_operators/sortlist.cc",
- "utilities/merge_operators/string_append/stringappend.cc",
- "utilities/merge_operators/string_append/stringappend2.cc",
- "utilities/merge_operators/uint64add.cc",
- "utilities/object_registry.cc",
- "utilities/option_change_migration/option_change_migration.cc",
- "utilities/options/options_util.cc",
- "utilities/persistent_cache/block_cache_tier.cc",
- "utilities/persistent_cache/block_cache_tier_file.cc",
- "utilities/persistent_cache/block_cache_tier_metadata.cc",
- "utilities/persistent_cache/persistent_cache_tier.cc",
- "utilities/persistent_cache/volatile_tier_impl.cc",
- "utilities/simulator_cache/cache_simulator.cc",
- "utilities/simulator_cache/sim_cache.cc",
- "utilities/table_properties_collectors/compact_on_deletion_collector.cc",
- "utilities/trace/file_trace_reader_writer.cc",
- "utilities/trace/replayer_impl.cc",
- "utilities/transactions/lock/lock_manager.cc",
- "utilities/transactions/lock/point/point_lock_manager.cc",
- "utilities/transactions/lock/point/point_lock_tracker.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
- "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
- "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
- "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
- "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
- "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
- "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
- "utilities/transactions/optimistic_transaction.cc",
- "utilities/transactions/optimistic_transaction_db_impl.cc",
- "utilities/transactions/pessimistic_transaction.cc",
- "utilities/transactions/pessimistic_transaction_db.cc",
- "utilities/transactions/snapshot_checker.cc",
- "utilities/transactions/transaction_base.cc",
- "utilities/transactions/transaction_db_mutex_impl.cc",
- "utilities/transactions/transaction_util.cc",
- "utilities/transactions/write_prepared_txn.cc",
- "utilities/transactions/write_prepared_txn_db.cc",
- "utilities/transactions/write_unprepared_txn.cc",
- "utilities/transactions/write_unprepared_txn_db.cc",
- "utilities/ttl/db_ttl_impl.cc",
- "utilities/wal_filter.cc",
- "utilities/write_batch_with_index/write_batch_with_index.cc",
- "utilities/write_batch_with_index/write_batch_with_index_internal.cc",
- ], deps=[
- "//folly/container:f14_hash",
- "//folly/experimental/coro:blocking_wait",
- "//folly/experimental/coro:collect",
- "//folly/experimental/coro:coroutine",
- "//folly/experimental/coro:task",
- "//folly/synchronization:distributed_mutex",
- ], headers=None, link_whole=True, extra_test_libs=False)
+cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"db/db_test_util.cc",
diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py
index d7e9b645be4..a9e7b447df3 100755
--- a/buckifier/buckify_rocksdb.py
+++ b/buckifier/buckify_rocksdb.py
@@ -154,16 +154,9 @@ def generate_targets(repo_path, deps_map):
# rocksdb_whole_archive_lib
TARGETS.add_library(
"rocksdb_whole_archive_lib",
- src_mk["LIB_SOURCES"] +
- # always add range_tree, it's only excluded on ppc64, which we don't use internally
- src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"],
+ [],
deps=[
- "//folly/container:f14_hash",
- "//folly/experimental/coro:blocking_wait",
- "//folly/experimental/coro:collect",
- "//folly/experimental/coro:coroutine",
- "//folly/experimental/coro:task",
- "//folly/synchronization:distributed_mutex",
+ ":rocksdb_lib",
],
headers=None,
extra_external_deps="",
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 12be0babef9..80fbbe88f6d 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -1282,25 +1282,20 @@ size_t ClockCacheShard<Table>::GetTableAddressCount() const {
// Explicit instantiation
template class ClockCacheShard<HyperClockTable>;
-HyperClockCache::HyperClockCache(
- size_t capacity, size_t estimated_value_size, int num_shard_bits,
- bool strict_capacity_limit,
- CacheMetadataChargePolicy metadata_charge_policy,
-    std::shared_ptr<MemoryAllocator> memory_allocator)
- : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
- std::move(memory_allocator)) {
- assert(estimated_value_size > 0 ||
- metadata_charge_policy != kDontChargeCacheMetadata);
+HyperClockCache::HyperClockCache(const HyperClockCacheOptions& opts)
+ : ShardedCache(opts) {
+ assert(opts.estimated_entry_charge > 0 ||
+ opts.metadata_charge_policy != kDontChargeCacheMetadata);
// TODO: should not need to go through two levels of pointer indirection to
// get to table entries
size_t per_shard = GetPerShardCapacity();
MemoryAllocator* alloc = this->memory_allocator();
- const Cache::EvictionCallback* eviction_callback = &eviction_callback_;
- InitShards([=](Shard* cs) {
- HyperClockTable::Opts opts;
- opts.estimated_value_size = estimated_value_size;
- new (cs) Shard(per_shard, strict_capacity_limit, metadata_charge_policy,
- alloc, eviction_callback, opts);
+ InitShards([&](Shard* cs) {
+ HyperClockTable::Opts table_opts;
+ table_opts.estimated_value_size = opts.estimated_entry_charge;
+ new (cs) Shard(per_shard, opts.strict_capacity_limit,
+ opts.metadata_charge_policy, alloc, &eviction_callback_,
+ table_opts);
});
}
@@ -1460,21 +1455,23 @@ std::shared_ptr<Cache> NewClockCache(
}
std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
- auto my_num_shard_bits = num_shard_bits;
- if (my_num_shard_bits >= 20) {
+  // Local copy of the options, sanitized below
+ HyperClockCacheOptions opts = *this;
+ if (opts.num_shard_bits >= 20) {
return nullptr; // The cache cannot be sharded into too many fine pieces.
}
- if (my_num_shard_bits < 0) {
+ if (opts.num_shard_bits < 0) {
// Use larger shard size to reduce risk of large entries clustering
// or skewing individual shards.
constexpr size_t min_shard_size = 32U * 1024U * 1024U;
- my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size);
+ opts.num_shard_bits =
+ GetDefaultCacheShardBits(opts.capacity, min_shard_size);
}
-  std::shared_ptr<Cache> cache = std::make_shared<HyperClockCache>(
-      capacity, estimated_entry_charge, my_num_shard_bits,
-      strict_capacity_limit, metadata_charge_policy, memory_allocator);
-  if (secondary_cache) {
-    cache = std::make_shared<CacheWithSecondaryAdapter>(cache, secondary_cache);
+  std::shared_ptr<Cache> cache =
+      std::make_shared<HyperClockCache>(opts);
+  if (opts.secondary_cache) {
+    cache = std::make_shared<CacheWithSecondaryAdapter>(cache,
+                                                        opts.secondary_cache);
}
return cache;
}
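
For context on the hunk above, the options-object style introduced here replaces the deprecated long-parameter-list factories. A hedged sketch of the new construction pattern (values are illustrative; `MakeSharedCache()` returns nullptr for invalid options such as `num_shard_bits >= 20`):

```cpp
#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeCaches() {
  // LRU cache: fill an options struct instead of calling NewLRUCache(...)
  // with a long positional parameter list.
  rocksdb::LRUCacheOptions lru_opts;
  lru_opts.capacity = size_t{1} << 30;  // 1 GiB
  std::shared_ptr<rocksdb::Cache> lru_cache = lru_opts.MakeSharedCache();

  // HyperClockCache: same pattern; the estimated entry charge is a rough
  // average block size, and must be > 0 under kDontChargeCacheMetadata.
  rocksdb::HyperClockCacheOptions hcc_opts(
      /*_capacity=*/size_t{1} << 30,
      /*_estimated_entry_charge=*/8 * 1024);
  std::shared_ptr<rocksdb::Cache> hcc_cache = hcc_opts.MakeSharedCache();

  // Either cache satisfies the same Cache interface; return one of them.
  return hcc_cache ? hcc_cache : lru_cache;
}
```
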
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index fc5aef6cb4d..a9515146a28 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -682,10 +682,7 @@ class HyperClockCache
public:
  using Shard = ClockCacheShard<HyperClockTable>;
- HyperClockCache(size_t capacity, size_t estimated_value_size,
- int num_shard_bits, bool strict_capacity_limit,
- CacheMetadataChargePolicy metadata_charge_policy,
-                  std::shared_ptr<MemoryAllocator> memory_allocator);
+ explicit HyperClockCache(const HyperClockCacheOptions& opts);
const char* Name() const override { return "HyperClockCache"; }
diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index affea8c54f7..2408afc0a17 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -17,23 +17,8 @@
namespace ROCKSDB_NAMESPACE {
CompressedSecondaryCache::CompressedSecondaryCache(
- size_t capacity, int num_shard_bits, bool strict_capacity_limit,
- double high_pri_pool_ratio, double low_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
- CacheMetadataChargePolicy metadata_charge_policy,
- CompressionType compression_type, uint32_t compress_format_version,
- bool enable_custom_split_merge,
- const CacheEntryRoleSet& do_not_compress_roles)
- : cache_options_(capacity, num_shard_bits, strict_capacity_limit,
- high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
- use_adaptive_mutex, metadata_charge_policy,
- compression_type, compress_format_version,
- enable_custom_split_merge, do_not_compress_roles) {
- cache_ =
- NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
- high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
- metadata_charge_policy, low_pri_pool_ratio);
-}
+ const CompressedSecondaryCacheOptions& opts)
+ : cache_(opts.LRUCacheOptions::MakeSharedCache()), cache_options_(opts) {}
CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); }
@@ -311,31 +296,9 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
}
}
-std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
- size_t capacity, int num_shard_bits, bool strict_capacity_limit,
- double high_pri_pool_ratio, double low_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
- CacheMetadataChargePolicy metadata_charge_policy,
- CompressionType compression_type, uint32_t compress_format_version,
- bool enable_custom_split_merge,
- const CacheEntryRoleSet& do_not_compress_roles) {
-  return std::make_shared<CompressedSecondaryCache>(
- capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
- low_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
- metadata_charge_policy, compression_type, compress_format_version,
- enable_custom_split_merge, do_not_compress_roles);
-}
-
-std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
- const CompressedSecondaryCacheOptions& opts) {
- // The secondary_cache is disabled for this LRUCache instance.
- assert(opts.secondary_cache == nullptr);
- return NewCompressedSecondaryCache(
- opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit,
- opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator,
- opts.use_adaptive_mutex, opts.metadata_charge_policy,
- opts.compression_type, opts.compress_format_version,
- opts.enable_custom_split_merge, opts.do_not_compress_roles);
+std::shared_ptr<SecondaryCache>
+CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const {
+  return std::make_shared<CompressedSecondaryCache>(*this);
}
} // namespace ROCKSDB_NAMESPACE
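
The new `MakeSharedSecondaryCache()` mirrors `MakeSharedCache()` on the primary-cache options. A sketch of how the two compose into a tiered setup (assumed wiring through `LRUCacheOptions::secondary_cache`; sizes are illustrative):

```cpp
#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"

std::shared_ptr<rocksdb::Cache> MakeTieredCache() {
  // Compressed secondary tier, built via the new factory method.
  rocksdb::CompressedSecondaryCacheOptions sec_opts;
  sec_opts.capacity = size_t{1} << 28;  // 256 MiB
  std::shared_ptr<rocksdb::SecondaryCache> secondary =
      sec_opts.MakeSharedSecondaryCache();

  // Primary LRU tier wrapping the compressed secondary tier.
  rocksdb::LRUCacheOptions lru_opts;
  lru_opts.capacity = size_t{1} << 30;  // 1 GiB
  lru_opts.secondary_cache = secondary;
  return lru_opts.MakeSharedCache();
}
```
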
diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h
index 7b45ca8bd91..d20f2d1d7a2 100644
--- a/cache/compressed_secondary_cache.h
+++ b/cache/compressed_secondary_cache.h
@@ -69,18 +69,8 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
class CompressedSecondaryCache : public SecondaryCache {
public:
- CompressedSecondaryCache(
- size_t capacity, int num_shard_bits, bool strict_capacity_limit,
- double high_pri_pool_ratio, double low_pri_pool_ratio,
-      std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
- bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
- CacheMetadataChargePolicy metadata_charge_policy =
- kDefaultCacheMetadataChargePolicy,
- CompressionType compression_type = CompressionType::kLZ4Compression,
- uint32_t compress_format_version = 2,
- bool enable_custom_split_merge = false,
- const CacheEntryRoleSet& do_not_compress_roles = {
- CacheEntryRole::kFilterBlock});
+ explicit CompressedSecondaryCache(
+ const CompressedSecondaryCacheOptions& opts);
~CompressedSecondaryCache() override;
const char* Name() const override { return "CompressedSecondaryCache"; }
diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc
index 1e41fc142b4..18b51ccf8fe 100644
--- a/cache/compressed_secondary_cache_test.cc
+++ b/cache/compressed_secondary_cache_test.cc
@@ -626,8 +626,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
   std::unique_ptr<CompressedSecondaryCache> sec_cache =
-      std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
-                                                 allocator);
+      std::make_unique<CompressedSecondaryCache>(
+          CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
+                                          allocator));
Random rnd(301);
// 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
size_t str_size{8500};
@@ -678,7 +679,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::string str = str1 + str2 + str3;
   std::unique_ptr<CompressedSecondaryCache> sec_cache =
-      std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0);
+      std::make_unique<CompressedSecondaryCache>(
+          CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0));
size_t charge{0};
CacheAllocationPtr value =
sec_cache->MergeChunksIntoValue(chunks_head, charge);
@@ -708,8 +710,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
   std::unique_ptr<CompressedSecondaryCache> sec_cache =
-      std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
-                                                 allocator);
+      std::make_unique<CompressedSecondaryCache>(
+          CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
+                                          allocator));
Random rnd(301);
// 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
size_t str_size{8500};
diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc
index 3b4e80ef87b..02119c81900 100644
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@@ -646,23 +646,15 @@ void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
str.append(buffer);
}
-LRUCache::LRUCache(size_t capacity, int num_shard_bits,
- bool strict_capacity_limit, double high_pri_pool_ratio,
- double low_pri_pool_ratio,
-                   std::shared_ptr<MemoryAllocator> allocator,
- bool use_adaptive_mutex,
- CacheMetadataChargePolicy metadata_charge_policy)
- : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
- std::move(allocator)) {
+LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) {
size_t per_shard = GetPerShardCapacity();
MemoryAllocator* alloc = memory_allocator();
- const EvictionCallback* eviction_callback = &eviction_callback_;
- InitShards([=](LRUCacheShard* cs) {
- new (cs) LRUCacheShard(per_shard, strict_capacity_limit,
- high_pri_pool_ratio, low_pri_pool_ratio,
- use_adaptive_mutex, metadata_charge_policy,
- /* max_upper_hash_bits */ 32 - num_shard_bits, alloc,
- eviction_callback);
+ InitShards([&](LRUCacheShard* cs) {
+ new (cs) LRUCacheShard(per_shard, opts.strict_capacity_limit,
+ opts.high_pri_pool_ratio, opts.low_pri_pool_ratio,
+ opts.use_adaptive_mutex, opts.metadata_charge_policy,
+ /* max_upper_hash_bits */ 32 - opts.num_shard_bits,
+ alloc, &eviction_callback_);
});
}
@@ -692,13 +684,7 @@ double LRUCache::GetHighPriPoolRatio() {
} // namespace lru_cache
-std::shared_ptr<Cache> NewLRUCache(
-    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-    double high_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
-    CacheMetadataChargePolicy metadata_charge_policy,
-    const std::shared_ptr<SecondaryCache>& secondary_cache,
-    double low_pri_pool_ratio) {
+std::shared_ptr<Cache> LRUCacheOptions::MakeSharedCache() const {
if (num_shard_bits >= 20) {
return nullptr; // The cache cannot be sharded into too many fine pieces.
}
@@ -714,36 +700,15 @@ std::shared_ptr NewLRUCache(
// Invalid high_pri_pool_ratio and low_pri_pool_ratio combination
return nullptr;
}
- if (num_shard_bits < 0) {
- num_shard_bits = GetDefaultCacheShardBits(capacity);
+  // Local copy of the options, sanitized below
+ LRUCacheOptions opts = *this;
+ if (opts.num_shard_bits < 0) {
+ opts.num_shard_bits = GetDefaultCacheShardBits(capacity);
}
-  std::shared_ptr<Cache> cache = std::make_shared<LRUCache>(
- capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
- low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex,
- metadata_charge_policy);
+  std::shared_ptr<Cache> cache = std::make_shared<LRUCache>(opts);
if (secondary_cache) {
    cache = std::make_shared<CacheWithSecondaryAdapter>(cache, secondary_cache);
}
return cache;
}
-
-std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
- return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
- cache_opts.strict_capacity_limit,
- cache_opts.high_pri_pool_ratio,
- cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
- cache_opts.metadata_charge_policy,
- cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio);
-}
-
-std::shared_ptr<Cache> NewLRUCache(
-    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-    double high_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
- CacheMetadataChargePolicy metadata_charge_policy,
- double low_pri_pool_ratio) {
- return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
- high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
- metadata_charge_policy, nullptr, low_pri_pool_ratio);
-}
} // namespace ROCKSDB_NAMESPACE
diff --git a/cache/lru_cache.h b/cache/lru_cache.h
index 554907b3bea..9e6f15062f8 100644
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@@ -446,12 +446,7 @@ class LRUCache
#endif
    : public ShardedCache<LRUCacheShard> {
public:
- LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
- double high_pri_pool_ratio, double low_pri_pool_ratio,
-           std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
- bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
- CacheMetadataChargePolicy metadata_charge_policy =
- kDontChargeCacheMetadata);
+ explicit LRUCache(const LRUCacheOptions& opts);
const char* Name() const override { return "LRUCache"; }
ObjectPtr Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc
index 9ebca3ba827..f8d518067a8 100644
--- a/cache/sharded_cache.cc
+++ b/cache/sharded_cache.cc
@@ -19,14 +19,12 @@
namespace ROCKSDB_NAMESPACE {
-ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits,
- bool strict_capacity_limit,
-                                   std::shared_ptr<MemoryAllocator> allocator)
- : Cache(std::move(allocator)),
+ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts)
+ : Cache(opts.memory_allocator),
last_id_(1),
- shard_mask_((uint32_t{1} << num_shard_bits) - 1),
- strict_capacity_limit_(strict_capacity_limit),
- capacity_(capacity) {}
+ shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1),
+ strict_capacity_limit_(opts.strict_capacity_limit),
+ capacity_(opts.capacity) {}
size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
uint32_t num_shards = GetNumShards();
diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h
index 04eaa5318ea..d689783d3c8 100644
--- a/cache/sharded_cache.h
+++ b/cache/sharded_cache.h
@@ -89,9 +89,7 @@ class CacheShardBase {
// Portions of ShardedCache that do not depend on the template parameter
class ShardedCacheBase : public Cache {
public:
- ShardedCacheBase(size_t capacity, int num_shard_bits,
- bool strict_capacity_limit,
-                   std::shared_ptr<MemoryAllocator> memory_allocator);
+ explicit ShardedCacheBase(const ShardedCacheOptions& opts);
virtual ~ShardedCacheBase() = default;
int GetNumShardBits() const;
@@ -134,10 +132,8 @@ class ShardedCache : public ShardedCacheBase {
using HashCref = typename CacheShard::HashCref;
using HandleImpl = typename CacheShard::HandleImpl;
- ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-               std::shared_ptr<MemoryAllocator> allocator)
- : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
- allocator),
+ explicit ShardedCache(const ShardedCacheOptions& opts)
+ : ShardedCacheBase(opts),
        shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
sizeof(CacheShard) * GetNumShards()))),
destroy_shards_in_dtor_(false) {}
diff --git a/db/builder.cc b/db/builder.cc
index be1ec29bf0c..eadc315c9aa 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -380,7 +380,8 @@ Status BuildTable(
MaxFileSizeForL0MetaPin(mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key*/ nullptr,
- /*allow_unprepared_value*/ false));
+ /*allow_unprepared_value*/ false,
+ mutable_cf_options.block_protection_bytes_per_key));
s = it->status();
if (s.ok() && paranoid_file_checks) {
OutputValidator file_validator(tboptions.internal_comparator,
diff --git a/db/column_family.cc b/db/column_family.cc
index 24ea46ac486..0b3fe680772 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -1428,6 +1428,12 @@ Status ColumnFamilyData::ValidateOptions(
"Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
"or 8 bytes per key.");
}
+ if (std::find(supported.begin(), supported.end(),
+ cf_options.block_protection_bytes_per_key) == supported.end()) {
+ return Status::NotSupported(
+ "Block per key-value checksum protection only supports 0, 1, 2, 4 "
+ "or 8 bytes per key.");
+ }
return s;
}
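
The effect of this validation, as a quick sketch (hypothetical path; the invalid width is deliberate): an unsupported value is rejected at open time instead of being silently accepted.

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void RejectsInvalidWidth() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.block_protection_bytes_per_key = 3;  // not one of {0, 1, 2, 4, 8}
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_example", &db);
  assert(s.IsNotSupported());  // fails ColumnFamilyData::ValidateOptions
}
```
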
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 8a326a508f4..ed152f28c60 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -504,7 +504,9 @@ void CompactionJob::GenSubcompactionBoundaries() {
FileMetaData* f = flevel->files[i].file_metadata;
      std::vector<TableReader::Anchor> my_anchors;
Status s = cfd->table_cache()->ApproximateKeyAnchors(
- read_options, icomp, *f, my_anchors);
+ read_options, icomp, *f,
+ c->mutable_cf_options()->block_protection_bytes_per_key,
+ my_anchors);
if (!s.ok() || my_anchors.empty()) {
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
}
@@ -735,7 +737,9 @@ Status CompactionJob::Run() {
*compact_->compaction->mutable_cf_options()),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
- /*allow_unprepared_value=*/false);
+ /*allow_unprepared_value=*/false,
+ compact_->compaction->mutable_cf_options()
+ ->block_protection_bytes_per_key);
auto s = iter->status();
if (s.ok() && paranoid_file_checks_) {
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 9c5784d5e02..79f8e5fd52e 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -454,7 +454,8 @@ class CompactionJobTestBase : public testing::Test {
Status s = cf_options_.table_factory->NewTableReader(
read_opts,
TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
- cfd_->internal_comparator()),
+ cfd_->internal_comparator(),
+ 0 /* block_protection_bytes_per_key */),
std::move(freader), file_size, &table_reader, false);
ASSERT_OK(s);
assert(table_reader);
diff --git a/db/convenience.cc b/db/convenience.cc
index 8ab7cbc139a..32cdfafaab2 100644
--- a/db/convenience.cc
+++ b/db/convenience.cc
@@ -64,8 +64,8 @@ Status VerifySstFileChecksum(const Options& options,
const bool kImmortal = true;
auto reader_options = TableReaderOptions(
ioptions, options.prefix_extractor, env_options, internal_comparator,
- false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
- -1 /* level */);
+ options.block_protection_bytes_per_key, false /* skip_filters */,
+ !kImmortal, false /* force_direct_prefetch */, -1 /* level */);
reader_options.largest_seqno = largest_seqno;
s = ioptions.table_factory->NewTableReader(
reader_options, std::move(file_reader), file_size, &table_reader,
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 1a136635339..8fa93d8d758 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -620,9 +620,9 @@ class MockCache : public LRUCache {
static uint32_t low_pri_insert_count;
MockCache()
- : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
- false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/,
- 0.0 /*low_pri_pool_ratio*/) {}
+ : LRUCache(LRUCacheOptions(
+ size_t{1} << 25 /*capacity*/, 0 /*num_shard_bits*/,
+ false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/)) {}
using ShardedCache::Insert;
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index fcfb777316e..4e36af1e2e9 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -2475,7 +2475,6 @@ std::vector<Status> DBImpl::MultiGet(
// Post processing (decrement reference counts and record statistics)
PERF_TIMER_GUARD(get_post_process_time);
-  autovector<SuperVersion*> superversions_to_delete;
for (auto mgd_iter : multiget_cf_data) {
auto mgd = mgd_iter.second;
diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc
index ed80760518a..adc2b36bb26 100644
--- a/db/db_statistics_test.cc
+++ b/db/db_statistics_test.cc
@@ -20,131 +20,115 @@ class DBStatisticsTest : public DBTestBase {
};
TEST_F(DBStatisticsTest, CompressionStatsTest) {
- CompressionType type;
-
- if (Snappy_Supported()) {
- type = kSnappyCompression;
- fprintf(stderr, "using snappy\n");
- } else if (Zlib_Supported()) {
- type = kZlibCompression;
- fprintf(stderr, "using zlib\n");
- } else if (BZip2_Supported()) {
- type = kBZip2Compression;
- fprintf(stderr, "using bzip2\n");
- } else if (LZ4_Supported()) {
- type = kLZ4Compression;
- fprintf(stderr, "using lz4\n");
- } else if (XPRESS_Supported()) {
- type = kXpressCompression;
- fprintf(stderr, "using xpress\n");
- } else if (ZSTD_Supported()) {
- type = kZSTD;
- fprintf(stderr, "using ZSTD\n");
- } else {
- fprintf(stderr, "skipping test, compression disabled\n");
- return;
- }
-
- Options options = CurrentOptions();
- options.compression = type;
- options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
- options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
- BlockBasedTableOptions bbto;
- bbto.enable_index_compression = false;
- options.table_factory.reset(NewBlockBasedTableFactory(bbto));
- DestroyAndReopen(options);
-
- auto PopStat = [&](Tickers t) -> uint64_t {
- return options.statistics->getAndResetTickerCount(t);
- };
-
- int kNumKeysWritten = 100;
- double compress_to = 0.5;
- // About three KVs per block
-  int len = static_cast<int>(BlockBasedTableOptions().block_size / 3);
- int uncomp_est = kNumKeysWritten * (len + 20);
-
- Random rnd(301);
- std::string buf;
+ for (CompressionType type : GetSupportedCompressions()) {
+ if (type == kNoCompression) {
+ continue;
+ }
+ if (type == kBZip2Compression) {
+ // Weird behavior in this test
+ continue;
+ }
+ SCOPED_TRACE("Compression type: " + std::to_string(type));
+
+ Options options = CurrentOptions();
+ options.compression = type;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+ BlockBasedTableOptions bbto;
+ bbto.enable_index_compression = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ auto PopStat = [&](Tickers t) -> uint64_t {
+ return options.statistics->getAndResetTickerCount(t);
+ };
+
+ int kNumKeysWritten = 100;
+ double compress_to = 0.5;
+ // About three KVs per block
+    int len = static_cast<int>(BlockBasedTableOptions().block_size / 3);
+ int uncomp_est = kNumKeysWritten * (len + 20);
+
+ Random rnd(301);
+ std::string buf;
+
+ // Check that compressions occur and are counted when compression is turned
+ // on
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ ASSERT_OK(
+ Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf)));
+ }
+ ASSERT_OK(Flush());
+ EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSED));
+ EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSED_FROM), uncomp_est / 10);
+ EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_COMPRESSED_TO),
+ uncomp_est / 10);
+
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED));
+ EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM));
+ EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO));
+
+ // And decompressions
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ EXPECT_EQ(34, PopStat(NUMBER_BLOCK_DECOMPRESSED));
+ EXPECT_NEAR2(uncomp_est, PopStat(BYTES_DECOMPRESSED_TO), uncomp_est / 10);
+ EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_DECOMPRESSED_FROM),
+ uncomp_est / 10);
- // Check that compressions occur and are counted when compression is turned on
- for (int i = 0; i < kNumKeysWritten; ++i) {
- ASSERT_OK(
- Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf)));
- }
- ASSERT_OK(Flush());
- EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSED));
- EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSED_FROM), uncomp_est / 10);
- EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_COMPRESSED_TO),
- uncomp_est / 10);
-
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED));
- EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM));
- EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO));
-
- // And decompressions
- for (int i = 0; i < kNumKeysWritten; ++i) {
- auto r = Get(Key(i));
- }
- EXPECT_EQ(34, PopStat(NUMBER_BLOCK_DECOMPRESSED));
- EXPECT_NEAR2(uncomp_est, PopStat(BYTES_DECOMPRESSED_TO), uncomp_est / 10);
- EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_DECOMPRESSED_FROM),
- uncomp_est / 10);
-
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED));
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED));
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
-
- // Check when compression is rejected.
- compress_to = 0.95;
- DestroyAndReopen(options);
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED));
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED));
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
- for (int i = 0; i < kNumKeysWritten; ++i) {
- ASSERT_OK(
- Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf)));
- }
- ASSERT_OK(Flush());
- for (int i = 0; i < kNumKeysWritten; ++i) {
- auto r = Get(Key(i));
- }
- EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
- EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_REJECTED),
- uncomp_est / 10);
-
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED));
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED));
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM));
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO));
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED));
- EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM));
- EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO));
-
- // Check when compression is disabled.
- options.compression = kNoCompression;
- DestroyAndReopen(options);
+ // Check when compression is rejected.
+ DestroyAndReopen(options);
- for (int i = 0; i < kNumKeysWritten; ++i) {
- ASSERT_OK(
- Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf)));
- }
- ASSERT_OK(Flush());
- for (int i = 0; i < kNumKeysWritten; ++i) {
- auto r = Get(Key(i));
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomBinaryString(len)));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+ EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_REJECTED),
+ uncomp_est / 10);
+
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED));
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED));
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM));
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO));
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED));
+ EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM));
+ EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO));
+
+ // Check when compression is disabled.
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomBinaryString(len)));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+ EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_BYPASSED),
+ uncomp_est / 10);
+
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED));
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+ EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED));
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM));
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO));
+ EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED));
+ EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM));
+ EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO));
}
- EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
- EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_BYPASSED),
- uncomp_est / 10);
-
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED));
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
- EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED));
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM));
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO));
- EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED));
- EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM));
- EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO));
}
TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) {
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 98bd6050a27..ca9b6fb9bbe 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -678,6 +678,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
TableReaderOptions(
*cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
env_options_, cfd_->internal_comparator(),
+ sv->mutable_cf_options.block_protection_bytes_per_key,
/*skip_filters*/ false, /*immortal*/ false,
/*force_direct_prefetch*/ false, /*level*/ -1,
/*block_cache_tracer*/ nullptr,
diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc
index eddade83744..75a7c599b88 100644
--- a/db/forward_iterator.cc
+++ b/db/forward_iterator.cc
@@ -36,7 +36,7 @@ class ForwardLevelIterator : public InternalIterator {
const ColumnFamilyData* const cfd, const ReadOptions& read_options,
      const std::vector<FileMetaData*>& files,
      const std::shared_ptr<const SliceTransform>& prefix_extractor,
- bool allow_unprepared_value)
+ bool allow_unprepared_value, uint8_t block_protection_bytes_per_key)
: cfd_(cfd),
read_options_(read_options),
files_(files),
@@ -45,7 +45,8 @@ class ForwardLevelIterator : public InternalIterator {
file_iter_(nullptr),
pinned_iters_mgr_(nullptr),
prefix_extractor_(prefix_extractor),
- allow_unprepared_value_(allow_unprepared_value) {
+ allow_unprepared_value_(allow_unprepared_value),
+ block_protection_bytes_per_key_(block_protection_bytes_per_key) {
status_.PermitUncheckedError(); // Allow uninitialized status through
}
@@ -87,7 +88,8 @@ class ForwardLevelIterator : public InternalIterator {
/*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
/*max_file_size_for_l0_meta_pin=*/0,
/*smallest_compaction_key=*/nullptr,
- /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_,
+ block_protection_bytes_per_key_);
file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
valid_ = false;
if (!range_del_agg.IsEmpty()) {
@@ -211,6 +213,7 @@ class ForwardLevelIterator : public InternalIterator {
// Kept alive by ForwardIterator::sv_->mutable_cf_options
  const std::shared_ptr<const SliceTransform>& prefix_extractor_;
const bool allow_unprepared_value_;
+ const uint8_t block_protection_bytes_per_key_;
};
ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
@@ -738,7 +741,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
/*skip_filters=*/false, /*level=*/-1,
MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
- /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_,
+ sv_->mutable_cf_options.block_protection_bytes_per_key));
}
BuildLevelIterators(vstorage, sv_);
current_ = nullptr;
@@ -819,7 +823,8 @@ void ForwardIterator::RenewIterators() {
/*skip_filters=*/false, /*level=*/-1,
MaxFileSizeForL0MetaPin(svnew->mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
- /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_,
+ svnew->mutable_cf_options.block_protection_bytes_per_key));
}
for (auto* f : l0_iters_) {
@@ -863,7 +868,8 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage,
} else {
level_iters_.push_back(new ForwardLevelIterator(
cfd_, read_options_, level_files,
- sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_));
+ sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_,
+ sv->mutable_cf_options.block_protection_bytes_per_key));
}
}
}
@@ -885,7 +891,8 @@ void ForwardIterator::ResetIncompleteIterators() {
/*skip_filters=*/false, /*level=*/-1,
MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
- /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_,
+ sv_->mutable_cf_options.block_protection_bytes_per_key);
l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
}
diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc
index 12d2519e9e6..9a8b48dd054 100644
--- a/db/import_column_family_job.cc
+++ b/db/import_column_family_job.cc
@@ -250,6 +250,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
TableReaderOptions(
*cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
env_options_, cfd_->internal_comparator(),
+ sv->mutable_cf_options.block_protection_bytes_per_key,
/*skip_filters*/ false, /*immortal*/ false,
/*force_direct_prefetch*/ false, /*level*/ -1,
/*block_cache_tracer*/ nullptr,
diff --git a/db/kv_checksum.h b/db/kv_checksum.h
index bce507fcf98..53c02485ffa 100644
--- a/db/kv_checksum.h
+++ b/db/kv_checksum.h
@@ -46,6 +46,8 @@ template <typename T>
class ProtectionInfoKVOC;
template <typename T>
class ProtectionInfoKVOS;
+template <typename T>
+class ProtectionInfoKV;
// Aliases for 64-bit protection infos.
using ProtectionInfo64 = ProtectionInfo<uint64_t>;
@@ -64,13 +66,13 @@ class ProtectionInfo {
  ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
                                  const SliceParts& value,
                                  ValueType op_type) const;
-
-  T GetVal() const { return val_; }
+  ProtectionInfoKV<T> ProtectKV(const Slice& key, const Slice& value) const;
private:
  friend class ProtectionInfoKVO<T>;
  friend class ProtectionInfoKVOS<T>;
  friend class ProtectionInfoKVOC<T>;
+  friend class ProtectionInfoKV<T>;
// Each field is hashed with an independent value so we can catch fields being
// swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall,
@@ -89,8 +91,47 @@ class ProtectionInfo {
static_assert(sizeof(ProtectionInfo) == sizeof(T), "");
}
+ T GetVal() const { return val_; }
void SetVal(T val) { val_ = val; }
+  void Encode(uint8_t len, char* dst) const {
+    assert(sizeof(val_) >= len);
+    switch (len) {
+      case 1:
+        dst[0] = static_cast<uint8_t>(val_);
+        break;
+      case 2:
+        EncodeFixed16(dst, static_cast<uint16_t>(val_));
+        break;
+      case 4:
+        EncodeFixed32(dst, static_cast<uint32_t>(val_));
+        break;
+      case 8:
+        EncodeFixed64(dst, static_cast<uint64_t>(val_));
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  bool Verify(uint8_t len, const char* checksum_ptr) const {
+    assert(sizeof(val_) >= len);
+    switch (len) {
+      case 1:
+        return static_cast<uint8_t>(checksum_ptr[0]) ==
+               static_cast<uint8_t>(val_);
+      case 2:
+        return DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(val_);
+      case 4:
+        return DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(val_);
+      case 8:
+        return DecodeFixed64(checksum_ptr) == static_cast<uint64_t>(val_);
+      default:
+        assert(false);
+        return false;
+    }
+  }
+
T val_ = 0;
};
@@ -113,7 +154,14 @@ class ProtectionInfoKVO {
void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
void UpdateO(ValueType old_op_type, ValueType new_op_type);
- T GetVal() const { return info_.GetVal(); }
+  // Encodes this protection info into `len` bytes and stores them in `dst`.
+ void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
+  // Verifies this protection info against the protection info encoded by
+  // Encode() at the first `len` bytes of `checksum_ptr`.
+  // Returns true iff the verification succeeds.
+ bool Verify(uint8_t len, const char* checksum_ptr) const {
+ return info_.Verify(len, checksum_ptr);
+ }
private:
  friend class ProtectionInfo<T>;
@@ -124,6 +172,7 @@ class ProtectionInfoKVO {
static_assert(sizeof(ProtectionInfoKVO) == sizeof(T), "");
}
+ T GetVal() const { return info_.GetVal(); }
void SetVal(T val) { info_.SetVal(val); }
  ProtectionInfo<T> info_;
@@ -154,7 +203,10 @@ class ProtectionInfoKVOC {
void UpdateC(ColumnFamilyId old_column_family_id,
ColumnFamilyId new_column_family_id);
- T GetVal() const { return kvo_.GetVal(); }
+ void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
+ bool Verify(uint8_t len, const char* checksum_ptr) const {
+ return kvo_.Verify(len, checksum_ptr);
+ }
private:
  friend class ProtectionInfoKVO<T>;
@@ -163,6 +215,7 @@ class ProtectionInfoKVOC {
static_assert(sizeof(ProtectionInfoKVOC) == sizeof(T), "");
}
+ T GetVal() const { return kvo_.GetVal(); }
void SetVal(T val) { kvo_.SetVal(val); }
  ProtectionInfoKVO<T> kvo_;
@@ -193,7 +246,10 @@ class ProtectionInfoKVOS {
void UpdateS(SequenceNumber old_sequence_number,
SequenceNumber new_sequence_number);
- T GetVal() const { return kvo_.GetVal(); }
+ void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
+ bool Verify(uint8_t len, const char* checksum_ptr) const {
+ return kvo_.Verify(len, checksum_ptr);
+ }
private:
  friend class ProtectionInfoKVO<T>;
@@ -202,11 +258,32 @@ class ProtectionInfoKVOS {
static_assert(sizeof(ProtectionInfoKVOS) == sizeof(T), "");
}
+ T GetVal() const { return kvo_.GetVal(); }
void SetVal(T val) { kvo_.SetVal(val); }
  ProtectionInfoKVO<T> kvo_;
};
+template <typename T>
+class ProtectionInfoKV {
+ public:
+  ProtectionInfoKV() = default;
+
+  void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
+  bool Verify(uint8_t len, const char* checksum_ptr) const {
+    return info_.Verify(len, checksum_ptr);
+  }
+
+ private:
+  friend class ProtectionInfo<T>;
+
+  explicit ProtectionInfoKV(T val) : info_(val) {
+    static_assert(sizeof(ProtectionInfoKV<T>) == sizeof(T));
+  }
+
+  ProtectionInfo<T> info_;
+};
+
template <typename T>
Status ProtectionInfo<T>::GetStatus() const {
if (val_ != 0) {
@@ -244,6 +321,16 @@ ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
   return ProtectionInfoKVO<T>(val);
}
+template <typename T>
+ProtectionInfoKV<T> ProtectionInfo<T>::ProtectKV(const Slice& key,
+                                                 const Slice& value) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo::kSeedK));
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(value, ProtectionInfo::kSeedV));
+  return ProtectionInfoKV<T>(val);
+}
+
template <typename T>
void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key, const Slice& new_key) {
T val = GetVal();
@@ -394,5 +481,4 @@ void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
sizeof(new_sequence_number), ProtectionInfo::kSeedS));
SetVal(val);
}
-
} // namespace ROCKSDB_NAMESPACE
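
To show how the new `Encode()`/`Verify()` pair replaces the old `GetVal()`-plus-switch pattern (see the db/memtable.cc hunks below), here is a minimal round-trip sketch against these internal headers; test-style code, assuming `kTypeValue` and `Slice` from the RocksDB internals:

```cpp
#include <cassert>

#include "db/dbformat.h"
#include "db/kv_checksum.h"

void ProtectionInfoRoundTrip() {
  using ROCKSDB_NAMESPACE::ProtectionInfo64;
  ROCKSDB_NAMESPACE::Slice key = "key1";
  ROCKSDB_NAMESPACE::Slice value = "value1";
  char checksum[8];

  // Fold key, value, op type, and sequence number into one 64-bit hash,
  // then truncate it to the configured width (8 bytes here).
  auto info = ProtectionInfo64()
                  .ProtectKVO(key, value, ROCKSDB_NAMESPACE::kTypeValue)
                  .ProtectS(/*sequence_number=*/42);
  info.Encode(/*len=*/8, checksum);

  // Verification recomputes the hash and compares the stored bytes.
  assert(info.Verify(/*len=*/8, checksum));
}
```
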
diff --git a/db/memtable.cc b/db/memtable.cc
index e61ddc9db8b..f6c0cc62470 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -256,7 +256,7 @@ void MemTable::UpdateOldestKeyTime() {
}
Status MemTable::VerifyEntryChecksum(const char* entry,
- size_t protection_bytes_per_key,
+ uint32_t protection_bytes_per_key,
bool allow_data_in_errors) {
if (protection_bytes_per_key == 0) {
return Status::OK();
@@ -285,28 +285,11 @@ Status MemTable::VerifyEntryChecksum(const char* entry,
Slice value = Slice(value_ptr, value_length);
const char* checksum_ptr = value_ptr + value_length;
- uint64_t expected = ProtectionInfo64()
- .ProtectKVO(user_key, value, type)
- .ProtectS(seq)
- .GetVal();
- bool match = true;
- switch (protection_bytes_per_key) {
- case 1:
-      match = static_cast<uint8_t>(checksum_ptr[0]) ==
-              static_cast<uint8_t>(expected);
-      break;
-    case 2:
-      match = DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(expected);
-      break;
-    case 4:
-      match = DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(expected);
- break;
- case 8:
- match = DecodeFixed64(checksum_ptr) == expected;
- break;
- default:
- assert(false);
- }
+ bool match =
+ ProtectionInfo64()
+ .ProtectKVO(user_key, value, type)
+ .ProtectS(seq)
+          .Verify(static_cast<uint8_t>(protection_bytes_per_key),
+                  checksum_ptr);
if (!match) {
std::string msg(
"Corrupted memtable entry, per key-value checksum verification "
@@ -526,7 +509,7 @@ class MemTableIterator : public InternalIterator {
bool valid_;
bool arena_mode_;
bool value_pinned_;
- size_t protection_bytes_per_key_;
+ uint32_t protection_bytes_per_key_;
Status status_;
Logger* logger_;
@@ -684,28 +667,15 @@ void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
return;
}
- uint64_t checksum = 0;
if (kv_prot_info == nullptr) {
- checksum =
- ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal();
+ ProtectionInfo64()
+ .ProtectKVO(key, value, type)
+ .ProtectS(s)
+        .Encode(static_cast<uint8_t>(moptions_.protection_bytes_per_key),
+ checksum_ptr);
} else {
- checksum = kv_prot_info->GetVal();
- }
- switch (moptions_.protection_bytes_per_key) {
- case 1:
-      checksum_ptr[0] = static_cast<uint8_t>(checksum);
-      break;
-    case 2:
-      EncodeFixed16(checksum_ptr, static_cast<uint16_t>(checksum));
-      break;
-    case 4:
-      EncodeFixed32(checksum_ptr, static_cast<uint32_t>(checksum));
- break;
- case 8:
- EncodeFixed64(checksum_ptr, checksum);
- break;
- default:
- assert(false);
+ kv_prot_info->Encode(
+ static_cast<uint8_t>(moptions_.protection_bytes_per_key), checksum_ptr);
}
}
@@ -902,7 +872,7 @@ struct Saver {
ReadCallback* callback_;
bool* is_blob_index;
bool allow_data_in_errors;
- size_t protection_bytes_per_key;
+ uint32_t protection_bytes_per_key;
bool CheckCallback(SequenceNumber _seq) {
if (callback_) {
return callback_->IsVisible(_seq);
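For orientation, `VerifyEntryChecksum()` and `UpdateEntryChecksum()` above both locate the checksum directly after the value bytes (`checksum_ptr = value_ptr + value_length`). A simplified sketch of that pointer arithmetic; the real memtable entry uses varint32 lengths and an internal key ending in an 8-byte (seq, type) tag, which fixed-width lengths elide here:

```cpp
#include <cstdint>
#include <cstring>

// Simplified entry layout: [key_len:4][key][value_len:4][value][checksum:N].
const char* GetChecksumPtr(const char* entry) {
  uint32_t key_len;
  std::memcpy(&key_len, entry, sizeof(key_len));
  const char* value_len_ptr = entry + sizeof(key_len) + key_len;
  uint32_t value_len;
  std::memcpy(&value_len, value_len_ptr, sizeof(value_len));
  // The per key-value checksum sits immediately after the value bytes.
  return value_len_ptr + sizeof(value_len) + value_len;
}
```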
diff --git a/db/memtable.h b/db/memtable.h
index aa2ba87ca4a..eefabcf88db 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -529,7 +529,7 @@ class MemTable {
// Returns Corruption status if verification fails.
static Status VerifyEntryChecksum(const char* entry,
- size_t protection_bytes_per_key,
+ uint32_t protection_bytes_per_key,
bool allow_data_in_errors = false);
private:
diff --git a/db/repair.cc b/db/repair.cc
index b4b9d0c5ffb..633c348a5c3 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -518,8 +518,9 @@ class Repairer {
if (status.ok()) {
// TODO: plumb Env::IOActivity
const ReadOptions read_options;
- status = table_cache_->GetTableProperties(file_options_, read_options,
- icmp_, t->meta, &props);
+ status = table_cache_->GetTableProperties(
+ file_options_, read_options, icmp_, t->meta, &props,
+ 0 /* block_protection_bytes_per_key */);
}
if (status.ok()) {
auto s =
@@ -577,7 +578,8 @@ class Repairer {
/*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
- /*allow_unprepared_value=*/false);
+ /*allow_unprepared_value=*/false,
+ cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key);
ParsedInternalKey parsed;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
@@ -617,7 +619,9 @@ class Repairer {
ReadOptions ropts;
std::unique_ptr<FragmentedRangeTombstoneIterator> r_iter;
status = table_cache_->GetRangeTombstoneIterator(
- ropts, cfd->internal_comparator(), t->meta, &r_iter);
+ ropts, cfd->internal_comparator(), t->meta,
+ cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key,
+ &r_iter);
if (r_iter) {
r_iter->SeekToFirst();
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 28206ed359e..c288ec8c7fd 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -91,7 +91,8 @@ Status TableCache::GetTableReader(
const ReadOptions& ro, const FileOptions& file_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats,
- HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
+ uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
@@ -140,7 +141,8 @@ Status TableCache::GetTableReader(
s = ioptions_.table_factory->NewTableReader(
ro,
TableReaderOptions(ioptions_, prefix_extractor, file_options,
- internal_comparator, skip_filters, immortal_tables_,
+ internal_comparator, block_protection_bytes_per_key,
+ skip_filters, immortal_tables_,
false /* force_direct_prefetch */, level,
block_cache_tracer_, max_file_size_for_l0_meta_pin,
db_session_id_, file_meta.fd.GetNumber(),
@@ -156,6 +158,7 @@ Status TableCache::FindTable(
const ReadOptions& ro, const FileOptions& file_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, TypedHandle** handle,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist,
bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
@@ -179,12 +182,12 @@ Status TableCache::FindTable(
}
std::unique_ptr<TableReader> table_reader;
- Status s =
- GetTableReader(ro, file_options, internal_comparator, file_meta,
- false /* sequential mode */, record_read_stats,
- file_read_hist, &table_reader, prefix_extractor,
- skip_filters, level, prefetch_index_and_filter_in_cache,
- max_file_size_for_l0_meta_pin, file_temperature);
+ Status s = GetTableReader(ro, file_options, internal_comparator, file_meta,
+ false /* sequential mode */, record_read_stats,
+ block_protection_bytes_per_key, file_read_hist,
+ &table_reader, prefix_extractor, skip_filters,
+ level, prefetch_index_and_filter_in_cache,
+ max_file_size_for_l0_meta_pin, file_temperature);
if (!s.ok()) {
assert(table_reader == nullptr);
RecordTick(ioptions_.stats, NO_FILE_ERRORS);
@@ -212,6 +215,7 @@ InternalIterator* TableCache::NewIterator(
size_t max_file_size_for_l0_meta_pin,
const InternalKey* smallest_compaction_key,
const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+ uint8_t block_protection_bytes_per_key,
TruncatedRangeDelIterator** range_del_iter) {
PERF_TIMER_GUARD(new_table_iterator_nanos);
@@ -225,12 +229,13 @@ InternalIterator* TableCache::NewIterator(
auto& fd = file_meta.fd;
table_reader = fd.table_reader;
if (table_reader == nullptr) {
- s = FindTable(
- options, file_options, icomparator, file_meta, &handle,
- prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
- !for_compaction /* record_read_stats */, file_read_hist, skip_filters,
- level, true /* prefetch_index_and_filter_in_cache */,
- max_file_size_for_l0_meta_pin, file_meta.temperature);
+ s = FindTable(options, file_options, icomparator, file_meta, &handle,
+ block_protection_bytes_per_key, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ !for_compaction /* record_read_stats */, file_read_hist,
+ skip_filters, level,
+ true /* prefetch_index_and_filter_in_cache */,
+ max_file_size_for_l0_meta_pin, file_meta.temperature);
if (s.ok()) {
table_reader = cache_.Value(handle);
}
@@ -308,7 +313,7 @@ InternalIterator* TableCache::NewIterator(
Status TableCache::GetRangeTombstoneIterator(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
- const FileMetaData& file_meta,
+ const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
assert(out_iter);
const FileDescriptor& fd = file_meta.fd;
@@ -317,7 +322,7 @@ Status TableCache::GetRangeTombstoneIterator(
TypedHandle* handle = nullptr;
if (t == nullptr) {
s = FindTable(options, file_options_, internal_comparator, file_meta,
- &handle);
+ &handle, block_protection_bytes_per_key);
if (s.ok()) {
t = cache_.Value(handle);
}
@@ -403,6 +408,7 @@ Status TableCache::Get(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters, int level,
size_t max_file_size_for_l0_meta_pin) {
@@ -430,7 +436,7 @@ Status TableCache::Get(
assert(s.ok());
if (t == nullptr) {
s = FindTable(options, file_options_, internal_comparator, file_meta,
- &handle, prefix_extractor,
+ &handle, block_protection_bytes_per_key, prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
@@ -513,7 +519,8 @@ Status TableCache::MultiGetFilter(
const FileMetaData& file_meta,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, int level,
- MultiGetContext::Range* mget_range, TypedHandle** table_handle) {
+ MultiGetContext::Range* mget_range, TypedHandle** table_handle,
+ uint8_t block_protection_bytes_per_key) {
auto& fd = file_meta.fd;
IterKey row_cache_key;
std::string row_cache_entry_buffer;
@@ -531,12 +538,13 @@ Status TableCache::MultiGetFilter(
MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
mget_range->end());
if (t == nullptr) {
- s = FindTable(
- options, file_options_, internal_comparator, file_meta, &handle,
- prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
- true /* record_read_stats */, file_read_hist, /*skip_filters=*/false,
- level, true /* prefetch_index_and_filter_in_cache */,
- /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
+ s = FindTable(options, file_options_, internal_comparator, file_meta,
+ &handle, block_protection_bytes_per_key, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist,
+ /*skip_filters=*/false, level,
+ true /* prefetch_index_and_filter_in_cache */,
+ /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
if (s.ok()) {
t = cache_.Value(handle);
}
@@ -564,6 +572,7 @@ Status TableCache::GetTableProperties(
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
std::shared_ptr<const TableProperties>* properties,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) {
auto table_reader = file_meta.fd.table_reader;
// table already been pre-loaded?
@@ -575,7 +584,8 @@ Status TableCache::GetTableProperties(
TypedHandle* table_handle = nullptr;
Status s = FindTable(read_options, file_options, internal_comparator,
- file_meta, &table_handle, prefix_extractor, no_io);
+ file_meta, &table_handle, block_protection_bytes_per_key,
+ prefix_extractor, no_io);
if (!s.ok()) {
return s;
}
@@ -588,12 +598,14 @@ Status TableCache::GetTableProperties(
Status TableCache::ApproximateKeyAnchors(
const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
- const FileMetaData& file_meta, std::vector<TableReader::Anchor>& anchors) {
+ const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
+ std::vector<TableReader::Anchor>& anchors) {
Status s;
TableReader* t = file_meta.fd.table_reader;
TypedHandle* handle = nullptr;
if (t == nullptr) {
- s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle);
+ s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle,
+ block_protection_bytes_per_key);
if (s.ok()) {
t = cache_.Value(handle);
}
@@ -610,7 +622,7 @@ Status TableCache::ApproximateKeyAnchors(
size_t TableCache::GetMemoryUsageByTableReader(
const FileOptions& file_options, const ReadOptions& read_options,
const InternalKeyComparator& internal_comparator,
- const FileMetaData& file_meta,
+ const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
auto table_reader = file_meta.fd.table_reader;
// table already been pre-loaded?
@@ -620,7 +632,8 @@ size_t TableCache::GetMemoryUsageByTableReader(
TypedHandle* table_handle = nullptr;
Status s = FindTable(read_options, file_options, internal_comparator,
- file_meta, &table_handle, prefix_extractor, true);
+ file_meta, &table_handle, block_protection_bytes_per_key,
+ prefix_extractor, true /* no_io */);
if (!s.ok()) {
return 0;
}
@@ -639,16 +652,17 @@ uint64_t TableCache::ApproximateOffsetOf(
const ReadOptions& read_options, const Slice& key,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
uint64_t result = 0;
TableReader* table_reader = file_meta.fd.table_reader;
TypedHandle* table_handle = nullptr;
if (table_reader == nullptr) {
const bool for_compaction = (caller == TableReaderCaller::kCompaction);
- Status s =
- FindTable(read_options, file_options_, internal_comparator, file_meta,
- &table_handle, prefix_extractor, false /* no_io */,
- !for_compaction /* record_read_stats */);
+ Status s = FindTable(
+ read_options, file_options_, internal_comparator, file_meta,
+ &table_handle, block_protection_bytes_per_key, prefix_extractor,
+ false /* no_io */, !for_compaction /* record_read_stats */);
if (s.ok()) {
table_reader = cache_.Value(table_handle);
}
@@ -668,16 +682,17 @@ uint64_t TableCache::ApproximateSize(
const ReadOptions& read_options, const Slice& start, const Slice& end,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
uint64_t result = 0;
TableReader* table_reader = file_meta.fd.table_reader;
TypedHandle* table_handle = nullptr;
if (table_reader == nullptr) {
const bool for_compaction = (caller == TableReaderCaller::kCompaction);
- Status s =
- FindTable(read_options, file_options_, internal_comparator, file_meta,
- &table_handle, prefix_extractor, false /* no_io */,
- !for_compaction /* record_read_stats */);
+ Status s = FindTable(
+ read_options, file_options_, internal_comparator, file_meta,
+ &table_handle, block_protection_bytes_per_key, prefix_extractor,
+ false /* no_io */, !for_compaction /* record_read_stats */);
if (s.ok()) {
table_reader = cache_.Value(table_handle);
}
diff --git a/db/table_cache.h b/db/table_cache.h
index 609e67498de..41201eea8a0 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -96,6 +96,7 @@ class TableCache {
size_t max_file_size_for_l0_meta_pin,
const InternalKey* smallest_compaction_key,
const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+ uint8_t protection_bytes_per_key,
TruncatedRangeDelIterator** range_del_iter = nullptr);
// If a seek to internal key "k" in specified file finds an entry,
@@ -112,6 +113,7 @@ class TableCache {
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
@@ -121,7 +123,7 @@ class TableCache {
Status GetRangeTombstoneIterator(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
- const FileMetaData& file_meta,
+ const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
// Call table reader's MultiGetFilter to use the bloom filter to filter out
@@ -135,7 +137,8 @@ class TableCache {
const FileMetaData& file_meta,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, int level,
- MultiGetContext::Range* mget_range, TypedHandle** table_handle);
+ MultiGetContext::Range* mget_range, TypedHandle** table_handle,
+ uint8_t block_protection_bytes_per_key);
// If a seek to internal key "k" in specified file finds an entry,
// call get_context->SaveValue() repeatedly until
@@ -150,6 +153,7 @@ class TableCache {
Status, MultiGet, const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
bool skip_range_deletions = false, int level = -1,
@@ -165,6 +169,7 @@ class TableCache {
const ReadOptions& ro, const FileOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, TypedHandle**,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
const bool no_io = false, bool record_read_stats = true,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
@@ -183,12 +188,14 @@ class TableCache {
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
std::shared_ptr<const TableProperties>* properties,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
bool no_io = false);
Status ApproximateKeyAnchors(const ReadOptions& ro,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
+ uint8_t block_protection_bytes_per_key,
std::vector<TableReader::Anchor>& anchors);
// Return total memory usage of the table reader of the file.
@@ -196,7 +203,7 @@ class TableCache {
size_t GetMemoryUsageByTableReader(
const FileOptions& toptions, const ReadOptions& read_options,
const InternalKeyComparator& internal_comparator,
- const FileMetaData& file_meta,
+ const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
// Returns approximated offset of a key in a file represented by fd.
@@ -204,6 +211,7 @@ class TableCache {
const ReadOptions& read_options, const Slice& key,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
// Returns approximated data size between start and end keys in a file
@@ -212,6 +220,7 @@ class TableCache {
const ReadOptions& read_options, const Slice& start, const Slice& end,
const FileMetaData& file_meta, TableReaderCaller caller,
const InternalKeyComparator& internal_comparator,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
CacheInterface& get_cache() { return cache_; }
@@ -234,8 +243,8 @@ class TableCache {
const ReadOptions& ro, const FileOptions& file_options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, bool sequential_mode,
- bool record_read_stats, HistogramImpl* file_read_hist,
- std::unique_ptr<TableReader>* table_reader,
+ bool record_read_stats, uint8_t block_protection_bytes_per_key,
+ HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
bool skip_filters = false, int level = -1,
bool prefetch_index_and_filter_in_cache = true,
diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h
index b1ab73247ce..df8e9337f6b 100644
--- a/db/table_cache_sync_and_async.h
+++ b/db/table_cache_sync_and_async.h
@@ -17,6 +17,7 @@ namespace ROCKSDB_NAMESPACE {
DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ uint8_t block_protection_bytes_per_key,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions,
int level, TypedHandle* handle) {
@@ -65,7 +66,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
if (t == nullptr) {
assert(handle == nullptr);
s = FindTable(options, file_options_, internal_comparator, file_meta,
- &handle, prefix_extractor,
+ &handle, block_protection_bytes_per_key, prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
diff --git a/db/version_builder.cc b/db/version_builder.cc
index 64590db5cef..d87ef94494b 100644
--- a/db/version_builder.cc
+++ b/db/version_builder.cc
@@ -1257,7 +1257,8 @@ class VersionBuilder::Rep {
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
- size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) {
+ size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
+ uint8_t block_protection_bytes_per_key) {
assert(table_cache_ != nullptr);
size_t table_cache_capacity =
@@ -1326,7 +1327,8 @@ class VersionBuilder::Rep {
statuses[file_idx] = table_cache_->FindTable(
read_options, file_options_,
*(base_vstorage_->InternalComparator()), *file_meta, &handle,
- prefix_extractor, false /*no_io */, true /* record_read_stats */,
+ block_protection_bytes_per_key, prefix_extractor, false /*no_io */,
+ true /* record_read_stats */,
internal_stats->GetFileReadHist(level), false, level,
prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
file_meta->temperature);
@@ -1384,11 +1386,12 @@ Status VersionBuilder::LoadTableHandlers(
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
- size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) {
- return rep_->LoadTableHandlers(internal_stats, max_threads,
- prefetch_index_and_filter_in_cache,
- is_initial_load, prefix_extractor,
- max_file_size_for_l0_meta_pin, read_options);
+ size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
+ uint8_t block_protection_bytes_per_key) {
+ return rep_->LoadTableHandlers(
+ internal_stats, max_threads, prefetch_index_and_filter_in_cache,
+ is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin,
+ read_options, block_protection_bytes_per_key);
}
uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
diff --git a/db/version_builder.h b/db/version_builder.h
index 8e7dd9e6613..fb2a304a843 100644
--- a/db/version_builder.h
+++ b/db/version_builder.h
@@ -48,7 +48,8 @@ class VersionBuilder {
InternalStats* internal_stats, int max_threads,
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
- size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options);
+ size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
+ uint8_t block_protection_bytes_per_key);
uint64_t GetMinOldestBlobFileNumber() const;
private:
diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc
index 7ea176e0150..d507c4b0c86 100644
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@@ -566,13 +566,13 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd,
assert(builder_iter->second != nullptr);
VersionBuilder* builder = builder_iter->second->version_builder();
assert(builder);
+ const MutableCFOptions* moptions = cfd->GetLatestMutableCFOptions();
Status s = builder->LoadTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads,
prefetch_index_and_filter_in_cache, is_initial_load,
- cfd->GetLatestMutableCFOptions()->prefix_extractor,
- MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()),
- read_options_);
+ moptions->prefix_extractor, MaxFileSizeForL0MetaPin(*moptions),
+ read_options_, moptions->block_protection_bytes_per_key);
if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) {
s = Status::OK();
}
@@ -812,16 +812,16 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
assert(builder);
}
+ const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions();
auto* version = new Version(cfd, version_set_, version_set_->file_options_,
- *cfd->GetLatestMutableCFOptions(), io_tracer_,
+ *cf_opts_ptr, io_tracer_,
version_set_->current_version_number_++,
epoch_number_requirement_);
s = builder->LoadTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads, false, true,
- cfd->GetLatestMutableCFOptions()->prefix_extractor,
- MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()),
- read_options_);
+ cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr),
+ read_options_, cf_opts_ptr->block_protection_bytes_per_key);
if (!s.ok()) {
delete version;
if (s.IsCorruption()) {
diff --git a/db/version_set.cc b/db/version_set.cc
index 9f1888c78b9..cfe8d617366 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -941,7 +941,7 @@ class LevelIterator final : public InternalIterator {
const std::shared_ptr<const SliceTransform>& prefix_extractor,
bool should_sample, HistogramImpl* file_read_hist,
TableReaderCaller caller, bool skip_filters, int level,
- RangeDelAggregator* range_del_agg,
+ uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg,
const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
nullptr,
bool allow_unprepared_value = false,
@@ -964,6 +964,7 @@ class LevelIterator final : public InternalIterator {
pinned_iters_mgr_(nullptr),
compaction_boundaries_(compaction_boundaries),
is_next_read_sequential_(false),
+ block_protection_bytes_per_key_(block_protection_bytes_per_key),
range_tombstone_iter_(nullptr),
to_return_sentinel_(false) {
// Empty level is not supported.
@@ -1107,7 +1108,8 @@ class LevelIterator final : public InternalIterator {
nullptr /* don't need reference to table */, file_read_hist_, caller_,
/*arena=*/nullptr, skip_filters_, level_,
/*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
- largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_);
+ largest_compaction_key, allow_unprepared_value_,
+ block_protection_bytes_per_key_, range_tombstone_iter_);
}
// Check if current file being fully within iterate_lower_bound.
@@ -1154,6 +1156,8 @@ class LevelIterator final : public InternalIterator {
bool is_next_read_sequential_;
+ uint8_t block_protection_bytes_per_key_;
+
// This is set when this level iterator is used under a merging iterator
// that processes range tombstones. range_tombstone_iter_ points to where the
// merging iterator stores the range tombstones iterator for this level. When
@@ -1535,6 +1539,7 @@ Status Version::GetTableProperties(const ReadOptions& read_options,
auto ioptions = cfd_->ioptions();
Status s = table_cache->GetTableProperties(
file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp,
+ mutable_cf_options_.block_protection_bytes_per_key,
mutable_cf_options_.prefix_extractor, true /* no io */);
if (s.ok()) {
return s;
@@ -1621,6 +1626,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
Status s = table_cache->GetRangeTombstoneIterator(
read_options, cfd_->internal_comparator(), *file_meta,
+ cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key,
&tombstone_iter);
if (!s.ok()) {
return s;
@@ -1739,6 +1745,7 @@ size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) {
total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
file_options_, read_options, cfd_->internal_comparator(),
*file_level.files[i].file_metadata,
+ mutable_cf_options_.block_protection_bytes_per_key,
mutable_cf_options_.prefix_extractor);
}
}
@@ -1848,6 +1855,7 @@ InternalIterator* Version::TEST_GetLevelIterator(
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ mutable_cf_options_.block_protection_bytes_per_key,
nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
allow_unprepared_value, &tombstone_iter_ptr);
if (read_options.ignore_range_deletions) {
@@ -1946,7 +1954,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
/*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, allow_unprepared_value,
- &tombstone_iter);
+ mutable_cf_options_.block_protection_bytes_per_key, &tombstone_iter);
if (read_options.ignore_range_deletions) {
merge_iter_builder->AddIterator(table_iter);
} else {
@@ -1975,8 +1983,10 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
- /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
- allow_unprepared_value, &tombstone_iter_ptr);
+ mutable_cf_options_.block_protection_bytes_per_key,
+ /*range_del_agg=*/nullptr,
+ /*compaction_boundaries=*/nullptr, allow_unprepared_value,
+ &tombstone_iter_ptr);
if (read_options.ignore_range_deletions) {
merge_iter_builder->AddIterator(level_iter);
} else {
@@ -2019,7 +2029,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
/*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
- /*allow_unprepared_value=*/false));
+ /*allow_unprepared_value=*/false,
+ mutable_cf_options_.block_protection_bytes_per_key));
status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
iter.get(), overlap);
if (!status.ok() || *overlap) {
@@ -2034,7 +2045,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
cfd_->internal_stats()->GetFileReadHist(level),
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
- &range_del_agg));
+ mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg,
+ nullptr, false));
status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
iter.get(), overlap);
}
@@ -2333,7 +2345,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
*status = table_cache_->Get(
read_options, *internal_comparator(), *f->file_metadata, ikey,
- &get_context, mutable_cf_options_.prefix_extractor,
+ &get_context, mutable_cf_options_.block_protection_bytes_per_key,
+ mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
fp.IsHitFileLastInLevel()),
@@ -2578,7 +2591,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
read_options, *internal_comparator(), *f->file_metadata,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
- fp.GetHitFileLevel(), &file_range, &table_handle);
+ fp.GetHitFileLevel(), &file_range, &table_handle,
+ mutable_cf_options_.block_protection_bytes_per_key);
skip_range_deletions = true;
if (status.ok()) {
skip_filters = true;
@@ -2768,7 +2782,8 @@ Status Version::ProcessBatch(
read_options, *internal_comparator(), *f->file_metadata,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
- fp.GetHitFileLevel(), &file_range, &table_handle);
+ fp.GetHitFileLevel(), &file_range, &table_handle,
+ mutable_cf_options_.block_protection_bytes_per_key);
if (status.ok()) {
skip_filters = true;
skip_range_deletions = true;
@@ -5217,7 +5232,8 @@ Status VersionSet::ProcessManifestWrites(
true /* prefetch_index_and_filter_in_cache */,
false /* is_initial_load */,
mutable_cf_options_ptrs[i]->prefix_extractor,
- MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options);
+ MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options,
+ mutable_cf_options_ptrs[i]->block_protection_bytes_per_key);
if (!s.ok()) {
if (db_options_->paranoid_checks) {
break;
@@ -6553,10 +6569,11 @@ uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options,
// "key" falls in the range for this table. Add the
// approximate offset of "key" within the table.
TableCache* table_cache = v->cfd_->table_cache();
+ const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
if (table_cache != nullptr) {
result = table_cache->ApproximateOffsetOf(
read_options, key, *f.file_metadata, caller, icmp,
- v->GetMutableCFOptions().prefix_extractor);
+ cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor);
}
}
return result;
@@ -6596,9 +6613,10 @@ uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options,
if (table_cache == nullptr) {
return 0;
}
+ const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
return table_cache->ApproximateSize(
read_options, start, end, *f.file_metadata, caller, icmp,
- v->GetMutableCFOptions().prefix_extractor);
+ cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor);
}
void VersionSet::RemoveLiveFiles(
@@ -6757,6 +6775,7 @@ InternalIterator* VersionSet::MakeInputIterator(
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false,
+ c->mutable_cf_options()->block_protection_bytes_per_key,
/*range_del_iter=*/&range_tombstone_iter);
range_tombstones.emplace_back(range_tombstone_iter, nullptr);
}
@@ -6770,8 +6789,9 @@ InternalIterator* VersionSet::MakeInputIterator(
/*should_sample=*/false,
/*no per level latency histogram=*/nullptr,
TableReaderCaller::kCompaction, /*skip_filters=*/false,
- /*level=*/static_cast<int>(c->level(which)), range_del_agg,
- c->boundaries(which), false, &tombstone_iter_ptr);
+ /*level=*/static_cast<int>(c->level(which)),
+ c->mutable_cf_options()->block_protection_bytes_per_key,
+ range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr);
range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
}
}
@@ -7008,7 +7028,8 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
TableCache::TypedHandle* handle = nullptr;
FileMetaData meta_copy = meta;
status = table_cache->FindTable(
- read_options, file_opts, *icmp, meta_copy, &handle, pe,
+ read_options, file_opts, *icmp, meta_copy, &handle,
+ cf_opts->block_protection_bytes_per_key, pe,
/*no_io=*/false, /*record_read_stats=*/true,
internal_stats->GetFileReadHist(level), false, level,
/*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
diff --git a/db/version_set_sync_and_async.h b/db/version_set_sync_and_async.h
index 188c2e2f950..2507762e8c8 100644
--- a/db/version_set_sync_and_async.h
+++ b/db/version_set_sync_and_async.h
@@ -25,6 +25,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
s = CO_AWAIT(table_cache_->MultiGet)(
read_options, *internal_comparator(), *f->file_metadata, &file_range,
+ mutable_cf_options_.block_protection_bytes_per_key,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
skip_range_deletions, hit_file_level, table_handle);
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index d16fefe4cd6..8756932a7ef 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -290,6 +290,7 @@ DECLARE_bool(paranoid_file_checks);
DECLARE_bool(fail_if_options_file_error);
DECLARE_uint64(batch_protection_bytes_per_key);
DECLARE_uint32(memtable_protection_bytes_per_key);
+DECLARE_uint32(block_protection_bytes_per_key);
DECLARE_uint64(user_timestamp_size);
DECLARE_string(secondary_cache_uri);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index b6ee6726901..9ce10f06cc9 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -975,6 +975,11 @@ DEFINE_uint32(
"specified number of bytes per key. Currently the supported "
"nonzero values are 1, 2, 4 and 8.");
+DEFINE_uint32(block_protection_bytes_per_key, 0,
+ "If nonzero, enables integrity protection in blocks at the "
+ "specified number of bytes per key. Currently the supported "
+ "nonzero values are 1, 2, 4 and 8.");
+
DEFINE_string(file_checksum_impl, "none",
"Name of an implementation for file_checksum_gen_factory, or "
"\"none\" for null.");
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 60a12b33149..710c7687b9a 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3122,6 +3122,7 @@ void InitializeOptionsFromFlags(
FLAGS_verify_sst_unique_id_in_manifest;
options.memtable_protection_bytes_per_key =
FLAGS_memtable_protection_bytes_per_key;
+ options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
// Integrated BlobDB
options.enable_blob_files = FLAGS_enable_blob_files;
diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc
index c41c5051f39..62e96af23ba 100644
--- a/db_stress_tool/db_stress_tool.cc
+++ b/db_stress_tool/db_stress_tool.cc
@@ -31,6 +31,7 @@ namespace ROCKSDB_NAMESPACE {
namespace {
static std::shared_ptr<Env> env_guard;
static std::shared_ptr<CompositeEnvWrapper> env_wrapper_guard;
+static std::shared_ptr<Env> legacy_env_wrapper_guard;
static std::shared_ptr<CompositeEnvWrapper>
dbsl_env_wrapper_guard;
static std::shared_ptr<CompositeEnvWrapper> fault_env_guard;
@@ -99,6 +100,17 @@ int db_stress_tool(int argc, char** argv) {
env_wrapper_guard = std::make_shared<CompositeEnvWrapper>(
raw_env, std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem()));
+ if (!env_opts && !FLAGS_use_io_uring) {
+ // If using the default Env (Posix), wrap DbStressEnvWrapper with the
+ // legacy EnvWrapper. This is a workaround to prevent MultiGet and scans
+ // from failing when IO uring is disabled. The EnvWrapper
+ // has a default implementation of ReadAsync that redirects to Read.
+ legacy_env_wrapper_guard = std::make_shared<EnvWrapper>(raw_env);
+ env_wrapper_guard = std::make_shared<CompositeEnvWrapper>(
+ legacy_env_wrapper_guard,
+ std::make_shared<DbStressFSWrapper>(
+ legacy_env_wrapper_guard->GetFileSystem()));
+ }
db_stress_env = env_wrapper_guard.get();
FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
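The workaround above relies on `EnvWrapper` supplying a default async read that falls back to a synchronous read. A conceptual, self-contained sketch of that fallback shape (names are hypothetical, not the RocksDB FileSystem API):

```cpp
#include <cstddef>
#include <functional>

struct SyncReader {
  // Stub primitive: pretend the underlying Env only supports blocking reads.
  size_t Read(size_t /*offset*/, size_t len, char* /*scratch*/) { return len; }
};

struct AsyncFacade {
  SyncReader* base;
  // Default "async" read: satisfy the request synchronously, then invoke the
  // callback inline, so async callers keep working without io_uring.
  void ReadAsync(size_t offset, size_t len, char* scratch,
                 const std::function<void(size_t)>& cb) {
    cb(base->Read(offset, len, scratch));
  }
};
```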
diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc
index e93b9a3f5f8..676daf574fa 100644
--- a/fuzz/sst_file_writer_fuzzer.cc
+++ b/fuzz/sst_file_writer_fuzzer.cc
@@ -92,7 +92,8 @@ TableReader* NewTableReader(const std::string& sst_file_path,
if (s.ok()) {
ImmutableOptions iopts(options, cf_ioptions);
TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options,
- cf_ioptions.internal_comparator);
+ cf_ioptions.internal_comparator,
+ 0 /* block_protection_bytes_per_key */);
t_opt.largest_seqno = kMaxSequenceNumber;
s = options.table_factory->NewTableReader(t_opt, std::move(file_reader),
file_size, &table_reader,
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 5862126d0af..ff0a408958e 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1122,6 +1122,20 @@ struct AdvancedColumnFamilyOptions {
// only compatible changes are allowed.
bool persist_user_defined_timestamps = true;
+ // Enable/disable per key-value checksum protection for in memory blocks.
+ //
+ // Checksum is constructed when a block is loaded into memory and verification
+ // is done for each key read from the block. This is useful for detecting
+ // in-memory data corruption. Note that this feature has a non-trivial
+ // negative impact on read performance. Different values of the
+ // option have similar performance impact, but different memory cost and
+ // corruption detection probability (e.g. 1 byte gives a 255/256 chance of
+ // detecting a corruption).
+ //
+ // Default: 0 (no protection)
+ // Supported values: 0, 1, 2, 4, 8.
+ uint8_t block_protection_bytes_per_key = 0;
+
// Create ColumnFamilyOptions with default values for all fields
AdvancedColumnFamilyOptions();
// Create ColumnFamilyOptions from Options
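A minimal usage sketch for the option documented above (ordinary open-time configuration; values are illustrative). Since options/cf_options.cc later in this diff registers the option with `OptionTypeFlags::kMutable`, it should also be adjustable at runtime through `DB::SetOptions()`:

```cpp
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Status OpenWithBlockProtection(const std::string& path,
                                                  ROCKSDB_NAMESPACE::DB** db) {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  // 1 byte per key: lowest memory overhead; a corruption escapes detection
  // with probability 1/256.
  options.block_protection_bytes_per_key = 1;
  return ROCKSDB_NAMESPACE::DB::Open(options, path, db);
}
```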
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 387da17539b..9aadca94742 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -151,6 +151,13 @@ struct ShardedCacheOptions {
metadata_charge_policy(_metadata_charge_policy) {}
};
+// LRUCache - A cache using LRU eviction to stay at or below a set capacity.
+// The cache is sharded to 2^num_shard_bits shards, by hash of the key.
+// The total capacity is divided and evenly assigned to each shard, and each
+// shard has its own LRU list for evictions. Each shard also has a mutex for
+// exclusive access during operations; even read operations need exclusive
+// access in order to update the LRU list. Mutex contention is usually low
+// with enough shards.
struct LRUCacheOptions : public ShardedCacheOptions {
// Ratio of cache reserved for high-priority and low-priority entries,
// respectively. (See Cache::Priority below for more information on the levels.)
@@ -158,7 +165,8 @@ struct LRUCacheOptions : public ShardedCacheOptions {
// values cannot exceed 1.
//
// If high_pri_pool_ratio is greater than zero, a dedicated high-priority LRU
- // list is maintained by the cache. Similarly, if low_pri_pool_ratio is
+ // list is maintained by the cache. A ratio of 0.5 means non-high-priority
+ // entries will use midpoint insertion. Similarly, if low_pri_pool_ratio is
// greater than zero, a dedicated low-priority LRU list is maintained.
// There is also a bottom-priority LRU list, which is always enabled and not
// explicitly configurable. Entries are spilled over to the next available
@@ -173,9 +181,6 @@ struct LRUCacheOptions : public ShardedCacheOptions {
// otherwise, they are placed in the bottom-priority pool.) This results
// in lower-priority entries without hits getting evicted from the cache
// sooner.
- //
- // Default values: high_pri_pool_ratio = 0.5 (which is referred to as
- // "midpoint insertion"), low_pri_pool_ratio = 0
double high_pri_pool_ratio = 0.5;
double low_pri_pool_ratio = 0.0;
@@ -199,31 +204,36 @@ struct LRUCacheOptions : public ShardedCacheOptions {
high_pri_pool_ratio(_high_pri_pool_ratio),
low_pri_pool_ratio(_low_pri_pool_ratio),
use_adaptive_mutex(_use_adaptive_mutex) {}
+
+ // Construct an instance of LRUCache using these options
+ std::shared_ptr<Cache> MakeSharedCache() const;
};
-// Create a new cache with a fixed size capacity. The cache is sharded
-// to 2^num_shard_bits shards, by hash of the key. The total capacity
-// is divided and evenly assigned to each shard. If strict_capacity_limit
-// is set, insert to the cache will fail when cache is full. User can also
-// set percentage of the cache reserves for high priority entries via
-// high_pri_pool_pct.
-// num_shard_bits = -1 means it is automatically determined: every shard
-// will be at least 512KB and number of shard bits will not exceed 6.
-extern std::shared_ptr<Cache> NewLRUCache(
+// DEPRECATED wrapper function
+inline std::shared_ptr<Cache> NewLRUCache(
size_t capacity, int num_shard_bits = -1,
bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDefaultCacheMetadataChargePolicy,
- double low_pri_pool_ratio = 0.0);
-
-extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
+ double low_pri_pool_ratio = 0.0) {
+ return LRUCacheOptions(capacity, num_shard_bits, strict_capacity_limit,
+ high_pri_pool_ratio, memory_allocator,
+ use_adaptive_mutex, metadata_charge_policy,
+ low_pri_pool_ratio)
+ .MakeSharedCache();
+}
+
+// DEPRECATED wrapper function
+inline std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
+ return cache_opts.MakeSharedCache();
+}
// EXPERIMENTAL
-// Options structure for configuring a SecondaryCache instance based on
-// LRUCache. The LRUCacheOptions.secondary_cache is not used and
-// should not be set.
+// Options structure for configuring a SecondaryCache instance with in-memory
+// compression. The implementation uses LRUCache, so it inherits its options,
+// except LRUCacheOptions.secondary_cache is not used and should not be set.
struct CompressedSecondaryCacheOptions : LRUCacheOptions {
// The compression method (if any) that is used to compress data.
CompressionType compression_type = CompressionType::kLZ4Compression;
@@ -264,11 +274,16 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
compress_format_version(_compress_format_version),
enable_custom_split_merge(_enable_custom_split_merge),
do_not_compress_roles(_do_not_compress_roles) {}
+
+ // Construct an instance of CompressedSecondaryCache using these options
+ std::shared_ptr<SecondaryCache> MakeSharedSecondaryCache() const;
+
+ // Avoid confusion with LRUCache
+ std::shared_ptr<Cache> MakeSharedCache() const = delete;
};
-// EXPERIMENTAL
-// Create a new Secondary Cache that is implemented on top of LRUCache.
-extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+// DEPRECATED wrapper function
+inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
size_t capacity, int num_shard_bits = -1,
bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
double low_pri_pool_ratio = 0.0,
@@ -280,10 +295,21 @@ extern std::shared_ptr NewCompressedSecondaryCache(
uint32_t compress_format_version = 2,
bool enable_custom_split_merge = false,
const CacheEntryRoleSet& _do_not_compress_roles = {
- CacheEntryRole::kFilterBlock});
-
-extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
- const CompressedSecondaryCacheOptions& opts);
+ CacheEntryRole::kFilterBlock}) {
+ return CompressedSecondaryCacheOptions(
+ capacity, num_shard_bits, strict_capacity_limit,
+ high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
+ use_adaptive_mutex, metadata_charge_policy, compression_type,
+ compress_format_version, enable_custom_split_merge,
+ _do_not_compress_roles)
+ .MakeSharedSecondaryCache();
+}
+
+// DEPRECATED wrapper function
+inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+ const CompressedSecondaryCacheOptions& opts) {
+ return opts.MakeSharedSecondaryCache();
+}
// HyperClockCache - A lock-free Cache alternative for RocksDB block cache
// that offers much improved CPU efficiency vs. LRUCache under high parallel
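For new code, the intended pattern is to populate the options struct and call `MakeSharedCache()` directly rather than going through the deprecated wrappers above. A sketch (field values are arbitrary examples):

```cpp
#include <memory>

#include "rocksdb/cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeBlockCache() {
  ROCKSDB_NAMESPACE::LRUCacheOptions opts;
  opts.capacity = static_cast<size_t>(1) << 30;  // 1 GiB
  opts.num_shard_bits = -1;        // automatically determined shard count
  opts.high_pri_pool_ratio = 0.5;  // midpoint insertion for other entries
  return opts.MakeSharedCache();
}
```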
diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h
index 5cb799e4273..d126abfe6d6 100644
--- a/include/rocksdb/memory_allocator.h
+++ b/include/rocksdb/memory_allocator.h
@@ -55,6 +55,11 @@ struct JemallocAllocatorOptions {
// Upper bound of allocation size to use tcache, if limit_tcache_size=true.
// When used with block cache, it is recommended to set it to block_size.
size_t tcache_size_upper_bound = 16 * 1024;
+
+ // Number of arenas across which we spread allocation requests. Increasing
+ // this setting can mitigate arena mutex contention. The value must be
+ // positive.
+ size_t num_arenas = 1;
};
// Generate memory allocator which allocates through Jemalloc and utilize
@@ -70,7 +75,8 @@ struct JemallocAllocatorOptions {
// core dump. Side benefit of using single arena would be reduction of jemalloc
// metadata for some workloads.
//
-// To mitigate mutex contention for using one single arena, jemalloc tcache
+// To mitigate mutex contention for using one single arena (see also
+// `JemallocAllocatorOptions::num_arenas` above), jemalloc tcache
// (thread-local cache) is enabled to cache unused allocations for future use.
// The tcache normally incurs 0.5M extra memory usage per-thread. The usage
// can be reduced by limiting allocation sizes to cache.
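A configuration sketch for the new `num_arenas` knob; `NewJemallocNodumpAllocator()` is the factory declared in this header, and the specific values are illustrative:

```cpp
#include <memory>

#include "rocksdb/memory_allocator.h"

ROCKSDB_NAMESPACE::Status MakeShardedJemallocAllocator(
    std::shared_ptr<ROCKSDB_NAMESPACE::MemoryAllocator>* allocator) {
  ROCKSDB_NAMESPACE::JemallocAllocatorOptions jopts;
  jopts.num_arenas = 4;  // spread allocations over 4 arenas to cut contention
  jopts.limit_tcache_size = true;
  jopts.tcache_size_upper_bound = 16 * 1024;  // e.g. match block_size
  return ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator(jopts, allocator);
}
```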
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index f5ac3057c47..cfa4021035f 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
// NOTE: in 'main' development branch, this should be the *next*
// minor or major version number planned for release.
#define ROCKSDB_MAJOR 8
-#define ROCKSDB_MINOR 2
+#define ROCKSDB_MINOR 3
#define ROCKSDB_PATCH 0
// Do not use these. We made the mistake of declaring macros starting with
diff --git a/memory/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc
index cdad14576d2..d05248224d7 100644
--- a/memory/jemalloc_nodump_allocator.cc
+++ b/memory/jemalloc_nodump_allocator.cc
@@ -14,6 +14,8 @@
#include "rocksdb/utilities/customizable_util.h"
#include "rocksdb/utilities/object_registry.h"
#include "rocksdb/utilities/options_type.h"
+#include "util/fastrange.h"
+#include "util/random.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
@@ -35,6 +37,9 @@ static std::unordered_map<std::string, OptionTypeInfo> jemalloc_type_info = {
{offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound),
OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
+ {"num_arenas",
+ {offsetof(struct JemallocAllocatorOptions, num_arenas), OptionType::kSizeT,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
};
bool JemallocNodumpAllocator::IsSupported(std::string* why) {
#ifndef ROCKSDB_JEMALLOC
@@ -59,11 +64,13 @@ bool JemallocNodumpAllocator::IsSupported(std::string* why) {
JemallocNodumpAllocator::JemallocNodumpAllocator(
JemallocAllocatorOptions& options)
- : options_(options),
+ : options_(options)
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
- tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache),
+ ,
+ tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {
+#else // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+{
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
- arena_index_(0) {
RegisterOptions(&options_, &jemalloc_type_info);
}
@@ -75,9 +82,9 @@ JemallocNodumpAllocator::~JemallocNodumpAllocator() {
for (void* tcache_index : tcache_list) {
DestroyThreadSpecificCache(tcache_index);
}
- if (arena_index_ > 0) {
+ for (auto arena_index : arena_indexes_) {
// Destroy arena. Silently ignore error.
- Status s = DestroyArena(arena_index_);
+ Status s = DestroyArena(arena_index);
assert(s.ok());
s.PermitUncheckedError();
}
@@ -90,7 +97,8 @@ size_t JemallocNodumpAllocator::UsableSize(void* p,
void* JemallocNodumpAllocator::Allocate(size_t size) {
int tcache_flag = GetThreadSpecificCache(size);
- return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag);
+ uint32_t arena_index = GetArenaIndex();
+ return mallocx(size, MALLOCX_ARENA(arena_index) | tcache_flag);
}
void JemallocNodumpAllocator::Deallocate(void* p) {
@@ -105,45 +113,71 @@ void JemallocNodumpAllocator::Deallocate(void* p) {
dallocx(p, tcache_flag);
}
-Status JemallocNodumpAllocator::InitializeArenas() {
- // Create arena.
- size_t arena_index_size = sizeof(arena_index_);
- int ret =
- mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0);
- if (ret != 0) {
- return Status::Incomplete("Failed to create jemalloc arena, error code: " +
- std::to_string(ret));
+uint32_t JemallocNodumpAllocator::GetArenaIndex() const {
+ if (arena_indexes_.size() == 1) {
+ return arena_indexes_[0];
}
- assert(arena_index_ != 0);
- // Read existing hooks.
- std::string key = "arena." + std::to_string(arena_index_) + ".extent_hooks";
- extent_hooks_t* hooks;
- size_t hooks_size = sizeof(hooks);
- ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
- if (ret != 0) {
- return Status::Incomplete("Failed to read existing hooks, error code: " +
- std::to_string(ret));
- }
+ static std::atomic<uint32_t> next_seed = 0;
+ // Core-local may work in place of `thread_local` as we should be able to
+ // tolerate occasional stale reads in thread migration cases. However we need
+ // to make Random thread-safe and prevent cacheline bouncing. Whether this is
+ // worthwhile is still an open question.
+ thread_local Random tl_random(next_seed.fetch_add(1));
+ return arena_indexes_[FastRange32(tl_random.Next(), arena_indexes_.size())];
+}
- // Store existing alloc.
- extent_alloc_t* original_alloc = hooks->alloc;
- extent_alloc_t* expected = nullptr;
- bool success =
- JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
- expected, original_alloc);
- if (!success && original_alloc != expected) {
- return Status::Incomplete("Original alloc conflict.");
- }
+Status JemallocNodumpAllocator::InitializeArenas() {
+ assert(!init_);
+ init_ = true;
- // Set the custom hook.
- arena_hooks_.reset(new extent_hooks_t(*hooks));
- arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc;
- extent_hooks_t* hooks_ptr = arena_hooks_.get();
- ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
- if (ret != 0) {
- return Status::Incomplete("Failed to set custom hook, error code: " +
- std::to_string(ret));
+ for (size_t i = 0; i < options_.num_arenas; i++) {
+ // Create arena.
+ unsigned arena_index;
+ size_t arena_index_size = sizeof(arena_index);
+ int ret =
+ mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+ if (ret != 0) {
+ return Status::Incomplete(
+ "Failed to create jemalloc arena, error code: " +
+ std::to_string(ret));
+ }
+ arena_indexes_.push_back(arena_index);
+
+ // Read existing hooks.
+ std::string key =
+ "arena." + std::to_string(arena_indexes_[i]) + ".extent_hooks";
+ extent_hooks_t* hooks;
+ size_t hooks_size = sizeof(hooks);
+ ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+ if (ret != 0) {
+ return Status::Incomplete("Failed to read existing hooks, error code: " +
+ std::to_string(ret));
+ }
+
+ // Store existing alloc.
+ extent_alloc_t* original_alloc = hooks->alloc;
+ extent_alloc_t* expected = nullptr;
+ bool success =
+ JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+ expected, original_alloc);
+ if (!success && original_alloc != expected) {
+ // This could happen if jemalloc creates new arenas with different initial
+ // values in their `alloc` function pointers. See `original_alloc_` API
+ // doc for more details.
+ return Status::Incomplete("Original alloc conflict.");
+ }
+
+ // Set the custom hook.
+ per_arena_hooks_.emplace_back();
+ per_arena_hooks_.back().reset(new extent_hooks_t(*hooks));
+ per_arena_hooks_.back()->alloc = &JemallocNodumpAllocator::Alloc;
+ extent_hooks_t* hooks_ptr = per_arena_hooks_.back().get();
+ ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+ if (ret != 0) {
+ return Status::Incomplete("Failed to set custom hook, error code: " +
+ std::to_string(ret));
+ }
}
return Status::OK();
}
@@ -161,6 +195,8 @@ Status JemallocNodumpAllocator::PrepareOptions(
options_.tcache_size_upper_bound) {
return Status::InvalidArgument(
"tcache_size_lower_bound larger or equal to tcache_size_upper_bound.");
+ } else if (options_.num_arenas < 1) {
+ return Status::InvalidArgument("num_arenas must be a positive integer");
} else if (IsMutable()) {
Status s = MemoryAllocator::PrepareOptions(config_options);
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
@@ -221,7 +257,7 @@ void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
return result;
}
-Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) {
+Status JemallocNodumpAllocator::DestroyArena(uint32_t arena_index) {
assert(arena_index != 0);
std::string key = "arena." + std::to_string(arena_index) + ".destroy";
int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0);
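A standalone sketch of the arena-selection scheme in `GetArenaIndex()` above: each thread owns a PRNG seeded from a global counter, and its draw is mapped onto `[0, num_arenas)` with the multiply-shift reduction that RocksDB's `FastRange32` performs (no division, no modulo bias). `SimpleRng` is a hypothetical stand-in for RocksDB's `Random`:

```cpp
#include <atomic>
#include <cstdint>
#include <vector>

// Multiply-shift reduction: maps a uniform 32-bit value onto [0, range).
uint32_t FastRange32Sketch(uint32_t hash, uint32_t range) {
  return static_cast<uint32_t>(
      (static_cast<uint64_t>(hash) * static_cast<uint64_t>(range)) >> 32);
}

struct SimpleRng {  // xorshift32; stand-in for rocksdb::Random
  uint32_t state;
  explicit SimpleRng(uint32_t seed) : state(seed | 1) {}  // avoid zero state
  uint32_t Next() {
    state ^= state << 13;
    state ^= state >> 17;
    state ^= state << 5;
    return state;
  }
};

uint32_t PickArena(const std::vector<uint32_t>& arena_indexes) {
  if (arena_indexes.size() == 1) {
    return arena_indexes[0];
  }
  static std::atomic<uint32_t> next_seed{0};
  thread_local SimpleRng tl_rng(next_seed.fetch_add(1));
  return arena_indexes[FastRange32Sketch(
      tl_rng.Next(), static_cast<uint32_t>(arena_indexes.size()))];
}
```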
diff --git a/memory/jemalloc_nodump_allocator.h b/memory/jemalloc_nodump_allocator.h
index a1e1547d7b3..2bdbaeb3286 100644
--- a/memory/jemalloc_nodump_allocator.h
+++ b/memory/jemalloc_nodump_allocator.h
@@ -24,6 +24,10 @@
#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
namespace ROCKSDB_NAMESPACE {
+
+// Allocation requests are randomly sharded across
+// `JemallocAllocatorOptions::num_arenas` arenas to reduce contention on per-
+// arena mutexes.
class JemallocNodumpAllocator : public BaseMemoryAllocator {
public:
explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options);
@@ -38,7 +42,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
return IsSupported(&unused);
}
static bool IsSupported(std::string* why);
- bool IsMutable() const { return arena_index_ == 0; }
+ bool IsMutable() const { return !init_; }
Status PrepareOptions(const ConfigOptions& config_options) override;
@@ -52,9 +56,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
Status InitializeArenas();
- friend Status NewJemallocNodumpAllocator(
- JemallocAllocatorOptions& options,
- std::shared_ptr<MemoryAllocator>* memory_allocator);
+ uint32_t GetArenaIndex() const;
// Custom alloc hook to replace jemalloc default alloc.
static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
@@ -62,7 +64,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
unsigned arena_ind);
// Destroy arena on destruction of the allocator, or on failure.
- static Status DestroyArena(unsigned arena_index);
+ static Status DestroyArena(uint32_t arena_index);
// Destroy tcache on destruction of the allocator, or thread exit.
static void DestroyThreadSpecificCache(void* ptr);
@@ -78,17 +80,20 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator {
// NewJemallocNodumpAllocator is thread-safe.
//
// Hack: original_alloc_ needs to be static for Alloc() to access it.
- // alloc needs to be static to pass to jemalloc as function pointer.
+ // alloc needs to be static to pass to jemalloc as function pointer. We can
+ // use a single process-wide value as long as we assume that any newly created
+ // arena has the same original value in its `alloc` function pointer.
static std::atomic<extent_alloc_t*> original_alloc_;
// Custom hooks has to outlive corresponding arena.
- std::unique_ptr<extent_hooks_t> arena_hooks_;
+ std::vector<std::unique_ptr<extent_hooks_t>> per_arena_hooks_;
// Hold thread-local tcache index.
ThreadLocalPtr tcache_;
+
+ std::vector<uint32_t> arena_indexes_;
#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
- // Arena index.
- unsigned arena_index_;
+ bool init_ = false;
};
} // namespace ROCKSDB_NAMESPACE
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 2057e300a20..0fccd501434 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -488,6 +488,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct MutableCFOptions, memtable_protection_bytes_per_key),
OptionType::kUInt32T, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
+ {"block_protection_bytes_per_key",
+ {offsetof(struct MutableCFOptions, block_protection_bytes_per_key),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
{kOptNameCompOpts,
OptionTypeInfo::Struct(
kOptNameCompOpts, &compression_options_type_info,
diff --git a/options/cf_options.h b/options/cf_options.h
index d5e8da73481..37ef54c0cba 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -172,6 +172,7 @@ struct MutableCFOptions {
: options.last_level_temperature),
memtable_protection_bytes_per_key(
options.memtable_protection_bytes_per_key),
+ block_protection_bytes_per_key(options.block_protection_bytes_per_key),
sample_for_compression(
options.sample_for_compression), // TODO: is 0 fine here?
compression_per_level(options.compression_per_level) {
@@ -222,6 +223,7 @@ struct MutableCFOptions {
bottommost_compression(kDisableCompressionOption),
last_level_temperature(Temperature::kUnknown),
memtable_protection_bytes_per_key(0),
+ block_protection_bytes_per_key(0),
sample_for_compression(0) {}
explicit MutableCFOptions(const Options& options);
@@ -312,6 +314,7 @@ struct MutableCFOptions {
CompressionOptions bottommost_compression_opts;
Temperature last_level_temperature;
uint32_t memtable_protection_bytes_per_key;
+ uint8_t block_protection_bytes_per_key;
uint64_t sample_for_compression;
std::vector<CompressionType> compression_per_level;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index fc651ffdba7..abe5053d229 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -206,6 +206,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
moptions.experimental_mempurge_threshold;
cf_opts->memtable_protection_bytes_per_key =
moptions.memtable_protection_bytes_per_key;
+ cf_opts->block_protection_bytes_per_key =
+ moptions.block_protection_bytes_per_key;
// Compaction related options
cf_opts->disable_auto_compactions = moptions.disable_auto_compactions;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index c772c786c9c..6357b5e9eea 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -552,7 +552,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"compaction=false;age_for_warm=1;};"
"blob_cache=1M;"
"memtable_protection_bytes_per_key=2;"
- "persist_user_defined_timestamps=true;",
+ "persist_user_defined_timestamps=true;"
+ "block_protection_bytes_per_key=1;",
new_options));
ASSERT_NE(new_options->blob_cache.get(), nullptr);
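
Taken together, the plumbing above makes `block_protection_bytes_per_key` a regular mutable column-family option. A usage sketch (the value 8 is illustrative; as with `memtable_protection_bytes_per_key`, only small fixed checksum widths are meaningful, with 0 disabling the feature):

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void EnableBlockProtection(rocksdb::DB* db) {
  // The option lives in MutableCFOptions, so it can be flipped on a live DB:
  rocksdb::Status s = db->SetOptions(
      db->DefaultColumnFamily(), {{"block_protection_bytes_per_key", "8"}});
  assert(s.ok());
}
```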
diff --git a/port/stack_trace.cc b/port/stack_trace.cc
index ad648d3bc1d..9e9d8b8b548 100644
--- a/port/stack_trace.cc
+++ b/port/stack_trace.cc
@@ -128,6 +128,14 @@ void PrintStackTraceLine(const char* symbol, void* frame) {
#endif
+const char* GetLldbScriptSelectThread(long long tid) {
+ // NOTE: called from a signal handler, so no heap allocation
+ static char script[80];
+ snprintf(script, sizeof(script),
+ "script -l python -- lldb.process.SetSelectedThreadByID(%lld)", tid);
+ return script;
+}
+
} // namespace
void PrintStack(void* frames[], int num_frames) {
@@ -152,9 +160,13 @@ void PrintStack(int first_frames_to_skip) {
// * It doesn't appear easy to detect when ASLR is in use.
// * With DEBUG_LEVEL < 2, backtrace() can skip frames that are not skipped
// in GDB.
+ //
+ // LLDB is also available as an option
+ bool lldb_stack_trace = getenv("ROCKSDB_LLDB_STACK") != nullptr;
#if defined(OS_LINUX)
// Default true, override with ROCKSDB_BACKTRACE_STACK=1
- bool gdb_stack_trace = getenv("ROCKSDB_BACKTRACE_STACK") == nullptr;
+ bool gdb_stack_trace =
+ !lldb_stack_trace && getenv("ROCKSDB_BACKTRACE_STACK") == nullptr;
#else
// Default false, override with ROCKSDB_GDB_STACK=1
bool gdb_stack_trace = getenv("ROCKSDB_GDB_STACK") != nullptr;
@@ -164,53 +176,84 @@ void PrintStack(int first_frames_to_skip) {
char* debug_env = getenv("ROCKSDB_DEBUG");
bool debug = debug_env != nullptr && strlen(debug_env) > 0;
- if (gdb_stack_trace || debug) {
+ if (lldb_stack_trace || gdb_stack_trace || debug) {
// Allow outside debugger to attach, even with Yama security restrictions
#ifdef PR_SET_PTRACER_ANY
(void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
#endif
// Try to invoke GDB or LLDB, either for stack trace or debugging.
- long long attach_id = getpid();
+ long long attach_pid = getpid();
+ // NOTE: we're in a signal handler, so no heap allocation
+ char attach_pid_str[20];
+ snprintf(attach_pid_str, sizeof(attach_pid_str), "%lld", attach_pid);
// `gdb -p PID` seems to always attach to main thread, but `gdb -p TID`
// seems to be able to attach to a particular thread in a process, which
// makes sense as the main thread TID == PID of the process.
// But I haven't found that gdb capability documented anywhere, so leave
// a back door to attach to main thread.
+ long long gdb_attach_id = attach_pid;
+ // Save current thread id before fork
+ long long attach_tid = 0;
#ifdef OS_LINUX
+ attach_tid = gettid();
if (getenv("ROCKSDB_DEBUG_USE_PID") == nullptr) {
- attach_id = gettid();
+ gdb_attach_id = attach_tid;
}
#endif
- char attach_id_str[20];
- snprintf(attach_id_str, sizeof(attach_id_str), "%lld", attach_id);
+
+ char gdb_attach_id_str[20];
+ snprintf(gdb_attach_id_str, sizeof(gdb_attach_id_str), "%lld",
+ gdb_attach_id);
+
pid_t child_pid = fork();
if (child_pid == 0) {
// child process
if (debug) {
- fprintf(stderr, "Invoking GDB for debugging (ROCKSDB_DEBUG=%s)...\n",
- debug_env);
- execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-p", attach_id_str,
- (char*)nullptr);
- return;
+ if (strcmp(debug_env, "lldb") == 0) {
+ fprintf(stderr, "Invoking LLDB for debugging (ROCKSDB_DEBUG=%s)...\n",
+ debug_env);
+ execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str,
+ /*"-Q",*/ "-o", GetLldbScriptSelectThread(attach_tid),
+ (char*)nullptr);
+ return;
+ } else {
+ fprintf(stderr, "Invoking GDB for debugging (ROCKSDB_DEBUG=%s)...\n",
+ debug_env);
+ execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-p", gdb_attach_id_str,
+ (char*)nullptr);
+ return;
+ }
} else {
- fprintf(stderr, "Invoking GDB for stack trace...\n");
-
- // Skip top ~4 frames here in PrintStack
- // See https://stackoverflow.com/q/40991943/454544
- auto bt_in_gdb =
- "frame apply level 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 "
- "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 "
- "42 43 44 -q frame";
// Redirect child stdout to original stderr
dup2(2, 1);
// No child stdin (don't use pager)
close(0);
- // -n : Loading config files can apparently cause failures with the
- // other options here.
- // -batch : non-interactive; suppress banners as much as possible
- execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-n", "-batch", "-p",
- attach_id_str, "-ex", bt_in_gdb, (char*)nullptr);
+ if (lldb_stack_trace) {
+ fprintf(stderr, "Invoking LLDB for stack trace...\n");
+
+ // Skip top ~8 frames here in PrintStack
+ auto bt_in_lldb =
+ "script -l python -- for f in lldb.thread.frames[8:]: print(f)";
+ execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str,
+ "-b", "-Q", "-o", GetLldbScriptSelectThread(attach_tid), "-o",
+ bt_in_lldb, (char*)nullptr);
+ } else {
+ // gdb_stack_trace
+ fprintf(stderr, "Invoking GDB for stack trace...\n");
+
+ // Skip top ~4 frames here in PrintStack
+ // See https://stackoverflow.com/q/40991943/454544
+ auto bt_in_gdb =
+ "frame apply level 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 "
+ "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 "
+ "42 43 44 -q frame";
+ // -n : Loading config files can apparently cause failures with the
+ // other options here.
+ // -batch : non-interactive; suppress banners as much as possible
+ execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-n", "-batch", "-p",
+ gdb_attach_id_str, "-ex", bt_in_gdb, (char*)nullptr);
+ }
return;
}
} else {
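
A sketch of how a test or embedding process might opt into the new LLDB path before a crash happens (environment variable names are taken from the code above; `setenv` is POSIX):

```cpp
#include <cstdlib>

// With ROCKSDB_LLDB_STACK set, PrintStack() prefers `lldb -p <pid>` over
// the GDB path; ROCKSDB_DEBUG=lldb attaches LLDB interactively instead of
// printing a trace. Must be set before the signal handler runs.
void PreferLldbStackTraces() {
  setenv("ROCKSDB_LLDB_STACK", "1", /*overwrite=*/1);
}
```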
diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index b9b5d6e7e91..136275b6c89 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -30,7 +30,7 @@ namespace ROCKSDB_NAMESPACE {
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
-// "*value_length", respectively. Will not derefence past "limit".
+// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
@@ -137,17 +137,26 @@ struct DecodeEntryV4 {
return DecodeKeyV4()(p, limit, shared, non_shared);
}
};
+
void DataBlockIter::NextImpl() {
+#ifndef NDEBUG
+ if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return;
+#endif
bool is_shared = false;
ParseNextDataKey(&is_shared);
+ ++cur_entry_idx_;
}
void MetaBlockIter::NextImpl() {
bool is_shared = false;
ParseNextKey(&is_shared);
+ ++cur_entry_idx_;
}
-void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
+void IndexBlockIter::NextImpl() {
+ ParseNextIndexKey();
+ ++cur_entry_idx_;
+}
void IndexBlockIter::PrevImpl() {
assert(Valid());
@@ -166,6 +175,7 @@ void IndexBlockIter::PrevImpl() {
// Loop until end of current entry hits the start of original entry
while (ParseNextIndexKey() && NextEntryOffset() < original) {
}
+ --cur_entry_idx_;
}
void MetaBlockIter::PrevImpl() {
@@ -187,6 +197,7 @@ void MetaBlockIter::PrevImpl() {
while (ParseNextKey(&is_shared) &&
NextEntryOffset() < original) {
}
+ --cur_entry_idx_;
}
// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
@@ -195,6 +206,7 @@ void DataBlockIter::PrevImpl() {
assert(prev_entries_idx_ == -1 ||
static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
+ --cur_entry_idx_;
// Check if we can use cached prev_entries_
if (prev_entries_idx_ > 0 &&
prev_entries_[prev_entries_idx_].offset == current_) {
@@ -319,10 +331,10 @@ void MetaBlockIter::SeekImpl(const Slice& target) {
// inclusive; AND
// 2) the last key of this block has a greater user_key from seek_user_key
//
-// If the return value is TRUE, iter location has two possibilies:
-// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
-// this case, it points to the first key with a larger user_key or a matching
-// user_key with a seqno no greater than the seeking seqno.
+// If the return value is TRUE, iter location has two possibilities:
+// 1) If iter is valid, it is set to a location as if set by SeekImpl(target).
+// In this case, it points to the first key with a larger user_key or a
+// matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_key is less
// than the seek_user_key, or the block ends with a matching user_key but
// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
@@ -347,11 +359,11 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
// boundary key: axy@50 (we make minimal assumption about a boundary key)
// Block N+1: [axy@10, ... ]
//
- // If seek_key = axy@60, the search will starts from Block N.
+ // If seek_key = axy@60, the search will start from Block N.
// Even if the user_key is not found in the hash map, the caller still
// has to continue searching the next block.
//
- // In this case, we pretend the key is the the last restart interval.
+ // In this case, we pretend the key is in the last restart interval.
// The while-loop below will search the last restart interval for the
// key. It will stop at the first key that is larger than the seek_key,
// or to the end of the block if no one is larger.
@@ -364,12 +376,15 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
assert(restart_index < num_restarts_);
SeekToRestartPoint(restart_index);
current_ = GetRestartPoint(restart_index);
+ cur_entry_idx_ =
+ static_cast<int32_t>(restart_index * block_restart_interval_) - 1;
uint32_t limit = restarts_;
if (restart_index + 1 < num_restarts_) {
limit = GetRestartPoint(restart_index + 1);
}
while (current_ < limit) {
+ ++cur_entry_idx_;
bool shared;
// Here we only linear seek the target key inside the restart interval.
// If a key does not exist inside a restart interval, we avoid
@@ -381,14 +396,20 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
// we stop at the first potential matching user key.
break;
}
+ // If the loop exits due to CompareCurrentKey(target) >= 0, then current key
+ // exists, and its checksum verification will be done in UpdateKey() called
+ // in SeekForGet().
+ // TODO(cbi): If this loop exits with current_ == restarts_, per key-value
+ // checksum will not be verified in UpdateKey() since Valid()
+ // will return false.
}
if (current_ == restarts_) {
- // Search reaches to the end of the block. There are three possibilites:
- // 1) there is only one user_key match in the block (otherwise collsion).
+ // Search reaches the end of the block. There are three possibilities:
+ // 1) there is only one user_key match in the block (otherwise collision).
// the matching user_key resides in the last restart interval, and it
// is the last key of the restart interval and of the block as well.
- // ParseNextKey() skiped it as its [ type | seqno ] is smaller.
+ // ParseNextKey() skipped it as its [ type | seqno ] is smaller.
//
// 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
// AND all existing user_keys in the restart interval are smaller than
@@ -424,6 +445,9 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
}
void IndexBlockIter::SeekImpl(const Slice& target) {
+#ifndef NDEBUG
+ if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return;
+#endif
TEST_SYNC_POINT("IndexBlockIter::Seek:0");
PERF_TIMER_GUARD(block_seek_nanos);
if (data_ == nullptr) { // Not init yet
@@ -478,7 +502,9 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) {
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
if (!Valid()) {
- SeekToLastImpl();
+ if (status_.ok()) {
+ SeekToLastImpl();
+ }
} else {
while (Valid() && CompareCurrentKey(seek_key) > 0) {
PrevImpl();
@@ -502,7 +528,9 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
if (!Valid()) {
- SeekToLastImpl();
+ if (status_.ok()) {
+ SeekToLastImpl();
+ }
} else {
while (Valid() && CompareCurrentKey(seek_key) > 0) {
PrevImpl();
@@ -517,6 +545,7 @@ void DataBlockIter::SeekToFirstImpl() {
SeekToRestartPoint(0);
bool is_shared = false;
ParseNextDataKey(&is_shared);
+ cur_entry_idx_ = 0;
}
void MetaBlockIter::SeekToFirstImpl() {
@@ -526,15 +555,20 @@ void MetaBlockIter::SeekToFirstImpl() {
SeekToRestartPoint(0);
bool is_shared = false;
ParseNextKey(&is_shared);
+ cur_entry_idx_ = 0;
}
void IndexBlockIter::SeekToFirstImpl() {
+#ifndef NDEBUG
+ if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return;
+#endif
if (data_ == nullptr) { // Not init yet
return;
}
status_ = Status::OK();
SeekToRestartPoint(0);
ParseNextIndexKey();
+ cur_entry_idx_ = 0;
}
void DataBlockIter::SeekToLastImpl() {
@@ -543,8 +577,10 @@ void DataBlockIter::SeekToLastImpl() {
}
SeekToRestartPoint(num_restarts_ - 1);
bool is_shared = false;
+ cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
// Keep skipping
+ ++cur_entry_idx_;
}
}
@@ -554,9 +590,13 @@ void MetaBlockIter::SeekToLastImpl() {
}
SeekToRestartPoint(num_restarts_ - 1);
bool is_shared = false;
+ assert(num_restarts_ >= 1);
+ cur_entry_idx_ =
+ static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
while (ParseNextKey(&is_shared) &&
NextEntryOffset() < restarts_) {
- // Keep skipping
+ // Will probably never reach here since restart_interval is always 1
+ ++cur_entry_idx_;
}
}
@@ -566,20 +606,12 @@ void IndexBlockIter::SeekToLastImpl() {
}
status_ = Status::OK();
SeekToRestartPoint(num_restarts_ - 1);
+ cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
- // Keep skipping
+ ++cur_entry_idx_;
}
}
-template <class TValue>
-void BlockIter<TValue>::CorruptionError() {
- current_ = restarts_;
- restart_index_ = num_restarts_;
- status_ = Status::Corruption("bad entry in block");
- raw_key_.Clear();
- value_.clear();
-}
-
template <class TValue>
template <typename DecodeEntryFunc>
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
@@ -666,12 +698,12 @@ bool IndexBlockIter::ParseNextIndexKey() {
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
-// where, k is key, v is value, and its encoding is in parenthesis.
+// where, k is key, v is value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// is_shared is false, which included the first entry in each restart point.
-// Otherwise the format is delta-size = block handle size - size of last block
-// handle.
+// Otherwise, the format is delta-size = the size of current block - the size
+// of last block.
void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
Slice v(value_.data(), data_ + restarts_ - value_.data());
// Delta encoding is used if `shared` != 0.
@@ -710,6 +742,7 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
// to follow it up with NextImpl() to position the iterator at the restart
// key.
SeekToRestartPoint(index);
+ cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
NextImpl();
if (!skip_linear_scan) {
@@ -728,6 +761,8 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
while (true) {
NextImpl();
if (!Valid()) {
+ // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
+ // since Valid() will return false.
break;
}
if (current_ == max_offset) {
@@ -976,6 +1011,7 @@ Block::~Block() {
// This sync point can be re-enabled if RocksDB can control the
// initialization order of any/all static options created by the user.
// TEST_SYNC_POINT("Block::~Block");
+ delete[] kv_checksum_;
}
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
@@ -1035,6 +1071,126 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
}
}
+void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
+ const Comparator* raw_ucmp) {
+ protection_bytes_per_key_ = 0;
+ if (protection_bytes_per_key > 0 && num_restarts_ > 0) {
+ // NewDataIterator() is called with protection_bytes_per_key_ = 0.
+ // This is intended since checksum is not constructed yet.
+ //
+ // We do not know global_seqno yet, so checksum computation and
+ // verification all assume global_seqno = 0.
+ std::unique_ptr<DataBlockIter> iter{NewDataIterator(
+ raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */,
+ nullptr /* stats */, true /* block_contents_pinned */)};
+ if (iter->status().ok()) {
+ block_restart_interval_ = iter->GetRestartInterval();
+ }
+ uint32_t num_keys = 0;
+ if (iter->status().ok()) {
+ num_keys = iter->NumberOfKeys(block_restart_interval_);
+ }
+ if (iter->status().ok()) {
+ checksum_size_ = num_keys * protection_bytes_per_key;
+ kv_checksum_ = new char[(size_t)checksum_size_];
+ size_t i = 0;
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
+ iter->key(), iter->value());
+ iter->Next();
+ i += protection_bytes_per_key;
+ }
+ assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
+ }
+ if (!iter->status().ok()) {
+ size_ = 0; // Error marker
+ return;
+ }
+ protection_bytes_per_key_ = protection_bytes_per_key;
+ }
+}
+
+void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
+ const Comparator* raw_ucmp,
+ bool value_is_full,
+ bool index_has_first_key) {
+ protection_bytes_per_key_ = 0;
+ if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
+ // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They
+ // do not impact how the index block is parsed. During checksum
+ // construction/verification, we use the entire key buffer from
+ // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value
+ // checksum, and the content of this buffer do not change for different
+ // values of `global_seqno` or `key_includes_seq`.
+ std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
+ raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr,
+ nullptr /* Statistics */, true /* total_order_seek */,
+ index_has_first_key /* have_first_key */, false /* key_includes_seq */,
+ value_is_full, true /* block_contents_pinned */,
+ nullptr /* prefix_index */)};
+ if (iter->status().ok()) {
+ block_restart_interval_ = iter->GetRestartInterval();
+ }
+ uint32_t num_keys = 0;
+ if (iter->status().ok()) {
+ num_keys = iter->NumberOfKeys(block_restart_interval_);
+ }
+ if (iter->status().ok()) {
+ checksum_size_ = num_keys * protection_bytes_per_key;
+ kv_checksum_ = new char[(size_t)checksum_size_];
+ iter->SeekToFirst();
+ size_t i = 0;
+ while (iter->Valid()) {
+ GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
+ iter->key(), iter->raw_value());
+ iter->Next();
+ i += protection_bytes_per_key;
+ }
+ assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
+ }
+ if (!iter->status().ok()) {
+ size_ = 0; // Error marker
+ return;
+ }
+ protection_bytes_per_key_ = protection_bytes_per_key;
+ }
+}
+
+void Block::InitializeMetaIndexBlockProtectionInfo(
+ uint8_t protection_bytes_per_key) {
+ protection_bytes_per_key_ = 0;
+ if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
+ std::unique_ptr<MetaBlockIter> iter{
+ NewMetaIterator(true /* block_contents_pinned */)};
+ if (iter->status().ok()) {
+ block_restart_interval_ = iter->GetRestartInterval();
+ }
+ uint32_t num_keys = 0;
+ if (iter->status().ok()) {
+ num_keys = iter->NumberOfKeys(block_restart_interval_);
+ }
+ if (iter->status().ok()) {
+ checksum_size_ = num_keys * protection_bytes_per_key;
+ kv_checksum_ = new char[(size_t)checksum_size_];
+ iter->SeekToFirst();
+ size_t i = 0;
+ while (iter->Valid()) {
+ GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
+ iter->key(), iter->value());
+ iter->Next();
+ i += protection_bytes_per_key;
+ }
+ assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
+ }
+ if (!iter->status().ok()) {
+ size_ = 0; // Error marker
+ return;
+ }
+ protection_bytes_per_key_ = protection_bytes_per_key;
+ }
+}
+
MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
MetaBlockIter* iter = new MetaBlockIter();
if (size_ < 2 * sizeof(uint32_t)) {
@@ -1045,7 +1201,8 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
iter->Invalidate(Status::OK());
} else {
iter->Initialize(data_, restart_offset_, num_restarts_,
- block_contents_pinned);
+ block_contents_pinned, protection_bytes_per_key_,
+ kv_checksum_, block_restart_interval_);
}
return iter;
}
@@ -1072,7 +1229,8 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
ret_iter->Initialize(
raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
read_amp_bitmap_.get(), block_contents_pinned,
- data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
+ data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
+ protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
if (read_amp_bitmap_) {
if (read_amp_bitmap_->GetStatistics() != stats) {
// DB changed the Statistics pointer, we need to notify read_amp_bitmap_
@@ -1108,8 +1266,9 @@ IndexBlockIter* Block::NewIndexIterator(
total_order_seek ? nullptr : prefix_index;
ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
global_seqno, prefix_index_ptr, have_first_key,
- key_includes_seq, value_is_full,
- block_contents_pinned);
+ key_includes_seq, value_is_full, block_contents_pinned,
+ protection_bytes_per_key_, kv_checksum_,
+ block_restart_interval_);
}
return ret_iter;
@@ -1125,6 +1284,7 @@ size_t Block::ApproximateMemoryUsage() const {
if (read_amp_bitmap_) {
usage += read_amp_bitmap_->ApproximateMemoryUsage();
}
+ usage += checksum_size_;
return usage;
}
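
The three `Initialize*ProtectionInfo()` methods above all build the same structure: a flat `kv_checksum_` buffer of `num_keys * protection_bytes_per_key` bytes, one fixed-width checksum per entry in iteration order. A condensed sketch of the lookup iterators perform against it (`VerifyBlockEntry` is a hypothetical helper mirroring the `ProtectKV().Verify()` call in `UpdateKey()`):

```cpp
#include "db/kv_checksum.h"
#include "rocksdb/slice.h"

// Hypothetical helper: the checksum for entry i lives at
// kv_checksum + i * bytes_per_key, so verification is a single
// recompute-and-compare against that slot.
bool VerifyBlockEntry(const char* kv_checksum, uint8_t bytes_per_key,
                      int32_t entry_idx, const rocksdb::Slice& key,
                      const rocksdb::Slice& value) {
  return rocksdb::ProtectionInfo64()
      .ProtectKV(key, value)
      .Verify(bytes_per_key, kv_checksum + bytes_per_key * entry_idx);
}
```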
diff --git a/table/block_based/block.h b/table/block_based/block.h
index dfbca866325..68b6906fac9 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -14,6 +14,7 @@
#include <string>
#include <vector>
+#include "db/kv_checksum.h"
#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/advanced_cache.h"
@@ -240,6 +241,34 @@ class Block {
// For TypedCacheInterface
const Slice& ContentSlice() const { return contents_.data; }
+ // Initializes per key-value checksum protection.
+ // After this method is called, each DataBlockIterator returned
+ // by NewDataIterator will verify per key-value checksum for any key it reads.
+ void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
+ const Comparator* raw_ucmp);
+
+ // Initializes per key-value checksum protection.
+ // After this method is called, each IndexBlockIterator returned
+ // by NewIndexIterator will verify per key-value checksum for any key it reads.
+ // value_is_full and index_has_first_key are needed to be able to parse
+ // the index block content and construct checksums.
+ void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
+ const Comparator* raw_ucmp,
+ bool value_is_full,
+ bool index_has_first_key);
+
+ // Initializes per key-value checksum protection.
+ // After this method is called, each MetaBlockIter returned
+ // by NewMetaIterator will verify per key-value checksum for any key it reads.
+ void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key);
+
+ static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len,
+ const Slice& key, const Slice& value) {
+ ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr);
+ }
+
+ const char* TEST_GetKVChecksum() const { return kv_checksum_; }
+
private:
BlockContents contents_;
const char* data_; // contents_.data.data()
@@ -247,6 +276,11 @@ class Block {
uint32_t restart_offset_; // Offset in data_ of restart array
uint32_t num_restarts_;
std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
+ char* kv_checksum_{nullptr};
+ uint32_t checksum_size_{0};
+ // Used by block iterators to calculate current key index within a block
+ uint32_t block_restart_interval_{0};
+ uint8_t protection_bytes_per_key_{0};
DataBlockHashIndex data_block_hash_index_;
};
@@ -269,6 +303,14 @@ class Block {
// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
// "Impl" functions are responsible for positioning `raw_key_` but not
// invoking `UpdateKey()`.
+//
+// Per key-value checksum is enabled if relevant states are passed in during
+// `InitializeBase()`. The checksum verification is done in each call to
+// UpdateKey() for the current key. Each subclass is responsible for keeping
+// track of cur_entry_idx_, the index of the current key within the block.
+// BlockIter uses this index to get the corresponding checksum for current key.
+// Additional checksum verification may be done in subclasses if they read keys
+// other than the key being processed in UpdateKey().
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
public:
@@ -286,9 +328,16 @@ class BlockIter : public InternalIteratorBase<TValue> {
Cleanable::Reset();
}
- bool Valid() const override { return current_ < restarts_; }
+ bool Valid() const override {
+ // When status_ is not ok, iter should be invalid.
+ assert(status_.ok() || current_ >= restarts_);
+ return current_ < restarts_;
+ }
virtual void SeekToFirst() override final {
+#ifndef NDEBUG
+ if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
+#endif
SeekToFirstImpl();
UpdateKey();
}
@@ -325,6 +374,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
}
Status status() const override { return status_; }
+
Slice key() const override {
assert(Valid());
return key_;
@@ -337,10 +387,22 @@ class BlockIter : public InternalIteratorBase<TValue> {
(pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
status_.PermitUncheckedError();
}
+
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
}
+
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+
+ bool TEST_Corrupt_Callback(const std::string& sync_point) {
+ bool corrupt = false;
TEST_SYNC_POINT_CALLBACK(sync_point, static_cast<void*>(&corrupt));
+
+ if (corrupt) {
+ CorruptionError();
+ }
+ return corrupt;
+ }
#endif
bool IsKeyPinned() const override {
@@ -377,27 +439,74 @@ class BlockIter : public InternalIteratorBase<TValue> {
Status status_;
// Key to be exposed to users.
Slice key_;
+ SequenceNumber global_seqno_;
+
+ // Per key-value checksum related states
+ const char* kv_checksum_;
+ int32_t cur_entry_idx_;
+ uint32_t block_restart_interval_;
+ uint8_t protection_bytes_per_key_;
+
bool key_pinned_;
// Whether the block data is guaranteed to outlive this iterator, and
// as long as the cleanup functions are transferred to another class,
// e.g. PinnableSlice, the pointer to the bytes will still be valid.
bool block_contents_pinned_;
- SequenceNumber global_seqno_;
virtual void SeekToFirstImpl() = 0;
virtual void SeekToLastImpl() = 0;
virtual void SeekImpl(const Slice& target) = 0;
virtual void SeekForPrevImpl(const Slice& target) = 0;
virtual void NextImpl() = 0;
-
virtual void PrevImpl() = 0;
+ // Returns the restart interval of this block.
+ // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized.
+ virtual uint32_t GetRestartInterval() {
+ if (num_restarts_ <= 1 || data_ == nullptr) {
+ return 0;
+ }
+ SeekToFirstImpl();
+ uint32_t end_index = GetRestartPoint(1);
+ uint32_t count = 1;
+ while (NextEntryOffset() < end_index && status_.ok()) {
+ assert(Valid());
+ NextImpl();
+ ++count;
+ }
+ return count;
+ }
+
+ // Returns the number of keys in this block.
+ virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) {
+ if (num_restarts_ == 0 || data_ == nullptr) {
+ return 0;
+ }
+ uint32_t count = (num_restarts_ - 1) * block_restart_interval;
+ // Add number of keys from the last restart interval
+ SeekToRestartPoint(num_restarts_ - 1);
+ while (NextEntryOffset() < restarts_ && status_.ok()) {
+ NextImpl();
+ ++count;
+ }
+ return count;
+ }
+
+ // Stores in *is_shared whether the current key shares bytes with the
+ // previous key.
+ // Sets raw_key_, value_ to the current parsed key and value.
+ // Sets restart_index_ to point to the restart interval that contains
+ // the current key.
template <typename DecodeEntryFunc>
inline bool ParseNextKey(bool* is_shared);
+ // protection_bytes_per_key, kv_checksum, and block_restart_interval
+ // are needed only for per kv checksum verification.
void InitializeBase(const Comparator* raw_ucmp, const char* data,
uint32_t restarts, uint32_t num_restarts,
- SequenceNumber global_seqno, bool block_contents_pinned) {
+ SequenceNumber global_seqno, bool block_contents_pinned,
+ uint8_t protection_bytes_per_key, const char* kv_checksum,
+ uint32_t block_restart_interval) {
assert(data_ == nullptr); // Ensure it is called only once
assert(num_restarts > 0); // Ensure the param is valid
@@ -410,11 +519,41 @@ class BlockIter : public InternalIteratorBase<TValue> {
global_seqno_ = global_seqno;
block_contents_pinned_ = block_contents_pinned;
cache_handle_ = nullptr;
+ cur_entry_idx_ = -1;
+ protection_bytes_per_key_ = protection_bytes_per_key;
+ kv_checksum_ = kv_checksum;
+ block_restart_interval_ = block_restart_interval;
+ // Checksum related states are either all 0/nullptr or all non-zero.
+ // One exception is when num_restarts == 1, in which case
+ // block_restart_interval can be 0 since we are not able to compute it.
+ assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
+ (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
+ (block_restart_interval > 0 || num_restarts == 1)));
+ }
+
+ void CorruptionError(const std::string& error_msg = "bad entry in block") {
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ status_ = Status::Corruption(error_msg);
+ raw_key_.Clear();
+ value_.clear();
+ }
+
+ void PerKVChecksumCorruptionError() {
+ std::string error_msg{
+ "Corrupted block entry: per key-value checksum verification "
+ "failed."};
+ error_msg.append(" Offset: " + std::to_string(current_) + ".");
+ error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + ".");
+ CorruptionError(error_msg);
}
// Must be called every time a key is found that needs to be returned to user,
// and may be called when no key is found (as a no-op). Updates `key_`,
// `key_buf_`, and `key_pinned_` with info about the found key.
+ // Per key-value checksum verification is done if available for the key to be
+ // returned. Iterator is invalidated with corruption status if checksum
+ // verification fails.
void UpdateKey() {
key_buf_.Clear();
if (!Valid()) {
@@ -433,6 +572,19 @@ class BlockIter : public InternalIteratorBase<TValue> {
key_ = key_buf_.GetInternalKey();
key_pinned_ = false;
}
+ TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
+ (void*)value_.data());
+ TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
+ &protection_bytes_per_key_);
+ if (protection_bytes_per_key_ > 0) {
+ if (!ProtectionInfo64()
+ .ProtectKV(raw_key_.GetKey(), value_)
+ .Verify(
+ protection_bytes_per_key_,
+ kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
+ PerKVChecksumCorruptionError();
+ }
+ }
}
// Returns the result of `Comparator::Compare()`, where the appropriate
@@ -464,7 +616,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
}
- uint32_t GetRestartPoint(uint32_t index) {
+ uint32_t GetRestartPoint(uint32_t index) const {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
@@ -479,13 +631,20 @@ class BlockIter : public InternalIteratorBase<TValue> {
value_ = Slice(data_ + offset, 0);
}
- void CorruptionError();
-
protected:
template <typename DecodeKeyFunc>
inline bool BinarySeek(const Slice& target, uint32_t* index,
bool* is_index_key_result);
+ // Find the first key in restart interval `index` that is >= `target`.
+ // If there is no such key, iterator is positioned at the first key in
+ // restart interval `index + 1`.
+ // If is_index_key_result is true, it positions the iterator at the first key
+ // in this restart interval.
+ // Per key-value checksum verification is done for all keys scanned
+ // up to but not including the last key (the key that current_ points to
+ // when this function returns). This key's checksum is verified in
+ // UpdateKey().
void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
bool is_index_key_result);
};
@@ -494,22 +653,17 @@ class DataBlockIter final : public BlockIter<Slice> {
public:
DataBlockIter()
: BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
- DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts,
- uint32_t num_restarts, SequenceNumber global_seqno,
- BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
- DataBlockHashIndex* data_block_hash_index)
- : DataBlockIter() {
- Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno,
- read_amp_bitmap, block_contents_pinned, data_block_hash_index);
- }
void Initialize(const Comparator* raw_ucmp, const char* data,
uint32_t restarts, uint32_t num_restarts,
SequenceNumber global_seqno,
BlockReadAmpBitmap* read_amp_bitmap,
bool block_contents_pinned,
- DataBlockHashIndex* data_block_hash_index) {
+ DataBlockHashIndex* data_block_hash_index,
+ uint8_t protection_bytes_per_key, const char* kv_checksum,
+ uint32_t block_restart_interval) {
InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
- block_contents_pinned);
+ block_contents_pinned, protection_bytes_per_key, kv_checksum,
+ block_restart_interval);
raw_key_.SetIsUserKey(false);
read_amp_bitmap_ = read_amp_bitmap;
last_bitmap_offset_ = current_ + 1;
@@ -527,7 +681,11 @@ class DataBlockIter final : public BlockIter<Slice> {
return value_;
}
+ // Returns whether `target` may exist.
inline bool SeekForGet(const Slice& target) {
+#ifndef NDEBUG
+ if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true;
+#endif
if (!data_block_hash_index_) {
SeekImpl(target);
UpdateKey();
@@ -599,11 +757,14 @@ class MetaBlockIter final : public BlockIter<Slice> {
public:
MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
- bool block_contents_pinned) {
+ bool block_contents_pinned, uint8_t protection_bytes_per_key,
+ const char* kv_checksum, uint32_t block_restart_interval) {
// Initializes the iterator with a BytewiseComparator and
// the raw key being a user key.
InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
- kDisableGlobalSequenceNumber, block_contents_pinned);
+ kDisableGlobalSequenceNumber, block_contents_pinned,
+ protection_bytes_per_key, kv_checksum,
+ block_restart_interval);
raw_key_.SetIsUserKey(true);
}
@@ -613,12 +774,17 @@ class MetaBlockIter final : public BlockIter<Slice> {
}
protected:
+ friend Block;
void SeekToFirstImpl() override;
void SeekToLastImpl() override;
void SeekImpl(const Slice& target) override;
void SeekForPrevImpl(const Slice& target) override;
void NextImpl() override;
void PrevImpl() override;
+ // Meta index block's restart interval is always 1. See
+ // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval.
+ uint32_t GetRestartInterval() override { return 1; }
+ uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; }
};
class IndexBlockIter final : public BlockIter<IndexValue> {
@@ -633,9 +799,13 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
uint32_t restarts, uint32_t num_restarts,
SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
bool have_first_key, bool key_includes_seq,
- bool value_is_full, bool block_contents_pinned) {
+ bool value_is_full, bool block_contents_pinned,
+ uint8_t protection_bytes_per_key, const char* kv_checksum,
+ uint32_t block_restart_interval) {
InitializeBase(raw_ucmp, data, restarts, num_restarts,
- kDisableGlobalSequenceNumber, block_contents_pinned);
+ kDisableGlobalSequenceNumber, block_contents_pinned,
+ protection_bytes_per_key, kv_checksum,
+ block_restart_interval);
raw_key_.SetIsUserKey(!key_includes_seq);
prefix_index_ = prefix_index;
value_delta_encoded_ = !value_is_full;
@@ -666,11 +836,17 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
}
}
+ Slice raw_value() const {
+ assert(Valid());
+ return value_;
+ }
+
bool IsValuePinned() const override {
return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
}
protected:
+ friend Block;
// IndexBlockIter follows a different contract for prefix iterator
// from data iterators.
// If prefix of the seek key `target` exists in the file, it must
@@ -692,11 +868,8 @@ class IndexBlockIter final : public BlockIter {
}
void PrevImpl() override;
-
void NextImpl() override;
-
void SeekToFirstImpl() override;
-
void SeekToLastImpl() override;
private:
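
The `cur_entry_idx_` bookkeeping above relies on a simple invariant: every restart interval except the last holds exactly `block_restart_interval` keys, so an entry's index is recoverable from its restart point. A sketch of the arithmetic that `NumberOfKeys()` implements (helper name and parameters are illustrative):

```cpp
#include <cstdint>

// Illustrative: all restart intervals except the last are full, so only
// the tail interval needs an actual scan to be counted.
uint32_t TotalKeysInBlock(uint32_t num_restarts,
                          uint32_t block_restart_interval,
                          uint32_t keys_in_last_interval) {
  if (num_restarts == 0) {
    return 0;
  }
  return (num_restarts - 1) * block_restart_interval + keys_in_last_interval;
}
```

This is also why seeking to restart point `r` sets `cur_entry_idx_` to `r * block_restart_interval_ - 1`: the `NextImpl()` that follows both parses the restart key and increments the index onto it.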
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 2a8e44d1cb6..5121d1b43d7 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -450,7 +450,12 @@ struct BlockBasedTableBuilder::Rep {
table_options, data_block)),
create_context(&table_options, ioptions.stats,
compression_type == kZSTD ||
- compression_type == kZSTDNotFinalCompression),
+ compression_type == kZSTDNotFinalCompression,
+ tbo.moptions.block_protection_bytes_per_key,
+ tbo.internal_comparator.user_comparator(),
+ !use_delta_encoding_for_index_values,
+ table_opt.index_type ==
+ BlockBasedTableOptions::kBinarySearchWithFirstKey),
status_ok(true),
io_status_ok(true) {
if (tbo.target_file_size == 0) {
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index dc852e543cd..2bca0703327 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -567,7 +567,8 @@ Status BlockBasedTableFactory::NewTableReader(
return BlockBasedTable::Open(
ro, table_reader_options.ioptions, table_reader_options.env_options,
table_options_, table_reader_options.internal_comparator, std::move(file),
- file_size, table_reader, table_reader_cache_res_mgr_,
+ file_size, table_reader_options.block_protection_bytes_per_key,
+ table_reader, table_reader_cache_res_mgr_,
table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
table_reader_options.skip_filters, table_reader_options.level,
table_reader_options.immortal, table_reader_options.largest_seqno,
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 0ed42348f0c..b5144166198 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -560,6 +560,7 @@ Status BlockBasedTable::Open(
const EnvOptions& env_options, const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator,
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ uint8_t block_protection_bytes_per_key,
std::unique_ptr<TableReader>* table_reader,
std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
@@ -645,6 +646,7 @@ Status BlockBasedTable::Open(
// meta-block reads.
rep->compression_dict_handle = BlockHandle::NullBlockHandle();
+ rep->create_context.protection_bytes_per_key = block_protection_bytes_per_key;
// Read metaindex
std::unique_ptr<BlockBasedTable> new_table(
new BlockBasedTable(rep, block_cache_tracer));
@@ -671,9 +673,11 @@ Status BlockBasedTable::Open(
CompressionTypeToString(kZSTD) ||
rep->table_properties->compression_name ==
CompressionTypeToString(kZSTDNotFinalCompression));
- rep->create_context =
- BlockCreateContext(&rep->table_options, rep->ioptions.stats,
- blocks_definitely_zstd_compressed);
+ rep->create_context = BlockCreateContext(
+ &rep->table_options, rep->ioptions.stats,
+ blocks_definitely_zstd_compressed, block_protection_bytes_per_key,
+ rep->internal_comparator.user_comparator(), rep->index_value_is_full,
+ rep->index_has_first_key);
// Check expected unique id if provided
if (expected_unique_id != kNullUniqueId64x2) {
@@ -2168,6 +2172,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
}
}
s = biter.status();
+ if (!s.ok()) {
+ break;
+ }
}
// Write the block cache access record.
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
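
The early `break` added to `Get()` matters because a per key-value checksum mismatch invalidates the iterator with a corruption status rather than making the key look absent. A sketch of the caller-side pattern this implies (function and variable names are illustrative):

```cpp
#include "rocksdb/iterator.h"
#include "rocksdb/status.h"

// Illustrative: always check the iterator's status after a scan, since a
// checksum failure surfaces as Status::Corruption with Valid() == false,
// not as "key not found".
rocksdb::Status ScanAll(rocksdb::Iterator* iter) {
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // ... consume iter->key() / iter->value() ...
  }
  return iter->status();
}
```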
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index df296a0d3d5..dafaa4ebf85 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -98,6 +98,7 @@ class BlockBasedTable : public TableReader {
const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_key_comparator,
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ uint8_t block_protection_bytes_per_key,
std::unique_ptr<TableReader>* table_reader,
std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
nullptr,
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index eb1175a7d43..a6ee940d801 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -116,8 +116,9 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
bool prefetch_index_and_filter_in_cache = true,
Status* status = nullptr) {
const MutableCFOptions moptions(options_);
- TableReaderOptions table_reader_options = TableReaderOptions(
- ioptions, moptions.prefix_extractor, EnvOptions(), comparator);
+ TableReaderOptions table_reader_options =
+ TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
+ comparator, 0 /* block_protection_bytes_per_key */);
std::unique_ptr<RandomAccessFileReader> file;
NewFileReader(table_name, foptions, &file);
diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc
index 318d30d84e3..a252899d24a 100644
--- a/table/block_based/block_cache.cc
+++ b/table/block_based/block_cache.cc
@@ -11,17 +11,25 @@ void BlockCreateContext::Create(std::unique_ptr<Block_kData>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kData(
std::move(block), table_options->read_amp_bytes_per_bit, statistics));
+ parsed_out->get()->InitializeDataBlockProtectionInfo(protection_bytes_per_key,
+ raw_ucmp);
}
void BlockCreateContext::Create(std::unique_ptr<Block_kIndex>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kIndex(std::move(block),
/*read_amp_bytes_per_bit*/ 0, statistics));
+ parsed_out->get()->InitializeIndexBlockProtectionInfo(
+ protection_bytes_per_key, raw_ucmp, index_value_is_full,
+ index_has_first_key);
}
void BlockCreateContext::Create(
std::unique_ptr<Block_kFilterPartitionIndex>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kFilterPartitionIndex(
std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics));
+ parsed_out->get()->InitializeIndexBlockProtectionInfo(
+ protection_bytes_per_key, raw_ucmp, index_value_is_full,
+ index_has_first_key);
}
void BlockCreateContext::Create(
std::unique_ptr<Block_kRangeDeletion>* parsed_out, BlockContents&& block) {
@@ -32,6 +40,8 @@ void BlockCreateContext::Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
BlockContents&& block) {
parsed_out->reset(new Block_kMetaIndex(
std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics));
+ parsed_out->get()->InitializeMetaIndexBlockProtectionInfo(
+ protection_bytes_per_key);
}
void BlockCreateContext::Create(
diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h
index ec39405fe54..00eaface370 100644
--- a/table/block_based/block_cache.h
+++ b/table/block_based/block_cache.h
@@ -70,14 +70,26 @@ class Block_kMetaIndex : public Block {
struct BlockCreateContext : public Cache::CreateContext {
BlockCreateContext() {}
BlockCreateContext(const BlockBasedTableOptions* _table_options,
- Statistics* _statistics, bool _using_zstd)
+ Statistics* _statistics, bool _using_zstd,
+ uint8_t _protection_bytes_per_key,
+ const Comparator* _raw_ucmp,
+ bool _index_value_is_full = false,
+ bool _index_has_first_key = false)
: table_options(_table_options),
statistics(_statistics),
- using_zstd(_using_zstd) {}
+ using_zstd(_using_zstd),
+ protection_bytes_per_key(_protection_bytes_per_key),
+ raw_ucmp(_raw_ucmp),
+ index_value_is_full(_index_value_is_full),
+ index_has_first_key(_index_has_first_key) {}
const BlockBasedTableOptions* table_options = nullptr;
Statistics* statistics = nullptr;
bool using_zstd = false;
+ uint8_t protection_bytes_per_key = 0;
+ const Comparator* raw_ucmp = nullptr;
+ bool index_value_is_full;
+ bool index_has_first_key;
// For TypedCacheInterface
template <typename TBlocklike>
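
For reference, a construction sketch showing how the new `BlockCreateContext` fields get populated, mirroring `BlockBasedTable::Open()` above (the surrounding variables `table_options`, `stats`, and `ucmp` are assumed to be in scope):

```cpp
// Sketch: wiring per key-value protection into block creation. A context
// constructed this way makes every block parsed through it initialize
// its checksum array.
BlockCreateContext ctx(&table_options, stats, /*_using_zstd=*/false,
                       /*_protection_bytes_per_key=*/8, ucmp,
                       /*_index_value_is_full=*/true,
                       /*_index_has_first_key=*/false);
```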
diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc
index 83b87fe79e8..90a47ef2cc1 100644
--- a/table/block_based/block_test.cc
+++ b/table/block_based/block_test.cc
@@ -15,6 +15,7 @@
#include <utility>
#include <vector>
+#include "db/db_test_util.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
@@ -506,7 +507,7 @@ class IndexBlockTest
void GenerateRandomIndexEntries(std::vector<std::string> *separators,
std::vector<BlockHandle> *block_handles,
std::vector<std::string> *first_keys,
- const int len) {
+ const int len, bool zero_seqno = false) {
Random rnd(42);
// For each of `len` blocks, we need to generate a first and last key.
@@ -514,7 +515,11 @@ void GenerateRandomIndexEntries(std::vector<std::string> *separators,
std::set<std::string>