Skip to content

Commit

Permalink
Merge 'config: add index_cache_fraction' from Michał Chojnowski
Browse files Browse the repository at this point in the history
Index caching was disabled by default because it caused performance regressions
for some small-partition workloads. See #11202.

However, it also means that there are workloads which could benefit from the
index cache, but (by default) don't.

As a compromise, we can set a default limit on the memory usage of index cache,
which should be small enough to avoid catastrophic regressions in
small-partition workloads, but big enough to accommodate workloads where
index cache is obviously beneficial.

This series adds such a configurable limit, sets it to to 0.2 of total cache memory by default,
and re-enables index caching by default.

Fixes #15118

Closes #14994

* github.com:scylladb/scylladb:
  test: boost/cache_algorithm_test: add cache_algorithm_test
  sstables: partition_index_cache: deglobalize stats
  utils: cached_file: deglobalize cached_file metrics
  db: config: enable index caching by default
  config: add index_cache_fraction
  utils: lru: add move semantics to list links
  • Loading branch information
avikivity committed Sep 3, 2023
2 parents a5448fa + bcc235a commit 9a3d572
Show file tree
Hide file tree
Showing 20 changed files with 450 additions and 145 deletions.
1 change: 1 addition & 0 deletions configure.py
Expand Up @@ -359,6 +359,7 @@ def find_headers(repodir, excluded_dirs):
'test/boost/big_decimal_test',
'test/boost/broken_sstable_test',
'test/boost/bytes_ostream_test',
'test/boost/cache_algorithm_test',
'test/boost/cache_flat_mutation_reader_test',
'test/boost/cached_file_test',
'test/boost/caching_options_test',
Expand Down
13 changes: 11 additions & 2 deletions db/cache_tracker.hh
Expand Up @@ -10,8 +10,11 @@

#include "utils/lru.hh"
#include "utils/logalloc.hh"
#include "utils/updateable_value.hh"
#include "mutation/partition_version.hh"
#include "mutation/mutation_cleaner.hh"
#include "utils/cached_file_stats.hh"
#include "sstables/partition_index_cache_stats.hh"

#include <seastar/core/metrics_registration.hh>

Expand Down Expand Up @@ -77,18 +80,22 @@ public:
};
private:
stats _stats{};
cached_file_stats _index_cached_file_stats{};
partition_index_cache_stats _partition_index_cache_stats{};
seastar::metrics::metric_groups _metrics;
logalloc::region _region;
lru _lru;
mutation_cleaner _garbage;
mutation_cleaner _memtable_cleaner;
mutation_application_stats& _app_stats;
utils::updateable_value<double> _index_cache_fraction;
private:
void setup_metrics();
public:
using register_metrics = bool_class<class register_metrics_tag>;
cache_tracker(mutation_application_stats&, register_metrics);
cache_tracker(register_metrics = register_metrics::no);
cache_tracker(utils::updateable_value<double> index_cache_fraction, mutation_application_stats&, register_metrics);
cache_tracker(utils::updateable_value<double> index_cache_fraction, register_metrics);
cache_tracker();
~cache_tracker();
void clear();
void touch(rows_entry&);
Expand Down Expand Up @@ -130,6 +137,8 @@ public:
stats& get_stats() noexcept { return _stats; }
void set_compaction_scheduling_group(seastar::scheduling_group);
lru& get_lru() { return _lru; }
cached_file_stats& get_index_cached_file_stats() { return _index_cached_file_stats; }
partition_index_cache_stats& get_partition_index_cache_stats() { return _partition_index_cache_stats; }
seastar::memory::reclaiming_result evict_from_lru_shallow() noexcept;
};

Expand Down
6 changes: 4 additions & 2 deletions db/config.cc
Expand Up @@ -1002,8 +1002,10 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, task_ttl_seconds(this, "task_ttl_in_seconds", liveness::LiveUpdate, value_status::Used, 10, "Time for which information about finished task stays in memory.")
, nodeops_watchdog_timeout_seconds(this, "nodeops_watchdog_timeout_seconds", liveness::LiveUpdate, value_status::Used, 120, "Time in seconds after which node operations abort when not hearing from the coordinator")
, nodeops_heartbeat_interval_seconds(this, "nodeops_heartbeat_interval_seconds", liveness::LiveUpdate, value_status::Used, 10, "Period of heartbeat ticks in node operations")
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, false,
"Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
"Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions. The amount of memory usable by index cache is limited with `index_cache_fraction`.")
, index_cache_fraction(this, "index_cache_fraction", liveness::LiveUpdate, value_status::Used, 0.2,
"The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
, consistent_cluster_management(this, "consistent_cluster_management", value_status::Used, true, "Use RAFT for cluster management and DDL")
, wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory")
, wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache")
Expand Down
1 change: 1 addition & 0 deletions db/config.hh
Expand Up @@ -424,6 +424,7 @@ public:
named_value<uint32_t> nodeops_heartbeat_interval_seconds;

named_value<bool> cache_index_pages;
named_value<double> index_cache_fraction;

named_value<bool> consistent_cluster_management;

Expand Down
2 changes: 1 addition & 1 deletion replica/database.cc
Expand Up @@ -352,7 +352,7 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
std::numeric_limits<size_t>::max(),
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value(std::numeric_limits<uint32_t>::max()))
, _row_cache_tracker(cache_tracker::register_metrics::yes)
, _row_cache_tracker(_cfg.index_cache_fraction.operator utils::updateable_value<double>(), cache_tracker::register_metrics::yes)
, _apply_stage("db_apply", &database::do_apply)
, _version(empty_version)
, _compaction_manager(cm)
Expand Down
50 changes: 46 additions & 4 deletions row_cache.cc
Expand Up @@ -25,6 +25,7 @@
#include "cache_flat_mutation_reader.hh"
#include "partition_snapshot_reader.hh"
#include "clustering_key_filter.hh"
#include "utils/updateable_value.hh"

namespace cache {

Expand All @@ -51,17 +52,23 @@ row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, con
}

static thread_local mutation_application_stats dummy_app_stats;
static thread_local utils::updateable_value<double> dummy_index_cache_fraction(1.0);

cache_tracker::cache_tracker(register_metrics with_metrics)
: cache_tracker(dummy_app_stats, with_metrics)
cache_tracker::cache_tracker()
: cache_tracker(dummy_index_cache_fraction, dummy_app_stats, register_metrics::no)
{}

cache_tracker::cache_tracker(utils::updateable_value<double> index_cache_fraction, register_metrics with_metrics)
: cache_tracker(std::move(index_cache_fraction), dummy_app_stats, with_metrics)
{}

static thread_local cache_tracker* current_tracker;

cache_tracker::cache_tracker(mutation_application_stats& app_stats, register_metrics with_metrics)
cache_tracker::cache_tracker(utils::updateable_value<double> index_cache_fraction, mutation_application_stats& app_stats, register_metrics with_metrics)
: _garbage(_region, this, app_stats)
, _memtable_cleaner(_region, nullptr, app_stats)
, _app_stats(app_stats)
, _index_cache_fraction(std::move(index_cache_fraction))
{
if (with_metrics) {
setup_metrics();
Expand All @@ -78,7 +85,35 @@ cache_tracker::cache_tracker(mutation_application_stats& app_stats, register_met
return memory::reclaiming_result::reclaimed_something;
}
current_tracker = this;
return _lru.evict();

// Cache replacement algorithm:
//
// if sstable index caches occupy more than index_cache_fraction of cache memory:
// evict the least recently used index entry
// else:
// evict the least recently used entry (data or index)
//
// This algorithm has the following good properties:
// 1. When index and data entries contend for cache space, it prevents
// index cache from taking more than index_cache_fraction of memory.
// This makes sure that the index cache doesn't catastrophically
// deprive the data cache of memory in small-partition workloads.
// 2. Since it doesn't enforce a lower limit on the index space, but only
// the upper limit, the parameter shouldn't require careful balancing.
// In workloads where it makes sense to cache the index (usually: where
// the index is small enough to fit in RAM), setting the fraction to any big number (e.g. 1.0)
// should work well enough. In workloads where it doesn't make sense to cache the index,
// setting the fraction to any small number (e.g. 0.0) should work well enough.
// Setting it to a medium number (something like 0.2) should work well enough
// for both extremes, although it might be suboptimal for non-extremes.
// 3. The parameter is trivially live-updateable.
//
// Perhaps this logic should be encapsulated somewhere else, maybe in `class lru` itself.
size_t total_cache_space = _region.occupancy().total_space();
size_t index_cache_space = _partition_index_cache_stats.used_bytes + _index_cached_file_stats.cached_bytes;
bool should_evict_index = index_cache_space > total_cache_space * _index_cache_fraction.get();

return _lru.evict(should_evict_index);
});
});
}
Expand All @@ -99,6 +134,11 @@ void cache_tracker::set_compaction_scheduling_group(seastar::scheduling_group sg
_garbage.set_scheduling_group(sg);
}

namespace sstables {
void register_index_page_cache_metrics(seastar::metrics::metric_groups&, cached_file_stats&);
void register_index_page_metrics(seastar::metrics::metric_groups&, partition_index_cache_stats&);
};

void
cache_tracker::setup_metrics() {
namespace sm = seastar::metrics;
Expand Down Expand Up @@ -146,6 +186,8 @@ cache_tracker::setup_metrics() {
sm::make_counter("rows_compacted_away", _stats.rows_compacted_away,
sm::description("total amount of compacted and removed rows during read")),
});
sstables::register_index_page_cache_metrics(_metrics, _index_cached_file_stats);
sstables::register_index_page_metrics(_metrics, _partition_index_cache_stats);
}

void cache_tracker::clear() {
Expand Down
6 changes: 3 additions & 3 deletions sstables/index_reader.hh
Expand Up @@ -22,7 +22,6 @@
namespace sstables {

extern seastar::logger sstlog;
extern thread_local cached_file::metrics index_page_cache_metrics;
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;

// Promoted index information produced by the parser.
Expand Down Expand Up @@ -337,7 +336,7 @@ std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstab
seastar::shared_ptr<cached_file> cached_file_ptr = caching
? sst->_cached_index_file
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
index_page_cache_metrics,
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
sst->manager().get_cache_tracker().get_lru(),
sst->manager().get_cache_tracker().region(),
sst->_index_file_size);
Expand Down Expand Up @@ -779,7 +778,8 @@ public:
, _trace_state(std::move(trace_state))
, _local_index_cache(caching ? nullptr
: std::make_unique<partition_index_cache>(_sstable->manager().get_cache_tracker().get_lru(),
_sstable->manager().get_cache_tracker().region()))
_sstable->manager().get_cache_tracker().region(),
_sstable->manager().get_cache_tracker().get_partition_index_cache_stats()))
, _index_cache(caching ? *_sstable->_index_cache : *_local_index_cache)
, _region(_sstable->manager().get_cache_tracker().region())
, _use_caching(caching)
Expand Down
34 changes: 13 additions & 21 deletions sstables/partition_index_cache.hh
Expand Up @@ -19,6 +19,7 @@
#include "utils/bptree.hh"
#include "utils/lru.hh"
#include "utils/lsa/weak_ptr.hh"
#include "sstables/partition_index_cache_stats.hh"

namespace sstables {

Expand All @@ -32,7 +33,7 @@ public:
using key_type = uint64_t;
private:
// Allocated inside LSA
class entry : public evictable, public lsa::weakly_referencable<entry> {
class entry final : public index_evictable, public lsa::weakly_referencable<entry> {
public:
partition_index_cache* _parent;
key_type _key;
Expand Down Expand Up @@ -76,15 +77,6 @@ private:
key_type key() const { return _key; }
};
public:
static thread_local struct stats {
uint64_t hits = 0; // Number of times entry was found ready
uint64_t misses = 0; // Number of times entry was not found
uint64_t blocks = 0; // Number of times entry was not ready (>= misses)
uint64_t evictions = 0; // Number of times entry was evicted
uint64_t populations = 0; // Number of times entry was inserted
uint64_t used_bytes = 0; // Number of bytes entries occupy in memory
} _shard_stats;

struct key_less_comparator {
bool operator()(key_type lhs, key_type rhs) const noexcept {
return lhs < rhs;
Expand Down Expand Up @@ -161,13 +153,15 @@ private:
logalloc::region& _region;
logalloc::allocating_section _as;
lru& _lru;
partition_index_cache_stats& _stats;
public:

// Create a cache with a given LRU attached.
partition_index_cache(lru& lru_, logalloc::region& r)
partition_index_cache(lru& lru_, logalloc::region& r, partition_index_cache_stats& stats)
: _cache(key_less_comparator())
, _region(r)
, _lru(lru_)
, _stats(stats)
{ }

~partition_index_cache() {
Expand Down Expand Up @@ -198,18 +192,18 @@ public:
entry& cp = *i;
auto ptr = share(cp);
if (cp.ready()) {
++_shard_stats.hits;
++_stats.hits;
return make_ready_future<entry_ptr>(std::move(ptr));
} else {
++_shard_stats.blocks;
++_stats.blocks;
return ptr.get_entry().promise()->get_shared_future().then([ptr] () mutable {
return std::move(ptr);
});
}
}

++_shard_stats.misses;
++_shard_stats.blocks;
++_stats.misses;
++_stats.blocks;

entry_ptr ptr = _as(_region, [&] {
return with_allocator(_region.allocator(), [&] {
Expand All @@ -233,8 +227,8 @@ public:
partition_index_page&& page = f.get0();
e.promise()->set_value();
e.set_page(std::move(page));
_shard_stats.used_bytes += e.size_in_allocator();
++_shard_stats.populations;
_stats.used_bytes += e.size_in_allocator();
++_stats.populations;
return ptr;
} catch (...) {
e.promise()->set_exception(std::current_exception());
Expand All @@ -248,12 +242,10 @@ public:
}

void on_evicted(entry& p) {
_shard_stats.used_bytes -= p.size_in_allocator();
++_shard_stats.evictions;
_stats.used_bytes -= p.size_in_allocator();
++_stats.evictions;
}

static const stats& shard_stats() { return _shard_stats; }

// Evicts all unreferenced entries.
future<> evict_gently() {
auto i = _cache.begin();
Expand Down
20 changes: 20 additions & 0 deletions sstables/partition_index_cache_stats.hh
@@ -0,0 +1,20 @@
/*
* Copyright (C) 2023 ScyllaDB
*/

/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

#pragma once

#include <cstdint>

struct partition_index_cache_stats {
uint64_t hits = 0; // Number of times entry was found ready
uint64_t misses = 0; // Number of times entry was not found
uint64_t blocks = 0; // Number of times entry was not ready (>= misses)
uint64_t evictions = 0; // Number of times entry was evicted
uint64_t populations = 0; // Number of times entry was inserted
uint64_t used_bytes = 0; // Number of bytes entries occupy in memory
};

0 comments on commit 9a3d572

Please sign in to comment.