Skip to content

Commit

Permalink
Merge '[Backport 5.2] : Reload reclaimed bloom filters when memory is…
Browse files Browse the repository at this point in the history
… available ' from Lakshmi Narayanan Sreethar

PR #17771 introduced a threshold for the total memory used by all bloom filters across SSTables. When the total usage surpasses the threshold, the largest bloom filter will be removed from memory, bringing the total usage back under the threshold. This PR adds support for reloading such reclaimed bloom filters back into memory when memory becomes available (i.e., within the 10% of available memory earmarked for the reclaimable components).

The SSTables manager now maintains a list of all SSTables whose bloom filter was removed from memory and attempts to reload them when an SSTable, whose bloom filter is still in memory, gets deleted. The manager reloads from the smallest to the largest bloom filter to maximize the number of filters being reloaded into memory.

Backported from #18186 to 5.2.

Closes #18666

* github.com:scylladb/scylladb:
  sstable_datafile_test: add testcase to test reclaim during reload
  sstable_datafile_test: add test to verify auto reload of reclaimed components
  sstables_manager: reload previously reclaimed components when memory is available
  sstables_manager: start a fiber to reload components
  sstable_directory_test: fix generation in sstable_directory_test_table_scan_incomplete_sstables
  sstable_datafile_test: add test to verify reclaimed components reload
  sstables: support reloading reclaimed components
  sstables_manager: add new intrusive set to track the reclaimed sstables
  sstable: add link and comparator class to support new intrusive set
  sstable: renamed intrusive list link type
  sstable: track memory reclaimed from components per sstable
  sstable: rename local variable in sstable::total_reclaimable_memory_size
  • Loading branch information
denesb committed May 30, 2024
2 parents 45814c7 + d4c523e commit e89eb41
Show file tree
Hide file tree
Showing 9 changed files with 212 additions and 16 deletions.
2 changes: 1 addition & 1 deletion scylla-gdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4118,7 +4118,7 @@ def find_sstables():
system_sstables_manager = std_unique_ptr(db["_system_sstables_manager"]).get()
for manager in (user_sstables_manager, system_sstables_manager):
for sst_list_name in ("_active", "_undergoing_close"):
for sst in intrusive_list(manager[sst_list_name], link="_manager_link"):
for sst in intrusive_list(manager[sst_list_name], link="_manager_list_link"):
yield sst.address
except gdb.error:
# Scylla Enterprise 2020.1 compatibility
Expand Down
26 changes: 23 additions & 3 deletions sstables/sstables.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/coroutine/as_future.hh>

#include "utils/error_injection.hh"
#include "dht/sharder.hh"
#include "types.hh"
#include "writer.hh"
Expand Down Expand Up @@ -1510,20 +1511,39 @@ size_t sstable::total_reclaimable_memory_size() const {
}

size_t sstable::reclaim_memory_from_components() {
size_t total_memory_reclaimed = 0;
size_t memory_reclaimed_this_iteration = 0;

if (_components->filter) {
auto filter_memory_size = _components->filter->memory_size();
if (filter_memory_size > 0) {
// Discard it from memory by replacing it with an always present variant.
// No need to remove it from _recognized_components as the filter is still in disk.
_components->filter = std::make_unique<utils::filter::always_present_filter>();
total_memory_reclaimed += filter_memory_size;
memory_reclaimed_this_iteration += filter_memory_size;
}
}

_total_reclaimable_memory.reset();
return total_memory_reclaimed;
_total_memory_reclaimed += memory_reclaimed_this_iteration;
return memory_reclaimed_this_iteration;
}

// Returns the number of bytes reclaimed from this sstable's components that
// have not been reloaded yet: reclaim_memory_from_components() adds to the
// counter and reload_reclaimed_components() subtracts from it.
size_t sstable::total_memory_reclaimed() const {
return _total_memory_reclaimed;
}

// Reloads the bloom filter of this sstable if its memory was reclaimed earlier.
//
// pc: I/O priority class passed through to read_filter().
//
// Does nothing when no memory is currently accounted as reclaimed. Otherwise
// the filter is read again, the cached reclaimable-memory size is invalidated
// and the size of the freshly loaded filter is deducted from the reclaimed
// memory counter.
future<> sstable::reload_reclaimed_components(const io_priority_class& pc) {
    const bool has_reclaimed_memory = _total_memory_reclaimed != 0;
    if (!has_reclaimed_memory) {
        co_return;
    }

    // Error injection point; presumably only has an effect when the
    // "reload_reclaimed_components/pause" injection is enabled (e.g. by tests).
    co_await utils::get_local_injector().inject("reload_reclaimed_components/pause", std::chrono::seconds{3});

    co_await read_filter(pc);

    // The filter is resident again: stop counting it as reclaimed and drop the
    // cached reclaimable size so that it gets recomputed.
    const auto reloaded_filter_size = _components->filter->memory_size();
    _total_memory_reclaimed -= reloaded_filter_size;
    _total_reclaimable_memory.reset();

    sstlog.info("Reloaded bloom filter of {}", get_filename());
}

// This interface is only used during tests, snapshot loading and early initialization.
Expand Down
24 changes: 21 additions & 3 deletions sstables/sstables.hh
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ class sstable : public enable_lw_shared_from_this<sstable> {
public:
using version_types = sstable_version_types;
using format_types = sstable_format_types;
using manager_link_type = bi::list_member_hook<bi::link_mode<bi::auto_unlink>>;
using manager_list_link_type = bi::list_member_hook<bi::link_mode<bi::auto_unlink>>;
using manager_set_link_type = bi::set_member_hook<bi::link_mode<bi::auto_unlink>>;
public:
sstable(schema_ptr schema,
sstring dir,
Expand Down Expand Up @@ -576,7 +577,11 @@ private:
sstables_manager& _manager;

sstables_stats _stats;
manager_link_type _manager_link;
// link used by the _active list of sstables manager
manager_list_link_type _manager_list_link;
// link used by the _reclaimed set of sstables manager
manager_set_link_type _manager_set_link;


// The _large_data_stats map stores e.g. largest partitions, rows, cells sizes,
// and max number of rows in a partition.
Expand All @@ -590,6 +595,8 @@ private:
// It is initialized to 0 to prevent the sstables manager from reclaiming memory
// from the components before the SSTable has been fully loaded.
mutable std::optional<size_t> _total_reclaimable_memory{0};
// Total memory reclaimed so far from this sstable
size_t _total_memory_reclaimed{0};
public:
const bool has_component(component_type f) const;
sstables_manager& manager() { return _manager; }
Expand Down Expand Up @@ -668,11 +675,15 @@ private:

future<> create_data() noexcept;

// Note that only bloom filters are reclaimable by the following methods.
// Return the total reclaimable memory in this SSTable
size_t total_reclaimable_memory_size() const;
// Reclaim memory from the components back to the system.
// Note that only bloom filters are reclaimable.
size_t reclaim_memory_from_components();
// Return memory reclaimed so far from this sstable
size_t total_memory_reclaimed() const;
// Reload components from which memory was previously reclaimed
future<> reload_reclaimed_components(const io_priority_class& pc);

public:
// Finds first position_in_partition in a given partition.
Expand Down Expand Up @@ -917,6 +928,13 @@ public:
// Drops all evictable in-memory caches of on-disk content.
future<> drop_caches();

struct lesser_reclaimed_memory {
// comparator class to be used by the _reclaimed set in sstables manager
bool operator()(const sstable& sst1, const sstable& sst2) const {
return sst1.total_memory_reclaimed() < sst2.total_memory_reclaimed();
}
};

// Allow the test cases from sstable_test.cc to test private methods. We use
// a placeholder to avoid cluttering this class too much. The sstable_test class
// will then re-export as public every method it needs.
Expand Down
57 changes: 57 additions & 0 deletions sstables/sstables_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ sstables_manager::sstables_manager(
std::numeric_limits<size_t>::max())
, _dir_semaphore(dir_sem)
{
_components_reloader_status = components_reloader_fiber();
}

sstables_manager::~sstables_manager() {
Expand Down Expand Up @@ -87,16 +88,68 @@ void sstables_manager::increment_total_reclaimable_memory_and_maybe_reclaim(ssta
auto memory_reclaimed = sst_with_max_memory->reclaim_memory_from_components();
_total_memory_reclaimed += memory_reclaimed;
_total_reclaimable_memory -= memory_reclaimed;
_reclaimed.insert(*sst_with_max_memory);
smlogger.info("Reclaimed {} bytes of memory from SSTable components. Total memory reclaimed so far is {} bytes", memory_reclaimed, _total_memory_reclaimed);
}

// Returns how many more bytes of reclaimable components can be kept in memory
// before the configured threshold (a fraction of _available_memory) is reached.
// Returns 0 when the tracked usage is already at or above the threshold.
size_t sstables_manager::get_memory_available_for_reclaimable_components() {
    const size_t memory_reclaim_threshold = _available_memory * _db_config.components_memory_reclaim_threshold();
    // Both operands are unsigned: clamp instead of subtracting blindly, as the
    // difference would otherwise wrap around to a huge value and make the
    // caller believe that practically unlimited memory is available.
    if (_total_reclaimable_memory >= memory_reclaim_threshold) {
        return 0;
    }
    return memory_reclaim_threshold - _total_reclaimable_memory;
}

// Fiber that reloads previously reclaimed sstable components (bloom filters)
// back into memory.
//
// It sleeps on _sstable_deleted_event and, on every wake-up, reloads sstables
// from the _reclaimed set for as long as the next candidate fits into the
// memory reported by get_memory_available_for_reclaimable_components().
// The fiber exits once it is woken up with _closing set.
future<> sstables_manager::components_reloader_fiber() {
    sstlog.trace("components_reloader_fiber start");
    while (true) {
        co_await _sstable_deleted_event.when();

        if (_closing) {
            co_return;
        }

        // Reload bloom filters from the smallest to largest so as to maximize
        // the number of bloom filters being reloaded.
        auto memory_available = get_memory_available_for_reclaimable_components();
        while (!_reclaimed.empty() && memory_available > 0) {
            auto sstable_to_reload = _reclaimed.begin();
            const size_t reclaimed_memory = sstable_to_reload->total_memory_reclaimed();
            if (reclaimed_memory > memory_available) {
                // cannot reload any more sstables
                break;
            }

            // Use a lw_shared_ptr to prevent the sstable from getting deleted when
            // the components are being reloaded. Take it before erasing from the
            // set: erase() invalidates the iterator, so the sstable must only be
            // accessed through this pointer from here on.
            auto sstable_ptr = sstable_to_reload->shared_from_this();

            // Increment the total memory before reloading to prevent any parallel
            // fibers from loading new bloom filters into memory.
            _total_reclaimable_memory += reclaimed_memory;
            _reclaimed.erase(sstable_to_reload);
            try {
                co_await sstable_ptr->reload_reclaimed_components(default_priority_class());
            } catch (...) {
                // reload failed due to some reason
                sstlog.warn("Failed to reload reclaimed SSTable components : {}", std::current_exception());
                // revert back changes made before the reload
                _total_reclaimable_memory -= reclaimed_memory;
                _reclaimed.insert(*sstable_ptr);
                break;
            }

            _total_memory_reclaimed -= reclaimed_memory;
            memory_available = get_memory_available_for_reclaimable_components();
        }
    }
}

// Starts tracking sst in this manager by appending it to the _active list.
void sstables_manager::add(sstable* sst) {
_active.push_back(*sst);
}

void sstables_manager::deactivate(sstable* sst) {
// Remove SSTable from the reclaimable memory tracking
_total_reclaimable_memory -= sst->total_reclaimable_memory_size();
_total_memory_reclaimed -= sst->total_memory_reclaimed();
_reclaimed.erase(*sst);

// At this point, sst has a reference count of zero, since we got here from
// lw_shared_ptr_deleter<sstables::sstable>::dispose().
Expand All @@ -113,6 +166,7 @@ void sstables_manager::deactivate(sstable* sst) {
// Unlinks sst from the _undergoing_close list and frees it.
// sst must not be used by the caller after this call returns.
void sstables_manager::remove(sstable* sst) {
_undergoing_close.erase(_undergoing_close.iterator_to(*sst));
delete sst;
// Wake up the components reloader fiber, which waits on this event before
// trying to reload reclaimed components.
_sstable_deleted_event.signal();
maybe_done();
}

Expand All @@ -127,6 +181,9 @@ future<> sstables_manager::close() {
maybe_done();
co_await _done.get_future();
co_await _sstable_metadata_concurrency_sem.stop();
// stop the components reload fiber
_sstable_deleted_event.signal();
co_await std::move(_components_reloader_status);
}

sstable_directory::components_lister sstables_manager::get_components_lister(std::filesystem::path dir) {
Expand Down
14 changes: 13 additions & 1 deletion sstables/sstables_manager.hh
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,12 @@ static constexpr size_t default_sstable_buffer_size = 128 * 1024;

class sstables_manager {
using list_type = boost::intrusive::list<sstable,
boost::intrusive::member_hook<sstable, sstable::manager_link_type, &sstable::_manager_link>,
boost::intrusive::member_hook<sstable, sstable::manager_list_link_type, &sstable::_manager_list_link>,
boost::intrusive::constant_time_size<false>>;
using set_type = boost::intrusive::set<sstable,
boost::intrusive::member_hook<sstable, sstable::manager_set_link_type, &sstable::_manager_set_link>,
boost::intrusive::constant_time_size<false>,
boost::intrusive::compare<sstable::lesser_reclaimed_memory>>;
private:
size_t _available_memory;
db::large_data_handler& _large_data_handler;
Expand All @@ -70,6 +74,11 @@ private:
size_t _total_reclaimable_memory{0};
// Total memory reclaimed so far across all sstables
size_t _total_memory_reclaimed{0};
// Set of sstables from which memory has been reclaimed
set_type _reclaimed;
// Condition variable that gets notified when an sstable is deleted
seastar::condition_variable _sstable_deleted_event;
future<> _components_reloader_status = make_ready_future<>();

bool _closing = false;
promise<> _done;
Expand Down Expand Up @@ -131,6 +140,9 @@ private:
// memory and if the total memory usage exceeds the pre-defined threshold,
// reclaim it from the SSTable that has the most reclaimable memory.
void increment_total_reclaimable_memory_and_maybe_reclaim(sstable* sst);
// Fiber to reload reclaimed components back into memory when memory becomes available.
future<> components_reloader_fiber();
size_t get_memory_available_for_reclaimable_components();
private:
db::large_data_handler& get_large_data_handler() const {
return _large_data_handler;
Expand Down
Loading

0 comments on commit e89eb41

Please sign in to comment.