Merge 'logalloc, reader_concurrency_semaphore: cooperate on OOM kills…

…' from Botond Dénes Consider the following code snippet: ```c++ future<> foo() { semaphore.consume(1024); } future<> bar() { return _allocating_section([&] { foo(); }); } ``` If the consumed memory triggers the OOM kill limit, the semaphore will throw `std::bad_alloc`. The allocating section will catch this, bump std reserves and retry the lambda. Bumping the reserves will not do anything to prevent the next call to `consume()` from triggering the kill limit. So this cycle will repeat until std reserves are so large that ensuring the reserve fails. At this point LSA gives up and re-throws the `std::bad_alloc`. Beyond the useless time spent on code that is doomed to fail, this also results in expensive LSA compaction and eviction of the cache (while trying to ensure reserves). Prevent this situation by throwing a distinct exception type which is derived from `std::bad_alloc`. Allocating section will not retry on seeing this exception. A test reproducing the bug is also added. Fixes: #15278 Closes #15581 * github.com:scylladb/scylladb: test/boost/row_cache_test: add test_cache_reader_semaphore_oom_kill utils/logalloc: handle utils::memory_limit_reached in with_reclaiming_disabled() reader_concurrency_semaphore: use utils::memory_limit_reached exception utils: add memory_limit_reached exception
scylladb · Oct 5, 2023 · e600f35 · e600f35
2 parents 967faa9 + 08c0456
commit e600f35
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 1 deletion.
diff --git a/reader_concurrency_semaphore.cc b/reader_concurrency_semaphore.cc
@@ -21,6 +21,7 @@
 #include "utils/exceptions.hh"
 #include "schema/schema.hh"
 #include "utils/human_readable.hh"
+#include "utils/memory_limit_reached.hh"
 
 logger rcslog("reader_concurrency_semaphore");
 
@@ -956,7 +957,7 @@ void reader_concurrency_semaphore::consume(reader_permit::impl& permit, resource
             ++_stats.total_reads_killed_due_to_kill_limit;
         }
         maybe_dump_reader_permit_diagnostics(*this, "kill limit triggered");
-        throw std::bad_alloc();
+        throw utils::memory_limit_reached(format("kill limit triggered on semaphore {} by permit {}", _name, permit.description()));
     }
     _resources -= r;
 }

diff --git a/test/boost/row_cache_test.cc b/test/boost/row_cache_test.cc
@@ -4697,3 +4697,43 @@ SEASTAR_TEST_CASE(test_compact_range_tombstones_on_read) {
         BOOST_REQUIRE(tracker_stats.rows_compacted_away == 2);
     });
 }
+
+// Reproduces #15278
+// Check that the semaphore's OOM kill doesn't send LSA allocating sections
+// into a tailspin, retrying the failing code, with increase reserves, which
+// of course doesn't necessarily help release pressure on the semaphore.
+SEASTAR_THREAD_TEST_CASE(test_cache_reader_semaphore_oom_kill) {
+    simple_schema s;
+    reader_concurrency_semaphore semaphore(100, 1, get_name(), std::numeric_limits<size_t>::max(), utils::updateable_value<uint32_t>(1),
+            utils::updateable_value<uint32_t>(1));
+    auto stop_semaphore = deferred_stop(semaphore);
+
+    cache_tracker tracker;
+    auto cache_mt = make_lw_shared<replica::memtable>(s.schema());
+    row_cache cache(s.schema(), snapshot_source_from_snapshot(cache_mt->as_data_source()), tracker);
+
+    auto pk = s.make_pkey(0);
+
+    mutation m(s.schema(), pk);
+    s.add_row(m, s.make_ckey(0), sstring(1024, '0'));
+    cache.populate(m);
+
+    auto pr = dht::partition_range::make_singular(pk);
+    tombstone_gc_state gc_state(nullptr);
+
+    BOOST_REQUIRE_EQUAL(semaphore.get_stats().total_reads_killed_due_to_kill_limit, 0);
+    auto kill_limit_before = 0;
+
+    // Check different amounts of memory consumed before the read, so the OOM kill is triggered in different places.
+    for (unsigned memory = 1; memory <= 512; memory *= 2) {
+        semaphore.set_resources({1, memory});
+        auto permit = semaphore.obtain_permit(s.schema().get(), "read", 0, db::no_timeout, {}).get();
+        auto create_reader_and_read_all = [&] {
+            auto rd = cache.make_reader(s.schema(), permit, pr, &gc_state);
+            auto close_rd = deferred_close(rd);
+            while (rd().get());
+        };
+        BOOST_REQUIRE_THROW(create_reader_and_read_all(), utils::memory_limit_reached);
+        BOOST_REQUIRE_EQUAL(semaphore.get_stats().total_reads_killed_due_to_kill_limit, ++kill_limit_before);
+    }
+}
diff --git a/utils/logalloc.hh b/utils/logalloc.hh
@@ -19,6 +19,7 @@
 #include "seastarx.hh"
 #include "db/timeout_clock.hh"
 #include "utils/entangled.hh"
+#include "utils/memory_limit_reached.hh"
 
 namespace logalloc {
 
@@ -499,6 +500,11 @@ public:
                 logalloc::reclaim_lock _(r);
                 memory::disable_abort_on_alloc_failure_temporarily dfg;
                 return fn();
+            } catch (const utils::memory_limit_reached&) {
+                // Do not retry, bumping reserves won't help.
+                // The read reached a memory limit in the semaphore and is being
+                // terminated.
+                throw;
             } catch (const std::bad_alloc&) {
                 on_alloc_failure(r);
             }

diff --git a/utils/memory_limit_reached.hh b/utils/memory_limit_reached.hh
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2023-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+#pragma once
+
+#include <string>
+
+namespace utils {
+
+// An exception thrown when a certain process or task is being terminated because
+// it has reached the memory limit alloted for said task or task group.
+// This is distinct from a regular bad-alloc in that possibly there is still
+// memory available but not for the task being terminated.
+// Allows code like LSA to tell real alloc failure from artificial one and act
+// accordingly (not retry the task).
+class memory_limit_reached : public std::bad_alloc {
+    std::string _msg;
+public:
+    memory_limit_reached(std::string_view msg) : _msg(msg) { }
+    const char* what() const noexcept override { return _msg.c_str(); }
+};
+
+} // namespace utils