diff --git a/cache_streamed_mutation.hh b/cache_streamed_mutation.hh new file mode 100644 index 000000000000..dfb579ca3d9e --- /dev/null +++ b/cache_streamed_mutation.hh @@ -0,0 +1,482 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include +#include "row_cache.hh" +#include "mutation_reader.hh" +#include "streamed_mutation.hh" +#include "partition_version.hh" +#include "utils/logalloc.hh" +#include "query-request.hh" +#include "partition_snapshot_reader.hh" +#include "partition_snapshot_row_cursor.hh" +#include "read_context.hh" + +namespace cache { + +class lsa_manager { + row_cache& _cache; +public: + lsa_manager(row_cache& cache) : _cache(cache) { } + template + decltype(auto) run_in_read_section(const Func& func) { + return _cache._read_section(_cache._tracker.region(), [&func] () { + return with_linearized_managed_bytes([&func] () { + return func(); + }); + }); + } + template + decltype(auto) run_in_update_section(const Func& func) { + return _cache._update_section(_cache._tracker.region(), [&func] () { + return with_linearized_managed_bytes([&func] () { + return func(); + }); + }); + } + template + void run_in_update_section_with_allocator(Func&& func) { + return _cache._update_section(_cache._tracker.region(), [this, &func] () { + return with_linearized_managed_bytes([this, &func] () { + return with_allocator(_cache._tracker.region().allocator(), [this, &func] () mutable { + return func(); + }); + }); + }); + } + logalloc::region& region() { return _cache._tracker.region(); } + logalloc::allocating_section& read_section() { return _cache._read_section; } +}; + +class cache_streamed_mutation final : public streamed_mutation::impl { + lw_shared_ptr _snp; + position_in_partition::tri_compare _position_cmp; + + query::clustering_key_filter_ranges _ck_ranges; + query::clustering_row_ranges::const_iterator _ck_ranges_curr; + query::clustering_row_ranges::const_iterator _ck_ranges_end; + + lsa_manager _lsa_manager; + + stdx::optional _last_row_key; + + // We need to be prepared that we may get overlapping and out of order + // range tombstones. We must emit fragments with strictly monotonic positions, + // so we can't just trim such tombstones to the position of the last fragment. + // To solve that, range tombstones are accumulated first in a range_tombstone_stream + // and emitted once we have a fragment with a larger position. + range_tombstone_stream _tombstones; + + // Holds the lower bound of a position range which hasn't been processed yet. + // Only fragments with positions < _lower_bound have been emitted. 
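The comment above motivates buffering range tombstones rather than emitting them as they arrive. A minimal standalone sketch of that deferred-emission idea follows, using integer positions, a std::multimap and a batched drain, with invented names rather than the real range_tombstone_stream and position_in_partition types: tombstones may be added overlapping and out of order, but are only released once the reader is about to emit something at a larger position, so the emitted stream stays monotonic.

// Simplified model of deferred range-tombstone emission (not the actual
// range_tombstone_stream): tombstones arrive overlapping and out of order,
// and are released only when the reader reaches a larger position.
#include <iostream>
#include <map>
#include <vector>

struct toy_tombstone {
    int start;   // inclusive start position
    int end;     // exclusive end position
};

class toy_tombstone_stream {
    // Ordered by start position; overlapping entries are allowed.
    std::multimap<int, toy_tombstone> _pending;
public:
    void apply(toy_tombstone rt) {
        _pending.emplace(rt.start, rt);
    }
    // Removes and returns all tombstones which start before upper_bound.
    std::vector<toy_tombstone> get_next(int upper_bound) {
        std::vector<toy_tombstone> out;
        auto end = _pending.lower_bound(upper_bound);
        for (auto it = _pending.begin(); it != end; ++it) {
            out.push_back(it->second);
        }
        _pending.erase(_pending.begin(), end);
        return out;
    }
};

int main() {
    toy_tombstone_stream ts;
    ts.apply({5, 9});
    ts.apply({2, 7});   // out of order and overlapping with the previous one
    // A row at position 6 is about to be emitted: drain everything before it.
    for (auto&& rt : ts.get_next(6)) {
        std::cout << "tombstone [" << rt.start << ", " << rt.end << ")\n";
    }
}
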
+ position_in_partition _lower_bound; + position_in_partition_view _upper_bound; + + bool _static_row_done = false; + bool _reading_underlying = false; + lw_shared_ptr _read_context; + partition_snapshot_row_cursor _next_row; + bool _next_row_in_range = false; + + future<> do_fill_buffer(); + future<> copy_from_cache_to_buffer(); + future<> process_static_row(); + void move_to_end(); + future<> move_to_next_range(); + future<> move_to_current_range(); + future<> move_to_next_entry(); + // Emits all delayed range tombstones with positions smaller than upper_bound. + void drain_tombstones(position_in_partition_view upper_bound); + // Emits all delayed range tombstones. + void drain_tombstones(); + void add_to_buffer(const partition_snapshot_row_cursor&); + void add_to_buffer(clustering_row&&); + void add_to_buffer(range_tombstone&&); + void add_to_buffer(mutation_fragment&&); + future<> read_from_underlying(); + future<> start_reading_from_underlying(); + bool after_current_range(position_in_partition_view position); + bool can_populate() const; + void maybe_update_continuity(); + void maybe_add_to_cache(const mutation_fragment& mf); + void maybe_add_to_cache(const clustering_row& cr); + void maybe_add_to_cache(const range_tombstone& rt); + void maybe_add_to_cache(const static_row& sr); + void maybe_set_static_row_continuous(); +public: + cache_streamed_mutation(schema_ptr s, + dht::decorated_key dk, + query::clustering_key_filter_ranges&& crr, + lw_shared_ptr ctx, + lw_shared_ptr snp, + row_cache& cache) + : streamed_mutation::impl(std::move(s), dk, snp->partition_tombstone()) + , _snp(std::move(snp)) + , _position_cmp(*_schema) + , _ck_ranges(std::move(crr)) + , _ck_ranges_curr(_ck_ranges.begin()) + , _ck_ranges_end(_ck_ranges.end()) + , _lsa_manager(cache) + , _tombstones(*_schema) + , _lower_bound(position_in_partition::before_all_clustered_rows()) + , _upper_bound(position_in_partition_view::before_all_clustered_rows()) + , _read_context(std::move(ctx)) + , _next_row(*_schema, cache._tracker.region(), *_snp) + { } + cache_streamed_mutation(const cache_streamed_mutation&) = delete; + cache_streamed_mutation(cache_streamed_mutation&&) = delete; + virtual future<> fill_buffer() override; + virtual ~cache_streamed_mutation() { + maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section()); + } +}; + +inline +future<> cache_streamed_mutation::process_static_row() { + if (_snp->version()->partition().static_row_continuous()) { + row sr = _snp->static_row(); + if (!sr.empty()) { + push_mutation_fragment(mutation_fragment(static_row(std::move(sr)))); + } + return make_ready_future<>(); + } else { + return _read_context->get_next_fragment().then([this] (mutation_fragment_opt&& sr) { + if (sr) { + assert(sr->is_static_row()); + maybe_add_to_cache(sr->as_static_row()); + push_mutation_fragment(std::move(*sr)); + } + maybe_set_static_row_continuous(); + }); + } +} + +inline +future<> cache_streamed_mutation::fill_buffer() { + if (!_static_row_done) { + _static_row_done = true; + return process_static_row().then([this] { + return _lsa_manager.run_in_read_section([this] { + return move_to_current_range(); + }).then([this] { + return fill_buffer(); + }); + }); + } + return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this] { + return do_fill_buffer(); + }); +} + +inline +future<> cache_streamed_mutation::do_fill_buffer() { + if (_reading_underlying) { + return read_from_underlying(); + } + return _lsa_manager.run_in_read_section([this] { + auto same_pos = 
_next_row.maybe_refresh(); + // FIXME: If continuity changed anywhere between _lower_bound and _next_row.position() + // we need to redo the lookup with _lower_bound. There is no eviction yet, so not yet a problem. + assert(same_pos); + while (!is_buffer_full() && !_end_of_stream && !_reading_underlying) { + future<> f = copy_from_cache_to_buffer(); + if (!f.available() || need_preempt()) { + return f; + } + } + return make_ready_future<>(); + }); +} + +inline +future<> cache_streamed_mutation::read_from_underlying() { + return do_until([this] { return !_reading_underlying || is_buffer_full(); }, [this] { + return _read_context->get_next_fragment().then([this] (auto&& mfopt) { + if (!mfopt) { + _reading_underlying = false; + return _lsa_manager.run_in_update_section([this] { + auto same_pos = _next_row.maybe_refresh(); + assert(same_pos); // FIXME: handle eviction + if (_next_row_in_range) { + this->maybe_update_continuity(); + this->add_to_buffer(_next_row); + return this->move_to_next_entry(); + } else { + if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) { + this->maybe_update_continuity(); + } else { + // FIXME: Insert dummy entry at _upper_bound. + } + return this->move_to_next_range(); + } + }); + } else { + this->maybe_add_to_cache(*mfopt); + this->add_to_buffer(std::move(*mfopt)); + return make_ready_future<>(); + } + }); + }); +} + +inline +void cache_streamed_mutation::maybe_update_continuity() { + if (can_populate() && _next_row.is_in_latest_version()) { + if (_last_row_key) { + if (_next_row.previous_row_in_latest_version_has_key(*_last_row_key)) { + _next_row.set_continuous(true); + } + } else if (!_ck_ranges_curr->start()) { + _next_row.set_continuous(true); + } + } +} + +inline +void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) { + if (mf.is_range_tombstone()) { + maybe_add_to_cache(mf.as_range_tombstone()); + } else { + assert(mf.is_clustering_row()); + const clustering_row& cr = mf.as_clustering_row(); + maybe_add_to_cache(cr); + } +} + +inline +void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) { + if (!can_populate()) { + return; + } + _lsa_manager.run_in_update_section_with_allocator([this, &cr] { + mutation_partition& mp = _snp->version()->partition(); + rows_entry::compare less(*_schema); + + // FIXME: If _next_row is up to date, but latest version doesn't have iterator in + // current row (could be far away, so we'd do this often), then this will do + // the lookup in mp. This is not necessary, because _next_row has iterators for + // next rows in each version, even if they're not part of the current row. + // They're currently buried in the heap, but you could keep a vector of + // iterators per each version in addition to the heap. + auto new_entry = alloc_strategy_unique_ptr( + current_allocator().construct(cr.key(), cr.tomb(), cr.marker(), cr.cells())); + new_entry->set_continuous(false); + auto it = _next_row.has_up_to_date_row_from_latest_version() + ? _next_row.get_iterator_in_latest_version() : mp.clustered_rows().lower_bound(cr.key(), less); + auto insert_result = mp.clustered_rows().insert_check(it, *new_entry, less); + if (insert_result.second) { + new_entry.release(); + } + it = insert_result.first; + + rows_entry& e = *it; + if (_last_row_key) { + if (it == mp.clustered_rows().begin()) { + // FIXME: check whether entry for _last_row_key is in older versions and if so set + // continuity to true. 
+ } else { + auto prev_it = it; + --prev_it; + clustering_key_prefix::tri_compare tri_comp(*_schema); + if (tri_comp(*_last_row_key, prev_it->key()) == 0) { + e.set_continuous(true); + } + } + } else if (!_ck_ranges_curr->start()) { + e.set_continuous(true); + } else { + // FIXME: Insert dummy entry at _ck_ranges_curr->start() + } + }); +} + +inline +bool cache_streamed_mutation::after_current_range(position_in_partition_view p) { + return _position_cmp(p, _upper_bound) >= 0; +} + +inline +future<> cache_streamed_mutation::start_reading_from_underlying() { + _reading_underlying = true; + auto end = _next_row_in_range ? position_in_partition(_next_row.position()) + : position_in_partition(_upper_bound); + return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}); +} + +inline +future<> cache_streamed_mutation::copy_from_cache_to_buffer() { + position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key()); + for (auto&& rts : _snp->range_tombstones(*_schema, _lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) { + add_to_buffer(std::move(rts)); + if (is_buffer_full()) { + return make_ready_future<>(); + } + } + if (_next_row_in_range) { + add_to_buffer(_next_row); + return move_to_next_entry(); + } else { + return move_to_next_range(); + } +} + +inline +void cache_streamed_mutation::move_to_end() { + drain_tombstones(); + _end_of_stream = true; +} + +inline +future<> cache_streamed_mutation::move_to_next_range() { + ++_ck_ranges_curr; + if (_ck_ranges_curr == _ck_ranges_end) { + move_to_end(); + return make_ready_future<>(); + } else { + return move_to_current_range(); + } +} + +inline +future<> cache_streamed_mutation::move_to_current_range() { + _last_row_key = std::experimental::nullopt; + _lower_bound = position_in_partition::for_range_start(*_ck_ranges_curr); + _upper_bound = position_in_partition_view::for_range_end(*_ck_ranges_curr); + auto complete_until_next = _next_row.advance_to(_lower_bound) || _next_row.continuous(); + _next_row_in_range = !after_current_range(_next_row.position()); + if (!complete_until_next) { + return start_reading_from_underlying(); + } + return make_ready_future<>(); +} + +// _next_row must be inside the range. 
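The functions above implement a read path that serves rows from the cache across intervals known to be complete and falls back to the underlying source across gaps, populating the cache and its continuity flags as it goes. The sketch below is a stripped-down, single-pass model of that idea, assuming int clustering positions, a std::map cache, and no MVCC, eviction, tombstones, dummy entries or range restrictions (all names invented for illustration):

#include <iostream>
#include <map>
#include <optional>
#include <string>

// continuous == true means: between the previous cached entry (or the start
// of the partition, if this is the first entry) and this entry, the cache is
// not missing any row that exists in the underlying source.
struct entry {
    std::string value;
    bool continuous;
};

using cache_t = std::map<int, entry>;
using source_t = std::map<int, std::string>;   // the "sstables"

// Reads the whole "partition" in clustering order, going to the underlying
// source only for intervals the cache does not know to be complete.
void read_all(cache_t& cache, const source_t& source) {
    std::optional<int> last;    // last position emitted so far, if any
    for (auto it = cache.begin(); it != cache.end(); ++it) {
        if (!it->second.continuous) {
            // The cache does not cover (last, it->first): fill it from the source.
            auto s = last ? source.upper_bound(*last) : source.begin();
            for (; s != source.end() && s->first < it->first; ++s) {
                std::cout << s->first << " -> " << s->second << " (from underlying source)\n";
                cache.emplace(s->first, entry{s->second, true});
            }
            it->second.continuous = true;   // the interval before this entry is now complete
        }
        std::cout << it->first << " -> " << it->second.value << " (from cache)\n";
        last = it->first;
    }
}

int main() {
    source_t source{{1, "a"}, {2, "b"}, {3, "c"}, {5, "e"}};
    // Row 3 is cached, but the cache doesn't know whether rows exist before it.
    cache_t cache{{3, {"c", false}}, {5, {"e", true}}};
    read_all(cache, source);
}
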
+inline +future<> cache_streamed_mutation::move_to_next_entry() { + if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) { + return move_to_next_range(); + } else { + if (!_next_row.next()) { + move_to_end(); + return make_ready_future<>(); + } + _next_row_in_range = !after_current_range(_next_row.position()); + if (!_next_row.continuous()) { + return start_reading_from_underlying(); + } + return make_ready_future<>(); + } +} + +inline +void cache_streamed_mutation::drain_tombstones(position_in_partition_view pos) { + while (auto mfo = _tombstones.get_next(pos)) { + push_mutation_fragment(std::move(*mfo)); + } +} + +inline +void cache_streamed_mutation::drain_tombstones() { + while (auto mfo = _tombstones.get_next()) { + push_mutation_fragment(std::move(*mfo)); + } +} + +inline +void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) { + if (mf.is_clustering_row()) { + add_to_buffer(std::move(std::move(mf).as_clustering_row())); + } else { + assert(mf.is_range_tombstone()); + add_to_buffer(std::move(mf).as_range_tombstone()); + } +} + +inline +void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor& row) { + if (!row.dummy()) { + add_to_buffer(row.row()); + } +} + +inline +void cache_streamed_mutation::add_to_buffer(clustering_row&& row) { + drain_tombstones(row.position()); + _last_row_key = row.key(); + _lower_bound = position_in_partition::after_key(row.key()); + push_mutation_fragment(std::move(row)); +} + +inline +void cache_streamed_mutation::add_to_buffer(range_tombstone&& rt) { + // This guarantees that rt starts after any emitted clustering_row + if (!rt.trim_front(*_schema, _lower_bound)) { + return; + } + _lower_bound = position_in_partition(rt.position()); + _tombstones.apply(std::move(rt)); + drain_tombstones(_lower_bound); +} + +inline +void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) { + if (can_populate()) { + _lsa_manager.run_in_update_section_with_allocator([&] { + _snp->version()->partition().apply_row_tombstone(*_schema, rt); + }); + } +} + +inline +void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) { + if (can_populate()) { + _lsa_manager.run_in_update_section_with_allocator([&] { + _snp->version()->partition().static_row().apply(*_schema, column_kind::static_column, sr.cells()); + }); + } +} + +inline +void cache_streamed_mutation::maybe_set_static_row_continuous() { + if (can_populate()) { + _snp->version()->partition().set_static_row_continuous(true); + } +} + +inline +bool cache_streamed_mutation::can_populate() const { + return _snp->at_latest_version() && _read_context->cache().phase_of(_read_context->key()) == _read_context->phase(); +} + +} // namespace cache + +inline streamed_mutation make_cache_streamed_mutation(schema_ptr s, + dht::decorated_key dk, + query::clustering_key_filter_ranges crr, + row_cache& cache, + lw_shared_ptr ctx, + lw_shared_ptr snp) +{ + return make_streamed_mutation( + std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache); +} diff --git a/clustering_bounds_comparator.hh b/clustering_bounds_comparator.hh index f55d6a2e520b..61445e96f61d 100644 --- a/clustering_bounds_comparator.hh +++ b/clustering_bounds_comparator.hh @@ -54,8 +54,8 @@ static inline bound_kind flip_bound_kind(bound_kind bk) } class bound_view { - const static thread_local clustering_key empty_prefix; public: + const static thread_local clustering_key empty_prefix; const clustering_key_prefix& prefix; bound_kind kind; bound_view(const 
clustering_key_prefix& prefix, bound_kind kind) diff --git a/configure.py b/configure.py index 859ae4ce4007..d5edf1e93326 100755 --- a/configure.py +++ b/configure.py @@ -184,6 +184,8 @@ def endswith(self, end): 'tests/perf/perf_cql_parser', 'tests/perf/perf_simple_query', 'tests/perf/perf_fast_forward', + 'tests/cache_streamed_mutation_test', + 'tests/row_cache_stress_test', 'tests/memory_footprint', 'tests/perf/perf_sstable', 'tests/cql_query_test', @@ -625,6 +627,7 @@ def endswith(self, end): 'tests/message', 'tests/perf/perf_simple_query', 'tests/perf/perf_fast_forward', + 'tests/row_cache_stress_test', 'tests/memory_footprint', 'tests/gossip', 'tests/perf/perf_sstable', diff --git a/converting_mutation_partition_applier.hh b/converting_mutation_partition_applier.hh index d06228ccda36..58799887770d 100644 --- a/converting_mutation_partition_applier.hh +++ b/converting_mutation_partition_applier.hh @@ -22,6 +22,7 @@ #pragma once #include "mutation_partition_view.hh" +#include "mutation_partition.hh" #include "schema.hh" // Mutation partition visitor which applies visited data into @@ -37,12 +38,12 @@ private: static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) { return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type); } - void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) { + static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) { if (is_compatible(new_def, old_type, kind) && cell.timestamp() > new_def.dropped_at()) { dst.apply(new_def, atomic_cell_or_collection(cell)); } } - void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) { + static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) { if (!is_compatible(new_def, old_type, kind)) { return; } @@ -94,8 +95,8 @@ public: _p.apply_row_tombstone(_p_schema, rt); } - virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) override { - deletable_row& r = _p.clustered_row(_p_schema, key); + virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override { + deletable_row& r = _p.clustered_row(_p_schema, key, dummy, continuous); r.apply(rm); r.apply(deleted_at); _current_row = &r; @@ -116,4 +117,14 @@ public: accept_cell(_current_row->cells(), column_kind::regular_column, *def, col.type(), collection); } } + + // Appends the cell to dst upgrading it to the new schema. + // Cells must have monotonic names. 
+ static void append_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, const atomic_cell_or_collection& cell) { + if (new_def.is_atomic()) { + accept_cell(dst, kind, new_def, old_type, cell.as_atomic_cell()); + } else { + accept_cell(dst, kind, new_def, old_type, cell.as_collection_mutation()); + } + } }; diff --git a/cql3/statements/batch_statement.cc b/cql3/statements/batch_statement.cc index fac623ddb06a..248e8b7d07d3 100644 --- a/cql3/statements/batch_statement.cc +++ b/cql3/statements/batch_statement.cc @@ -233,7 +233,7 @@ void batch_statement::verify_batch_size(const std::vector& mutations) size += v.data.size(); } void accept_row_tombstone(const range_tombstone&) override {} - void accept_row(clustering_key_view, const row_tombstone&, const row_marker&) override {} + void accept_row(position_in_partition_view, const row_tombstone&, const row_marker&, is_dummy, is_continuous) override {} void accept_row_cell(column_id, atomic_cell_view v) override { size += v.value().size(); } diff --git a/database.cc b/database.cc index 98bda7f76e63..8d7b1011ea72 100644 --- a/database.cc +++ b/database.cc @@ -144,7 +144,7 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog* cl , _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memory_only_memtable_list()) , _compaction_strategy(make_compaction_strategy(_schema->compaction_strategy(), _schema->compaction_strategy_options())) , _sstables(make_lw_shared(_compaction_strategy.make_sstable_set(_schema))) - , _cache(_schema, sstables_as_mutation_source(), global_cache_tracker(), _config.max_cached_partition_size_in_bytes) + , _cache(_schema, sstables_as_snapshot_source(), global_cache_tracker()) , _commitlog(cl) , _compaction_manager(compaction_manager) , _flush_queue(std::make_unique()) @@ -183,7 +183,24 @@ column_family::sstables_as_mutation_source() { tracing::trace_state_ptr trace_state, streamed_mutation::forwarding fwd, mutation_reader::forwarding fwd_mr) { - return make_sstable_reader(std::move(s), r, slice, pc, std::move(trace_state), fwd, fwd_mr); + return make_sstable_reader(std::move(s), _sstables, r, slice, pc, std::move(trace_state), fwd, fwd_mr); + }); +} + +snapshot_source +column_family::sstables_as_snapshot_source() { + return snapshot_source([this] () { + // FIXME: Will keep sstables on disk until next memtable flush. Make compaction force cache refresh. 
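sstables_as_snapshot_source() hands the cache a source whose every invocation captures the sstable set current at that moment, so readers created from one snapshot are unaffected by sstables added later. Below is a self-contained illustration of that pattern, with std::function and strings standing in for mutation_source and sstables (names invented; a sketch of the idea, not the real snapshot_source API):

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

using sstable_set = std::vector<std::string>;
using reader = std::function<void()>;

struct table {
    std::shared_ptr<const sstable_set> sstables = std::make_shared<sstable_set>();

    // Counterpart in spirit of sstables_as_snapshot_source(): every call pins
    // the sstable set as it is right now, so readers created from that
    // snapshot do not see sstables added afterwards.
    std::function<reader()> snapshot_source() {
        return [this] {
            auto snap = sstables;                  // shared, immutable snapshot of the set
            return reader([snap] {
                for (auto& sst : *snap) {
                    std::cout << "reading " << sst << "\n";
                }
            });
        };
    }

    void add_sstable(std::string name) {
        auto next = std::make_shared<sstable_set>(*sstables);
        next->push_back(std::move(name));
        sstables = std::move(next);                // publish a new set; old snapshots stay valid
    }
};

int main() {
    table t;
    t.add_sstable("sst-1");
    auto source = t.snapshot_source();
    auto pinned = source();                        // captures {sst-1}
    t.add_sstable("sst-2");                        // invisible to the pinned snapshot
    pinned();                                      // reads sst-1 only
    source()();                                    // a fresh snapshot also sees sst-2
}
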
+ auto sst_set = _sstables; + return mutation_source([this, sst_set = std::move(sst_set)] (schema_ptr s, + const dht::partition_range& r, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr trace_state, + streamed_mutation::forwarding fwd, + mutation_reader::forwarding fwd_mr) { + return make_sstable_reader(std::move(s), sst_set, r, slice, pc, std::move(trace_state), fwd, fwd_mr); + }); }); } @@ -529,6 +546,7 @@ class single_key_sstable_reader final : public mutation_reader::impl { mutation_reader column_family::make_sstable_reader(schema_ptr s, + lw_shared_ptr sstables, const dht::partition_range& pr, const query::partition_slice& slice, const io_priority_class& pc, @@ -555,11 +573,11 @@ column_family::make_sstable_reader(schema_ptr s, if (dht::shard_of(pos.token()) != engine().cpu_id()) { return make_empty_reader(); // range doesn't belong to this shard } - return restrict_reader(make_mutation_reader(const_cast(this), std::move(s), _sstables, + return restrict_reader(make_mutation_reader(const_cast(this), std::move(s), std::move(sstables), _stats.estimated_sstable_per_read, pr, slice, pc, std::move(trace_state), fwd)); } else { // range_sstable_reader is not movable so we need to wrap it - return restrict_reader(make_mutation_reader(std::move(s), _sstables, pr, slice, pc, std::move(trace_state), fwd, fwd_mr)); + return restrict_reader(make_mutation_reader(std::move(s), std::move(sstables), pr, slice, pc, std::move(trace_state), fwd, fwd_mr)); } } @@ -643,7 +661,7 @@ column_family::make_reader(schema_ptr s, if (_config.enable_cache) { readers.emplace_back(_cache.make_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr)); } else { - readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr)); + readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr)); } return make_combined_reader(std::move(readers)); @@ -662,7 +680,7 @@ column_family::make_streaming_reader(schema_ptr s, readers.emplace_back(mt->make_reader(s, range, slice, pc, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no)); } - readers.emplace_back(make_sstable_reader(s, range, slice, pc, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no)); + readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no)); return make_combined_reader(std::move(readers)); } @@ -680,7 +698,7 @@ column_family::make_streaming_reader(schema_ptr s, for (auto&& mt : *_memtables) { readers.emplace_back(mt->make_reader(s, range, slice, pc, trace_state, fwd, fwd_mr)); } - readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr)); + readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr)); return make_combined_reader(std::move(readers)); }); @@ -866,11 +884,6 @@ column_family::seal_active_streaming_memtable_immediate() { // If we ever need to, we'll keep them separate statistics, but we don't want to polute the // main stats about memtables with streaming memtables. // - // Second, we will not bother touching the cache after this flush. The current streaming code - // will invalidate the ranges it touches, so we won't do it twice. Even when that changes, the - // cache management code in here will have to differ from the main memtable's one. 
Please see - // the comment at flush_streaming_mutations() for details. - // // Lastly, we don't have any commitlog RP to update, and we don't need to deal manipulate the // memtable list, since this memtable was not available for reading up until this point. return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] { @@ -878,7 +891,12 @@ column_family::seal_active_streaming_memtable_immediate() { }).then([this, old, newtab] () { add_sstable(newtab, {engine().cpu_id()}); trigger_compaction(); - return old->clear_gently(); + // Cache synchronization must be started atomically with add_sstable() + if (_config.enable_cache) { + return _cache.update_invalidating(*old); + } else { + return old->clear_gently(); + } }).handle_exception([old] (auto ep) { dblog.error("failed to write streamed sstable: {}", ep); return make_exception_future<>(ep); @@ -1791,7 +1809,7 @@ future<> distributed_loader::load_new_sstables(distributed& db, sstrin cf.trigger_compaction(); // Drop entire cache for this column family because it may be populated // with stale data. - return cf.get_row_cache().clear(); + return cf.get_row_cache().invalidate(); }); }).then([&db, ks, cf] () mutable { return smp::submit_to(0, [&db, ks = std::move(ks), cf = std::move(cf)] () mutable { @@ -1824,6 +1842,7 @@ future distributed_loader::probe_file(distributed(); }); @@ -2566,7 +2585,6 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config; cfg.cf_stats = _config.cf_stats; cfg.enable_incremental_backups = _config.enable_incremental_backups; - cfg.max_cached_partition_size_in_bytes = db_config.max_cached_partition_size_in_kb() * 1024; return cfg; } @@ -3797,28 +3815,26 @@ future<> column_family::flush_streaming_mutations(utils::UUID plan_id, dht::part // be to change seal_active_streaming_memtable_delayed to take a range parameter. However, we // need this code to go away as soon as we can (see FIXME above). So the double gate is a better // temporary counter measure. - return with_gate(_streaming_flush_gate, [this, plan_id, ranges = std::move(ranges)] { - return flush_streaming_big_mutations(plan_id).then([this] { - return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed); - }).finally([this] { - return _streaming_flush_phaser.advance_and_await(); - }).finally([this, ranges = std::move(ranges)] { - if (!_config.enable_cache) { - return make_ready_future<>(); - } - return do_with(std::move(ranges), [this] (auto& ranges) { - return parallel_for_each(ranges, [this](auto&& range) { - return _cache.invalidate(range); - }); + return with_gate(_streaming_flush_gate, [this, plan_id, ranges = std::move(ranges)] () mutable { + return flush_streaming_big_mutations(plan_id).then([this, ranges = std::move(ranges)] (auto sstables) mutable { + return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed).then([this] { + return _streaming_flush_phaser.advance_and_await(); + }).then([this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable { + for (auto&& sst : sstables) { + // seal_active_streaming_memtable_big() ensures sst is unshared. 
+ this->add_sstable(sst, {engine().cpu_id()}); + } + this->trigger_compaction(); + return _cache.invalidate(std::move(ranges)); }); }); }); } -future<> column_family::flush_streaming_big_mutations(utils::UUID plan_id) { +future> column_family::flush_streaming_big_mutations(utils::UUID plan_id) { auto it = _streaming_memtables_big.find(plan_id); if (it == _streaming_memtables_big.end()) { - return make_ready_future<>(); + return make_ready_future>(std::vector()); } auto entry = it->second; _streaming_memtables_big.erase(it); @@ -3830,11 +3846,7 @@ future<> column_family::flush_streaming_big_mutations(utils::UUID plan_id) { return sst->open_data(); }); }).then([this, entry] { - for (auto&& sst : entry->sstables) { - // seal_active_streaming_memtable_big() ensures sst is unshared. - add_sstable(sst, {engine().cpu_id()}); - } - trigger_compaction(); + return std::move(entry->sstables); }); }); } @@ -3862,7 +3874,7 @@ future<> column_family::clear() { _streaming_memtables->clear(); _streaming_memtables->add_memtable(); _streaming_memtables_big.clear(); - return _cache.clear(); + return _cache.invalidate(); } // NOTE: does not need to be futurized, but might eventually, depending on @@ -3888,7 +3900,7 @@ future column_family::discard_sstables(db_clock::time_point _sstables = std::move(pruned); dblog.debug("cleaning out row cache"); - return _cache.clear().then([rp, remove = std::move(remove)] () mutable { + return _cache.invalidate().then([rp, remove = std::move(remove)] () mutable { return parallel_for_each(remove, [](sstables::shared_sstable s) { return sstables::delete_atomically({s}); }).then([rp] { diff --git a/database.hh b/database.hh index 4692733c59e9..7e20335c7487 100644 --- a/database.hh +++ b/database.hh @@ -429,7 +429,6 @@ public: restricted_mutation_reader_config read_concurrency_config; restricted_mutation_reader_config streaming_read_concurrency_config; ::cf_stats* cf_stats = nullptr; - uint64_t max_cached_partition_size_in_bytes; }; struct no_commitlog {}; struct stats { @@ -505,7 +504,7 @@ private: }; std::unordered_map> _streaming_memtables_big; - future<> flush_streaming_big_mutations(utils::UUID plan_id); + future> flush_streaming_big_mutations(utils::UUID plan_id); void apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m); future<> seal_active_streaming_memtable_big(streaming_memtable_big& smb); @@ -575,7 +574,9 @@ private: private: void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, std::vector&& shards_for_the_sstable); // Adds new sstable to the set of sstables - // Doesn't update the cache. + // Doesn't update the cache. The cache must be synchronized in order for reads to see + // the writes contained in this sstable. + // Cache must be synchronized atomically with this, otherwise write atomicity may not be respected. // Doesn't trigger compaction. void add_sstable(lw_shared_ptr sstable, std::vector&& shards_for_the_sstable); // returns an empty pointer if sstable doesn't belong to current shard. @@ -619,11 +620,12 @@ private: void remove_ancestors_needed_rewrite(std::unordered_set ancestors); private: mutation_source_opt _virtual_reader; - // Creates a mutation reader which covers sstables. + // Creates a mutation reader which covers given sstables. // Caller needs to ensure that column_family remains live (FIXME: relax this). // The 'range' parameter must be live as long as the reader is used. // Mutations returned by the reader will all have given schema. 
mutation_reader make_sstable_reader(schema_ptr schema, + lw_shared_ptr sstables, const dht::partition_range& range, const query::partition_slice& slice, const io_priority_class& pc, @@ -632,6 +634,7 @@ private: mutation_reader::forwarding fwd_mr) const; mutation_source sstables_as_mutation_source(); + snapshot_source sstables_as_snapshot_source(); partition_presence_checker make_partition_presence_checker(lw_shared_ptr); std::chrono::steady_clock::time_point _sstable_writes_disabled_at; void do_trigger_compaction(); diff --git a/db/config.hh b/db/config.hh index d8a35bbb23b2..5bc2d9693e5a 100644 --- a/db/config.hh +++ b/db/config.hh @@ -373,9 +373,6 @@ public: val(reduce_cache_sizes_at, double, .85, Invalid, \ "When Java heap usage (after a full concurrent mark sweep (CMS) garbage collection) exceeds this percentage, Cassandra reduces the cache capacity to the fraction of the current size as specified by reduce_cache_capacity_to. To disable, set the value to 1.0." \ ) \ - val(max_cached_partition_size_in_kb, uint64_t, 10240uLL, Used, \ - "Partitions with size greater than this value won't be cached." \ - ) \ /* Disks settings */ \ val(stream_throughput_outbound_megabits_per_sec, uint32_t, 400, Unused, \ "Throttles all outbound streaming file transfers on a node to the specified throughput. Cassandra does mostly sequential I/O when streaming data during bootstrap or repair, which can lead to saturating the network connection and degrading client (RPC) performance." \ diff --git a/dht/i_partitioner.cc b/dht/i_partitioner.cc index 0e035e06e9ef..0a1d183edfe6 100644 --- a/dht/i_partitioner.cc +++ b/dht/i_partitioner.cc @@ -442,7 +442,7 @@ bool ring_position::less_compare(const schema& s, const ring_position& other) co } int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const { - auto token_cmp = tri_compare(lh._token, rh._token); + auto token_cmp = tri_compare(*lh._token, *rh._token); if (token_cmp) { return token_cmp; } @@ -464,7 +464,7 @@ int ring_position_comparator::operator()(ring_position_view lh, ring_position_vi int ring_position_comparator::operator()(ring_position_view lh, sstables::key_view rh) const { auto rh_token = global_partitioner().get_token(rh); - auto token_cmp = tri_compare(lh._token, rh_token); + auto token_cmp = tri_compare(*lh._token, rh_token); if (token_cmp) { return token_cmp; } diff --git a/dht/i_partitioner.hh b/dht/i_partitioner.hh index a568b90f6ae3..5474cb6d8e56 100644 --- a/dht/i_partitioner.hh +++ b/dht/i_partitioner.hh @@ -374,6 +374,14 @@ private: token_bound _token_bound; // valid when !_key std::experimental::optional _key; public: + static ring_position min() { + return { minimum_token(), token_bound::start }; + } + + static ring_position max() { + return { maximum_token(), token_bound::end }; + } + static ring_position starting_at(dht::token token) { return { std::move(token), token_bound::start }; } @@ -463,7 +471,7 @@ class ring_position_view { // For example {_token=t1, _key=nullptr, _weight=1} is ordered after {_token=t1, _key=k1, _weight=0}, // but {_token=t1, _key=nullptr, _weight=-1} is ordered before it. 
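The weight rules described in this comment can be made concrete with a small stand-in comparator over int tokens and string keys. This is a hypothetical toy, not the real ring_position_comparator, but it mirrors the same token / key / weight ordering:

#include <cassert>
#include <optional>
#include <string>

struct toy_ring_position {
    int token;
    std::optional<std::string> key;
    int weight;   // -1, 0 or +1
};

int tri_compare(const toy_ring_position& a, const toy_ring_position& b) {
    if (a.token != b.token) {
        return a.token < b.token ? -1 : 1;
    }
    if (a.key && b.key) {
        int c = a.key->compare(*b.key);
        if (c) {
            return c < 0 ? -1 : 1;
        }
        return a.weight - b.weight;   // "at key" (0) sorts before "after key" (+1)
    }
    if (!a.key && !b.key) {
        return a.weight - b.weight;
    }
    // Exactly one side has a key: the key-less side is ordered entirely by its
    // weight relative to every key with the same token.
    if (!a.key) {
        return a.weight > 0 ? 1 : -1;
    }
    return b.weight > 0 ? -1 : 1;
}

int main() {
    toy_ring_position k1{1, std::string("k1"), 0};      // {_token=t1, _key=k1, _weight=0}
    toy_ring_position after_all{1, std::nullopt, 1};    // {_token=t1, _key=nullptr, _weight=1}
    toy_ring_position before_all{1, std::nullopt, -1};  // {_token=t1, _key=nullptr, _weight=-1}
    assert(tri_compare(after_all, k1) > 0);              // ordered after
    assert(tri_compare(before_all, k1) < 0);             // ordered before
    assert(tri_compare({1, std::string("k1"), 1}, k1) > 0);   // "after k1" > "at k1"
    return 0;
}
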
// - const dht::token& _token; + const dht::token* _token; // always not nullptr const partition_key* _key; // Can be nullptr int8_t _weight; public: @@ -479,11 +487,11 @@ public: } bool is_min() const { - return _token.is_minimum(); + return _token->is_minimum(); } bool is_max() const { - return _token.is_maximum(); + return _token->is_maximum(); } static ring_position_view for_range_start(const partition_range& r) { @@ -503,11 +511,14 @@ public: } ring_position_view(const dht::ring_position& pos, after_key after = after_key::no) - : _token(pos.token()) + : _token(&pos.token()) , _key(pos.has_key() ? &*pos.key() : nullptr) , _weight(pos.has_key() ? bool(after) : pos.relation_to_keys()) { } + ring_position_view(const ring_position_view& pos) = default; + ring_position_view& operator=(const ring_position_view& other) = default; + ring_position_view(after_key_tag, const ring_position_view& v) : _token(v._token) , _key(v._key) @@ -515,13 +526,13 @@ public: { } ring_position_view(const dht::decorated_key& key, after_key after_key = after_key::no) - : _token(key.token()) + : _token(&key.token()) , _key(&key.key()) , _weight(bool(after_key)) { } ring_position_view(const dht::token& token, partition_key* key, int8_t weight) - : _token(token) + : _token(&token) , _key(key) , _weight(weight) { } diff --git a/hashing_partition_visitor.hh b/hashing_partition_visitor.hh index 53e284dee8b8..83c5bb785a72 100644 --- a/hashing_partition_visitor.hh +++ b/hashing_partition_visitor.hh @@ -63,8 +63,11 @@ public: rt.feed_hash(_h, _s); } - virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) { - key.feed_hash(_h, _s); + virtual void accept_row(position_in_partition_view pos, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override { + if (dummy) { + return; + } + pos.key().feed_hash(_h, _s); feed_hash(_h, deleted_at); feed_hash(_h, rm); } diff --git a/intrusive_set_external_comparator.hh b/intrusive_set_external_comparator.hh index 6e2c51311f90..b1a0f2fb2889 100644 --- a/intrusive_set_external_comparator.hh +++ b/intrusive_set_external_comparator.hh @@ -208,6 +208,10 @@ public: } template iterator insert(const_iterator hint, Elem& value, ElemCompare cmp) { + return insert_check(hint, value, std::move(cmp)).first; + } + template + std::pair insert_check(const_iterator hint, Elem& value, ElemCompare cmp) { algo::insert_commit_data commit_data; std::pair ret = algo::insert_unique_check(_header.this_ptr(), @@ -215,8 +219,8 @@ public: key_of_value()(value), key_node_comp(cmp), commit_data); - return ret.second ? insert_unique_commit(value, commit_data) - : iterator(ret.first, priv_value_traits_ptr()); + return ret.second ? 
std::make_pair(insert_unique_commit(value, commit_data), true) + : std::make_pair(iterator(ret.first, priv_value_traits_ptr()), false); } }; diff --git a/mutation.cc b/mutation.cc index 6dbb47f51016..e801c2a2c5e2 100644 --- a/mutation.cc +++ b/mutation.cc @@ -206,37 +206,20 @@ mutation& mutation::operator+=(mutation&& other) { return *this; } -enum class limit_mutation_size { yes, no }; +mutation mutation::sliced(const query::clustering_row_ranges& ranges) const { + auto m = mutation(schema(), decorated_key(), mutation_partition(partition(), *schema(), ranges)); + m.partition().row_tombstones().trim(*schema(), ranges); + return m; +} -template class mutation_rebuilder { mutation _m; streamed_mutation& _sm; size_t _remaining_limit; - template bool check_remaining_limit(const T& e) { - if (with_limit == limit_mutation_size::no) { - return true; - } - size_t size = e.memory_usage(); - if (_remaining_limit <= size) { - _remaining_limit = 0; - } else { - _remaining_limit -= size; - } - return _remaining_limit > 0; - } public: mutation_rebuilder(streamed_mutation& sm) : _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(0) { - static_assert(with_limit == limit_mutation_size::no, - "This constructor should be used only for mutation_rebuildeer with no limit"); - } - mutation_rebuilder(streamed_mutation& sm, size_t limit) - : _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(limit) { - static_assert(with_limit == limit_mutation_size::yes, - "This constructor should be used only for mutation_rebuildeer with limit"); - check_remaining_limit(_m.key()); } stop_iteration consume(tombstone t) { @@ -245,25 +228,16 @@ class mutation_rebuilder { } stop_iteration consume(range_tombstone&& rt) { - if (!check_remaining_limit(rt)) { - return stop_iteration::yes; - } _m.partition().apply_row_tombstone(*_m.schema(), std::move(rt)); return stop_iteration::no; } stop_iteration consume(static_row&& sr) { - if (!check_remaining_limit(sr)) { - return stop_iteration::yes; - } _m.partition().static_row().apply(*_m.schema(), column_kind::static_column, std::move(sr.cells())); return stop_iteration::no; } stop_iteration consume(clustering_row&& cr) { - if (!check_remaining_limit(cr)) { - return stop_iteration::yes; - } auto& dr = _m.partition().clustered_row(*_m.schema(), std::move(cr.key())); dr.apply(cr.tomb()); dr.apply(cr.marker()); @@ -272,29 +246,21 @@ class mutation_rebuilder { } mutation_opt consume_end_of_stream() { - return with_limit == limit_mutation_size::yes && _remaining_limit == 0 ? 
mutation_opt() - : mutation_opt(std::move(_m)); + return mutation_opt(std::move(_m)); } }; -future -mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit) { - return do_with(std::move(sm), [limit] (auto& sm) { - return consume(sm, mutation_rebuilder(sm, limit)); - }); -} - future mutation_from_streamed_mutation(streamed_mutation_opt sm) { if (!sm) { return make_ready_future(); } return do_with(std::move(*sm), [] (auto& sm) { - return consume(sm, mutation_rebuilder(sm)); + return consume(sm, mutation_rebuilder(sm)); }); } future mutation_from_streamed_mutation(streamed_mutation& sm) { - return consume(sm, mutation_rebuilder(sm)).then([] (mutation_opt&& mo) { + return consume(sm, mutation_rebuilder(sm)).then([] (mutation_opt&& mo) { return std::move(*mo); }); } diff --git a/mutation.hh b/mutation.hh index 10c29c02db93..f3f634553239 100644 --- a/mutation.hh +++ b/mutation.hh @@ -133,6 +133,10 @@ public: mutation operator+(const mutation& other) const; mutation& operator+=(const mutation& other); mutation& operator+=(mutation&& other); + + // Returns a subset of this mutation holding only information relevant for given clustering ranges. + // Range tombstones will be trimmed to the boundaries of the clustering ranges. + mutation sliced(const query::clustering_row_ranges&) const; private: friend std::ostream& operator<<(std::ostream& os, const mutation& m); }; @@ -185,4 +189,3 @@ boost::iterator_range::const_iterator> slice( future mutation_from_streamed_mutation(streamed_mutation_opt sm); future mutation_from_streamed_mutation(streamed_mutation& sm); -future mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit); diff --git a/mutation_partition.cc b/mutation_partition.cc index 1d1665ddff2b..2a1a54f5b5c8 100644 --- a/mutation_partition.cc +++ b/mutation_partition.cc @@ -175,7 +175,7 @@ void revert_intrusive_set_range(const schema& s, mutation_partition::rows_type& assert(i != dst.end()); rows_entry& dst_e = *i; - if (e.empty()) { + if (e.erased()) { dst.erase(i); start = src.erase_and_dispose(start, deleter); start = src.insert_before(start, dst_e); @@ -203,18 +203,10 @@ auto apply_reversibly_intrusive_set(const schema& s, mutation_partition::rows_ty while (src_i != src.end()) { rows_entry& src_e = *src_i; - // neutral entries will be given special meaning for the purpose of revert, so - // get rid of empty rows from the input as if they were not there. This doesn't change - // the value of src. - if (src_e.empty()) { - src_i = src.erase_and_dispose(src_i, current_deleter()); - continue; - } - auto i = dst.lower_bound(src_e, cmp); if (i == dst.end() || cmp(src_e, *i)) { - // Construct neutral entry which will represent missing dst entry for revert. - rows_entry* empty_e = current_allocator().construct(src_e.key()); + // Construct erased entry which will represent missing dst entry for revert. 
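The erased entries introduced here act as placeholders that tell revert which destination entries were newly created by the apply and must therefore be removed rather than un-merged. The toy below illustrates only that bookkeeping role, using std::map and string concatenation in place of the intrusive tree and real row merging; the real revert re-merges in place instead of restoring copied values, and all names here are invented:

#include <cassert>
#include <map>
#include <string>
#include <utility>

struct toy_entry {
    std::string value;
    bool erased = false;   // placeholder kept in src for a key that dst did not have
};

using rows = std::map<int, toy_entry>;

// Merges src into dst (values "merge" by concatenation); src is left holding
// exactly what revert() needs to undo the operation.
void apply_reversibly(rows& dst, rows& src) {
    for (auto& [key, e] : src) {
        auto [it, inserted] = dst.try_emplace(key, e);
        if (inserted) {
            e.erased = true;                              // new in dst: revert must erase it
        } else {
            std::string merged = it->second.value + e.value;
            std::swap(it->second.value, e.value);         // src keeps dst's pre-apply value
            it->second.value = std::move(merged);
        }
    }
}

// Undoes a previous apply_reversibly(dst, src).
void revert(rows& dst, rows& src) {
    for (auto& [key, e] : src) {
        if (e.erased) {
            dst.erase(key);                               // the key came only from src
        } else {
            dst[key].value = e.value;                     // restore dst's pre-apply value
        }
    }
}

int main() {
    rows dst{{1, {"a"}}, {3, {"c"}}};
    rows src{{1, {"A"}}, {2, {"B"}}};
    apply_reversibly(dst, src);
    assert(dst.at(1).value == "aA" && dst.at(2).value == "B");
    revert(dst, src);
    assert(dst.at(1).value == "a" && dst.count(2) == 0 && dst.at(3).value == "c");
}
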
+ rows_entry* empty_e = current_allocator().construct(rows_entry::erased_tag{}, src_e); [&] () noexcept { src_i = src.erase(src_i); src_i = src.insert_before(src_i, *empty_e); @@ -235,6 +227,7 @@ auto apply_reversibly_intrusive_set(const schema& s, mutation_partition::rows_ty mutation_partition::mutation_partition(const mutation_partition& x) : _tombstone(x._tombstone) , _static_row(x._static_row) + , _static_row_continuous(x._static_row_continuous) , _rows() , _row_tombstones(x._row_tombstones) { auto cloner = [] (const auto& x) { @@ -247,6 +240,7 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema query::clustering_key_filter_ranges ck_ranges) : _tombstone(x._tombstone) , _static_row(x._static_row) + , _static_row_continuous(x._static_row_continuous) , _rows() , _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only()) { try { @@ -271,6 +265,7 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch query::clustering_key_filter_ranges ck_ranges) : _tombstone(x._tombstone) , _static_row(std::move(x._static_row)) + , _static_row_continuous(x._static_row_continuous) , _rows(std::move(x._rows)) , _row_tombstones(std::move(x._row_tombstones)) { @@ -319,6 +314,13 @@ mutation_partition::operator=(mutation_partition&& x) noexcept { return *this; } +void mutation_partition::ensure_last_dummy(const schema& s) { + if (_rows.empty() || !_rows.rbegin()->position().is_after_all_clustered_rows(s)) { + _rows.insert_before(_rows.end(), + *current_allocator().construct(s, position_in_partition_view::after_all_clustered_rows(), is_dummy::yes, is_continuous::yes)); + } +} + void mutation_partition::apply(const schema& s, const mutation_partition& p, const schema& p_schema) { if (s.version() != p_schema.version()) { @@ -507,7 +509,7 @@ mutation_partition::clustered_row(const schema& s, const clustering_key& key) { } deletable_row& -mutation_partition::clustered_row(const schema& s, const clustering_key_view& key) { +mutation_partition::clustered_row(const schema& s, clustering_key_view key) { auto i = _rows.find(key, rows_entry::compare(s)); if (i == _rows.end()) { auto e = current_allocator().construct(key); @@ -517,6 +519,17 @@ mutation_partition::clustered_row(const schema& s, const clustering_key_view& ke return i->row(); } +deletable_row& +mutation_partition::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) { + auto i = _rows.find(pos, rows_entry::compare(s)); + if (i == _rows.end()) { + auto e = current_allocator().construct(s, pos, dummy, continuous); + _rows.insert(i, *e, rows_entry::compare(s)); + return e->row(); + } + return i->row(); +} + mutation_partition::rows_type::const_iterator mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const { auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema)); @@ -759,6 +772,9 @@ mutation_partition::query_compacted(query::result::partition_writer& pw, const s auto is_reversed = slice.options.contains(query::partition_slice::option::reversed); auto send_ck = slice.options.contains(query::partition_slice::option::send_clustering_key); for_each_row(s, query::clustering_range::make_open_ended_both_sides(), is_reversed, [&] (const rows_entry& e) { + if (e.dummy()) { + return stop_iteration::no; + } auto& row = e.row(); auto row_tombstone = tombstone_for_row(s, e); @@ -843,13 +859,13 @@ operator<<(std::ostream& os, const deletable_row& dr) { 
std::ostream& operator<<(std::ostream& os, const rows_entry& re) { - return fprint(os, "{rows_entry: %s %s}", re._key, re._row); + return fprint(os, "{rows_entry: cont=%d dummy=%d %s %s}", re.continuous(), re.dummy(), re._key, re._row); } std::ostream& operator<<(std::ostream& os, const mutation_partition& mp) { - return fprint(os, "{mutation_partition: %s (%s) static %s clustered %s}", - mp._tombstone, ::join(", ", mp._row_tombstones), mp._static_row, + return fprint(os, "{mutation_partition: %s (%s) static cont=%d %s clustered %s}", + mp._tombstone, ::join(", ", mp._row_tombstones), mp._static_row_continuous, mp._static_row, ::join(", ", mp._rows)); } @@ -905,14 +921,30 @@ void deletable_row::revert(const schema& s, deletable_row& src) { _marker.revert(src._marker); } +void deletable_row::apply(const schema& s, deletable_row&& src) { + _cells.apply(s, column_kind::regular_column, std::move(src._cells)); + _marker.apply(src._marker); + _deleted_at.apply(src._deleted_at, _marker); +} + bool rows_entry::equal(const schema& s, const rows_entry& other) const { return equal(s, other, s); } +position_in_partition_view rows_entry::position() const { + if (_flags._last) { + return position_in_partition_view::after_all_clustered_rows(); + } else { + return position_in_partition_view( + position_in_partition_view::clustering_row_tag_t(), _key); + } +} + bool rows_entry::equal(const schema& s, const rows_entry& other, const schema& other_schema) const { - return key().equal(s, other.key()) // Only representation-compatible changes are allowed + position_in_partition::equal_compare eq(s); + return eq(position(), other.position()) && row().equal(column_kind::regular_column, s, other.row(), other_schema); } @@ -925,7 +957,7 @@ bool mutation_partition::equal(const schema& this_schema, const mutation_partiti return false; } - if (!std::equal(_rows.begin(), _rows.end(), p._rows.begin(), p._rows.end(), + if (!boost::equal(non_dummy_rows(), p.non_dummy_rows(), [&] (const rows_entry& e1, const rows_entry& e2) { return e1.equal(this_schema, e2, p_schema); } @@ -943,6 +975,16 @@ bool mutation_partition::equal(const schema& this_schema, const mutation_partiti return _static_row.equal(column_kind::static_column, this_schema, p._static_row, p_schema); } +bool mutation_partition::equal_continuity(const schema& s, const mutation_partition& p) const { + return _static_row_continuous == p._static_row_continuous + && boost::equal(_rows, p._rows, [&] (const rows_entry& e1, const rows_entry& e2) { + position_in_partition::equal_compare eq(s); + return eq(e1.position(), e2.position()) + && e1.continuous() == e2.continuous() + && e1.dummy() == e2.dummy(); + }); +} + void apply_reversibly(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) { // Must be run via with_linearized_managed_bytes() context, but assume it is @@ -1216,8 +1258,10 @@ uint32_t mutation_partition::do_compact(const schema& s, uint32_t row_count = 0; auto row_callback = [&] (rows_entry& e) { + if (e.dummy()) { + return stop_iteration::no; + } deletable_row& row = e.row(); - row_tombstone tomb = tombstone_for_row(s, e); bool is_live = row.cells().compact_and_expire(s, column_kind::regular_column, tomb, query_time, can_gc, gc_before); @@ -1315,7 +1359,7 @@ size_t mutation_partition::live_row_count(const schema& s, gc_clock::time_point query_time) const { size_t count = 0; - for (const rows_entry& e : _rows) { + for (const rows_entry& e : non_dummy_rows()) { tombstone base_tombstone = range_tombstone_for_row(s, 
e.key()); if (e.row().is_live(s, base_tombstone, query_time)) { ++count; @@ -1333,6 +1377,7 @@ rows_entry::rows_entry(rows_entry&& o) noexcept : _link(std::move(o._link)) , _key(std::move(o._key)) , _row(std::move(o._row)) + , _flags(std::move(o._flags)) { } row::row(const row& o) @@ -1641,7 +1686,10 @@ mutation_partition mutation_partition::difference(schema_ptr s, const mutation_p auto it_r = other._rows.begin(); rows_entry::compare cmp_r(*s); for (auto&& r : _rows) { - while (it_r != other._rows.end() && cmp_r(*it_r, r)) { + if (r.dummy()) { + continue; + } + while (it_r != other._rows.end() && (it_r->dummy() || cmp_r(*it_r, r))) { ++it_r; } if (it_r == other._rows.end() || !it_r->key().equal(*s, r.key())) { @@ -1671,7 +1719,7 @@ void mutation_partition::accept(const schema& s, mutation_partition_visitor& v) } for (const rows_entry& e : _rows) { const deletable_row& dr = e.row(); - v.accept_row(e.key(), dr.deleted_at(), dr.marker()); + v.accept_row(e.position(), dr.deleted_at(), dr.marker(), e.dummy(), e.continuous()); dr.cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) { const column_definition& def = s.regular_column_at(id); if (def.is_atomic()) { @@ -2069,6 +2117,41 @@ class counter_write_query_result_builder { } }; +mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const schema& s, tombstone t) + : _tombstone(t) + , _static_row_continuous(false) + , _rows() + , _row_tombstones(s) +{ + _rows.insert_before(_rows.end(), + *current_allocator().construct(s, position_in_partition_view::after_all_clustered_rows(), is_dummy::yes, is_continuous::no)); +} + +bool mutation_partition::is_fully_continuous() const { + if (!_static_row_continuous) { + return false; + } + for (auto&& row : _rows) { + if (!row.continuous()) { + return false; + } + } + return true; +} + +void mutation_partition::make_fully_continuous() { + _static_row_continuous = true; + auto i = _rows.begin(); + while (i != _rows.end()) { + if (i->dummy()) { + i = _rows.erase_and_dispose(i, alloc_strategy_deleter()); + } else { + i->set_continuous(true); + ++i; + } + } +} + future counter_write_query(schema_ptr s, const mutation_source& source, const dht::decorated_key& dk, const query::partition_slice& slice, diff --git a/mutation_partition.hh b/mutation_partition.hh index b8ff57d5b9ff..ff4b14a9b118 100644 --- a/mutation_partition.hh +++ b/mutation_partition.hh @@ -33,6 +33,7 @@ #include "schema.hh" #include "tombstone.hh" #include "keys.hh" +#include "position_in_partition.hh" #include "atomic_cell_or_collection.hh" #include "query-result.hh" #include "mutation_partition_view.hh" @@ -598,6 +599,9 @@ class deletable_row final { public: deletable_row() {} explicit deletable_row(clustering_row&&); + deletable_row(row_tombstone tomb, const row_marker& marker, const row& cells) + : _deleted_at(tomb), _marker(marker), _cells(cells) + {} void apply(tombstone deleted_at) { _deleted_at.apply(deleted_at); @@ -624,6 +628,10 @@ public: void apply_reversibly(const schema& s, deletable_row& src); // See reversibly_mergeable.hh void revert(const schema& s, deletable_row& src); + + // Weak exception guarantees. After exception, both src and this will commute to the same value as + // they would should the exception not happen. 
+ void apply(const schema& s, deletable_row&& src); public: row_tombstone deleted_at() const { return _deleted_at; } api::timestamp_type created_at() const { return _marker.timestamp(); } @@ -642,28 +650,59 @@ class rows_entry { intrusive_set_external_comparator_member_hook _link; clustering_key _key; deletable_row _row; + struct flags { + bool _continuous : 1; // See doc of is_continuous. + bool _dummy : 1; + bool _last : 1; + bool _erased : 1; // Used only temporarily during apply_reversibly(). Refs #2012. + flags() : _continuous(true), _dummy(false), _last(false), _erased(false) { } + } _flags{}; friend class mutation_partition; public: + struct erased_tag {}; + rows_entry(erased_tag, const rows_entry& e) + : _key(e._key) + { + _flags._erased = true; + _flags._last = e._flags._last; + } explicit rows_entry(clustering_key&& key) : _key(std::move(key)) { } explicit rows_entry(const clustering_key& key) : _key(key) { } + rows_entry(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) + : _key(pos.key()) + { + if (!pos.is_clustering_row()) { + assert(bool(dummy)); + assert(pos.is_after_all_clustered_rows(s)); // FIXME: Support insertion at any position + _flags._last = true; + } + _flags._dummy = bool(dummy); + _flags._continuous = bool(continuous); + } rows_entry(const clustering_key& key, deletable_row&& row) : _key(key), _row(std::move(row)) { } rows_entry(const clustering_key& key, const deletable_row& row) : _key(key), _row(row) { } + rows_entry(const clustering_key& key, row_tombstone tomb, const row_marker& marker, const row& row) + : _key(key), _row(tomb, marker, row) + { } rows_entry(rows_entry&& o) noexcept; rows_entry(const rows_entry& e) : _key(e._key) , _row(e._row) + , _flags(e._flags) { } + // Valid only if !dummy() clustering_key& key() { return _key; } + // Valid only if !dummy() const clustering_key& key() const { return _key; } @@ -673,6 +712,11 @@ public: const deletable_row& row() const { return _row; } + position_in_partition_view position() const; + is_continuous continuous() const { return is_continuous(_flags._continuous); } + void set_continuous(bool value) { _flags._continuous = value; } + void set_continuous(is_continuous value) { set_continuous(bool(value)); } + is_dummy dummy() const { return is_dummy(_flags._dummy); } void apply(row_tombstone t) { _row.apply(t); } @@ -687,23 +731,54 @@ public: bool empty() const { return _row.empty(); } + bool erased() const { + return _flags._erased; + } + struct tri_compare { + position_in_partition::tri_compare _c; + explicit tri_compare(const schema& s) : _c(s) {} + int operator()(const rows_entry& e1, const rows_entry& e2) const { + return _c(e1.position(), e2.position()); + } + int operator()(const clustering_key& key, const rows_entry& e) const { + return _c(position_in_partition_view::for_key(key), e.position()); + } + int operator()(const rows_entry& e, const clustering_key& key) const { + return _c(e.position(), position_in_partition_view::for_key(key)); + } + int operator()(const rows_entry& e, position_in_partition_view p) const { + return _c(e.position(), p); + } + int operator()(position_in_partition_view p, const rows_entry& e) const { + return _c(p, e.position()); + } + int operator()(position_in_partition_view p1, position_in_partition_view p2) const { + return _c(p1, p2); + } + }; struct compare { - clustering_key::less_compare _c; - compare(const schema& s) : _c(s) {} + tri_compare _c; + explicit compare(const schema& s) : _c(s) {} bool operator()(const rows_entry& e1, 
const rows_entry& e2) const { - return _c(e1._key, e2._key); + return _c(e1, e2) < 0; } bool operator()(const clustering_key& key, const rows_entry& e) const { - return _c(key, e._key); + return _c(key, e) < 0; } bool operator()(const rows_entry& e, const clustering_key& key) const { - return _c(e._key, key); + return _c(e, key) < 0; } bool operator()(const clustering_key_view& key, const rows_entry& e) const { - return _c(key, e._key); + return _c(key, e) < 0; } bool operator()(const rows_entry& e, const clustering_key_view& key) const { - return _c(e._key, key); + return _c(e, key) < 0; + } + bool operator()(const rows_entry& e, position_in_partition_view p) const { + return _c(e.position(), p) < 0; + } + bool operator()(position_in_partition_view p, const rows_entry& e) const { + return _c(p, e.position()) < 0; } }; template @@ -712,10 +787,16 @@ public: delegating_compare(Comparator&& c) : _c(std::move(c)) {} template bool operator()(const Comparable& v, const rows_entry& e) const { + if (e._flags._last) { + return true; + } return _c(v, e._key); } template bool operator()(const rows_entry& e, const Comparable& v) const { + if (e._flags._last) { + return false; + } return _c(e._key, v); } }; @@ -728,6 +809,47 @@ public: bool equal(const schema& s, const rows_entry& other, const schema& other_schema) const; }; +// Represents a set of writes made to a single partition. +// +// The object is schema-dependent. Each instance is governed by some +// specific schema version. Accessors require a reference to the schema object +// of that version. +// +// There is an operation of addition defined on mutation_partition objects +// (also called "apply"), which gives as a result an object representing the +// sum of writes contained in the addends. For instances governed by the same +// schema, addition is commutative and associative. +// +// In addition to representing writes, the object supports specifying a set of +// partition elements called "continuity". This set can be used to represent +// lack of information about certain parts of the partition. It can be +// specified which ranges of clustering keys belong to that set. We say that a +// key range is continuous if all keys in that range belong to the continuity +// set, and discontinuous otherwise. By default everything is continuous. +// The static row may be also continuous or not. +// Partition tombstone is always continuous. +// +// Continuity is ignored by instance equality. It's also transient, not +// preserved by serialization. +// +// Continuity is represented internally using flags on row entries. The key +// range between two consecutive entries (both ends exclusive) is continuous +// if and only if rows_entry::continuous() is true for the later entry. The +// range starting after the last entry is assumed to be continuous. The range +// corresponding to the key of the entry is continuous if and only if +// rows_entry::dummy() is false. +// +// Adding two fully-continuous instances gives a fully-continuous instance. +// Continuity doesn't affect how the write part is added. +// +// Addition of continuity is not commutative in general, but is associative. +// Continuity flags on objects representing the same thing (e.g. rows_entry +// with the same key) are merged such that the information stored in the left- +// hand operand wins. Flags on objects which are present only in one of the +// operands are transferred as-is. 
Such merging rules are useful for layering +// information in MVCC, where newer versions specify continuity with respect +// to the combined set of rows in all prior versions, not just in their +// versions. class mutation_partition final { public: using rows_type = intrusive_set_external_comparator; @@ -736,6 +858,7 @@ public: private: tombstone _tombstone; row _static_row; + bool _static_row_continuous = true; rows_type _rows; // Contains only strict prefixes so that we don't have to lookup full keys // in both _row_tombstones and _rows. @@ -745,6 +868,12 @@ private: friend class converting_mutation_partition_applier; public: struct copy_comparators_only {}; + struct incomplete_tag {}; + // Constructs an empty instance which is fully discontinuous except for the partition tombstone. + mutation_partition(incomplete_tag, const schema& s, tombstone); + static mutation_partition make_incomplete(const schema& s, tombstone t = {}) { + return mutation_partition(incomplete_tag(), s, t); + } mutation_partition(schema_ptr s) : _rows() , _row_tombstones(*s) @@ -762,6 +891,7 @@ public: mutation_partition& operator=(mutation_partition&& x) noexcept; bool equal(const schema&, const mutation_partition&) const; bool equal(const schema& this_schema, const mutation_partition& p, const schema& p_schema) const; + bool equal_continuity(const schema&, const mutation_partition&) const; // Consistent with equal() template void feed_hash(Hasher& h, const schema& s) const { @@ -770,6 +900,13 @@ public: } friend std::ostream& operator<<(std::ostream& os, const mutation_partition& mp); public: + // Makes sure there is a dummy entry after all clustered rows. Doesn't affect continuity. + // Doesn't invalidate iterators. + void ensure_last_dummy(const schema&); + bool static_row_continuous() const { return _static_row_continuous; } + void set_static_row_continuous(bool value) { _static_row_continuous = value; } + bool is_fully_continuous() const; + void make_fully_continuous(); void apply(tombstone t) { _tombstone.apply(t); } void apply_delete(const schema& schema, const clustering_key_prefix& prefix, tombstone t); void apply_delete(const schema& schema, range_tombstone rt); @@ -866,7 +1003,8 @@ public: public: deletable_row& clustered_row(const schema& s, const clustering_key& key); deletable_row& clustered_row(const schema& s, clustering_key&& key); - deletable_row& clustered_row(const schema& s, const clustering_key_view& key); + deletable_row& clustered_row(const schema& s, clustering_key_view key); + deletable_row& clustered_row(const schema& s, position_in_partition_view pos, is_dummy, is_continuous); public: tombstone partition_tombstone() const { return _tombstone; } row& static_row() { return _static_row; } @@ -879,6 +1017,7 @@ public: const row* find_row(const schema& s, const clustering_key& key) const; tombstone range_tombstone_for_row(const schema& schema, const clustering_key& key) const; row_tombstone tombstone_for_row(const schema& schema, const clustering_key& key) const; + // Can be called only for non-dummy entries row_tombstone tombstone_for_row(const schema& schema, const rows_entry& e) const; boost::iterator_range range(const schema& schema, const query::clustering_range& r) const; rows_type::const_iterator lower_bound(const schema& schema, const query::clustering_range& r) const; @@ -886,6 +1025,11 @@ public: rows_type::iterator lower_bound(const schema& schema, const query::clustering_range& r); rows_type::iterator upper_bound(const schema& schema, const query::clustering_range& r); 
boost::iterator_range range(const schema& schema, const query::clustering_range& r); + // Returns an iterator range of rows_entry, with only non-dummy entries. + auto non_dummy_rows() const { + return boost::make_iterator_range(_rows.begin(), _rows.end()) + | boost::adaptors::filtered([] (const rows_entry& e) { return bool(!e.dummy()); }); + } // Writes this partition using supplied query result writer. // The partition should be first compacted with compact_for_query(), otherwise // results may include data which is deleted/expired. diff --git a/mutation_partition_applier.hh b/mutation_partition_applier.hh index 23a2600e4f79..2414150add31 100644 --- a/mutation_partition_applier.hh +++ b/mutation_partition_applier.hh @@ -50,8 +50,8 @@ public: _p.apply_row_tombstone(_schema, rt); } - virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) override { - deletable_row& r = _p.clustered_row(_schema, key); + virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override { + deletable_row& r = _p.clustered_row(_schema, key, dummy, continuous); r.apply(rm); r.apply(deleted_at); _current_row = &r; diff --git a/mutation_partition_serializer.cc b/mutation_partition_serializer.cc index 9413f61c577b..7788e0fbed6f 100644 --- a/mutation_partition_serializer.cc +++ b/mutation_partition_serializer.cc @@ -196,7 +196,7 @@ void mutation_partition_serializer::write_serialized(Writer&& writer, const sche auto row_tombstones = write_row_cells(std::move(srow_writer), mp.static_row(), s, column_kind::static_column).end_static_row().start_range_tombstones(); write_tombstones(s, row_tombstones, mp.row_tombstones()); auto clustering_rows = std::move(row_tombstones).end_range_tombstones().start_rows(); - for (auto&& cr : mp.clustered_rows()) { + for (auto&& cr : mp.non_dummy_rows()) { write_row(clustering_rows.add(), s, cr.key(), cr.row().cells(), cr.row().marker(), cr.row().deleted_at()); } std::move(clustering_rows).end_rows().end_mutation_partition(); diff --git a/mutation_partition_view.cc b/mutation_partition_view.cc index f96645014c83..d2d555c602ce 100644 --- a/mutation_partition_view.cc +++ b/mutation_partition_view.cc @@ -210,7 +210,7 @@ mutation_partition_view::accept(const column_mapping& cm, mutation_partition_vis for (auto&& cr : mpv.rows()) { auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at())); - visitor.accept_row(cr.key(), t, read_row_marker(cr.marker())); + visitor.accept_row(position_in_partition_view::for_key(cr.key()), t, read_row_marker(cr.marker())); struct cell_visitor { mutation_partition_visitor& _visitor; diff --git a/mutation_partition_visitor.hh b/mutation_partition_visitor.hh index 3e3f014bcdf4..9a7c5ac50c65 100644 --- a/mutation_partition_visitor.hh +++ b/mutation_partition_visitor.hh @@ -29,6 +29,19 @@ class row_marker; class row_tombstone; +// When used on an entry, marks the range between this entry and the previous +// one as continuous or discontinuous, excluding the keys of both entries. +// This information doesn't apply to continuity of the entries themselves, +// that is specified by is_dummy flag. +// See class doc of mutation_partition. +using is_continuous = bool_class; + +// Dummy entry is an entry which is incomplete. +// Typically used for marking bounds of continuity range. +// See class doc of mutation_partition. 
+class dummy_tag {}; +using is_dummy = bool_class; + // Guarantees: // // - any tombstones which affect cell's liveness are visited before that cell @@ -56,7 +69,8 @@ public: virtual void accept_row_tombstone(const range_tombstone&) = 0; - virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) = 0; + virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, + is_dummy = is_dummy::no, is_continuous = is_continuous::yes) = 0; virtual void accept_row_cell(column_id id, atomic_cell_view) = 0; diff --git a/mutation_reader.cc b/mutation_reader.cc index 4f28c64891f6..8eb1cc861f15 100644 --- a/mutation_reader.cc +++ b/mutation_reader.cc @@ -153,8 +153,8 @@ class reader_returning final : public mutation_reader::impl { } }; -mutation_reader make_reader_returning(mutation m) { - return make_mutation_reader(streamed_mutation_from_mutation(std::move(m))); +mutation_reader make_reader_returning(mutation m, streamed_mutation::forwarding fwd) { + return make_mutation_reader(streamed_mutation_from_mutation(std::move(m), std::move(fwd))); } mutation_reader make_reader_returning(streamed_mutation m) { @@ -324,3 +324,36 @@ make_multi_range_reader(schema_ptr s, mutation_source source, const dht::partiti return make_mutation_reader(std::move(s), std::move(source), ranges, slice, pc, std::move(trace_state), fwd, fwd_mr); } + +snapshot_source make_empty_snapshot_source() { + return snapshot_source([] { + return make_empty_mutation_source(); + }); +} + +mutation_source make_empty_mutation_source() { + return mutation_source([](schema_ptr s, + const dht::partition_range& pr, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr tr, + streamed_mutation::forwarding fwd) { + return make_empty_reader(); + }); +} + +mutation_source make_combined_mutation_source(std::vector addends) { + return mutation_source([addends = std::move(addends)] (schema_ptr s, + const dht::partition_range& pr, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr tr, + streamed_mutation::forwarding fwd) { + std::vector rd; + rd.reserve(addends.size()); + for (auto&& ms : addends) { + rd.emplace_back(ms(s, pr, slice, pc, tr, fwd)); + } + return make_combined_reader(std::move(rd)); + }); +} diff --git a/mutation_reader.hh b/mutation_reader.hh index c7aa94d6db5c..cee317a067b7 100644 --- a/mutation_reader.hh +++ b/mutation_reader.hh @@ -159,7 +159,7 @@ public: mutation_reader make_combined_reader(std::vector); mutation_reader make_combined_reader(mutation_reader&& a, mutation_reader&& b); // reads from the input readers, in order -mutation_reader make_reader_returning(mutation); +mutation_reader make_reader_returning(mutation, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no); mutation_reader make_reader_returning(streamed_mutation); mutation_reader make_reader_returning_many(std::vector, const query::partition_slice& slice = query::full_slice, @@ -279,34 +279,36 @@ class mutation_source { // We could have our own version of std::function<> that is nothrow // move constructible and save some indirection and allocation. // Probably not worth the effort though. 
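// Illustrative sketch (not part of this diff): summing mutation sources with
// the helpers defined above in mutation_reader.cc. The schema, range, slice
// and priority class are supplied by the caller and are not defined here.
inline mutation_reader read_from_sum(schema_ptr s,
                                     const dht::partition_range& pr,
                                     const query::partition_slice& slice,
                                     const io_priority_class& pc,
                                     std::vector<mutation_source> inputs) {
    // An empty source is the neutral element of the sum.
    inputs.push_back(make_empty_mutation_source());
    mutation_source combined = make_combined_mutation_source(std::move(inputs));
    // The combined source hands back a reader which merges all of the inputs.
    return combined(std::move(s), pr, slice, pc, tracing::trace_state_ptr(),
                    streamed_mutation::forwarding::no);
}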
- std::unique_ptr _fn; + lw_shared_ptr _fn; private: mutation_source() = default; explicit operator bool() const { return bool(_fn); } friend class optimized_optional; public: - mutation_source(func_type fn) : _fn(std::make_unique(std::move(fn))) {} + mutation_source(func_type fn) : _fn(make_lw_shared(std::move(fn))) {} + // For sources which don't care about the mutation_reader::forwarding flag (always fast forwardable) + mutation_source(std::function fn) + : _fn(make_lw_shared([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr tr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) { + return fn(s, range, slice, pc, std::move(tr), fwd); + })) {} mutation_source(std::function fn) - : _fn(std::make_unique([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding) { + : _fn(make_lw_shared([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) { + assert(!fwd); return fn(s, range, slice, pc); })) {} mutation_source(std::function fn) - : _fn(std::make_unique([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding) { + : _fn(make_lw_shared([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) { + assert(!fwd); return fn(s, range, slice); })) {} mutation_source(std::function fn) - : _fn(std::make_unique([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding) { + : _fn(make_lw_shared([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) { + assert(!fwd); return fn(s, range); })) {} - mutation_source(const mutation_source& other) - : _fn(std::make_unique(*other._fn)) { } - - mutation_source& operator=(const mutation_source& other) { - _fn = std::make_unique(*other._fn); - return *this; - } - + mutation_source(const mutation_source& other) = default; + mutation_source& operator=(const mutation_source& other) = default; mutation_source(mutation_source&&) = default; mutation_source& operator=(mutation_source&&) = default; @@ -326,6 +328,32 @@ public: } }; +// Returns a mutation_source which is the sum of given mutation_sources. +// +// Adding two mutation sources gives a mutation source which contains +// the sum of writes contained in the addends. +mutation_source make_combined_mutation_source(std::vector); + +// Represent mutation_source which can be snapshotted. +class snapshot_source { +private: + std::function _func; +public: + snapshot_source(std::function func) + : _func(std::move(func)) + { } + + // Creates a new snapshot. + // The returned mutation_source represents all earlier writes and only those. + // Note though that the mutations in the snapshot may get compacted over time. 
+ mutation_source operator()() { + return _func(); + } +}; + +mutation_source make_empty_mutation_source(); +snapshot_source make_empty_snapshot_source(); + template<> struct move_constructor_disengages { enum { value = true }; diff --git a/partition_builder.hh b/partition_builder.hh index f26a9d2f3604..e04ce2ff99a4 100644 --- a/partition_builder.hh +++ b/partition_builder.hh @@ -56,8 +56,8 @@ public: _partition.apply_row_tombstone(_schema, rt); } - virtual void accept_row(clustering_key_view key, const row_tombstone& deleted_at, const row_marker& rm) override { - deletable_row& r = _partition.clustered_row(_schema, key); + virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override { + deletable_row& r = _partition.clustered_row(_schema, key, dummy, continuous); r.apply(rm); r.apply(deleted_at); _current_row = &r; diff --git a/partition_snapshot_reader.hh b/partition_snapshot_reader.hh index 3173d20c0281..a1445382bb18 100644 --- a/partition_snapshot_reader.hh +++ b/partition_snapshot_reader.hh @@ -30,6 +30,26 @@ struct partition_snapshot_reader_dummy_accounter { }; extern partition_snapshot_reader_dummy_accounter no_accounter; +inline void maybe_merge_versions(lw_shared_ptr& snp, + logalloc::region& lsa_region, + logalloc::allocating_section& read_section) { + if (!snp.owned()) { + return; + } + // If no one else is using this particular snapshot try to merge partition + // versions. + with_allocator(lsa_region.allocator(), [&snp, &lsa_region, &read_section] { + return with_linearized_managed_bytes([&snp, &lsa_region, &read_section] { + try { + read_section(lsa_region, [&snp] { + snp->merge_partition_versions(); + }); + } catch (...) { } + snp = {}; + }); + }); +} + template class partition_snapshot_reader : public streamed_mutation::impl, public MemoryAccounter { struct rows_position { @@ -45,21 +65,6 @@ class partition_snapshot_reader : public streamed_mutation::impl, public MemoryA return _cmp(*b._position, *a._position); } }; - class rows_entry_compare { - position_in_partition::less_compare _cmp; - public: - explicit rows_entry_compare(const schema& s) : _cmp(s) { } - bool operator()(const rows_entry& a, const position_in_partition& b) const { - position_in_partition_view a_view(position_in_partition_view::clustering_row_tag_t(), - a.key()); - return _cmp(a_view, b); - } - bool operator()(const position_in_partition& a, const rows_entry& b) const { - position_in_partition_view b_view(position_in_partition_view::clustering_row_tag_t(), - b.key()); - return _cmp(a, b_view); - } - }; private: // Keeps shared pointer to the container we read mutation from to make sure // that its lifetime is appropriately extended. 
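// Illustrative sketch (not part of this diff): two minimal snapshot_source
// implementations, using the interface declared in mutation_reader.hh above.
// Every call must return a mutation_source reflecting all writes made so far.

// No writes ever happen, so the empty source is always a correct snapshot;
// this is essentially what make_empty_snapshot_source() returns.
inline snapshot_source never_written() {
    return snapshot_source([] {
        return make_empty_mutation_source();
    });
}

// A state that is replaced wholesale: `current` is a hypothetical shared slot
// holding the latest source; each snapshot copies whatever is installed at the
// time of the call and is unaffected by later replacements.
inline snapshot_source snapshot_of(lw_shared_ptr<mutation_source> current) {
    return snapshot_source([current] {
        return *current;
    });
}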
@@ -70,8 +75,8 @@ private: query::clustering_row_ranges::const_iterator _ck_range_end; bool _in_ck_range = false; - rows_entry_compare _cmp; - clustering_key_prefix::equality _eq; + rows_entry::compare _cmp; + position_in_partition::equal_compare _eq; heap_compare _heap_cmp; lw_shared_ptr _snapshot; @@ -94,8 +99,14 @@ private: void refresh_iterators() { _clustering_rows.clear(); - if (!_in_ck_range && _current_ck_range == _ck_range_end) { - return; + if (!_in_ck_range) { + if (_current_ck_range == _ck_range_end) { + _end_of_stream = true; + return; + } + for (auto&& v : _snapshot->versions()) { + _range_tombstones.apply(v.partition().row_tombstones(), *_current_ck_range); + } } for (auto&& v : _snapshot->versions()) { @@ -117,14 +128,27 @@ private: boost::range::make_heap(_clustering_rows, _heap_cmp); } - void pop_clustering_row() { + // Valid if has_more_rows() + const rows_entry& pop_clustering_row() { + boost::range::pop_heap(_clustering_rows, _heap_cmp); auto& current = _clustering_rows.back(); + const rows_entry& e = *current._position; current._position = std::next(current._position); if (current._position == current._end) { _clustering_rows.pop_back(); } else { boost::range::push_heap(_clustering_rows, _heap_cmp); } + return e; + } + + // Valid if has_more_rows() + const rows_entry& peek_row() const { + return *_clustering_rows.front()._position; + } + + bool has_more_rows() const { + return !_clustering_rows.empty(); } mutation_fragment_opt read_static_row() { @@ -143,20 +167,18 @@ private: } mutation_fragment_opt read_next() { - if (!_clustering_rows.empty()) { - auto mf = _range_tombstones.get_next(*_clustering_rows.front()._position); + while (has_more_rows()) { + auto mf = _range_tombstones.get_next(peek_row()); if (mf) { return mf; } - - boost::range::pop_heap(_clustering_rows, _heap_cmp); - clustering_row result = *_clustering_rows.back()._position; - pop_clustering_row(); - while (!_clustering_rows.empty() && _eq(_clustering_rows.front()._position->key(), result.key())) { - boost::range::pop_heap(_clustering_rows, _heap_cmp); - auto& current = _clustering_rows.back(); - result.apply(*_schema, *current._position); - pop_clustering_row(); + const rows_entry& e = pop_clustering_row(); + if (e.dummy()) { + continue; + } + clustering_row result = e; + while (has_more_rows() && _eq(peek_row().position(), result.position())) { + result.apply(*_schema, pop_clustering_row()); } _last_entry = position_in_partition(result.position()); return mutation_fragment(std::move(result)); @@ -184,18 +206,13 @@ private: } while (!is_end_of_stream() && !is_buffer_full()) { - if (_in_ck_range && _clustering_rows.empty()) { - _in_ck_range = false; - _current_ck_range = std::next(_current_ck_range); - refresh_iterators(); - continue; - } - auto mfopt = read_next(); if (mfopt) { emplace_mutation_fragment(std::move(*mfopt)); } else { - _end_of_stream = true; + _in_ck_range = false; + _current_ck_range = std::next(_current_ck_range); + refresh_iterators(); } } } @@ -226,31 +243,11 @@ public: , _range_tombstones(*s) , _lsa_region(region) , _read_section(read_section) { - for (auto&& v : _snapshot->versions()) { - auto&& rt_list = v.partition().row_tombstones(); - for (auto&& range : _ck_ranges.ranges()) { - _range_tombstones.apply(rt_list, range); - } - } do_fill_buffer(); } ~partition_snapshot_reader() { - if (!_snapshot.owned()) { - return; - } - // If no one else is using this particular snapshot try to merge partition - // versions. 
- with_allocator(_lsa_region.allocator(), [this] { - return with_linearized_managed_bytes([this] { - try { - _read_section(_lsa_region, [this] { - _snapshot->merge_partition_versions(); - }); - } catch (...) { } - _snapshot = {}; - }); - }); + maybe_merge_versions(_snapshot, _lsa_region, _read_section); } virtual future<> fill_buffer() override { diff --git a/partition_snapshot_row_cursor.hh b/partition_snapshot_row_cursor.hh new file mode 100644 index 000000000000..e91fe7294ca7 --- /dev/null +++ b/partition_snapshot_row_cursor.hh @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include "partition_version.hh" + +// Allows iterating over rows of mutation_partition represented by given partition_snapshot. +// +// The cursor initially has a position before all rows and is not pointing at any row. +// To position the cursor, use advance_to(). +// +// All methods should be called with the region of the snapshot locked. The cursor is invalidated +// when that lock section is left, or if the snapshot is modified. +// +// When the cursor is invalidated, it still maintains its previous position. It can be brought +// back to validity by calling maybe_refresh(), or advance_to(). 
+// +class partition_snapshot_row_cursor final { + struct position_in_version { + mutation_partition::rows_type::iterator it; + mutation_partition::rows_type::iterator end; + int version_no; + + struct less_compare { + rows_entry::tri_compare _cmp; + public: + explicit less_compare(const schema& s) : _cmp(s) { } + bool operator()(const position_in_version& a, const position_in_version& b) { + auto res = _cmp(*a.it, *b.it); + return res > 0 || (res == 0 && a.version_no > b.version_no); + } + }; + }; + + const schema& _schema; + logalloc::region& _region; + partition_snapshot& _snp; + std::vector _heap; + std::vector _current_row; + position_in_partition _position; + uint64_t _last_reclaim_count = 0; + size_t _last_versions_count = 0; + + // Removes the next row from _heap and puts it into _current_row + void recreate_current_row() { + position_in_version::less_compare heap_less(_schema); + position_in_partition::equal_compare eq(_schema); + do { + boost::range::pop_heap(_heap, heap_less); + _current_row.push_back(_heap.back()); + _heap.pop_back(); + } while (!_heap.empty() && eq(_current_row[0].it->position(), _heap[0].it->position())); + _position = position_in_partition(_current_row[0].it->position()); + } +public: + partition_snapshot_row_cursor(const schema& s, logalloc::region& region, partition_snapshot& snp) + : _schema(s) + , _region(region) + , _snp(snp) + , _position(position_in_partition::static_row_tag_t{}) + { } + bool has_up_to_date_row_from_latest_version() const { + return up_to_date() && _current_row[0].version_no == 0; + } + mutation_partition::rows_type::iterator get_iterator_in_latest_version() const { + return _current_row[0].it; + } + bool up_to_date() const { + return _region.reclaim_counter() == _last_reclaim_count && _last_versions_count == _snp.version_count(); + } + + // Brings back the cursor to validity. + // Can be only called when cursor is pointing at a row. + // + // Semantically equivalent to: + // + // advance_to(position()); + // + // but avoids work if not necessary. + bool maybe_refresh() { + if (!up_to_date()) { + return advance_to(_position); + } + return true; + } + + // Moves the cursor to the first entry with position >= pos. + // + // The caller must ensure that such entry exists. + // + // Returns true iff there can't be any clustering row entries + // between lower_bound (inclusive) and the entry to which the cursor + // was advanced. + // + // May be called when cursor is not valid. + // The cursor is valid after the call. + // Must be called under reclaim lock. + bool advance_to(position_in_partition_view lower_bound) { + rows_entry::compare less(_schema); + position_in_version::less_compare heap_less(_schema); + _heap.clear(); + _current_row.clear(); + int version_no = 0; + for (auto&& v : _snp.versions()) { + auto& rows = v.partition().clustered_rows(); + auto pos = rows.lower_bound(lower_bound, less); + auto end = rows.end(); + if (pos != end) { + _heap.push_back({pos, end, version_no}); + } + ++version_no; + } + boost::range::make_heap(_heap, heap_less); + _last_reclaim_count = _region.reclaim_counter(); + _last_versions_count = _snp.version_count(); + bool found = no_clustering_row_between(_schema, lower_bound, _heap[0].it->position()); + recreate_current_row(); + return found; + } + + // Advances the cursor to the next row. + // If there is no next row, returns false and the cursor is no longer pointing at a row. + // Can be only called on a valid cursor pointing at a row. 
+ bool next() { + position_in_version::less_compare heap_less(_schema); + assert(up_to_date()); + for (auto&& curr : _current_row) { + ++curr.it; + if (curr.it != curr.end) { + _heap.push_back(curr); + boost::range::push_heap(_heap, heap_less); + } + } + _current_row.clear(); + if (_heap.empty()) { + return false; + } + recreate_current_row(); + return true; + } + + // Can be called only when cursor is valid and pointing at a row. + bool continuous() const { return bool(_current_row[0].it->continuous()); } + + // Can be called only when cursor is valid and pointing at a row. + bool dummy() const { return bool(_current_row[0].it->dummy()); } + + // Can be called only when cursor is valid and pointing at a row, and !dummy(). + const clustering_key& key() const { return _current_row[0].it->key(); } + + // Can be called only when cursor is valid and pointing at a row. + clustering_row row() const { + clustering_row result(key()); + for (auto&& v : _current_row) { + result.apply(_schema, *v.it); + } + return result; + } + + // Can be called when cursor is pointing at a row, even when invalid. + const position_in_partition& position() const { + return _position; + } + + bool is_in_latest_version() const; + bool previous_row_in_latest_version_has_key(const clustering_key_prefix& key) const; + void set_continuous(bool val); +}; + +inline +bool partition_snapshot_row_cursor::is_in_latest_version() const { + return _current_row[0].version_no == 0; +} + +inline +bool partition_snapshot_row_cursor::previous_row_in_latest_version_has_key(const clustering_key_prefix& key) const { + if (_current_row[0].it == _snp.version()->partition().clustered_rows().begin()) { + return false; + } + auto prev_it = _current_row[0].it; + --prev_it; + clustering_key_prefix::tri_compare tri_comp(_schema); + return tri_comp(prev_it->key(), key) == 0; +} + +inline +void partition_snapshot_row_cursor::set_continuous(bool val) { + _current_row[0].it->set_continuous(val); +} diff --git a/partition_version.cc b/partition_version.cc index 6cbb0eb1744d..fa52861831c0 100644 --- a/partition_version.cc +++ b/partition_version.cc @@ -20,6 +20,7 @@ */ #include +#include #include "partition_version.hh" @@ -62,6 +63,72 @@ partition_version::~partition_version() } } +namespace { + +GCC6_CONCEPT( + +// A functor which transforms objects from Domain into objects from CoDomain +template +concept bool Mapper() { + return requires(U obj, const Domain& src) { + { obj(src) } -> const CoDomain& + }; +} + +// A functor which merges two objects from Domain into one. The result is stored in the first argument. +template +concept bool Reducer() { + return requires(U obj, Domain& dst, const Domain& src) { + { obj(dst, src) } -> void; + }; +} + +) + +// Calculates the value of particular part of mutation_partition represented by +// the version chain starting from v. +// |map| extracts the part from each version. +// |reduce| Combines parts from the two versions. 
+template +GCC6_CONCEPT( +requires Mapper() && Reducer() +) +inline Result squashed(const partition_version_ref& v, Map&& map, Reduce&& reduce) { + Result r = map(v->partition()); + auto it = v->next(); + while (it) { + reduce(r, map(it->partition())); + it = it->next(); + } + return r; +} + +} + +row partition_snapshot::static_row() const { + return ::squashed(version(), + [] (const mutation_partition& mp) -> const row& { return mp.static_row(); }, + [this] (row& a, const row& b) { a.apply(*_schema, column_kind::static_column, b); }); +} + +tombstone partition_snapshot::partition_tombstone() const { + return ::squashed(version(), + [] (const mutation_partition& mp) { return mp.partition_tombstone(); }, + [] (tombstone& a, tombstone b) { a.apply(b); }); +} + +mutation_partition partition_snapshot::squashed() const { + return ::squashed(version(), + [] (const mutation_partition& mp) -> const mutation_partition& { return mp; }, + [this] (mutation_partition& a, const mutation_partition& b) { a.apply(*_schema, b, *_schema); }); +} + +tombstone partition_entry::partition_tombstone() const { + return ::squashed(_version, + [] (const mutation_partition& mp) { return mp.partition_tombstone(); }, + [] (tombstone& a, tombstone b) { a.apply(b); }); +} + partition_snapshot::~partition_snapshot() { if (_version && _version.is_unique_owner()) { auto v = &*_version; @@ -139,20 +206,6 @@ void partition_entry::set_version(partition_version* new_version) _version = partition_version_ref(*new_version); } -void partition_entry::apply(const schema& s, partition_version* pv, const schema& pv_schema) -{ - if (!_snapshot) { - _version->partition().apply(s, std::move(pv->partition()), pv_schema); - current_allocator().destroy(pv); - } else { - if (s.version() != pv_schema.version()) { - pv->partition().upgrade(pv_schema, s); - } - pv->insert_before(*_version); - set_version(pv); - } -} - void partition_entry::apply(const schema& s, const mutation_partition& mp, const schema& mp_schema) { if (!_snapshot) { @@ -169,22 +222,6 @@ void partition_entry::apply(const schema& s, const mutation_partition& mp, const } } -void partition_entry::apply(const schema& s, mutation_partition&& mp, const schema& mp_schema) -{ - if (!_snapshot) { - _version->partition().apply(s, std::move(mp), mp_schema); - } else { - if (s.version() != mp_schema.version()) { - apply(s, mp, mp_schema); - } else { - auto new_version = current_allocator().construct(std::move(mp)); - new_version->insert_before(*_version); - - set_version(new_version); - } - } -} - void partition_entry::apply(const schema& s, mutation_partition_view mpv, const schema& mp_schema) { if (!_snapshot) { @@ -199,75 +236,286 @@ void partition_entry::apply(const schema& s, mutation_partition_view mpv, const } } -void partition_entry::apply(const schema& s, partition_entry&& pe, const schema& mp_schema) -{ - auto begin = &*pe._version; - auto snapshot = pe._snapshot; - if (pe._snapshot) { - pe._snapshot->_version = std::move(pe._version); - pe._snapshot->_entry = nullptr; - pe._snapshot = nullptr; - } - pe._version = { }; - - auto current = begin; - if (!current->next() && !current->is_referenced()) { - try { - apply(s, current, mp_schema); - } catch (...) { - pe._version = partition_version_ref(*current); - throw; +// Iterates over all rows in mutation represented by partition_entry. +// It abstracts away the fact that rows may be spread across multiple versions. 
+class partition_entry::rows_iterator final { + struct version { + mutation_partition::rows_type::iterator current_row; + mutation_partition::rows_type* rows; + bool can_move; + struct compare { + const rows_entry::tri_compare& _cmp; + public: + explicit compare(const rows_entry::tri_compare& cmp) : _cmp(cmp) { } + bool operator()(const version& a, const version& b) const { + return _cmp(*a.current_row, *b.current_row) > 0; + } + }; + }; + const schema& _schema; + rows_entry::tri_compare _rows_cmp; + rows_entry::compare _rows_less_cmp; + version::compare _version_cmp; + std::vector _heap; + std::vector _current_row; +public: + rows_iterator(partition_version* version, const schema& schema) + : _schema(schema) + , _rows_cmp(schema) + , _rows_less_cmp(schema) + , _version_cmp(_rows_cmp) + { + bool can_move = true; + while (version) { + can_move &= !version->is_referenced(); + auto& rows = version->partition().clustered_rows(); + if (!rows.empty()) { + _heap.push_back({rows.begin(), &rows, can_move}); + } + version = version->next(); } - return; + boost::range::make_heap(_heap, _version_cmp); + move_to_next_row(); + } + bool done() const { + return _current_row.empty(); + } + // Return clustering key of the current row in source. + // Valid only when !is_dummy(). + const clustering_key& key() const { + return _current_row[0].current_row->key(); } + bool is_dummy() const { + return bool(_current_row[0].current_row->dummy()); + } + template + void consume_row(RowConsumer&& consumer) { + assert(!_current_row.empty()); + // versions in _current_row are not ordered but it is not a problem + // due to the fact that all rows are continuous. + for (version& v : _current_row) { + if (!v.can_move) { + consumer(deletable_row(v.current_row->row())); + } else { + consumer(std::move(v.current_row->row())); + } + } + } + void remove_current_row_when_possible() { + assert(!_current_row.empty()); + auto deleter = current_deleter(); + for (version& v : _current_row) { + if (v.can_move) { + v.rows->erase_and_dispose(v.current_row, deleter); + } + } + } + void move_to_next_row() { + _current_row.clear(); + while (!_heap.empty() && + (_current_row.empty() || _rows_cmp(*_current_row[0].current_row, *_heap[0].current_row) == 0)) { + boost::range::pop_heap(_heap, _version_cmp); + auto& curr = _heap.back(); + _current_row.push_back({curr.current_row, curr.rows, curr.can_move}); + ++curr.current_row; + if (curr.current_row == curr.rows->end()) { + _heap.pop_back(); + } else { + boost::range::push_heap(_heap, _version_cmp); + } + } + } +}; - try { - while (current && !current->is_referenced()) { - auto next = current->next(); - apply(s, std::move(current->partition()), mp_schema); - // Leave current->partition() valid (albeit empty) in case we throw later. - current->partition() = mutation_partition(mp_schema.shared_from_this()); - current = next; +namespace { + +// When applying partition_entry to an incomplete partition_entry this class is used to represent +// the target incomplete partition_entry. It encapsulates the logic needed for handling multiple versions. 
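// Illustrative sketch (not part of this diff) of the caller-visible semantics:
// applying a fully-continuous update to an incomplete (cache-style) entry
// keeps the target's continuity and drops rows falling outside of it.
// `s` is a hypothetical schema_ptr and `update` a fully-continuous
// mutation_partition; LSA/allocator concerns are ignored here.
inline void apply_update_to_incomplete(schema_ptr s, mutation_partition update) {
    // Nothing is known in the target except the partition tombstone.
    partition_entry target(mutation_partition::make_incomplete(*s));
    partition_entry src(std::move(update));
    // Rows of `src` which land in ranges the target has not marked continuous
    // are not transferred; the target's continuity is left unchanged.
    target.apply_to_incomplete(*s, std::move(src), *s);
}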
+class apply_incomplete_target final { + struct version { + mutation_partition::rows_type::iterator current_row; + mutation_partition::rows_type* rows; + size_t version_no; + + struct compare { + const rows_entry::tri_compare& _cmp; + public: + explicit compare(const rows_entry::tri_compare& cmp) : _cmp(cmp) { } + bool operator()(const version& a, const version& b) const { + auto res = _cmp(*a.current_row, *b.current_row); + return res > 0 || (res == 0 && a.version_no > b.version_no); + } + }; + }; + const schema& _schema; + partition_entry& _pe; + rows_entry::tri_compare _rows_cmp; + rows_entry::compare _rows_less_cmp; + version::compare _version_cmp; + std::vector _heap; + mutation_partition::rows_type::iterator _next_in_latest_version; +public: + apply_incomplete_target(partition_entry& pe, const schema& schema) + : _schema(schema) + , _pe(pe) + , _rows_cmp(schema) + , _rows_less_cmp(schema) + , _version_cmp(_rows_cmp) + { + size_t version_no = 0; + _next_in_latest_version = pe.version()->partition().clustered_rows().begin(); + for (auto&& v : pe.version()->elements_from_this()) { + if (!v.partition().clustered_rows().empty()) { + _heap.push_back({v.partition().clustered_rows().begin(), &v.partition().clustered_rows(), version_no}); + } + ++version_no; } - while (current) { - auto next = current->next(); - apply(s, current->partition(), mp_schema); - current = next; + boost::range::make_heap(_heap, _version_cmp); + } + // Applies the row from source. + // Must be called for rows with monotonic keys. + // Weak exception guarantees. The target and source partitions are left + // in a state such that the two still commute to the same value on retry. + void apply(partition_entry::rows_iterator& src) { + auto&& key = src.key(); + while (!_heap.empty() && _rows_less_cmp(*_heap[0].current_row, key)) { + boost::range::pop_heap(_heap, _version_cmp); + auto& curr = _heap.back(); + curr.current_row = curr.rows->lower_bound(key, _rows_less_cmp); + if (curr.version_no == 0) { + _next_in_latest_version = curr.current_row; + } + if (curr.current_row == curr.rows->end()) { + _heap.pop_back(); + } else { + boost::range::push_heap(_heap, _version_cmp); + } } - } catch (...) { + + if (!_heap.empty()) { + rows_entry& next_row = *_heap[0].current_row; + if (_rows_cmp(key, next_row) == 0) { + if (next_row.dummy()) { + return; + } + } else if (!next_row.continuous()) { + return; + } + } + + mutation_partition::rows_type& rows = _pe.version()->partition().clustered_rows(); + if (_next_in_latest_version != rows.end() && _rows_cmp(key, *_next_in_latest_version) == 0) { + src.consume_row([&] (deletable_row&& row) { + _next_in_latest_version->row().apply(_schema, std::move(row)); + }); + } else { + auto e = current_allocator().construct(key); + e->set_continuous(_heap.empty() ? 
is_continuous::yes : _heap[0].current_row->continuous()); + rows.insert_before(_next_in_latest_version, *e); + src.consume_row([&] (deletable_row&& row) { + e->row().apply(_schema, std::move(row)); + }); + } + } +}; + +} // namespace + +template +void partition_entry::with_detached_versions(Func&& func) { + partition_version* current = &*_version; + auto snapshot = _snapshot; + if (snapshot) { + snapshot->_version = std::move(_version); + snapshot->_entry = nullptr; + _snapshot = nullptr; + } + _version = { }; + + auto revert = defer([&] { if (snapshot) { - pe._snapshot = snapshot; - snapshot->_entry = &pe; - pe._version = std::move(snapshot->_version); + _snapshot = snapshot; + snapshot->_entry = this; + _version = std::move(snapshot->_version); } else { - pe._version = partition_version_ref(*begin); + _version = partition_version_ref(*current); } - throw; + }); + + func(current); +} + +void partition_entry::apply_to_incomplete(const schema& s, partition_entry&& pe, const schema& pe_schema) +{ + if (s.version() != pe_schema.version()) { + partition_entry entry(pe.squashed(pe_schema.shared_from_this(), s.shared_from_this())); + entry.with_detached_versions([&] (partition_version* v) { + apply_to_incomplete(s, v); + }); + } else { + pe.with_detached_versions([&](partition_version* v) { + apply_to_incomplete(s, v); + }); } +} - current = begin; - while (current && !current->is_referenced()) { - auto next = current->next(); - current_allocator().destroy(current); - current = next; +void partition_entry::apply_to_incomplete(const schema& s, partition_version* version) { + partition_version& dst = open_version(s); + + bool can_move = true; + auto current = version; + bool static_row_continuous = dst.partition().static_row_continuous(); + while (current) { + can_move &= !current->is_referenced(); + dst.partition().apply(current->partition().partition_tombstone()); + if (static_row_continuous) { + row& static_row = dst.partition().static_row(); + if (can_move) { + static_row.apply(s, column_kind::static_column, std::move(current->partition().static_row())); + } else { + static_row.apply(s, column_kind::static_column, current->partition().static_row()); + } + } + range_tombstone_list& tombstones = dst.partition().row_tombstones(); + if (can_move) { + tombstones.apply_reversibly(s, current->partition().row_tombstones()).cancel(); + } else { + tombstones.apply(s, current->partition().row_tombstones()); + } + current = current->next(); } - if (current) { - current->back_reference().mark_as_unique_owner(); + + partition_entry::rows_iterator source(version, s); + apply_incomplete_target target(*this, s); + + while (!source.done()) { + if (!source.is_dummy()) { + target.apply(source); + } + source.remove_current_row_when_possible(); + source.move_to_next_row(); } } mutation_partition partition_entry::squashed(schema_ptr from, schema_ptr to) { mutation_partition mp(to); + mp.set_static_row_continuous(_version->partition().static_row_continuous()); for (auto&& v : _version->all_elements()) { mp.apply(*to, v.partition(), *from); } return mp; } +mutation_partition partition_entry::squashed(const schema& s) +{ + return squashed(s.shared_from_this(), s.shared_from_this()); +} + void partition_entry::upgrade(schema_ptr from, schema_ptr to) { auto new_version = current_allocator().construct(mutation_partition(to)); + new_version->partition().set_static_row_continuous(_version->partition().static_row_continuous()); try { for (auto&& v : _version->all_elements()) { new_version->partition().apply(*to, 
v.partition(), *from); @@ -282,13 +530,45 @@ void partition_entry::upgrade(schema_ptr from, schema_ptr to) remove_or_mark_as_unique_owner(old_version); } -lw_shared_ptr partition_entry::read(schema_ptr entry_schema) +lw_shared_ptr partition_entry::read(schema_ptr entry_schema, partition_snapshot::phase_type phase) { + open_version(*entry_schema, phase); if (_snapshot) { return _snapshot->shared_from_this(); } else { - auto snp = make_lw_shared(entry_schema, this); + auto snp = make_lw_shared(entry_schema, this, phase); _snapshot = snp.get(); return snp; } } + +std::vector +partition_snapshot::range_tombstones(const schema& s, position_in_partition_view start, position_in_partition_view end) +{ + range_tombstone_list list(s); + for (auto&& v : versions()) { + for (auto&& rt : v.partition().row_tombstones().slice(s, start, end)) { + list.apply(s, rt); + } + } + return boost::copy_range>(list); +} + +std::ostream& operator<<(std::ostream& out, partition_entry& e) { + out << "{"; + bool first = true; + if (e._version) { + for (const partition_version& v : e.versions()) { + if (!first) { + out << ", "; + } + if (v.is_referenced()) { + out << "(*) "; + } + out << v.partition(); + first = false; + } + } + out << "}"; + return out; +} diff --git a/partition_version.hh b/partition_version.hh index e78a14a7ab28..a1b100809d6f 100644 --- a/partition_version.hh +++ b/partition_version.hh @@ -117,6 +117,8 @@ class partition_version : public anchorless_list_base_hook { friend class partition_version_ref; public: + explicit partition_version(schema_ptr s) noexcept + : _partition(std::move(s)) { } explicit partition_version(mutation_partition mp) noexcept : _partition(std::move(mp)) { } partition_version(partition_version&& pv) noexcept; @@ -126,10 +128,12 @@ public: mutation_partition& partition() { return _partition; } const mutation_partition& partition() const { return _partition; } - bool is_referenced() { return _backref; } + bool is_referenced() const { return _backref; } partition_version_ref& back_reference() { return *_backref; } }; +using partition_version_range = anchorless_list_base_hook::range; + class partition_version_ref { partition_version* _version = nullptr; bool _unique_owner = false; @@ -160,7 +164,7 @@ public: return *this; } - explicit operator bool() { return _version; } + explicit operator bool() const { return _version; } partition_version& operator*() { assert(_version); @@ -170,6 +174,10 @@ public: assert(_version); return _version; } + const partition_version* operator->() const { + assert(_version); + return _version; + } bool is_unique_owner() const { return _unique_owner; } void mark_as_unique_owner() { _unique_owner = true; } @@ -178,15 +186,24 @@ public: class partition_entry; class partition_snapshot : public enable_lw_shared_from_this { +public: + // Only snapshots created with the same value of phase can point to the same version. + using phase_type = uint64_t; + static constexpr phase_type default_phase = 0; + static constexpr phase_type max_phase = std::numeric_limits::max(); +private: schema_ptr _schema; // Either _version or _entry is non-null. 
partition_version_ref _version; partition_entry* _entry; + phase_type _phase; friend class partition_entry; public: - explicit partition_snapshot(schema_ptr s, partition_entry* entry) - : _schema(std::move(s)), _entry(entry) { } + explicit partition_snapshot(schema_ptr s, + partition_entry* entry, + phase_type phase = default_phase) + : _schema(std::move(s)), _entry(entry), _phase(phase) { } partition_snapshot(const partition_snapshot&) = delete; partition_snapshot(partition_snapshot&&) = delete; partition_snapshot& operator=(const partition_snapshot&) = delete; @@ -201,23 +218,48 @@ public: partition_version_ref& version(); - auto versions() { + const partition_version_ref& version() const; + + partition_version_range versions() { return version()->elements_from_this(); } unsigned version_count(); + + bool at_latest_version() const { + return _entry != nullptr; + } + + tombstone partition_tombstone() const; + row static_row() const; + mutation_partition squashed() const; + // Returns range tombstones overlapping with [start, end) + std::vector range_tombstones(const schema& s, position_in_partition_view start, position_in_partition_view end); }; +// Represents mutation_partition with snapshotting support a la MVCC. +// +// Internally the state is represented by an ordered list of mutation_partition +// objects called versions. The logical mutation_partition state represented +// by that chain is equal to reducing the chain using mutation_partition::apply() +// from left (latest version) to right. class partition_entry { partition_snapshot* _snapshot = nullptr; partition_version_ref _version; friend class partition_snapshot; + friend class cache_entry; private: + // Detaches all versions temporarily around execution of the function. + // The function receives partition_version* pointing to the latest version. + template + void with_detached_versions(Func&&); + void set_version(partition_version*); - void apply(const schema& s, partition_version* pv, const schema& pv_schema); + void apply_to_incomplete(const schema& s, partition_version* other); public: + class rows_iterator; partition_entry() = default; explicit partition_entry(mutation_partition mp); ~partition_entry(); @@ -238,28 +280,68 @@ public: return *this; } + partition_version_ref& version() { + return _version; + } + + partition_version_range versions() { + return _version->elements_from_this(); + } + // Strong exception guarantees. + // Assumes this instance and mp are fully continuous. void apply(const schema& s, const mutation_partition& mp, const schema& mp_schema); - // Same exception guarantees as: - // mutation_partition::apply(const schema&, mutation_partition&&, const schema&) - void apply(const schema& s, mutation_partition&& mp, const schema& mp_schema); - // Strong exception guarantees. + // Assumes this instance and mpv are fully continuous. void apply(const schema& s, mutation_partition_view mpv, const schema& mp_schema); + // Adds mutation_partition represented by "other" to the one represented + // by this entry. + // + // The argument must be fully-continuous. + // + // The rules of addition differ from that used by regular + // mutation_partition addition with regards to continuity. The continuity + // of the result is the same as in this instance. Information from "other" + // which is incomplete in this instance is dropped. 
In other words, this + // performs set intersection on continuity information, drops information + // which falls outside of the continuity range, and applies regular merging + // rules for the rest. + // // Weak exception guarantees. // If an exception is thrown this and pe will be left in some valid states // such that if the operation is retried (possibly many times) and eventually // succeeds the result will be as if the first attempt didn't fail. - void apply(const schema& s, partition_entry&& pe, const schema& pe_schema); + void apply_to_incomplete(const schema& s, partition_entry&& pe, const schema& pe_schema); + + // Ensures that the latest version can be populated with data from given phase + // by inserting a new version if necessary. + // Doesn't affect value or continuity of the partition. + // Returns a reference to the new latest version. + partition_version& open_version(const schema& s, partition_snapshot::phase_type phase = partition_snapshot::max_phase) { + if (_snapshot && _snapshot->_phase != phase) { + auto new_version = current_allocator().construct(mutation_partition(s.shared_from_this())); + new_version->partition().set_static_row_continuous(_version->partition().static_row_continuous()); + new_version->insert_before(*_version); + set_version(new_version); + return *new_version; + } + return *_version; + } mutation_partition squashed(schema_ptr from, schema_ptr to); + mutation_partition squashed(const schema&); + tombstone partition_tombstone() const; // needs to be called with reclaiming disabled void upgrade(schema_ptr from, schema_ptr to); - lw_shared_ptr read(schema_ptr entry_schema); + // Snapshots with different values of phase will point to different partition_version objects. + lw_shared_ptr read(schema_ptr entry_schema, + partition_snapshot::phase_type phase = partition_snapshot::default_phase); + + friend std::ostream& operator<<(std::ostream& out, partition_entry& e); }; inline partition_version_ref& partition_snapshot::version() @@ -270,3 +352,12 @@ inline partition_version_ref& partition_snapshot::version() return _entry->_version; } } + +inline const partition_version_ref& partition_snapshot::version() const +{ + if (_version) { + return _version; + } else { + return _entry->_version; + } +} diff --git a/position_in_partition.hh b/position_in_partition.hh new file mode 100644 index 000000000000..765a1a73a69e --- /dev/null +++ b/position_in_partition.hh @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#pragma once + +#include "types.hh" +#include "keys.hh" +#include "clustering_bounds_comparator.hh" +#include "query-request.hh" + +inline +lexicographical_relation relation_for_lower_bound(composite_view v) { + switch (v.last_eoc()) { + case composite::eoc::start: + case composite::eoc::none: + return lexicographical_relation::before_all_prefixed; + case composite::eoc::end: + return lexicographical_relation::after_all_prefixed; + default: + assert(0); + } +} + +inline +lexicographical_relation relation_for_upper_bound(composite_view v) { + switch (v.last_eoc()) { + case composite::eoc::start: + return lexicographical_relation::before_all_prefixed; + case composite::eoc::none: + return lexicographical_relation::before_all_strictly_prefixed; + case composite::eoc::end: + return lexicographical_relation::after_all_prefixed; + default: + assert(0); + } +} + +class position_in_partition_view { + friend class position_in_partition; + + int _bound_weight = 0; + const clustering_key_prefix* _ck; // nullptr for static row +private: + position_in_partition_view(int bound_weight, const clustering_key_prefix* ck) + : _bound_weight(bound_weight) + , _ck(ck) + { } + // Returns placement of this position_in_partition relative to *_ck, + // or lexicographical_relation::at_prefix if !_ck. + lexicographical_relation relation() const { + // FIXME: Currently position_range cannot represent a range end bound which + // includes just the prefix key or a range start which excludes just a prefix key. + // In both cases we should return lexicographical_relation::before_all_strictly_prefixed here. + // Refs #1446. + if (_bound_weight <= 0) { + return lexicographical_relation::before_all_prefixed; + } else { + return lexicographical_relation::after_all_prefixed; + } + } +public: + struct static_row_tag_t { }; + struct clustering_row_tag_t { }; + struct range_tag_t { }; + using range_tombstone_tag_t = range_tag_t; + + position_in_partition_view(static_row_tag_t) : _ck(nullptr) { } + position_in_partition_view(clustering_row_tag_t, const clustering_key_prefix& ck) + : _ck(&ck) { } + position_in_partition_view(const clustering_key_prefix& ck) + : _ck(&ck) { } + position_in_partition_view(range_tag_t, bound_view bv) + : _bound_weight(weight(bv.kind)), _ck(&bv.prefix) { } + + static position_in_partition_view for_range_start(const query::clustering_range& r) { + return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)}; + } + + static position_in_partition_view for_range_end(const query::clustering_range& r) { + return {position_in_partition_view::range_tag_t(), bound_view::from_range_end(r)}; + } + + static position_in_partition_view before_all_clustered_rows() { + return {range_tag_t(), bound_view::bottom()}; + } + + static position_in_partition_view after_all_clustered_rows() { + return {position_in_partition_view::range_tag_t(), bound_view::top()}; + } + + static position_in_partition_view for_static_row() { + return {static_row_tag_t()}; + } + + static position_in_partition_view for_key(const clustering_key& ck) { + return {clustering_row_tag_t(), ck}; + } + + static position_in_partition_view after_key(const clustering_key& ck) { + return {1, &ck}; + } + + bool is_static_row() const { return !_ck; } + bool is_clustering_row() const { return _ck && !_bound_weight; } + + // Returns true if all fragments that can be seen for given schema have + // positions >= than this. 
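// Illustrative sketch (not part of this diff): how the factories above order
// positions for a given schema. `s` is a hypothetical schema with a non-empty
// clustering key and `ck` a clustering_key for it; position_in_partition::tri_compare
// is the schema-aware comparator already used by rows_entry::tri_compare in
// this patch.
inline void position_ordering_example(const schema& s, const clustering_key& ck) {
    position_in_partition::tri_compare cmp(s);
    auto before  = position_in_partition_view::before_all_clustered_rows();
    auto at_ck   = position_in_partition_view::for_key(ck);
    auto past_ck = position_in_partition_view::after_key(ck);
    auto end     = position_in_partition_view::after_all_clustered_rows();
    assert(cmp(before, at_ck) < 0);   // the lower bound precedes any row
    assert(cmp(at_ck, past_ck) < 0);  // a row precedes the position just after it
    assert(cmp(past_ck, end) < 0);    // which precedes the end of the partition
}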
+ bool is_before_all_fragments(const schema& s) const { + return !_ck || (!s.has_static_columns() && _bound_weight < 0 && _ck->is_empty(s)); + } + + bool is_after_all_clustered_rows(const schema& s) const { + return _ck && _ck->is_empty(s) && _bound_weight > 0; + } + + // Valid when >= before_all_clustered_rows() + const clustering_key_prefix& key() const { + return *_ck; + } + + // Can be called only when !is_static_row && !is_clustering_row(). + bound_view as_start_bound_view() const { + assert(_bound_weight != 0); + return bound_view(*_ck, _bound_weight < 0 ? bound_kind::incl_start : bound_kind::excl_start); + } + + friend std::ostream& operator<<(std::ostream&, position_in_partition_view); + friend bool no_clustering_row_between(const schema&, position_in_partition_view, position_in_partition_view); +}; + +class position_in_partition { + int _bound_weight = 0; + stdx::optional _ck; +public: + struct static_row_tag_t { }; + struct after_static_row_tag_t { }; + struct clustering_row_tag_t { }; + struct after_clustering_row_tag_t { }; + struct range_tag_t { }; + using range_tombstone_tag_t = range_tag_t; + + explicit position_in_partition(static_row_tag_t) { } + position_in_partition(clustering_row_tag_t, clustering_key_prefix ck) + : _ck(std::move(ck)) { } + position_in_partition(after_clustering_row_tag_t, clustering_key_prefix ck) + // FIXME: Use lexicographical_relation::before_strictly_prefixed here. Refs #1446 + : _bound_weight(1), _ck(std::move(ck)) { } + position_in_partition(range_tag_t, bound_view bv) + : _bound_weight(weight(bv.kind)), _ck(bv.prefix) { } + position_in_partition(after_static_row_tag_t) : + position_in_partition(range_tag_t(), bound_view::bottom()) { } + explicit position_in_partition(position_in_partition_view view) + : _bound_weight(view._bound_weight) + { + if (view._ck) { + _ck = *view._ck; + } + } + + static position_in_partition before_all_clustered_rows() { + return {position_in_partition::range_tag_t(), bound_view::bottom()}; + } + + static position_in_partition after_all_clustered_rows() { + return {position_in_partition::range_tag_t(), bound_view::top()}; + } + + static position_in_partition after_key(clustering_key ck) { + return {after_clustering_row_tag_t(), std::move(ck)}; + } + + static position_in_partition for_key(clustering_key ck) { + return {clustering_row_tag_t(), std::move(ck)}; + } + + static position_in_partition for_range_start(const query::clustering_range&); + static position_in_partition for_range_end(const query::clustering_range&); + + bool is_static_row() const { return !_ck; } + bool is_clustering_row() const { return _ck && !_bound_weight; } + + bool is_after_all_clustered_rows(const schema& s) const { + return _ck && _ck->is_empty(s) && _bound_weight > 0; + } + + template + void feed_hash(Hasher& hasher, const schema& s) const { + ::feed_hash(hasher, _bound_weight); + if (_ck) { + ::feed_hash(hasher, true); + _ck->feed_hash(hasher, s); + } else { + ::feed_hash(hasher, false); + } + } + + clustering_key_prefix& key() { + return *_ck; + } + const clustering_key_prefix& key() const { + return *_ck; + } + operator position_in_partition_view() const { + return { _bound_weight, _ck ? &*_ck : nullptr }; + } + + // Defines total order on the union of position_and_partition and composite objects. + // + // The ordering is compatible with position_range (r). 
The following is satisfied for + // all cells with name c included by the range: + // + // r.start() <= c < r.end() + // + // The ordering on composites given by this is compatible with but weaker than the cell name order. + // + // The ordering on position_in_partition given by this is compatible but weaker than the ordering + // given by position_in_partition::tri_compare. + // + class composite_tri_compare { + const schema& _s; + public: + composite_tri_compare(const schema& s) : _s(s) {} + + int operator()(position_in_partition_view a, position_in_partition_view b) const { + if (a.is_static_row() || b.is_static_row()) { + return b.is_static_row() - a.is_static_row(); + } + auto&& types = _s.clustering_key_type()->types(); + auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); }; + return lexicographical_tri_compare(types.begin(), types.end(), + a._ck->begin(_s), a._ck->end(_s), + b._ck->begin(_s), b._ck->end(_s), + cmp, a.relation(), b.relation()); + } + + int operator()(position_in_partition_view a, composite_view b) const { + if (b.empty()) { + return 1; // a cannot be empty. + } + if (a.is_static_row() || b.is_static()) { + return b.is_static() - a.is_static_row(); + } + auto&& types = _s.clustering_key_type()->types(); + auto b_values = b.values(); + auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); }; + return lexicographical_tri_compare(types.begin(), types.end(), + a._ck->begin(_s), a._ck->end(_s), + b_values.begin(), b_values.end(), + cmp, a.relation(), relation_for_lower_bound(b)); + } + + int operator()(composite_view a, position_in_partition_view b) const { + return -(*this)(b, a); + } + + int operator()(composite_view a, composite_view b) const { + if (a.is_static() != b.is_static()) { + return a.is_static() ? -1 : 1; + } + auto&& types = _s.clustering_key_type()->types(); + auto a_values = a.values(); + auto b_values = b.values(); + auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); }; + return lexicographical_tri_compare(types.begin(), types.end(), + a_values.begin(), a_values.end(), + b_values.begin(), b_values.end(), + cmp, + relation_for_lower_bound(a), + relation_for_lower_bound(b)); + } + }; + + // Less comparator giving the same order as composite_tri_compare. 
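+    // Can be used with standard ordered algorithms (e.g. lower_bound) over cell names and positions.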
+ class composite_less_compare { + composite_tri_compare _cmp; + public: + composite_less_compare(const schema& s) : _cmp(s) {} + + template + bool operator()(const T& a, const U& b) const { + return _cmp(a, b) < 0; + } + }; + + class tri_compare { + bound_view::tri_compare _cmp; + private: + template + int compare(const T& a, const U& b) const { + bool a_rt_weight = bool(a._ck); + bool b_rt_weight = bool(b._ck); + if (!a_rt_weight || !b_rt_weight) { + return a_rt_weight - b_rt_weight; + } + return _cmp(*a._ck, a._bound_weight, *b._ck, b._bound_weight); + } + public: + tri_compare(const schema& s) : _cmp(s) { } + int operator()(const position_in_partition& a, const position_in_partition& b) const { + return compare(a, b); + } + int operator()(const position_in_partition_view& a, const position_in_partition_view& b) const { + return compare(a, b); + } + int operator()(const position_in_partition& a, const position_in_partition_view& b) const { + return compare(a, b); + } + int operator()(const position_in_partition_view& a, const position_in_partition& b) const { + return compare(a, b); + } + }; + class less_compare { + tri_compare _cmp; + public: + less_compare(const schema& s) : _cmp(s) { } + bool operator()(const position_in_partition& a, const position_in_partition& b) const { + return _cmp(a, b) < 0; + } + bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const { + return _cmp(a, b) < 0; + } + bool operator()(const position_in_partition& a, const position_in_partition_view& b) const { + return _cmp(a, b) < 0; + } + bool operator()(const position_in_partition_view& a, const position_in_partition& b) const { + return _cmp(a, b) < 0; + } + }; + class equal_compare { + clustering_key_prefix::equality _equal; + template + bool compare(const T& a, const U& b) const { + bool a_rt_weight = bool(a._ck); + bool b_rt_weight = bool(b._ck); + return a_rt_weight == b_rt_weight + && (!a_rt_weight || (_equal(*a._ck, *b._ck) + && a._bound_weight == b._bound_weight)); + } + public: + equal_compare(const schema& s) : _equal(s) { } + bool operator()(const position_in_partition& a, const position_in_partition& b) const { + return compare(a, b); + } + bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const { + return compare(a, b); + } + bool operator()(const position_in_partition_view& a, const position_in_partition& b) const { + return compare(a, b); + } + bool operator()(const position_in_partition& a, const position_in_partition_view& b) const { + return compare(a, b); + } + }; + friend std::ostream& operator<<(std::ostream&, const position_in_partition&); +}; + +inline +position_in_partition position_in_partition::for_range_start(const query::clustering_range& r) { + return {position_in_partition::range_tag_t(), bound_view::from_range_start(r)}; +} + +inline +position_in_partition position_in_partition::for_range_end(const query::clustering_range& r) { + return {position_in_partition::range_tag_t(), bound_view::from_range_end(r)}; +} + +// Returns true if and only if there can't be any clustering_row with position > a and < b. +// It is assumed that a <= b. 
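+// In particular, this holds when both positions are the static row position, or when both are
+// anchored at the same clustering key and the open interval (a, b) excludes that key's row.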
+inline +bool no_clustering_row_between(const schema& s, position_in_partition_view a, position_in_partition_view b) { + clustering_key_prefix::equality eq(s); + if (a._ck && b._ck) { + return eq(*a._ck, *b._ck) && (a._bound_weight >= 0 || b._bound_weight <= 0); + } else { + return !a._ck && !b._ck; + } +} + +// Includes all position_in_partition objects "p" for which: start <= p < end +// And only those. +class position_range { +private: + position_in_partition _start; + position_in_partition _end; +public: + static position_range from_range(const query::clustering_range&); + + static position_range for_static_row() { + return { + position_in_partition(position_in_partition::static_row_tag_t()), + position_in_partition(position_in_partition::after_static_row_tag_t()) + }; + } + + static position_range full() { + return { + position_in_partition(position_in_partition::static_row_tag_t()), + position_in_partition::after_all_clustered_rows() + }; + } + + static position_range all_clustered_rows() { + return { + position_in_partition::before_all_clustered_rows(), + position_in_partition::after_all_clustered_rows() + }; + } + + position_range(position_range&&) = default; + position_range& operator=(position_range&&) = default; + position_range(const position_range&) = default; + position_range& operator=(const position_range&) = default; + + // Constructs position_range which covers the same rows as given clustering_range. + // position_range includes a fragment if it includes position of that fragment. + position_range(const query::clustering_range&); + position_range(query::clustering_range&&); + + position_range(position_in_partition start, position_in_partition end) + : _start(std::move(start)) + , _end(std::move(end)) + { } + + const position_in_partition& start() const& { return _start; } + position_in_partition&& start() && { return std::move(_start); } + const position_in_partition& end() const& { return _end; } + position_in_partition&& end() && { return std::move(_end); } + bool contains(const schema& s, position_in_partition_view pos) const; + bool overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const; + + friend std::ostream& operator<<(std::ostream&, const position_range&); +}; + +inline +bool position_range::contains(const schema& s, position_in_partition_view pos) const { + position_in_partition::less_compare less(s); + return !less(pos, _start) && less(pos, _end); +} + +inline +bool position_range::overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const { + position_in_partition::less_compare less(s); + return !less(end, _start) && less(start, _end); +} diff --git a/range.hh b/range.hh index 85a79b4779ad..4ee730a08528 100644 --- a/range.hh +++ b/range.hh @@ -548,6 +548,12 @@ public: return nonwrapping_range(range_bound(split_point, false), end()); } } + // Creates a new sub-range which is the intersection of this range and a range starting with "start". + // If there is no overlap, returns stdx::nullopt. + template + stdx::optional trim_front(stdx::optional&& start, Comparator&& cmp) const { + return intersection(nonwrapping_range(std::move(start), {}), cmp); + } // Transforms this range into a new range of a different value type // Supplied transformer should transform value of type T (the old type) into value of type U (the new type). 
template::type> diff --git a/range_tombstone.hh b/range_tombstone.hh index 79fe5267621e..9ab33382899e 100644 --- a/range_tombstone.hh +++ b/range_tombstone.hh @@ -29,11 +29,10 @@ #include "tombstone.hh" #include "clustering_bounds_comparator.hh" #include "stdx.hh" +#include "position_in_partition.hh" namespace bi = boost::intrusive; -class position_in_partition_view; - /** * Represents a ranged deletion operation. Can be empty. */ @@ -149,6 +148,27 @@ public: // is larger than the end bound of this. stdx::optional apply(const schema& s, range_tombstone&& src); + // Intersects the range of this tombstone with [pos, +inf) and replaces + // the range of the tombstone if there is an overlap. + // Returns true if there is an overlap. When returns false, the tombstone + // is not modified. + // + // pos must satisfy: + // 1) before_all_clustered_rows() <= pos + // 2) !pos.is_clustering_row() - because range_tombstone bounds can't represent such positions + bool trim_front(const schema& s, position_in_partition_view pos) { + position_in_partition::less_compare less(s); + if (!less(pos, end_position())) { + return false; + } + if (less(position(), pos)) { + bound_view new_start = pos.as_start_bound_view(); + start = new_start.prefix; + start_kind = new_start.kind; + } + return true; + } + size_t external_memory_usage() const { return start.external_memory_usage() + end.external_memory_usage(); } diff --git a/range_tombstone_list.cc b/range_tombstone_list.cc index 5a950ae062e0..78eb1157b5ee 100644 --- a/range_tombstone_list.cc +++ b/range_tombstone_list.cc @@ -307,11 +307,46 @@ range_tombstone_list::slice(const schema& s, const query::clustering_range& r) c _tombstones.upper_bound(bv_range.second, order_by_start{s})); } +boost::iterator_range +range_tombstone_list::slice(const schema& s, position_in_partition_view start, position_in_partition_view end) const { + struct order_by_end { + position_in_partition::less_compare less; + order_by_end(const schema& s) : less(s) {} + bool operator()(position_in_partition_view v, const range_tombstone& rt) const { return less(v, rt.end_position()); } + bool operator()(const range_tombstone& rt, position_in_partition_view v) const { return less(rt.end_position(), v); } + }; + struct order_by_start { + position_in_partition::less_compare less; + order_by_start(const schema& s) : less(s) {} + bool operator()(position_in_partition_view v, const range_tombstone& rt) const { return less(v, rt.position()); } + bool operator()(const range_tombstone& rt, position_in_partition_view v) const { return less(rt.position(), v); } + }; + return boost::make_iterator_range( + _tombstones.upper_bound(start, order_by_end{s}), // end_position() is exclusive, hence upper_bound() + _tombstones.lower_bound(end, order_by_start{s})); +} + range_tombstone_list::iterator range_tombstone_list::erase(const_iterator a, const_iterator b) { return _tombstones.erase_and_dispose(a, b, current_deleter()); } +void range_tombstone_list::trim(const schema& s, const query::clustering_row_ranges& ranges) { + range_tombstone_list list(s); + bound_view::compare less(s); + for (auto&& range : ranges) { + auto start = bound_view::from_range_start(range); + auto end = bound_view::from_range_end(range); + for (const range_tombstone& rt : slice(s, range)) { + list.apply(s, range_tombstone( + std::max(rt.start_bound(), start, less), + std::min(rt.end_bound(), end, less), + rt.tomb)); + } + } + *this = std::move(list); +} + range_tombstone_list::range_tombstones_type::iterator 
range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range_tombstone& new_rt) { _ops.emplace_back(insert_undo_op(new_rt)); @@ -368,3 +403,13 @@ void range_tombstone_list::update_undo_op::undo(const schema& s, range_tombstone assert (it != rt_list.end()); *it = std::move(_old_rt); } + +std::ostream& operator<<(std::ostream& out, const range_tombstone_list& list) { + return out << "{" << ::join(", ", list) << "}"; +} + +bool range_tombstone_list::equal(const schema& s, const range_tombstone_list& other) const { + return boost::equal(_tombstones, other._tombstones, [&s] (auto&& rt1, auto&& rt2) { + return rt1.equal(s, rt2); + }); +} diff --git a/range_tombstone_list.hh b/range_tombstone_list.hh index 8d27fabdbbd0..646e30eb8dd5 100644 --- a/range_tombstone_list.hh +++ b/range_tombstone_list.hh @@ -23,6 +23,8 @@ #include "range_tombstone.hh" #include "query-request.hh" +#include "position_in_partition.hh" +#include class range_tombstone_list final { using range_tombstones_type = range_tombstone::container_type; @@ -139,7 +141,12 @@ public: tombstone search_tombstone_covering(const schema& s, const clustering_key_prefix& key) const; // Returns range of tombstones which overlap with given range boost::iterator_range slice(const schema& s, const query::clustering_range&) const; + // Returns range tombstones which overlap with [start, end) + boost::iterator_range slice(const schema& s, position_in_partition_view start, position_in_partition_view end) const; iterator erase(const_iterator, const_iterator); + // Ensures that every range tombstone is strictly contained within given clustering ranges. + // Preserves all information which may be relevant for rows from that ranges. + void trim(const schema& s, const query::clustering_row_ranges&); range_tombstone_list difference(const schema& s, const range_tombstone_list& rt_list) const; // Erases the range tombstones for which filter returns true. template @@ -161,6 +168,9 @@ public: void apply(const schema& s, const range_tombstone_list& rt_list); // See reversibly_mergeable.hh reverter apply_reversibly(const schema& s, range_tombstone_list& rt_list); + + friend std::ostream& operator<<(std::ostream& out, const range_tombstone_list&); + bool equal(const schema&, const range_tombstone_list&) const; private: void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind, clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev); diff --git a/read_context.hh b/read_context.hh new file mode 100644 index 000000000000..91e9979b7493 --- /dev/null +++ b/read_context.hh @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */
+
+#pragma once
+
+#include "schema.hh"
+#include "query-request.hh"
+#include "streamed_mutation.hh"
+#include "partition_version.hh"
+#include "tracing/tracing.hh"
+#include "row_cache.hh"
+
+namespace cache {
+
+/*
+ * Represents a reader over the underlying mutation source.
+ * This reader automatically makes sure that it is up to date with all cache updates.
+ */
+class autoupdating_underlying_reader final {
+    row_cache& _cache;
+    read_context& _read_context;
+    stdx::optional<mutation_reader> _reader;
+    utils::phased_barrier::phase_type _reader_creation_phase;
+    dht::partition_range _range = { };
+    stdx::optional<dht::decorated_key> _last_key;
+    stdx::optional<dht::decorated_key> _new_last_key;
+public:
+    autoupdating_underlying_reader(row_cache& cache, read_context& context)
+        : _cache(cache)
+        , _read_context(context)
+    { }
+    // Reads the next partition without changing the mutation source snapshot.
+    future<streamed_mutation_opt> read_next_same_phase() {
+        _last_key = std::move(_new_last_key);
+        return (*_reader)().then([this] (auto&& smopt) {
+            if (smopt) {
+                _new_last_key = smopt->decorated_key();
+            }
+            return std::move(smopt);
+        });
+    }
+    future<streamed_mutation_opt> operator()() {
+        _last_key = std::move(_new_last_key);
+        auto start = population_range_start();
+        auto phase = _cache.phase_of(start);
+        if (!_reader || _reader_creation_phase != phase) {
+            if (_last_key) {
+                auto cmp = dht::ring_position_comparator(*_cache._schema);
+                auto&& new_range = _range.split_after(*_last_key, cmp);
+                if (!new_range) {
+                    return make_ready_future(streamed_mutation_opt());
+                }
+                _range = std::move(*new_range);
+                _last_key = {};
+            }
+            auto& snap = _cache.snapshot_for_phase(phase);
+            _reader = _cache.create_underlying_reader(_read_context, snap, _range);
+            _reader_creation_phase = phase;
+        }
+        return (*_reader)().then([this] (auto&& smopt) {
+            if (smopt) {
+                _new_last_key = smopt->decorated_key();
+            }
+            return std::move(smopt);
+        });
+    }
+    future<> fast_forward_to(dht::partition_range&& range) {
+        auto snapshot_and_phase = _cache.snapshot_of(dht::ring_position_view::for_range_start(_range));
+        return fast_forward_to(std::move(range), snapshot_and_phase.snapshot, snapshot_and_phase.phase);
+    }
+    future<> fast_forward_to(dht::partition_range&& range, mutation_source& snapshot, row_cache::phase_type phase) {
+        _range = std::move(range);
+        _last_key = { };
+        _new_last_key = { };
+        if (_reader && _reader_creation_phase == phase) {
+            return _reader->fast_forward_to(_range);
+        }
+        _reader = _cache.create_underlying_reader(_read_context, snapshot, _range);
+        _reader_creation_phase = phase;
+        return make_ready_future<>();
+    }
+    utils::phased_barrier::phase_type creation_phase() const {
+        assert(_reader);
+        return _reader_creation_phase;
+    }
+    const dht::partition_range& range() const {
+        return _range;
+    }
+    dht::ring_position_view population_range_start() const {
+        return _last_key ?
dht::ring_position_view::for_after_key(*_last_key) + : dht::ring_position_view::for_range_start(_range); + } +}; + +class read_context final : public enable_lw_shared_from_this { + row_cache& _cache; + schema_ptr _schema; + const dht::partition_range& _range; + const query::partition_slice& _slice; + const io_priority_class& _pc; + tracing::trace_state_ptr _trace_state; + streamed_mutation::forwarding _fwd; + mutation_reader::forwarding _fwd_mr; + bool _range_query; + autoupdating_underlying_reader _underlying; + + // When reader enters a partition, it must be set up for reading that + // partition from the underlying mutation source (_sm) in one of two ways: + // + // 1) either _underlying is already in that partition, then _sm is set to the + // stream obtained from it. + // + // 2) _underlying is before the partition, then _underlying_snapshot and _key + // are set so that _sm can be created on demand. + // + streamed_mutation_opt _sm; + mutation_source_opt _underlying_snapshot; + dht::partition_range _sm_range; + stdx::optional _key; + row_cache::phase_type _phase; +public: + read_context(row_cache& cache, + schema_ptr schema, + const dht::partition_range& range, + const query::partition_slice& slice, + const io_priority_class& pc, + tracing::trace_state_ptr trace_state, + streamed_mutation::forwarding fwd, + mutation_reader::forwarding fwd_mr) + : _cache(cache) + , _schema(std::move(schema)) + , _range(range) + , _slice(slice) + , _pc(pc) + , _trace_state(std::move(trace_state)) + , _fwd(fwd) + , _fwd_mr(fwd_mr) + , _range_query(!range.is_singular() || !range.start()->value().has_key()) + , _underlying(_cache, *this) + { } + read_context(const read_context&) = delete; + row_cache& cache() { return _cache; } + const schema_ptr& schema() const { return _schema; } + const dht::partition_range& range() const { return _range; } + const query::partition_slice& slice() const { return _slice; } + const io_priority_class& pc() const { return _pc; } + tracing::trace_state_ptr trace_state() const { return _trace_state; } + streamed_mutation::forwarding fwd() const { return _fwd; } + mutation_reader::forwarding fwd_mr() const { return _fwd_mr; } + bool is_range_query() const { return _range_query; } + autoupdating_underlying_reader& underlying() { return _underlying; } + row_cache::phase_type phase() const { return _phase; } + const dht::decorated_key& key() const { return _sm->decorated_key(); } +private: + future<> create_sm(); + future<> ensure_sm_created() { + if (_sm) { + return make_ready_future<>(); + } + return create_sm(); + } +public: + // Prepares the underlying streamed_mutation to represent dk in given snapshot. + // Partitions must be entered with strictly monotonic keys. + // The key must be after the current range of the underlying() reader. + // The phase argument must match the snapshot's phase. + void enter_partition(const dht::decorated_key& dk, mutation_source& snapshot, row_cache::phase_type phase) { + _phase = phase; + _sm = {}; + _underlying_snapshot = snapshot; + _key = dk; + } + // Prepares the underlying streamed_mutation to be sm. + // The phase argument must match the phase of the snapshot used to obtain sm. + void enter_partition(streamed_mutation&& sm, row_cache::phase_type phase) { + _phase = phase; + _sm = std::move(sm); + _underlying_snapshot = {}; + } + // Fast forwards the underlying streamed_mutation to given range. 
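+    // Creates the underlying streamed_mutation on demand if it hasn't been created yet.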
+ future<> fast_forward_to(position_range range) { + return ensure_sm_created().then([this, range = std::move(range)] () mutable { + return _sm->fast_forward_to(std::move(range)); + }); + } + // Gets the next fragment from the underlying streamed_mutation + future get_next_fragment() { + return ensure_sm_created().then([this] { + return (*_sm)(); + }); + } +}; + +} diff --git a/row_cache.cc b/row_cache.cc index ff9d82cdb373..ebfee8efcf7f 100644 --- a/row_cache.cc +++ b/row_cache.cc @@ -32,29 +32,21 @@ #include #include #include "stdx.hh" +#include "cache_streamed_mutation.hh" +#include "read_context.hh" +#include "schema_upgrader.hh" using namespace std::chrono_literals; +using namespace cache; static logging::logger clogger("cache"); thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduling_group(1ms, 0.2); -enum class is_wide_partition { yes, no }; - -future -try_to_read(uint64_t max_cached_partition_size_in_bytes, streamed_mutation_opt&& sm) { - if (!sm) { - return make_ready_future(is_wide_partition::no, mutation_opt()); - } - return mutation_from_streamed_mutation_with_limit(std::move(*sm), max_cached_partition_size_in_bytes).then( - [] (mutation_opt&& omo) mutable { - if (omo) { - return make_ready_future(is_wide_partition::no, std::move(omo)); - } else { - return make_ready_future(is_wide_partition::yes, mutation_opt()); - } - }); +mutation_reader +row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) { + return src(_schema, pr, ctx.slice(), ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::yes); } cache_tracker& global_cache_tracker() { @@ -77,19 +69,10 @@ cache_tracker::cache_tracker() { clear_continuity(*std::next(it)); lru.pop_back_and_dispose(current_deleter()); }; - if (!_wide_partition_lru.empty() && (_normal_eviction_count == 0 || _lru.empty())) { - evict_last(_wide_partition_lru); - _normal_eviction_count = _normal_large_eviction_ratio; - ++_stats.wide_partition_evictions; - } else { - if (_lru.empty()) { - return memory::reclaiming_result::reclaimed_nothing; - } - evict_last(_lru); - if (_normal_eviction_count > 0) { - --_normal_eviction_count; - } + if (_lru.empty()) { + return memory::reclaiming_result::reclaimed_nothing; } + evict_last(_lru); --_stats.partitions; ++_stats.evictions; ++_stats.modification_count; @@ -117,14 +100,12 @@ cache_tracker::setup_metrics() { sm::make_gauge("bytes_total", sm::description("total size of memory for the cache"), [this] { return _region.occupancy().total_space(); }), sm::make_derive("total_operations_hits", sm::description("total number of operation hits"), _stats.hits), sm::make_derive("total_operations_misses", sm::description("total number of operation misses"), _stats.misses), - sm::make_derive("total_operations_uncached_wide_partitions", sm::description("total number of operation of uncached wide partitions"), _stats.uncached_wide_partitions), sm::make_derive("total_operations_insertions", sm::description("total number of operation insert"), _stats.insertions), sm::make_derive("total_operations_concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key), sm::make_derive("total_operations_merges", sm::description("total number of operation merged"), _stats.merges), sm::make_derive("total_operations_evictions", sm::description("total number of operation eviction"), _stats.evictions), - sm::make_derive("total_operations_wide_partition_evictions", sm::description("total 
number of operation wide partition eviction"), _stats.wide_partition_evictions), - sm::make_derive("total_operations_wide_partition_mispopulations", sm::description("total number of operation wide partition mispopulations"), _stats.wide_partition_mispopulations), sm::make_derive("total_operations_removals", sm::description("total number of operation removals"), _stats.removals), + sm::make_derive("total_operations_mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations), sm::make_gauge("objects_partitions", sm::description("total number of partition objects"), _stats.partitions) }); } @@ -145,7 +126,6 @@ void cache_tracker::clear() { } }; clear(_lru); - clear(_wide_partition_lru); }); _stats.removals += _stats.partitions; _stats.partitions = 0; @@ -157,26 +137,14 @@ void cache_tracker::touch(cache_entry& e) { lru.erase(lru.iterator_to(e)); lru.push_front(e); }; - move_to_front(e.wide_partition() ? _wide_partition_lru : _lru, e); + move_to_front(_lru, e); } void cache_tracker::insert(cache_entry& entry) { ++_stats.insertions; ++_stats.partitions; ++_stats.modification_count; - if (entry.wide_partition()) { - _wide_partition_lru.push_front(entry); - } else { - _lru.push_front(entry); - } -} - -void cache_tracker::mark_wide(cache_entry& entry) { - if (entry._lru_link.is_linked()) { - entry._lru_link.unlink(); - } - entry.set_wide_partition(); - _wide_partition_lru.push_front(entry); + _lru.push_front(entry); } void cache_tracker::on_erase() { @@ -197,16 +165,12 @@ void cache_tracker::on_miss() { ++_stats.misses; } -void cache_tracker::on_miss_already_populated() { - ++_stats.concurrent_misses_same_key; -} - -void cache_tracker::on_uncached_wide_partition() { - ++_stats.uncached_wide_partitions; +void cache_tracker::on_mispopulate() { + ++_stats.mispopulations; } -void cache_tracker::on_wide_partition_mispopulation() { - ++_stats.wide_partition_mispopulations; +void cache_tracker::on_miss_already_populated() { + ++_stats.concurrent_misses_same_key; } allocation_strategy& cache_tracker::allocator() { @@ -221,66 +185,161 @@ const logalloc::region& cache_tracker::region() const { return _region; } +// Stable cursor over partition entries from given range. +// +// Must be accessed with reclaim lock held on the cache region. +// The position of the cursor is always valid, but cache entry reference +// is not always valid. It remains valid as long as the iterators +// into _cache._partitions remain valid. Cache entry reference can be +// brought back to validity by calling refresh(). +// +class partition_range_cursor final { + std::reference_wrapper _cache; + row_cache::partitions_type::iterator _it; + row_cache::partitions_type::iterator _end; + dht::ring_position_view _start_pos; + dht::ring_position_view _end_pos; + stdx::optional _last; + uint64_t _last_reclaim_count; + size_t _last_modification_count; +private: + void set_position(cache_entry& e) { + // FIXME: make ring_position_view convertible to ring_position, so we can use e.position() + if (e.is_dummy_entry()) { + _last = {}; + _start_pos = dht::ring_position_view::max(); + } else { + _last = e.key(); + _start_pos = dht::ring_position_view(*_last); + } + } +public: + // Creates a cursor positioned at the lower bound of the range. + // The cache entry reference is not valid. + // The range reference must remain live as long as this instance is used. 
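+    // A minimal usage sketch (the caller must hold the cache's reclaim lock; process() is a
+    // hypothetical consumer, not part of this interface):
+    //
+    //   partition_range_cursor cur(cache, range);
+    //   cur.refresh();                 // make the cache entry reference valid
+    //   while (cur.in_range()) {
+    //       process(cur.entry());
+    //       cur.next();
+    //   }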
+ partition_range_cursor(row_cache& cache, const dht::partition_range& range) + : _cache(cache) + , _start_pos(dht::ring_position_view::for_range_start(range)) + , _end_pos(dht::ring_position_view::for_range_end(range)) + , _last_reclaim_count(std::numeric_limits::max()) + , _last_modification_count(std::numeric_limits::max()) + { } + + // Ensures that cache entry reference is valid. + // The cursor will point at the first entry with position >= the current position. + // Returns true if and only if the position of the cursor changed. + // Strong exception guarantees. + bool refresh() { + auto reclaim_count = _cache.get().get_cache_tracker().region().reclaim_counter(); + auto modification_count = _cache.get().get_cache_tracker().modification_count(); + + if (reclaim_count == _last_reclaim_count && modification_count == _last_modification_count) { + return true; + } + + auto cmp = cache_entry::compare(_cache.get()._schema); + if (cmp(_end_pos, _start_pos)) { // next() may have moved _start_pos past the _end_pos. + _end_pos = _start_pos; + } + _end = _cache.get()._partitions.lower_bound(_end_pos, cmp); + _it = _cache.get()._partitions.lower_bound(_start_pos, cmp); + auto same = !cmp(_start_pos, _it->position()); + set_position(*_it); + _last_reclaim_count = reclaim_count; + _last_modification_count = modification_count; + return same; + } + + // Positions the cursor at the next entry. + // May advance past the requested range. Use in_range() after the call to determine that. + // Call only when in_range() and cache entry reference is valid. + // Strong exception guarantees. + void next() { + auto next = std::next(_it); + set_position(*next); + _it = std::move(next); + } + + // Valid only after refresh() and before _cache._partitions iterators are invalidated. + // Points inside the requested range if in_range(). + cache_entry& entry() { + return *_it; + } + + // Call only when cache entry reference is valid. + bool in_range() { + return _it != _end; + } + + // Returns current position of the cursor. + // Result valid as long as this instance is valid and not advanced. + dht::ring_position_view position() const { + return _start_pos; + } +}; + +future<> read_context::create_sm() { + if (_range_query) { + // FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range + // here in case the same reader will need to be fast forwarded later. + _sm_range = dht::partition_range({dht::ring_position(*_key)}, {dht::ring_position(*_key)}); + } else { + _sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)}); + } + return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase).then([this] { + return _underlying.read_next_same_phase().then([this] (auto&& smo) { + if (!smo) { + _sm = make_empty_streamed_mutation(_cache.schema(), *_key, streamed_mutation::forwarding::yes); + } else { + _sm = std::move(*smo); + } + }); + }); +} + +static streamed_mutation read_directly_from_underlying(streamed_mutation&& sm, read_context& reader) { + if (reader.schema()->version() != sm.schema()->version()) { + sm = transform(std::move(sm), schema_upgrader(reader.schema())); + } + if (reader.fwd() == streamed_mutation::forwarding::no) { + sm = streamed_mutation_from_forwarding_streamed_mutation(std::move(sm)); + } + return std::move(sm); +} + // Reader which populates the cache using data from the delegate. 
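+// On a cache miss the partition is read from the underlying mutation source; if the cache's
+// population phase is still the same when the read completes, the result is also used to
+// populate the cache entry, otherwise it is returned directly and counted as a mispopulation.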
class single_partition_populating_reader final : public mutation_reader::impl { - schema_ptr _schema; row_cache& _cache; - mutation_source& _underlying; mutation_reader _delegate; - const io_priority_class _pc; - const query::partition_slice& _slice; - dht::partition_range _large_partition_range; - mutation_reader _large_partition_reader; - tracing::trace_state_ptr _trace_state; - streamed_mutation::forwarding _fwd; + lw_shared_ptr _read_context; public: - single_partition_populating_reader(schema_ptr s, - row_cache& cache, - mutation_source& underlying, - mutation_reader delegate, - const io_priority_class pc, - const query::partition_slice& slice, - tracing::trace_state_ptr trace_state, - streamed_mutation::forwarding fwd) - : _schema(std::move(s)) - , _cache(cache) - , _underlying(underlying) - , _delegate(std::move(delegate)) - , _pc(pc) - , _slice(slice) - , _trace_state(std::move(trace_state)) - , _fwd(fwd) + single_partition_populating_reader(row_cache& cache, + lw_shared_ptr context) + : _cache(cache) + , _read_context(std::move(context)) { } virtual future operator()() override { - auto op = _cache._populate_phaser.start(); - return _delegate().then([this, op = std::move(op)] (auto sm) mutable { + if (!_read_context) { + return make_ready_future(streamed_mutation_opt()); + } + auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value()); + auto phase = src_and_phase.phase; + _delegate = _cache.create_underlying_reader(*_read_context, src_and_phase.snapshot, _read_context->range()); + return _delegate().then([this, phase] (auto sm) mutable -> streamed_mutation_opt { + auto ctx = std::move(_read_context); if (!sm) { - return make_ready_future(streamed_mutation_opt()); + return std::move(sm); } - dht::decorated_key dk = sm->decorated_key(); - return try_to_read(_cache._max_cached_partition_size_in_bytes, std::move(sm)).then( - [this, op = std::move(op), dk = std::move(dk)] - (is_wide_partition wide_partition, mutation_opt&& mo) { - if (wide_partition == is_wide_partition::no) { - if (mo) { - _cache.populate(*mo); - mo->upgrade(_schema); - auto ck_ranges = query::clustering_key_filter_ranges::get_ranges(*_schema, _slice, mo->key()); - auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), std::move(ck_ranges)); - mo->partition() = std::move(filtered_partition); - return make_ready_future(streamed_mutation_from_mutation(std::move(*mo), _fwd)); - } - return make_ready_future(streamed_mutation_opt()); - } else { - _cache.on_uncached_wide_partition(); - _cache._tracker.on_wide_partition_mispopulation(); - _cache.mark_partition_as_wide(dk); - _large_partition_range = dht::partition_range::make_singular(std::move(dk)); - _large_partition_reader = _underlying(_schema, _large_partition_range, _slice, _pc, _trace_state, _fwd); - return _large_partition_reader(); - } + if (phase == _cache.phase_of(ctx->range().start()->value())) { + return _cache._read_section(_cache._tracker.region(), [&] { + cache_entry& e = _cache.find_or_create(sm->decorated_key(), sm->partition_tombstone(), phase); + return e.read(_cache, *ctx, std::move(*sm), phase); }); + } else { + _cache._tracker.on_mispopulate(); + return read_directly_from_underlying(std::move(*sm), *ctx); + } }); } }; @@ -299,176 +358,31 @@ void row_cache::on_miss() { _tracker.on_miss(); } -void row_cache::on_uncached_wide_partition() { - _tracker.on_uncached_wide_partition(); -} - -class just_cache_scanning_reader final { - schema_ptr _schema; - row_cache& _cache; - 
row_cache::partitions_type::iterator _it; - row_cache::partitions_type::iterator _end; - const dht::partition_range* _range; - stdx::optional _last; - uint64_t _last_reclaim_count; - size_t _last_modification_count; - const query::partition_slice& _slice; - const io_priority_class _pc; - streamed_mutation::forwarding _fwd; -private: - void update_iterators() { - auto cmp = cache_entry::compare(_cache._schema); - auto update_end = [&] { - if (_range->end()) { - if (_range->end()->is_inclusive()) { - _end = _cache._partitions.upper_bound(_range->end()->value(), cmp); - } else { - _end = _cache._partitions.lower_bound(_range->end()->value(), cmp); - } - } else { - _end = _cache.partitions_end(); - } - }; - - auto reclaim_count = _cache.get_cache_tracker().region().reclaim_counter(); - auto modification_count = _cache.get_cache_tracker().modification_count(); - if (!_last) { - if (_range->start()) { - if (_range->start()->is_inclusive()) { - _it = _cache._partitions.lower_bound(_range->start()->value(), cmp); - } else { - _it = _cache._partitions.upper_bound(_range->start()->value(), cmp); - } - } else { - _it = _cache._partitions.begin(); - } - update_end(); - } else if (reclaim_count != _last_reclaim_count || modification_count != _last_modification_count) { - _it = _cache._partitions.upper_bound(*_last, cmp); - update_end(); - } - _last_reclaim_count = reclaim_count; - _last_modification_count = modification_count; - } -public: - struct cache_data { - streamed_mutation_opt mut; - bool continuous; - }; - just_cache_scanning_reader(schema_ptr s, - row_cache& cache, - const dht::partition_range& range, - const query::partition_slice& slice, - const io_priority_class& pc, - streamed_mutation::forwarding fwd) - : _schema(std::move(s)), _cache(cache), _range(&range), _slice(slice), _pc(pc), _fwd(fwd) - { } - future operator()() { - return _cache._read_section(_cache._tracker.region(), [this] { - return with_linearized_managed_bytes([&] { - update_iterators(); - if (_it == _end) { - return make_ready_future(cache_data { {}, _it->continuous() }); - } - cache_entry& ce = *_it; - ++_it; - _last = ce.key(); - _cache.upgrade_entry(ce); - _cache._tracker.touch(ce); - _cache.on_hit(); - cache_data cd { { }, ce.continuous() }; - if (ce.wide_partition()) { - return ce.read_wide(_cache, _schema, _slice, _pc, _fwd).then([this, cd = std::move(cd)] (auto smopt) mutable { - if (smopt) { - cd.mut = std::move(*smopt); - } else { - cd.mut = streamed_mutation_from_mutation(mutation(*_last, _schema), _fwd); - } - return std::move(cd); - }); - } - cd.mut = ce.read(_cache, _schema, _slice, _fwd); - return make_ready_future(std::move(cd)); - }); - }); - } - future<> fast_forward_to(const dht::partition_range& pr) { - _last = {}; - _range = ≺ - return make_ready_future<>(); - } -}; - class range_populating_reader { row_cache& _cache; - schema_ptr _schema; - dht::partition_range _range; - const query::partition_slice& _slice; - utils::phased_barrier::phase_type _populate_phase; - const io_priority_class& _pc; - tracing::trace_state_ptr _trace_state; - mutation_reader _reader; - bool _reader_created = false; - row_cache::previous_entry_pointer _last_key; - dht::partition_range _large_partition_range; - mutation_reader _large_partition_reader; - streamed_mutation::forwarding _fwd; + autoupdating_underlying_reader& _reader; + stdx::optional _last_key; + read_context& _read_context; private: - void update_reader() { - // TODO: allow updating sstables without fully recreating the reader - if (_populate_phase != 
_cache._populate_phaser.phase()) { - _populate_phase = _cache._populate_phaser.phase(); - if (_last_key._key) { - auto cmp = dht::ring_position_comparator(*_schema); - auto&& new_range = _range.split_after(*_last_key._key, cmp); - if (new_range) { - _range = std::move(new_range).value(); - } else { - _reader = make_empty_reader(); - _reader_created = false; - return; - } - } - _reader = _cache._underlying(_cache._schema, _range, query::full_slice, _pc, _trace_state); - } - } - - future handle_large_partition(dht::decorated_key&& dk) { - _cache.on_uncached_wide_partition(); - _cache._tracker.on_wide_partition_mispopulation(); - _cache.mark_partition_as_wide(dk, &_last_key); - _last_key.reset(dk, _populate_phase); - - _large_partition_range = dht::partition_range::make_singular(dk); - _large_partition_reader = _cache._underlying(_schema, _large_partition_range, _slice, _pc, _trace_state, _fwd); - return _large_partition_reader().then([this, dk = std::move(dk)] (auto smopt) mutable -> streamed_mutation_opt { - _large_partition_reader = {}; - if (!smopt) { - // We cannot emit disengaged optional since this is a part of range - // read and it would incorrectly interpreted as end of stream. - // Produce empty mutation instead. - return streamed_mutation_from_mutation(mutation(std::move(dk), _schema)); - } - return smopt; - }); + bool can_set_continuity() const { + return _last_key && _reader.creation_phase() == _cache.phase_of(_reader.population_range_start()); } - void handle_end_of_stream() { - if (_last_key._populate_phase != _populate_phase) { + if (!can_set_continuity()) { return; } - if (!_range.end() || !_range.end()->is_inclusive()) { + if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) { cache_entry::compare cmp(_cache._schema); - auto it = _range.end() ? _cache._partitions.find(_range.end()->value(), cmp) - : std::prev(_cache._partitions.end()); + auto it = _reader.range().end() ? _cache._partitions.find(_reader.range().end()->value(), cmp) + : std::prev(_cache._partitions.end()); if (it != _cache._partitions.end()) { if (it == _cache._partitions.begin()) { - if (!_last_key._key) { + if (!_last_key->_key) { it->set_continuous(true); } } else { auto prev = std::prev(it); - if (prev->key().equal(*_cache._schema, *_last_key._key)) { + if (prev->key().equal(*_cache._schema, *_last_key->_key)) { it->set_continuous(true); } } @@ -476,167 +390,142 @@ class range_populating_reader { } } public: - range_populating_reader( - row_cache& cache, - schema_ptr schema, - const query::partition_slice& slice, - const io_priority_class& pc, - tracing::trace_state_ptr trace_state, - streamed_mutation::forwarding fwd) + range_populating_reader(row_cache& cache, read_context& ctx) : _cache(cache) - , _schema(std::move(schema)) - , _slice(slice) - , _pc(pc) - , _trace_state(std::move(trace_state)) - , _fwd(fwd) + , _reader(ctx.underlying()) + , _read_context(ctx) {} future operator()() { - update_reader(); - return _reader().then([this, op = _cache._populate_phaser.start()] (streamed_mutation_opt smopt) mutable { - dht::decorated_key dk = smopt ? 
smopt->decorated_key() : dht::decorated_key{ {}, partition_key::make_empty() }; - return try_to_read(_cache._max_cached_partition_size_in_bytes, std::move(smopt)).then( - [this, op = std::move(op), dk = std::move(dk)] (is_wide_partition is_wide, mutation_opt&& mo) mutable { - if (is_wide == is_wide_partition::yes) { - _cache.on_miss(); - return handle_large_partition(std::move(dk)); - } - - if (!mo) { + return _reader().then([this] (streamed_mutation_opt smopt) mutable -> streamed_mutation_opt { + { + if (!smopt) { handle_end_of_stream(); - return make_ready_future(); + return std::move(smopt); } - _cache.on_miss(); - _cache.populate(*mo, &_last_key); - _last_key.reset(mo->decorated_key(), _populate_phase); - - mo->upgrade(_schema); - auto ck_ranges = query::clustering_key_filter_ranges::get_ranges(*_schema, _slice, mo->key()); - auto filtered_partition = mutation_partition(std::move(mo->partition()), *mo->schema(), std::move(ck_ranges)); - mo->partition() = std::move(filtered_partition); - return make_ready_future(streamed_mutation_from_mutation(std::move(*mo), _fwd)); - }); + if (_reader.creation_phase() == _cache.phase_of(smopt->decorated_key())) { + return _cache._read_section(_cache._tracker.region(), [&] { + cache_entry& e = _cache.find_or_create(smopt->decorated_key(), smopt->partition_tombstone(), _reader.creation_phase(), + can_set_continuity() ? &*_last_key : nullptr); + _last_key = smopt->decorated_key(); + return e.read(_cache, _read_context, std::move(*smopt), _reader.creation_phase()); + }); + } else { + _cache._tracker.on_mispopulate(); + _last_key = smopt->decorated_key(); + return read_directly_from_underlying(std::move(*smopt), _read_context); + } + } }); } - future<> fast_forward_to(const dht::partition_range& pr) { - _range = pr; - - auto phase = _cache._populate_phaser.phase(); - if (!_range.start()) { - _last_key.reset({ }, phase); - } else if (!_range.start()->is_inclusive() && _range.start()->value().has_key()) { - _last_key.reset(_range.start()->value().as_decorated_key(), phase); + future<> fast_forward_to(dht::partition_range&& pr) { + if (!pr.start()) { + _last_key = row_cache::previous_entry_pointer(); + } else if (!pr.start()->is_inclusive() && pr.start()->value().has_key()) { + _last_key = pr.start()->value().as_decorated_key(); } else { // Inclusive start bound, cannot set continuity flag. 
- _last_key.reset(stdx::nullopt, phase - 1); + _last_key = {}; } - if (!_reader_created || phase != _populate_phase) { - _populate_phase = _cache._populate_phaser.phase(); - _reader = _cache._underlying(_cache._schema, _range, query::full_slice, _pc, _trace_state); - _reader_created = true; - return make_ready_future(); - } - return _reader.fast_forward_to(_range); + return _reader.fast_forward_to(std::move(pr)); } }; class scanning_and_populating_reader final : public mutation_reader::impl { const dht::partition_range* _pr; - schema_ptr _schema; - dht::partition_range _secondary_range; - - just_cache_scanning_reader _primary_reader; + row_cache& _cache; + lw_shared_ptr _read_context; + partition_range_cursor _primary; range_populating_reader _secondary_reader; - streamed_mutation::forwarding _fwd; - mutation_reader::forwarding _fwd_mr; - streamed_mutation_opt _next_primary; bool _secondary_in_progress = false; - bool _first_element = true; - stdx::optional _last_key; + bool _advance_primary = false; + stdx::optional _lower_bound; + dht::partition_range _secondary_range; private: - void update_last_key(const streamed_mutation_opt& smopt) { - if (smopt) { - _last_key = smopt->decorated_key(); - } + streamed_mutation read_from_entry(cache_entry& ce) { + _cache.upgrade_entry(ce); + _cache._tracker.touch(ce); + _cache.on_hit(); + return ce.read(_cache, *_read_context); } - bool is_inclusive_start_bound(const dht::decorated_key& dk) { - if (!_first_element) { - return false; - } - return _pr->start() && _pr->start()->is_inclusive() && _pr->start()->value().equal(*_schema, dk); - } + streamed_mutation_opt do_read_from_primary() { + return _cache._read_section(_cache._tracker.region(), [this] { + return with_linearized_managed_bytes([&] () -> streamed_mutation_opt { + auto not_moved = _primary.refresh(); - future read_from_primary() { - return _primary_reader().then([this] (just_cache_scanning_reader::cache_data cd) { - auto& smopt = cd.mut; - if (cd.continuous || (smopt && is_inclusive_start_bound(smopt->decorated_key()))) { - _first_element = false; - update_last_key(smopt); - return make_ready_future(std::move(smopt)); - } else { - _next_primary = std::move(smopt); + if (_advance_primary && not_moved) { + _primary.next(); + not_moved = false; + } + _advance_primary = false; - if (!_next_primary) { - if (!_last_key) { - _secondary_range = *_pr; - } else { - dht::ring_position_comparator cmp(*_schema); - auto&& new_range = _pr->split_after(*_last_key, cmp); - if (!new_range) { - return make_ready_future(); - } - _secondary_range = std::move(*new_range); + if (not_moved || _primary.entry().continuous()) { + if (!_primary.in_range()) { + return stdx::nullopt; } + cache_entry& e = _primary.entry(); + auto sm = read_from_entry(e); + _lower_bound = {e.key(), false}; + // Delay the call to next() so that we don't see stale continuity on next invocation. + _advance_primary = true; + return streamed_mutation_opt(std::move(sm)); } else { - if (_last_key) { - _secondary_range = dht::partition_range::make({ *_last_key, false }, { _next_primary->decorated_key(), false }); + if (_primary.in_range()) { + cache_entry& e = _primary.entry(); + _secondary_range = dht::partition_range(_lower_bound ? 
std::move(_lower_bound) : _pr->start(), + dht::partition_range::bound{e.key(), false}); + _lower_bound = {e.key(), true}; + _secondary_in_progress = true; + return stdx::nullopt; } else { - if (!_pr->start()) { - _secondary_range = dht::partition_range::make_ending_with({ _next_primary->decorated_key(), false }); - } else { - _secondary_range = dht::partition_range::make(*_pr->start(), { _next_primary->decorated_key(), false }); + dht::ring_position_comparator cmp(*_read_context->schema()); + auto range = _pr->trim_front(std::move(_lower_bound), cmp); + if (!range) { + return stdx::nullopt; } + _lower_bound = {dht::ring_position::max()}; + _secondary_range = std::move(*range); + _secondary_in_progress = true; + return stdx::nullopt; } } + }); + }); + } - _secondary_in_progress = true; - return _secondary_reader.fast_forward_to(_secondary_range).then([this] { - return read_from_secondary(); - }); - } + future read_from_primary() { + auto smo = do_read_from_primary(); + if (!_secondary_in_progress) { + return make_ready_future(std::move(smo)); + } + return _secondary_reader.fast_forward_to(std::move(_secondary_range)).then([this] { + return read_from_secondary(); }); } future read_from_secondary() { return _secondary_reader().then([this] (streamed_mutation_opt smopt) { if (smopt) { - return smopt; + return make_ready_future(std::move(smopt)); } else { _secondary_in_progress = false; - update_last_key(_next_primary); - return std::move(_next_primary); + return read_from_primary(); } }); } public: - scanning_and_populating_reader(schema_ptr s, - row_cache& cache, - const dht::partition_range& range, - const query::partition_slice& slice, - const io_priority_class& pc, - tracing::trace_state_ptr trace_state, - streamed_mutation::forwarding fwd, - mutation_reader::forwarding fwd_mr) + scanning_and_populating_reader(row_cache& cache, + const dht::partition_range& range, + lw_shared_ptr context) : _pr(&range) - , _schema(s) - , _primary_reader(s, cache, range, slice, pc, fwd) - , _secondary_reader(cache, s, slice, pc, trace_state, fwd) - , _fwd(fwd) - , _fwd_mr(fwd_mr) + , _cache(cache) + , _read_context(std::move(context)) + , _primary(cache, range) + , _secondary_reader(cache, *_read_context) { } future operator()() { @@ -649,21 +538,17 @@ class scanning_and_populating_reader final : public mutation_reader::impl { future<> fast_forward_to(const dht::partition_range& pr) { _secondary_in_progress = false; - _first_element = true; + _advance_primary = false; _pr = ≺ - return _primary_reader.fast_forward_to(pr); + _primary = partition_range_cursor{_cache, pr}; + _lower_bound = {}; + return make_ready_future<>(); } }; mutation_reader -row_cache::make_scanning_reader(schema_ptr s, - const dht::partition_range& range, - const io_priority_class& pc, - const query::partition_slice& slice, - tracing::trace_state_ptr trace_state, - streamed_mutation::forwarding fwd, - mutation_reader::forwarding fwd_mr) { - return make_mutation_reader(std::move(s), *this, range, slice, pc, std::move(trace_state), fwd, fwd_mr); +row_cache::make_scanning_reader(const dht::partition_range& range, lw_shared_ptr context) { + return make_mutation_reader(*this, range, std::move(context)); } mutation_reader @@ -673,43 +558,29 @@ row_cache::make_reader(schema_ptr s, const io_priority_class& pc, tracing::trace_state_ptr trace_state, streamed_mutation::forwarding fwd, - mutation_reader::forwarding fwd_mr) { - if (range.is_singular()) { - const query::ring_position& pos = range.start()->value(); - - if (!pos.has_key()) { - return 
make_scanning_reader(std::move(s), range, pc, slice, std::move(trace_state), fwd, fwd_mr); - } + mutation_reader::forwarding fwd_mr) +{ + auto ctx = make_lw_shared(*this, std::move(s), range, slice, pc, trace_state, fwd, fwd_mr); + if (!ctx->is_range_query()) { return _read_section(_tracker.region(), [&] { return with_linearized_managed_bytes([&] { - const dht::decorated_key& dk = pos.as_decorated_key(); - auto i = _partitions.find(dk, cache_entry::compare(_schema)); + auto i = _partitions.find(ctx->range().start()->value(), cache_entry::compare(_schema)); if (i != _partitions.end()) { cache_entry& e = *i; _tracker.touch(e); upgrade_entry(e); - mutation_reader reader; - if (e.wide_partition()) { - reader = _underlying(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr); - _tracker.on_uncached_wide_partition(); - on_miss(); - } else { - reader = make_reader_returning(e.read(*this, s, slice, fwd)); - on_hit(); - } - return reader; + on_hit(); + return make_reader_returning(e.read(*this, *ctx)); } else { - auto reader = make_mutation_reader(s, *this, _underlying, - _underlying(_schema, range, query::full_slice, pc, trace_state), pc, slice, trace_state, fwd); on_miss(); - return reader; + return make_mutation_reader(*this, std::move(ctx)); } }); }); } - return make_scanning_reader(std::move(s), range, pc, slice, std::move(trace_state), fwd, fwd_mr); + return make_scanning_reader(range, std::move(ctx)); } row_cache::~row_cache() { @@ -738,12 +609,11 @@ template // { create(it) } -> row_cache::partitions_type::iterator; // { visit(it) } -> void; // } -void row_cache::do_find_or_create_entry(const dht::decorated_key& key, +cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key, const previous_entry_pointer* previous, CreateEntry&& create_entry, VisitEntry&& visit_entry) { - with_allocator(_tracker.allocator(), [&] { - _populate_section(_tracker.region(), [&] { - with_linearized_managed_bytes([&] { + return with_allocator(_tracker.allocator(), [&] () -> cache_entry& { + return with_linearized_managed_bytes([&] () -> cache_entry& { auto i = _partitions.lower_bound(key, cache_entry::compare(_schema)); if (i == _partitions.end() || !i->key().equal(*_schema, key)) { i = create_entry(i); @@ -751,8 +621,8 @@ void row_cache::do_find_or_create_entry(const dht::decorated_key& key, visit_entry(i); } - if (!previous || previous->_populate_phase != _populate_phaser.phase()) { - return; + if (!previous) { + return *i; } if ((!previous->_key && i == _partitions.begin()) @@ -760,23 +630,27 @@ void row_cache::do_find_or_create_entry(const dht::decorated_key& key, && std::prev(i)->key().equal(*_schema, *previous->_key))) { i->set_continuous(true); } + + return *i; }); - }); }); } -void row_cache::mark_partition_as_wide(const dht::decorated_key& key, const previous_entry_pointer* previous) { - do_find_or_create_entry(key, previous, [&] (auto i) { - cache_entry* entry = current_allocator().construct( - _schema, key, cache_entry::wide_partition_tag{}); +cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone t, row_cache::phase_type phase, const previous_entry_pointer* previous) { + return do_find_or_create_entry(key, previous, [&] (auto i) { // create + auto entry = current_allocator().construct(cache_entry::incomplete_tag{}, _schema, key, t); _tracker.insert(*entry); return _partitions.insert(i, *entry); - }, [&] (auto i) { - _tracker.mark_wide(*i); + }, [&] (auto i) { // visit + cache_entry& e = *i; + e.partition().open_version(*e.schema(), 
phase).partition().apply(t); + _tracker.touch(e); + upgrade_entry(e); }); } void row_cache::populate(const mutation& m, const previous_entry_pointer* previous) { + _populate_section(_tracker.region(), [&] { do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i) { cache_entry* entry = current_allocator().construct( m.schema(), m.decorated_key(), m.partition()); @@ -784,24 +658,46 @@ void row_cache::populate(const mutation& m, const previous_entry_pointer* previo _tracker.insert(*entry); return _partitions.insert(i, *entry); }, [&] (auto i) { - _tracker.touch(*i); - // We cache whole partitions right now, so if cache already has this partition, - // it must be complete, so do nothing. - _tracker.on_miss_already_populated(); // #1534 + throw std::runtime_error(sprint("cache already contains entry for {}", m.key())); }); + }); +} + +mutation_source& row_cache::snapshot_for_phase(phase_type phase) { + if (phase == _underlying_phase) { + return _underlying; + } else { + if (phase + 1 < _underlying_phase) { + throw std::runtime_error(sprint("attempted to read from retired phase {} (current={})", phase, _underlying_phase)); + } + return *_prev_snapshot; + } } -future<> row_cache::clear() { - return invalidate(query::full_partition_range); +row_cache::snapshot_and_phase row_cache::snapshot_of(dht::ring_position_view pos) { + dht::ring_position_less_comparator less(*_schema); + if (!_prev_snapshot_pos || less(pos, *_prev_snapshot_pos)) { + return {_underlying, _underlying_phase}; + } + return {*_prev_snapshot, _underlying_phase - 1}; } -future<> row_cache::update(memtable& m, partition_presence_checker presence_checker) { +row_cache::phase_type row_cache::phase_of(dht::ring_position_view pos) { + dht::ring_position_less_comparator less(*_schema); + if (!_prev_snapshot_pos || less(pos, *_prev_snapshot_pos)) { + return _underlying_phase; + } + return _underlying_phase - 1; +} + +template +future<> row_cache::do_update(memtable& m, Updater updater) { m.on_detach_from_region_group(); _tracker.region().merge(m); // Now all data in memtable belongs to cache auto attr = seastar::thread_attributes(); attr.scheduling_group = &_update_thread_scheduling_group; STAP_PROBE(scylla, row_cache_update_start); - auto t = seastar::thread(attr, [this, &m, presence_checker = std::move(presence_checker)] { + auto t = seastar::thread(attr, [this, &m, updater = std::move(updater)] () mutable { auto cleanup = defer([&] { with_allocator(_tracker.allocator(), [&m, this] () { logalloc::reclaim_lock _(_tracker.region()); @@ -825,9 +721,16 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec } }); }); - _populate_phaser.advance_and_await().get(); + auto permit = get_units(_update_sem, 1).get0(); + ++_underlying_phase; + _prev_snapshot = std::exchange(_underlying, _snapshot_source()); + _prev_snapshot_pos = dht::ring_position::min(); + auto cleanup_prev_snapshot = defer([this] { + _prev_snapshot_pos = {}; + _prev_snapshot = {}; + }); while (!m.partitions.empty()) { - with_allocator(_tracker.allocator(), [this, &m, &presence_checker] () { + with_allocator(_tracker.allocator(), [this, &m, &updater] () { unsigned quota = 30; auto cmp = cache_entry::compare(_schema); { @@ -845,26 +748,7 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec memtable_entry& mem_e = *i; // FIXME: Optimize knowing we lookup in-order. 
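+                        // The updater receives the lower-bound iterator (cache_i) and the
+                        // memtable entry. Depending on the policy passed to do_update() it
+                        // merges the entry into an existing cache entry, inserts a new one
+                        // before cache_i, or only clears continuity (see update() and
+                        // update_invalidating() below).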
auto cache_i = _partitions.lower_bound(mem_e.key(), cmp); - // If cache doesn't contain the entry we cannot insert it because the mutation may be incomplete. - // FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to - // search it. - if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) { - if (!cache_i->wide_partition()) { - cache_entry& entry = *cache_i; - upgrade_entry(entry); - entry.partition().apply(*_schema, std::move(mem_e.partition()), *mem_e.schema()); - _tracker.touch(entry); - _tracker.on_merge(); - } - } else if (presence_checker(mem_e.key()) == - partition_presence_checker_result::definitely_doesnt_exist) { - cache_entry* entry = current_allocator().construct( - mem_e.schema(), std::move(mem_e.key()), std::move(mem_e.partition())); - _tracker.insert(*entry); - _partitions.insert(cache_i, *entry); - } else { - _tracker.clear_continuity(*cache_i); - } + updater(cache_i, mem_e); i = m.partitions.erase(i); current_allocator().destroy(&mem_e); --quota; @@ -872,6 +756,13 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec }); STAP_PROBE(scylla, row_cache_update_partition_end); } while (!m.partitions.empty() && quota && !need_preempt()); + with_allocator(standard_allocator(), [&] { + if (m.partitions.empty()) { + _prev_snapshot_pos = {}; + } else { + _prev_snapshot_pos = m.partitions.begin()->key(); + } + }); STAP_PROBE1(scylla, row_cache_update_one_batch_end, quota_before - quota); }); if (quota == 0 && seastar::thread::should_yield()) { @@ -888,6 +779,42 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec }); } +future<> row_cache::update(memtable& m, partition_presence_checker is_present) { + return do_update(m, [this, is_present = std::move(is_present)] (row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e) mutable { + // If cache doesn't contain the entry we cannot insert it because the mutation may be incomplete. + // FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to + // search it. + if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) { + cache_entry& entry = *cache_i; + upgrade_entry(entry); + entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema()); + _tracker.touch(entry); + _tracker.on_merge(); + } else if (is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) { + cache_entry* entry = current_allocator().construct( + mem_e.schema(), std::move(mem_e.key()), std::move(mem_e.partition())); + _tracker.insert(*entry); + _partitions.insert(cache_i, *entry); + } else { + _tracker.clear_continuity(*cache_i); + } + }); +} + +future<> row_cache::update_invalidating(memtable& m) { + return do_update(m, [this] (row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e) { + if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) { + // FIXME: Invalidate only affected row ranges. + // This invalidates all row ranges and the static row, leaving only the partition tombstone continuous, + // which has to always be continuous. 
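+            // Replacing the entry's partition with an incomplete one, carrying the
+            // memtable's partition tombstone, drops the cached rows and static row,
+            // so the next read of this partition repopulates them from the new
+            // underlying snapshot.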
+ cache_entry& e = *cache_i; + e.partition() = partition_entry(mutation_partition::make_incomplete(*e.schema(), mem_e.partition().partition_tombstone())); + } else { + _tracker.clear_continuity(*cache_i); + } + }); +} + void row_cache::touch(const dht::decorated_key& dk) { _read_section(_tracker.region(), [&] { with_linearized_managed_bytes([&] { @@ -914,45 +841,37 @@ void row_cache::invalidate_locked(const dht::decorated_key& dk) { } future<> row_cache::invalidate(const dht::decorated_key& dk) { -return _populate_phaser.advance_and_await().then([this, &dk] { - _read_section(_tracker.region(), [&] { - with_allocator(_tracker.allocator(), [this, &dk] { + return invalidate(dht::partition_range::make_singular(dk)); +} + +future<> row_cache::invalidate(const dht::partition_range& range) { + return invalidate(dht::partition_range_vector({range})); +} + +future<> row_cache::invalidate(dht::partition_range_vector&& ranges) { + return get_units(_update_sem, 1).then([this, ranges = std::move(ranges)] (auto permit) mutable { + _underlying = _snapshot_source(); + ++_underlying_phase; + auto on_failure = defer([this] { this->clear_now(); }); with_linearized_managed_bytes([&] { - invalidate_locked(dk); + for (auto&& range : ranges) { + this->invalidate_unwrapped(range); + } }); - }); + on_failure.cancel(); }); -}); } -future<> row_cache::invalidate(const dht::partition_range& range) { - return _populate_phaser.advance_and_await().then([this, &range] { - with_linearized_managed_bytes([&] { - invalidate_unwrapped(range); - }); - }); +void row_cache::evict(const dht::partition_range& range) { + invalidate_unwrapped(range); } void row_cache::invalidate_unwrapped(const dht::partition_range& range) { logalloc::reclaim_lock _(_tracker.region()); auto cmp = cache_entry::compare(_schema); - auto begin = _partitions.begin(); - if (range.start()) { - if (range.start()->is_inclusive()) { - begin = _partitions.lower_bound(range.start()->value(), cmp); - } else { - begin = _partitions.upper_bound(range.start()->value(), cmp); - } - } - auto end = partitions_end(); - if (range.end()) { - if (range.end()->is_inclusive()) { - end = _partitions.upper_bound(range.end()->value(), cmp); - } else { - end = _partitions.lower_bound(range.end()->value(), cmp); - } - } + auto begin = _partitions.lower_bound(dht::ring_position_view::for_range_start(range), cmp); + auto end = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), cmp); with_allocator(_tracker.allocator(), [this, begin, end] { auto it = _partitions.erase_and_dispose(begin, end, [this, deleter = current_deleter()] (auto&& p) mutable { _tracker.on_erase(); @@ -963,13 +882,12 @@ void row_cache::invalidate_unwrapped(const dht::partition_range& range) { }); } -row_cache::row_cache(schema_ptr s, mutation_source fallback_factory, - cache_tracker& tracker, uint64_t max_cached_partition_size_in_bytes) +row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker) : _tracker(tracker) , _schema(std::move(s)) , _partitions(cache_entry::compare(_schema)) - , _underlying(std::move(fallback_factory)) - , _max_cached_partition_size_in_bytes(max_cached_partition_size_in_bytes) + , _underlying(src()) + , _snapshot_source(std::move(src)) { with_allocator(_tracker.allocator(), [this] { cache_entry* entry = current_allocator().construct(cache_entry::dummy_entry_tag()); @@ -1002,41 +920,30 @@ void row_cache::set_schema(schema_ptr new_schema) noexcept { _schema = std::move(new_schema); } -future cache_entry::read_wide(row_cache& rc, - schema_ptr s, 
const query::partition_slice& slice, const io_priority_class& pc, streamed_mutation::forwarding fwd) -{ - struct range_and_underlyig_reader { - dht::partition_range _range; - mutation_reader _reader; - range_and_underlyig_reader(row_cache& rc, schema_ptr s, dht::partition_range pr, - const query::partition_slice& slice, const io_priority_class& pc, streamed_mutation::forwarding fwd) - : _range(std::move(pr)) - , _reader(rc._underlying(s, _range, slice, pc, nullptr, fwd)) - { } - range_and_underlyig_reader(range_and_underlyig_reader&&) = delete; - }; - rc._tracker.on_uncached_wide_partition(); - auto pr = dht::partition_range::make_singular(_key); - auto rd_ptr = std::make_unique(rc, s, std::move(pr), slice, pc, fwd); - auto& r_a_ur = *rd_ptr; - return r_a_ur._reader().finally([rd_ptr = std::move(rd_ptr)] {}); +streamed_mutation cache_entry::read(row_cache& rc, read_context& reader) { + auto source_and_phase = rc.snapshot_of(_key); + reader.enter_partition(_key, source_and_phase.snapshot, source_and_phase.phase); + return do_read(rc, reader); } -streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s, streamed_mutation::forwarding fwd) { - return read(rc, s, query::full_slice, fwd); +streamed_mutation cache_entry::read(row_cache& rc, read_context& reader, + streamed_mutation&& sm, row_cache::phase_type phase) { + reader.enter_partition(std::move(sm), phase); + return do_read(rc, reader); } -streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s, const query::partition_slice& slice, streamed_mutation::forwarding fwd) { - assert(!wide_partition()); - if (_schema->version() != s->version()) { - auto ck_ranges = query::clustering_key_filter_ranges::get_ranges(*s, slice, _key.key()); - auto mp = mutation_partition(_pe.squashed(_schema, s), *s, std::move(ck_ranges)); - auto m = mutation(s, _key, std::move(mp)); - return streamed_mutation_from_mutation(std::move(m), fwd); +// Assumes reader is in the corresponding partition +streamed_mutation cache_entry::do_read(row_cache& rc, read_context& reader) { + auto snp = _pe.read(_schema, reader.phase()); + auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key()); + auto sm = make_cache_streamed_mutation(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp)); + if (reader.schema()->version() != _schema->version()) { + sm = transform(std::move(sm), schema_upgrader(reader.schema())); } - auto ckr = query::clustering_key_filter_ranges::get_ranges(*s, slice, _key.key()); - auto snp = _pe.read(_schema); - return make_partition_snapshot_reader(_schema, _key, std::move(ckr), snp, rc._tracker.region(), rc._read_section, { }, fwd); + if (reader.fwd() == streamed_mutation::forwarding::yes) { + sm = make_forwardable(std::move(sm)); + } + return std::move(sm); } const schema_ptr& row_cache::schema() const { @@ -1045,10 +952,6 @@ const schema_ptr& row_cache::schema() const { void row_cache::upgrade_entry(cache_entry& e) { if (e._schema != _schema) { - if (e.wide_partition()) { - e._schema = _schema; - return; - } auto& r = _tracker.region(); assert(!r.reclaiming_enabled()); with_allocator(r.allocator(), [this, &e] { @@ -1059,3 +962,18 @@ void row_cache::upgrade_entry(cache_entry& e) { }); } } + +std::ostream& operator<<(std::ostream& out, row_cache& rc) { + rc._read_section(rc._tracker.region(), [&] { + out << "{row_cache: " << ::join(", ", rc._partitions.begin(), rc._partitions.end()) << "}"; + }); + return out; +} + +std::ostream& operator<<(std::ostream& out, cache_entry& 
e) { + return out << "{cache_entry: " << e.position() + << ", cont=" << e.continuous() + << ", dummy=" << e.is_dummy_entry() + << ", " << e.partition() + << "}"; +} diff --git a/row_cache.hh b/row_cache.hh index 12b66ce5dc3e..0d189a99ba33 100644 --- a/row_cache.hh +++ b/row_cache.hh @@ -40,6 +40,16 @@ namespace bi = boost::intrusive; class row_cache; +class memtable_entry; + +namespace cache { + +class autoupdating_underlying_reader; +class cache_streamed_mutation; +class read_context; +class lsa_manager; + +} // Intrusive set entry which holds partition data. // @@ -60,119 +70,113 @@ class cache_entry { // True when we know that there is nothing between this entry and the next one in cache struct { bool _continuous : 1; - bool _wide_partition : 1; bool _dummy_entry : 1; } _flags{}; lru_link_type _lru_link; cache_link_type _cache_link; friend class size_calculator; + + streamed_mutation do_read(row_cache&, cache::read_context& reader); public: friend class row_cache; friend class cache_tracker; struct dummy_entry_tag{}; + struct incomplete_tag{}; + cache_entry(dummy_entry_tag) : _key{dht::token(), partition_key::make_empty()} { _flags._dummy_entry = true; } - struct wide_partition_tag{}; - - cache_entry(schema_ptr s, const dht::decorated_key& key, wide_partition_tag) - : _schema(std::move(s)) - , _key(key) - { - _flags._wide_partition = true; - } + // Creates an entry which is fully discontinuous, except for the partition tombstone. + cache_entry(incomplete_tag, schema_ptr s, const dht::decorated_key& key, tombstone t) + : cache_entry(s, key, mutation_partition::make_incomplete(*s, t)) + { } cache_entry(schema_ptr s, const dht::decorated_key& key, const mutation_partition& p) : _schema(std::move(s)) , _key(key) , _pe(p) - { } + { + _pe.version()->partition().ensure_last_dummy(*_schema); + } cache_entry(schema_ptr s, dht::decorated_key&& key, mutation_partition&& p) noexcept : _schema(std::move(s)) , _key(std::move(key)) , _pe(std::move(p)) - { } + { + _pe.version()->partition().ensure_last_dummy(*_schema); + } + // It is assumed that pe is fully continuous cache_entry(schema_ptr s, dht::decorated_key&& key, partition_entry&& pe) noexcept : _schema(std::move(s)) , _key(std::move(key)) , _pe(std::move(pe)) - { } + { + // If we can assume that _pe is fully continuous, we don't need to check all versions + // to determine what the continuity is. + // This doesn't change value and doesn't invalidate iterators, so can be called even with a snapshot. + _pe.version()->partition().ensure_last_dummy(*_schema); + } cache_entry(cache_entry&&) noexcept; bool is_evictable() { return _lru_link.is_linked(); } const dht::decorated_key& key() const { return _key; } + dht::ring_position_view position() const { + if (is_dummy_entry()) { + return dht::ring_position_view::max(); + } + return _key; + } const partition_entry& partition() const { return _pe; } partition_entry& partition() { return _pe; } const schema_ptr& schema() const { return _schema; } schema_ptr& schema() { return _schema; } - // Requires: !wide_partition() - streamed_mutation read(row_cache&, const schema_ptr&, streamed_mutation::forwarding); - // Requires: !wide_partition() - streamed_mutation read(row_cache&, const schema_ptr&, const query::partition_slice&, streamed_mutation::forwarding); - // May return disengaged optional if the partition is empty. 
- future read_wide(row_cache&, schema_ptr, const query::partition_slice&, const io_priority_class&, streamed_mutation::forwarding); + streamed_mutation read(row_cache&, cache::read_context& reader); + streamed_mutation read(row_cache&, cache::read_context& reader, streamed_mutation&& underlying, utils::phased_barrier::phase_type); bool continuous() const { return _flags._continuous; } void set_continuous(bool value) { _flags._continuous = value; } - bool wide_partition() const { return _flags._wide_partition; } - void set_wide_partition() { - _flags._wide_partition = true; - _pe = {}; - } bool is_dummy_entry() const { return _flags._dummy_entry; } struct compare { - dht::decorated_key::less_comparator _c; + dht::ring_position_less_comparator _c; compare(schema_ptr s) - : _c(std::move(s)) + : _c(*s) {} bool operator()(const dht::decorated_key& k1, const cache_entry& k2) const { - if (k2.is_dummy_entry()) { - return true; - } - return _c(k1, k2._key); + return _c(k1, k2.position()); } - bool operator()(const dht::ring_position& k1, const cache_entry& k2) const { - if (k2.is_dummy_entry()) { - return true; - } - return _c(k1, k2._key); + bool operator()(dht::ring_position_view k1, const cache_entry& k2) const { + return _c(k1, k2.position()); } bool operator()(const cache_entry& k1, const cache_entry& k2) const { - if (k1.is_dummy_entry()) { - return false; - } - if (k2.is_dummy_entry()) { - return true; - } - return _c(k1._key, k2._key); + return _c(k1.position(), k2.position()); } bool operator()(const cache_entry& k1, const dht::decorated_key& k2) const { - if (k1.is_dummy_entry()) { - return false; - } - return _c(k1._key, k2); + return _c(k1.position(), k2); } - bool operator()(const cache_entry& k1, const dht::ring_position& k2) const { - if (k1.is_dummy_entry()) { - return false; - } - return _c(k1._key, k2); + bool operator()(const cache_entry& k1, dht::ring_position_view k2) const { + return _c(k1.position(), k2); + } + + bool operator()(dht::ring_position_view k1, dht::ring_position_view k2) const { + return _c(k1, k2); } }; + + friend std::ostream& operator<<(std::ostream&, cache_entry&); }; // Tracks accesses and performs eviction of cache entries. 
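With cache_entry::position() in place, the comparator above no longer has to special-case the dummy entry in each overload: the dummy reports dht::ring_position_view::max(), so ordinary ring-position comparison keeps it as the greatest element of the partition set. A minimal sketch of the resulting ordering, reusing the names from this header (illustrative only; s and dk are placeholder variables, not part of the patch):

    cache_entry::compare cmp(s);                        // s: schema_ptr of the table
    cache_entry dummy(cache_entry::dummy_entry_tag{});
    // Any real decorated key dk sorts before the dummy entry:
    //   cmp(dk, dummy) == true, cmp(dummy, dk) == false
    // so the dummy stays the last element of row_cache::_partitions, and
    // partitions_end() (std::prev(_partitions.end())) keeps excluding it.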
@@ -190,23 +194,20 @@ public: struct stats { uint64_t hits; uint64_t misses; - uint64_t uncached_wide_partitions; - uint64_t wide_partition_mispopulations; uint64_t insertions; uint64_t concurrent_misses_same_key; uint64_t merges; uint64_t evictions; - uint64_t wide_partition_evictions; uint64_t removals; uint64_t partitions; uint64_t modification_count; + uint64_t mispopulations; }; private: stats _stats{}; seastar::metrics::metric_groups _metrics; logalloc::region _region; lru_type _lru; - lru_type _wide_partition_lru; private: void setup_metrics(); public: @@ -215,21 +216,18 @@ public: void clear(); void touch(cache_entry&); void insert(cache_entry&); - void mark_wide(cache_entry&); void clear_continuity(cache_entry& ce); void on_erase(); void on_merge(); void on_hit(); void on_miss(); void on_miss_already_populated(); - void on_uncached_wide_partition(); - void on_wide_partition_mispopulation(); + void on_mispopulate(); allocation_strategy& allocator(); logalloc::region& region(); const logalloc::region& region() const; uint64_t modification_count() const { return _stats.modification_count; } uint64_t partitions() const { return _stats.partitions; } - uint64_t uncached_wide_partitions() const { return _stats.uncached_wide_partitions; } const stats& get_stats() const { return _stats; } }; @@ -240,21 +238,27 @@ cache_tracker& global_cache_tracker(); // A data source which wraps another data source such that data obtained from the underlying data source // is cached in-memory in order to serve queries faster. // -// To query the underlying data source through cache, use make_reader(). -// // Cache populates itself automatically during misses. // -// Cache needs to be maintained externally so that it remains consistent with the underlying data source. -// Any incremental change to the underlying data source should result in update() being called on cache. +// Cache represents a snapshot of the underlying mutation source. When the +// underlying mutation source changes, cache needs to be explicitly synchronized +// to the latest snapshot. This is done by calling update() or invalidate(). // class row_cache final { public: + using phase_type = utils::phased_barrier::phase_type; using partitions_type = bi::set, bi::constant_time_size, // we need this to have bi::auto_unlink on hooks bi::compare>; + friend class cache::autoupdating_underlying_reader; friend class single_partition_populating_reader; friend class cache_entry; + friend class cache::cache_streamed_mutation; + friend class cache::lsa_manager; + friend class cache::read_context; + friend class partition_range_cursor; + friend class cache_tester; public: struct stats { utils::timed_rate_moving_average hits; @@ -265,32 +269,52 @@ private: stats _stats{}; schema_ptr _schema; partitions_type _partitions; // Cached partitions are complete. + + // The snapshots used by cache are versioned. The version number of a snapshot is + // called the "population phase", or simply "phase". Between updates, cache + // represents the same snapshot. + // + // Update doesn't happen atomically. Before it completes, some entries reflect + // the old snapshot, while others reflect the new snapshot. After update + // completes, all entries must reflect the new snapshot. There is a race between the + // update process and populating reads. Since after the update all entries must + // reflect the new snapshot, reads using the old snapshot cannot be allowed to + // insert data which will no longer be reached by the update process. 
The whole + // range can be therefore divided into two sub-ranges, one which was already + // processed by the update and one which hasn't. Each key can be assigned a + // population phase which determines to which range it belongs, as well as which + // snapshot it reflects. The methods snapshot_of() and phase_of() can + // be used to determine this. + // + // In general, reads are allowed to populate given range only if the phase + // of the snapshot they use matches the phase of all keys in that range + // when the population is committed. This guarantees that the range will + // be reached by the update process or already has been in its entirety. + // In case of phase conflict, current solution is to give up on + // population. Since the update process is a scan, it's sufficient to + // check when committing the population if the start and end of the range + // have the same phases and that it's the same phase as that of the start + // of the range at the time when reading began. + mutation_source _underlying; - uint64_t _max_cached_partition_size_in_bytes; - - // Synchronizes populating reads with updates of underlying data source to ensure that cache - // remains consistent across flushes with the underlying data source. - // Readers obtained from the underlying data source in earlier than - // current phases must not be used to populate the cache, unless they hold - // phaser::operation created in the reader's phase of origin. Readers - // should hold to a phase only briefly because this inhibits progress of - // updates. Phase changes occur in update()/clear(), which can be assumed to - // be asynchronous wrt invoking of the underlying data source. - utils::phased_barrier _populate_phaser; + phase_type _underlying_phase = 0; + mutation_source_opt _prev_snapshot; + + // Positions >= than this are using _prev_snapshot, the rest is using _underlying. + stdx::optional _prev_snapshot_pos; + + snapshot_source _snapshot_source; + + // There can be at most one update in progress. + seastar::semaphore _update_sem = {1}; logalloc::allocating_section _update_section; logalloc::allocating_section _populate_section; logalloc::allocating_section _read_section; - mutation_reader make_scanning_reader(schema_ptr, - const dht::partition_range&, - const io_priority_class& pc, - const query::partition_slice& slice, - tracing::trace_state_ptr trace_state, - streamed_mutation::forwarding, - mutation_reader::forwarding); + mutation_reader create_underlying_reader(cache::read_context&, mutation_source&, const dht::partition_range&); + mutation_reader make_scanning_reader(const dht::partition_range&, lw_shared_ptr); void on_hit(); void on_miss(); - void on_uncached_wide_partition(); void upgrade_entry(cache_entry&); void invalidate_locked(const dht::decorated_key&); void invalidate_unwrapped(const dht::partition_range&); @@ -298,13 +322,10 @@ private: static thread_local seastar::thread_scheduling_group _update_thread_scheduling_group; struct previous_entry_pointer { - utils::phased_barrier::phase_type _populate_phase; stdx::optional _key; - void reset(stdx::optional key, utils::phased_barrier::phase_type populate_phase) { - _populate_phase = populate_phase; - _key = std::move(key); - } + previous_entry_pointer() = default; // Represents dht::ring_position_view::min() + previous_entry_pointer(dht::decorated_key key) : _key(std::move(key)) {}; // TODO: Currently inserting an entry to the cache increases // modification counter. 
That doesn't seem to be necessary and if we @@ -317,15 +338,55 @@ private: // { create(it) } -> partitions_type::iterator; // { visit(it) } -> void; // } - void do_find_or_create_entry(const dht::decorated_key& key, const previous_entry_pointer* previous, + // + // Must be run under reclaim lock + cache_entry& do_find_or_create_entry(const dht::decorated_key& key, const previous_entry_pointer* previous, CreateEntry&& create_entry, VisitEntry&& visit_entry); + // Ensures that partition entry for given key exists in cache and returns a reference to it. + // Prepares the entry for reading. "phase" must match the current phase of the entry. + // + // Since currently every entry has to have a complete tombstone, it has to be provided here. + // The entry which is returned will have the tombstone applied to it. + // + // Must be run under reclaim lock + cache_entry& find_or_create(const dht::decorated_key& key, tombstone t, row_cache::phase_type phase, const previous_entry_pointer* previous = nullptr); + partitions_type::iterator partitions_end() { return std::prev(_partitions.end()); } + + // Only active phases are accepted. + // Reference valid only until next deferring point. + mutation_source& snapshot_for_phase(phase_type); + + // Returns population phase for given position in the ring. + // snapshot_for_phase() can be called to obtain mutation_source for given phase, but + // only until the next deferring point. + // Should be only called outside update(). + phase_type phase_of(dht::ring_position_view); + + struct snapshot_and_phase { + mutation_source& snapshot; + phase_type phase; + }; + + // Optimized version of: + // + // { snapshot_for_phase(phase_of(pos)), phase_of(pos) }; + // + snapshot_and_phase snapshot_of(dht::ring_position_view pos); + + // Merges the memtable into cache with configurable logic for handling memtable entries. + // The Updater gets invoked for every entry in the memtable with a lower bound iterator + // into _partitions (cache_i), and the memtable entry. + // It is invoked inside allocating section and in the context of cache's allocator. + // All memtable entries will be removed. + template + future<> do_update(memtable& m, Updater func); public: ~row_cache(); - row_cache(schema_ptr, mutation_source underlying, cache_tracker&, uint64_t _max_cached_partition_size_in_bytes = 10 * 1024 * 1024); + row_cache(schema_ptr, snapshot_source, cache_tracker&); row_cache(row_cache&&) = default; row_cache(const row_cache&) = delete; row_cache& operator=(row_cache&&) = default; @@ -344,43 +405,49 @@ public: const stats& stats() const { return _stats; } public: - // Populate cache from given mutation. The mutation must contain all - // information there is for its partition in the underlying data sources. + // Populate cache from given mutation, which must be fully continuous. + // Intended to be used only in tests. + // Can only be called prior to any reads. void populate(const mutation& m, const previous_entry_pointer* previous = nullptr); - // Caches an information that a partition with a given key is wide. - void mark_partition_as_wide(const dht::decorated_key& key, const previous_entry_pointer* previous = nullptr); - - // Clears the cache. - // Guarantees that cache will not be populated using readers created - // before this method was invoked. - future<> clear(); - // Synchronizes cache with the underlying data source from a memtable which // has just been flushed to the underlying data source. // The memtable can be queried during the process, but must not be written. 
// After the update is complete, memtable is empty. future<> update(memtable&, partition_presence_checker underlying_negative); + // Like update(), synchronizes cache with an incremental change to the underlying + // mutation source, but instead of inserting and merging data, invalidates affected ranges. + // Can be thought of as a more fine-grained version of invalidate(), which invalidates + // as few elements as possible. + future<> update_invalidating(memtable&); + // Moves given partition to the front of LRU if present in cache. void touch(const dht::decorated_key&); - // Removes given partition from cache. + // Synchronizes cache with the underlying mutation source + // by invalidating ranges which were modified. This will force + // them to be re-read from the underlying mutation source + // during next read overlapping with the invalidated ranges. // - // Guarantees that cache will not be populated with given key - // using readers created before this method was invoked. + // The ranges passed to invalidate() must include all + // data which changed since last synchronization. Failure + // to do so may result in reads seeing partial writes, + // which would violate write atomicity. // - // The key must be kept alive until method resolves. - future<> invalidate(const dht::decorated_key& key); - - // Removes given range of partitions from cache. - // The range can be a wrap around. - // - // Guarantees that cache will not be populated with partitions from that range - // using readers created before this method was invoked. + // Guarantees that readers created after invalidate() + // completes will see all writes from the underlying + // mutation source made prior to the call to invalidate(). + future<> invalidate(const dht::decorated_key&); + future<> invalidate(const dht::partition_range& = query::full_partition_range); + future<> invalidate(dht::partition_range_vector&&); + + // Evicts entries from given range in cache. // - // The range must be kept alive until method resolves. - future<> invalidate(const dht::partition_range&); + // Note that this does not synchronize with the underlying source, + // it is assumed that the underlying source didn't change. + // If it did, use invalidate() instead. + void evict(const dht::partition_range& = query::full_partition_range); auto num_entries() const { return _partitions.size(); @@ -392,6 +459,8 @@ public: void set_schema(schema_ptr) noexcept; const schema_ptr& schema() const; + friend std::ostream& operator<<(std::ostream&, row_cache&); + friend class just_cache_scanning_reader; friend class scanning_and_populating_reader; friend class range_populating_reader; diff --git a/schema_upgrader.hh b/schema_upgrader.hh new file mode 100644 index 000000000000..2683cec7dd41 --- /dev/null +++ b/schema_upgrader.hh @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#pragma once + +#include "streamed_mutation.hh" +#include "converting_mutation_partition_applier.hh" + +// A StreamedMutationTransformer which transforms the stream to a different schema +class schema_upgrader { + schema_ptr _prev; + schema_ptr _new; +private: + row transform(row&& r, column_kind kind) { + row new_row; + r.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) { + const column_definition& col = _prev->column_at(kind, id); + const column_definition* new_col = _new->get_column_definition(col.name()); + if (new_col) { + converting_mutation_partition_applier::append_cell(new_row, kind, *new_col, col.type, std::move(cell)); + } + }); + return new_row; + } +public: + schema_upgrader(schema_ptr s) + : _new(std::move(s)) + { } + schema_ptr operator()(schema_ptr old) { + _prev = std::move(old); + return _new; + } + mutation_fragment consume(static_row&& row) { + return mutation_fragment(static_row(transform(std::move(row.cells()), column_kind::static_column))); + } + mutation_fragment consume(clustering_row&& row) { + return mutation_fragment(clustering_row(row.key(), row.tomb(), row.marker(), + transform(std::move(row.cells()), column_kind::regular_column))); + } + mutation_fragment consume(range_tombstone&& rt) { + return std::move(rt); + } + mutation_fragment operator()(mutation_fragment&& mf) { + return std::move(mf).consume(*this); + } +}; + +GCC6_CONCEPT( +static_assert(StreamedMutationTranformer()); +) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 9a4b7dcf2d0d..dc1ce5d7b410 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -2042,9 +2042,10 @@ class data_read_resolver : public abstract_read_resolver { virtual void accept_static_cell(column_id, atomic_cell_view) override { } virtual void accept_static_cell(column_id, collection_mutation_view) override { } virtual void accept_row_tombstone(const range_tombstone&) override { } - virtual void accept_row(clustering_key_view key, const row_tombstone&, const row_marker&) override { + virtual void accept_row(position_in_partition_view pos, const row_tombstone&, const row_marker&, is_dummy dummy, is_continuous) override { + assert(!dummy); if (!_is_reversed || !_last_ck) { - _last_ck = clustering_key(key); + _last_ck = pos.key(); } } virtual void accept_row_cell(column_id id, atomic_cell_view) override { } diff --git a/streamed_mutation.cc b/streamed_mutation.cc index 5a1bf99d41cc..31a1686a58ac 100644 --- a/streamed_mutation.cc +++ b/streamed_mutation.cc @@ -154,6 +154,10 @@ std::ostream& operator<<(std::ostream& os, const mutation_fragment& mf) { return os; } +streamed_mutation make_empty_streamed_mutation(schema_ptr s, dht::decorated_key key, streamed_mutation::forwarding fwd) { + return streamed_mutation_from_mutation(mutation(std::move(key), std::move(s)), fwd); +} + streamed_mutation streamed_mutation_from_mutation(mutation m, streamed_mutation::forwarding fwd) { class reader final : public streamed_mutation::impl { @@ -165,10 +169,16 @@ streamed_mutation streamed_mutation_from_mutation(mutation m, streamed_mutation: private: void prepare_next_clustering_row() { auto& crs = _mutation.partition().clustered_rows(); - auto re = crs.unlink_leftmost_without_rebalance(); - if (re) { + while (true) { + auto re = crs.unlink_leftmost_without_rebalance(); + if (!re) { + break; + } auto re_deleter = defer([re] { current_deleter()(re); }); - _cr = mutation_fragment(std::move(*re)); + if (!re->dummy()) { + _cr = mutation_fragment(std::move(*re)); + break; + } } } void 
prepare_next_range_tombstone() { @@ -262,6 +272,44 @@ streamed_mutation streamed_mutation_from_mutation(mutation m, streamed_mutation: return std::move(sm); } +streamed_mutation streamed_mutation_from_forwarding_streamed_mutation(streamed_mutation&& sm) +{ + class reader final : public streamed_mutation::impl { + streamed_mutation _sm; + bool _static_row_done = false; + public: + explicit reader(streamed_mutation&& sm) + : streamed_mutation::impl(sm.schema(), sm.decorated_key(), sm.partition_tombstone()) + , _sm(std::move(sm)) + { } + + virtual future<> fill_buffer() override { + if (!_static_row_done) { + _static_row_done = true; + return _sm().then([this] (auto&& mf) { + if (mf) { + this->push_mutation_fragment(std::move(*mf)); + } + return _sm.fast_forward_to(query::clustering_range{}).then([this] { + return this->fill_buffer(); + }); + }); + } + return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] { + return _sm().then([this] (auto&& mf) { + if (mf) { + this->push_mutation_fragment(std::move(*mf)); + } else { + _end_of_stream = true; + } + }); + }); + } + }; + + return make_streamed_mutation(std::move(sm)); +} + streamed_mutation make_forwardable(streamed_mutation m) { class reader : public streamed_mutation::impl { streamed_mutation _sm; @@ -472,8 +520,7 @@ mutation_fragment_opt range_tombstone_stream::do_get_next() mutation_fragment_opt range_tombstone_stream::get_next(const rows_entry& re) { if (!_list.empty()) { - position_in_partition_view view(position_in_partition_view::clustering_row_tag_t(), re.key()); - return !_cmp(view, _list.begin()->position()) ? do_get_next() : mutation_fragment_opt(); + return !_cmp(re.position(), _list.begin()->position()) ? do_get_next() : mutation_fragment_opt(); } return { }; } @@ -632,3 +679,7 @@ bool mutation_fragment::relevant_for_range_assuming_after(const schema& s, posit // Range tombstones overlapping with the new range are let in return is_range_tombstone() && cmp(pos, as_range_tombstone().end_position()); } + +std::ostream& operator<<(std::ostream& out, const range_tombstone_stream& rtl) { + return out << rtl._list; +} diff --git a/streamed_mutation.hh b/streamed_mutation.hh index 66e05ecabadf..a562add89cd6 100644 --- a/streamed_mutation.hh +++ b/streamed_mutation.hh @@ -23,6 +23,7 @@ #include "mutation_partition.hh" #include "utils/optimized_optional.hh" +#include "position_in_partition.hh" #include @@ -38,8 +39,6 @@ // mutation_fragment objects. It reflects the order in which content of // partition appears in the sstables. 
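+//
+// A streamed_mutation is consumed by calling its operator(), which yields a
+// future<mutation_fragment_opt>; a disengaged optional signals the end of the
+// stream. Draining a stream could look like this (illustrative sketch only,
+// assuming seastar's repeat()):
+//
+//   repeat([&sm] {
+//       return sm().then([] (mutation_fragment_opt mfo) {
+//           return mfo ? stop_iteration::no : stop_iteration::yes;
+//       });
+//   });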
-class position_in_partition_view; - class clustering_row { clustering_key_prefix _ck; row_tombstone _t; @@ -111,6 +110,13 @@ public: return sizeof(clustering_row) + external_memory_usage(); } + bool equal(const schema& s, const clustering_row& other) const { + return _ck.equal(s, other._ck) + && _t == other._t + && _marker == other._marker + && _cells.equal(column_kind::static_column, s, other._cells, s); + } + friend std::ostream& operator<<(std::ostream& os, const clustering_row& row); }; @@ -148,6 +154,10 @@ public: return sizeof(static_row) + external_memory_usage(); } + bool equal(const schema& s, const static_row& other) const { + return _cells.equal(column_kind::static_column, s, other._cells, s); + } + friend std::ostream& operator<<(std::ostream& is, const static_row& row); }; @@ -185,9 +195,29 @@ public: mutation_fragment(clustering_row&& r); mutation_fragment(range_tombstone&& r); - mutation_fragment(const mutation_fragment&) = delete; + mutation_fragment(const mutation_fragment& o) + : _kind(o._kind), _data(std::make_unique()) { + switch(_kind) { + case kind::static_row: + new (&_data->_static_row) static_row(o._data->_static_row); + break; + case kind::clustering_row: + new (&_data->_clustering_row) clustering_row(o._data->_clustering_row); + break; + case kind::range_tombstone: + new (&_data->_range_tombstone) range_tombstone(o._data->_range_tombstone); + break; + } + } mutation_fragment(mutation_fragment&& other) = default; - mutation_fragment& operator=(const mutation_fragment&) = delete; + mutation_fragment& operator=(const mutation_fragment& other) { + if (this != &other) { + mutation_fragment copy(other); + this->~mutation_fragment(); + new (this) mutation_fragment(std::move(copy)); + } + return *this; + } mutation_fragment& operator=(mutation_fragment&& other) noexcept { if (this != &other) { this->~mutation_fragment(); @@ -297,395 +327,24 @@ public: return *_data->_size_in_bytes; } - friend std::ostream& operator<<(std::ostream&, const mutation_fragment& mf); -}; - -std::ostream& operator<<(std::ostream&, mutation_fragment::kind); - -std::ostream& operator<<(std::ostream&, const mutation_fragment& mf); - -class position_in_partition; - -inline -lexicographical_relation relation_for_lower_bound(composite_view v) { - switch (v.last_eoc()) { - case composite::eoc::start: - case composite::eoc::none: - return lexicographical_relation::before_all_prefixed; - case composite::eoc::end: - return lexicographical_relation::after_all_prefixed; - default: - assert(0); - } -} - -inline -lexicographical_relation relation_for_upper_bound(composite_view v) { - switch (v.last_eoc()) { - case composite::eoc::start: - return lexicographical_relation::before_all_prefixed; - case composite::eoc::none: - return lexicographical_relation::before_all_strictly_prefixed; - case composite::eoc::end: - return lexicographical_relation::after_all_prefixed; - default: - assert(0); - } -} - -class position_in_partition_view { - friend class position_in_partition; - - int _bound_weight = 0; - const clustering_key_prefix* _ck; // nullptr for static row -private: - position_in_partition_view(int bound_weight, const clustering_key_prefix* ck) - : _bound_weight(bound_weight) - , _ck(ck) - { } - // Returns placement of this position_in_partition relative to *_ck, - // or lexicographical_relation::at_prefix if !_ck. 
- lexicographical_relation relation() const { - // FIXME: Currently position_range cannot represent a range end bound which - // includes just the prefix key or a range start which excludes just a prefix key. - // In both cases we should return lexicographical_relation::before_all_strictly_prefixed here. - // Refs #1446. - if (_bound_weight <= 0) { - return lexicographical_relation::before_all_prefixed; - } else { - return lexicographical_relation::after_all_prefixed; - } - } -public: - struct static_row_tag_t { }; - struct clustering_row_tag_t { }; - struct range_tag_t { }; - using range_tombstone_tag_t = range_tag_t; - - position_in_partition_view(static_row_tag_t) : _ck(nullptr) { } - position_in_partition_view(clustering_row_tag_t, const clustering_key_prefix& ck) - : _ck(&ck) { } - position_in_partition_view(range_tag_t, bound_view bv) - : _bound_weight(weight(bv.kind)), _ck(&bv.prefix) { } - - static position_in_partition_view for_range_start(const query::clustering_range&); - static position_in_partition_view for_range_end(const query::clustering_range&); - - static position_in_partition_view before_all_clustered_rows() { - return {range_tag_t(), bound_view::bottom()}; - } - - static position_in_partition_view for_static_row() { - return {static_row_tag_t()}; - } - - bool is_static_row() const { return !_ck; } - - // Returns true if all fragments that can be seen for given schema have - // positions >= than this. - bool is_before_all_fragments(const schema& s) const { - return !_ck || (!s.has_static_columns() && _bound_weight < 0 && _ck->is_empty(s)); - } - - friend std::ostream& operator<<(std::ostream&, position_in_partition_view); -}; - -inline -position_in_partition_view position_in_partition_view::for_range_start(const query::clustering_range& r) { - return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)}; -} - -inline -position_in_partition_view position_in_partition_view::for_range_end(const query::clustering_range& r) { - return {position_in_partition_view::range_tag_t(), bound_view::from_range_end(r)}; -} - -class position_in_partition { - int _bound_weight = 0; - stdx::optional _ck; -public: - struct static_row_tag_t { }; - struct after_static_row_tag_t { }; - struct clustering_row_tag_t { }; - struct after_clustering_row_tag_t { }; - struct range_tag_t { }; - using range_tombstone_tag_t = range_tag_t; - - explicit position_in_partition(static_row_tag_t) { } - position_in_partition(clustering_row_tag_t, clustering_key_prefix ck) - : _ck(std::move(ck)) { } - position_in_partition(after_clustering_row_tag_t, clustering_key_prefix ck) - // FIXME: Use lexicographical_relation::before_strictly_prefixed here. 
Refs #1446 - : _bound_weight(1), _ck(std::move(ck)) { } - position_in_partition(range_tag_t, bound_view bv) - : _bound_weight(weight(bv.kind)), _ck(bv.prefix) { } - position_in_partition(after_static_row_tag_t) : - position_in_partition(range_tag_t(), bound_view::bottom()) { } - explicit position_in_partition(position_in_partition_view view) - : _bound_weight(view._bound_weight) - { - if (view._ck) { - _ck = *view._ck; - } - } - - static position_in_partition before_all_clustered_rows() { - return {position_in_partition::range_tag_t(), bound_view::bottom()}; - } - - static position_in_partition after_all_clustered_rows() { - return {position_in_partition::range_tag_t(), bound_view::top()}; - } - - static position_in_partition after_key(clustering_key ck) { - return {after_clustering_row_tag_t(), std::move(ck)}; - } - - static position_in_partition for_key(clustering_key ck) { - return {clustering_row_tag_t(), std::move(ck)}; - } - - bool is_static_row() const { return !_ck; } - bool is_clustering_row() const { return _ck && !_bound_weight; } - - template - void feed_hash(Hasher& hasher, const schema& s) const { - ::feed_hash(hasher, _bound_weight); - if (_ck) { - ::feed_hash(hasher, true); - _ck->feed_hash(hasher, s); - } else { - ::feed_hash(hasher, false); - } - } - - clustering_key_prefix& key() { - return *_ck; - } - const clustering_key_prefix& key() const { - return *_ck; - } - operator position_in_partition_view() const { - return { _bound_weight, _ck ? &*_ck : nullptr }; - } - - // Defines total order on the union of position_and_partition and composite objects. - // - // The ordering is compatible with position_range (r). The following is satisfied for - // all cells with name c included by the range: - // - // r.start() <= c < r.end() - // - // The ordering on composites given by this is compatible with but weaker than the cell name order. - // - // The ordering on position_in_partition given by this is compatible but weaker than the ordering - // given by position_in_partition::tri_compare. - // - class composite_tri_compare { - const schema& _s; - public: - composite_tri_compare(const schema& s) : _s(s) {} - - int operator()(position_in_partition_view a, position_in_partition_view b) const { - if (a.is_static_row() || b.is_static_row()) { - return b.is_static_row() - a.is_static_row(); - } - auto&& types = _s.clustering_key_type()->types(); - auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); }; - return lexicographical_tri_compare(types.begin(), types.end(), - a._ck->begin(_s), a._ck->end(_s), - b._ck->begin(_s), b._ck->end(_s), - cmp, a.relation(), b.relation()); - } - - int operator()(position_in_partition_view a, composite_view b) const { - if (b.empty()) { - return 1; // a cannot be empty. - } - if (a.is_static_row() || b.is_static()) { - return b.is_static() - a.is_static_row(); - } - auto&& types = _s.clustering_key_type()->types(); - auto b_values = b.values(); - auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); }; - return lexicographical_tri_compare(types.begin(), types.end(), - a._ck->begin(_s), a._ck->end(_s), - b_values.begin(), b_values.end(), - cmp, a.relation(), relation_for_lower_bound(b)); - } - - int operator()(composite_view a, position_in_partition_view b) const { - return -(*this)(b, a); - } - - int operator()(composite_view a, composite_view b) const { - if (a.is_static() != b.is_static()) { - return a.is_static() ? 
-1 : 1; - } - auto&& types = _s.clustering_key_type()->types(); - auto a_values = a.values(); - auto b_values = b.values(); - auto cmp = [&] (const data_type& t, bytes_view c1, bytes_view c2) { return t->compare(c1, c2); }; - return lexicographical_tri_compare(types.begin(), types.end(), - a_values.begin(), a_values.end(), - b_values.begin(), b_values.end(), - cmp, - relation_for_lower_bound(a), - relation_for_lower_bound(b)); - } - }; - - // Less comparator giving the same order as composite_tri_compare. - class composite_less_compare { - composite_tri_compare _cmp; - public: - composite_less_compare(const schema& s) : _cmp(s) {} - - template - bool operator()(const T& a, const U& b) const { - return _cmp(a, b) < 0; - } - }; - - class tri_compare { - bound_view::tri_compare _cmp; - private: - template - int compare(const T& a, const U& b) const { - bool a_rt_weight = bool(a._ck); - bool b_rt_weight = bool(b._ck); - if (!a_rt_weight || !b_rt_weight) { - return a_rt_weight - b_rt_weight; - } - return _cmp(*a._ck, a._bound_weight, *b._ck, b._bound_weight); - } - public: - tri_compare(const schema& s) : _cmp(s) { } - int operator()(const position_in_partition& a, const position_in_partition& b) const { - return compare(a, b); - } - int operator()(const position_in_partition_view& a, const position_in_partition_view& b) const { - return compare(a, b); - } - int operator()(const position_in_partition& a, const position_in_partition_view& b) const { - return compare(a, b); + bool equal(const schema& s, const mutation_fragment& other) const { + if (other._kind != _kind) { + return false; } - int operator()(const position_in_partition_view& a, const position_in_partition& b) const { - return compare(a, b); - } - }; - class less_compare { - tri_compare _cmp; - public: - less_compare(const schema& s) : _cmp(s) { } - bool operator()(const position_in_partition& a, const position_in_partition& b) const { - return _cmp(a, b) < 0; - } - bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const { - return _cmp(a, b) < 0; - } - bool operator()(const position_in_partition& a, const position_in_partition_view& b) const { - return _cmp(a, b) < 0; - } - bool operator()(const position_in_partition_view& a, const position_in_partition& b) const { - return _cmp(a, b) < 0; - } - }; - class equal_compare { - clustering_key_prefix::equality _equal; - template - bool compare(const T& a, const U& b) const { - bool a_rt_weight = bool(a._ck); - bool b_rt_weight = bool(b._ck); - return a_rt_weight == b_rt_weight - && (!a_rt_weight || (_equal(*a._ck, *b._ck) - && a._bound_weight == b._bound_weight)); - } - public: - equal_compare(const schema& s) : _equal(s) { } - bool operator()(const position_in_partition& a, const position_in_partition& b) const { - return compare(a, b); - } - bool operator()(const position_in_partition_view& a, const position_in_partition_view& b) const { - return compare(a, b); - } - bool operator()(const position_in_partition_view& a, const position_in_partition& b) const { - return compare(a, b); - } - bool operator()(const position_in_partition& a, const position_in_partition_view& b) const { - return compare(a, b); + switch(_kind) { + case kind::static_row: + return as_static_row().equal(s, other.as_static_row()); + case kind::clustering_row: + return as_clustering_row().equal(s, other.as_clustering_row()); + case kind::range_tombstone: + return as_range_tombstone().equal(s, other.as_range_tombstone()); } - }; - friend std::ostream& operator<<(std::ostream&, 
const position_in_partition&); -}; - -// Includes all position_in_partition objects "p" for which: start <= p < end -// And only those. -class position_range { -private: - position_in_partition _start; - position_in_partition _end; -public: - static position_range from_range(const query::clustering_range&); - - static position_range for_static_row() { - return { - position_in_partition(position_in_partition::static_row_tag_t()), - position_in_partition(position_in_partition::after_static_row_tag_t()) - }; - } - - static position_range full() { - return { - position_in_partition(position_in_partition::static_row_tag_t()), - position_in_partition::after_all_clustered_rows() - }; - } - - static position_range all_clustered_rows() { - return { - position_in_partition::before_all_clustered_rows(), - position_in_partition::after_all_clustered_rows() - }; + abort(); } - position_range(position_range&&) = default; - position_range& operator=(position_range&&) = default; - position_range(const position_range&) = default; - position_range& operator=(const position_range&) = default; - - // Constructs position_range which covers the same rows as given clustering_range. - // position_range includes a fragment if it includes position of that fragment. - position_range(const query::clustering_range&); - position_range(query::clustering_range&&); - - position_range(position_in_partition start, position_in_partition end) - : _start(std::move(start)) - , _end(std::move(end)) - { } - - const position_in_partition& start() const& { return _start; } - position_in_partition&& start() && { return std::move(_start); } - const position_in_partition& end() const& { return _end; } - position_in_partition&& end() && { return std::move(_end); } - bool contains(const schema& s, position_in_partition_view pos) const; - bool overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const; - - friend std::ostream& operator<<(std::ostream&, const position_range&); + friend std::ostream& operator<<(std::ostream&, const mutation_fragment& mf); }; -inline -bool position_range::contains(const schema& s, position_in_partition_view pos) const { - position_in_partition::less_compare less(s); - return !less(pos, _start) && less(pos, _end); -} - -inline -bool position_range::overlaps(const schema& s, position_in_partition_view start, position_in_partition_view end) const { - position_in_partition::less_compare less(s); - return !less(end, _start) && less(start, _end); -} - inline position_in_partition_view static_row::position() const { return position_in_partition_view(position_in_partition_view::static_row_tag_t()); @@ -696,6 +355,10 @@ inline position_in_partition_view clustering_row::position() const return position_in_partition_view(position_in_partition_view::clustering_row_tag_t(), _ck); } +std::ostream& operator<<(std::ostream&, mutation_fragment::kind); + +std::ostream& operator<<(std::ostream&, const mutation_fragment& mf); + template<> struct move_constructor_disengages { enum { value = true }; @@ -889,11 +552,14 @@ class mutation; streamed_mutation streamed_mutation_from_mutation(mutation, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no); streamed_mutation streamed_mutation_returning(schema_ptr, dht::decorated_key, std::vector, tombstone t = {}); +streamed_mutation streamed_mutation_from_forwarding_streamed_mutation(streamed_mutation&&); //Requires all streamed_mutations to have the same schema. 
streamed_mutation merge_mutations(std::vector); streamed_mutation reverse_streamed_mutation(streamed_mutation); +streamed_mutation make_empty_streamed_mutation(schema_ptr, dht::decorated_key, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no); + // range_tombstone_stream is a helper object that simplifies producing a stream // of range tombstones and merging it with a stream of clustering rows. // Tombstones are added using apply() and retrieved using get_next(). @@ -935,6 +601,7 @@ public: } void apply(const range_tombstone_list&, const query::clustering_range&); void reset(); + friend std::ostream& operator<<(std::ostream& out, const range_tombstone_stream&); }; // mutation_hasher is an equivalent of hashing_partition_visitor for @@ -1049,3 +716,50 @@ public: consume_range_tombstones_until_end(); } }; + + +GCC6_CONCEPT( + // F gets a stream element as an argument and returns the new value which replaces that element + // in the transformed stream. + template + concept bool StreamedMutationTranformer() { + return requires(F f, mutation_fragment mf, schema_ptr s) { + { f(std::move(mf)) } -> mutation_fragment + { f(s) } -> schema_ptr + }; + } +) + +// Creates a stream which is like sm but with transformation applied to the elements. +template +GCC6_CONCEPT( + requires StreamedMutationTranformer() +) +streamed_mutation transform(streamed_mutation sm, T t) { + class reader : public streamed_mutation::impl { + streamed_mutation _sm; + T _t; + public: + explicit reader(streamed_mutation sm, T&& t) + : impl(t(sm.schema()), sm.decorated_key(), sm.partition_tombstone()) + , _sm(std::move(sm)) + , _t(std::move(t)) + { } + + virtual future<> fill_buffer() override { + return _sm.fill_buffer().then([this] { + while (!_sm.is_buffer_empty()) { + push_mutation_fragment(_t(_sm.pop_mutation_fragment())); + } + _end_of_stream = _sm.is_end_of_stream(); + }); + } + + virtual future<> fast_forward_to(position_range pr) override { + _end_of_stream = false; + forward_buffer_to(pr.start()); + return _sm.fast_forward_to(std::move(pr)); + } + }; + return make_streamed_mutation(std::move(sm), std::move(t)); +} diff --git a/test.py b/test.py index e783ed00c2cc..c5abb3d66a15 100755 --- a/test.py +++ b/test.py @@ -53,6 +53,7 @@ 'canonical_mutation_test', 'gossiping_property_file_snitch_test', 'row_cache_test', + 'cache_streamed_mutation_test', 'network_topology_strategy_test', 'query_processor_test', 'batchlog_manager_test', @@ -150,6 +151,7 @@ def boost_test_wants_double_dash(path): '-c1 -m1G'.split())) test_to_run.append(('build/release/tests/sstable_test', 'boost', ['-c1'])) test_to_run.append(('build/release/tests/view_schema_test', 'boost', ['-c1'])) + test_to_run.append(('build/release/tests/row_cache_stress_test', 'other', '-c1 -m1G --seconds 10'.split())) if 'debug' in modes_to_run: test_to_run.append(('build/debug/tests/sstable_test', 'boost', ['-c1'])) test_to_run.append(('build/debug/tests/view_schema_test', 'boost', ['-c1'])) diff --git a/tests/cache_streamed_mutation_test.cc b/tests/cache_streamed_mutation_test.cc new file mode 100644 index 000000000000..2f5f1289d496 --- /dev/null +++ b/tests/cache_streamed_mutation_test.cc @@ -0,0 +1,1272 @@ + +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + + +#include + +#include "tests/test-utils.hh" +#include "core/thread.hh" +#include "schema_builder.hh" +#include "keys.hh" +#include "mutation_partition.hh" +#include "partition_version.hh" +#include "mutation.hh" +#include "memtable.hh" +#include "cache_streamed_mutation.hh" +#include "row_cache.hh" + +#include "disk-error-handler.hh" +#include "memtable_snapshot_source.hh" +#include "mutation_assertions.hh" + +thread_local disk_error_signal_type commit_error; +thread_local disk_error_signal_type general_disk_error; + +/* + * =================== + * ====== Utils ====== + * =================== + */ + +static schema_ptr make_schema() { + return schema_builder("ks", "cf") + .with_column("pk", int32_type, column_kind::partition_key) + .with_column("ck", int32_type, column_kind::clustering_key) + .with_column("v", int32_type) + .build(); +} + +static const thread_local schema_ptr SCHEMA = make_schema(); + +static partition_key make_pk(int value) { + return partition_key::from_exploded(*SCHEMA, { int32_type->decompose(value) }); +} + +static const thread_local partition_key PK = make_pk(0); +static const thread_local dht::decorated_key DK = + dht::global_partitioner().decorate_key(*SCHEMA, PK); + +static clustering_key make_ck(int value) { + return clustering_key_prefix::from_single_value(*SCHEMA, int32_type->decompose(value)); +} + +static void add_row(mutation& m, int ck, int value) { + m.set_clustered_cell(make_ck(ck), "v", data_value(value), 1); +} + +static void add_tombstone(mutation& m, range_tombstone rt) { + m.partition().apply_row_tombstone(*SCHEMA, rt); +} + +static void set_row_continuous(mutation_partition& mp, int ck, is_continuous value) { + auto it = mp.clustered_rows().find(make_ck(ck), rows_entry::compare(*SCHEMA)); + assert(it != mp.clustered_rows().end()); + it->set_continuous(value); +} + +static query::partition_slice make_slice(std::vector ranges) { + return query::partition_slice(std::move(ranges), { }, { }, { }); +} + +struct expected_fragment { + boost::variant f; + + expected_fragment(int row_key) : f(row_key) { } + expected_fragment(range_tombstone rt) : f(rt) { } + + void check(streamed_mutation_assertions& sm, const query::clustering_row_ranges& ranges) { + if (f.which() == 0) { + sm.produces_row_with_key(make_ck(boost::get(f))); + } else { + sm.produces_range_tombstone(boost::get(f), ranges); + } + } +}; + +static +mutation make_incomplete_mutation() { + return mutation(SCHEMA, DK, mutation_partition::make_incomplete(*SCHEMA)); +} + +static void assert_single_version(lw_shared_ptr snp) { + BOOST_REQUIRE(snp->at_latest_version()); + BOOST_REQUIRE_EQUAL(snp->version_count(), 1); +} + +struct expected_row { + int ck; + is_continuous continuous; + is_dummy dummy; + + struct dummy_tag_t { }; + + expected_row(int k, is_continuous cont) + : ck(k), continuous(cont), dummy(false) { } + expected_row(dummy_tag_t, is_continuous cont) + : continuous(cont), dummy(true) { } + + void check(const rows_entry& r) const { + clustering_key::equality ck_eq(*SCHEMA); + BOOST_REQUIRE_EQUAL(r.continuous(), continuous); + BOOST_REQUIRE_EQUAL(r.dummy(), dummy); + if (!r.dummy()) { + BOOST_REQUIRE(ck_eq(r.key(), make_ck(ck))); + } + 
} + + friend std::ostream& operator<<(std::ostream& out, const expected_row& e) { + return out << "{ck=" << e.ck << ", cont=" << bool(e.continuous) << ", dummy=" << bool(e.dummy) << "}"; + } +}; + +static void assert_cached_rows(lw_shared_ptr snp, std::deque expected) { + auto&& rows = snp->version()->partition().clustered_rows(); + for (auto&& r : rows) { + BOOST_REQUIRE(!expected.empty()); + expected.front().check(r); + expected.pop_front(); + } + if (!expected.empty()) { + BOOST_FAIL(sprint("Expected %s next, but no more rows", expected.front())); + } +} + +struct expected_tombstone { + int start; + bool start_inclusive; + int end; + bool end_inclusive; + + expected_tombstone(int s, bool s_i, int e, bool e_i) + : start(s) + , start_inclusive(s_i) + , end(e) + , end_inclusive(e_i) + { } + void check(const range_tombstone& rt) const { + clustering_key::equality ck_eq(*SCHEMA); + BOOST_REQUIRE(ck_eq(rt.start, make_ck(start))); + BOOST_REQUIRE_EQUAL(rt.start_kind, start_inclusive ? bound_kind::incl_start : bound_kind::excl_start); + BOOST_REQUIRE(ck_eq(rt.end, make_ck(end))); + BOOST_REQUIRE_EQUAL(rt.end_kind, end_inclusive ? bound_kind::incl_end : bound_kind::excl_end); + } +}; + +static void assert_cached_tombstones(lw_shared_ptr snp, std::deque expected) { + const range_tombstone_list& rts = snp->version()->partition().row_tombstones(); + for (auto&& rt : rts) { + BOOST_REQUIRE(!expected.empty()); + if (!expected.front().equal(*SCHEMA, rt)) { + BOOST_FAIL(sprint("Expected %s, but found %s", expected.front(), rt)); + } + expected.pop_front(); + } + BOOST_REQUIRE(expected.empty()); +} + +class cache_tester { +public: + static lw_shared_ptr snapshot_for_key(row_cache& rc, const dht::decorated_key& dk) { + return rc._read_section(rc._tracker.region(), [&] { + return with_linearized_managed_bytes([&] { + cache_entry& e = rc.find_or_create(dk, {}, rc.phase_of(dk)); + return e.partition().read(e.schema()); + }); + }); + } +}; + +static void check_produces_only(streamed_mutation sm, std::deque expected, const query::clustering_row_ranges& ranges) { + auto sa = assert_that_stream(std::move(sm)); + for (auto&& e : expected) { + e.check(sa, ranges); + } + sa.produces_end_of_stream(); +} + +void test_slice_single_version(mutation& underlying, + mutation& cache_mutation, + const query::partition_slice& slice, + std::deque expected_sm_fragments, + std::deque expected_cache_rows, + std::deque expected_cache_tombstones) { + // Set up underlying + memtable_snapshot_source source_mt(SCHEMA); + source_mt.apply(underlying); + cache_tracker tracker; + row_cache cache(SCHEMA, snapshot_source([&] { return source_mt(); }), tracker); + + cache.populate(cache_mutation); + + try { + auto range = dht::partition_range::make_singular(DK); + auto reader = cache.make_reader(SCHEMA, range, slice); + auto smo = reader().get0(); + BOOST_REQUIRE(bool(smo)); + + check_produces_only(std::move(*smo), expected_sm_fragments, slice.row_ranges(*SCHEMA, DK.key())); + + auto snp = cache_tester::snapshot_for_key(cache, DK); + assert_single_version(snp); + assert_cached_rows(snp, expected_cache_rows); + assert_cached_tombstones(snp, expected_cache_tombstones); + } catch (...) 
{ + std::cerr << "cache: " << cache << "\n"; + throw; + } +} + +/* + * ======================================================== + * ====== Tests for single row with a single version ====== + * ======================================================== + */ +void test_single_row(int ck, + bool cached, + is_continuous continuous, + const query::partition_slice& slice, + std::deque expected_sm_rows, + std::deque expected_cache_rows) { + const int value = 12; + + mutation underlying(PK, SCHEMA); + add_row(underlying, ck, value); + + auto m = make_incomplete_mutation(); + if (cached) { + add_row(m, ck, value); + set_row_continuous(m.partition(), ck, continuous); + } + + std::deque expected_sm_fragments; + for (int r : expected_sm_rows) { + expected_sm_fragments.push_back(expected_fragment(r)); + } + test_slice_single_version(underlying, m, slice, expected_sm_fragments, expected_cache_rows, { }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_full_range) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, query::full_slice, { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_single_row_range) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make_singular(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_range_from_start_to_row) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make_ending_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_range_from_row_to_end) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make_starting_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_exclusive_range_on_the_left) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make_ending_with({make_ck(1), false}) }), + { }, { expected_row(expected_row::dummy_tag_t{}, is_continuous::no) }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_exclusive_range_on_the_right) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make_starting_with({make_ck(1), false}) }), + { }, { expected_row(expected_row::dummy_tag_t{}, is_continuous::no) }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_small_range) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(0), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_not_cached_small_range_on_left) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(0), make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + 
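The expectations above rely on the cache's continuity convention: a rows_entry's continuous() flag describes the interval between the preceding entry and the entry itself, and the trailing dummy entry stands for the position after all clustered rows, so its flag covers the tail of the partition. The snippet below is only an editorial sketch, not part of this patch (row_state and describe_intervals are made-up names), showing how a list of (key, continuous, dummy) expectations reads back as populated intervals under that convention.

#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for the (ck, continuous, dummy) triples used by expected_row.
struct row_state {
    int ck;            // clustering key; ignored for the trailing dummy entry
    bool continuous;   // is the interval (previous entry, this entry] fully cached?
    bool dummy;        // sentinel entry positioned after all clustered rows
};

// Prints which clustering intervals the cached partition claims to have complete.
void describe_intervals(const std::vector<row_state>& rows) {
    std::string prev = "-inf";
    for (const auto& r : rows) {
        std::string cur = r.dummy ? "+inf" : std::to_string(r.ck);
        std::cout << "(" << prev << ", " << cur << "]: "
                  << (r.continuous ? "complete" : "unknown") << "\n";
        prev = cur;
    }
}

int main() {
    // Mirrors test_single_row_not_cached_range_from_start_to_row: the read covered
    // everything up to and including row 1, but nothing after it.
    describe_intervals({{1, true, false}, {0, false, true}});
}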
+SEASTAR_TEST_CASE(test_single_row_not_cached_small_range_on_right) { + return seastar::async([] { + test_single_row(1, false, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(1), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_full_range) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, query::full_slice, { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_single_row_range) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make_singular(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_range_from_start_to_row) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make_ending_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_range_from_row_to_end) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make_starting_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_exclusive_range_on_the_left) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make_ending_with({make_ck(1), false}) }), { }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_exclusive_range_on_the_right) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make_starting_with({make_ck(1), false}) }), { }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_small_range) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(0), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_small_range_on_left) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(0), make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_continuous_small_range_on_right) { + return seastar::async([] { + test_single_row(1, true, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(1), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + 
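The single-row cases differ only in the shape of the queried clustering range, so here, as a quick non-normative reference, are the range constructors these tests use; the interval comments are inferred from how the tests treat each range, with the bool in a bound pair selecting inclusiveness (false = exclusive).

auto singular   = query::clustering_range::make_singular(make_ck(1));                // [1, 1]
auto prefix     = query::clustering_range::make_ending_with(make_ck(1));             // (-inf, 1]
auto suffix     = query::clustering_range::make_starting_with(make_ck(1));           // [1, +inf)
auto excl_start = query::clustering_range::make_starting_with({make_ck(1), false});  // (1, +inf)
auto excl_end   = query::clustering_range::make_ending_with({make_ck(1), false});    // (-inf, 1)
auto bounded    = query::clustering_range::make(make_ck(0), make_ck(2));             // [0, 2]
auto slice      = make_slice({ bounded });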
+SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_full_range) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, query::full_slice, { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_single_row_range) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make_singular(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_range_from_start_to_row) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make_ending_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_range_from_row_to_end) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make_starting_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_exclusive_range_on_the_left) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make_ending_with({make_ck(1), false}) }), { }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_exclusive_range_on_the_right) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make_starting_with({make_ck(1), false}) }), { }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_small_range) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_small_range_on_left) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_cached_as_noncontinuous_small_range_on_right) { + return seastar::async([] { + test_single_row(1, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(1), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +/* + * ====================================================== + * ====== Tests for two rows with a single version ====== + * ====================================================== + */ + +void test_two_rows(int ck1, + bool cached1, + is_continuous continuous1, + int ck2, + bool cached2, + is_continuous continuous2, + const query::partition_slice& slice, + std::deque expected_sm_rows, 
+ std::deque expected_cache_rows) { + const int value1 = 12; + const int value2 = 34; + + mutation underlying(PK, SCHEMA); + add_row(underlying, ck1, value1); + add_row(underlying, ck2, value2); + + auto cache = make_incomplete_mutation(); + if (cached1) { + add_row(cache, ck1, value1); + set_row_continuous(cache.partition(), ck1, continuous1); + } + if (cached2) { + add_row(cache, ck2, value2); + set_row_continuous(cache.partition(), ck2, continuous2); + } + std::deque expected_sm_fragments; + for (int r : expected_sm_rows) { + expected_sm_fragments.push_back(expected_fragment(r)); + } + test_slice_single_version(underlying, cache, slice, expected_sm_fragments, expected_cache_rows, { }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_full_range) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, query::full_slice, { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_single_row_range1) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make_singular(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_single_row_range2) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make_singular(make_ck(3)) }), { 3 }, { + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_range_from_start_to_row1) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make_ending_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_range_from_start_to_row2) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make_ending_with(make_ck(3)) }), { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_range_from_row1_to_end) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make_starting_with(make_ck(1)) }), { 1, 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_range_from_row2_to_end) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make_starting_with(make_ck(3)) }), { 3 }, { + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_exclusive_range_on_the_left) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ 
query::clustering_range::make_ending_with({make_ck(3), false}) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_exclusive_range_on_the_right) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make_starting_with({make_ck(1), false}) }), { 3 }, { + expected_row(3, is_continuous::no), // TODO: this should be possible to mark continuous here + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_exclusive_range_between_rows1) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make({make_ck(1), false}, {make_ck(3), false}) }), + { }, { expected_row(expected_row::dummy_tag_t{}, is_continuous::no) }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_exclusive_range_between_rows2) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make({make_ck(1), false}, make_ck(3)) }), { 3 }, { + expected_row(3, is_continuous::no), // TODO: this should be possible to mark continuous here + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_exclusive_range_between_rows3) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(1), {make_ck(3), false}) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_small_range) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_small_range_row1) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_not_cached_small_range_row2) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_full_range) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, query::full_slice, { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_single_row_range1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_singular(make_ck(1)) 
}), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_single_row_range2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_singular(make_ck(3)) }), { 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_range_from_start_to_row1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_ending_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_range_from_start_to_row2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_ending_with(make_ck(3)) }), { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_range_from_row1_to_end) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_starting_with(make_ck(1)) }), { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_range_from_row2_to_end) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_starting_with(make_ck(3)) }), { 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_exclusive_range_on_the_left) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_ending_with({make_ck(3), false}) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_exclusive_range_on_the_right) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make_starting_with({make_ck(1), false}) }), { 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_exclusive_range_between_rows1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make({make_ck(1), false}, {make_ck(3), false}) }), { }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, 
is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_exclusive_range_between_rows2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make({make_ck(1), false}, make_ck(3)) }), { 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_exclusive_range_between_rows3) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(1), {make_ck(3), false}) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_small_range) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_small_range_row1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(0), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_continuous_small_range_row2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, true, is_continuous::yes, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_full_range) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, query::full_slice, { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_single_row_range1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make_singular(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_single_row_range2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make_singular(make_ck(3)) }), { 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_range_from_start_to_row1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ 
query::clustering_range::make_ending_with(make_ck(1)) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_range_from_start_to_row2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make_ending_with(make_ck(3)) }), { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_range_from_row1_to_end) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make_starting_with(make_ck(1)) }), { 1, 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_range_from_row2_to_end) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make_starting_with(make_ck(3)) }), { 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_exclusive_range_on_the_left) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make_ending_with({make_ck(3), false}) }), { 1 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_exclusive_range_on_the_right) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make_starting_with({make_ck(1), false}) }), { 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::yes) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_exclusive_range_between_rows1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make({make_ck(1), false}, {make_ck(3), false}) }), { }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_exclusive_range_between_rows2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make({make_ck(1), false}, make_ck(3)) }), { 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_exclusive_range_between_rows3) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(1), {make_ck(3), false}) }), { 1 }, { + expected_row(1, 
is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_small_range) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_small_range_row1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(2)) }), { 1 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_cached_non_continuous_small_range_row2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_first_not_cached_second_cached_non_continuous1) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_first_not_cached_second_cached_non_continuous2) { + return seastar::async([] { + test_two_rows(1, false, is_continuous::no, 3, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_first_cached_non_continuous_second_not_cached1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_first_cached_non_continuous_second_not_cached2) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::no, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_first_cached_continuous_second_not_cached1) { + return seastar::async([] { + test_two_rows(1, true, is_continuous::yes, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_two_rows_first_cached_continuous_second_not_cached2) { + return seastar::async([] { 
+ test_two_rows(1, true, is_continuous::yes, 3, false, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +/* + * ======================================================== + * ====== Tests for three rows with a single version ====== + * ======================================================== + */ + +void test_three_rows(int ck1, + bool cached1, + is_continuous continuous1, + int ck2, + bool cached2, + is_continuous continuous2, + int ck3, + bool cached3, + is_continuous continuous3, + const query::partition_slice& slice, + std::deque expected_sm_rows, + std::deque expected_cache_rows) { + const int value1 = 12; + const int value2 = 34; + const int value3 = 56; + + mutation underlying(PK, SCHEMA); + add_row(underlying, ck1, value1); + add_row(underlying, ck2, value2); + add_row(underlying, ck3, value3); + + auto cache = make_incomplete_mutation(); + if (cached1) { + add_row(cache, ck1, value1); + set_row_continuous(cache.partition(), ck1, continuous1); + } + if (cached2) { + add_row(cache, ck2, value2); + set_row_continuous(cache.partition(), ck2, continuous2); + } + if (cached3) { + add_row(cache, ck3, value3); + set_row_continuous(cache.partition(), ck3, continuous3); + } + std::deque expected_sm_fragments; + for (int r : expected_sm_rows) { + expected_sm_fragments.push_back(expected_fragment(r)); + } + test_slice_single_version(underlying, cache, slice, expected_sm_fragments, expected_cache_rows, { }); +} + +SEASTAR_TEST_CASE(test_three_rows_first_continuous1) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::yes, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(6)) }), { 1, 3, 5 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(5, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_three_rows_first_continuous2) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::yes, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(6)) }), { 3, 5 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::no), + expected_row(5, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_three_rows_first_continuous3) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::yes, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::yes), + expected_row(5, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_three_rows_first_continuous4) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::yes, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(1, is_continuous::yes), + expected_row(3, is_continuous::no), + expected_row(5, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + 
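All of the continuity expectations in the two- and three-row cases follow one rule: a read may mark an entry continuous only when it covered the whole gap between that entry and the preceding cache entry (the TODO comments in the tests note where this is more conservative than strictly necessary). The sketch below is an editorial illustration of that rule, not part of the patch (rows_made_continuous is a made-up helper; exclusive bounds and the trailing dummy entry are left out). It reproduces, for instance, why row 3 stays non-continuous in test_three_rows_first_continuous2 while row 5 becomes continuous.

#include <optional>
#include <set>
#include <vector>

// Predicts which entries a read over a single clustering range may newly mark
// continuous. cached_keys holds the clustering keys present in cache after the
// read (pre-existing and newly inserted); an absent bound means that end of the
// range is unbounded.
std::vector<int> rows_made_continuous(const std::set<int>& cached_keys,
                                      std::optional<int> lower_bound,
                                      std::optional<int> upper_bound) {
    std::vector<int> result;
    std::optional<int> prev;   // previous cache entry; nullopt == start of the partition
    for (int key : cached_keys) {
        bool gap_start_covered = !lower_bound || (prev && *prev >= *lower_bound);
        bool key_in_range = (!lower_bound || key >= *lower_bound) &&
                            (!upper_bound || key <= *upper_bound);
        if (gap_start_covered && key_in_range) {
            result.push_back(key);   // the gap (prev, key] was fully read
        }
        prev = key;
    }
    return result;
}

// rows_made_continuous({1, 3, 5}, 2, 6) == {5}: the gap (1, 3] starts below the
// queried range, so row 3 cannot be marked continuous, but (3, 5] can.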
+SEASTAR_TEST_CASE(test_three_rows_first_noncontinuous1) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::no, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(6)) }), { 1, 3, 5 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(5, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_three_rows_first_noncontinuous2) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::no, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(6)) }), { 3, 5 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(5, is_continuous::yes), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_three_rows_first_nonecontinuous3) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::no, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(0), make_ck(4)) }), { 1, 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::yes), + expected_row(5, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +SEASTAR_TEST_CASE(test_three_rows_first_nonecontinuous4) { + return seastar::async([] { + test_three_rows(1, true, is_continuous::no, 3, false, is_continuous::no, 5, true, is_continuous::no, make_slice({ query::clustering_range::make(make_ck(2), make_ck(4)) }), { 3 }, { + expected_row(1, is_continuous::no), + expected_row(3, is_continuous::no), + expected_row(5, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }); + }); +} + +/* + * ================================================================================================ + * ====== Tests for single rows and range tombstone with single version and single row range ====== + * ================================================================================================ + */ + +static tombstone new_tombstone() { + return tombstone(api::new_timestamp(), gc_clock::now()); +} + +SEASTAR_TEST_CASE(test_single_row_and_tombstone_not_cached_single_row_range1) { + return seastar::async([] { + const int ck1 = 1; + const int value1 = 12; + range_tombstone rt(make_ck(0), bound_kind::incl_start, make_ck(2), bound_kind::incl_end, new_tombstone()); + + mutation underlying(PK, SCHEMA); + add_row(underlying, ck1, value1); + add_tombstone(underlying, rt); + + auto cache = make_incomplete_mutation(); + auto slice = make_slice({ query::clustering_range::make_singular(make_ck(ck1)) }); + + test_slice_single_version(underlying, cache, slice, { + expected_fragment(rt), + expected_fragment(1) + }, { + expected_row(1, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }, { rt }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_and_tombstone_not_cached_single_row_range2) { + return seastar::async([] { + const int ck1 = 1; + const int value1 = 12; + range_tombstone rt(make_ck(0), bound_kind::incl_start, make_ck(2), bound_kind::incl_end, new_tombstone()); + + mutation underlying(PK, SCHEMA); + add_row(underlying, ck1, value1); + add_tombstone(underlying, rt); + + auto cache = make_incomplete_mutation(); + auto slice = make_slice({ query::clustering_range::make(make_ck(0), {make_ck(1), false}) }); + + 
test_slice_single_version(underlying, cache, slice, { + expected_fragment(rt), + }, { + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }, { rt }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_and_tombstone_not_cached_single_row_range3) { + return seastar::async([] { + const int ck1 = 4; + const int value1 = 12; + range_tombstone rt(make_ck(0), bound_kind::incl_start, make_ck(2), bound_kind::incl_end, new_tombstone()); + + mutation underlying(PK, SCHEMA); + add_row(underlying, ck1, value1); + add_tombstone(underlying, rt); + + auto cache = make_incomplete_mutation(); + auto slice = make_slice({ query::clustering_range::make(make_ck(0), make_ck(5)) }); + + test_slice_single_version(underlying, cache, slice, { + expected_fragment(rt), + expected_fragment(4) + }, { + expected_row(4, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }, { rt }); + }); +} + +SEASTAR_TEST_CASE(test_single_row_and_tombstone_not_cached_single_row_range4) { + return seastar::async([] { + const int ck1 = 4; + const int value1 = 12; + range_tombstone rt(make_ck(0), bound_kind::incl_start, make_ck(2), bound_kind::incl_end, new_tombstone()); + + mutation underlying(PK, SCHEMA); + add_row(underlying, ck1, value1); + add_tombstone(underlying, rt); + + auto cache = make_incomplete_mutation(); + auto slice = make_slice({ query::clustering_range::make(make_ck(3), make_ck(5)) }); + + test_slice_single_version(underlying, cache, slice, { + expected_fragment(4) + }, { + expected_row(4, is_continuous::no), + expected_row(expected_row::dummy_tag_t{}, is_continuous::no) + }, {}); + }); +} diff --git a/tests/memory_footprint.cc b/tests/memory_footprint.cc index 81680d267748..6889251702f4 100644 --- a/tests/memory_footprint.cc +++ b/tests/memory_footprint.cc @@ -175,7 +175,7 @@ static sizes calculate_sizes(const mutation& m) { auto s = m.schema(); auto mt = make_lw_shared(s); cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); + row_cache cache(s, make_empty_snapshot_source(), tracker); auto cache_initial_occupancy = tracker.region().occupancy().used_space(); diff --git a/tests/memtable_snapshot_source.hh b/tests/memtable_snapshot_source.hh new file mode 100644 index 000000000000..96477014345f --- /dev/null +++ b/tests/memtable_snapshot_source.hh @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include "mutation_reader.hh" +#include "memtable.hh" +#include "utils/phased_barrier.hh" +#include +#include +#include + +// in-memory snapshottable mutation source. +// Must be destroyed in a seastar thread. 
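// Usage sketch (editorial illustration, not part of this patch; variable names
// are made up): mutations are fed in with apply(), and each call to operator()()
// below freezes the contents applied so far into an immutable mutation_source, e.g.
//
//   memtable_snapshot_source underlying(s);   // s: some schema_ptr
//   underlying.apply(m1);
//   mutation_source snap1 = underlying();     // reflects m1
//   underlying.apply(m2);
//   mutation_source snap2 = underlying();     // reflects m1 and m2; snap1 still sees only m1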
+class memtable_snapshot_source { + schema_ptr _s; + circular_buffer> _memtables; + utils::phased_barrier _apply; + bool _closed = false; + seastar::condition_variable _should_compact; + future<> _compactor; +private: + bool should_compact() const { + return !_closed && _memtables.size() >= 3; + } + lw_shared_ptr new_memtable() { + return make_lw_shared(_s); + } + lw_shared_ptr pending() { + if (_memtables.empty()) { + _memtables.push_back(new_memtable()); + on_new_memtable(); + } + return _memtables.back(); + } + void on_new_memtable() { + if (should_compact()) { + _should_compact.signal(); + } + } + void compact() { + if (_memtables.empty()) { + return; + } + auto count = _memtables.size(); + auto op = _apply.start(); + auto new_mt = make_lw_shared(_memtables.back()->schema()); + std::vector readers; + for (auto&& mt : _memtables) { + readers.push_back(mt->make_reader(new_mt->schema())); + } + auto&& rd = make_combined_reader(std::move(readers)); + consume(rd, [&] (mutation&& m) { + new_mt->apply(std::move(m)); + return stop_iteration::no; + }).get(); + _memtables.erase(_memtables.begin(), _memtables.begin() + count); + _memtables.push_back(new_mt); + } +public: + memtable_snapshot_source(schema_ptr s) + : _s(s) + , _compactor(seastar::async([this] { + while (!_closed) { + _should_compact.wait().get(); + while (should_compact()) { + compact(); + } + } + })) + { } + memtable_snapshot_source(memtable_snapshot_source&&) = delete; // 'this' captured. + ~memtable_snapshot_source() { + _closed = true; + _should_compact.broadcast(); + _compactor.get(); + } + // Must run in a seastar thread + void clear() { + _memtables.erase(_memtables.begin(), _memtables.end()); + _apply.advance_and_await().get(); + _memtables.erase(_memtables.begin(), _memtables.end()); + } + // Must run in a seastar thread + void apply(const mutation& mt) { + pending()->apply(mt); + } + // Must run in a seastar thread + void apply(memtable& mt) { + auto op = _apply.start(); + auto new_mt = new_memtable(); + new_mt->apply(mt).get(); + _memtables.push_back(new_mt); + } + // Must run in a seastar thread + // mt must not change from now on. + void apply(lw_shared_ptr mt) { + auto op = _apply.start(); + _memtables.push_back(std::move(mt)); + on_new_memtable(); + } + mutation_source operator()() { + std::vector src; + for (auto&& mt : _memtables) { + src.push_back(mt->as_data_source()); + } + _memtables.push_back(new_memtable()); // so that src won't change any more. + on_new_memtable(); + return make_combined_mutation_source(std::move(src)); + } +}; diff --git a/tests/mutation_assertions.hh b/tests/mutation_assertions.hh index 48cbbb4f0f77..bcb0d8c4d385 100644 --- a/tests/mutation_assertions.hh +++ b/tests/mutation_assertions.hh @@ -30,7 +30,12 @@ public: : _m(std::move(m)) { } - mutation_assertion& is_equal_to(const mutation& other) { + // If ck_ranges is passed, verifies only that information relevant for ck_ranges matches. 
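// (Editorial sketch, not part of this patch: an illustrative ck_ranges call,
// mirroring how the mutation_source tests below use it.)
//   assert_that(actual).is_equal_to(expected, slice.row_ranges(*s, pk.key()));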
+ mutation_assertion& is_equal_to(const mutation& other, const query::clustering_row_ranges& ck_ranges = {}) { + if (!ck_ranges.empty()) { + mutation_assertion(_m.sliced(ck_ranges)).is_equal_to(other.sliced(ck_ranges)); + return *this; + } if (_m != other) { BOOST_FAIL(sprint("Mutations differ, expected %s\n ...but got: %s", other, _m)); } @@ -54,6 +59,13 @@ public: return *this; } + mutation_assertion& has_same_continuity(const mutation& other) { + if (!_m.partition().equal_continuity(*_m.schema(), other.partition())) { + BOOST_FAIL(sprint("Continuity doesn't match: %s\n ...and: %s", other, _m)); + } + return *this; + } + // Verifies that mutation data remains unchanged when upgraded to the new schema void is_upgrade_equivalent(schema_ptr new_schema) { mutation m2 = _m; @@ -148,6 +160,25 @@ public: return *this; } + streamed_mutation_assertions& produces(mutation_fragment mf) { + auto mfopt = _sm().get0(); + if (!mfopt) { + BOOST_FAIL(sprint("Expected mutation fragment %s, got end of stream", mf)); + } + if (!mfopt->equal(*_sm.schema(), mf)) { + BOOST_FAIL(sprint("Expected %s, but got %s", mf, *mfopt)); + } + return *this; + } + + streamed_mutation_assertions& produces_only(const std::deque& fragments) { + for (auto&& f : fragments) { + produces(f); + } + produces_end_of_stream(); + return *this; + } + streamed_mutation_assertions& produces_row_with_key(const clustering_key& ck) { BOOST_TEST_MESSAGE(sprint("Expect %s", ck)); auto mfo = _sm().get0(); @@ -164,7 +195,8 @@ public: return *this; } - streamed_mutation_assertions& produces_range_tombstone(const range_tombstone& rt) { + // If ck_ranges is passed, verifies only that information relevant for ck_ranges matches. + streamed_mutation_assertions& produces_range_tombstone(const range_tombstone& rt, const query::clustering_row_ranges& ck_ranges = {}) { BOOST_TEST_MESSAGE(sprint("Expect %s", rt)); auto mfo = _sm().get0(); if (!mfo) { @@ -174,7 +206,18 @@ public: BOOST_FAIL(sprint("Expected range tombstone %s, but got %s", rt, *mfo)); } auto& actual = mfo->as_range_tombstone(); - if (!actual.equal(*_sm.schema(), rt)) { + const schema& s = *_sm.schema(); + if (!ck_ranges.empty()) { + range_tombstone_list actual_list(s); + range_tombstone_list expected_list(s); + actual_list.apply(s, actual); + expected_list.apply(s, rt); + actual_list.trim(s, ck_ranges); + expected_list.trim(s, ck_ranges); + if (!actual_list.equal(s, expected_list)) { + BOOST_FAIL(sprint("Expected %s, but got %s", expected_list, actual_list)); + } + } else if (!actual.equal(s, rt)) { BOOST_FAIL(sprint("Expected range tombstone %s, but got %s", rt, actual)); } return *this; diff --git a/tests/mutation_reader_assertions.hh b/tests/mutation_reader_assertions.hh index 20f81f8c48dd..4c24a89833d9 100644 --- a/tests/mutation_reader_assertions.hh +++ b/tests/mutation_reader_assertions.hh @@ -29,6 +29,11 @@ class reader_assertions { mutation_reader _reader; dht::partition_range _pr; +private: + mutation_opt read_next() { + auto smo = _reader().get0(); + return mutation_from_streamed_mutation(std::move(smo)).get0(); + } public: reader_assertions(mutation_reader reader) : _reader(std::move(reader)) @@ -36,35 +41,28 @@ public: reader_assertions& produces(const dht::decorated_key& dk) { BOOST_TEST_MESSAGE(sprint("Expecting key %s", dk)); - _reader().then([&] (auto sm) { - if (!sm) { - BOOST_FAIL(sprint("Expected: %s, got end of stream", dk)); - } - if (!sm->decorated_key().equal(*sm->schema(), dk)) { - BOOST_FAIL(sprint("Expected: %s, got: %s", dk, sm->decorated_key())); - } - }).get0(); 
+ auto mo = read_next(); + if (!mo) { + BOOST_FAIL(sprint("Expected: %s, got end of stream", dk)); + } + if (!mo->decorated_key().equal(*mo->schema(), dk)) { + BOOST_FAIL(sprint("Expected: %s, got: %s", dk, mo->decorated_key())); + } return *this; } - reader_assertions& produces(mutation m) { + reader_assertions& produces(mutation m, const query::clustering_row_ranges& ck_ranges = {}) { BOOST_TEST_MESSAGE(sprint("Expecting %s", m)); - _reader().then([] (auto sm) { - return mutation_from_streamed_mutation(std::move(sm)); - }).then([this, m = std::move(m)] (mutation_opt&& mo) mutable { - BOOST_REQUIRE(bool(mo)); - assert_that(*mo).is_equal_to(m); - }).get0(); + auto mo = read_next(); + BOOST_REQUIRE(bool(mo)); + assert_that(*mo).is_equal_to(m, ck_ranges); return *this; } mutation_assertion next_mutation() { - return _reader().then([] (auto sm) { - return mutation_from_streamed_mutation(std::move(sm)); - }).then([] (mutation_opt&& mo) mutable { - BOOST_REQUIRE(bool(mo)); - return mutation_assertion(std::move(*mo)); - }).get0(); + auto mo = read_next(); + BOOST_REQUIRE(bool(mo)); + return mutation_assertion(std::move(*mo)); } template @@ -77,20 +75,16 @@ public: reader_assertions& produces_end_of_stream() { BOOST_TEST_MESSAGE("Expecting end of stream"); - _reader().then([] (auto sm) { - return mutation_from_streamed_mutation(std::move(sm)); - }).then([this] (mutation_opt&& mo) mutable { - if (bool(mo)) { - BOOST_FAIL(sprint("Expected end of stream, got %s", *mo)); - } - }).get0(); + auto mo = read_next(); + if (bool(mo)) { + BOOST_FAIL(sprint("Expected end of stream, got %s", *mo)); + } return *this; } reader_assertions& produces_eos_or_empty_mutation() { BOOST_TEST_MESSAGE("Expecting eos or empty mutation"); - auto sm = _reader().get0(); - mutation_opt mo = mutation_from_streamed_mutation(std::move(sm)).get0(); + auto mo = read_next(); if (mo) { if (!mo->partition().empty()) { BOOST_FAIL(sprint("Mutation is not empty: %s", *mo)); diff --git a/tests/mutation_source_test.cc b/tests/mutation_source_test.cc index a523fe140927..476e1fe717dd 100644 --- a/tests/mutation_source_test.cc +++ b/tests/mutation_source_test.cc @@ -90,7 +90,7 @@ static void test_streamed_mutation_forwarding_is_consistent_with_slicing(populat } mutation sliced_m = mutation_from_streamed_mutation(sliced_sm).get0(); - assert_that(sliced_m).is_equal_to(fwd_m); + assert_that(sliced_m).is_equal_to(fwd_m, slice_with_ranges.row_ranges(*m.schema(), m.key())); } } @@ -295,9 +295,9 @@ static void test_streamed_mutation_slicing_returns_only_relevant_tombstones(popu auto sm = assert_that_stream(std::move(*smo)); sm.produces_row_with_key(keys[2]); - sm.produces_range_tombstone(rt3); + sm.produces_range_tombstone(rt3, slice.row_ranges(*s, m.key())); sm.produces_row_with_key(keys[8]); - sm.produces_range_tombstone(rt4); + sm.produces_range_tombstone(rt4, slice.row_ranges(*s, m.key())); sm.produces_end_of_stream(); } @@ -314,9 +314,9 @@ static void test_streamed_mutation_slicing_returns_only_relevant_tombstones(popu streamed_mutation_opt smo = rd().get0(); BOOST_REQUIRE(bool(smo)); assert_that_stream(std::move(*smo)) - .produces_range_tombstone(rt3) + .produces_range_tombstone(rt3, slice.row_ranges(*s, m.key())) .produces_row_with_key(keys[8]) - .produces_range_tombstone(rt4) + .produces_range_tombstone(rt4, slice.row_ranges(*s, m.key())) .produces_end_of_stream(); } } @@ -676,7 +676,7 @@ static void test_clustering_slices(populate_fn populate) { .with_range(query::clustering_range::make_singular(make_ck(2))) .build(); assert_that(ds(s, pr, 
slice)) - .produces(row6 + row7 + del_1 + del_2) + .produces(row6 + row7 + del_1 + del_2, slice.row_ranges(*s, pk.key())) .produces_end_of_stream(); } @@ -761,7 +761,6 @@ static mutation_sets generate_mutation_sets() { auto m1 = mutation(partition_key::from_single_value(*s1, to_bytes("key1")), s1); auto m2 = mutation(partition_key::from_single_value(*s2, to_bytes("key1")), s2); - result.equal.emplace_back(mutations{m1, m2}); clustering_key ck1 = clustering_key::from_deeply_exploded(*s1, {data_value(bytes("ck1_0")), data_value(bytes("ck1_1"))}); @@ -841,6 +840,14 @@ static mutation_sets generate_mutation_sets() { result.equal.emplace_back(mutations{m1, m2}); } + { + m1.partition().ensure_last_dummy(*m1.schema()); + result.equal.emplace_back(mutations{m1, m2}); + + m2.partition().ensure_last_dummy(*m2.schema()); + result.equal.emplace_back(mutations{m1, m2}); + } + { auto ts = new_timestamp(); m1.set_clustered_cell(ck2, "regular_col_1_s1", data_value(bytes("x")), ts); @@ -934,6 +941,7 @@ class random_mutation_generator::impl { std::vector _blobs; std::uniform_int_distribution _ck_index_dist{0, n_blobs - 1}; std::uniform_int_distribution _bool_dist{0, 1}; + std::uniform_int_distribution _not_dummy_dist{0, 19}; template static gc_clock::time_point expiry_dist(Generator& gen) { @@ -1164,9 +1172,14 @@ class random_mutation_generator::impl { size_t row_count = row_count_dist(_gen); for (size_t i = 0; i < row_count; ++i) { auto ckey = make_random_key(); - deletable_row& row = m.partition().clustered_row(*_schema, ckey); - set_random_cells(row.cells(), column_kind::regular_column); - row.marker() = random_row_marker(); + is_continuous continuous = is_continuous(_bool_dist(_gen)); + if (_not_dummy_dist(_gen)) { + deletable_row& row = m.partition().clustered_row(*_schema, ckey, is_dummy::no, continuous); + set_random_cells(row.cells(), column_kind::regular_column); + row.marker() = random_row_marker(); + } else { + m.partition().clustered_row(*_schema, ckey, is_dummy::yes, continuous); + } } size_t range_tombstone_count = row_count_dist(_gen); @@ -1180,6 +1193,12 @@ class random_mutation_generator::impl { m.partition().apply_row_tombstone(*_schema, range_tombstone(std::move(start), std::move(end), random_tombstone())); } + + if (_bool_dist(_gen)) { + m.partition().ensure_last_dummy(*_schema); + m.partition().clustered_rows().rbegin()->set_continuous(is_continuous(_bool_dist(_gen))); + } + return m; } }; diff --git a/tests/mutation_test.cc b/tests/mutation_test.cc index e20014c13c6f..e80bae177e0c 100644 --- a/tests/mutation_test.cc +++ b/tests/mutation_test.cc @@ -48,6 +48,7 @@ #include "cell_locking.hh" #include "disk-error-handler.hh" +#include "simple_schema.hh" thread_local disk_error_signal_type commit_error; thread_local disk_error_signal_type general_disk_error; @@ -830,7 +831,8 @@ SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) { break; // we exhausted all allocation points } catch (const std::bad_alloc&) { BOOST_TEST_MESSAGE("Checking that apply was reverted"); - assert_that(m).is_equal_to(target); + assert_that(m).is_equal_to(target) + .has_same_continuity(target); } } } @@ -851,7 +853,8 @@ SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) { assert_that(m).is_equal_to(target); // they should still commute m.apply(copy_of_second); - assert_that(m).is_equal_to(expected_apply_result); + assert_that(m).is_equal_to(expected_apply_result) + .has_same_continuity(expected_apply_result); } } } @@ -1513,3 +1516,193 @@ 
SEASTAR_TEST_CASE(test_mutation_diff_with_random_generator) { }); }); } + +SEASTAR_TEST_CASE(test_continuity_merging) { + return seastar::async([] { + simple_schema table; + auto&& s = *table.schema(); + + auto new_mutation = [&] { + return mutation(table.make_pkey(0), table.schema()); + }; + + { + auto left = new_mutation(); + auto right = new_mutation(); + auto result = new_mutation(); + + left.partition().clustered_row(s, table.make_ckey(0), is_dummy::no, is_continuous::yes); + right.partition().clustered_row(s, table.make_ckey(0), is_dummy::no, is_continuous::no); + result.partition().clustered_row(s, table.make_ckey(0), is_dummy::no, is_continuous::yes); + + left.partition().clustered_row(s, table.make_ckey(1), is_dummy::yes, is_continuous::yes); + right.partition().clustered_row(s, table.make_ckey(2), is_dummy::yes, is_continuous::no); + result.partition().clustered_row(s, table.make_ckey(1), is_dummy::yes, is_continuous::yes); + result.partition().clustered_row(s, table.make_ckey(2), is_dummy::yes, is_continuous::no); + + left.partition().clustered_row(s, table.make_ckey(3), is_dummy::yes, is_continuous::yes); + right.partition().clustered_row(s, table.make_ckey(3), is_dummy::no, is_continuous::no); + result.partition().clustered_row(s, table.make_ckey(3), is_dummy::yes, is_continuous::yes); + + left.partition().clustered_row(s, table.make_ckey(4), is_dummy::no, is_continuous::no); + right.partition().clustered_row(s, table.make_ckey(4), is_dummy::no, is_continuous::yes); + result.partition().clustered_row(s, table.make_ckey(4), is_dummy::no, is_continuous::no); + + left.partition().clustered_row(s, table.make_ckey(5), is_dummy::no, is_continuous::no); + right.partition().clustered_row(s, table.make_ckey(5), is_dummy::yes, is_continuous::yes); + result.partition().clustered_row(s, table.make_ckey(5), is_dummy::no, is_continuous::no); + + left.partition().clustered_row(s, table.make_ckey(6), is_dummy::no, is_continuous::yes); + right.partition().clustered_row(s, table.make_ckey(6), is_dummy::yes, is_continuous::no); + result.partition().clustered_row(s, table.make_ckey(6), is_dummy::no, is_continuous::yes); + + left.partition().clustered_row(s, table.make_ckey(7), is_dummy::yes, is_continuous::yes); + right.partition().clustered_row(s, table.make_ckey(7), is_dummy::yes, is_continuous::no); + result.partition().clustered_row(s, table.make_ckey(7), is_dummy::yes, is_continuous::yes); + + left.partition().clustered_row(s, table.make_ckey(8), is_dummy::yes, is_continuous::no); + right.partition().clustered_row(s, table.make_ckey(8), is_dummy::yes, is_continuous::yes); + result.partition().clustered_row(s, table.make_ckey(8), is_dummy::yes, is_continuous::no); + + assert_that(left + right).has_same_continuity(result); + } + + // static row continuity + { + auto complete = mutation(table.make_pkey(0), table.schema()); + auto incomplete = mutation(table.make_pkey(0), table.schema()); + incomplete.partition().set_static_row_continuous(false); + + assert_that(complete + complete).has_same_continuity(complete); + assert_that(complete + incomplete).has_same_continuity(complete); + assert_that(incomplete + complete).has_same_continuity(incomplete); + assert_that(incomplete + incomplete).has_same_continuity(incomplete); + } + }); + +} + +SEASTAR_TEST_CASE(test_apply_to_incomplete) { + return seastar::async([] { + simple_schema table; + auto&& s = *table.schema(); + + auto new_mutation = [&] { + return mutation(table.make_pkey(0), table.schema()); + }; + + auto mutation_with_row = [&] 
(clustering_key ck) { + auto m = new_mutation(); + table.add_row(m, ck, "v"); + return m; + }; + + // FIXME: There is no assert_that() for mutation_partition + auto assert_equal = [&] (mutation_partition mp1, mutation_partition mp2) { + auto key = table.make_pkey(0); + assert_that(mutation(table.schema(), key, std::move(mp1))) + .is_equal_to(mutation(table.schema(), key, std::move(mp2))); + }; + + auto apply = [&] (partition_entry& e, const mutation& m) { + e.apply_to_incomplete(s, partition_entry(m.partition()), s); + }; + + auto ck1 = table.make_ckey(1); + auto ck2 = table.make_ckey(2); + + BOOST_TEST_MESSAGE("Check that insert falling into discontinuous range is dropped"); + { + auto e = partition_entry(mutation_partition::make_incomplete(s)); + auto m = new_mutation(); + table.add_row(m, ck1, "v"); + apply(e, m); + assert_equal(e.squashed(s), mutation_partition::make_incomplete(s)); + } + + BOOST_TEST_MESSAGE("Check that continuity from latest version wins"); + { + auto m1 = mutation_with_row(ck2); + auto e = partition_entry(m1.partition()); + + auto snap1 = e.read(table.schema()); + + auto m2 = mutation_with_row(ck2); + apply(e, m2); + + partition_version* latest = &*e.version(); + partition_version* prev = latest->next(); + + for (rows_entry& row : prev->partition().clustered_rows()) { + row.set_continuous(is_continuous::no); + } + + auto m3 = mutation_with_row(ck1); + apply(e, m3); + assert_equal(e.squashed(s), (m2 + m3).partition()); + + // Check that snapshot data is not stolen when its entry is applied + auto e2 = partition_entry(mutation_partition(table.schema())); + e2.apply_to_incomplete(s, std::move(e), s); + assert_equal(snap1->squashed(), m1.partition()); + assert_equal(e2.squashed(s), (m2 + m3).partition()); + } + }); + +} + +SEASTAR_TEST_CASE(test_schema_upgrade_preserves_continuity) { + return seastar::async([] { + simple_schema table; + + auto new_mutation = [&] { + return mutation(table.make_pkey(0), table.schema()); + }; + + auto mutation_with_row = [&] (clustering_key ck) { + auto m = new_mutation(); + table.add_row(m, ck, "v"); + return m; + }; + + // FIXME: There is no assert_that() for mutation_partition + auto assert_entry_equal = [&] (schema_ptr e_schema, partition_entry& e, mutation m) { + auto key = table.make_pkey(0); + assert_that(mutation(e_schema, key, e.squashed(*e_schema))) + .is_equal_to(m) + .has_same_continuity(m); + }; + + auto apply = [&] (schema_ptr e_schema, partition_entry& e, const mutation& m) { + e.apply_to_incomplete(*e_schema, partition_entry(m.partition()), *m.schema()); + }; + + auto m1 = mutation_with_row(table.make_ckey(1)); + m1.partition().clustered_rows().begin()->set_continuous(is_continuous::no); + m1.partition().set_static_row_continuous(false); + m1.partition().ensure_last_dummy(*m1.schema()); + + auto e = partition_entry(m1.partition()); + auto rd1 = e.read(table.schema()); + + auto m2 = mutation_with_row(table.make_ckey(3)); + m2.partition().ensure_last_dummy(*m2.schema()); + apply(table.schema(), e, m2); + + auto new_schema = schema_builder(table.schema()).with_column("__new_column", utf8_type).build(); + + e.upgrade(table.schema(), new_schema); + rd1 = {}; + + assert_entry_equal(new_schema, e, m1 + m2); + + auto m3 = mutation_with_row(table.make_ckey(2)); + apply(new_schema, e, m3); + + auto m4 = mutation_with_row(table.make_ckey(0)); + table.add_static_row(m4, "s_val"); + apply(new_schema, e, m4); + + assert_entry_equal(new_schema, e, m1 + m2 + m3); + }); +} diff --git a/tests/perf_row_cache_update.cc 
b/tests/perf_row_cache_update.cc index 96ee48ed587a..797c76b9e401 100644 --- a/tests/perf_row_cache_update.cc +++ b/tests/perf_row_cache_update.cc @@ -73,7 +73,7 @@ int main(int argc, char** argv) { .build(); cache_tracker tracker; - row_cache cache(s, mutation_source([] (schema_ptr, auto&&) { return make_empty_reader(); }), tracker); + row_cache cache(s, make_empty_snapshot_source(), tracker); size_t partitions = app.configuration()["partitions"].as(); size_t cell_size = app.configuration()["cell-size"].as(); diff --git a/tests/row_cache_alloc_stress.cc b/tests/row_cache_alloc_stress.cc index f7f7ecb36dec..7412f311419f 100644 --- a/tests/row_cache_alloc_stress.cc +++ b/tests/row_cache_alloc_stress.cc @@ -70,10 +70,8 @@ int main(int argc, char** argv) { .with_column("v", bytes_type, column_kind::regular_column) .build(); - auto mt0 = make_lw_shared(s); - cache_tracker tracker; - row_cache cache(s, mt0->as_data_source(), tracker); + row_cache cache(s, make_empty_snapshot_source(), tracker); auto mt = make_lw_shared(s); std::vector keys; @@ -134,16 +132,16 @@ int main(int argc, char** argv) { auto fill_cache_to_the_top = [&] { std::cout << "Filling up memory with evictable data\n"; while (true) { + auto evictions_before = tracker.get_stats().evictions; // Ensure that entries matching memtable partitions are evicted // last, we want to hit the merge path in row_cache::update() for (auto&& key : keys) { cache.touch(key); } - auto occupancy_before = tracker.region().occupancy().used_space(); auto m = make_small_mutation(); cache_stuffing.push_back(m.decorated_key()); cache.populate(m); - if (tracker.region().occupancy().used_space() <= occupancy_before) { + if (tracker.get_stats().evictions > evictions_before) { break; } } diff --git a/tests/row_cache_stress_test.cc b/tests/row_cache_stress_test.cc new file mode 100644 index 000000000000..a6f48939b52a --- /dev/null +++ b/tests/row_cache_stress_test.cc @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2017 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#include +#include "seastarx.hh" +#include "tests/simple_schema.hh" +#include "core/app-template.hh" +#include "memtable.hh" +#include "row_cache.hh" +#include "partition_slice_builder.hh" +#include "utils/int_range.hh" +#include "utils/div_ceil.hh" +#include "tests/memtable_snapshot_source.hh" +#include + +#include "disk-error-handler.hh" + +logging::logger test_log("test"); + +thread_local disk_error_signal_type commit_error; +thread_local disk_error_signal_type general_disk_error; + +static thread_local bool cancelled = false; + +using namespace std::chrono_literals; + +struct table { + simple_schema s; + std::vector p_keys; + std::vector p_writetime; // committed writes + std::vector c_keys; + uint64_t mutation_phase = 0; + uint64_t mutations = 0; + uint64_t reads_started = 0; + uint64_t scans_started = 0; + + lw_shared_ptr mt; + lw_shared_ptr prev_mt; + memtable_snapshot_source underlying; + row_cache cache; + + table(unsigned partitions, unsigned rows) + : mt(make_lw_shared(s.schema())) + , underlying(s.schema()) + , cache(s.schema(), snapshot_source([this] { return underlying(); }), global_cache_tracker()) + { + p_keys = s.make_pkeys(partitions); + p_writetime.resize(p_keys.size()); + c_keys = s.make_ckeys(rows); + } + + size_t index_of_key(const dht::decorated_key& dk) { + for (auto i : boost::irange(0, p_keys.size())) { + if (p_keys[i].equal(*s.schema(), dk)) { + return i; + } + } + throw std::runtime_error(sprint("key not found: %s", dk)); + } + + sstring value_tag(int key, uint64_t phase) { + return sprint("k_0x%x_p_0x%x", key, phase); + } + + mutation get_mutation(int key, api::timestamp_type t, const sstring& tag) { + mutation m(p_keys[key], s.schema()); + for (auto ck : c_keys) { + s.add_row(m, ck, tag, t); + } + return m; + } + + // Must not be called concurrently + void flush() { + test_log.trace("flushing"); + prev_mt = std::exchange(mt, make_lw_shared(s.schema())); + auto flushed = make_lw_shared(s.schema()); + flushed->apply(*prev_mt).get(); + prev_mt->mark_flushed(flushed->as_data_source()); + underlying.apply(flushed); + test_log.trace("updating cache"); + cache.update(*prev_mt, [] (const dht::decorated_key& dk) { + return partition_presence_checker_result::maybe_exists; + }).get(); + test_log.trace("flush done"); + prev_mt = {}; + } + + void mutate_next_phase() { + test_log.trace("mutating, phase={}", mutation_phase); + for (auto i : boost::irange(0, p_keys.size())) { + auto t = s.new_timestamp(); + auto tag = value_tag(i, mutation_phase); + auto m = get_mutation(i, t, tag); + mt->apply(std::move(m)); + p_writetime[i] = t; + test_log.trace("updated key {}, {} @{}", i, tag, t); + ++mutations; + later().get(); + } + test_log.trace("mutated whole ring"); + ++mutation_phase; + // FIXME: mutate concurrently with flush + flush(); + } + + struct reader { + dht::partition_range pr; + query::partition_slice slice; + mutation_reader rd; + }; + + std::unique_ptr make_reader(dht::partition_range pr, query::partition_slice slice) { + test_log.trace("making reader, pk={} ck={}", pr, slice); + auto r = std::make_unique(reader{std::move(pr), std::move(slice)}); + std::vector rd; + if (prev_mt) { + rd.push_back(prev_mt->make_reader(s.schema(), r->pr, r->slice)); + } + rd.push_back(mt->make_reader(s.schema(), r->pr, r->slice)); + rd.push_back(cache.make_reader(s.schema(), r->pr, r->slice)); + r->rd = make_combined_reader(std::move(rd)); + return r; + } + + std::unique_ptr make_single_key_reader(int pk, int_range ck_range) { + ++reads_started; + auto slice = 
partition_slice_builder(*s.schema()) + .with_range(ck_range.transform([this] (int key) { return c_keys[key]; })) + .build(); + auto pr = dht::partition_range::make_singular(p_keys[pk]); + return make_reader(std::move(pr), std::move(slice)); + } + + std::unique_ptr make_scanning_reader() { + ++scans_started; + return make_reader(query::full_partition_range, query::full_slice); + } +}; + +struct reader_id { + sstring name; + + friend std::ostream& operator<<(std::ostream& out, reader_id id) { + return out << id.name; + } +}; + +class validating_consumer { + table& _t; + reader_id _id; + stdx::optional _value; + size_t _row_count = 0; + size_t _key = 0; + std::vector _writetimes; +public: + validating_consumer(table& t, reader_id id) + : _t(t) + , _id(id) + , _writetimes(t.p_writetime) + { } + + void consume_new_partition(const dht::decorated_key& key) { + test_log.trace("reader {}: enters partition {}", _id, key); + _value = {}; + _key = _t.index_of_key(key); + } + + stop_iteration consume_end_of_partition() { return stop_iteration::no; } + stop_iteration consume(tombstone) { return stop_iteration::no; } + stop_iteration consume(const static_row&) { return stop_iteration::no; } + stop_iteration consume(const range_tombstone&) { return stop_iteration::no; } + + stop_iteration consume(const clustering_row& row) { + ++_row_count; + sstring value; + api::timestamp_type t; + std::tie(value, t) = _t.s.get_value(row); + test_log.trace("reader {}: {} @{}, {}", _id, value, t, row); + if (_value && value != _value) { + throw std::runtime_error(sprint("Saw values from two different writes in partition %d: %s and %s", _key, _value, value)); + } + auto lowest_timestamp = _writetimes[_key]; + if (t < lowest_timestamp) { + throw std::runtime_error(sprint("Expected to see the write @%d, but saw @%d (%s), c_key=%s", lowest_timestamp, t, value, row.key())); + } + _value = std::move(value); + return stop_iteration::no; + } + + size_t consume_end_of_stream() { + test_log.trace("reader {}: done, {} rows", _id, _row_count); + return _row_count; + } +}; + +template +class monotonic_counter { + std::function _getter; + T _prev; +public: + monotonic_counter(std::function getter) + : _getter(std::move(getter)) { + _prev = _getter(); + } + // Return change in value since the last call to change() or rate(). 
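+    // In this test the counter is polled from a periodic timer (the stats printer in main())
+    // to turn cumulative counters such as reads_started into per-interval deltas.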
+ auto change() { + auto now = _getter(); + return now - std::exchange(_prev, now); + } +}; + +int main(int argc, char** argv) { + namespace bpo = boost::program_options; + app_template app; + app.add_options() + ("trace", "Enables trace-level logging for the test actions") + ("concurrency", bpo::value()->default_value(10), "Number of concurrent single partition readers") + ("scan-concurrency", bpo::value()->default_value(2), "Number of concurrent ring scanners") + ("partitions", bpo::value()->default_value(10), "Number of partitions") + ("rows", bpo::value()->default_value(10000), "Number of rows in each partitions") + ("seconds", bpo::value()->default_value(600), "Duration [s] after which the test terminates with a success") + ; + + return app.run(argc, argv, [&app] { + if (app.configuration().count("trace")) { + test_log.set_level(seastar::log_level::trace); + } + + return seastar::async([&app] { + auto concurrency = app.configuration()["concurrency"].as(); + auto scan_concurrency = app.configuration()["scan-concurrency"].as(); + auto partitions = app.configuration()["partitions"].as(); + auto rows = app.configuration()["rows"].as(); + auto seconds = app.configuration()["seconds"].as(); + + table t(partitions, rows); + + engine().at_exit([] { + cancelled = true; + return make_ready_future(); + }); + + timer<> completion_timer; + completion_timer.set_callback([&] { + test_log.info("Test done."); + cancelled = true; + }); + completion_timer.arm(std::chrono::seconds(seconds)); + + auto fail = [&] (sstring msg) { + test_log.error("{}", msg); + cancelled = true; + completion_timer.cancel(); + }; + + // Stats printer + timer<> stats_printer; + monotonic_counter reads([&] { return t.reads_started; }); + monotonic_counter scans([&] { return t.scans_started; }); + monotonic_counter mutations([&] { return t.mutations; }); + monotonic_counter flushes([&] { return t.mutation_phase; }); + stats_printer.set_callback([&] { + auto MB = 1024 * 1024; + test_log.info("reads/s: {}, scans/s: {}, mutations/s: {}, flushes/s: {}, Cache: {}/{} [MB], LSA: {}/{} [MB], std free: {} [MB]", + reads.change(), scans.change(), mutations.change(), flushes.change(), + global_cache_tracker().region().occupancy().used_space() / MB, + global_cache_tracker().region().occupancy().total_space() / MB, + logalloc::shard_tracker().region_occupancy().used_space() / MB, + logalloc::shard_tracker().region_occupancy().total_space() / MB, + seastar::memory::stats().free_memory() / MB); + }); + stats_printer.arm_periodic(1s); + + auto single_partition_reader = [&] (int i, reader_id id) { + auto n_keys = t.c_keys.size(); + + // Assign ranges so that there is ~30% overlap between adjacent readers. 
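+            // Each reader covers roughly div_ceil(n_keys, concurrency) keys, extended by about a third,
+            // with start offsets spread evenly over [0, n_keys - len], so adjacent ranges share keys.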
+ auto len = div_ceil(n_keys, concurrency); + len = std::min(n_keys, len + div_ceil(len, 3)); // so that read ranges overlap + auto start = (n_keys - len) * i / (std::max(concurrency - 1, 1u)); + int_range ck_range = make_int_range(start, start + len); + + int pk = t.p_keys.size() / 2; // FIXME: spread over 3 consecutive partitions + test_log.info("{} is using pk={} ck={}", id, pk, ck_range); + while (!cancelled) { + test_log.trace("{}: starting read", id); + auto rd = t.make_single_key_reader(pk, ck_range); + auto row_count = consume_flattened(std::move(rd->rd), validating_consumer(t, id)).get0(); + if (row_count != len) { + throw std::runtime_error(sprint("Expected %d fragments, got %d", len, row_count)); + } + } + }; + + auto scanning_reader = [&] (reader_id id) { + auto expected_row_count = t.p_keys.size() * t.c_keys.size(); + while (!cancelled) { + test_log.trace("{}: starting read", id); + auto rd = t.make_scanning_reader(); + auto row_count = consume_flattened(std::move(rd->rd), validating_consumer(t, id)).get0(); + if (row_count != expected_row_count) { + throw std::runtime_error(sprint("Expected %d fragments, got %d", expected_row_count, row_count)); + } + } + }; + + // populate the initial phase, readers expect constant fragment count. + t.mutate_next_phase(); + + auto readers = parallel_for_each(boost::irange(0u, concurrency), [&] (auto i) { + reader_id id{sprint("single-%d", i)}; + return seastar::async([&, i, id] { + single_partition_reader(i, id); + }).handle_exception([&, id] (auto e) { + fail(sprint("%s failed: %s", id, e)); + }); + }); + + auto scanning_readers = parallel_for_each(boost::irange(0u, scan_concurrency), [&] (auto i) { + reader_id id{sprint("scan-%d", i)}; + return seastar::async([&, id] { + scanning_reader(id); + }).handle_exception([&, id] (auto e) { + fail(sprint("%s failed: %s", id, e)); + }); + }); + + timer<> evictor; + evictor.set_callback([&] { + test_log.trace("evicting"); + t.cache.evict(); + }); + evictor.arm_periodic(3s); + + // Mutator + while (!cancelled) { + t.mutate_next_phase(); + } + + stats_printer.cancel(); + completion_timer.cancel(); + evictor.cancel(); + readers.get(); + scanning_readers.get(); + }); + }); +} diff --git a/tests/row_cache_test.cc b/tests/row_cache_test.cc index 57666ece9627..4c9b6092fb68 100644 --- a/tests/row_cache_test.cc +++ b/tests/row_cache_test.cc @@ -29,10 +29,12 @@ #include "tests/mutation_source_test.hh" #include "schema_builder.hh" +#include "simple_schema.hh" #include "row_cache.hh" #include "core/thread.hh" #include "memtable.hh" #include "partition_slice_builder.hh" +#include "tests/memtable_snapshot_source.hh" #include "disk-error-handler.hh" @@ -95,21 +97,37 @@ mutation make_new_mutation(schema_ptr s, int key) { return make_new_mutation(s, partition_key::from_single_value(*s, to_bytes(sprint("key%d", key)))); } +snapshot_source make_decorated_snapshot_source(snapshot_source src, std::function decorator) { + return snapshot_source([src = std::move(src), decorator = std::move(decorator)] () mutable { + return decorator(src()); + }); +} + +mutation_source make_source_with(mutation m) { + return mutation_source([m] (schema_ptr s, const dht::partition_range&, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr, streamed_mutation::forwarding fwd) { + assert(m.schema() == s); + return make_reader_returning(m, std::move(fwd)); + }); +} + +// It is assumed that src won't change. 
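+// Returning the same mutation_source for every snapshot request is only valid when the
+// underlying data does not change, as noted above; the tests below use it to adapt a
+// memtable's mutation_source (mt->as_data_source()) to the snapshot_source interface
+// now taken by row_cache.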
+snapshot_source snapshot_source_from_snapshot(mutation_source src) { + return snapshot_source([src = std::move(src)] { + return src; + }); +} + SEASTAR_TEST_CASE(test_cache_delegates_to_underlying) { return seastar::async([] { auto s = make_schema(); auto m = make_new_mutation(s); cache_tracker tracker; - row_cache cache(s, mutation_source([m] (schema_ptr s, const dht::partition_range&) { - assert(m.schema() == s); - return make_reader_returning(m); - }), tracker); + row_cache cache(s, snapshot_source_from_snapshot(make_source_with(m)), tracker); assert_that(cache.make_reader(s, query::full_partition_range)) .produces(m) .produces_end_of_stream(); - assert(tracker.uncached_wide_partitions() == 0); }); } @@ -119,10 +137,7 @@ SEASTAR_TEST_CASE(test_cache_works_after_clearing) { auto m = make_new_mutation(s); cache_tracker tracker; - row_cache cache(s, mutation_source([m] (schema_ptr s, const dht::partition_range&) { - assert(m.schema() == s); - return make_reader_returning(m); - }), tracker); + row_cache cache(s, snapshot_source_from_snapshot(make_source_with(m)), tracker); assert_that(cache.make_reader(s, query::full_partition_range)) .produces(m) @@ -157,64 +172,14 @@ mutation_reader make_counting_reader(mutation_reader mr, int& counter) { return make_mutation_reader(std::move(mr), counter); } -SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_for_wide_partition_full_range) { - return seastar::async([] { - auto s = make_schema(); - auto m = make_new_mutation(s); - int secondary_calls_count = 0; - cache_tracker tracker; - row_cache cache(s, mutation_source([&secondary_calls_count, &m] (schema_ptr s, const dht::partition_range& range) { - return make_counting_reader(make_reader_returning(m), secondary_calls_count); - }), tracker, 0); - - assert_that(cache.make_reader(s, query::full_partition_range)) - .produces(m) - .produces_end_of_stream(); - // 2 from cache reader (m & eos) + 1 from large partition read - BOOST_REQUIRE_EQUAL(secondary_calls_count, 3); - BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 1); - assert_that(cache.make_reader(s, query::full_partition_range)) - .produces(m) - .produces_end_of_stream(); - // previous 3 + 1 from large partition read - BOOST_REQUIRE_EQUAL(secondary_calls_count, 4); - BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 2); - }); -} - -SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_for_wide_partition_single_partition) { - return seastar::async([] { - auto s = make_schema(); - auto m = make_new_mutation(s); - int secondary_calls_count = 0; - cache_tracker tracker; - row_cache cache(s, mutation_source([&secondary_calls_count, &m] (schema_ptr s, const dht::partition_range& range) { - return make_counting_reader(make_reader_returning(m), secondary_calls_count); - }), tracker, 0); - - auto singular_range = dht::partition_range::make_singular(query::ring_position(m.decorated_key())); - - assert_that(cache.make_reader(s, singular_range)) - .produces(m) - .produces_end_of_stream(); - BOOST_REQUIRE_EQUAL(secondary_calls_count, 3); - BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 1); - assert_that(cache.make_reader(s, singular_range)) - .produces(m) - .produces_end_of_stream(); - BOOST_REQUIRE_EQUAL(secondary_calls_count, 5); - BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 2); - }); -} - SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_empty_full_range) { return seastar::async([] { auto s = make_schema(); int secondary_calls_count = 0; cache_tracker tracker; - row_cache cache(s, 
mutation_source([&secondary_calls_count] (schema_ptr s, const dht::partition_range& range) { + row_cache cache(s, snapshot_source_from_snapshot(mutation_source([&secondary_calls_count] (schema_ptr s, const dht::partition_range& range, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr, streamed_mutation::forwarding fwd) { return make_counting_reader(make_empty_reader(), secondary_calls_count); - }), tracker); + })), tracker); assert_that(cache.make_reader(s, query::full_partition_range)) .produces_end_of_stream(); @@ -227,26 +192,27 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_empty_full_range) void test_cache_delegates_to_underlying_only_once_with_single_partition(schema_ptr s, const mutation& m, - const dht::partition_range& range) { + const dht::partition_range& range, + int calls_to_secondary) { int secondary_calls_count = 0; cache_tracker tracker; - row_cache cache(s, mutation_source([m, &secondary_calls_count] (schema_ptr s, const dht::partition_range& range) { + row_cache cache(s, snapshot_source_from_snapshot(mutation_source([m, &secondary_calls_count] (schema_ptr s, const dht::partition_range& range, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr, streamed_mutation::forwarding fwd) { assert(m.schema() == s); if (range.contains(dht::ring_position(m.decorated_key()), dht::ring_position_comparator(*s))) { - return make_counting_reader(make_reader_returning(m), secondary_calls_count); + return make_counting_reader(make_reader_returning(m, std::move(fwd)), secondary_calls_count); } else { return make_counting_reader(make_empty_reader(), secondary_calls_count); } - }), tracker); + })), tracker); assert_that(cache.make_reader(s, range)) .produces(m) .produces_end_of_stream(); - BOOST_REQUIRE_EQUAL(secondary_calls_count, 2); + BOOST_REQUIRE_EQUAL(secondary_calls_count, calls_to_secondary); assert_that(cache.make_reader(s, range)) .produces(m) .produces_end_of_stream(); - BOOST_REQUIRE_EQUAL(secondary_calls_count, 2); + BOOST_REQUIRE_EQUAL(secondary_calls_count, calls_to_secondary); } SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_single_key_range) { @@ -254,7 +220,7 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_single_key_range) auto s = make_schema(); auto m = make_new_mutation(s); test_cache_delegates_to_underlying_only_once_with_single_partition(s, m, - dht::partition_range::make_singular(query::ring_position(m.decorated_key()))); + dht::partition_range::make_singular(query::ring_position(m.decorated_key())), 1); }); } @@ -262,7 +228,7 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_full_range) { return seastar::async([] { auto s = make_schema(); auto m = make_new_mutation(s); - test_cache_delegates_to_underlying_only_once_with_single_partition(s, m, query::full_partition_range); + test_cache_delegates_to_underlying_only_once_with_single_partition(s, m, query::full_partition_range, 2); }); } @@ -272,7 +238,7 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_range_open) { auto m = make_new_mutation(s); dht::partition_range::bound end = {dht::ring_position(m.decorated_key()), true}; dht::partition_range range = dht::partition_range::make_ending_with(end); - test_cache_delegates_to_underlying_only_once_with_single_partition(s, m, range); + test_cache_delegates_to_underlying_only_once_with_single_partition(s, m, range, 2); }); } @@ -326,17 +292,19 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_multiple_mutation } 
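    // make_cache wraps the memtable in a counting mutation_source so the assertions below can
    // verify how many times a cache miss reaches the underlying (secondary) source.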
auto make_cache = [&tracker, &mt](schema_ptr s, int& secondary_calls_count) -> lw_shared_ptr { - auto secondary = mutation_source([&mt, &secondary_calls_count] (schema_ptr s, const dht::partition_range& range) { - return make_counting_reader(mt->as_data_source()(s, range), secondary_calls_count); + auto secondary = mutation_source([&mt, &secondary_calls_count] (schema_ptr s, const dht::partition_range& range, + const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace, streamed_mutation::forwarding fwd) { + return make_counting_reader(mt->as_data_source()(s, range, slice, pc, std::move(trace), std::move(fwd)), secondary_calls_count); }); - return make_lw_shared(s, secondary, tracker); + return make_lw_shared(s, snapshot_source_from_snapshot(secondary), tracker); }; auto make_ds = [&make_cache](schema_ptr s, int& secondary_calls_count) -> mutation_source { auto cache = make_cache(s, secondary_calls_count); - return mutation_source([cache] (schema_ptr s, const dht::partition_range& range) { - return cache->make_reader(s, range); + return mutation_source([cache] (schema_ptr s, const dht::partition_range& range, + const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace, streamed_mutation::forwarding fwd) { + return cache->make_reader(s, range, slice, pc, std::move(trace), std::move(fwd)); }); }; @@ -459,8 +427,9 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_multiple_mutation }; auto cache = make_cache(s, secondary_calls_count); - auto ds = mutation_source([cache] (schema_ptr s, const dht::partition_range& range) { - return cache->make_reader(s, range); + auto ds = mutation_source([cache] (schema_ptr s, const dht::partition_range& range, + const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace, streamed_mutation::forwarding fwd) { + return cache->make_reader(s, range, slice, pc, std::move(trace), std::move(fwd)); }); test(ds, query::full_partition_range, partitions.size() + 1); @@ -497,7 +466,7 @@ SEASTAR_TEST_CASE(test_query_of_incomplete_range_goes_to_underlying) { } cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(mt->as_data_source()), tracker); auto get_partition_range = [] (const mutation& m) { return dht::partition_range::make_singular(query::ring_position(m.decorated_key())); @@ -547,7 +516,7 @@ SEASTAR_TEST_CASE(test_single_key_queries_after_population_in_reverse_order) { } cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(mt->as_data_source()), tracker); auto get_partition_range = [] (const mutation& m) { return dht::partition_range::make_singular(query::ring_position(m.decorated_key())); @@ -584,7 +553,7 @@ SEASTAR_TEST_CASE(test_row_cache_conforms_to_mutation_source) { mt->apply(m); } - auto cache = make_lw_shared(s, mt->as_data_source(), tracker); + auto cache = make_lw_shared(s, snapshot_source_from_snapshot(mt->as_data_source()), tracker); return mutation_source([cache] (schema_ptr s, const dht::partition_range& range, const query::partition_slice& slice, @@ -598,13 +567,84 @@ SEASTAR_TEST_CASE(test_row_cache_conforms_to_mutation_source) { }); } +static +mutation make_fully_continuous(const mutation& m) { + mutation res = m; + res.partition().make_fully_continuous(); + return res; +} + +SEASTAR_TEST_CASE(test_reading_from_random_partial_partition) { + return seastar::async([] { + 
cache_tracker tracker; + random_mutation_generator gen(random_mutation_generator::generate_counters::no); + + // The test primes the cache with m1, which has random continuity, + // and then applies m2 on top of it. This should result in some of m2's + // write information to be dropped. The test then verifies that we still get the + // proper m1 + m2. + + auto m1 = gen(); + auto m2 = make_fully_continuous(gen()); + + memtable_snapshot_source underlying(gen.schema()); + underlying.apply(make_fully_continuous(m1)); + + row_cache cache(gen.schema(), snapshot_source([&] { return underlying(); }), tracker); + + cache.populate(m1); // m1 is supposed to have random continuity and populate() should preserve it + + auto rd1 = cache.make_reader(gen.schema()); + auto sm1 = rd1().get0(); + + // Merge m2 into cache + auto mt = make_lw_shared(gen.schema()); + mt->apply(m2); + underlying.apply(m2); + cache.update(*mt, make_default_partition_presence_checker()).get(); + + auto rd2 = cache.make_reader(gen.schema()); + auto sm2 = rd2().get0(); + + assert_that(std::move(sm1)).has_mutation().is_equal_to(m1); + assert_that(std::move(sm2)).has_mutation().is_equal_to(m1 + m2); + }); +} + +SEASTAR_TEST_CASE(test_random_partition_population) { + return seastar::async([] { + cache_tracker tracker; + random_mutation_generator gen(random_mutation_generator::generate_counters::no); + + auto m1 = make_fully_continuous(gen()); + auto m2 = make_fully_continuous(gen()); + + memtable_snapshot_source underlying(gen.schema()); + underlying.apply(m1); + + row_cache cache(gen.schema(), snapshot_source([&] { return underlying(); }), tracker); + + assert_that(cache.make_reader(gen.schema())) + .produces(m1) + .produces_end_of_stream(); + + underlying.apply(m2); + cache.invalidate().get(); + + auto pr = dht::partition_range::make_singular(m2.decorated_key()); + assert_that(cache.make_reader(gen.schema(), pr)) + .produces(m1 + m2) + .produces_end_of_stream(); + }); +} + SEASTAR_TEST_CASE(test_eviction) { return seastar::async([] { auto s = make_schema(); auto mt = make_lw_shared(s); cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(mt->as_data_source()), tracker); std::vector keys; for (int i = 0; i < 100000; i++) { @@ -647,13 +687,96 @@ void verify_has(row_cache& cache, const mutation& m) { assert_that(reader().get0()).has_mutation().is_equal_to(m); } +void test_sliced_read_row_presence(mutation_reader reader, schema_ptr s, std::deque expected) +{ + clustering_key::equality ck_eq(*s); + + auto smopt = reader().get0(); + BOOST_REQUIRE(smopt); + auto mfopt = (*smopt)().get0(); + while (mfopt) { + if (mfopt->is_clustering_row()) { + BOOST_REQUIRE(!expected.empty()); + auto expected_ck = expected.front(); + auto ck = clustering_key_prefix::from_single_value(*s, int32_type->decompose(expected_ck)); + expected.pop_front(); + auto& cr = mfopt->as_clustering_row(); + if (!ck_eq(cr.key(), ck)) { + BOOST_FAIL(sprint("Expected %s, but got %s", ck, cr.key())); + } + } + mfopt = (*smopt)().get0(); + } + BOOST_REQUIRE(expected.empty()); + BOOST_REQUIRE(!reader().get0()); +} + +SEASTAR_TEST_CASE(test_single_partition_update) { + return seastar::async([] { + auto s = schema_builder("ks", "cf") + .with_column("pk", int32_type, column_kind::partition_key) + .with_column("ck", int32_type, column_kind::clustering_key) + .with_column("v", int32_type) + .build(); + auto pk = partition_key::from_exploded(*s, { int32_type->decompose(100) }); + auto dk = 
dht::global_partitioner().decorate_key(*s, pk); + auto range = dht::partition_range::make_singular(dk); + auto make_ck = [&s] (int v) { + return clustering_key_prefix::from_single_value(*s, int32_type->decompose(v)); + }; + auto ck1 = make_ck(1); + auto ck2 = make_ck(2); + auto ck3 = make_ck(3); + auto ck4 = make_ck(4); + auto ck7 = make_ck(7); + memtable_snapshot_source cache_mt(s); + { + mutation m(pk, s); + m.set_clustered_cell(ck1, "v", data_value(101), 1); + m.set_clustered_cell(ck2, "v", data_value(101), 1); + m.set_clustered_cell(ck4, "v", data_value(101), 1); + m.set_clustered_cell(ck7, "v", data_value(101), 1); + cache_mt.apply(m); + } + + cache_tracker tracker; + row_cache cache(s, snapshot_source([&] { return cache_mt(); }), tracker); + + { + auto slice = partition_slice_builder(*s) + .with_range(query::clustering_range::make_ending_with(ck1)) + .with_range(query::clustering_range::make_starting_with(ck4)) + .build(); + auto reader = cache.make_reader(s, range, slice); + test_sliced_read_row_presence(std::move(reader), s, {1, 4, 7}); + } + + auto mt = make_lw_shared(s); + { + mutation m(pk, s); + m.set_clustered_cell(ck3, "v", data_value(101), 1); + mt->apply(m); + cache_mt.apply(m); + } + cache.update(*mt, [] (auto&& key) { + return partition_presence_checker_result::maybe_exists; + }).get(); + + { + auto reader = cache.make_reader(s, range); + test_sliced_read_row_presence(std::move(reader), s, {1, 2, 3, 4, 7}); + } + + }); +} + SEASTAR_TEST_CASE(test_update) { return seastar::async([] { auto s = make_schema(); auto cache_mt = make_lw_shared(s); cache_tracker tracker; - row_cache cache(s, cache_mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(cache_mt->as_data_source()), tracker); BOOST_TEST_MESSAGE("Check cache miss with populate"); @@ -738,7 +861,7 @@ SEASTAR_TEST_CASE(test_update_failure) { auto cache_mt = make_lw_shared(s); cache_tracker tracker; - row_cache cache(s, cache_mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(cache_mt->as_data_source()), tracker); int partition_count = 1000; @@ -847,7 +970,7 @@ class throttled_mutation_source { private: class impl : public enable_lw_shared_from_this { mutation_source _underlying; - ::throttle _throttle; + ::throttle& _throttle; private: class reader : public mutation_reader::impl { throttle& _throttle; @@ -869,33 +992,26 @@ class throttled_mutation_source { } }; public: - impl(mutation_source underlying) + impl(::throttle& t, mutation_source underlying) : _underlying(std::move(underlying)) + , _throttle(t) { } - mutation_reader make_reader(schema_ptr s, const dht::partition_range& pr) { - return make_mutation_reader(_throttle, _underlying(s, pr)); + mutation_reader make_reader(schema_ptr s, const dht::partition_range& pr, + const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace, streamed_mutation::forwarding fwd) { + return make_mutation_reader(_throttle, _underlying(s, pr, slice, pc, std::move(trace), std::move(fwd))); } - - ::throttle& throttle() { return _throttle; } }; lw_shared_ptr _impl; public: - throttled_mutation_source(mutation_source underlying) - : _impl(make_lw_shared(std::move(underlying))) + throttled_mutation_source(throttle& t, mutation_source underlying) + : _impl(make_lw_shared(t, std::move(underlying))) { } - void block() { - _impl->throttle().block(); - } - - void unblock() { - _impl->throttle().unblock(); - } - operator mutation_source() const { - return mutation_source([this] (schema_ptr s, const 
dht::partition_range& pr) { - return _impl->make_reader(std::move(s), pr); + return mutation_source([impl = _impl] (schema_ptr s, const dht::partition_range& pr, + const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace, streamed_mutation::forwarding fwd) { + return impl->make_reader(std::move(s), pr, slice, pc, std::move(trace), std::move(fwd)); }); } }; @@ -909,10 +1025,11 @@ static std::vector updated_ring(std::vector& mutations) { } static mutation_source make_mutation_source(std::vector>& memtables) { - return mutation_source([&memtables] (schema_ptr s, const dht::partition_range& pr) { + return mutation_source([&memtables] (schema_ptr s, const dht::partition_range& pr, + const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace, streamed_mutation::forwarding fwd) { std::vector readers; for (auto&& mt : memtables) { - readers.emplace_back(mt->make_reader(s, pr)); + readers.emplace_back(mt->make_reader(s, pr, slice, pc, trace, fwd)); } return make_combined_reader(std::move(readers)); }); @@ -923,14 +1040,14 @@ SEASTAR_TEST_CASE(test_continuity_flag_and_invalidate_race) { auto s = make_schema(); lw_shared_ptr mt = make_lw_shared(s); - cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); - auto ring = make_ring(s, 4); for (auto&& m : ring) { mt->apply(m); } + cache_tracker tracker; + row_cache cache(s, snapshot_source_from_snapshot(mt->as_data_source()), tracker); + // Bring ring[2]and ring[3] to cache. auto range = dht::partition_range::make_starting_with({ ring[2].ring_position(), true }); assert_that(cache.make_reader(s, range)) @@ -958,7 +1075,7 @@ SEASTAR_TEST_CASE(test_continuity_flag_and_invalidate_race) { .produces(ring[2]); // Invalidate whole cache. 
- cache.clear().get(); + cache.invalidate().get(); rd.produces(ring[3]) .produces_end_of_stream(); @@ -976,17 +1093,21 @@ SEASTAR_TEST_CASE(test_continuity_flag_and_invalidate_race) { SEASTAR_TEST_CASE(test_cache_population_and_update_race) { return seastar::async([] { auto s = make_schema(); - std::vector> memtables; - throttled_mutation_source cache_source(make_mutation_source(memtables)); + memtable_snapshot_source memtables(s); + throttle thr; + auto cache_source = make_decorated_snapshot_source(snapshot_source([&] { return memtables(); }), [&] (mutation_source src) { + return throttled_mutation_source(thr, std::move(src)); + }); cache_tracker tracker; - row_cache cache(s, cache_source, tracker); auto mt1 = make_lw_shared(s); - memtables.push_back(mt1); auto ring = make_ring(s, 3); for (auto&& m : ring) { mt1->apply(m); } + memtables.apply(*mt1); + + row_cache cache(s, cache_source, tracker); auto mt2 = make_lw_shared(s); auto ring2 = updated_ring(ring); @@ -994,7 +1115,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) { mt2->apply(m); } - cache_source.block(); + thr.block(); auto m0_range = dht::partition_range::make_singular(ring[0].ring_position()); auto rd1 = cache.make_reader(s, m0_range); @@ -1004,9 +1125,8 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) { auto rd2_result = rd2(); sleep(10ms).get(); - auto mt2_flushed = make_lw_shared(s); - mt2_flushed->apply(*mt2).get(); - memtables.push_back(mt2_flushed); + + memtables.apply(*mt2); // This update should miss on all partitions auto update_future = cache.update(*mt2, make_default_partition_presence_checker()); @@ -1014,7 +1134,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) { auto rd3 = cache.make_reader(s); // rd2, which is in progress, should not prevent forward progress of update() - cache_source.unblock(); + thr.unblock(); update_future.get(); // Reads started before memtable flush should return previous value, otherwise this test @@ -1049,7 +1169,7 @@ SEASTAR_TEST_CASE(test_invalidate) { auto mt = make_lw_shared(s); cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(mt->as_data_source()), tracker); int partition_count = 1000; @@ -1104,17 +1224,21 @@ SEASTAR_TEST_CASE(test_invalidate) { SEASTAR_TEST_CASE(test_cache_population_and_clear_race) { return seastar::async([] { auto s = make_schema(); - std::vector> memtables; - throttled_mutation_source cache_source(make_mutation_source(memtables)); + memtable_snapshot_source memtables(s); + throttle thr; + auto cache_source = make_decorated_snapshot_source(snapshot_source([&] { return memtables(); }), [&] (mutation_source src) { + return throttled_mutation_source(thr, std::move(src)); + }); cache_tracker tracker; - row_cache cache(s, cache_source, tracker); auto mt1 = make_lw_shared(s); - memtables.push_back(mt1); auto ring = make_ring(s, 3); for (auto&& m : ring) { mt1->apply(m); } + memtables.apply(*mt1); + + row_cache cache(s, std::move(cache_source), tracker); auto mt2 = make_lw_shared(s); auto ring2 = updated_ring(ring); @@ -1122,7 +1246,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_clear_race) { mt2->apply(m); } - cache_source.block(); + thr.block(); auto rd1 = cache.make_reader(s); auto rd1_result = rd1(); @@ -1130,15 +1254,15 @@ SEASTAR_TEST_CASE(test_cache_population_and_clear_race) { sleep(10ms).get(); memtables.clear(); - memtables.push_back(mt2); + memtables.apply(*mt2); // This update should miss on all partitions - auto cache_cleared = cache.clear(); + 
auto cache_cleared = cache.invalidate(); auto rd2 = cache.make_reader(s); // rd1, which is in progress, should not prevent forward progress of clear() - cache_source.unblock(); + thr.unblock(); cache_cleared.get(); // Reads started before memtable flush should return previous value, otherwise this test @@ -1169,19 +1293,14 @@ SEASTAR_TEST_CASE(test_cache_population_and_clear_race) { SEASTAR_TEST_CASE(test_mvcc) { return seastar::async([] { - auto no_difference = [] (auto& m1, auto& m2) { - return m1.partition().difference(m1.schema(), m2.partition()).empty() - && m2.partition().difference(m1.schema(), m1.partition()).empty(); - }; - - auto test = [&no_difference] (const mutation& m1, const mutation& m2, bool with_active_memtable_reader) { + auto test = [&] (const mutation& m1, const mutation& m2, bool with_active_memtable_reader) { auto s = m1.schema(); - auto mt = make_lw_shared(s); + memtable_snapshot_source underlying(s); partition_key::equality eq(*s); cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source([&] { return underlying(); }), tracker); auto pk = m1.key(); cache.populate(m1); @@ -1197,8 +1316,7 @@ SEASTAR_TEST_CASE(test_mvcc) { auto mt1 = make_lw_shared(s); mt1->apply(m2); - auto m12 = m1; - m12.apply(m2); + auto m12 = m1 + m2; stdx::optional mt1_reader_opt; stdx::optional mt1_reader_sm_opt; @@ -1208,7 +1326,9 @@ SEASTAR_TEST_CASE(test_mvcc) { BOOST_REQUIRE(*mt1_reader_sm_opt); } + underlying.apply(*mt1); cache.update(*mt1, make_default_partition_presence_checker()).get(); + auto sm3 = cache.make_reader(s)().get0(); BOOST_REQUIRE(sm3); BOOST_REQUIRE(eq(sm3->key(), pk)); @@ -1221,51 +1341,45 @@ SEASTAR_TEST_CASE(test_mvcc) { BOOST_REQUIRE(sm5); BOOST_REQUIRE(eq(sm5->key(), pk)); - stdx::optional previous; - position_in_partition::less_compare cmp(*sm3->schema()); - auto mf = (*sm3)().get0(); - while (mf) { - if (previous) { - BOOST_REQUIRE(cmp(*previous, mf->position())); - } - previous = position_in_partition(mf->position()); - mf = (*sm3)().get0(); - } - sm3 = { }; + assert_that_stream(std::move(*sm3)).has_monotonic_positions(); if (with_active_memtable_reader) { assert(mt1_reader_sm_opt); auto mt1_reader_mutation = mutation_from_streamed_mutation(std::move(*mt1_reader_sm_opt)).get0(); BOOST_REQUIRE(mt1_reader_mutation); - BOOST_REQUIRE(no_difference(m2, *mt1_reader_mutation)); + assert_that(*mt1_reader_mutation).is_equal_to(m2); } auto m_4 = mutation_from_streamed_mutation(std::move(sm4)).get0(); - BOOST_REQUIRE(no_difference(m12, *m_4)); + assert_that(*m_4).is_equal_to(m12); auto m_1 = mutation_from_streamed_mutation(std::move(sm1)).get0(); - BOOST_REQUIRE(no_difference(m1, *m_1)); + assert_that(*m_1).is_equal_to(m1); - cache.clear().get0(); + cache.invalidate().get0(); auto m_2 = mutation_from_streamed_mutation(std::move(sm2)).get0(); - BOOST_REQUIRE(no_difference(m1, *m_2)); + assert_that(*m_2).is_equal_to(m1); auto m_5 = mutation_from_streamed_mutation(std::move(sm5)).get0(); - BOOST_REQUIRE(no_difference(m12, *m_5)); + assert_that(*m_5).is_equal_to(m12); }; - for_each_mutation_pair([&] (const mutation& m1, const mutation& m2_, are_equal) { - if (m1.schema() != m2_.schema()) { + for_each_mutation_pair([&] (const mutation& m1_, const mutation& m2_, are_equal) { + if (m1_.schema() != m2_.schema()) { return; } - if (m1.partition().empty() || m2_.partition().empty()) { + if (m1_.partition().empty() || m2_.partition().empty()) { return; } - auto s = m1.schema(); + auto s = m1_.schema(); + + auto m1 = m1_; + 
m1.partition().make_fully_continuous(); auto m2 = mutation(m1.decorated_key(), m1.schema()); m2.partition().apply(*s, m2_.partition(), *s); + m2.partition().make_fully_continuous(); test(m1, m2, false); test(m1, m2, true); @@ -1273,25 +1387,6 @@ SEASTAR_TEST_CASE(test_mvcc) { }); } -void test_sliced_read_row_presence(mutation_reader reader, schema_ptr s, const query::partition_slice& ps, std::deque expected) -{ - clustering_key::equality ck_eq(*s); - - auto smopt = reader().get0(); - BOOST_REQUIRE(smopt); - auto mfopt = (*smopt)().get0(); - while (mfopt) { - if (mfopt->is_clustering_row()) { - auto& cr = mfopt->as_clustering_row(); - BOOST_REQUIRE(ck_eq(cr.key(), clustering_key_prefix::from_single_value(*s, int32_type->decompose(expected.front())))); - expected.pop_front(); - } - mfopt = (*smopt)().get0(); - } - - BOOST_REQUIRE(!reader().get0()); -} - SEASTAR_TEST_CASE(test_slicing_mutation_reader) { return seastar::async([] { auto s = schema_builder("ks", "cf") @@ -1312,27 +1407,27 @@ SEASTAR_TEST_CASE(test_slicing_mutation_reader) { mt->apply(m); cache_tracker tracker; - row_cache cache(s, mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(mt->as_data_source()), tracker); auto run_tests = [&] (auto& ps, std::deque expected) { - cache.clear().get0(); + cache.invalidate().get0(); auto reader = cache.make_reader(s, query::full_partition_range, ps); - test_sliced_read_row_presence(std::move(reader), s, ps, expected); + test_sliced_read_row_presence(std::move(reader), s, expected); reader = cache.make_reader(s, query::full_partition_range, ps); - test_sliced_read_row_presence(std::move(reader), s, ps, expected); + test_sliced_read_row_presence(std::move(reader), s, expected); auto dk = dht::global_partitioner().decorate_key(*s, pk); auto singular_range = dht::partition_range::make_singular(dk); reader = cache.make_reader(s, singular_range, ps); - test_sliced_read_row_presence(std::move(reader), s, ps, expected); + test_sliced_read_row_presence(std::move(reader), s, expected); - cache.clear().get0(); + cache.invalidate().get0(); reader = cache.make_reader(s, singular_range, ps); - test_sliced_read_row_presence(std::move(reader), s, ps, expected); + test_sliced_read_row_presence(std::move(reader), s, expected); }; { @@ -1387,7 +1482,7 @@ SEASTAR_TEST_CASE(test_lru) { auto cache_mt = make_lw_shared(s); cache_tracker tracker; - row_cache cache(s, cache_mt->as_data_source(), tracker); + row_cache cache(s, snapshot_source_from_snapshot(cache_mt->as_data_source()), tracker); int partition_count = 10; @@ -1438,3 +1533,223 @@ SEASTAR_TEST_CASE(test_lru) { .produces_end_of_stream(); }); } + +SEASTAR_TEST_CASE(test_update_invalidating) { + return seastar::async([] { + simple_schema s; + cache_tracker tracker; + memtable_snapshot_source underlying(s.schema()); + + auto mutation_for_key = [&] (dht::decorated_key key) { + mutation m(key, s.schema()); + s.add_row(m, s.make_ckey(0), "val"); + return m; + }; + + auto keys = s.make_pkeys(4); + + auto m1 = mutation_for_key(keys[1]); + underlying.apply(m1); + + auto m2 = mutation_for_key(keys[3]); + underlying.apply(m2); + + row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker); + + assert_that(cache.make_reader(s.schema())) + .produces(m1) + .produces(m2) + .produces_end_of_stream(); + + auto mt = make_lw_shared(s.schema()); + + auto m3 = mutation_for_key(m1.decorated_key()); + auto m4 = mutation_for_key(keys[2]); + auto m5 = mutation_for_key(keys[0]); + mt->apply(m3); + mt->apply(m4); + 
mt->apply(m5); + + underlying.apply(*mt); + cache.update_invalidating(*mt).get(); + + assert_that(cache.make_reader(s.schema())) + .produces(m5) + .produces(m1 + m3) + .produces(m4) + .produces(m2) + .produces_end_of_stream(); + }); +} + +SEASTAR_TEST_CASE(test_scan_with_partial_partitions) { + return seastar::async([] { + simple_schema s; + auto cache_mt = make_lw_shared(s.schema()); + + auto pkeys = s.make_pkeys(3); + + mutation m1(pkeys[0], s.schema()); + s.add_row(m1, s.make_ckey(0), "v1"); + s.add_row(m1, s.make_ckey(1), "v2"); + s.add_row(m1, s.make_ckey(2), "v3"); + s.add_row(m1, s.make_ckey(3), "v4"); + cache_mt->apply(m1); + + mutation m2(pkeys[1], s.schema()); + s.add_row(m2, s.make_ckey(0), "v5"); + s.add_row(m2, s.make_ckey(1), "v6"); + s.add_row(m2, s.make_ckey(2), "v7"); + cache_mt->apply(m2); + + mutation m3(pkeys[2], s.schema()); + s.add_row(m3, s.make_ckey(0), "v8"); + s.add_row(m3, s.make_ckey(1), "v9"); + s.add_row(m3, s.make_ckey(2), "v10"); + cache_mt->apply(m3); + + cache_tracker tracker; + row_cache cache(s.schema(), snapshot_source_from_snapshot(cache_mt->as_data_source()), tracker); + + // partially populate all up to middle of m1 + { + auto slice = partition_slice_builder(*s.schema()) + .with_range(query::clustering_range::make_ending_with(s.make_ckey(1))) + .build(); + auto prange = dht::partition_range::make_ending_with(dht::ring_position(m1.decorated_key())); + assert_that(cache.make_reader(s.schema(), prange, slice)) + .produces(m1, slice.row_ranges(*s.schema(), m1.key())) + .produces_end_of_stream(); + } + + // partially populate m3 + { + auto slice = partition_slice_builder(*s.schema()) + .with_range(query::clustering_range::make_ending_with(s.make_ckey(1))) + .build(); + auto prange = dht::partition_range::make_singular(m3.decorated_key()); + assert_that(cache.make_reader(s.schema(), prange, slice)) + .produces(m3, slice.row_ranges(*s.schema(), m3.key())) + .produces_end_of_stream(); + } + + // full scan + assert_that(cache.make_reader(s.schema())) + .produces(m1) + .produces(m2) + .produces(m3) + .produces_end_of_stream(); + + // full scan after full scan + assert_that(cache.make_reader(s.schema())) + .produces(m1) + .produces(m2) + .produces(m3) + .produces_end_of_stream(); + }); +} + +SEASTAR_TEST_CASE(test_cache_populates_partition_tombstone) { + return seastar::async([] { + simple_schema s; + auto cache_mt = make_lw_shared(s.schema()); + + auto pkeys = s.make_pkeys(2); + + mutation m1(pkeys[0], s.schema()); + s.add_static_row(m1, "val"); + m1.partition().apply(tombstone(s.new_timestamp(), gc_clock::now())); + cache_mt->apply(m1); + + mutation m2(pkeys[1], s.schema()); + s.add_static_row(m2, "val"); + m2.partition().apply(tombstone(s.new_timestamp(), gc_clock::now())); + cache_mt->apply(m2); + + cache_tracker tracker; + row_cache cache(s.schema(), snapshot_source_from_snapshot(cache_mt->as_data_source()), tracker); + + // singular range case + { + auto prange = dht::partition_range::make_singular(dht::ring_position(m1.decorated_key())); + assert_that(cache.make_reader(s.schema(), prange)) + .produces(m1) + .produces_end_of_stream(); + + assert_that(cache.make_reader(s.schema(), prange)) // over populated + .produces(m1) + .produces_end_of_stream(); + } + + // range scan case + { + assert_that(cache.make_reader(s.schema())) + .produces(m1) + .produces(m2) + .produces_end_of_stream(); + + assert_that(cache.make_reader(s.schema())) // over populated + .produces(m1) + .produces(m2) + .produces_end_of_stream(); + } + }); +} + +// Tests the case of cache 
+// Tests the case of cache reader having to reconcile a range tombstone
+// from the underlying mutation source which overlaps with previously emitted
+// tombstones.
+SEASTAR_TEST_CASE(test_tombstone_merging_in_partial_partition) {
+    return seastar::async([] {
+        simple_schema s;
+        cache_tracker tracker;
+        memtable_snapshot_source underlying(s.schema());
+
+        auto pk = s.make_pkey(0);
+        auto pr = dht::partition_range::make_singular(pk);
+
+        tombstone t0{s.new_timestamp(), gc_clock::now()};
+        tombstone t1{s.new_timestamp(), gc_clock::now()};
+
+        mutation m1(pk, s.schema());
+        m1.partition().apply_delete(*s.schema(),
+            s.make_range_tombstone(query::clustering_range::make(s.make_ckey(0), s.make_ckey(10)), t0));
+        underlying.apply(m1);
+
+        mutation m2(pk, s.schema());
+        m2.partition().apply_delete(*s.schema(),
+            s.make_range_tombstone(query::clustering_range::make(s.make_ckey(3), s.make_ckey(6)), t1));
+        m2.partition().apply_delete(*s.schema(),
+            s.make_range_tombstone(query::clustering_range::make(s.make_ckey(7), s.make_ckey(12)), t1));
+        s.add_row(m2, s.make_ckey(4), "val");
+        s.add_row(m2, s.make_ckey(8), "val");
+        underlying.apply(m2);
+
+        row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker);
+
+        {
+            auto slice = partition_slice_builder(*s.schema())
+                .with_range(query::clustering_range::make_singular(s.make_ckey(4)))
+                .build();
+
+            assert_that(cache.make_reader(s.schema(), pr, slice))
+                .produces(m1 + m2, slice.row_ranges(*s.schema(), pk.key()))
+                .produces_end_of_stream();
+        }
+
+        {
+            auto slice = partition_slice_builder(*s.schema())
+                .with_range(query::clustering_range::make_starting_with(s.make_ckey(4)))
+                .build();
+
+            assert_that(cache.make_reader(s.schema(), pr, slice))
+                .produces(m1 + m2, slice.row_ranges(*s.schema(), pk.key()))
+                .produces_end_of_stream();
+
+            auto rd = cache.make_reader(s.schema(), pr, slice);
+            auto smo = rd().get0();
+            BOOST_REQUIRE(smo);
+            assert_that_stream(std::move(*smo)).has_monotonic_positions();
+        }
+    });
+}
diff --git a/tests/simple_schema.hh b/tests/simple_schema.hh
index ee3eaecaa828..efa298a1c304 100644
--- a/tests/simple_schema.hh
+++ b/tests/simple_schema.hh
@@ -28,15 +28,18 @@
 #include "keys.hh"
 #include "streamed_mutation.hh"
 #include "mutation.hh"
+#include "schema_builder.hh"
+#include "streamed_mutation.hh"
 
 // Helper for working with the following table:
 //
-// CREATE TABLE ks.cf (pk utf8, ck utf8, v utf8, s1 utf8 static, PRIMARY KEY (pk, ck));
+// CREATE TABLE ks.cf (pk text, ck text, v text, s1 text static, PRIMARY KEY (pk, ck));
 //
 class simple_schema {
     schema_ptr _s;
     api::timestamp_type _timestamp = api::min_timestamp;
-private:
+    const column_definition& _v_def;
+public:
     api::timestamp_type new_timestamp() {
         return _timestamp++;
     }
@@ -48,6 +51,7 @@ public:
             .with_column("s1", utf8_type, column_kind::static_column)
             .with_column("v", utf8_type)
             .build())
+        , _v_def(*_s->get_column_definition(to_bytes("v")))
     { }
 
     clustering_key make_ckey(sstring ck) {
@@ -70,8 +74,23 @@ public:
         return dht::global_partitioner().decorate_key(*_s, key);
     }
 
-    void add_row(mutation& m, const clustering_key& key, sstring v) {
-        m.set_clustered_cell(key, to_bytes("v"), data_value(v), new_timestamp());
+    void add_row(mutation& m, const clustering_key& key, const sstring& v, api::timestamp_type t = api::missing_timestamp) {
+        if (t == api::missing_timestamp) {
+            t = new_timestamp();
+        }
+        m.set_clustered_cell(key, _v_def, atomic_cell::make_live(t, data_value(v).serialize()));
+    }
+
+    std::pair<sstring, api::timestamp_type> get_value(const clustering_row& row) {
+        auto cell = row.cells().find_cell(_v_def.id);
+        if (!cell) {
+            throw std::runtime_error("cell not found");
+        }
+        atomic_cell_view ac = cell->as_atomic_cell();
+        if (!ac.is_live()) {
+            throw std::runtime_error("cell is dead");
+        }
+        return std::make_pair(value_cast<sstring>(utf8_type->deserialize(ac.value())), ac.timestamp());
     }
 
     mutation_fragment make_row(const clustering_key& key, sstring v) {
@@ -91,9 +110,12 @@ public:
         return rt;
     }
 
-    range_tombstone make_range_tombstone(const query::clustering_range& range) {
+    range_tombstone make_range_tombstone(const query::clustering_range& range, tombstone t = {}) {
         auto bv_range = bound_view::from_range(range);
-        range_tombstone rt(bv_range.first, bv_range.second, tombstone(new_timestamp(), gc_clock::now()));
+        if (!t) {
+            t = tombstone(new_timestamp(), gc_clock::now());
+        }
+        range_tombstone rt(bv_range.first, bv_range.second, t);
         return rt;
     }
 
@@ -114,4 +136,13 @@ public:
         std::sort(keys.begin(), keys.end(), dht::decorated_key::less_comparator(_s));
         return keys;
     }
+
+    // Returns n clustering keys in their natural order
+    std::vector<clustering_key> make_ckeys(int n) {
+        std::vector<clustering_key> keys;
+        for (int i = 0; i < n; ++i) {
+            keys.push_back(make_ckey(i));
+        }
+        return keys;
+    }
 };
diff --git a/tests/streamed_mutation_test.cc b/tests/streamed_mutation_test.cc
index fde5bafbccd4..2ff294f81727 100644
--- a/tests/streamed_mutation_test.cc
+++ b/tests/streamed_mutation_test.cc
@@ -29,8 +29,10 @@
 #include "tests/test_services.hh"
 #include "schema_builder.hh"
 #include "total_order_check.hh"
+#include "schema_upgrader.hh"
 
 #include "disk-error-handler.hh"
+#include "mutation_assertions.hh"
 
 thread_local disk_error_signal_type commit_error;
 thread_local disk_error_signal_type general_disk_error;
@@ -241,7 +243,7 @@ SEASTAR_TEST_CASE(test_fragmenting_and_freezing_streamed_mutations) {
             return make_ready_future<>();
         }, 1).get0();
 
-        auto expected_fragments = m.partition().clustered_rows().calculate_size()
+        auto expected_fragments = boost::size(m.partition().non_dummy_rows())
                                 + m.partition().row_tombstones().size()
                                 + !m.partition().static_row().empty();
         BOOST_REQUIRE_EQUAL(fms.size(), std::max(expected_fragments, size_t(1)));
@@ -538,3 +540,21 @@ SEASTAR_TEST_CASE(test_ordering_of_position_in_partition_and_composite_view_in_a
             .check();
     });
 }
+
+SEASTAR_TEST_CASE(test_schema_upgrader_is_equivalent_with_mutation_upgrade) {
+    return seastar::async([] {
+        for_each_mutation_pair([](const mutation& m1, const mutation& m2, are_equal eq) {
+            if (m1.schema()->version() != m2.schema()->version()) {
+                // upgrade m1 to m2's schema
+
+                auto from_upgrader = mutation_from_streamed_mutation(
+                    transform(streamed_mutation_from_mutation(m1), schema_upgrader(m2.schema()))).get0();
+
+                auto regular = m1;
+                regular.upgrade(m2.schema());
+
+                assert_that(from_upgrader).has_mutation().is_equal_to(regular);
+            }
+        });
+    });
+}
diff --git a/utils/int_range.hh b/utils/int_range.hh
new file mode 100644
index 000000000000..ecc405bf00a0
--- /dev/null
+++ b/utils/int_range.hh
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "range.hh"
+#include
+
+using int_range = nonwrapping_range<int>;
+
+inline
+unsigned cardinality(const int_range& r) {
+    assert(r.start());
+    assert(r.end());
+    return r.end()->value() - r.start()->value() + r.start()->is_inclusive() + r.end()->is_inclusive() - 1;
+}
+
+inline
+unsigned cardinality(const stdx::optional<int_range>& ropt) {
+    return ropt ? cardinality(*ropt) : 0;
+}
+
+inline
+stdx::optional<int_range> intersection(const int_range& a, const int_range& b) {
+    auto int_tri_cmp = [] (int x, int y) {
+        return x < y ? -1 : (x > y ? 1 : 0);
+    };
+    return a.intersection(b, int_tri_cmp);
+}
+
+inline
+int_range make_int_range(int start_inclusive, int end_exclusive) {
+    if (end_exclusive <= start_inclusive) {
+        throw std::runtime_error(sprint("invalid range: [%d, %d)", start_inclusive, end_exclusive));
+    }
+    return int_range({start_inclusive}, {end_exclusive - 1});
+}
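
The helpers in utils/int_range.hh store both bounds inclusively, while make_int_range() accepts a half-open [start, end) pair. The minimal sketch below is illustrative only and not part of the patch; it assumes a translation unit that can include utils/int_range.hh, and the example function name is hypothetical:

// Illustrative only, not part of the patch above.
#include <cassert>
#include "utils/int_range.hh"

void int_range_helpers_example() {
    // make_int_range(0, 3) normalizes the half-open input to the inclusive range [0, 2].
    int_range r = make_int_range(0, 3);
    assert(cardinality(r) == 3);                // {0, 1, 2}

    // intersection() yields an engaged optional only when the inputs overlap.
    auto common = intersection(make_int_range(0, 5), make_int_range(3, 10));
    assert(common);                             // [0, 4] and [3, 9] overlap in [3, 4]
    assert(cardinality(common) == 2);           // {3, 4}

    // Disjoint inputs produce a disengaged optional, which cardinality() maps to 0.
    assert(cardinality(intersection(make_int_range(0, 2), make_int_range(5, 7))) == 0);
}

Note that the single-range cardinality() overload asserts that both bounds are present, so it is only meaningful for fully bounded ranges such as those produced by make_int_range().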