From ec034b92c04602083139caf0976f9e2cdbd2fc61 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 11 Jun 2023 09:47:52 +0300 Subject: [PATCH 1/6] mutation_test: test_cell_ordering: improve debuggability Currently, it is hard to tell which of the many sub-cases fail in this unit test, in case any of them fails. This change uses logging in debug and trace level to help with that by reproducing the error with --logger-log-level testlog=trace (The cases are deterministic so reproducing should not be a problem) Signed-off-by: Benny Halevy --- test/boost/mutation_test.cc | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/test/boost/mutation_test.cc b/test/boost/mutation_test.cc index f9b4fbd65ed1..311801a0f517 100644 --- a/test/boost/mutation_test.cc +++ b/test/boost/mutation_test.cc @@ -687,14 +687,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) { auto expiry_2 = now + ttl_2; auto assert_order = [] (atomic_cell_view first, atomic_cell_view second) { - if (compare_atomic_cell_for_merge(first, second) >= 0) { - testlog.trace("Expected {} < {}", first, second); - abort(); - } - if (compare_atomic_cell_for_merge(second, first) <= 0) { - testlog.trace("Expected {} < {}", second, first); - abort(); - } + testlog.trace("Expected {} < {}", first, second); + BOOST_REQUIRE(compare_atomic_cell_for_merge(first, second) < 0); + + testlog.trace("Expected {} > {}", second, first); + BOOST_REQUIRE(compare_atomic_cell_for_merge(second, first) > 0); }; auto assert_equal = [] (atomic_cell_view c1, atomic_cell_view c2) { @@ -703,18 +700,22 @@ SEASTAR_TEST_CASE(test_cell_ordering) { BOOST_REQUIRE(compare_atomic_cell_for_merge(c2, c1) == 0); }; + testlog.debug("Live cells with same value are equal"); assert_equal( atomic_cell::make_live(*bytes_type, 0, bytes("value")), atomic_cell::make_live(*bytes_type, 0, bytes("value"))); + testlog.debug("Non-expiring live cells are ordered before expiring cells"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes("value")), atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1)); + testlog.debug("Dead cells with same expiry are equal"); assert_equal( atomic_cell::make_dead(1, expiry_1), atomic_cell::make_dead(1, expiry_1)); + testlog.debug("Non-expiring live cells are ordered before expiring cells, with empty value"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes()), atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_2)); @@ -722,49 +723,52 @@ SEASTAR_TEST_CASE(test_cell_ordering) { // Origin doesn't compare ttl (is it wise?) // But we do. See https://github.com/scylladb/scylla/issues/10156 // and https://github.com/scylladb/scylla/issues/10173 + testlog.debug("Expiring cells with higher ttl are ordered before expiring cells with smaller ttl and same expiry time"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2), atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1)); + testlog.debug("Cells are ordered by value if all else is equal"); assert_order( atomic_cell::make_live(*bytes_type, 0, bytes("value1")), atomic_cell::make_live(*bytes_type, 0, bytes("value2"))); + testlog.debug("Cells are ordered by value in lexicographical order if all else is equal"); assert_order( atomic_cell::make_live(*bytes_type, 0, bytes("value12")), atomic_cell::make_live(*bytes_type, 0, bytes("value2"))); - // Live cells are ordered first by timestamp... + testlog.debug("Live cells are ordered first by timestamp..."); assert_order( atomic_cell::make_live(*bytes_type, 0, bytes("value2")), atomic_cell::make_live(*bytes_type, 1, bytes("value1"))); - // ..then by value + testlog.debug("...then by value"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes("value1"), expiry_2, ttl_2), atomic_cell::make_live(*bytes_type, 1, bytes("value2"), expiry_1, ttl_1)); - // ..then by expiry + testlog.debug("...then by expiry"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_1, ttl_1), atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_1)); - // Dead wins + testlog.debug("Dead wins"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes("value")), atomic_cell::make_dead(1, expiry_1)); - // Dead wins with expiring cell + testlog.debug("Dead wins with expiring cell"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_2, ttl_2), atomic_cell::make_dead(1, expiry_1)); - // Deleted cells are ordered first by timestamp + testlog.debug("Deleted cells are ordered first by timestamp..."); assert_order( atomic_cell::make_dead(1, expiry_2), atomic_cell::make_dead(2, expiry_1)); - // ...then by expiry + testlog.debug("...then by expiry"); assert_order( atomic_cell::make_dead(1, expiry_1), atomic_cell::make_dead(1, expiry_2)); From 761d62cd82e7063bfdbe4fa32284fd510a31e607 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Thu, 8 Jun 2023 10:47:53 +0300 Subject: [PATCH 2/6] compare_atomic_cell_for_merge: compare value last for live cells Currently, when two cells have the same write timestamp and both are alive or expiring, we compare their value first, before checking if either of them is expiring and if both are expiring, comparing their expiration time and ttl value to determine which of them will expire later or was written later. This was changed in CASSANDRA-14592 for consistency with the preference for dead cells over live cells, as expiring cells will become tombstones at a future time and then they'd win over live cells with the same timestamp, hence they should win also before expiration. In addition, comparing the cell value before expiration can lead to unintuitive corner cases where rewriting a cell using the same timestamp but different TTL may cause scylla to return the cell with null value if it expired in the meanwhile. Also, when multiple columns are written using two upserts using the same write timestamp but with different expiration, selecting cells by their value may return a mixed result where each cell is selected individually from either upsert, by picking the cells with the largest values for each column, while using the expiration time to break tie will lead to a more consistent results where a set of cell from only one of the upserts will be selected. Fixes scylladb/scylladb#14182 Signed-off-by: Benny Halevy --- mutation/atomic_cell.cc | 7 ++----- test/boost/mutation_test.cc | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/mutation/atomic_cell.cc b/mutation/atomic_cell.cc index 185182cf0029..1f3febad165c 100644 --- a/mutation/atomic_cell.cc +++ b/mutation/atomic_cell.cc @@ -79,10 +79,6 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) { return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater; } if (left.is_live()) { - auto c = compare_unsigned(left.value(), right.value()) <=> 0; - if (c != 0) { - return c; - } if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) { // prefer expiring cells. return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less; @@ -90,12 +86,13 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) { if (left.is_live_and_has_ttl()) { if (left.expiry() != right.expiry()) { return left.expiry() <=> right.expiry(); - } else { + } else if (right.ttl() != left.ttl()) { // prefer the cell that was written later, // so it survives longer after it expires, until purged. return right.ttl() <=> left.ttl(); } } + return compare_unsigned(left.value(), right.value()); } else { // Both are deleted diff --git a/test/boost/mutation_test.cc b/test/boost/mutation_test.cc index 311801a0f517..3837dbc6420b 100644 --- a/test/boost/mutation_test.cc +++ b/test/boost/mutation_test.cc @@ -710,6 +710,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) { atomic_cell::make_live(*bytes_type, 1, bytes("value")), atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1)); + testlog.debug("Non-expiring live cells are ordered before expiring cells, regardless of their value"); + assert_order( + atomic_cell::make_live(*bytes_type, 1, bytes("value2")), + atomic_cell::make_live(*bytes_type, 1, bytes("value1"), expiry_1, ttl_1)); + testlog.debug("Dead cells with same expiry are equal"); assert_equal( atomic_cell::make_dead(1, expiry_1), @@ -743,16 +748,21 @@ SEASTAR_TEST_CASE(test_cell_ordering) { atomic_cell::make_live(*bytes_type, 0, bytes("value2")), atomic_cell::make_live(*bytes_type, 1, bytes("value1"))); - testlog.debug("...then by value"); - assert_order( - atomic_cell::make_live(*bytes_type, 1, bytes("value1"), expiry_2, ttl_2), - atomic_cell::make_live(*bytes_type, 1, bytes("value2"), expiry_1, ttl_1)); - testlog.debug("...then by expiry"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_1, ttl_1), atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_1)); + testlog.debug("...then by ttl (in reverse)"); + assert_order( + atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_1, ttl_2), + atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_1, ttl_1)); + + testlog.debug("...then by value"); + assert_order( + atomic_cell::make_live(*bytes_type, 1, bytes("value1"), expiry_1, ttl_1), + atomic_cell::make_live(*bytes_type, 1, bytes("value2"), expiry_1, ttl_1)); + testlog.debug("Dead wins"); assert_order( atomic_cell::make_live(*bytes_type, 1, bytes("value")), From 6717e45ff0362c794ed405c1ab623a533bf2c383 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 11 Jun 2023 10:52:12 +0300 Subject: [PATCH 3/6] atomic_cell: compare_atomic_cell_for_merge: update and add documentation Signed-off-by: Benny Halevy --- mutation/atomic_cell.cc | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/mutation/atomic_cell.cc b/mutation/atomic_cell.cc index 1f3febad165c..312e34046703 100644 --- a/mutation/atomic_cell.cc +++ b/mutation/atomic_cell.cc @@ -66,32 +66,43 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other) set_view(_data); } -// Based on: -// - org.apache.cassandra.db.AbstractCell#reconcile() -// - org.apache.cassandra.db.BufferExpiringCell#reconcile() -// - org.apache.cassandra.db.BufferDeletedCell#reconcile() +// Based on Cassandra's resolveRegular function: +// - https://github.com/apache/cassandra/blob/e4f31b73c21b04966269c5ac2d3bd2562e5f6c63/src/java/org/apache/cassandra/db/rows/Cells.java#L79-L119 std::strong_ordering compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) { + // Largest write timestamp wins. if (left.timestamp() != right.timestamp()) { return left.timestamp() <=> right.timestamp(); } + // Tombstones always win reconciliation with live cells of the same timestamp if (left.is_live() != right.is_live()) { return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater; } if (left.is_live()) { + // Prefer expiring cells (which will become tombstones at some future date) over live cells. + // See https://issues.apache.org/jira/browse/CASSANDRA-14592 if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) { - // prefer expiring cells. return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less; } + // If both are expiring, choose the cell with the latest expiry or derived write time. if (left.is_live_and_has_ttl()) { + // Prefer cell with latest expiry if (left.expiry() != right.expiry()) { return left.expiry() <=> right.expiry(); } else if (right.ttl() != left.ttl()) { - // prefer the cell that was written later, - // so it survives longer after it expires, until purged. + // The cell write time is derived by (expiry - ttl). + // Prefer the cell that was written later, + // so it survives longer after it expires, until purged, + // as it become purgeable gc_grace_seconds after it was written. + // + // Note that this is an extension to Cassandra's algorithm + // which stops at the expiration time, and if equal, + // move forward to compare the cell values. return right.ttl() <=> left.ttl(); } } + // The cell with the largest value wins, if all other attributes of the cells are identical. + // This is quite arbitrary, but still required to break the tie in a deterministic way. return compare_unsigned(left.value(), right.value()); } else { // Both are deleted From 0aa13f70eb4a3965d8ffa5fa85696229494648fe Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 13 Jun 2023 10:00:10 +0300 Subject: [PATCH 4/6] mutation_partition: compare_row_marker_for_merge: consider ttl in case expiry is the same As in compare_atomic_cell_for_merge, we want to consider the row marker ttl for ordering, in case both are expiring and have the same expiration time. This was missed in a57c087c897f60c331d8c9c3abcb125aef9250a2 and a085ef74ff0910012880e7be7973352e655dc91d. With that in mind, add documentation to compare_row_marker_for_merge and a mutual note to both functions about their equivalence. Signed-off-by: Benny Halevy --- mutation/atomic_cell.cc | 4 ++++ mutation/mutation_partition.cc | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/mutation/atomic_cell.cc b/mutation/atomic_cell.cc index 312e34046703..2e3cd31d1910 100644 --- a/mutation/atomic_cell.cc +++ b/mutation/atomic_cell.cc @@ -68,6 +68,10 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other) // Based on Cassandra's resolveRegular function: // - https://github.com/apache/cassandra/blob/e4f31b73c21b04966269c5ac2d3bd2562e5f6c63/src/java/org/apache/cassandra/db/rows/Cells.java#L79-L119 +// +// Note: the ordering algorithm for cell is the same as for rows, +// except that the cell value is used to break a tie in case all other attributes are equal. +// See compare_row_marker_for_merge. std::strong_ordering compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) { // Largest write timestamp wins. diff --git a/mutation/mutation_partition.cc b/mutation/mutation_partition.cc index 2aa19c5cc9d7..04abab0c80ee 100644 --- a/mutation/mutation_partition.cc +++ b/mutation/mutation_partition.cc @@ -1108,20 +1108,34 @@ operator<<(std::ostream& os, const mutation_partition::printer& p) { constexpr gc_clock::duration row_marker::no_ttl; constexpr gc_clock::duration row_marker::dead; +// Note: the ordering algorithm for rows is the same as for cells, +// except that there is no cell value to break a tie in case all other attributes are equal. +// See compare_atomic_cell_for_merge. int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept { + // Largest write timestamp wins. if (left.timestamp() != right.timestamp()) { return left.timestamp() > right.timestamp() ? 1 : -1; } + // Tombstones always win reconciliation with live rows of the same timestamp if (left.is_live() != right.is_live()) { return left.is_live() ? -1 : 1; } if (left.is_live()) { + // Prefer expiring rows (which will become tombstones at some future date) over live rows. + // See https://issues.apache.org/jira/browse/CASSANDRA-14592 if (left.is_expiring() != right.is_expiring()) { // prefer expiring cells. return left.is_expiring() ? 1 : -1; } - if (left.is_expiring() && left.expiry() != right.expiry()) { - return left.expiry() < right.expiry() ? -1 : 1; + // If both are expiring, choose the cell with the latest expiry or derived write time. + if (left.is_expiring()) { + if (left.expiry() != right.expiry()) { + return left.expiry() < right.expiry() ? -1 : 1; + } else if (left.ttl() != right.ttl()) { + // The cell write time is derived by (expiry - ttl). + // Prefer row that was written later (and has a smaller ttl). + return left.ttl() < right.ttl() ? 1 : -1; + } } } else { // Both are either deleted or missing From 31a3152a59e98b2ad1ac52d80166ca1cdd1df959 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Thu, 8 Jun 2023 12:04:09 +0300 Subject: [PATCH 5/6] cql-pytest: test_using_timestamp: add tests for rewrites using same timestamp Add reproducers for #14182: test_rewrite_different_values_using_same_timestamp verifies expiration-based cell reconciliation. test_rewrite_different_values_using_same_timestamp_and_expiration is a scylla_only test, verifying that when two cells with same timestamp and same expiration are compared, the one with the lesser ttl prevails. test_rewrite_using_same_timestamp_select_after_expiration reproduces the specific issue hit in #14182 where a cell is selected after it expires since it has a lexicographically larger value than the other cell with later expiration. test_rewrite_multiple_cells_using_same_timestamp verifies atomicity of inserts of multiple columns, with a TTL. Signed-off-by: Benny Halevy --- test/cql-pytest/test_using_timestamp.py | 160 +++++++++++++++++++++++- 1 file changed, 159 insertions(+), 1 deletion(-) diff --git a/test/cql-pytest/test_using_timestamp.py b/test/cql-pytest/test_using_timestamp.py index 8eb8307a7205..a302a8eaa3a5 100644 --- a/test/cql-pytest/test_using_timestamp.py +++ b/test/cql-pytest/test_using_timestamp.py @@ -18,10 +18,20 @@ @pytest.fixture(scope="session") def table1(cql, test_keyspace): table = test_keyspace + "." + unique_name() - cql.execute(f"CREATE TABLE {table} (k int PRIMARY KEY, v int)") + cql.execute(f"CREATE TABLE {table} (k int PRIMARY KEY, v int, w int)") yield table cql.execute("DROP TABLE " + table) +# sync with wall-clock on exact second so that expiration won't cross the whole-second boundary +# 100 milliseconds should be enough to execute 2 inserts at the same second in debug mode +# sleep until the next whole second mark if there is not enough time left on the clock +def ensure_sync_with_tick(millis = 100): + t = time.time() + while t - int(t) >= 1 - millis / 1000: + time.sleep(1 - (t - int(t))) + t = time.time() + return t + # In Cassandra, timestamps can be any *signed* 64-bit integer, not including # the most negative 64-bit integer (-2^63) which for deletion times is # reserved for marking *not deleted* cells. @@ -90,3 +100,151 @@ def test_key_writetime(cql, table1): cql.execute(f'SELECT writetime(k) FROM {table1}') with pytest.raises(InvalidRequest, match='PRIMARY KEY part k|TTL is not legal on partition key component k'): cql.execute(f'SELECT ttl(k) FROM {table1}') + +def test_rewrite_different_values_using_same_timestamp(cql, table1): + """ + Rewriting cells more than once with the same timestamp + requires tie-breaking to decide which of the cells prevails. + When the two inserts are non-expiring or when they have the same expiration time, + cells are selected based on the higher value. + Otherwise, expiring cells are preferred over non-expiring ones, + and if both are expiring, the one with the later expiration time wins. + """ + table = table1 + ts = 1000 + values = [[1, 2], [2, 1]] + for i in range(len(values)): + v1, v2 = values[i] + + def assert_value(k, expected): + select = f"SELECT k, v FROM {table} WHERE k = {k}" + res = list(cql.execute(select)) + assert len(res) == 1 + assert res[0].v == expected + + # With no TTL, highest value wins + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v1}) USING TIMESTAMP {ts}") + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v2}) USING TIMESTAMP {ts}") + assert_value(k, max(v1, v2)) + + # Expiring cells are preferred over non-expiring + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v1}) USING TIMESTAMP {ts}") + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v2}) USING TIMESTAMP {ts} and TTL 1") + assert_value(k, v2) + + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v1}) USING TIMESTAMP {ts} and TTL 1") + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v2}) USING TIMESTAMP {ts}") + assert_value(k, v1) + + # When both are expiring, the one with the later expiration time wins + ensure_sync_with_tick() + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v1}) USING TIMESTAMP {ts} and TTL 1") + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v2}) USING TIMESTAMP {ts} and TTL 2") + assert_value(k, v2) + + ensure_sync_with_tick() + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v1}) USING TIMESTAMP {ts} and TTL 2") + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v2}) USING TIMESTAMP {ts} and TTL 1") + assert_value(k, v1) + +def test_rewrite_different_values_using_same_timestamp_and_expiration(scylla_only, cql, table1): + """ + Rewriting cells more than once with the same timestamp + requires tie-breaking to decide which of the cells prevails. + When the two inserts are expiring and have the same expiration time, + scylla selects the cells with the lower ttl. + """ + table = table1 + ts = 1000 + values = [[1, 2], [2, 1]] + for i in range(len(values)): + v1, v2 = values[i] + + def assert_value(k, expected): + select = f"SELECT k, v FROM {table} WHERE k = {k}" + res = list(cql.execute(select)) + assert len(res) == 1 + assert res[0].v == expected + + # When both have the same expiration, the one with the lower TTL wins (as it has higher derived write time = expiration - ttl) + ensure_sync_with_tick() + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v1}) USING TIMESTAMP {ts} and TTL 3") + time.sleep(1) + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v2}) USING TIMESTAMP {ts} and TTL 2") + assert_value(k, v2) + +def test_rewrite_using_same_timestamp_select_after_expiration(cql, table1): + """ + Reproducer for https://github.com/scylladb/scylladb/issues/14182 + + Rewrite a cell using the same timestamp and ttl. + Due to #14182, after the first insert expires, + the first write would have been selected when it has a lexicographically larger + value, and that results in a null value in the select query result. + With the fix, we expect to get the cell with the higher expiration time. + """ + table = table1 + ts = 1000 + values = [[2, 1], [1, 2]] + for i in range(len(values)): + v1, v2 = values[i] + + def assert_value(k, expected): + select = f"SELECT k, v FROM {table} WHERE k = {k}" + res = list(cql.execute(select)) + assert len(res) == 1 + assert res[0].v == expected + + ensure_sync_with_tick() + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v1}) USING TIMESTAMP {ts} AND TTL 1") + cql.execute(f"INSERT INTO {table} (k, v) VALUES ({k}, {v2}) USING TIMESTAMP {ts} AND TTL 2") + + # wait until first insert expires, and expect 2nd value. + # Null value was returned due to #14182 when v1 > v2 + time.sleep(1) + assert_value(k, v2) + +def test_rewrite_multiple_cells_using_same_timestamp(cql, table1): + """ + Reproducer for https://github.com/scylladb/scylladb/issues/14182: + + Inserts multiple cells in two insert queries that use the same timestamp and different expiration. + Due to #14182, the select query result contained a mixture + of the inserts that is based on the value in each cell, + rather than on the (different) expiration times on the + two inserts. + """ + table = table1 + ts = 1000 + ttl1 = 10 + ttl2 = 20 + values = [{'v':1, 'w':2}, {'v':2, 'w':1}] + + def assert_values(k, expected): + select = f"SELECT * FROM {table} WHERE k = {k}" + res = list(cql.execute(select)) + assert len(res) == 1 + assert res[0].k == k and res[0].v == expected['v'] and res[0].w == expected['w'] + + # rewrite values once with and once without TTL + # if reconciliation is done by value, the result will be a mix of the two writes + # while if reconciliation is based first on the expiration time, the second write should prevail. + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v, w) VALUES ({k}, {values[0]['v']}, {values[0]['w']}) USING TIMESTAMP {ts} AND TTL {ttl1}") + cql.execute(f"INSERT INTO {table} (k, v, w) VALUES ({k}, {values[1]['v']}, {values[1]['w']}) USING TIMESTAMP {ts}") + assert_values(k, values[0]) + + # rewrite values using the same write time and different ttls, so they get different expiration times + # if reconciliation is done by value, the result will be a mix of the two writes + # while if reconciliation is based first on the expiration time, the second write should prevail. + k = unique_key_int() + cql.execute(f"INSERT INTO {table} (k, v, w) VALUES ({k}, {values[0]['v']}, {values[0]['w']}) USING TIMESTAMP {ts} AND TTL {ttl1}") + cql.execute(f"INSERT INTO {table} (k, v, w) VALUES ({k}, {values[1]['v']}, {values[1]['w']}) USING TIMESTAMP {ts} AND TTL {ttl2}") + assert_values(k, values[1]) From 26ff8f7bf78a75f7e513e701e5f945ed6b311b9d Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 11 Jun 2023 11:53:46 +0300 Subject: [PATCH 6/6] docs: dml: add update ordering section and add docs/dev/timestamp-conflict-resolution.md to document the details of the conflict resolution algorithm. Refs scylladb/scylladb#14063 Signed-off-by: Benny Halevy --- docs/cql/dml.rst | 61 ++++++++++++++++++++--- docs/dev/timestamp-conflict-resolution.md | 37 ++++++++++++++ 2 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 docs/dev/timestamp-conflict-resolution.md diff --git a/docs/cql/dml.rst b/docs/cql/dml.rst index 7cbf3a27b532..132ee7ad27fd 100644 --- a/docs/cql/dml.rst +++ b/docs/cql/dml.rst @@ -592,7 +592,7 @@ of eventual consistency on an event of a timestamp collision: ``INSERT`` statements happening concurrently at different cluster nodes proceed without coordination. Eventually cell values -supplied by a statement with the highest timestamp will prevail. +supplied by a statement with the highest timestamp will prevail (see :ref:`update ordering `). Unless a timestamp is provided by the client, Scylla will automatically generate a timestamp with microsecond precision for each @@ -601,7 +601,7 @@ by the same node are unique. Timestamps assigned at different nodes are not guaranteed to be globally unique. With a steadily high write rate timestamp collision is not unlikely. If it happens, i.e. two ``INSERTS`` have the same -timestamp, the lexicographically bigger value prevails: +timestamp, a conflict resolution algorithm determines which of the inserted cells prevails (see :ref:`update ordering `). Please refer to the :ref:`UPDATE ` section for more information on the :token:`update_parameter`. @@ -709,8 +709,8 @@ Similarly to ``INSERT``, ``UPDATE`` statement happening concurrently at differen cluster nodes proceed without coordination. Cell values supplied by a statement with the highest timestamp will prevail. If two ``UPDATE`` statements or ``UPDATE`` and ``INSERT`` -statements have the same timestamp, -lexicographically bigger value prevails. +statements have the same timestamp, a conflict resolution algorithm determines which cells prevails +(see :ref:`update ordering `). Regarding the :token:`assignment`: @@ -749,7 +749,7 @@ parameters: Scylla ensures that query timestamps created by the same coordinator node are unique (even across different shards on the same node). However, timestamps assigned at different nodes are not guaranteed to be globally unique. Note that with a steadily high write rate, timestamp collision is not unlikely. If it happens, e.g. two INSERTS - have the same timestamp, conflicting cell values are compared and the cells with the lexicographically bigger value prevail. + have the same timestamp, a conflict resolution algorithm determines which of the inserted cells prevails (see :ref:`update ordering ` for more information): - ``TTL``: specifies an optional Time To Live (in seconds) for the inserted values. If set, the inserted values are automatically removed from the database after the specified time. Note that the TTL concerns the inserted values, not the columns themselves. This means that any subsequent update of the column will also reset the TTL (to whatever TTL @@ -759,6 +759,55 @@ parameters: - ``TIMEOUT``: specifies a timeout duration for the specific request. Please refer to the :ref:`SELECT ` section for more information. +.. _update-ordering: + +Update ordering +~~~~~~~~~~~~~~~ + +:ref:`INSERT `, :ref:`UPDATE `, and :ref:`DELETE ` +operations are ordered by their ``TIMESTAMP``. + +Ordering of such changes is done at the cell level, where each cell carries a write ``TIMESTAMP``, +other attributes related to its expiration when it has a non-zero time-to-live (``TTL``), +and the cell value. + +The fundamental rule for ordering cells that insert, update, or delete data in a given row and column +is that the cell with the highest timestamp wins. + +However, it is possible that multiple such cells will carry the same ``TIMESTAMP``. +There could be several reasons for ``TIMESTAMP`` collision: + +* Benign collision can be caused by "replay" of a mutation, e.g., due to client retry, or due to internal processes. + In such cases, the cells are equivalent, and any of them can be selected arbitrarily. +* ``TIMESTAMP`` collisions might be normally caused by parallel queries that are served + by different coordinator nodes. The coordinators might calculate the same write ``TIMESTAMP`` + based on their local time in microseconds. +* Collisions might also happen with user-provided timestamps if the application does not guarantee + unique timestamps with the ``USING TIMESTAMP`` parameter (see :ref:`Update parameters ` for more information). + +As said above, in the replay case, ordering of cells should not matter, as they carry the same value +and same expiration attributes, so picking any of them will reach the same result. +However, other ``TIMESTAMP`` conflicts must be resolved in a consistent way by all nodes. +Otherwise, if nodes would have picked an arbitrary cell in case of a conflict and they would +reach different results, reading from different replicas would detect the inconsistency and trigger +read-repair that will generate yet another cell that would still conflict with the existing cells, +with no guarantee for convergence. + +Therefore, Scylla implements an internal, consistent conflict-resolution algorithm +that orders cells with conflicting ``TIMESTAMP`` values based on other properties, like: + +* whether the cell is a tombstone or a live cell, +* whether the cell has an expiration time, +* the cell ``TTL``, +* and finally, what value the cell carries. + +The conflict-resolution algorithm is documented in `Scylla's internal documentation `_ +and it may be subject to change. + +Reliable serialization can be achieved using unique write ``TIMESTAMP`` +and by using :doc:`Lightweight Transactions (LWT) ` to ensure atomicity of +:ref:`INSERT `, :ref:`UPDATE `, and :ref:`DELETE `. + .. _delete_statement: DELETE @@ -798,7 +847,7 @@ For more information on the :token:`update_parameter` refer to the :ref:`UPDATE In a ``DELETE`` statement, all deletions within the same partition key are applied atomically, meaning either all columns mentioned in the statement are deleted or none. If ``DELETE`` statement has the same timestamp as ``INSERT`` or -``UPDATE`` of the same primary key, delete operation prevails. +``UPDATE`` of the same primary key, delete operation prevails (see :ref:`update ordering `). A ``DELETE`` operation can be conditional through the use of an ``IF`` clause, similar to ``UPDATE`` and ``INSERT`` statements. Each such ``DELETE`` gets a globally unique timestamp. diff --git a/docs/dev/timestamp-conflict-resolution.md b/docs/dev/timestamp-conflict-resolution.md new file mode 100644 index 000000000000..421a979dfd4d --- /dev/null +++ b/docs/dev/timestamp-conflict-resolution.md @@ -0,0 +1,37 @@ +# Timestamp conflict resolution + +The fundamental rule for ordering cells that insert, update, or delete data in a given row and column +is that the cell with the highest timestamp wins. + +However, it is possible that multiple such cells will carry the same `TIMESTAMP`. +In this case, conflicts must be resolved in a consistent way by all nodes. +Otherwise, if nodes would have picked an arbitrary cell in case of a conflict and they would +reach different results, reading from different replicas would detect the inconsistency and trigger +read-repair that will generate yet another cell that would still conflict with the existing cells, +with no guarantee for convergence. + +The first tie-breaking rule when two cells have the same write timestamp is that +dead cells win over live cells; and if both cells are deleted, the one with the later deletion time prevails. + +If both cells are alive, their expiration time is examined. +Cells that are written with a non-zero TTL (either implicit, as determined by +the table's default TTL, or explicit, `USING TTL`) are due to expire +TTL seconds after the time they were written (as determined by the coordinator, +and rounded to 1 second resolution). That time is the cell's expiration time. +When cells expire, they become tombstones, shadowing any data written with a write timestamp +less than or equal to the timestamp of the expiring cell. +Therefore, cells that have an expiration time win over cells with no expiration time. + +If both cells have an expiration time, the one with the latest expiration time wins; +and if they have the same expiration time (in whole second resolution), +their write time is derived from the expiration time less the original time-to-live value +and the one that was written at a later time prevails. + +Finally, if both cells are live and have no expiration, or have the same expiration time and time-to-live, +the cell with the lexicographically bigger value prevails. + +Note that when multiple columns are INSERTed or UPDATEed using the same timestamp, +SELECTing those columns might return a result that mixes cells from either upsert. +This may happen when both upserts have no expiration time, or both their expiration time and TTL are the +same, respectively (in whole second resolution). In such a case, cell selection would be based on the cell values +in each column, independently of each other.