Skip to content

Commit

Permalink
Merge ' cql3/prepare_context: fix generating pk_indexes for duplicate…
Browse files Browse the repository at this point in the history
… named bind variables' from Jan Ciołek

When presented with queries that use the same named bind variables twice, like this one:
```cql
SELECT p FROM table WHERE p = :x AND c = :x
```

Scylla generated empty `partition_key_bind_indexes` (`pk_indexes`).
`pk_indexes` tell the driver which bind variables it should use to calculate the partition token for a query. Without it, the driver is unable to determine the token and it will send the query to a random node.

Scylla should generate pk_indexes which tell the driver that it can use bind variable with `bind_index = 0` to calculate the partition token for this query.

The problem was that `_target_columns` keep only a single target_column for each bind variable.
In the example above `:x` is compared with both `p` and `c`, but `_target_columns` would contain only one of them, and Scylla wasn't able to tell that this bind variable is compared with a partition key column.

To fix it, let's replace `_target_columns` with `_targets`. `_targets` keeps all comparisons
between bind variables and other expressions, so none of them will be forgotten/overwritten.

A `cql-pytest` reproducer is added.

I also added some comments in `prepare_context.hh/cc` to make it easier to read.

Fixes: #15374

Closes #15526

* github.com:scylladb/scylladb:
  cql-pytest/test-prepare: remove xfail marker from *pk_indexes_duplicate_named_variables
  cql3/prepare_context: fix generating pk_indexes for duplicate named bind variables
  cql3: improve readability of prepare_context
  cql-pytest: test generation of pk indexes during PREPARE
  • Loading branch information
avikivity committed Sep 26, 2023
2 parents 9dea205 + 649b634 commit 301b0a9
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 26 deletions.
13 changes: 8 additions & 5 deletions cql3/Cql.g
Expand Up @@ -147,8 +147,11 @@ using uexpression = uninitialized<expression>;

listener_type* listener;

std::vector<::shared_ptr<cql3::column_identifier>> _bind_variables;
// index into _bind_variables
// Keeps the names of all bind variables. For bind variables without a name ('?'), the name is nullptr.
// Maps bind_index -> name.
std::vector<::shared_ptr<cql3::column_identifier>> _bind_variable_names;

// Maps name -> bind_index for all named bind variables.
std::unordered_map<cql3::column_identifier, size_t> _named_bind_variables_indexes;
std::vector<std::unique_ptr<TokenType>> _missing_tokens;

Expand All @@ -172,8 +175,8 @@ using uexpression = uninitialized<expression>;
if (name && _named_bind_variables_indexes.contains(*name)) {
return bind_variable{_named_bind_variables_indexes[*name]};
}
auto marker = bind_variable{_bind_variables.size()};
_bind_variables.push_back(name);
auto marker = bind_variable{_bind_variable_names.size()};
_bind_variable_names.push_back(name);
if (name) {
_named_bind_variables_indexes[*name] = marker.bind_index;
}
Expand Down Expand Up @@ -321,7 +324,7 @@ query returns [std::unique_ptr<raw::parsed_statement> stmnt]
;
cqlStatement returns [std::unique_ptr<raw::parsed_statement> stmt]
@after{ if (stmt) { stmt->set_bound_variables(_bind_variables); } }
@after{ if (stmt) { stmt->set_bound_variables(_bind_variable_names); } }
: st1= selectStatement { $stmt = std::move(st1); }
| st2= insertStatement { $stmt = std::move(st2); }
| st3= updateStatement { $stmt = std::move(st3); }
Expand Down
36 changes: 18 additions & 18 deletions cql3/prepare_context.cc
Expand Up @@ -19,22 +19,22 @@ size_t prepare_context::bound_variables_size() const {
}

const std::vector<lw_shared_ptr<column_specification>>& prepare_context::get_variable_specifications() const & {
return _specs;
return _variable_specs;
}

std::vector<lw_shared_ptr<column_specification>> prepare_context::get_variable_specifications() && {
return std::move(_specs);
return std::move(_variable_specs);
}

std::vector<uint16_t> prepare_context::get_partition_key_bind_indexes(const schema& schema) const {
auto count = schema.partition_key_columns().size();
std::vector<uint16_t> partition_key_positions(count, uint16_t(0));
std::vector<bool> set(count, false);
for (size_t i = 0; i < _target_columns.size(); i++) {
auto& target_column = _target_columns[i];
const auto* cdef = target_column ? schema.get_column_definition(target_column->name->name()) : nullptr;

for (auto&& [bind_index, target_spec] : _targets) {
const auto* cdef = target_spec ? schema.get_column_definition(target_spec->name->name()) : nullptr;
if (cdef && cdef->is_partition_key()) {
partition_key_positions[cdef->position()] = i;
partition_key_positions[cdef->position()] = bind_index;
set[cdef->position()] = true;
}
}
Expand All @@ -48,30 +48,30 @@ std::vector<uint16_t> prepare_context::get_partition_key_bind_indexes(const sche

void prepare_context::add_variable_specification(int32_t bind_index, lw_shared_ptr<column_specification> spec) {
auto name = _variable_names[bind_index];
if (_specs[bind_index]) {
if (_variable_specs[bind_index]) {
// If the same variable is used in multiple places, check that the types are compatible
if (&spec->type->without_reversed() != &_specs[bind_index]->type->without_reversed()) {
if (&spec->type->without_reversed() != &_variable_specs[bind_index]->type->without_reversed()) {
throw exceptions::invalid_request_exception(
fmt::format("variable :{} has type {} which doesn't match {}",
*name, _specs[bind_index]->type->as_cql3_type(), spec->name));
*name, _variable_specs[bind_index]->type->as_cql3_type(), spec->name));
}
}
_target_columns[bind_index] = spec;
_targets.emplace_back(bind_index, spec);
// Use the user name, if there is one
if (name) {
spec = make_lw_shared<column_specification>(spec->ks_name, spec->cf_name, name, spec->type);
}
_specs[bind_index] = spec;
_variable_specs[bind_index] = spec;
}

void prepare_context::set_bound_variables(const std::vector<shared_ptr<column_identifier>>& prepare_meta) {
_variable_names = prepare_meta;
_specs.clear();
_target_columns.clear();
void prepare_context::set_bound_variables(const std::vector<shared_ptr<column_identifier>>& bind_variable_names) {
_variable_names = bind_variable_names;
_variable_specs.clear();
_targets.clear();

const size_t bn_size = prepare_meta.size();
_specs.resize(bn_size);
_target_columns.resize(bn_size);
const size_t bn_size = bind_variable_names.size();
_variable_specs.resize(bn_size);
_targets.resize(bn_size);
}

void prepare_context::clear_pk_function_calls_cache() {
Expand Down
15 changes: 12 additions & 3 deletions cql3/prepare_context.hh
Expand Up @@ -33,9 +33,18 @@ namespace functions { class function_call; }
*/
class prepare_context final {
private:
// Keeps names of all the bind variables. For bind variables without a name ('?'), the name is nullptr.
// Maps bind_index -> name.
std::vector<shared_ptr<column_identifier>> _variable_names;
std::vector<lw_shared_ptr<column_specification>> _specs;
std::vector<lw_shared_ptr<column_specification>> _target_columns;

// Keeps column_specification for every bind_index. column_specification describes the name and type of this variable.
std::vector<lw_shared_ptr<column_specification>> _variable_specs;

// For every expression like (<target> = <bind variable>), there's a pair of (bind_index, target column_specification) in _targets.
// Collecting all equalities of bind variables allows to determine which of the variables set the value of partition key columns.
// The driver needs this information in order to compute the partition token and send the request to the right node.
std::vector<std::pair<std::size_t, lw_shared_ptr<column_specification>>> _targets;

// A list of pointers to prepared `function_call` cache ids, that
// participate in partition key ranges computation within an LWT statement.
std::vector<::shared_ptr<std::optional<uint8_t>>> _pk_function_calls_cache_ids;
Expand All @@ -61,7 +70,7 @@ public:

void add_variable_specification(int32_t bind_index, lw_shared_ptr<column_specification> spec);

void set_bound_variables(const std::vector<shared_ptr<column_identifier>>& prepare_meta);
void set_bound_variables(const std::vector<shared_ptr<column_identifier>>& bind_variable_names);

void clear_pk_function_calls_cache();

Expand Down
152 changes: 152 additions & 0 deletions test/cql-pytest/test_prepare.py
@@ -0,0 +1,152 @@
# Copyright 2023-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later

#############################################################################
# Tests for preparing various kinds of statements. When a client asks to prepare
# a statement, Scylla has to process it and return the correct prepared statement
# metadata. The metadata contains information about the keyspace, table and bind variables.
# Let's ensure that this information is correct.
# Here's the description of prepared metadata in CQL protocol spec:
# https://github.com/apache/cassandra/blob/1959502d8b16212479eecb076c89945c3f0f180c/doc/native_protocol_v4.spec#L675

import pytest
from util import new_test_table


@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int, c int, PRIMARY KEY (p, c)") as table:
yield table


@pytest.fixture(scope="module")
def table2(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p1 int, p2 int, p3 int, p4 int, c1 int, c2 int, r1 int, r2 int, PRIMARY KEY ((p1, p2, p3, p4), c1, c2)") as table:
yield table

# The following tests test the generation of "pk indexes"
# "pk indexes" tell the driver which bind variable values it should use to calculate the partition token, so that it can send queries to the correct shard.
# https://github.com/apache/cassandra/blob/1959502d8b16212479eecb076c89945c3f0f180c/doc/native_protocol_v4.spec#L699-L707


# Test generating pk indexes for a single column partition key.
def test_single_pk_indexes(cql, table1):
prepared = cql.prepare(f"SELECT p FROM {table1} WHERE p = ?")
assert prepared.routing_key_indexes == [0]

prepared = cql.prepare(f"SELECT p, c FROM {table1} WHERE c = ? AND p = ?")
assert prepared.routing_key_indexes == [1]

# Test that pk indexes aren't generated when the partition key column isn't restricted using a bind variable.
# In this situation the driver won't be able to calculate the token, so pk indexes should be empty (None).
def test_single_pk_no_indexes(cql, table1):
prepared = cql.prepare(f"SELECT p, c FROM {table1}")
assert prepared.routing_key_indexes is None

prepared = cql.prepare(f"SELECT p, c FROM {table1} WHERE c = ? ALLOW FILTERING")
assert prepared.routing_key_indexes is None

prepared = cql.prepare(f"SELECT p FROM {table1} WHERE p = 0 AND c = ?")
assert prepared.routing_key_indexes is None

# Test generating pk indexes for a composite partition key.
def test_composite_pk_indexes(cql, table2):
prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = ? AND p2 = ? AND p3 = ? AND p4 = ? AND c1 = ? AND c2 = ?")
assert prepared.routing_key_indexes == [0, 1, 2, 3]

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p4 = ? AND p3 = ? AND p2 = ? AND p1 = ? AND c1 = ? AND c2 = ?")
assert prepared.routing_key_indexes == [3, 2, 1, 0]

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE r1 = ? AND c2 = ? AND p3 = ? AND p1 = ? AND r2 = ? AND p4 = ? AND c1 = ? AND p2 = ? ALLOW FILTERING")
assert prepared.routing_key_indexes == [3, 7, 2, 5]

# Test that pk indexes aren't generated when not all partition key columns are restricted using bind variables.
# In this situation the driver won't be able to calculate the token, so pk indexes should be empty (None).
def test_composite_pk_no_indexes(cql, table2):
prepared = cql.prepare(
f"SELECT * FROM {table2}")
assert prepared.routing_key_indexes is None

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = ? ALLOW FILTERING")
assert prepared.routing_key_indexes is None

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = ? AND p2 = ? AND p4 = ? ALLOW FILTERING")
assert prepared.routing_key_indexes is None

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = ? AND p2 = 0 AND p3 = ? AND p4 = ?")
assert prepared.routing_key_indexes is None

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = ? AND p2 = 0 AND p3 = ? AND p4 = ? AND c1 = ? AND c2 = ?")
assert prepared.routing_key_indexes is None

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = 0 AND p2 = 1 AND p3 = 2 AND p4 = 3 AND c1 = 5 AND c2 = 5")
assert prepared.routing_key_indexes is None

# Test generating pk indexes for a single column partition key using named bind variables.
def test_single_pk_indexes_named_variables(cql, table1):
prepared = cql.prepare(f"SELECT p FROM {table1} WHERE p = :a")
assert prepared.routing_key_indexes == [0]

prepared = cql.prepare(f"SELECT p, c FROM {table1} WHERE c = :a AND p = :b")
assert prepared.routing_key_indexes == [1]

# Test generating pk indexes for a composite partition key using named bind variables.
def test_composite_pk_indexes_named_variables(cql, table2):
prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = :a AND p2 = :b AND p3 = :c AND p4 = :d AND c1 = :e AND c2 = :f")
assert prepared.routing_key_indexes == [0, 1, 2, 3]

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = :f AND p2 = :e AND p3 = :d AND p4 = :c AND c1 = :b AND c2 = :a")
assert prepared.routing_key_indexes == [0, 1, 2, 3]

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE c1 = :a AND c2 = :b AND p1 = :f AND p2 = :e AND p3 = :d AND p4 = :c")
assert prepared.routing_key_indexes == [2, 3, 4, 5]

# Test generating pk indexes with named bind variables where the same variable is used multiple times.
# The test is scylla_only because Scylla treats :x as a single bind variable, but Cassandra thinks
# that there are two bind variables, both of them named :x.
def test_single_pk_indexes_duplicate_named_variables(cql, table1, scylla_only):
prepared = cql.prepare(f"SELECT p FROM {table1} WHERE p = :x")
assert prepared.routing_key_indexes == [0]

prepared = cql.prepare(f"SELECT p FROM {table1} WHERE p = :x AND c = :x")
assert prepared.routing_key_indexes == [0]

prepared = cql.prepare(f"SELECT p FROM {table1} WHERE c = :x AND p = :x")
assert prepared.routing_key_indexes == [0]

# Test generating pk indexes with named bind variables where the same variable is used multiple times.
# The test is scylla_only because Scylla treats :x as a single bind variable, but Cassandra thinks
# that there are multiple bind variables, all of them named :x.
def test_composite_pk_indexes_duplicate_named_variables(cql, table2, scylla_only):
prepared = cql.prepare(f"SELECT * FROM {table2} WHERE p1 = :x AND p2 = :x AND p3 = :x AND p4 = :x")
assert prepared.routing_key_indexes == [0, 0, 0, 0]

prepared = cql.prepare(f"SELECT * FROM {table2} WHERE p1 = :a AND p2 = :a AND p3 = :b AND p4 = :b")
assert prepared.routing_key_indexes == [0, 0, 1, 1]

prepared = cql.prepare(f"SELECT * FROM {table2} WHERE p1 = :a AND p2 = :b AND p3 = :a AND p4 = :b")
assert prepared.routing_key_indexes == [0, 1, 0, 1]

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE c1 = :a AND c2 = :b AND p1 = :a AND p2 = :b AND p3 = :a AND p4 = :b")
assert prepared.routing_key_indexes == [0, 1, 0, 1]

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = :a AND p2 = :b AND p3 = :a AND p4 = :b AND c1 = :a AND c2 = :b ")
assert prepared.routing_key_indexes == [0, 1, 0, 1]

prepared = cql.prepare(
f"SELECT * FROM {table2} WHERE p1 = :x AND p2 = :x AND p3 = :z AND p4 = :y AND c1 = :y AND c2 = :z ")
assert prepared.routing_key_indexes == [0, 0, 1, 2]

0 comments on commit 301b0a9

Please sign in to comment.