From 993363c7a2e38c80a065857d156bbfd49091d6d8 Mon Sep 17 00:00:00 2001 From: James Stone Date: Tue, 5 Mar 2019 02:10:59 -0800 Subject: [PATCH] Optimise queries search for a chain of OR strings (#3250) This is a performance enhancement motivated by users who are generating queries with many string comparisons on a single column, for example from cocoa's "IN" queries. The idea is to combine string equality conditions from a single "OR" query node and store them in an unordered_set. With N elements to search, and C conditions, the runtime changes from O(N*C) to O(N). The added benchmark goes from 30 seconds to 2 seconds. This change does not try to optimise indexed columns which should be running O(log(N)*C). The benchmark with indexes turned on runs in 3.5 seconds. Since N is likely the dominant term, using indexes should still be fastest in practice when compared to this optimisation. --- CHANGELOG.md | 4 +- src/realm/query_engine.cpp | 117 +++++++++++++++++++++++++-- src/realm/query_engine.hpp | 52 ++++++++++-- test/benchmark-common-tasks/main.cpp | 46 +++++++++++ test/test_parser.cpp | 50 ++++++++++++ test/test_query.cpp | 9 +++ 6 files changed, 261 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66bd83107c0..a90a04f1cbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,9 @@ ----------- ### Internals -* None. +* Optimised queries for unindexed string columns when the query has a chain of OR conditions. + This will improve performance of "IN" queries generated by bindings. + ([#22](https://github.com/realm/engineering/issues/22). ---------------------------------------------- diff --git a/src/realm/query_engine.cpp b/src/realm/query_engine.cpp index d1771fa1479..94a5d84600d 100644 --- a/src/realm/query_engine.cpp +++ b/src/realm/query_engine.cpp @@ -19,6 +19,7 @@ #include #include +#include using namespace realm; @@ -325,8 +326,75 @@ void StringNode::_search_index_init() } } +void StringNode::consume_condition(StringNode* other) +{ + // If a search index is present, don't try to combine conditions since index search is most likely faster. + // Assuming N elements to search and M conditions to check: + // 1) search index present: O(log(N)*M) + // 2) no search index, combine conditions: O(N) + // 3) no search index, conditions not combined: O(N*M) + // In practice N is much larger than M, so if we have a search index, choose 1, otherwise if possible choose 2. + REALM_ASSERT(m_condition_column == other->m_condition_column); + REALM_ASSERT(other->m_needles.empty()); + if (m_needles.empty()) { + m_needles.insert(bool(m_value) ? StringData(*m_value) : StringData()); + } + if (bool(other->m_value)) { + m_needle_storage.push_back(StringBuffer()); + m_needle_storage.back().append(*other->m_value); + m_needles.insert(StringData(m_needle_storage.back().data(), m_needle_storage.back().size())); + } + else { + m_needles.insert(StringData()); + } +} + +// Requirements of template types: +// ArrayType must support: size() -> size_t, and get(size_t) -> ElementType +// ElementType must be convertable to a StringData via data() and size() +template +size_t StringNode::find_first_in(ArrayType& array, size_t begin, size_t end) +{ + if (m_needles.empty()) + return not_found; + + size_t n = array.size(); + if (end == npos) + end = n; + REALM_ASSERT_7(begin, <=, n, &&, end, <=, n); + REALM_ASSERT_3(begin, <=, end); + + const auto not_in_set = m_needles.end(); + // For a small number of conditions it is faster to cycle through + // and check them individually. The threshold depends on how fast + // our hashing of StringData is (see `StringData.hash()`). The + // number 20 was found empirically when testing small strings + // with N==100k + if (m_needles.size() < 20) { + for (size_t i = begin; i < end; ++i) { + ElementType element = array.get(i); + StringData value_2{element.data(), element.size()}; + for (auto it = m_needles.begin(); it != not_in_set; ++it) { + if (*it == value_2) + return i; + } + } + } + else { + for (size_t i = begin; i < end; ++i) { + ElementType element = array.get(i); + StringData value_2{element.data(), element.size()}; + if (m_needles.find(value_2) != not_in_set) + return i; + } + } + + return not_found; +} + size_t StringNode::_find_first_local(size_t start, size_t end) { + const bool multi_target_search = !m_needles.empty(); // Normal string column, with long or short leaf for (size_t s = start; s < end; ++s) { const StringColumn* asc = static_cast(m_condition_column); @@ -345,14 +413,23 @@ size_t StringNode::_find_first_local(size_t start, size_t end) } size_t end2 = (end > m_leaf_end ? m_leaf_end - m_leaf_start : end - m_leaf_start); - if (m_leaf_type == StringColumn::leaf_type_Small) - s = static_cast(*m_leaf).find_first(m_value, s - m_leaf_start, end2); - else if (m_leaf_type == StringColumn::leaf_type_Medium) - s = static_cast(*m_leaf).find_first(m_value, s - m_leaf_start, end2); - else - s = static_cast(*m_leaf).find_first(str_to_bin(m_value), true, s - m_leaf_start, - end2); - + if (multi_target_search) { + if (m_leaf_type == StringColumn::leaf_type_Small) + s = find_first_in(static_cast(*m_leaf), s - m_leaf_start, end2); + else if (m_leaf_type == StringColumn::leaf_type_Medium) + s = find_first_in(static_cast(*m_leaf), s - m_leaf_start, end2); + else + s = find_first_in(static_cast(*m_leaf), s - m_leaf_start, end2); + } + else { + if (m_leaf_type == StringColumn::leaf_type_Small) + s = static_cast(*m_leaf).find_first(m_value, s - m_leaf_start, end2); + else if (m_leaf_type == StringColumn::leaf_type_Medium) + s = static_cast(*m_leaf).find_first(m_value, s - m_leaf_start, end2); + else + s = static_cast(*m_leaf).find_first(str_to_bin(m_value), true, s - m_leaf_start, + end2); + } if (s == not_found) s = m_leaf_end - 1; else @@ -362,6 +439,30 @@ size_t StringNode::_find_first_local(size_t start, size_t end) return not_found; } +std::string StringNode::describe(util::serializer::SerialisationState& state) const +{ + if (m_needles.empty()) { + return StringNodeEqualBase::describe(state); + } + + // FIXME: once the parser supports it, print something like "column IN {s1, s2, s3}" + REALM_ASSERT(m_condition_column != nullptr); + std::string desc; + bool is_first = true; + for (auto it : m_needles) { + StringData sd(it.data(), it.size()); + desc += (is_first ? "" : " or ") + + state.describe_column(ParentNode::m_table, m_condition_column->get_column_index()) + + " " + Equal::description() + " " + util::serializer::print_value(sd); + is_first = false; + } + if (!is_first) { + desc = "(" + desc + ")"; + } + return desc; +} + + void StringNode::_search_index_init() { if (m_column_type == col_type_StringEnum) { diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp index 15d56445300..1cab97f8820 100644 --- a/src/realm/query_engine.hpp +++ b/src/realm/query_engine.hpp @@ -113,9 +113,11 @@ AggregateState State of the aggregate - contains a state variable that stor #include #include #include +#include #include #include +#include #if REALM_X86_OR_X64_TRUE && defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 #include @@ -1184,6 +1186,11 @@ class StringNodeBase : public ParentNode { do_verify_column(m_condition_column); } + bool has_search_index() const + { + return m_condition_column->has_search_index(); + } + void init() override { ParentNode::init(); @@ -1301,7 +1308,6 @@ class StringNode : public StringNodeBase { StringNodeBase::init(); } - size_t find_first_local(size_t start, size_t end) override { TConditionFunction cond; @@ -1537,9 +1543,9 @@ class StringNodeEqualBase : public StringNodeBase { size_t m_last_start; }; - // Specialization for Equal condition on Strings - we specialize because we can utilize indexes (if they exist) for -// Equal. +// Equal. This specialisation also supports combining other StringNode conditions into itself in order to +// optimise the non-indexed linear search that can be happen when many conditions are OR'd together in an "IN" query. // Future optimization: make specialization for greater, notequal, etc template <> class StringNode : public StringNodeEqualBase { @@ -1548,13 +1554,22 @@ class StringNode : public StringNodeEqualBase { void _search_index_init() override; + void consume_condition(StringNode* other); + std::unique_ptr clone(QueryNodeHandoverPatches* patches) const override { return std::unique_ptr(new StringNode(*this, patches)); } + virtual std::string describe(util::serializer::SerialisationState& state) const override; + private: + template + size_t find_first_in(ArrayType& array, size_t begin, size_t end); + size_t _find_first_local(size_t start, size_t end) override; + std::unordered_set m_needles; + std::vector m_needle_storage; }; @@ -1603,7 +1618,6 @@ class StringNode : public StringNodeEqualBase { size_t _find_first_local(size_t start, size_t end) override; }; - // OR node contains at least two node pointers: Two or more conditions to OR // together in m_conditions, and the next AND condition (if any) in m_child. // @@ -1641,11 +1655,9 @@ class OrNode : public ParentNode { condition->verify_column(); } } + std::string describe(util::serializer::SerialisationState& state) const override { - if (m_conditions.size() >= 2) { - - } std::string s; for (size_t i = 0; i < m_conditions.size(); ++i) { if (m_conditions[i]) { @@ -1661,13 +1673,37 @@ class OrNode : public ParentNode { return s; } - void init() override { ParentNode::init(); m_dD = 10.0; + StringNode* first = nullptr; + std::sort(m_conditions.begin(), m_conditions.end(), + [](auto& a, auto& b) { return a->m_condition_column_idx < b->m_condition_column_idx; }); + auto it = m_conditions.begin(); + while (it != m_conditions.end()) { + // Only try to optimize on StringNode conditions without search index + if ((first = dynamic_cast*>(it->get())) && !first->has_search_index()) { + auto col_ndx = first->m_condition_column_idx; + auto next = it + 1; + while (next != m_conditions.end() && (*next)->m_condition_column_idx == col_ndx) { + if (auto advance = dynamic_cast*>(next->get())) { + first->consume_condition(advance); + next = m_conditions.erase(next); + } + else { + ++next; + } + } + it = next; + } + else { + ++it; + } + } + m_start.clear(); m_start.resize(m_conditions.size(), 0); diff --git a/test/benchmark-common-tasks/main.cpp b/test/benchmark-common-tasks/main.cpp index 0e327f70f95..350d598af2f 100644 --- a/test/benchmark-common-tasks/main.cpp +++ b/test/benchmark-common-tasks/main.cpp @@ -366,6 +366,51 @@ struct BenchmarkQuery : BenchmarkWithStrings { } }; +struct BenchmarkQueryChainedOrStrings : BenchmarkWithStringsTable { + const size_t num_queried_matches = 1000; + const size_t num_rows = 100000; + std::vector values_to_query; + const char* name() const + { + return "QueryChainedOrStrings"; + } + + void before_all(SharedGroup& group) + { + BenchmarkWithStringsTable::before_all(group); + WriteTransaction tr(group); + TableRef t = tr.get_table("StringOnly"); + t->add_empty_row(num_rows); + REALM_ASSERT(num_rows > num_queried_matches); + Random r; + for (size_t i = 0; i < num_rows; ++i) { + std::stringstream ss; + ss << i; + auto s = ss.str(); + t->set_string(0, i, s); + } + //t->add_search_index(0); + for (size_t i = 0; i < num_queried_matches; ++i) { + size_t ndx_to_match = (num_rows / num_queried_matches) * i; + values_to_query.push_back(t->get_string(0, ndx_to_match)); + } + tr.commit(); + } + + void operator()(SharedGroup& group) + { + ReadTransaction tr(group); + ConstTableRef table = tr.get_table("StringOnly"); + Query query = table->where(); + for (size_t i = 0; i < values_to_query.size(); ++i) { + query.Or().equal(0, values_to_query[i]); + } + TableView results = query.find_all(); + REALM_ASSERT_EX(results.size() == num_queried_matches, results.size(), num_queried_matches, values_to_query.size()); + static_cast(results); + } +}; + struct BenchmarkSize : BenchmarkWithStrings { const char* name() const { @@ -1003,6 +1048,7 @@ int benchmark_common_tasks_main() BENCH(BenchmarkQueryInsensitiveString); BENCH(BenchmarkQueryInsensitiveStringIndexed); BENCH(BenchmarkNonInitatorOpen); + BENCH(BenchmarkQueryChainedOrStrings); #undef BENCH return 0; diff --git a/test/test_parser.cpp b/test/test_parser.cpp index bdae7f718f3..01d184eb5dd 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -2730,4 +2730,54 @@ TEST(Parser_Between) CHECK(message.find("Invalid Predicate. The 'between' operator is not supported yet, please rewrite the expression using '>' and '<'.") != std::string::npos); } +TEST(Parser_ChainedStringEqualQueries) +{ + Group g; + TableRef table = g.add_table("table"); + size_t a_col_ndx = table->add_column(type_String, "a", false); + size_t b_col_ndx = table->add_column(type_String, "b", true); + size_t c_col_ndx = table->add_column(type_String, "c", false); + size_t d_col_ndx = table->add_column(type_String, "d", true); + + table->add_search_index(c_col_ndx); + table->add_search_index(d_col_ndx); + + table->add_empty_row(100); + std::vector populated_data; + std::stringstream ss; + for (size_t i = 0; i < table->size(); ++i) { + ss.str({}); + ss << i; + std::string sd (ss.str()); + populated_data.push_back(sd); + table->set_string(a_col_ndx, i, sd); + table->set_string(b_col_ndx, i, sd); + table->set_string(c_col_ndx, i, sd); + table->set_string(d_col_ndx, i, sd); + } + table->add_empty_row(); // one null/empty string + + verify_query(test_context, table, "a == '0' or a == '1' or a == '2'", 3); + verify_query(test_context, table, "a == '0' or b == '2' or a == '3' or b == '4'", 4); + verify_query(test_context, table, "(a == '0' or b == '2' or a == '3' or b == '4') and (c == '0' or d == '2' or c == '3' or d == '4')", 4); + verify_query(test_context, table, "a == '' or a == null", 1); + verify_query(test_context, table, "b == '' or b == null", 1); + verify_query(test_context, table, "c == '' or c == null", 1); + verify_query(test_context, table, "d == '' or d == null", 1); + verify_query(test_context, table, "(a == null or a == '') and (b == null or b == '') and (c == null or c == '') and (d == null or d == '')", 1); + + Random rd; + rd.shuffle(populated_data.begin(), populated_data.end()); + std::string query; + bool first = true; + char column_to_query = 0; + for (auto s : populated_data) { + std::string column_name(1, 'a' + column_to_query); + query += (first ? "" : " or " ) + column_name + " == '" + s + "'"; + first = false; + column_to_query = (column_to_query + 1) % 4; + } + verify_query(test_context, table, query, populated_data.size()); +} + #endif // TEST_PARSER diff --git a/test/test_query.cpp b/test/test_query.cpp index 743ebe5a825..e525f632603 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -5895,6 +5895,7 @@ TEST(Query_FindAllOr) add(ttt, 5, "a"); add(ttt, 6, "a"); add(ttt, 7, "X"); + add(ttt, 8, "z"); // first == 5 || second == X Query q1 = ttt.where().equal(0, 5).Or().equal(1, "X"); @@ -5903,6 +5904,14 @@ TEST(Query_FindAllOr) CHECK_EQUAL(2, tv1.get_source_ndx(0)); CHECK_EQUAL(4, tv1.get_source_ndx(1)); CHECK_EQUAL(6, tv1.get_source_ndx(2)); + + // second == X || second == b || second == z || first == -1 + Query q2 = ttt.where().equal(1, "X").Or().equal(1, "b").Or().equal(1, "z").Or().equal(0, -1); + TableView tv2 = q2.find_all(); + CHECK_EQUAL(3, tv2.size()); + CHECK_EQUAL(2, tv2.get_source_ndx(0)); + CHECK_EQUAL(6, tv2.get_source_ndx(1)); + CHECK_EQUAL(7, tv2.get_source_ndx(2)); }