Skip to content

Commit

Permalink
Optimise queries search for a chain of OR strings (#3250)
Browse files Browse the repository at this point in the history
This is a performance enhancement motivated by users who are generating queries with many string
comparisons on a single column, for example from cocoa's "IN" queries.

The idea is to combine string equality conditions from a single "OR" query node and store them in an
unordered_set. With N elements to search, and C conditions, the runtime changes from O(N*C) to
O(N). The added benchmark goes from 30 seconds to 2 seconds. This change does not try to optimise 
indexed columns which should be running O(log(N)*C). The benchmark with indexes turned on runs in 
3.5 seconds. Since N is likely the dominant term, using indexes should still be fastest in practice when
compared to this optimisation.
  • Loading branch information
James Stone authored and jedelbo committed Mar 5, 2019
1 parent e119c5d commit 993363c
Show file tree
Hide file tree
Showing 6 changed files with 261 additions and 17 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
-----------

### Internals
* None.
* Optimised queries for unindexed string columns when the query has a chain of OR conditions.
This will improve performance of "IN" queries generated by bindings.
([#22](https://github.com/realm/engineering/issues/22)).

----------------------------------------------

Expand Down
117 changes: 109 additions & 8 deletions src/realm/query_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <realm/query_engine.hpp>

#include <realm/query_expression.hpp>
#include <realm/utilities.hpp>

using namespace realm;

Expand Down Expand Up @@ -325,8 +326,75 @@ void StringNode<Equal>::_search_index_init()
}
}

// Folds the comparison value of `other` into this node, so that a single pass over the column
// can test every OR'd value at once instead of running one scan per condition.
//
// Why this is only worthwhile without a search index (N rows, M conditions):
//   1) search index present:                     O(log(N)*M)
//   2) no search index, conditions combined:     O(N)
//   3) no search index, conditions not combined: O(N*M)
// N is normally far larger than M, so an index (1) is preferred when available and combining (2)
// is the fallback. This function does not check for an index itself; OrNode::init() only calls
// it for nodes that report no search index.
//
// Preconditions (asserted): both nodes target the same column, and `other` has not itself
// absorbed any conditions.
//
// NOTE(review): the needles are StringData built from pointers into m_value and into the
// StringBuffers held by m_needle_storage, so they are presumably non-owning views. This assumes
// a StringBuffer keeps its character buffer when the vector relocates it, and that copies of
// this node rebuild the set - confirm both.
void StringNode<Equal>::consume_condition(StringNode<Equal>* other)
{
    REALM_ASSERT(m_condition_column == other->m_condition_column);
    REALM_ASSERT(other->m_needles.empty());

    // First merge: this node's own value has to become a needle too.
    if (m_needles.empty()) {
        if (m_value)
            m_needles.insert(StringData(*m_value));
        else
            m_needles.insert(StringData());
    }

    // A condition without a value is represented by a default-constructed StringData.
    if (!other->m_value) {
        m_needles.insert(StringData());
        return;
    }

    // Copy the other node's characters into storage owned by this node before referring to them.
    m_needle_storage.push_back(StringBuffer());
    StringBuffer& owned_copy = m_needle_storage.back();
    owned_copy.append(*other->m_value);
    m_needles.insert(StringData(owned_copy.data(), owned_copy.size()));
}

// Scans rows [begin, end) of `array` and returns the index (relative to `array`) of the first row
// whose string equals any entry of m_needles. Returns not_found when no row matches or when no
// needles have been collected. Passing npos as `end` means "up to array.size()".
//
// Requirements of template types:
//   ArrayType   must support: size() -> size_t, and get(size_t) -> ElementType
//   ElementType must be convertible to a StringData via data() and size()
//
// NOTE(review): every element is compared using exactly element.size() bytes. The single-value
// path in _find_first_local passes an extra `true` flag to ArrayBigBlobs::find_first, which
// suggests big-blob strings may be stored with a trailing terminator. If ArrayBigBlobs::get()
// includes that byte, long strings could never match here - confirm.
template<class ArrayType, class ElementType>
size_t StringNode<Equal>::find_first_in(ArrayType& array, size_t begin, size_t end)
{
    if (m_needles.empty())
        return not_found;

    const size_t n = array.size();
    if (end == npos)
        end = n;
    REALM_ASSERT_7(begin, <=, n, &&, end, <=, n);
    REALM_ASSERT_3(begin, <=, end);

    // With only a few needles, comparing against each one directly beats hashing the row value.
    // Where the break-even lies depends on the cost of hashing a StringData (see
    // `StringData.hash()`); 20 was found empirically with small strings and N==100k.
    const bool few_needles = m_needles.size() < 20;

    if (!few_needles) {
        // Many needles: one hash lookup per row.
        const auto no_match = m_needles.end();
        for (size_t row = begin; row < end; ++row) {
            ElementType cell = array.get(row);
            if (m_needles.find(StringData{cell.data(), cell.size()}) != no_match)
                return row;
        }
        return not_found;
    }

    // Few needles: compare the row value against each needle in turn.
    for (size_t row = begin; row < end; ++row) {
        ElementType cell = array.get(row);
        const StringData candidate{cell.data(), cell.size()};
        for (const StringData& needle : m_needles) {
            if (needle == candidate)
                return row;
        }
    }
    return not_found;
}

size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
{
const bool multi_target_search = !m_needles.empty();
// Normal string column, with long or short leaf
for (size_t s = start; s < end; ++s) {
const StringColumn* asc = static_cast<const StringColumn*>(m_condition_column);
Expand All @@ -345,14 +413,23 @@ size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
}
size_t end2 = (end > m_leaf_end ? m_leaf_end - m_leaf_start : end - m_leaf_start);

if (m_leaf_type == StringColumn::leaf_type_Small)
s = static_cast<const ArrayString&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else if (m_leaf_type == StringColumn::leaf_type_Medium)
s = static_cast<const ArrayStringLong&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else
s = static_cast<const ArrayBigBlobs&>(*m_leaf).find_first(str_to_bin(m_value), true, s - m_leaf_start,
end2);

if (multi_target_search) {
if (m_leaf_type == StringColumn::leaf_type_Small)
s = find_first_in<const ArrayString&, StringData>(static_cast<const ArrayString&>(*m_leaf), s - m_leaf_start, end2);
else if (m_leaf_type == StringColumn::leaf_type_Medium)
s = find_first_in<const ArrayStringLong&, StringData>(static_cast<const ArrayStringLong&>(*m_leaf), s - m_leaf_start, end2);
else
s = find_first_in<const ArrayBigBlobs&, BinaryData>(static_cast<const ArrayBigBlobs&>(*m_leaf), s - m_leaf_start, end2);
}
else {
if (m_leaf_type == StringColumn::leaf_type_Small)
s = static_cast<const ArrayString&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else if (m_leaf_type == StringColumn::leaf_type_Medium)
s = static_cast<const ArrayStringLong&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else
s = static_cast<const ArrayBigBlobs&>(*m_leaf).find_first(str_to_bin(m_value), true, s - m_leaf_start,
end2);
}
if (s == not_found)
s = m_leaf_end - 1;
else
Expand All @@ -362,6 +439,30 @@ size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
return not_found;
}

// Serialises this condition. A node that has not absorbed other conditions defers to the base
// class description. A node carrying several needles is written out as a parenthesised chain
// "(col == v1 or col == v2 ...)". The needles come from an unordered_set, so their order in the
// output is unspecified.
std::string StringNode<Equal>::describe(util::serializer::SerialisationState& state) const
{
    if (m_needles.empty()) {
        return StringNodeEqualBase::describe(state);
    }

    // FIXME: once the parser supports it, print something like "column IN {s1, s2, s3}"
    REALM_ASSERT(m_condition_column != nullptr);
    std::string chain;
    for (const StringData& needle : m_needles) {
        // Every term after the first is joined to the previous one with " or ".
        if (!chain.empty())
            chain += " or ";
        chain += state.describe_column(ParentNode::m_table, m_condition_column->get_column_index());
        chain += " ";
        chain += Equal::description();
        chain += " ";
        chain += util::serializer::print_value(needle);
    }
    // m_needles is non-empty at this point, so there is always at least one term to wrap.
    return "(" + chain + ")";
}


void StringNode<EqualIns>::_search_index_init()
{
if (m_column_type == col_type_StringEnum) {
Expand Down
52 changes: 44 additions & 8 deletions src/realm/query_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,11 @@ AggregateState State of the aggregate - contains a state variable that stor
#include <realm/util/miscellaneous.hpp>
#include <realm/util/serializer.hpp>
#include <realm/util/shared_ptr.hpp>
#include <realm/util/string_buffer.hpp>
#include <realm/utilities.hpp>

#include <map>
#include <unordered_set>

#if REALM_X86_OR_X64_TRUE && defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219
#include <immintrin.h>
Expand Down Expand Up @@ -1184,6 +1186,11 @@ class StringNodeBase : public ParentNode {
do_verify_column(m_condition_column);
}

// Reports whether the column this condition reads from has a search index, as answered by the
// column object itself. OrNode::init() uses this to skip combining conditions on indexed columns.
// m_condition_column is dereferenced without a null check, so it must already be set.
bool has_search_index() const
{
return m_condition_column->has_search_index();
}

void init() override
{
ParentNode::init();
Expand Down Expand Up @@ -1301,7 +1308,6 @@ class StringNode : public StringNodeBase {
StringNodeBase::init();
}


size_t find_first_local(size_t start, size_t end) override
{
TConditionFunction cond;
Expand Down Expand Up @@ -1537,9 +1543,9 @@ class StringNodeEqualBase : public StringNodeBase {
size_t m_last_start;
};


// Specialization for Equal condition on Strings - we specialize because we can utilize indexes (if they exist) for
// Equal.
// Equal. This specialisation also supports combining other StringNode<Equal> conditions into itself in order to
// optimise the non-indexed linear search that can happen when many conditions are OR'd together in an "IN" query.
// Future optimization: make specialization for greater, notequal, etc
template <>
class StringNode<Equal> : public StringNodeEqualBase {
Expand All @@ -1548,13 +1554,22 @@ class StringNode<Equal> : public StringNodeEqualBase {

void _search_index_init() override;

void consume_condition(StringNode<Equal>* other);

// Creates a heap-allocated copy of this node via the (node, patches) constructor and returns it
// as an owning pointer to the ParentNode base.
// NOTE(review): m_needles is filled with StringData built from pointers into this node's m_value
// and m_needle_storage (see consume_condition). Confirm the copy made here does not keep
// referring to the original node's storage.
std::unique_ptr<ParentNode> clone(QueryNodeHandoverPatches* patches) const override
{
return std::unique_ptr<ParentNode>(new StringNode<Equal>(*this, patches));
}

virtual std::string describe(util::serializer::SerialisationState& state) const override;

private:
template<class ArrayType, class ElementType>
size_t find_first_in(ArrayType& array, size_t begin, size_t end);

size_t _find_first_local(size_t start, size_t end) override;
std::unordered_set<StringData> m_needles;
std::vector<StringBuffer> m_needle_storage;
};


Expand Down Expand Up @@ -1603,7 +1618,6 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
size_t _find_first_local(size_t start, size_t end) override;
};


// OR node contains at least two node pointers: Two or more conditions to OR
// together in m_conditions, and the next AND condition (if any) in m_child.
//
Expand Down Expand Up @@ -1641,11 +1655,9 @@ class OrNode : public ParentNode {
condition->verify_column();
}
}

std::string describe(util::serializer::SerialisationState& state) const override
{
if (m_conditions.size() >= 2) {

}
std::string s;
for (size_t i = 0; i < m_conditions.size(); ++i) {
if (m_conditions[i]) {
Expand All @@ -1661,13 +1673,37 @@ class OrNode : public ParentNode {
return s;
}


void init() override
{
ParentNode::init();

m_dD = 10.0;

StringNode<Equal>* first = nullptr;
std::sort(m_conditions.begin(), m_conditions.end(),
[](auto& a, auto& b) { return a->m_condition_column_idx < b->m_condition_column_idx; });
auto it = m_conditions.begin();
while (it != m_conditions.end()) {
// Only try to optimize on StringNode<Equal> conditions without search index
if ((first = dynamic_cast<StringNode<Equal>*>(it->get())) && !first->has_search_index()) {
auto col_ndx = first->m_condition_column_idx;
auto next = it + 1;
while (next != m_conditions.end() && (*next)->m_condition_column_idx == col_ndx) {
if (auto advance = dynamic_cast<StringNode<Equal>*>(next->get())) {
first->consume_condition(advance);
next = m_conditions.erase(next);
}
else {
++next;
}
}
it = next;
}
else {
++it;
}
}

m_start.clear();
m_start.resize(m_conditions.size(), 0);

Expand Down
46 changes: 46 additions & 0 deletions test/benchmark-common-tasks/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,51 @@ struct BenchmarkQuery : BenchmarkWithStrings {
}
};

// Benchmark for an "IN"-style query: a long chain of OR'd string equality conditions on a single
// string column. The table holds num_rows rows whose value is the row index written in decimal,
// and the query asks for num_queried_matches of those values, spaced evenly through the table.
struct BenchmarkQueryChainedOrStrings : BenchmarkWithStringsTable {
    const size_t num_queried_matches = 1000;
    const size_t num_rows = 100000;
    std::vector<std::string> values_to_query;
    const char* name() const
    {
        return "QueryChainedOrStrings";
    }

    // Fills the "StringOnly" table and records the values that operator() will query for.
    void before_all(SharedGroup& group)
    {
        BenchmarkWithStringsTable::before_all(group);
        WriteTransaction tr(group);
        TableRef t = tr.get_table("StringOnly");
        t->add_empty_row(num_rows);
        REALM_ASSERT(num_rows > num_queried_matches);
        for (size_t i = 0; i < num_rows; ++i) {
            std::stringstream ss;
            ss << i;
            t->set_string(0, i, ss.str());
        }
        // Uncomment to measure the same query against an indexed column:
        //t->add_search_index(0);

        // Pick every (num_rows / num_queried_matches)-th row so the matches are spread evenly.
        const size_t stride = num_rows / num_queried_matches;
        values_to_query.reserve(num_queried_matches);
        for (size_t i = 0; i < num_queried_matches; ++i) {
            values_to_query.push_back(t->get_string(0, stride * i));
        }
        tr.commit();
    }

    // Builds the OR chain from the recorded values, runs it, and checks the match count.
    void operator()(SharedGroup& group)
    {
        ReadTransaction tr(group);
        ConstTableRef table = tr.get_table("StringOnly");
        Query query = table->where();
        for (const std::string& value : values_to_query) {
            query.Or().equal(0, value);
        }
        TableView results = query.find_all();
        REALM_ASSERT_EX(results.size() == num_queried_matches, results.size(), num_queried_matches, values_to_query.size());
        // Keeps `results` referenced even if the assertion macro expands to nothing.
        static_cast<void>(results);
    }
};

struct BenchmarkSize : BenchmarkWithStrings {
const char* name() const
{
Expand Down Expand Up @@ -1003,6 +1048,7 @@ int benchmark_common_tasks_main()
BENCH(BenchmarkQueryInsensitiveString);
BENCH(BenchmarkQueryInsensitiveStringIndexed);
BENCH(BenchmarkNonInitatorOpen);
BENCH(BenchmarkQueryChainedOrStrings);

#undef BENCH
return 0;
Expand Down
50 changes: 50 additions & 0 deletions test/test_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2730,4 +2730,54 @@ TEST(Parser_Between)
CHECK(message.find("Invalid Predicate. The 'between' operator is not supported yet, please rewrite the expression using '>' and '<'.") != std::string::npos);
}

// Exercises chains of OR'd string equality conditions over four string columns that cover every
// combination of the add_column flag (false/true) and search index (absent/present):
//   a: flag false, no index    b: flag true, no index
//   c: flag false, indexed     d: flag true, indexed
// Rows 0..99 hold their own row number as text in every column; one further row is left at its
// default value.
TEST(Parser_ChainedStringEqualQueries)
{
    Group g;
    TableRef table = g.add_table("table");
    size_t a_col_ndx = table->add_column(type_String, "a", false);
    size_t b_col_ndx = table->add_column(type_String, "b", true);
    size_t c_col_ndx = table->add_column(type_String, "c", false);
    size_t d_col_ndx = table->add_column(type_String, "d", true);

    table->add_search_index(c_col_ndx);
    table->add_search_index(d_col_ndx);

    table->add_empty_row(100);
    std::vector<std::string> populated_data;
    const size_t populated_rows = table->size();
    for (size_t row = 0; row < populated_rows; ++row) {
        std::stringstream digits;
        digits << row;
        const std::string text = digits.str();
        populated_data.push_back(text);
        table->set_string(a_col_ndx, row, text);
        table->set_string(b_col_ndx, row, text);
        table->set_string(c_col_ndx, row, text);
        table->set_string(d_col_ndx, row, text);
    }
    table->add_empty_row(); // one null/empty string

    // Hand-written chains: one column, mixed columns, and an AND of two chains.
    verify_query(test_context, table, "a == '0' or a == '1' or a == '2'", 3);
    verify_query(test_context, table, "a == '0' or b == '2' or a == '3' or b == '4'", 4);
    verify_query(test_context, table, "(a == '0' or b == '2' or a == '3' or b == '4') and (c == '0' or d == '2' or c == '3' or d == '4')", 4);

    // Empty-string and null needles: only the extra default row may match.
    verify_query(test_context, table, "a == '' or a == null", 1);
    verify_query(test_context, table, "b == '' or b == null", 1);
    verify_query(test_context, table, "c == '' or c == null", 1);
    verify_query(test_context, table, "d == '' or d == null", 1);
    verify_query(test_context, table, "(a == null or a == '') and (b == null or b == '') and (c == null or c == '') and (d == null or d == '')", 1);

    // One long chain over all populated values in shuffled order, cycling through columns
    // a, b, c, d. Each value selects a distinct row, so every populated row must match.
    Random rd;
    rd.shuffle(populated_data.begin(), populated_data.end());
    std::string query;
    size_t position = 0;
    for (const std::string& value : populated_data) {
        if (position != 0)
            query += " or ";
        query += static_cast<char>('a' + position % 4);
        query += " == '" + value + "'";
        ++position;
    }
    verify_query(test_context, table, query, populated_data.size());
}

#endif // TEST_PARSER
Loading

0 comments on commit 993363c

Please sign in to comment.