Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimise queries search for a chain of OR strings #3250

Merged
merged 4 commits into from
Mar 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
-----------

### Internals
* None.
* Optimised queries for unindexed string columns when the query has a chain of OR conditions.
This will improve performance of "IN" queries generated by bindings.
([#22](https://github.com/realm/engineering/issues/22)).

----------------------------------------------

Expand Down
117 changes: 109 additions & 8 deletions src/realm/query_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <realm/query_engine.hpp>

#include <realm/query_expression.hpp>
#include <realm/utilities.hpp>

using namespace realm;

Expand Down Expand Up @@ -325,8 +326,75 @@ void StringNode<Equal>::_search_index_init()
}
}

// Folds the value of `other` (an equality condition on the same column) into this node's set of
// needles, so that a chain of OR'd equality conditions can be answered in a single pass.
//
// Why conditions are only combined when there is no search index. With N elements to search and
// M conditions to check:
//   1) search index present:                     O(log(N)*M)
//   2) no search index, conditions combined:     O(N)
//   3) no search index, conditions not combined: O(N*M)
// In practice N is much larger than M, so with a search index option 1 is preferred, otherwise
// option 2 is chosen whenever possible.
//
// `other` must target the same column and must not itself have consumed any conditions.
void StringNode<Equal>::consume_condition(StringNode<Equal>* other)
{
    REALM_ASSERT(m_condition_column == other->m_condition_column);
    REALM_ASSERT(other->m_needles.empty());

    // On the first merge, seed the set with this node's own value (a default StringData when unset).
    if (m_needles.empty()) {
        const bool has_own_value = bool(m_value);
        m_needles.insert(has_own_value ? StringData(*m_value) : StringData());
    }

    // A condition without a value contributes a default-constructed StringData; nothing to copy.
    if (!bool(other->m_value)) {
        m_needles.insert(StringData());
        return;
    }

    // Take a private copy of the other node's string and store a view of that copy in the set.
    // NOTE(review): the views in m_needles point into m_needle_storage entries; this relies on a
    // StringBuffer keeping its character data at the same address when the vector grows - confirm.
    m_needle_storage.push_back(StringBuffer());
    StringBuffer& owned_copy = m_needle_storage.back();
    owned_copy.append(*other->m_value);
    m_needles.insert(StringData(owned_copy.data(), owned_copy.size()));
}

// Returns the index of the first element in array[begin, end) that equals any of the strings in
// m_needles, or not_found if there is none (or if m_needles is empty). Passing npos as `end`
// means "up to the end of the array".
//
// Requirements of template types:
//   ArrayType must support: size() -> size_t, and get(size_t) -> ElementType
//   ElementType must be convertible to a StringData via data() and size()
//
// NOTE(review): element.size() is used as-is when building the StringData to compare; for
// BinaryData elements confirm that the stored size does not include a trailing terminator.
template <class ArrayType, class ElementType>
size_t StringNode<Equal>::find_first_in(ArrayType& array, size_t begin, size_t end)
{
    if (m_needles.empty())
        return not_found;

    const size_t array_size = array.size();
    if (end == npos)
        end = array_size;
    REALM_ASSERT_7(begin, <=, array_size, &&, end, <=, array_size);
    REALM_ASSERT_3(begin, <=, end);

    // With many needles a hashed lookup per element wins. The threshold depends on how fast
    // our hashing of StringData is (see `StringData.hash()`); the number 20 was found
    // empirically when testing small strings with N==100k.
    if (m_needles.size() >= 20) {
        const auto no_match = m_needles.end();
        for (size_t ndx = begin; ndx < end; ++ndx) {
            ElementType raw = array.get(ndx);
            const StringData candidate{raw.data(), raw.size()};
            if (m_needles.find(candidate) != no_match)
                return ndx;
        }
        return not_found;
    }

    // For a small number of needles it is faster to cycle through and compare them one by one.
    for (size_t ndx = begin; ndx < end; ++ndx) {
        ElementType raw = array.get(ndx);
        const StringData candidate{raw.data(), raw.size()};
        for (const StringData& needle : m_needles) {
            if (needle == candidate)
                return ndx;
        }
    }

    return not_found;
}

size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
{
const bool multi_target_search = !m_needles.empty();
// Normal string column, with long or short leaf
for (size_t s = start; s < end; ++s) {
const StringColumn* asc = static_cast<const StringColumn*>(m_condition_column);
Expand All @@ -345,14 +413,23 @@ size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
}
size_t end2 = (end > m_leaf_end ? m_leaf_end - m_leaf_start : end - m_leaf_start);

if (m_leaf_type == StringColumn::leaf_type_Small)
s = static_cast<const ArrayString&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else if (m_leaf_type == StringColumn::leaf_type_Medium)
s = static_cast<const ArrayStringLong&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else
s = static_cast<const ArrayBigBlobs&>(*m_leaf).find_first(str_to_bin(m_value), true, s - m_leaf_start,
end2);

if (multi_target_search) {
if (m_leaf_type == StringColumn::leaf_type_Small)
s = find_first_in<const ArrayString&, StringData>(static_cast<const ArrayString&>(*m_leaf), s - m_leaf_start, end2);
else if (m_leaf_type == StringColumn::leaf_type_Medium)
s = find_first_in<const ArrayStringLong&, StringData>(static_cast<const ArrayStringLong&>(*m_leaf), s - m_leaf_start, end2);
else
s = find_first_in<const ArrayBigBlobs&, BinaryData>(static_cast<const ArrayBigBlobs&>(*m_leaf), s - m_leaf_start, end2);
}
else {
if (m_leaf_type == StringColumn::leaf_type_Small)
s = static_cast<const ArrayString&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else if (m_leaf_type == StringColumn::leaf_type_Medium)
s = static_cast<const ArrayStringLong&>(*m_leaf).find_first(m_value, s - m_leaf_start, end2);
else
s = static_cast<const ArrayBigBlobs&>(*m_leaf).find_first(str_to_bin(m_value), true, s - m_leaf_start,
end2);
}
if (s == not_found)
s = m_leaf_end - 1;
else
Expand All @@ -362,6 +439,30 @@ size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
return not_found;
}

// Serialises this condition. Without combined needles the base class description is used;
// otherwise every needle is written as its own "<column> <op> <value>" term, the terms are
// joined with " or " and the whole expression is wrapped in parentheses.
std::string StringNode<Equal>::describe(util::serializer::SerialisationState& state) const
{
    if (m_needles.empty()) {
        return StringNodeEqualBase::describe(state);
    }

    // FIXME: once the parser supports it, print something like "column IN {s1, s2, s3}"
    REALM_ASSERT(m_condition_column != nullptr);
    std::string joined;
    for (const StringData& needle : m_needles) {
        // Every term after the first is preceded by the separator.
        if (!joined.empty())
            joined += " or ";
        const StringData value(needle.data(), needle.size());
        joined += state.describe_column(ParentNode::m_table, m_condition_column->get_column_index());
        joined += " ";
        joined += Equal::description();
        joined += " ";
        joined += util::serializer::print_value(value);
    }
    // m_needles is non-empty here, so at least one term was written and the wrap always applies.
    return "(" + joined + ")";
}


void StringNode<EqualIns>::_search_index_init()
{
if (m_column_type == col_type_StringEnum) {
Expand Down
52 changes: 44 additions & 8 deletions src/realm/query_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,11 @@ AggregateState State of the aggregate - contains a state variable that stor
#include <realm/util/miscellaneous.hpp>
#include <realm/util/serializer.hpp>
#include <realm/util/shared_ptr.hpp>
#include <realm/util/string_buffer.hpp>
#include <realm/utilities.hpp>

#include <map>
#include <unordered_set>

#if REALM_X86_OR_X64_TRUE && defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219
#include <immintrin.h>
Expand Down Expand Up @@ -1184,6 +1186,11 @@ class StringNodeBase : public ParentNode {
do_verify_column(m_condition_column);
}

// Reports whether the column this condition is evaluated against has a search index,
// as answered by the column itself.
bool has_search_index() const
{
    const bool column_is_indexed = m_condition_column->has_search_index();
    return column_is_indexed;
}

void init() override
{
ParentNode::init();
Expand Down Expand Up @@ -1301,7 +1308,6 @@ class StringNode : public StringNodeBase {
StringNodeBase::init();
}


size_t find_first_local(size_t start, size_t end) override
{
TConditionFunction cond;
Expand Down Expand Up @@ -1537,9 +1543,9 @@ class StringNodeEqualBase : public StringNodeBase {
size_t m_last_start;
};


// Specialization for Equal condition on Strings - we specialize because we can utilize indexes (if they exist) for
// Equal.
// Equal. This specialisation also supports combining other StringNode<Equal> conditions into itself in order to
// optimise the non-indexed linear search that can happen when many conditions are OR'd together in an "IN" query.
// Future optimization: make specialization for greater, notequal, etc
template <>
class StringNode<Equal> : public StringNodeEqualBase {
Expand All @@ -1548,13 +1554,22 @@ class StringNode<Equal> : public StringNodeEqualBase {

void _search_index_init() override;

void consume_condition(StringNode<Equal>* other);

std::unique_ptr<ParentNode> clone(QueryNodeHandoverPatches* patches) const override
{
return std::unique_ptr<ParentNode>(new StringNode<Equal>(*this, patches));
}

virtual std::string describe(util::serializer::SerialisationState& state) const override;

private:
template<class ArrayType, class ElementType>
size_t find_first_in(ArrayType& array, size_t begin, size_t end);

size_t _find_first_local(size_t start, size_t end) override;
std::unordered_set<StringData> m_needles;
std::vector<StringBuffer> m_needle_storage;
};


Expand Down Expand Up @@ -1603,7 +1618,6 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
size_t _find_first_local(size_t start, size_t end) override;
};


// OR node contains at least two node pointers: Two or more conditions to OR
// together in m_conditions, and the next AND condition (if any) in m_child.
//
Expand Down Expand Up @@ -1641,11 +1655,9 @@ class OrNode : public ParentNode {
condition->verify_column();
}
}

std::string describe(util::serializer::SerialisationState& state) const override
{
if (m_conditions.size() >= 2) {

}
std::string s;
for (size_t i = 0; i < m_conditions.size(); ++i) {
if (m_conditions[i]) {
Expand All @@ -1661,13 +1673,37 @@ class OrNode : public ParentNode {
return s;
}


void init() override
{
ParentNode::init();

m_dD = 10.0;

StringNode<Equal>* first = nullptr;
std::sort(m_conditions.begin(), m_conditions.end(),
[](auto& a, auto& b) { return a->m_condition_column_idx < b->m_condition_column_idx; });
auto it = m_conditions.begin();
while (it != m_conditions.end()) {
// Only try to optimize on StringNode<Equal> conditions without search index
if ((first = dynamic_cast<StringNode<Equal>*>(it->get())) && !first->has_search_index()) {
auto col_ndx = first->m_condition_column_idx;
auto next = it + 1;
while (next != m_conditions.end() && (*next)->m_condition_column_idx == col_ndx) {
if (auto advance = dynamic_cast<StringNode<Equal>*>(next->get())) {
first->consume_condition(advance);
next = m_conditions.erase(next);
}
else {
++next;
}
}
it = next;
}
else {
++it;
}
}

m_start.clear();
m_start.resize(m_conditions.size(), 0);

Expand Down
46 changes: 46 additions & 0 deletions test/benchmark-common-tasks/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,51 @@ struct BenchmarkQuery : BenchmarkWithStrings {
}
};

// Benchmarks a query made of a long chain of OR'd string-equality conditions on one string
// column (the shape of an "IN" query): num_rows rows are filled with their own row number as
// text, and num_queried_matches evenly spaced values are then queried for.
struct BenchmarkQueryChainedOrStrings : BenchmarkWithStringsTable {
    const size_t num_queried_matches = 1000;
    const size_t num_rows = 100000;
    // Values to search for; filled in by before_all().
    std::vector<std::string> values_to_query;
    const char* name() const
    {
        return "QueryChainedOrStrings";
    }

    // Populates the "StringOnly" table and records the values that operator() will query.
    void before_all(SharedGroup& group)
    {
        BenchmarkWithStringsTable::before_all(group);
        WriteTransaction tr(group);
        TableRef t = tr.get_table("StringOnly");
        t->add_empty_row(num_rows);
        REALM_ASSERT(num_rows > num_queried_matches);
        for (size_t i = 0; i < num_rows; ++i) {
            std::stringstream ss;
            ss << i;
            const std::string s = ss.str();
            t->set_string(0, i, s);
        }
        // No search index is added to column 0 here: the combining of OR'd conditions being
        // measured only applies to StringNode<Equal> conditions without a search index.
        values_to_query.reserve(values_to_query.size() + num_queried_matches);
        for (size_t i = 0; i < num_queried_matches; ++i) {
            // Pick evenly spaced rows so that every queried value matches exactly one row.
            size_t ndx_to_match = (num_rows / num_queried_matches) * i;
            values_to_query.push_back(t->get_string(0, ndx_to_match));
        }
        tr.commit();
    }

    // Builds the chained OR query over all recorded values and runs it once.
    void operator()(SharedGroup& group)
    {
        ReadTransaction tr(group);
        ConstTableRef table = tr.get_table("StringOnly");
        Query query = table->where();
        for (const std::string& value : values_to_query) {
            query.Or().equal(0, value);
        }
        TableView results = query.find_all();
        REALM_ASSERT_EX(results.size() == num_queried_matches, results.size(), num_queried_matches, values_to_query.size());
        static_cast<void>(results);
    }
};

struct BenchmarkSize : BenchmarkWithStrings {
const char* name() const
{
Expand Down Expand Up @@ -1003,6 +1048,7 @@ int benchmark_common_tasks_main()
BENCH(BenchmarkQueryInsensitiveString);
BENCH(BenchmarkQueryInsensitiveStringIndexed);
BENCH(BenchmarkNonInitatorOpen);
BENCH(BenchmarkQueryChainedOrStrings);

#undef BENCH
return 0;
Expand Down
50 changes: 50 additions & 0 deletions test/test_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2730,4 +2730,54 @@ TEST(Parser_Between)
CHECK(message.find("Invalid Predicate. The 'between' operator is not supported yet, please rewrite the expression using '>' and '<'.") != std::string::npos);
}

// Verifies chained OR'd string-equality queries across four string columns: "a" and "b"
// without a search index, "c" and "d" with one.
TEST(Parser_ChainedStringEqualQueries)
{
    Group g;
    TableRef table = g.add_table("table");
    // NOTE(review): the third add_column argument is presumably the nullable flag - confirm.
    size_t a_col_ndx = table->add_column(type_String, "a", false);
    size_t b_col_ndx = table->add_column(type_String, "b", true);
    size_t c_col_ndx = table->add_column(type_String, "c", false);
    size_t d_col_ndx = table->add_column(type_String, "d", true);

    table->add_search_index(c_col_ndx);
    table->add_search_index(d_col_ndx);

    // Row i holds the decimal text of i in all four columns.
    table->add_empty_row(100);
    std::vector<std::string> populated_data;
    populated_data.reserve(table->size());
    std::stringstream ss;
    for (size_t i = 0; i < table->size(); ++i) {
        ss.str({});
        ss << i;
        std::string sd (ss.str());
        populated_data.push_back(sd);
        table->set_string(a_col_ndx, i, sd);
        table->set_string(b_col_ndx, i, sd);
        table->set_string(c_col_ndx, i, sd);
        table->set_string(d_col_ndx, i, sd);
    }
    table->add_empty_row(); // one null/empty string

    verify_query(test_context, table, "a == '0' or a == '1' or a == '2'", 3);
    verify_query(test_context, table, "a == '0' or b == '2' or a == '3' or b == '4'", 4);
    verify_query(test_context, table, "(a == '0' or b == '2' or a == '3' or b == '4') and (c == '0' or d == '2' or c == '3' or d == '4')", 4);
    verify_query(test_context, table, "a == '' or a == null", 1);
    verify_query(test_context, table, "b == '' or b == null", 1);
    verify_query(test_context, table, "c == '' or c == null", 1);
    verify_query(test_context, table, "d == '' or d == null", 1);
    verify_query(test_context, table, "(a == null or a == '') and (b == null or b == '') and (c == null or c == '') and (d == null or d == '')", 1);

    // One long OR chain over every populated value, cycling through columns a, b, c, d in a
    // shuffled value order; each term matches a distinct row, so all populated rows match.
    Random rd;
    rd.shuffle(populated_data.begin(), populated_data.end());
    std::string query;
    bool first = true;
    char column_to_query = 0;
    for (const std::string& s : populated_data) {
        std::string column_name(1, static_cast<char>('a' + column_to_query));
        query += (first ? "" : " or " ) + column_name + " == '" + s + "'";
        first = false;
        column_to_query = (column_to_query + 1) % 4;
    }
    verify_query(test_context, table, query, populated_data.size());
}

#endif // TEST_PARSER
Loading