Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] counting_agent for the interleaved bloom filter #2373

Merged
merged 1 commit into from
Feb 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
#### Search

* The `seqan3::fm_index_cursor` exposes its suffix array interval ([\#2076](https://github.com/seqan/seqan3/pull/2076)).
* The `seqan3::interleaved_bloom_filter` supports counting occurrences of a range of values
([\#2373](https://github.com/seqan/seqan3/pull/2373)).

## Notable Bug-fixes

Expand Down
128 changes: 123 additions & 5 deletions include/seqan3/search/dream_index/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ struct bin_index : public detail::strong_type<size_t, bin_index, detail::strong_
* To query the Interleaved Bloom Filter for a value, call seqan3::interleaved_bloom_filter::membership_agent() and use
* the returned seqan3::interleaved_bloom_filter::membership_agent.
*
* To count the occurrences of a range of values in the Interleaved Bloom Filter, call
* seqan3::interleaved_bloom_filter::counting_agent() and use
* the returned seqan3::interleaved_bloom_filter::counting_agent_type.
*
* ### Compression
*
* The Interleaved Bloom Filter can be compressed by passing `data_layout::compressed` as template argument.
Expand Down Expand Up @@ -186,6 +190,8 @@ class interleaved_bloom_filter
static constexpr data_layout data_layout_mode = data_layout_mode_;

class membership_agent; // documented upon definition below
template <std::integral value_t>
class counting_agent_type; // documented upon definition below

/*!\name Constructors, destructor and assignment
* \{
Expand Down Expand Up @@ -356,9 +362,9 @@ class interleaved_bloom_filter
/*!\name Lookup
* \{
*/
/*!\brief Returns seqan3::interleaved_bloom_filter::membership_agent to be used for lookup.
/*!\brief Returns a seqan3::interleaved_bloom_filter::membership_agent to be used for lookup.
* \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates all
* seqan3::interleaved_bloom_filter::membership_agent constructed for this Interleaved Bloom Filter.
* `seqan3::interleaved_bloom_filter::membership_agent`s constructed for this Interleaved Bloom Filter.
*
* \details
*
Expand All @@ -371,6 +377,23 @@ class interleaved_bloom_filter
{
return typename interleaved_bloom_filter<data_layout_mode>::membership_agent{*this};
}

/*!\brief Returns a seqan3::interleaved_bloom_filter::counting_agent_type to be used for counting.
 * \tparam value_t The type of the individual counters of the returned agent; defaults to `uint16_t`.
 * \returns A counting agent that is constructed from this Interleaved Bloom Filter.
 * \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates all
 *            `seqan3::interleaved_bloom_filter::counting_agent_type`s constructed for this Interleaved Bloom Filter.
 *
 * \details
 *
 * ### Example
 *
 * \include test/snippet/search/dream_index/counting_agent_construction.cpp
 * \sa seqan3::interleaved_bloom_filter::counting_agent_type::bulk_count
 */
template <typename value_t = uint16_t>
counting_agent_type<value_t> counting_agent() const
{
    using agent_t = counting_agent_type<value_t>;
    return agent_t{*this};
}
//!\}

/*!\name Capacity
Expand Down Expand Up @@ -509,7 +532,7 @@ class interleaved_bloom_filter<data_layout_mode>::membership_agent
/*!\brief Determines set membership of a given value.
* \param[in] value The raw value to process.
*
* \attention The result of this function must always be bound via reference, e.g. `auto &` to prevent copying.
* \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying.
* \attention Sequential calls to this function invalidate the previously returned reference.
*
* \details
Expand All @@ -520,8 +543,8 @@ class interleaved_bloom_filter<data_layout_mode>::membership_agent
*
* ### Thread safety
*
* Concurrent invocations of this function are not thread safe, please create a seqan3::membership_agent for each
* thread.
* Concurrent invocations of this function are not thread safe, please create a
* seqan3::interleaved_bloom_filter::membership_agent for each thread.
*/
[[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) & noexcept
{
Expand Down Expand Up @@ -714,6 +737,101 @@ class counting_vector : public std::vector<value_t>

};

/*!\brief Manages counting ranges of values for the seqan3::interleaved_bloom_filter.
* \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates the counting_agent_type.
*
* \details
*
* ### Example
*
* \include test/snippet/search/dream_index/counting_agent.cpp
*/
template <data_layout data_layout_mode>
template <std::integral value_t>
class interleaved_bloom_filter<data_layout_mode>::counting_agent_type
{
private:
//!\brief The type of the augmented seqan3::interleaved_bloom_filter.
using ibf_t = interleaved_bloom_filter<data_layout_mode>;

//!\brief A pointer to the augmented seqan3::interleaved_bloom_filter.
ibf_t const * ibf_ptr{nullptr};

//!\brief Store a seqan3::interleaved_bloom_filter::membership_agent to call `bulk_contains`.
typename ibf_t::membership_agent membership_agent;

public:
/*!\name Constructors, destructor and assignment
* \{
*/
counting_agent_type() = default; //!< Defaulted.
counting_agent_type(counting_agent_type const &) = default; //!< Defaulted.
counting_agent_type & operator=(counting_agent_type const &) = default; //!< Defaulted.
counting_agent_type(counting_agent_type &&) = default; //!< Defaulted.
counting_agent_type & operator=(counting_agent_type &&) = default; //!< Defaulted.
~counting_agent_type() = default; //!< Defaulted.

/*!\brief Construct a counting_agent_type for an existing seqan3::interleaved_bloom_filter.
 * \private
 * \param ibf The seqan3::interleaved_bloom_filter.
 *
 * \details
 *
 * Stores the address of `ibf`, constructs the internal membership agent from `ibf` and resizes `result_buffer`
 * to `ibf.bin_count()` elements, i.e. one counter per bin.
 */
counting_agent_type(ibf_t const & ibf) : ibf_ptr(std::addressof(ibf)), membership_agent(ibf)
{
    result_buffer.resize(ibf_ptr->bin_count());
}
//!\}

//!\brief Stores the result of bulk_count().
counting_vector<value_t> result_buffer;

/*!\name Counting
* \{
*/
/*!\brief Counts, for each bin, how many values of a range occur in that bin.
 * \tparam value_range_t The type of the range of values. Must model std::ranges::input_range. Its value type
 *                       (std::ranges::range_value_t) must model std::unsigned_integral.
 * \param[in] values The range of values to process.
 * \returns A reference to the internal `result_buffer`, which holds one counter per bin.
 *
 * \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying.
 * \attention Sequential calls to this function invalidate the previously returned reference.
 *
 * \details
 *
 * ### Example
 *
 * \include test/snippet/search/dream_index/counting_agent.cpp
 *
 * ### Thread safety
 *
 * Concurrent invocations of this function are not thread safe, please create a
 * seqan3::interleaved_bloom_filter::counting_agent_type for each thread.
 */
template <std::ranges::range value_range_t>
[[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) & noexcept
{
    // Compile-time requirements on the passed range, each reported with a dedicated message.
    static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
    static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
                  "An individual value must be an unsigned integral.");

    // Run-time sanity checks (debug builds only): the agent is bound to a filter and the buffer has one entry per bin.
    assert(ibf_ptr != nullptr);
    assert(result_buffer.size() == ibf_ptr->bin_count());

    // Discard the counts of a previous call.
    std::ranges::fill(result_buffer, 0);

    // Add the membership result of every single value onto the counters.
    for (auto && current_value : values)
    {
        auto const & bin_hits = membership_agent.bulk_contains(current_value);
        result_buffer += bin_hits;
    }

    return result_buffer;
}

// `bulk_count` cannot be called on a temporary, since the object the returned reference points to
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting that we don't need any !\brief here

// is immediately destroyed.
// Deleting the rvalue-qualified (`&&`) overload turns such a call into a compile-time error.
template <std::ranges::range value_range_t>
[[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) && noexcept = delete;
//!\}

};

//!\}

} // namespace seqan3
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,24 @@ void bulk_contains_benchmark(::benchmark::State & state)
state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values));
}

// Measures `bulk_count` of a counting agent on the data produced by `set_up<ibf_type>`.
// The four benchmark arguments (`state.range(0)` to `state.range(3)`) are handed to `set_up` unchanged.
template <typename ibf_type>
void bulk_count_benchmark(::benchmark::State & state)
{
    auto && [ unused_bin_indices, query_hashes, filter ] = set_up<ibf_type>(state.range(0),
                                                                            state.range(1),
                                                                            state.range(2),
                                                                            state.range(3));
    // The bin indices are part of the set-up result but are not used when counting.
    static_cast<void>(unused_bin_indices);

    auto counter = filter.counting_agent();

    for (auto _ : state)
    {
        // Bind by reference, as `bulk_count` returns a reference to the agent's internal buffer.
        [[maybe_unused]] auto & counts = counter.bulk_count(query_hashes);
    }

    state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(query_hashes));
}

BENCHMARK_TEMPLATE(emplace_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);

Expand All @@ -90,4 +108,9 @@ BENCHMARK_TEMPLATE(bulk_contains_benchmark,
BENCHMARK_TEMPLATE(bulk_contains_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::compressed>)->Apply(arguments);

BENCHMARK_TEMPLATE(bulk_count_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);
BENCHMARK_TEMPLATE(bulk_count_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::compressed>)->Apply(arguments);

BENCHMARK_MAIN();
46 changes: 46 additions & 0 deletions test/snippet/search/dream_index/counting_agent.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/range/views/kmer_hash.hpp>
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>

using seqan3::operator""_dna4;

int main()
{
    // An Interleaved Bloom Filter with 8 bins, 8192 bits per bin and 2 hash functions.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{8u},
                                         seqan3::bin_size{8192u},
                                         seqan3::hash_function_count{2u}};

    auto const sequence1 = "ACTGACTGACTGATC"_dna4;
    auto const sequence2 = "GTGACTGACTGACTCG"_dna4;
    auto const sequence3 = "AAAAAAACGATCGACA"_dna4;
    auto hash_adaptor = seqan3::views::kmer_hash(seqan3::ungapped{5u});

    // Insert all 5-mers of sequence1 into bin 0
    for (auto && value : sequence1 | hash_adaptor)
        ibf.emplace(value, seqan3::bin_index{0u});

    // Insert all 5-mers of sequence2 into bin 4
    for (auto && value : sequence2 | hash_adaptor)
        ibf.emplace(value, seqan3::bin_index{4u});

    // Insert all 5-mers of sequence3 into bin 7
    for (auto && value : sequence3 | hash_adaptor)
        ibf.emplace(value, seqan3::bin_index{7u});

    auto agent = ibf.counting_agent();

    // Count all 5-mers of sequence1 for all bins
    seqan3::debug_stream << agent.bulk_count(sequence1 | hash_adaptor) << '\n'; // [11,0,0,0,9,0,0,0]

    // Search for specific values
    std::vector<size_t> const values{92, 1238, 812, 81273};
    seqan3::debug_stream << agent.bulk_count(values) << '\n'; // [0,0,0,0,0,0,0,0]
    seqan3::debug_stream << agent.bulk_count(std::views::iota(0u, 1024u)) << '\n'; // [6,0,0,0,7,0,0,10]

    // The default counters are 16 bit unsigned integers.
    // An optional template parameter can be used to specify the counter type
    auto agent2 = ibf.counting_agent<uint8_t>();
    // The returned counts are now 8 bit unsigned integers.
    // Note: query `agent2` here; using `agent` would still yield the default 16 bit counters.
    seqan3::debug_stream << agent2.bulk_count(sequence1 | hash_adaptor) << '\n'; // [11,0,0,0,9,0,0,0]
}
18 changes: 18 additions & 0 deletions test/snippet/search/dream_index/counting_agent_construction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>

int main()
{
    // Set up the Interleaved Bloom Filter for which a counting_agent is going to be created.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{43u},
                                         seqan3::bin_size{8192u},
                                         seqan3::hash_function_count{3u}};

    // `counting_agent()` on the Interleaved Bloom Filter hands out a counting_agent.
    auto agent = ibf.counting_agent();

    // After a call to `increase_bin_number_to`, the previously created agent is invalid.
    ibf.increase_bin_number_to(seqan3::bin_count{60u});

    // Hence, replace it with a newly constructed counting_agent.
    agent = ibf.counting_agent();
}
48 changes: 47 additions & 1 deletion test/unit/search/dream_index/interleaved_bloom_filter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
#include <seqan3/test/cereal.hpp>
#include <seqan3/test/expect_range_eq.hpp>

template <typename ibf_type>
struct interleaved_bloom_filter_test : public ::testing::Test
Expand Down Expand Up @@ -43,7 +44,7 @@ TYPED_TEST(interleaved_bloom_filter_test, construction)
TypeParam ibf2{TestFixture::make_ibf(seqan3::bin_count{64u},
seqan3::bin_size{1024u},
seqan3::hash_function_count{2u})};
EXPECT_EQ(ibf1, ibf2);
EXPECT_TRUE(ibf1 == ibf2);

// bin_size parameter is too small
EXPECT_THROW((TestFixture::make_ibf(seqan3::bin_count{64u}, seqan3::bin_size{0u})), std::logic_error);
Expand Down Expand Up @@ -164,6 +165,27 @@ TYPED_TEST(interleaved_bloom_filter_test, counting)
EXPECT_EQ(counting, expected2);
}

TYPED_TEST(interleaved_bloom_filter_test, counting_agent)
{
    // 1. Fill an uncompressed interleaved_bloom_filter, because the compressed one is not mutable.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{128u},
                                         seqan3::bin_size{1024u},
                                         seqan3::hash_function_count{2u}};

    // Each of the 128 bins receives the values 0 to 127.
    for (size_t bin : std::views::iota(0, 128))
    {
        for (size_t hash_value : std::views::iota(0, 128))
        {
            ibf.emplace(hash_value, seqan3::bin_index{bin});
        }
    }

    // 2. Construct the interleaved_bloom_filter under test (uncompressed or compressed) and query it via bulk_count.
    TypeParam tested_ibf{ibf};
    auto default_agent = tested_ibf.counting_agent();                 // default counter type
    auto size_t_agent = tested_ibf.template counting_agent<size_t>(); // explicitly chosen counter type

    // All 128 queried values were inserted into every bin, so each of the 128 counters must be 128.
    std::vector<size_t> expected(128, 128);
    EXPECT_RANGE_EQ(default_agent.bulk_count(std::views::iota(0u, 128u)), expected);
    EXPECT_RANGE_EQ(size_t_agent.bulk_count(std::views::iota(0u, 128u)), expected);
}

// Check special case where there is only one `1` in the bitvector.
TYPED_TEST(interleaved_bloom_filter_test, counting_no_ub)
{
Expand Down Expand Up @@ -197,6 +219,30 @@ TYPED_TEST(interleaved_bloom_filter_test, counting_no_ub)
EXPECT_EQ(counting, expected2);
}

// Check special case where there is only one `1` in the bitvector.
TYPED_TEST(interleaved_bloom_filter_test, counting_agent_no_ub)
{
    // 1. Fill an uncompressed interleaved_bloom_filter, because the compressed one is not mutable.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{128u},
                                         seqan3::bin_size{1024u},
                                         seqan3::hash_function_count{2u}};

    // Only the bins 63 and 127 receive the values 0 to 127; all other bins stay empty.
    for (size_t bin : std::array<size_t, 2>{63, 127})
    {
        for (size_t hash_value : std::views::iota(0, 128))
        {
            ibf.emplace(hash_value, seqan3::bin_index{bin});
        }
    }

    // 2. Construct the interleaved_bloom_filter under test (uncompressed or compressed) and query it via bulk_count.
    TypeParam tested_ibf{ibf};
    auto default_agent = tested_ibf.counting_agent();                 // default counter type
    auto size_t_agent = tested_ibf.template counting_agent<size_t>(); // explicitly chosen counter type

    // Exactly the two filled bins must report all 128 queried values; every other counter must remain 0.
    std::vector<size_t> expected(128, 0);
    expected[63] = 128;
    expected[127] = 128;
    EXPECT_RANGE_EQ(default_agent.bulk_count(std::views::iota(0u, 128u)), expected);
    EXPECT_RANGE_EQ(size_t_agent.bulk_count(std::views::iota(0u, 128u)), expected);
}

TYPED_TEST(interleaved_bloom_filter_test, increase_bin_number_to)
{
seqan3::interleaved_bloom_filter ibf1{seqan3::bin_count{73u}, seqan3::bin_size{1024u}};
Expand Down