From cd510f792f64072370cc5c2ed8606504c4757855 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 10 Feb 2021 17:27:58 +0100 Subject: [PATCH] [FEATURE] counting_agent for the interleaved bloom filter --- CHANGELOG.md | 2 + .../dream_index/interleaved_bloom_filter.hpp | 128 +++++++++++++++++- .../interleaved_bloom_filter_benchmark.cpp | 23 ++++ .../search/dream_index/counting_agent.cpp | 46 +++++++ .../counting_agent_construction.cpp | 18 +++ .../interleaved_bloom_filter_test.cpp | 48 ++++++- 6 files changed, 259 insertions(+), 6 deletions(-) create mode 100644 test/snippet/search/dream_index/counting_agent.cpp create mode 100644 test/snippet/search/dream_index/counting_agent_construction.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 3730cd964ba..7bf4f3581dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,8 @@ If possible, provide tooling that performs the changes, e.g. a shell-script. #### Search * The `seqan3::fm_index_cursor` exposes its suffix array interval ([\#2076](https://github.com/seqan/seqan3/pull/2076)). +* The `seqan3::interleaved_bloom_filter` supports counting occurrences of a range of values + ([\#2373](https://github.com/seqan/seqan3/pull/2373)). ## Notable Bug-fixes diff --git a/include/seqan3/search/dream_index/interleaved_bloom_filter.hpp b/include/seqan3/search/dream_index/interleaved_bloom_filter.hpp index 491a0132ee7..1d9db5e6716 100644 --- a/include/seqan3/search/dream_index/interleaved_bloom_filter.hpp +++ b/include/seqan3/search/dream_index/interleaved_bloom_filter.hpp @@ -106,6 +106,10 @@ struct bin_index : public detail::strong_type + class counting_agent_type; // documented upon definition below /*!\name Constructors, destructor and assignment * \{ @@ -356,9 +362,9 @@ class interleaved_bloom_filter /*!\name Lookup * \{ */ - /*!\brief Returns seqan3::interleaved_bloom_filter::membership_agent to be used for lookup. + /*!\brief Returns a seqan3::interleaved_bloom_filter::membership_agent to be used for lookup. 
* \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates all - * seqan3::interleaved_bloom_filter::membership_agent constructed for this Interleaved Bloom Filter. + * `seqan3::interleaved_bloom_filter::membership_agent`s constructed for this Interleaved Bloom Filter. * * \details * @@ -371,6 +377,23 @@ class interleaved_bloom_filter { return typename interleaved_bloom_filter::membership_agent{*this}; } + + /*!\brief Returns a seqan3::interleaved_bloom_filter::counting_agent_type to be used for counting. + * \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates all + * `seqan3::interleaved_bloom_filter::counting_agent_type`s constructed for this Interleaved Bloom Filter. + * + * \details + * + * ### Example + * + * \include test/snippet/search/dream_index/counting_agent_construction.cpp + * \sa seqan3::interleaved_bloom_filter::counting_agent_type::bulk_count + */ + template + counting_agent_type counting_agent() const + { + return counting_agent_type{*this}; + } //!\} /*!\name Capacity @@ -509,7 +532,7 @@ class interleaved_bloom_filter::membership_agent /*!\brief Determines set membership of a given value. * \param[in] value The raw value to process. * - * \attention The result of this function must always be bound via reference, e.g. `auto &` to prevent copying. + * \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying. * \attention Sequential calls to this function invalidate the previously returned reference. * * \details @@ -520,8 +543,8 @@ class interleaved_bloom_filter::membership_agent * * ### Thread safety * - * Concurrent invocations of this function are not thread safe, please create a seqan3::membership_agent for each - * thread. + * Concurrent invocations of this function are not thread safe, please create a + * seqan3::interleaved_bloom_filter::membership_agent for each thread. 
*/ [[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) & noexcept { @@ -714,6 +737,101 @@ class counting_vector : public std::vector }; +/*!\brief Manages counting ranges of values for the seqan3::interleaved_bloom_filter. + * \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates the counting_agent_type. + * + * \details + * + * ### Example + * + * \include test/snippet/search/dream_index/counting_agent.cpp + */ +template +template +class interleaved_bloom_filter::counting_agent_type +{ +private: + //!\brief The type of the augmented seqan3::interleaved_bloom_filter. + using ibf_t = interleaved_bloom_filter; + + //!\brief A pointer to the augmented seqan3::interleaved_bloom_filter. + ibf_t const * ibf_ptr{nullptr}; + + //!\brief Store a seqan3::interleaved_bloom_filter::membership_agent to call `bulk_contains`. + typename ibf_t::membership_agent membership_agent; + +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + counting_agent_type() = default; //!< Defaulted. + counting_agent_type(counting_agent_type const &) = default; //!< Defaulted. + counting_agent_type & operator=(counting_agent_type const &) = default; //!< Defaulted. + counting_agent_type(counting_agent_type &&) = default; //!< Defaulted. + counting_agent_type & operator=(counting_agent_type &&) = default; //!< Defaulted. + ~counting_agent_type() = default; //!< Defaulted. + + /*!\brief Construct a counting_agent_type for an existing seqan3::interleaved_bloom_filter. + * \private + * \param ibf The seqan3::interleaved_bloom_filter. + */ + counting_agent_type(ibf_t const & ibf) : ibf_ptr(std::addressof(ibf)), membership_agent(ibf) + { + result_buffer.resize(ibf_ptr->bin_count()); + }; + //!\} + + //!\brief Stores the result of bulk_count(). + counting_vector result_buffer; + + /*!\name Counting + * \{ + */ + /*!\brief Counts the occurrences in each bin for all values in a range. 
+ * \tparam value_range_t The type of the range of values. Must model std::ranges::input_range. The reference type + * must model std::unsigned_integral. + * \param[in] values The range of values to process. + * + * \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying. + * \attention Sequential calls to this function invalidate the previously returned reference. + * + * \details + * + * ### Example + * + * \include test/snippet/search/dream_index/counting_agent.cpp + * + * ### Thread safety + * + * Concurrent invocations of this function are not thread safe, please create a + * seqan3::interleaved_bloom_filter::counting_agent_type for each thread. + */ + template + [[nodiscard]] counting_vector const & bulk_count(value_range_t && values) & noexcept + { + assert(ibf_ptr != nullptr); + assert(result_buffer.size() == ibf_ptr->bin_count()); + + static_assert(std::ranges::input_range, "The values must model input_range."); + static_assert(std::unsigned_integral>, + "An individual value must be an unsigned integral."); + + std::ranges::fill(result_buffer, 0); + + for (auto && value : values) + result_buffer += membership_agent.bulk_contains(value); + + return result_buffer; + } + + // `bulk_count` cannot be called on a temporary, since the object the returned reference points to + // is immediately destroyed. 
+ template + [[nodiscard]] counting_vector const & bulk_count(value_range_t && values) && noexcept = delete; + //!\} + +}; + //!\} } // namespace seqan3 diff --git a/test/performance/search/dream_index/interleaved_bloom_filter_benchmark.cpp b/test/performance/search/dream_index/interleaved_bloom_filter_benchmark.cpp index da1c191aa26..f433f75ec2f 100644 --- a/test/performance/search/dream_index/interleaved_bloom_filter_benchmark.cpp +++ b/test/performance/search/dream_index/interleaved_bloom_filter_benchmark.cpp @@ -82,6 +82,24 @@ void bulk_contains_benchmark(::benchmark::State & state) state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values)); } +template +void bulk_count_benchmark(::benchmark::State & state) +{ + auto && [ bin_indices, hash_values, ibf ] = set_up(state.range(0), + state.range(1), + state.range(2), + state.range(3)); + (void) bin_indices; + + auto agent = ibf.counting_agent(); + for (auto _ : state) + { + [[maybe_unused]] auto & res = agent.bulk_count(hash_values); + } + + state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values)); +} + BENCHMARK_TEMPLATE(emplace_benchmark, seqan3::interleaved_bloom_filter)->Apply(arguments); @@ -90,4 +108,9 @@ BENCHMARK_TEMPLATE(bulk_contains_benchmark, BENCHMARK_TEMPLATE(bulk_contains_benchmark, seqan3::interleaved_bloom_filter)->Apply(arguments); +BENCHMARK_TEMPLATE(bulk_count_benchmark, + seqan3::interleaved_bloom_filter)->Apply(arguments); +BENCHMARK_TEMPLATE(bulk_count_benchmark, + seqan3::interleaved_bloom_filter)->Apply(arguments); + BENCHMARK_MAIN(); diff --git a/test/snippet/search/dream_index/counting_agent.cpp b/test/snippet/search/dream_index/counting_agent.cpp new file mode 100644 index 00000000000..9568b919a11 --- /dev/null +++ b/test/snippet/search/dream_index/counting_agent.cpp @@ -0,0 +1,46 @@ +#include +#include +#include +#include + +using seqan3::operator""_dna4; + +int main() +{ + seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{8u}, + 
seqan3::bin_size{8192u}, + seqan3::hash_function_count{2u}}; + + auto const sequence_1 = "ACTGACTGACTGATC"_dna4; + auto const sequence_2 = "GTGACTGACTGACTCG"_dna4; + auto const sequence_3 = "AAAAAAACGATCGACA"_dna4; + auto hash_adaptor = seqan3::views::kmer_hash(seqan3::ungapped{5u}); + + // Insert all 5-mers of sequence_1 into bin 0 + for (auto && value : sequence_1 | hash_adaptor) + ibf.emplace(value, seqan3::bin_index{0u}); + + // Insert all 5-mers of sequence_2 into bin 4 + for (auto && value : sequence_2 | hash_adaptor) + ibf.emplace(value, seqan3::bin_index{4u}); + + // Insert all 5-mers of sequence_3 into bin 7 + for (auto && value : sequence_3 | hash_adaptor) + ibf.emplace(value, seqan3::bin_index{7u}); + + auto agent = ibf.counting_agent(); + + // Count all 5-mers of sequence_1 for all bins + seqan3::debug_stream << agent.bulk_count(sequence_1 | hash_adaptor) << '\n'; // [11,0,0,0,9,0,0,0] + + // Search for specific values + std::vector const values{92, 1238, 812, 81273}; + seqan3::debug_stream << agent.bulk_count(values) << '\n'; // [0,0,0,0,0,0,0,0] + seqan3::debug_stream << agent.bulk_count(std::views::iota(0u, 1024u)) << '\n'; // [6,0,0,0,7,0,0,10] + + // The default counters are 16 bit unsigned integer. + // An optional template parameter can be used to specify the counter type + auto agent2 = ibf.counting_agent(); + // The returned counts are now 8 bit unsigned integers. + seqan3::debug_stream << agent2.bulk_count(sequence_1 | hash_adaptor) << '\n'; // [11,0,0,0,9,0,0,0] +} diff --git a/test/snippet/search/dream_index/counting_agent_construction.cpp b/test/snippet/search/dream_index/counting_agent_construction.cpp new file mode 100644 index 00000000000..34aecf67bbe --- /dev/null +++ b/test/snippet/search/dream_index/counting_agent_construction.cpp @@ -0,0 +1,18 @@ +#include + +int main() +{ + // Construct an Interleaved Bloom Filter to be used with the counting_agent. 
+ seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{43u}, + seqan3::bin_size{8192u}, + seqan3::hash_function_count{3}}; + + // The counting_agent can now be constructed by calling `counting_agent` on the Interleaved Bloom Filter. + auto agent = ibf.counting_agent(); + + // Calling `increase_bin_number_to` invalidates the agent. + ibf.increase_bin_number_to(seqan3::bin_count{60u}); + + // So make sure to construct a new counting_agent. + agent = ibf.counting_agent(); +} diff --git a/test/unit/search/dream_index/interleaved_bloom_filter_test.cpp b/test/unit/search/dream_index/interleaved_bloom_filter_test.cpp index 1bc85d55def..fb6c0fa5bb0 100644 --- a/test/unit/search/dream_index/interleaved_bloom_filter_test.cpp +++ b/test/unit/search/dream_index/interleaved_bloom_filter_test.cpp @@ -9,6 +9,7 @@ #include #include +#include template struct interleaved_bloom_filter_test : public ::testing::Test @@ -43,7 +44,7 @@ TYPED_TEST(interleaved_bloom_filter_test, construction) TypeParam ibf2{TestFixture::make_ibf(seqan3::bin_count{64u}, seqan3::bin_size{1024u}, seqan3::hash_function_count{2u})}; - EXPECT_EQ(ibf1, ibf2); + EXPECT_TRUE(ibf1 == ibf2); // bin_size parameter is too small EXPECT_THROW((TestFixture::make_ibf(seqan3::bin_count{64u}, seqan3::bin_size{0u})), std::logic_error); @@ -164,6 +165,27 @@ TYPED_TEST(interleaved_bloom_filter_test, counting) EXPECT_EQ(counting, expected2); } +TYPED_TEST(interleaved_bloom_filter_test, counting_agent) +{ + // 1. Test uncompressed interleaved_bloom_filter directly because the compressed one is not mutable. + seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{128u}, + seqan3::bin_size{1024u}, + seqan3::hash_function_count{2u}}; + + for (size_t bin_idx : std::views::iota(0, 128)) + for (size_t hash : std::views::iota(0, 128)) + ibf.emplace(hash, seqan3::bin_index{bin_idx}); + + // 2. 
Construct either the uncompressed or compressed interleaved_bloom_filter and test set with bulk_count + TypeParam ibf2{ibf}; + auto agent = ibf2.counting_agent(); + auto agent2 = ibf2.template counting_agent(); + + std::vector expected(128, 128); + EXPECT_RANGE_EQ(agent.bulk_count(std::views::iota(0u, 128u)), expected); + EXPECT_RANGE_EQ(agent2.bulk_count(std::views::iota(0u, 128u)), expected); +} + // Check special case where there is only one `1` in the bitvector. TYPED_TEST(interleaved_bloom_filter_test, counting_no_ub) { @@ -197,6 +219,30 @@ TYPED_TEST(interleaved_bloom_filter_test, counting_no_ub) EXPECT_EQ(counting, expected2); } +// Check special case where there is only one `1` in the bitvector. +TYPED_TEST(interleaved_bloom_filter_test, counting_agent_no_ub) +{ + // 1. Test uncompressed interleaved_bloom_filter directly because the compressed one is not mutable. + seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{128u}, + seqan3::bin_size{1024u}, + seqan3::hash_function_count{2u}}; + + for (size_t bin_idx : std::array{63, 127}) + for (size_t hash : std::views::iota(0, 128)) + ibf.emplace(hash, seqan3::bin_index{bin_idx}); + + // 2. Construct either the uncompressed or compressed interleaved_bloom_filter and test set with bulk_count + TypeParam ibf2{ibf}; + auto agent = ibf2.counting_agent(); + auto agent2 = ibf2.template counting_agent(); + + std::vector expected(128, 0); + expected[63] = 128; + expected[127] = 128; + EXPECT_RANGE_EQ(agent.bulk_count(std::views::iota(0u, 128u)), expected); + EXPECT_RANGE_EQ(agent2.bulk_count(std::views::iota(0u, 128u)), expected); +} + TYPED_TEST(interleaved_bloom_filter_test, increase_bin_number_to) { seqan3::interleaved_bloom_filter ibf1{seqan3::bin_count{73u}, seqan3::bin_size{1024u}};