Skip to content

Commit

Permalink
[FEATURE] counting_agent for the interleaved bloom filter
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Feb 15, 2021
1 parent 94a4e22 commit cd510f7
Show file tree
Hide file tree
Showing 6 changed files with 259 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
#### Search

* The `seqan3::fm_index_cursor` exposes its suffix array interval ([\#2076](https://github.com/seqan/seqan3/pull/2076)).
* The `seqan3::interleaved_bloom_filter` supports counting occurrences of a range of values
([\#2373](https://github.com/seqan/seqan3/pull/2373)).

## Notable Bug-fixes

Expand Down
128 changes: 123 additions & 5 deletions include/seqan3/search/dream_index/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ struct bin_index : public detail::strong_type<size_t, bin_index, detail::strong_
* To query the Interleaved Bloom Filter for a value, call seqan3::interleaved_bloom_filter::membership_agent() and use
* the returned seqan3::interleaved_bloom_filter::membership_agent.
*
* To count the occurrences of a range of values in the Interleaved Bloom Filter, call
* seqan3::interleaved_bloom_filter::counting_agent() and use
* the returned seqan3::interleaved_bloom_filter::counting_agent_type.
*
* ### Compression
*
* The Interleaved Bloom Filter can be compressed by passing `data_layout::compressed` as template argument.
Expand Down Expand Up @@ -186,6 +190,8 @@ class interleaved_bloom_filter
static constexpr data_layout data_layout_mode = data_layout_mode_;

class membership_agent; // documented upon definition below
template <std::integral value_t>
class counting_agent_type; // documented upon definition below

/*!\name Constructors, destructor and assignment
* \{
Expand Down Expand Up @@ -356,9 +362,9 @@ class interleaved_bloom_filter
/*!\name Lookup
* \{
*/
/*!\brief Returns seqan3::interleaved_bloom_filter::membership_agent to be used for lookup.
/*!\brief Returns a seqan3::interleaved_bloom_filter::membership_agent to be used for lookup.
* \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates all
* seqan3::interleaved_bloom_filter::membership_agent constructed for this Interleaved Bloom Filter.
* `seqan3::interleaved_bloom_filter::membership_agent`s constructed for this Interleaved Bloom Filter.
*
* \details
*
Expand All @@ -371,6 +377,23 @@ class interleaved_bloom_filter
{
return typename interleaved_bloom_filter<data_layout_mode>::membership_agent{*this};
}

/*!\brief Returns a seqan3::interleaved_bloom_filter::counting_agent_type to be used for counting.
* \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates all
* `seqan3::interleaved_bloom_filter::counting_agent_type`s constructed for this Interleaved Bloom Filter.
*
* \details
*
* ### Example
*
* \include test/snippet/search/dream_index/counting_agent_construction.cpp
* \sa seqan3::interleaved_bloom_filter::counting_agent_type::bulk_count
*/
template <typename value_t = uint16_t>
counting_agent_type<value_t> counting_agent() const
{
return counting_agent_type<value_t>{*this};
}
//!\}

/*!\name Capacity
Expand Down Expand Up @@ -509,7 +532,7 @@ class interleaved_bloom_filter<data_layout_mode>::membership_agent
/*!\brief Determines set membership of a given value.
* \param[in] value The raw value to process.
*
* \attention The result of this function must always be bound via reference, e.g. `auto &` to prevent copying.
* \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying.
* \attention Sequential calls to this function invalidate the previously returned reference.
*
* \details
Expand All @@ -520,8 +543,8 @@ class interleaved_bloom_filter<data_layout_mode>::membership_agent
*
* ### Thread safety
*
* Concurrent invocations of this function are not thread safe, please create a seqan3::membership_agent for each
* thread.
* Concurrent invocations of this function are not thread safe, please create a
* seqan3::interleaved_bloom_filter::membership_agent for each thread.
*/
[[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) & noexcept
{
Expand Down Expand Up @@ -714,6 +737,101 @@ class counting_vector : public std::vector<value_t>

};

/*!\brief Manages counting ranges of values for the seqan3::interleaved_bloom_filter.
 * \tparam value_t The type of the individual counters; must model std::integral.
 * \attention Calling seqan3::interleaved_bloom_filter::increase_bin_number_to invalidates the counting_agent_type.
 *
 * \details
 *
 * ### Example
 *
 * \include test/snippet/search/dream_index/counting_agent.cpp
 */
template <data_layout data_layout_mode>
template <std::integral value_t>
class interleaved_bloom_filter<data_layout_mode>::counting_agent_type
{
private:
    //!\brief The type of the augmented seqan3::interleaved_bloom_filter.
    using ibf_t = interleaved_bloom_filter<data_layout_mode>;

    //!\brief A pointer to the augmented seqan3::interleaved_bloom_filter.
    ibf_t const * ibf_ptr{nullptr};

    //!\brief Store a seqan3::interleaved_bloom_filter::membership_agent to call `bulk_contains`.
    typename ibf_t::membership_agent membership_agent;

public:
    /*!\name Constructors, destructor and assignment
     * \{
     */
    counting_agent_type() = default; //!< Defaulted.
    counting_agent_type(counting_agent_type const &) = default; //!< Defaulted.
    counting_agent_type & operator=(counting_agent_type const &) = default; //!< Defaulted.
    counting_agent_type(counting_agent_type &&) = default; //!< Defaulted.
    counting_agent_type & operator=(counting_agent_type &&) = default; //!< Defaulted.
    ~counting_agent_type() = default; //!< Defaulted.

    /*!\brief Construct a counting_agent_type for an existing seqan3::interleaved_bloom_filter.
     * \private
     * \param ibf The seqan3::interleaved_bloom_filter.
     *
     * \details
     *
     * The result buffer is sized to hold one counter per bin of `ibf`.
     */
    counting_agent_type(ibf_t const & ibf) : ibf_ptr(std::addressof(ibf)), membership_agent(ibf)
    {
        result_buffer.resize(ibf_ptr->bin_count());
    }
    //!\}

    //!\brief Stores the result of bulk_count().
    counting_vector<value_t> result_buffer;

    /*!\name Counting
     * \{
     */
    /*!\brief Counts the occurrences in each bin for all values in a range.
     * \tparam value_range_t The type of the range of values. Must model std::ranges::input_range. The value type
     *                       (`std::ranges::range_value_t`) must model std::unsigned_integral.
     * \param[in] values The range of values to process.
     * \returns A reference to the internal result buffer, holding one counter per bin.
     *
     * \attention The result of this function must always be bound via reference, e.g. `auto &`, to prevent copying.
     * \attention Sequential calls to this function invalidate the previously returned reference.
     *
     * \details
     *
     * ### Example
     *
     * \include test/snippet/search/dream_index/counting_agent.cpp
     *
     * ### Thread safety
     *
     * Concurrent invocations of this function are not thread safe, please create a
     * seqan3::interleaved_bloom_filter::counting_agent_type for each thread.
     */
    template <std::ranges::range value_range_t>
    [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) & noexcept
    {
        // A default-constructed agent has no filter attached and an empty buffer; both are usage errors.
        assert(ibf_ptr != nullptr);
        assert(result_buffer.size() == ibf_ptr->bin_count());

        static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
        static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
                      "An individual value must be an unsigned integral.");

        // Start from zero for every call: the buffer is reused between invocations.
        std::ranges::fill(result_buffer, 0);

        // Accumulate the per-bin membership result of every value into the counters.
        // NOTE(review): counters have type value_t; presumably counts beyond its maximum overflow - confirm
        // against counting_vector::operator+= and choose value_t accordingly.
        for (auto && value : values)
            result_buffer += membership_agent.bulk_contains(value);

        return result_buffer;
    }

    // `bulk_count` cannot be called on a temporary, since the object the returned reference points to
    // is immediately destroyed.
    template <std::ranges::range value_range_t>
    [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) && noexcept = delete;
    //!\}

};

//!\}

} // namespace seqan3
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,24 @@ void bulk_contains_benchmark(::benchmark::State & state)
state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values));
}

/*!\brief Benchmarks `bulk_count` of the counting agent obtained from an `ibf_type`.
 * \tparam ibf_type The filter type that is handed to `set_up`.
 * \param[in,out] state The benchmark state; its four range arguments are forwarded to `set_up`.
 *
 * \details
 *
 * Every iteration counts the complete set of hash values once. The throughput is reported
 * via the "hashes/sec" counter.
 */
template <typename ibf_type>
void bulk_count_benchmark(::benchmark::State & state)
{
    auto && [ bins, hashes, filter ] = set_up<ibf_type>(state.range(0),
                                                        state.range(1),
                                                        state.range(2),
                                                        state.range(3));
    // The first element of the set-up result is not needed for counting.
    static_cast<void>(bins);

    auto counter = filter.counting_agent();

    for (auto _ : state)
    {
        // Bind by reference: bulk_count hands out a reference to the agent's internal buffer.
        [[maybe_unused]] auto const & counts = counter.bulk_count(hashes);
    }

    state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hashes));
}

BENCHMARK_TEMPLATE(emplace_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);

Expand All @@ -90,4 +108,9 @@ BENCHMARK_TEMPLATE(bulk_contains_benchmark,
BENCHMARK_TEMPLATE(bulk_contains_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::compressed>)->Apply(arguments);

// Register bulk_count_benchmark once per data layout (uncompressed and compressed), using the shared `arguments`.
BENCHMARK_TEMPLATE(bulk_count_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);
BENCHMARK_TEMPLATE(bulk_count_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::compressed>)->Apply(arguments);

BENCHMARK_MAIN();
46 changes: 46 additions & 0 deletions test/snippet/search/dream_index/counting_agent.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/range/views/kmer_hash.hpp>
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>

using seqan3::operator""_dna4;

// Demonstrates counting k-mer occurrences per bin with a counting agent of the Interleaved Bloom Filter.
int main()
{
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{8u},
                                         seqan3::bin_size{8192u},
                                         seqan3::hash_function_count{2u}};

    auto const sequence_1 = "ACTGACTGACTGATC"_dna4;
    auto const sequence_2 = "GTGACTGACTGACTCG"_dna4;
    auto const sequence_3 = "AAAAAAACGATCGACA"_dna4;
    auto hash_adaptor = seqan3::views::kmer_hash(seqan3::ungapped{5u});

    // Insert all 5-mers of sequence_1 into bin 0
    for (auto && value : sequence_1 | hash_adaptor)
        ibf.emplace(value, seqan3::bin_index{0u});

    // Insert all 5-mers of sequence_2 into bin 4
    for (auto && value : sequence_2 | hash_adaptor)
        ibf.emplace(value, seqan3::bin_index{4u});

    // Insert all 5-mers of sequence_3 into bin 7
    for (auto && value : sequence_3 | hash_adaptor)
        ibf.emplace(value, seqan3::bin_index{7u});

    auto agent = ibf.counting_agent();

    // Count all 5-mers of sequence_1 for all bins
    seqan3::debug_stream << agent.bulk_count(sequence_1 | hash_adaptor) << '\n'; // [11,0,0,0,9,0,0,0]

    // Search for specific values
    std::vector<size_t> const values{92, 1238, 812, 81273};
    seqan3::debug_stream << agent.bulk_count(values) << '\n'; // [0,0,0,0,0,0,0,0]
    seqan3::debug_stream << agent.bulk_count(std::views::iota(0u, 1024u)) << '\n'; // [6,0,0,0,7,0,0,10]

    // The default counters are 16 bit unsigned integers.
    // An optional template parameter can be used to specify the counter type.
    auto agent2 = ibf.counting_agent<uint8_t>();
    // The counts returned by agent2 are now 8 bit unsigned integers.
    seqan3::debug_stream << agent2.bulk_count(sequence_1 | hash_adaptor) << '\n'; // [11,0,0,0,9,0,0,0]
}
18 changes: 18 additions & 0 deletions test/snippet/search/dream_index/counting_agent_construction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>

int main()
{
// Construct an Interleaved Bloom Filter to be used with the counting_agent.
seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{43u},
seqan3::bin_size{8192u},
seqan3::hash_function_count{3}};

// The counting_agent can now be constructed by calling `counting_agent` on the Interleaved Bloom Filter.
auto agent = ibf.counting_agent();

// Calling `increase_bin_number_to` invalidates the agent.
ibf.increase_bin_number_to(seqan3::bin_count{60u});

// So make sure to construct a new counting_agent.
agent = ibf.counting_agent();
}
48 changes: 47 additions & 1 deletion test/unit/search/dream_index/interleaved_bloom_filter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
#include <seqan3/test/cereal.hpp>
#include <seqan3/test/expect_range_eq.hpp>

template <typename ibf_type>
struct interleaved_bloom_filter_test : public ::testing::Test
Expand Down Expand Up @@ -43,7 +44,7 @@ TYPED_TEST(interleaved_bloom_filter_test, construction)
TypeParam ibf2{TestFixture::make_ibf(seqan3::bin_count{64u},
seqan3::bin_size{1024u},
seqan3::hash_function_count{2u})};
EXPECT_EQ(ibf1, ibf2);
EXPECT_TRUE(ibf1 == ibf2);

// bin_size parameter is too small
EXPECT_THROW((TestFixture::make_ibf(seqan3::bin_count{64u}, seqan3::bin_size{0u})), std::logic_error);
Expand Down Expand Up @@ -164,6 +165,27 @@ TYPED_TEST(interleaved_bloom_filter_test, counting)
EXPECT_EQ(counting, expected2);
}

// Checks that bulk_count reports the full count in every bin, for the default and a custom counter type.
TYPED_TEST(interleaved_bloom_filter_test, counting_agent)
{
    // 1. Test uncompressed interleaved_bloom_filter directly because the compressed one is not mutable.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{128u},
                                         seqan3::bin_size{1024u},
                                         seqan3::hash_function_count{2u}};

    // Insert the values 0..127 into each of the 128 bins. Unsigned literals avoid a signed -> size_t conversion.
    for (size_t bin_idx : std::views::iota(0u, 128u))
        for (size_t hash : std::views::iota(0u, 128u))
            ibf.emplace(hash, seqan3::bin_index{bin_idx});

    // 2. Construct either the uncompressed or compressed interleaved_bloom_filter and test set with bulk_count
    TypeParam ibf2{ibf};
    auto agent = ibf2.counting_agent();                    // default counter type
    auto agent2 = ibf2.template counting_agent<size_t>();  // user-provided counter type

    // All 128 queried values were inserted into every bin, so each bin must report 128.
    std::vector<size_t> expected(128, 128);
    EXPECT_RANGE_EQ(agent.bulk_count(std::views::iota(0u, 128u)), expected);
    EXPECT_RANGE_EQ(agent2.bulk_count(std::views::iota(0u, 128u)), expected);
}

// Check special case where there is only one `1` in the bitvector.
TYPED_TEST(interleaved_bloom_filter_test, counting_no_ub)
{
Expand Down Expand Up @@ -197,6 +219,30 @@ TYPED_TEST(interleaved_bloom_filter_test, counting_no_ub)
EXPECT_EQ(counting, expected2);
}

// Check special case where there is only one `1` in the bitvector.
TYPED_TEST(interleaved_bloom_filter_test, counting_agent_no_ub)
{
    // 1. Test uncompressed interleaved_bloom_filter directly because the compressed one is not mutable.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{128u},
                                         seqan3::bin_size{1024u},
                                         seqan3::hash_function_count{2u}};

    // Only bins 63 and 127 receive the values 0..127; all other bins stay empty.
    // Unsigned literals avoid a signed -> size_t conversion.
    for (size_t bin_idx : std::array<size_t, 2>{63, 127})
        for (size_t hash : std::views::iota(0u, 128u))
            ibf.emplace(hash, seqan3::bin_index{bin_idx});

    // 2. Construct either the uncompressed or compressed interleaved_bloom_filter and test set with bulk_count
    TypeParam ibf2{ibf};
    auto agent = ibf2.counting_agent();                    // default counter type
    auto agent2 = ibf2.template counting_agent<size_t>();  // user-provided counter type

    // Exactly the two populated bins must report all 128 values; every other bin must report 0.
    std::vector<size_t> expected(128, 0);
    expected[63] = 128;
    expected[127] = 128;
    EXPECT_RANGE_EQ(agent.bulk_count(std::views::iota(0u, 128u)), expected);
    EXPECT_RANGE_EQ(agent2.bulk_count(std::views::iota(0u, 128u)), expected);
}

TYPED_TEST(interleaved_bloom_filter_test, increase_bin_number_to)
{
seqan3::interleaved_bloom_filter ibf1{seqan3::bin_count{73u}, seqan3::bin_size{1024u}};
Expand Down

0 comments on commit cd510f7

Please sign in to comment.