diff --git a/doc/tutorial/ranges/views/minimiser/index.md b/doc/tutorial/search/views/minimiser/index.md similarity index 95% rename from doc/tutorial/ranges/views/minimiser/index.md rename to doc/tutorial/search/views/minimiser/index.md index e76f0ac650..334c4787a1 100644 --- a/doc/tutorial/ranges/views/minimiser/index.md +++ b/doc/tutorial/search/views/minimiser/index.md @@ -62,7 +62,7 @@ assignment! Task: Obtain the minimisers for "CCACGTCGACGGTT" with an ungapped shape of size 4 and a window size of 8. \endassignment \solution -\include doc/tutorial/ranges/views/minimiser/minimiser_solution1.cpp +\include doc/tutorial/search/views/minimiser/minimiser_solution1.cpp \endsolution If you have completed the assignment, you probably wonder what these large numbers mean. As explained above, the @@ -71,7 +71,7 @@ XORs the hash values with a random seed (Default: 0x8F3F73B5CF1C9ADE). How would then? Well, you just use XOR again! -\include doc/tutorial/ranges/views/minimiser/seed_example.cpp +\include doc/tutorial/search/views/minimiser/seed_example.cpp From these hash values, you can obtain the sequence they are representing by transforming the numbers to base 4. (For example, 182 is "2312" in base four and therefore represents "GTCG".) @@ -102,7 +102,7 @@ same hash value. \endassignment \solution -\include doc/tutorial/ranges/views/minimiser/minimiser_solution2.cpp +\include doc/tutorial/search/views/minimiser/minimiser_solution2.cpp \endsolution ### Ignoring the backward strand @@ -112,7 +112,7 @@ the backward strand should not be considered. If this is the desired behaviour t to be used. Unlike the `seqan3::views::minimiser_hash`, `seqan3::views::minimiser` does not hash the values for you, so you have to do this yourself. But fear not, `seqan3::views::kmer_hash` makes this really easy for you! 
-\snippet doc/tutorial/ranges/views/minimiser/minimiser_snippets.cpp minimiser +\snippet doc/tutorial/search/views/minimiser/minimiser_snippets.cpp minimiser This syntax will result in minimisers with k-mer size 4 and a window-length of 8 (5 + 4 - 1). (So, to determine the @@ -144,11 +144,11 @@ In order to ensure that this is the desired behaviour, using `seqan3::views::min Last but not least, `seqan3::views::kmer_hash` and `seqan3::views::minimiser` do not have a seed parameter. So, in order to obtain a random ordering, you have to XOR the view yourself. This can be done with the following command: -\snippet doc/tutorial/ranges/views/minimiser/minimiser_snippets.cpp minimiser_seed +\snippet doc/tutorial/search/views/minimiser/minimiser_snippets.cpp minimiser_seed \assignment{Assignment 3: Fun with minimisers III} Task: Repeat assignment 2 but this time do not consider the backward strand. \endassignment \solution -\include doc/tutorial/ranges/views/minimiser/minimiser_solution3.cpp +\include doc/tutorial/search/views/minimiser/minimiser_solution3.cpp \endsolution diff --git a/doc/tutorial/ranges/views/minimiser/minimiser.png b/doc/tutorial/search/views/minimiser/minimiser.png similarity index 100% rename from doc/tutorial/ranges/views/minimiser/minimiser.png rename to doc/tutorial/search/views/minimiser/minimiser.png diff --git a/doc/tutorial/ranges/views/minimiser/minimiser_snippets.cpp b/doc/tutorial/search/views/minimiser/minimiser_snippets.cpp similarity index 89% rename from doc/tutorial/ranges/views/minimiser/minimiser_snippets.cpp rename to doc/tutorial/search/views/minimiser/minimiser_snippets.cpp index 0e876c5152..ca4bf844ec 100644 --- a/doc/tutorial/ranges/views/minimiser/minimiser_snippets.cpp +++ b/doc/tutorial/search/views/minimiser/minimiser_snippets.cpp @@ -1,9 +1,10 @@ #include -#include #include // include all of the standard library's views +#include + #include -#include -#include +#include +#include using seqan3::operator""_dna4; diff --git 
a/doc/tutorial/ranges/views/minimiser/minimiser_solution1.cpp b/doc/tutorial/search/views/minimiser/minimiser_solution1.cpp similarity index 92% rename from doc/tutorial/ranges/views/minimiser/minimiser_solution1.cpp rename to doc/tutorial/search/views/minimiser/minimiser_solution1.cpp index c4bf3f877b..b57ce5138d 100644 --- a/doc/tutorial/ranges/views/minimiser/minimiser_solution1.cpp +++ b/doc/tutorial/search/views/minimiser/minimiser_solution1.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using seqan3::operator""_dna4; diff --git a/doc/tutorial/ranges/views/minimiser/minimiser_solution2.cpp b/doc/tutorial/search/views/minimiser/minimiser_solution2.cpp similarity index 96% rename from doc/tutorial/ranges/views/minimiser/minimiser_solution2.cpp rename to doc/tutorial/search/views/minimiser/minimiser_solution2.cpp index 012fc00cc7..a06c76a333 100644 --- a/doc/tutorial/ranges/views/minimiser/minimiser_solution2.cpp +++ b/doc/tutorial/search/views/minimiser/minimiser_solution2.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using seqan3::operator""_dna4; using seqan3::operator""_shape; diff --git a/doc/tutorial/ranges/views/minimiser/minimiser_solution3.cpp b/doc/tutorial/search/views/minimiser/minimiser_solution3.cpp similarity index 94% rename from doc/tutorial/ranges/views/minimiser/minimiser_solution3.cpp rename to doc/tutorial/search/views/minimiser/minimiser_solution3.cpp index 1ecce5a1e4..24487a3f31 100644 --- a/doc/tutorial/ranges/views/minimiser/minimiser_solution3.cpp +++ b/doc/tutorial/search/views/minimiser/minimiser_solution3.cpp @@ -1,7 +1,7 @@ #include #include -#include -#include +#include +#include using seqan3::operator""_dna4; using seqan3::operator""_shape; diff --git a/doc/tutorial/ranges/views/minimiser/seed_example.cpp b/doc/tutorial/search/views/minimiser/seed_example.cpp similarity index 95% rename from doc/tutorial/ranges/views/minimiser/seed_example.cpp rename to doc/tutorial/search/views/minimiser/seed_example.cpp 
index 5143f341d7..cdb6ff4670 100644 --- a/doc/tutorial/ranges/views/minimiser/seed_example.cpp +++ b/doc/tutorial/search/views/minimiser/seed_example.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using seqan3::operator""_dna4; diff --git a/include/seqan3/range/views/detail.hpp b/include/seqan3/range/views/detail.hpp index 52443b7747..6297ac5011 100644 --- a/include/seqan3/range/views/detail.hpp +++ b/include/seqan3/range/views/detail.hpp @@ -352,9 +352,9 @@ class adaptor_for_view_without_args : public adaptor_base - * \brief Provides seqan3::views::kmer_hash. + * \brief [DEPRECATED] Provides seqan3::views::kmer_hash. */ #pragma once -#include -#include -#include -#include +#include -namespace seqan3::detail -{ -// --------------------------------------------------------------------------------------------------------------------- -// kmer_hash_view class -// --------------------------------------------------------------------------------------------------------------------- - -/*!\brief The type returned by seqan3::views::kmer_hash. - * \tparam urng_t The type of the underlying ranges, must model std::forward_range, the reference type must model - * seqan3::semialphabet. - * \implements std::ranges::view - * \implements std::ranges::random_access_range - * \implements std::ranges::sized_range - * \ingroup views - * - * \details - * - * Note that most members of this class are generated by ranges::view_interface which is not yet documented here. - */ -template -class kmer_hash_view : public std::ranges::view_interface> -{ -private: - static_assert(std::ranges::forward_range, "The kmer_hash_view only works on forward_ranges"); - static_assert(semialphabet>, - "The reference type of the underlying range must model seqan3::semialphabet."); - - //!\brief The underlying range. - urng_t urange; - - //!\brief The shape to use. 
- shape shape_; - - template - class basic_iterator; - -public: - /*!\name Constructors, destructor and assignment - * \{ - */ - kmer_hash_view() = default; //!< Defaulted. - kmer_hash_view(kmer_hash_view const & rhs) = default; //!< Defaulted. - kmer_hash_view(kmer_hash_view && rhs) = default; //!< Defaulted. - kmer_hash_view & operator=(kmer_hash_view const & rhs) = default; //!< Defaulted. - kmer_hash_view & operator=(kmer_hash_view && rhs) = default; //!< Defaulted. - ~kmer_hash_view() = default; //!< Defaulted. - - /*!\brief Construct from a view and a given shape. - * \throws std::invalid_argument if hashes resulting from the shape/alphabet combination cannot be represented in - * `uint64_t`, i.e. \f$s>\frac{64}{\log_2\sigma}\f$ with shape size \f$s\f$ and alphabet size \f$\sigma\f$. - */ - kmer_hash_view(urng_t urange_, shape const & s_) : urange{std::move(urange_)}, shape_{s_} - { - if (shape_.count() > (64 / std::log2(alphabet_size>))) - { - throw std::invalid_argument{"The chosen shape/alphabet combination is not valid. " - "The alphabet or shape size must be reduced."}; - } - } - - /*!\brief Construct from a non-view that can be view-wrapped and a given shape. - * \throws std::invalid_argument if hashes resulting from the shape/alphabet combination cannot be represented in - * `uint64_t`, i.e. \f$s>\frac{64}{\log_2\sigma}\f$ with shape size \f$s\f$ and alphabet size \f$\sigma\f$. - */ - template - //!\cond - requires (!std::same_as, kmer_hash_view>) && - std::ranges::viewable_range && - std::constructible_from>> - //!\endcond - kmer_hash_view(rng_t && urange_, shape const & s_) : - urange{std::views::all(std::forward(urange_))}, shape_{s_} - { - if (shape_.count() > (64 / std::log2(alphabet_size>))) - { - throw std::invalid_argument{"The chosen shape/alphabet combination is not valid. " - "The alphabet or shape size must be reduced."}; - } - } - //!\} - - /*!\name Iterators - * \{ - */ - /*!\brief Returns an iterator to the first element of the range. 
- * \returns Iterator to the first element. - * - * \details - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto begin() noexcept - { - return basic_iterator{std::ranges::begin(urange), std::ranges::end(urange), shape_}; - } - - //!\copydoc begin() - auto begin() const noexcept - //!\cond - requires const_iterable_range - //!\endcond - { - return basic_iterator{std::ranges::cbegin(urange), std::ranges::cend(urange), shape_}; - } - - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * \details - * - * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto end() noexcept - { - // Assigning the end iterator to the text_right iterator of the basic_iterator only works for common ranges. - if constexpr (std::ranges::common_range) - return basic_iterator{std::ranges::begin(urange), std::ranges::end(urange), shape_, true}; - else - return std::ranges::end(urange); - } - - //!\copydoc end() - auto end() const noexcept - //!\cond - requires const_iterable_range - //!\endcond - { - // Assigning the end iterator to the text_right iterator of the basic_iterator only works for common ranges. - if constexpr (std::ranges::common_range) - return basic_iterator{std::ranges::cbegin(urange), std::ranges::cend(urange), shape_, true}; - else - return std::ranges::cend(urange); - } - //!\} - - /*!\brief Returns the size of the range, if the underlying range is a std::ranges::sized_range. - * \returns Size of range. 
- */ - auto size() - //!\cond - requires std::ranges::sized_range - //!\endcond - { - using size_type = std::ranges::range_size_t; - return std::max(std::ranges::size(urange) + 1, shape_.size()) - shape_.size(); - } - - //!\copydoc size() - auto size() const - //!\cond - requires std::ranges::sized_range - //!\endcond - { - using size_type = std::ranges::range_size_t; - return std::max(std::ranges::size(urange) + 1, shape_.size()) - shape_.size(); - } -}; - -/*!\brief Iterator for calculating hash values via a given seqan3::shape. - * \tparam urng_t Type of the text. Must model std::forward_range. Reference type must model seqan3::semialphabet. - * - * \details - * - * The basic_iterator can be used to iterate over the hash values of a text. The basic_iterator needs an iterator of - * the text and a seqan3::shape that defines how to hash the text. - * - * Depending on the type of the iterator passed to the basic_iterator, different functionality is available: - * - * | Concept modelled by passed text iterator | Available functions | - * |------------------------------------------|---------------------------------| - * | std::forward_iterator | \ref basic_iterator_comparison_kmer_hash "Comparison operators"
\ref operator++ "Pre-increment (++it)"
\ref operator++(int) "Post-increment (it++)"
\ref operator* "Indirection operator (*it)" | - * | std::bidirectional_iterator | \ref operator-- "Pre-decrement (--it)"
\ref operator--(int) "Post-decrement (it--)" | - * | std::random_access_iterator | \ref operator+= "Forward (it +=)"
\ref operator+ "Forward copy (it +)"
\ref operator-= "Decrement(it -=)"
\ref basic_iterator_operator-decrement "Decrement copy (it -)"
\ref basic_iterator_operator-difference "Difference (it1 - it2)"
\ref operator[] "Subscript (it[])" | - * - * When using a gapped seqan3::shape, the `0`s of the seqan3::shape are virtually removed from the hashed k-mer. - * Note that any shape is expected to start with a `1` and end with a `1`. - * - * \experimentalapi - * - * ### Implementation detail - * - * To avoid dereferencing the sentinel when iterating, the basic_iterator computes the hash value up until - * the second to last position and performs the addition of the last position upon - * access (\ref operator* and \ref operator[]). - */ -template -template -class kmer_hash_view::basic_iterator -{ -private: - //!\brief The iterator type of the underlying range. - using it_t = maybe_const_iterator_t; - //!\brief The sentinel type of the underlying range. - using sentinel_t = maybe_const_sentinel_t; - - template - friend class basic_iterator; - -public: - /*!\name Associated types - * \{ - */ - //!\brief Type for distances between iterators. - using difference_type = typename std::iter_difference_t; - //!\brief Value type of this iterator. - using value_type = size_t; - //!\brief The pointer type. - using pointer = void; - //!\brief Reference to `value_type`. - using reference = value_type; - //!\brief Tag this class as input iterator. - using iterator_category = detail::iterator_category_tag_t; - //!\brief Tag this class depending on which concept `it_t` models. - using iterator_concept = std::conditional_t, - typename std::random_access_iterator_tag, - detail::iterator_concept_tag_t>; - //!\} - - /*!\name Constructors, destructor and assignment - * \{ - */ - constexpr basic_iterator() = default; //!< Defaulted. - constexpr basic_iterator(basic_iterator const &) = default; //!< Defaulted. - constexpr basic_iterator(basic_iterator &&) = default; //!< Defaulted. - constexpr basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted. - constexpr basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted. 
- ~basic_iterator() = default; //!< Defaulted. - - //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range. - constexpr basic_iterator(basic_iterator const & it) noexcept - //!\cond - requires const_range - //!\endcond - : hash_value{std::move(it.hash_value)}, - roll_factor{std::move(it.roll_factor)}, - shape_{std::move(it.shape_)}, - text_left{std::move(it.text_left)}, - text_right{std::move(it.text_right)} - {} - - /*!\brief Construct from a given iterator on the text and a seqan3::shape. - * /param[in] it_start Iterator pointing to the first position of the text. - * /param[in] it_end Sentinel pointing to the end of the text. - * /param[in] s_ The seqan3::shape that determines which positions participate in hashing. - * - * \details - * - * ### Complexity - * - * Linear in size of shape. - */ - basic_iterator(it_t it_start, sentinel_t it_end, shape s_) : - shape_{s_}, text_left{it_start}, text_right{std::ranges::next(text_left, shape_.size() - 1, it_end)} - { - assert(std::ranges::size(shape_) > 0); - - // shape size = 3 - // Text: 1 2 3 4 5 6 7 8 9 - // text_left: ^ - // text_right: ^ - // distance(text_left, text_right) = 2 - if (shape_.size() <= std::ranges::distance(text_left, text_right) + 1) - { - roll_factor = pow(sigma, static_cast(std::ranges::size(shape_) - 1)); - hash_full(); - } - } - - /*!\brief Construct from a given iterator on the text and a seqan3::shape. - * /param[in] it_start Iterator pointing to the first position of the text. - * /param[in] it_end Sentinel pointing to the end of the text. - * /param[in] s_ The seqan3::shape that determines which positions participate in hashing. - * /param[in] is_end Indicates that this iterator should point to the end of the text. - * - * \details - * - * If we have a common_range as underlying range, we want to preserve this property. - * This means that we need to have a basic_iterator that can act as end for the kmer_hash_view, i.e. 
- * the text_right iterator is equal to the end iterator of the underlying range. - * However, we still need to do some initialisation via hash_full: - * When using `std::views::reverse`, we start iterating from the end and decrement the iterator. - * After calling hash_full, we need to reset our text_right iterator to point to the end again. - * - * Another difference to the other constructor is that we need to do some work to determine the position of - * the text_left iterator. Note that we use `std::ranges::next` instead of `std::ranges::prev` because the latter - * only works for bidirectional ranges. * - * - * ### Complexity - * - * Linear in size of shape. - */ - basic_iterator(it_t it_start, sentinel_t it_end, shape s_, bool SEQAN3_DOXYGEN_ONLY(is_end)) : shape_{s_} - { - assert(std::ranges::size(shape_) > 0); - - auto urange_size = std::ranges::distance(it_start, it_end); - auto step = (shape_.size() > urange_size + 1) ? 0 : urange_size - shape_.size() + 1; - text_left = std::ranges::next(it_start, step, it_end); - - // shape size = 3 - // Text: 1 2 3 4 5 6 7 8 9 - // text_left: ^ - // text_right: ^ - // distance(text_left, text_right) = 2 - if (shape_.size() <= std::ranges::distance(text_left, it_end) + 1) - { - roll_factor = pow(sigma, static_cast(std::ranges::size(shape_) - 1)); - hash_full(); - } - - text_right = it_end; - } - //!\} - - //!\anchor basic_iterator_comparison_kmer_hash - //!\name Comparison operators - //!\{ - - //!\brief Compare to iterator on text. - friend bool operator==(basic_iterator const & lhs, sentinel_t const & rhs) noexcept - { - return lhs.text_right == rhs; - } - - //!\brief Compare to iterator on text. - friend bool operator==(sentinel_t const & lhs, basic_iterator const & rhs) noexcept - { - return lhs == rhs.text_right; - } - - //!\brief Compare to another basic_iterator. 
- friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - { - return std::tie(lhs.text_right, lhs.shape_) == std::tie(rhs.text_right, rhs.shape_); - } - - //!\brief Compare to iterator on text. - friend bool operator!=(basic_iterator const & lhs, sentinel_t const & rhs) noexcept - { - return !(lhs == rhs); - } - - //!\brief Compare to iterator on text. - friend bool operator!=(sentinel_t const & lhs, basic_iterator const & rhs) noexcept - { - return !(lhs == rhs); - } - - //!\brief Compare to another basic_iterator. - friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - { - return !(lhs == rhs); - } - - //!\brief Compare to another basic_iterator. - friend bool operator<(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - { - return (lhs.shape_ <= rhs.shape_) && (lhs.text_right < rhs.text_right); - } - - //!\brief Compare to another basic_iterator. - friend bool operator>(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - { - return (lhs.shape_ >= rhs.shape_) && (lhs.text_right > rhs.text_right); - } - - //!\brief Compare to another basic_iterator. - friend bool operator<=(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - { - return (lhs.shape_ <= rhs.shape_) && (lhs.text_right <= rhs.text_right); - } - - //!\brief Compare to another basic_iterator. - friend bool operator>=(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - { - return (lhs.shape_ >= rhs.shape_) && (lhs.text_right >= rhs.text_right); - } - - //!\} - - //!\brief Pre-increment. - basic_iterator & operator++() noexcept - { - hash_forward(); - return *this; - } - - //!\brief Post-increment. - basic_iterator operator++(int) noexcept - { - basic_iterator tmp{*this}; - hash_forward(); - return tmp; - } - - /*!\brief Pre-decrement. - * \attention This function is only available if `it_t` models std::bidirectional_iterator. 
- */ - basic_iterator & operator--() noexcept - //!\cond - requires std::bidirectional_iterator - //!\endcond - { - hash_backward(); - return *this; - } - - /*!\brief Post-decrement. - * \attention This function is only available if `it_t` models std::bidirectional_iterator. - */ - basic_iterator operator--(int) noexcept - //!\cond - requires std::bidirectional_iterator - //!\endcond - { - basic_iterator tmp{*this}; - hash_backward(); - return tmp; - } - - /*!\brief Forward this iterator. - * \attention This function is only available if `it_t` models std::random_access_iterator. - */ - basic_iterator & operator+=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - hash_forward(skip); - return *this; - } - - /*!\brief Forward copy of this iterator. - * \attention This function is only available if `it_t` models std::random_access_iterator. - */ - basic_iterator operator+(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp += skip; - } - - /*!\brief Non-member operator+ delegates to non-friend operator+. - * \attention This function is only available if `it_t` models std::random_access_iterator. - */ - friend basic_iterator operator+(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it + skip; - } - - /*!\brief Decrement iterator by `skip`. - * \attention This function is only available if `it_t` models std::random_access_iterator. - */ - basic_iterator & operator-=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - hash_backward(skip); - return *this; - } - - /*!\anchor basic_iterator_operator-decrement - * \brief Return decremented copy of this iterator. - * \attention This function is only available if `it_t` models std::random_access_iterator. 
- */ - basic_iterator operator-(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp -= skip; - } - - /*!\brief Non-member operator- delegates to non-friend operator-. - * \attention This function is only available if `it_t` models std::random_access_iterator. - */ - friend basic_iterator operator-(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it - skip; - } - - /*!\anchor basic_iterator_operator-difference - * \brief Return offset between two iterator's positions. - * \attention This function is only available if `it_t` models std::random_access_iterator. - */ - friend difference_type operator-(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return static_cast(lhs.text_right - rhs.text_right); - } - - /*!\brief Return offset between remote sentinel's position and this. - * \attention This function is only available if sentinel_t and it_t model std::sized_sentinel_for. - */ - friend difference_type operator-(sentinel_t const & lhs, basic_iterator const & rhs) noexcept - //!\cond - requires std::sized_sentinel_for - //!\endcond - { - return static_cast(lhs - rhs.text_right); - } - - /*!\brief Return offset this and remote sentinel's position. - * \attention This function is only available if it_t and sentinel_t model std::sized_sentinel_for. - */ - friend difference_type operator-(basic_iterator const & lhs, sentinel_t const & rhs) noexcept - //!\cond - requires std::sized_sentinel_for - //!\endcond - { - return static_cast(lhs.text_right - rhs); - } - - /*!\brief Move the iterator by a given offset and return the corresponding hash value. - * \attention This function is only available if `it_t` models std::random_access_iterator. 
- */ - reference operator[](difference_type const n) const - //!\cond - requires std::random_access_iterator - //!\endcond - { - return *(*this + n); - } - - //!\brief Return the hash value. - value_type operator*() const noexcept - { - return hash_value + to_rank(*text_right); - } - -private: - //!\brief The alphabet type of the passed iterator. - using alphabet_t = std::iter_value_t; - - //!\brief The alphabet size. - static constexpr auto const sigma{alphabet_size}; - - //!\brief The hash value. - size_t hash_value{0}; - - //!\brief The factor for the left most position of the hash value. - size_t roll_factor{0}; - - //!\brief The shape to use. - shape shape_; - - //!\brief Iterator to the leftmost position of the k-mer. - it_t text_left; - - //!\brief Iterator to the rightmost position of the k-mer. - it_t text_right; - - //!\brief Increments iterator by 1. - void hash_forward() - { - if (shape_.all()) - { - hash_roll_forward(); - } - else - { - std::ranges::advance(text_left, 1); - hash_full(); - } - } - - /*!\brief Increments iterator by `skip`. - * \param skip Amount to increment. - * \attention This function is only available if `it_t` models std::random_access_iterator. - */ - void hash_forward(difference_type const skip) - //!\cond - requires std::random_access_iterator - //!\endcond - { - std::ranges::advance(text_left, skip); - hash_full(); - } - - /*!\brief Decrements iterator by 1. - * \attention This function is only available if `it_t` models std::bidirectional_iterator. - */ - void hash_backward() - //!\cond - requires std::bidirectional_iterator - //!\endcond - { - if (shape_.all()) - { - hash_roll_backward(); - } - else - { - std::ranges::advance(text_left, -1); - hash_full(); - } - } - - /*!\brief Decrements iterator by `skip`. - * \param skip Amount to decrement. - * \attention This function is only available if `it_t` models std::bidirectional_iterator. 
- */ - void hash_backward(difference_type const skip) - { - std::ranges::advance(text_left, -skip); - hash_full(); - } - - //!\brief Calculates a hash value by explicitly looking at each position. - void hash_full() - { - text_right = text_left; - hash_value = 0; - - for (size_t i{0}; i < shape_.size() - 1u; ++i) - { - hash_value += shape_[i] * to_rank(*text_right); - hash_value *= shape_[i] ? sigma : 1; - std::ranges::advance(text_right, 1); - } - - } - - //!\brief Calculates the next hash value via rolling hash. - void hash_roll_forward() - { - hash_value -= to_rank(*(text_left)) * roll_factor; - hash_value += to_rank(*(text_right)); - hash_value *= sigma; - - std::ranges::advance(text_left, 1); - std::ranges::advance(text_right, 1); - } - - /*!\brief Calculates the previous hash value via rolling hash. - * \attention This function is only available if `it_t` models std::bidirectional_iterator. - */ - void hash_roll_backward() - //!\cond - requires std::bidirectional_iterator - //!\endcond - { - std::ranges::advance(text_left, -1); - std::ranges::advance(text_right, -1); - - hash_value /= sigma; - hash_value -= to_rank(*(text_right)); - hash_value += to_rank(*(text_left)) * roll_factor; - } -}; - -//!\brief A deduction guide for the view class template. -template -kmer_hash_view(rng_t &&, shape const & shape_) -> kmer_hash_view>; - -// --------------------------------------------------------------------------------------------------------------------- -// kmer_hash_fn (adaptor definition) -// --------------------------------------------------------------------------------------------------------------------- - -//![adaptor_def] -//!\brief views::kmer_hash's range adaptor object type (non-closure). -struct kmer_hash_fn -{ - //!\brief Store the shape and return a range adaptor closure object. 
- constexpr auto operator()(shape const & shape_) const - { - return adaptor_from_functor{*this, shape_}; - } - - /*!\brief Call the view's constructor with the underlying view and a seqan3::shape as argument. - * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type - * of the range must model seqan3::semialphabet. - * \param[in] shape_ The seqan3::shape to use for hashing. - * \throws std::invalid_argument if resulting hash values would be too big for a 64 bit integer. - * \returns A range of converted elements. - */ - template - constexpr auto operator()(urng_t && urange, shape const & shape_) const - { - static_assert(std::ranges::viewable_range, - "The range parameter to views::kmer_hash cannot be a temporary of a non-view range."); - static_assert(std::ranges::forward_range, - "The range parameter to views::kmer_hash must model std::ranges::forward_range."); - static_assert(semialphabet>, - "The range parameter to views::kmer_hash must be over elements of seqan3::semialphabet."); - - return kmer_hash_view{std::forward(urange), shape_}; - } -}; -//![adaptor_def] - -} // namespace seqan3::detail - -namespace seqan3::views -{ - -/*!\name Alphabet related views - * \{ - */ - -/*!\brief Computes hash values for each position of a range via a given shape. - * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is - * omitted in pipe notation] - * \param[in] urange The range being processed. [parameter is omitted in pipe notation] - * \param[in] shape The seqan3::shape that determines how to compute the hash value. - * \returns A range of std::size_t where each value is the hash of the resp. k-mer. - * See below for the properties of the returned range. 
- * \ingroup views - * - * \details - * - * \attention - * For the alphabet size \f$\sigma\f$ of the alphabet of `urange` and the number of 1s \f$s\f$ of `shape` it must hold - * that \f$s>\frac{64}{\log_2\sigma}\f$, i.e. hashes resulting from the shape/alphabet combination can be represented - * in an `uint64_t`. - * - * ### View properties - * - * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | - * |----------------------------------|:----------------------------------:|:--------------------------------:| - * | std::ranges::input_range | *required* | *preserved* | - * | std::ranges::forward_range | *required* | *preserved* | - * | std::ranges::bidirectional_range | | *preserved* | - * | std::ranges::random_access_range | | *preserved* | - * | std::ranges::contiguous_range | | *lost* | - * | | | | - * | std::ranges::viewable_range | *required* | *guaranteed* | - * | std::ranges::view | | *guaranteed* | - * | std::ranges::sized_range | | *preserved* | - * | std::ranges::common_range | | *preserved* | - * | std::ranges::output_range | | *lost* | - * | seqan3::const_iterable_range | | *preserved* | - * | | | | - * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t | - * - * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. 
- * - * ### Example - * - * \include test/snippet/range/views/kmer_hash.cpp - * - * \hideinitializer - */ -inline constexpr auto kmer_hash = detail::kmer_hash_fn{}; - -//!\} - -} // namespace seqan3::views +SEQAN3_DEPRECATED_HEADER( + "This header is deprecated and will be removed in SeqAn-3.1.0; Please #include instead.") diff --git a/include/seqan3/range/views/minimiser.hpp b/include/seqan3/range/views/minimiser.hpp index 72d9f35ec9..63a9ca6cec 100644 --- a/include/seqan3/range/views/minimiser.hpp +++ b/include/seqan3/range/views/minimiser.hpp @@ -7,584 +7,12 @@ /*!\file * \author Mitra Darvish - * \brief Provides seqan3::views::minimiser. + * \brief [DEPRECATED] Provides seqan3::views::minimiser. */ #pragma once -#include -#include +#include -#include -#include -#include -#include - -namespace seqan3::detail -{ -// --------------------------------------------------------------------------------------------------------------------- -// minimiser_view class -// --------------------------------------------------------------------------------------------------------------------- - -/*!\brief The type returned by seqan3::views::minimiser. - * \tparam urng1_t The type of the underlying range, must model std::ranges::forward_range, the reference type must - * model std::totally_ordered. The typical use case is that the reference type is the result of - * seqan3::kmer_hash. - * \tparam urng2_t The type of the second underlying range, must model std::ranges::forward_range, the reference type - * must model std::totally_ordered. If only one range is provided this defaults to - * std::ranges::empty_view. - * \implements std::ranges::view - * \ingroup views - * - * \details - * - * See seqan3::views::minimiser for a detailed explanation on minimizers. - * - * \note Most members of this class are generated by std::ranges::view_interface which is not yet documented here. 
- * - * \sa seqan3::views::minimiser - */ -template > -class minimiser_view : public std::ranges::view_interface> -{ -private: - static_assert(std::ranges::forward_range, "The minimiser_view only works on forward_ranges."); - static_assert(std::ranges::forward_range, "The minimiser_view only works on forward_ranges."); - static_assert(std::totally_ordered>, - "The reference type of the underlying range must model std::totally_ordered."); - - //!\brief The default argument of the second range. - using default_urng2_t = std::ranges::empty_view; - - //!\brief Boolean variable, which is true, when second range is not of empty type. - static constexpr bool second_range_is_given = !std::same_as; - - static_assert(!second_range_is_given || std::totally_ordered_with, - std::ranges::range_reference_t>, - "The reference types of the underlying ranges must model std::totally_ordered_with."); - - //!\brief Whether the given ranges are const_iterable - static constexpr bool const_iterable = seqan3::const_iterable_range && - seqan3::const_iterable_range; - - //!\brief The first underlying range. - urng1_t urange1{}; - //!\brief The second underlying range. - urng2_t urange2{}; - - //!\brief The number of values in one window. - size_t window_size{}; - - template - class basic_iterator; - - //!\brief The sentinel type of the minimiser_view. - using sentinel = std::default_sentinel_t; - -public: - /*!\name Constructors, destructor and assignment - * \{ - */ - minimiser_view() = default; //!< Defaulted. - minimiser_view(minimiser_view const & rhs) = default; //!< Defaulted. - minimiser_view(minimiser_view && rhs) = default; //!< Defaulted. - minimiser_view & operator=(minimiser_view const & rhs) = default; //!< Defaulted. - minimiser_view & operator=(minimiser_view && rhs) = default; //!< Defaulted. - ~minimiser_view() = default; //!< Defaulted. - - /*!\brief Construct from a view and a given number of values in one window. - * \param[in] urange1 The input range to process. 
Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_size The number of values in one window. - */ - minimiser_view(urng1_t urange1, size_t const window_size) : - minimiser_view{std::move(urange1), default_urng2_t{}, window_size} - {} - - /*!\brief Construct from a non-view that can be view-wrapped and a given number of values in one window. - * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible - from urng1_t. - * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_size The number of values in one window. - */ - template - //!\cond - requires (std::ranges::viewable_range && - std::constructible_from>>) - //!\endcond - minimiser_view(other_urng1_t && urange1, size_t const window_size) : - urange1{std::views::all(std::forward(urange1))}, - urange2{default_urng2_t{}}, - window_size{window_size} - {} - - /*!\brief Construct from two views and a given number of values in one window. - * \param[in] urange1 The first input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_size The number of values in one window. - */ - minimiser_view(urng1_t urange1, urng2_t urange2, size_t const window_size) : - urange1{std::move(urange1)}, - urange2{std::move(urange2)}, - window_size{window_size} - { - if constexpr (second_range_is_given) - { - if (std::ranges::distance(urange1) != std::ranges::distance(urange2)) - throw std::invalid_argument{"The two ranges do not have the same size."}; - } - } - - /*!\brief Construct from two non-views that can be view-wrapped and a given number of values in one window. - * \tparam other_urng1_t The type of another urange. 
Must model std::ranges::viewable_range and be constructible - from urng1_t. - * \tparam other_urng2_t The type of another urange. Must model std::ranges::viewable_range and be constructible - from urng2_t. - * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_size The number of values in one window. - */ - template - //!\cond - requires (std::ranges::viewable_range && - std::constructible_from> && - std::ranges::viewable_range && - std::constructible_from>) - //!\endcond - minimiser_view(other_urng1_t && urange1, other_urng2_t && urange2, size_t const window_size) : - urange1{std::views::all(std::forward(urange1))}, - urange2{std::views::all(std::forward(urange2))}, - window_size{window_size} - { - if constexpr (second_range_is_given) - { - if (std::ranges::distance(urange1) != std::ranges::distance(urange2)) - throw std::invalid_argument{"The two ranges do not have the same size."}; - } - } - //!\} - - /*!\name Iterators - * \{ - */ - /*!\brief Returns an iterator to the first element of the range. - * \returns Iterator to the first element. - * - * \details - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * Strong exception guarantee. - */ - basic_iterator begin() - { - return {std::ranges::begin(urange1), - std::ranges::end(urange1), - std::ranges::begin(urange2), - window_size}; - } - - //!\copydoc begin() - basic_iterator begin() const - //!\cond - requires const_iterable - //!\endcond - { - return {std::ranges::cbegin(urange1), - std::ranges::cend(urange1), - std::ranges::cbegin(urange2), - window_size}; - } - - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. 
- * - * \details - * - * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - sentinel end() const - { - return {}; - } - //!\} -}; - -//!\brief Iterator for calculating minimisers. -template -template -class minimiser_view::basic_iterator -{ -private: - //!\brief The sentinel type of the first underlying range. - using urng1_sentinel_t = maybe_const_sentinel_t; - //!\brief The iterator type of the first underlying range. - using urng1_iterator_t = maybe_const_iterator_t; - //!\brief The iterator type of the second underlying range. - using urng2_iterator_t = maybe_const_iterator_t; - - template - friend class basic_iterator; - -public: - /*!\name Associated types - * \{ - */ - //!\brief Type for distances between iterators. - using difference_type = std::ranges::range_difference_t; - //!\brief Value type of this iterator. - using value_type = std::ranges::range_value_t; - //!\brief The pointer type. - using pointer = void; - //!\brief Reference to `value_type`. - using reference = value_type; - //!\brief Tag this class as a forward iterator. - using iterator_category = std::forward_iterator_tag; - //!\brief Tag this class as a forward iterator. - using iterator_concept = iterator_category; - //!\} - - /*!\name Constructors, destructor and assignment - * \{ - */ - basic_iterator() = default; //!< Defaulted. - basic_iterator(basic_iterator const &) = default; //!< Defaulted. - basic_iterator(basic_iterator &&) = default; //!< Defaulted. - basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted. - basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted. - ~basic_iterator() = default; //!< Defaulted. - - //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range. 
- basic_iterator(basic_iterator const & it) - //!\cond - requires const_range - //!\endcond - : minimiser_value{std::move(it.minimiser_value)}, - urng1_iterator{std::move(it.urng1_iterator)}, - urng1_sentinel{std::move(it.urng1_sentinel)}, - urng2_iterator{std::move(it.urng2_iterator)}, - window_values{std::move(it.window_values)} - {} - - /*!\brief Construct from begin and end iterators of a given range over std::totally_ordered values, and the number - of values per window. - * \param[in] urng1_iterator Iterator pointing to the first position of the first std::totally_ordered range. - * \param[in] urng1_sentinel Iterator pointing to the last position of the first std::totally_ordered range. - * \param[in] urng2_iterator Iterator pointing to the first position of the second std::totally_ordered range. - * \param[in] window_size The number of values in one window. - * - * \details - * - * Looks at the number of values per window in two ranges, returns the smallest between both as minimiser and - * shifts then by one to repeat this action. If a minimiser in consecutive windows is the same, it is returned only - * once. - */ - basic_iterator(urng1_iterator_t urng1_iterator, - urng1_sentinel_t urng1_sentinel, - urng2_iterator_t urng2_iterator, - size_t window_size) : - urng1_iterator{std::move(urng1_iterator)}, - urng1_sentinel{std::move(urng1_sentinel)}, - urng2_iterator{std::move(urng2_iterator)} - { - size_t size = std::ranges::distance(urng1_iterator, urng1_sentinel); - window_size = std::min(window_size, size); - - window_first(window_size); - } - //!\} - - //!\anchor basic_iterator_comparison - //!\name Comparison operators - //!\{ - - //!\brief Compare to another basic_iterator. 
- friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) - { - return (lhs.urng1_iterator == rhs.urng1_iterator) && - (rhs.urng2_iterator == rhs.urng2_iterator) && - (lhs.window_values.size() == rhs.window_values.size()); - } - - //!\brief Compare to another basic_iterator. - friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs) - { - return !(lhs == rhs); - } - - //!\brief Compare to the sentinel of the minimiser_view. - friend bool operator==(basic_iterator const & lhs, sentinel const &) - { - return lhs.urng1_iterator == lhs.urng1_sentinel; - } - - //!\brief Compare to the sentinel of the minimiser_view. - friend bool operator==(sentinel const & lhs, basic_iterator const & rhs) - { - return rhs == lhs; - } - - //!\brief Compare to the sentinel of the minimiser_view. - friend bool operator!=(sentinel const & lhs, basic_iterator const & rhs) - { - return !(lhs == rhs); - } - - //!\brief Compare to the sentinel of the minimiser_view. - friend bool operator!=(basic_iterator const & lhs, sentinel const & rhs) - { - return !(lhs == rhs); - } - //!\} - - //!\brief Pre-increment. - basic_iterator & operator++() noexcept - { - next_unique_minimiser(); - return *this; - } - - //!\brief Post-increment. - basic_iterator operator++(int) noexcept - { - basic_iterator tmp{*this}; - next_unique_minimiser(); - return tmp; - } - - //!\brief Return the minimiser. - value_type operator*() const noexcept - { - return minimiser_value; - } - -private: - //!\brief The minimiser value. - value_type minimiser_value{}; - - //!\brief The offset relative to the beginning of the window where the minimizer value is found. - size_t minimiser_position_offset{}; - - //!\brief Iterator to the rightmost value of one window. - urng1_iterator_t urng1_iterator{}; - //!brief Iterator to last element in range. - urng1_sentinel_t urng1_sentinel{}; - //!\brief Iterator to the rightmost value of one window of the second range. 
- urng2_iterator_t urng2_iterator{}; - - //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current minimiser. - std::deque window_values{}; - - //!\brief Increments iterator by 1. - void next_unique_minimiser() - { - while (!next_minimiser()) {} - } - - //!\brief Returns new window value. - auto window_value() const - { - if constexpr (!second_range_is_given) - return *urng1_iterator; - else - return std::min(*urng1_iterator, *urng2_iterator); - } - - //!\brief Advances the window to the next position. - void advance_window() - { - ++urng1_iterator; - if constexpr (second_range_is_given) - ++urng2_iterator; - } - - //!\brief Calculates minimisers for the first window. - void window_first(size_t const window_size) - { - if (window_size == 0u) - return; - - for (size_t i = 0u; i < window_size - 1u; ++i) - { - window_values.push_back(window_value()); - advance_window(); - } - window_values.push_back(window_value()); - auto minimiser_it = std::ranges::min_element(window_values, std::less_equal{}); - minimiser_value = *minimiser_it ; - minimiser_position_offset = std::distance(std::begin(window_values), minimiser_it); - } - - /*!\brief Calculates the next minimiser value. - * \returns True, if new minimiser is found or end is reached. Otherwise returns false. - * \details - * For the following windows, we remove the first window value (is now not in window_values) and add the new - * value that results from the window shifting. 
- */ - bool next_minimiser() - { - advance_window(); - if (urng1_iterator == urng1_sentinel) - return true; - - value_type const new_value = window_value(); - - window_values.pop_front(); - window_values.push_back(new_value); - - if (minimiser_position_offset == 0) - { - auto minimiser_it = std::ranges::min_element(window_values, std::less_equal{}); - minimiser_value = *minimiser_it ; - minimiser_position_offset = std::distance(std::begin(window_values), minimiser_it); - return true; - } - - if (new_value < minimiser_value) - { - minimiser_value = new_value; - minimiser_position_offset = window_values.size() - 1; - return true; - } - - --minimiser_position_offset; - return false; - } -}; - -//!\brief A deduction guide for the view class template. -template -minimiser_view(rng1_t &&, size_t const window_size) -> minimiser_view>; - -//!\brief A deduction guide for the view class template. -template -minimiser_view(rng1_t &&, rng2_t &&, size_t const window_size) -> minimiser_view, - std::views::all_t>; - -// --------------------------------------------------------------------------------------------------------------------- -// minimiser_fn (adaptor definition) -// --------------------------------------------------------------------------------------------------------------------- - -//![adaptor_def] -//!\brief views::minimiser's range adaptor object type (non-closure). -struct minimiser_fn -{ - //!\brief Store the number of values in one window and return a range adaptor closure object. - constexpr auto operator()(size_t const window_size) const - { - return adaptor_from_functor{*this, window_size}; - } - - /*!\brief Call the view's constructor with two arguments: the underlying view and an integer indicating how many - * values one window contains. - * \tparam urng1_t The type of the input range to process. Must model std::ranges::viewable_range. - * \param[in] urange1 The input range to process. 
Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_size The number of values in one window. - * \returns A range of converted values. - */ - template - constexpr auto operator()(urng1_t && urange1, size_t const window_size) const - { - static_assert(std::ranges::viewable_range, - "The range parameter to views::minimiser cannot be a temporary of a non-view range."); - static_assert(std::ranges::forward_range, - "The range parameter to views::minimiser must model std::ranges::forward_range."); - - if (window_size == 1) // Would just return urange1 without any changes - throw std::invalid_argument{"The chosen window_size is not valid. " - "Please choose a value greater than 1 or use two ranges."}; - - return minimiser_view{urange1, window_size}; - } -}; -//![adaptor_def] - -} // namespace seqan3::detail - -namespace seqan3::views -{ - -/*!\name General purpose views - * \{ - */ - -/*!\brief Computes minimisers for a range of comparable values. A minimiser is the smallest value in a window. - * \tparam urng_t The type of the first range being processed. See below for requirements. [template - * parameter is omitted in pipe notation] - * \param[in] urange1 The range being processed. [parameter is omitted in pipe notation] - * \param[in] window_size The number of values in one window. - * \returns A range of std::totally_ordered where each value is the minimal value for one window. See below for the - * properties of the returned range. - * \ingroup views - * - * \details - * - * A minimiser is the smallest value in a window. For example for the following list of hash values - * `[28, 100, 9, 23, 4, 1, 72, 37, 8]` and 4 as `window_size`, the minimiser values are `[9, 4, 1]`. - * - * The minimiser can be calculated for one given range or for two given ranges, where the minimizer is the smallest - * value in both windows. 
For example for the following list of hash values `[28, 100, 9, 23, 4, 1, 72, 37, 8]` and - * `[30, 2, 11, 101, 199, 73, 34, 900]` and 4 as `window_size`, the minimiser values are `[2, 4, 1]`. - * - * Note that in the interface with the second underlying range the const-iterable property will only be preserved if - * both underlying ranges are const-iterable. - * - * ### Robust Winnowing - * - * In case there are multiple minimal values within one window, the minimum and therefore the minimiser is ambiguous. - * We choose the rightmost value as the minimiser of the window, and when shifting the window, the minimiser is only - * changed if there appears a value that is strictly smaller than the current minimum. This approach is termed - * *robust winnowing* by [Chirag et al.](https://www.biorxiv.org/content/10.1101/2020.02.11.943241v1.full.pdf) - * and is proven to work especially well on repeat regions. - * - * ### Example - * - * \include test/snippet/range/views/minimiser.cpp - * - * ### View properties - * - * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | - * |----------------------------------|:----------------------------------:|:--------------------------------:| - * | std::ranges::input_range | *required* | *preserved* | - * | std::ranges::forward_range | *required* | *preserved* | - * | std::ranges::bidirectional_range | | *lost* | - * | std::ranges::random_access_range | | *lost* | - * | std::ranges::contiguous_range | | *lost* | - * | | | | - * | std::ranges::viewable_range | *required* | *guaranteed* | - * | std::ranges::view | | *guaranteed* | - * | std::ranges::sized_range | | *lost* | - * | std::ranges::common_range | | *lost* | - * | std::ranges::output_range | | *lost* | - * | seqan3::const_iterable_range | | *preserved* | - * | | | | - * | std::ranges::range_reference_t | std::totally_ordered | std::totally_ordered | - * - * See the \link views views submodule documentation \endlink for detailed 
descriptions of the view properties. - * - * \hideinitializer - */ -inline constexpr auto minimiser = detail::minimiser_fn{}; - -//!\} - -} // namespace seqan3::views +SEQAN3_DEPRECATED_HEADER( + "This header is deprecated and will be removed in SeqAn-3.1.0; Please #include <seqan3/search/views/minimiser.hpp> instead.") diff --git a/include/seqan3/range/views/minimiser_hash.hpp b/include/seqan3/range/views/minimiser_hash.hpp index a87fc4a8d9..52c55c462d 100644 --- a/include/seqan3/range/views/minimiser_hash.hpp +++ b/include/seqan3/range/views/minimiser_hash.hpp @@ -7,181 +7,12 @@ /*!\file * \author Mitra Darvish - * \brief Provides seqan3::views::minimiser_hash. + * \brief [DEPRECATED] Provides seqan3::views::minimiser_hash. */ #pragma once -#include -#include -#include -#include +#include <seqan3/search/views/minimiser_hash.hpp> -namespace seqan3 -{ -//!\brief strong_type for seed. -struct seed : seqan3::detail::strong_type -{ - using seqan3::detail::strong_type::strong_type; -}; - -//!\brief strong_type for the window_size. -struct window_size : seqan3::detail::strong_type -{ - using seqan3::detail::strong_type::strong_type; -}; -} // namespace seqan3 - -namespace seqan3::detail -{ -//!\brief seqan3::views::minimiser_hash's range adaptor object type (non-closure). -struct minimiser_hash_fn -{ - /*!\brief Store the shape and the window size and return a range adaptor closure object. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] window_size The windows size to use. - * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. - * \returns A range of converted elements. - */ - constexpr auto operator()(shape const & shape, window_size const window_size) const - { - return seqan3::detail::adaptor_from_functor{*this, shape, window_size}; - } - - /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] window_size The size of the window.
- * \param[in] seed The seed to use. - * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. - * \returns A range of converted elements. - */ - constexpr auto operator()(shape const & shape, window_size const window_size, seed const seed) const - { - return seqan3::detail::adaptor_from_functor{*this, shape, window_size, seed}; - } - - /*!\brief Call the view's constructor with the underlying view, a seqan3::shape and a window size as argument. - * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type - * of the range must model seqan3::semialphabet. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] window_size The size of the window. - * \param[in] seed The seed to use. - * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. - * \returns A range of converted elements. - */ - template - constexpr auto operator()(urng_t && urange, - shape const & shape, - window_size const window_size, - seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const - { - static_assert(std::ranges::viewable_range, - "The range parameter to views::minimiser_hash cannot be a temporary of a non-view range."); - static_assert(std::ranges::forward_range, - "The range parameter to views::minimiser_hash must model std::ranges::forward_range."); - static_assert(semialphabet>, - "The range parameter to views::minimiser_hash must be over elements of seqan3::semialphabet."); - - if (shape.size() > window_size.get()) - throw std::invalid_argument{"The size of the shape cannot be greater than the window size."}; - - auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - {return i ^ seed.get();}); - - auto reverse_strand = std::forward(urange) | seqan3::views::complement - | std::views::reverse - | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - 
{return i ^ seed.get();}) - | std::views::reverse; - - return seqan3::detail::minimiser_view(forward_strand, reverse_strand, window_size.get() - shape.size() + 1); - } -}; - -} // namespace seqan3::detail - -namespace seqan3::views -{ - -/*!\name Alphabet related views - * \{ - */ - -/*!\brief Computes minimisers for a range with a given shape, window size and seed. - * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is - * omitted in pipe notation] - * \param[in] urange The range being processed. [parameter is omitted in pipe notation] - * \param[in] shape The seqan3::shape that determines how to compute the hash value. - * \param[in] window_size The window size to use. - * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE. - * \returns A range of `size_t` where each value is the minimiser of the resp. window. - * See below for the properties of the returned range. - * \ingroup views - * - * \details - * - * A sequence can be presented by a small number of k-mers (minimisers). For a given shape and window size all k-mers - * are determined in the forward strand and the backward strand and only the lexicographically smallest k-mer is - * returned for one window. This process is repeated over every possible window of a sequence. If consecutive windows - * share a minimiser, it is saved only once. - * For example, in the sequence "TAAAGTGCTAAA" for an ungapped shape of length 3 and a window size of 5 the first, - * the second and the last window contain the same minimiser "AAA". - * Because the minimisers of the first two consecutive windows also share the same position, storing this minimiser - * twice is redundant and it is stored only once. The "AAA" minimiser of the last window on the other hand is stored, - * since it is located at an other position than the previous "AAA" minimiser and hence storing the second - * "AAA"-minimiser is not redundant but necessary. 
- * - * ### Non-lexicographical Minimisers by skewing the hash value with a seed - * - * It might happen that a minimiser changes only slightly when sliding the window over the sequence. For instance, when - * a minimiser starts with a repetition of A’s, then in the next window it is highly likely that the minimiser will - * start with a repetition of A’s as well. Because it is only one A shorter, depending on how long the repetition is - * this might go on for multiple window shifts. Saving these only slightly different minimiser makes no sense because - * they contain no new information about the underlying sequence. - * Additionally, sequences with a repetition of A’s will be seen as more similar to each other than they actually are. - * As [Marçais et al.](https://doi.org/10.1093/bioinformatics/btx235) have shown, randomizing the order of the k-mers - * can solve this problem. Therefore, a random seed is used to XOR all k-mers, thereby randomizing the - * order. The user can change the seed to any other value he or she thinks is useful. A seed of 0 is returning the - * lexicographical order. - * - * \sa seqan3::views::minimiser_view - * - * \attention - * Be aware of the requirements of the seqan3::views::kmer_hash view. 
- * - * \experimentalapi - * - * ### View properties - * - * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | - * |----------------------------------|:----------------------------------:|:--------------------------------:| - * | std::ranges::input_range | *required* | *preserved* | - * | std::ranges::forward_range | *required* | *preserved* | - * | std::ranges::bidirectional_range | | *lost* | - * | std::ranges::random_access_range | | *lost* | - * | std::ranges::contiguous_range | | *lost* | - * | | | | - * | std::ranges::viewable_range | *required* | *guaranteed* | - * | std::ranges::view | | *guaranteed* | - * | std::ranges::sized_range | | *lost* | - * | std::ranges::common_range | | *lost* | - * | std::ranges::output_range | | *lost* | - * | seqan3::const_iterable_range | | *preserved* | - * | | | | - * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t | - * - * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. 
- * - * ### Example - * - * \include test/snippet/range/views/minimiser_hash.cpp - * - * \hideinitializer - */ -inline constexpr auto minimiser_hash = detail::minimiser_hash_fn{}; - -//!\} - -} // namespace seqan3::views +SEQAN3_DEPRECATED_HEADER( + "This header is deprecated and will be removed in SeqAn-3.1.0; Please #include <seqan3/search/views/minimiser_hash.hpp> instead.") diff --git a/include/seqan3/search/views/kmer_hash.hpp b/include/seqan3/search/views/kmer_hash.hpp new file mode 100644 index 0000000000..626a9b9983 --- /dev/null +++ b/include/seqan3/search/views/kmer_hash.hpp @@ -0,0 +1,794 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Enrico Seiler + * \brief Provides seqan3::views::kmer_hash. + */ + +#pragma once + +#include +#include +#include +#include + +namespace seqan3::detail +{ +// --------------------------------------------------------------------------------------------------------------------- +// kmer_hash_view class +// --------------------------------------------------------------------------------------------------------------------- + +/*!\brief The type returned by seqan3::views::kmer_hash. + * \tparam urng_t The type of the underlying ranges, must model std::forward_range, the reference type must model + * seqan3::semialphabet.
+ * \implements std::ranges::view + * \implements std::ranges::random_access_range + * \implements std::ranges::sized_range + * \ingroup views + * + * \details + * + * Note that most members of this class are generated by ranges::view_interface which is not yet documented here. + */ +template +class kmer_hash_view : public std::ranges::view_interface> +{ +private: + static_assert(std::ranges::forward_range, "The kmer_hash_view only works on forward_ranges"); + static_assert(semialphabet>, + "The reference type of the underlying range must model seqan3::semialphabet."); + + //!\brief The underlying range. + urng_t urange; + + //!\brief The shape to use. + shape shape_; + + template + class basic_iterator; + +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + kmer_hash_view() = default; //!< Defaulted. + kmer_hash_view(kmer_hash_view const & rhs) = default; //!< Defaulted. + kmer_hash_view(kmer_hash_view && rhs) = default; //!< Defaulted. + kmer_hash_view & operator=(kmer_hash_view const & rhs) = default; //!< Defaulted. + kmer_hash_view & operator=(kmer_hash_view && rhs) = default; //!< Defaulted. + ~kmer_hash_view() = default; //!< Defaulted. + + /*!\brief Construct from a view and a given shape. + * \throws std::invalid_argument if hashes resulting from the shape/alphabet combination cannot be represented in + * `uint64_t`, i.e. \f$s>\frac{64}{\log_2\sigma}\f$ with shape size \f$s\f$ and alphabet size \f$\sigma\f$. + */ + kmer_hash_view(urng_t urange_, shape const & s_) : urange{std::move(urange_)}, shape_{s_} + { + if (shape_.count() > (64 / std::log2(alphabet_size>))) + { + throw std::invalid_argument{"The chosen shape/alphabet combination is not valid. " + "The alphabet or shape size must be reduced."}; + } + } + + /*!\brief Construct from a non-view that can be view-wrapped and a given shape. + * \throws std::invalid_argument if hashes resulting from the shape/alphabet combination cannot be represented in + * `uint64_t`, i.e. 
\f$s>\frac{64}{\log_2\sigma}\f$ with shape size \f$s\f$ and alphabet size \f$\sigma\f$. + */ + template + //!\cond + requires (!std::same_as, kmer_hash_view>) && + std::ranges::viewable_range && + std::constructible_from>> + //!\endcond + kmer_hash_view(rng_t && urange_, shape const & s_) : + urange{std::views::all(std::forward(urange_))}, shape_{s_} + { + if (shape_.count() > (64 / std::log2(alphabet_size>))) + { + throw std::invalid_argument{"The chosen shape/alphabet combination is not valid. " + "The alphabet or shape size must be reduced."}; + } + } + //!\} + + /*!\name Iterators + * \{ + */ + /*!\brief Returns an iterator to the first element of the range. + * \returns Iterator to the first element. + * + * \details + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + auto begin() noexcept + { + return basic_iterator{std::ranges::begin(urange), std::ranges::end(urange), shape_}; + } + + //!\copydoc begin() + auto begin() const noexcept + //!\cond + requires const_iterable_range + //!\endcond + { + return basic_iterator{std::ranges::cbegin(urange), std::ranges::cend(urange), shape_}; + } + + /*!\brief Returns an iterator to the element following the last element of the range. + * \returns Iterator to the end. + * + * \details + * + * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + auto end() noexcept + { + // Assigning the end iterator to the text_right iterator of the basic_iterator only works for common ranges. 
+ if constexpr (std::ranges::common_range) + return basic_iterator{std::ranges::begin(urange), std::ranges::end(urange), shape_, true}; + else + return std::ranges::end(urange); + } + + //!\copydoc end() + auto end() const noexcept + //!\cond + requires const_iterable_range + //!\endcond + { + // Assigning the end iterator to the text_right iterator of the basic_iterator only works for common ranges. + if constexpr (std::ranges::common_range) + return basic_iterator{std::ranges::cbegin(urange), std::ranges::cend(urange), shape_, true}; + else + return std::ranges::cend(urange); + } + //!\} + + /*!\brief Returns the size of the range, if the underlying range is a std::ranges::sized_range. + * \returns Size of range. + */ + auto size() + //!\cond + requires std::ranges::sized_range + //!\endcond + { + using size_type = std::ranges::range_size_t; + return std::max(std::ranges::size(urange) + 1, shape_.size()) - shape_.size(); + } + + //!\copydoc size() + auto size() const + //!\cond + requires std::ranges::sized_range + //!\endcond + { + using size_type = std::ranges::range_size_t; + return std::max(std::ranges::size(urange) + 1, shape_.size()) - shape_.size(); + } +}; + +/*!\brief Iterator for calculating hash values via a given seqan3::shape. + * \tparam urng_t Type of the text. Must model std::forward_range. Reference type must model seqan3::semialphabet. + * + * \details + * + * The basic_iterator can be used to iterate over the hash values of a text. The basic_iterator needs an iterator of + * the text and a seqan3::shape that defines how to hash the text. + * + * Depending on the type of the iterator passed to the basic_iterator, different functionality is available: + * + * | Concept modelled by passed text iterator | Available functions | + * |------------------------------------------|---------------------------------| + * | std::forward_iterator | \ref basic_iterator_comparison_kmer_hash "Comparison operators"
\ref operator++ "Pre-increment (++it)"
\ref operator++(int) "Post-increment (it++)"
\ref operator* "Indirection operator (*it)" | + * | std::bidirectional_iterator | \ref operator-- "Pre-decrement (--it)"
\ref operator--(int) "Post-decrement (it--)" | + * | std::random_access_iterator | \ref operator+= "Forward (it +=)"
\ref operator+ "Forward copy (it +)"
\ref operator-= "Decrement (it -=)"
\ref basic_iterator_operator-decrement "Decrement copy (it -)"
\ref basic_iterator_operator-difference "Difference (it1 - it2)"
\ref operator[] "Subscript (it[])" | + * + * When using a gapped seqan3::shape, the `0`s of the seqan3::shape are virtually removed from the hashed k-mer. + * Note that any shape is expected to start with a `1` and end with a `1`. + * + * \experimentalapi + * + * ### Implementation detail + * + * To avoid dereferencing the sentinel when iterating, the basic_iterator computes the hash value up until + * the second to last position and performs the addition of the last position upon + * access (\ref operator* and \ref operator[]). + */ +template +template +class kmer_hash_view::basic_iterator +{ +private: + //!\brief The iterator type of the underlying range. + using it_t = maybe_const_iterator_t; + //!\brief The sentinel type of the underlying range. + using sentinel_t = maybe_const_sentinel_t; + + template + friend class basic_iterator; + +public: + /*!\name Associated types + * \{ + */ + //!\brief Type for distances between iterators. + using difference_type = typename std::iter_difference_t; + //!\brief Value type of this iterator. + using value_type = size_t; + //!\brief The pointer type. + using pointer = void; + //!\brief Reference to `value_type`. + using reference = value_type; + //!\brief Tag this class as input iterator. + using iterator_category = detail::iterator_category_tag_t; + //!\brief Tag this class depending on which concept `it_t` models. + using iterator_concept = std::conditional_t, + typename std::random_access_iterator_tag, + detail::iterator_concept_tag_t>; + //!\} + + /*!\name Constructors, destructor and assignment + * \{ + */ + constexpr basic_iterator() = default; //!< Defaulted. + constexpr basic_iterator(basic_iterator const &) = default; //!< Defaulted. + constexpr basic_iterator(basic_iterator &&) = default; //!< Defaulted. + constexpr basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted. + constexpr basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted. 
+ ~basic_iterator() = default; //!< Defaulted. + + //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range. + constexpr basic_iterator(basic_iterator const & it) noexcept + //!\cond + requires const_range + //!\endcond + : hash_value{std::move(it.hash_value)}, + roll_factor{std::move(it.roll_factor)}, + shape_{std::move(it.shape_)}, + text_left{std::move(it.text_left)}, + text_right{std::move(it.text_right)} + {} + + /*!\brief Construct from a given iterator on the text and a seqan3::shape. + * \param[in] it_start Iterator pointing to the first position of the text. + * \param[in] it_end Sentinel pointing to the end of the text. + * \param[in] s_ The seqan3::shape that determines which positions participate in hashing. + * + * \details + * + * ### Complexity + * + * Linear in size of shape. + */ + basic_iterator(it_t it_start, sentinel_t it_end, shape s_) : + shape_{s_}, text_left{it_start}, text_right{std::ranges::next(text_left, shape_.size() - 1, it_end)} + { + assert(std::ranges::size(shape_) > 0); + + // shape size = 3 + // Text: 1 2 3 4 5 6 7 8 9 + // text_left: ^ + // text_right: ^ + // distance(text_left, text_right) = 2 + if (shape_.size() <= std::ranges::distance(text_left, text_right) + 1) + { + roll_factor = pow(sigma, static_cast(std::ranges::size(shape_) - 1)); + hash_full(); + } + } + + /*!\brief Construct from a given iterator on the text and a seqan3::shape. + * \param[in] it_start Iterator pointing to the first position of the text. + * \param[in] it_end Sentinel pointing to the end of the text. + * \param[in] s_ The seqan3::shape that determines which positions participate in hashing. + * \param[in] is_end Indicates that this iterator should point to the end of the text. + * + * \details + * + * If we have a common_range as underlying range, we want to preserve this property. + * This means that we need to have a basic_iterator that can act as end for the kmer_hash_view, i.e. 
+ * the text_right iterator is equal to the end iterator of the underlying range. + * However, we still need to do some initialisation via hash_full: + * When using `std::views::reverse`, we start iterating from the end and decrement the iterator. + * After calling hash_full, we need to reset our text_right iterator to point to the end again. + * + * Another difference to the other constructor is that we need to do some work to determine the position of + * the text_left iterator. Note that we use `std::ranges::next` instead of `std::ranges::prev` because the latter + * only works for bidirectional ranges. * + * + * ### Complexity + * + * Linear in size of shape. + */ + basic_iterator(it_t it_start, sentinel_t it_end, shape s_, bool SEQAN3_DOXYGEN_ONLY(is_end)) : shape_{s_} + { + assert(std::ranges::size(shape_) > 0); + + auto urange_size = std::ranges::distance(it_start, it_end); + auto step = (shape_.size() > urange_size + 1) ? 0 : urange_size - shape_.size() + 1; + text_left = std::ranges::next(it_start, step, it_end); + + // shape size = 3 + // Text: 1 2 3 4 5 6 7 8 9 + // text_left: ^ + // text_right: ^ + // distance(text_left, text_right) = 2 + if (shape_.size() <= std::ranges::distance(text_left, it_end) + 1) + { + roll_factor = pow(sigma, static_cast(std::ranges::size(shape_) - 1)); + hash_full(); + } + + text_right = it_end; + } + //!\} + + //!\anchor basic_iterator_comparison_kmer_hash + //!\name Comparison operators + //!\{ + + //!\brief Compare to iterator on text. + friend bool operator==(basic_iterator const & lhs, sentinel_t const & rhs) noexcept + { + return lhs.text_right == rhs; + } + + //!\brief Compare to iterator on text. + friend bool operator==(sentinel_t const & lhs, basic_iterator const & rhs) noexcept + { + return lhs == rhs.text_right; + } + + //!\brief Compare to another basic_iterator. 
+ friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) noexcept + { + return std::tie(lhs.text_right, lhs.shape_) == std::tie(rhs.text_right, rhs.shape_); + } + + //!\brief Compare to iterator on text. + friend bool operator!=(basic_iterator const & lhs, sentinel_t const & rhs) noexcept + { + return !(lhs == rhs); + } + + //!\brief Compare to iterator on text. + friend bool operator!=(sentinel_t const & lhs, basic_iterator const & rhs) noexcept + { + return !(lhs == rhs); + } + + //!\brief Compare to another basic_iterator. + friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs) noexcept + { + return !(lhs == rhs); + } + + //!\brief Compare to another basic_iterator. + friend bool operator<(basic_iterator const & lhs, basic_iterator const & rhs) noexcept + { + return (lhs.shape_ <= rhs.shape_) && (lhs.text_right < rhs.text_right); + } + + //!\brief Compare to another basic_iterator. + friend bool operator>(basic_iterator const & lhs, basic_iterator const & rhs) noexcept + { + return (lhs.shape_ >= rhs.shape_) && (lhs.text_right > rhs.text_right); + } + + //!\brief Compare to another basic_iterator. + friend bool operator<=(basic_iterator const & lhs, basic_iterator const & rhs) noexcept + { + return (lhs.shape_ <= rhs.shape_) && (lhs.text_right <= rhs.text_right); + } + + //!\brief Compare to another basic_iterator. + friend bool operator>=(basic_iterator const & lhs, basic_iterator const & rhs) noexcept + { + return (lhs.shape_ >= rhs.shape_) && (lhs.text_right >= rhs.text_right); + } + + //!\} + + //!\brief Pre-increment. + basic_iterator & operator++() noexcept + { + hash_forward(); + return *this; + } + + //!\brief Post-increment. + basic_iterator operator++(int) noexcept + { + basic_iterator tmp{*this}; + hash_forward(); + return tmp; + } + + /*!\brief Pre-decrement. + * \attention This function is only available if `it_t` models std::bidirectional_iterator. 
+ */ + basic_iterator & operator--() noexcept + //!\cond + requires std::bidirectional_iterator + //!\endcond + { + hash_backward(); + return *this; + } + + /*!\brief Post-decrement. + * \attention This function is only available if `it_t` models std::bidirectional_iterator. + */ + basic_iterator operator--(int) noexcept + //!\cond + requires std::bidirectional_iterator + //!\endcond + { + basic_iterator tmp{*this}; + hash_backward(); + return tmp; + } + + /*!\brief Forward this iterator. + * \attention This function is only available if `it_t` models std::random_access_iterator. + */ + basic_iterator & operator+=(difference_type const skip) noexcept + //!\cond + requires std::random_access_iterator + //!\endcond + { + hash_forward(skip); + return *this; + } + + /*!\brief Forward copy of this iterator. + * \attention This function is only available if `it_t` models std::random_access_iterator. + */ + basic_iterator operator+(difference_type const skip) const noexcept + //!\cond + requires std::random_access_iterator + //!\endcond + { + basic_iterator tmp{*this}; + return tmp += skip; + } + + /*!\brief Non-member operator+ delegates to non-friend operator+. + * \attention This function is only available if `it_t` models std::random_access_iterator. + */ + friend basic_iterator operator+(difference_type const skip, basic_iterator const & it) noexcept + //!\cond + requires std::random_access_iterator + //!\endcond + { + return it + skip; + } + + /*!\brief Decrement iterator by `skip`. + * \attention This function is only available if `it_t` models std::random_access_iterator. + */ + basic_iterator & operator-=(difference_type const skip) noexcept + //!\cond + requires std::random_access_iterator + //!\endcond + { + hash_backward(skip); + return *this; + } + + /*!\anchor basic_iterator_operator-decrement + * \brief Return decremented copy of this iterator. + * \attention This function is only available if `it_t` models std::random_access_iterator. 
+ */ + basic_iterator operator-(difference_type const skip) const noexcept + //!\cond + requires std::random_access_iterator + //!\endcond + { + basic_iterator tmp{*this}; + return tmp -= skip; + } + + /*!\brief Non-member operator- delegates to non-friend operator-. + * \attention This function is only available if `it_t` models std::random_access_iterator. + */ + friend basic_iterator operator-(difference_type const skip, basic_iterator const & it) noexcept + //!\cond + requires std::random_access_iterator + //!\endcond + { + return it - skip; + } + + /*!\anchor basic_iterator_operator-difference + * \brief Return offset between two iterator's positions. + * \attention This function is only available if `it_t` models std::random_access_iterator. + */ + friend difference_type operator-(basic_iterator const & lhs, basic_iterator const & rhs) noexcept + //!\cond + requires std::random_access_iterator + //!\endcond + { + return static_cast(lhs.text_right - rhs.text_right); + } + + /*!\brief Return offset between remote sentinel's position and this. + * \attention This function is only available if sentinel_t and it_t model std::sized_sentinel_for. + */ + friend difference_type operator-(sentinel_t const & lhs, basic_iterator const & rhs) noexcept + //!\cond + requires std::sized_sentinel_for + //!\endcond + { + return static_cast(lhs - rhs.text_right); + } + + /*!\brief Return offset this and remote sentinel's position. + * \attention This function is only available if it_t and sentinel_t model std::sized_sentinel_for. + */ + friend difference_type operator-(basic_iterator const & lhs, sentinel_t const & rhs) noexcept + //!\cond + requires std::sized_sentinel_for + //!\endcond + { + return static_cast(lhs.text_right - rhs); + } + + /*!\brief Move the iterator by a given offset and return the corresponding hash value. + * \attention This function is only available if `it_t` models std::random_access_iterator. 
+ */ + reference operator[](difference_type const n) const + //!\cond + requires std::random_access_iterator + //!\endcond + { + return *(*this + n); + } + + //!\brief Return the hash value. + value_type operator*() const noexcept + { + return hash_value + to_rank(*text_right); + } + +private: + //!\brief The alphabet type of the passed iterator. + using alphabet_t = std::iter_value_t; + + //!\brief The alphabet size. + static constexpr auto const sigma{alphabet_size}; + + //!\brief The hash value. + size_t hash_value{0}; + + //!\brief The factor for the left most position of the hash value. + size_t roll_factor{0}; + + //!\brief The shape to use. + shape shape_; + + //!\brief Iterator to the leftmost position of the k-mer. + it_t text_left; + + //!\brief Iterator to the rightmost position of the k-mer. + it_t text_right; + + //!\brief Increments iterator by 1. + void hash_forward() + { + if (shape_.all()) + { + hash_roll_forward(); + } + else + { + std::ranges::advance(text_left, 1); + hash_full(); + } + } + + /*!\brief Increments iterator by `skip`. + * \param skip Amount to increment. + * \attention This function is only available if `it_t` models std::random_access_iterator. + */ + void hash_forward(difference_type const skip) + //!\cond + requires std::random_access_iterator + //!\endcond + { + std::ranges::advance(text_left, skip); + hash_full(); + } + + /*!\brief Decrements iterator by 1. + * \attention This function is only available if `it_t` models std::bidirectional_iterator. + */ + void hash_backward() + //!\cond + requires std::bidirectional_iterator + //!\endcond + { + if (shape_.all()) + { + hash_roll_backward(); + } + else + { + std::ranges::advance(text_left, -1); + hash_full(); + } + } + + /*!\brief Decrements iterator by `skip`. + * \param skip Amount to decrement. + * \attention This function is only available if `it_t` models std::bidirectional_iterator. 
+ */ + void hash_backward(difference_type const skip) + { + std::ranges::advance(text_left, -skip); + hash_full(); + } + + //!\brief Calculates a hash value by explicitly looking at each position. + void hash_full() + { + text_right = text_left; + hash_value = 0; + + for (size_t i{0}; i < shape_.size() - 1u; ++i) + { + hash_value += shape_[i] * to_rank(*text_right); + hash_value *= shape_[i] ? sigma : 1; + std::ranges::advance(text_right, 1); + } + + } + + //!\brief Calculates the next hash value via rolling hash. + void hash_roll_forward() + { + hash_value -= to_rank(*(text_left)) * roll_factor; + hash_value += to_rank(*(text_right)); + hash_value *= sigma; + + std::ranges::advance(text_left, 1); + std::ranges::advance(text_right, 1); + } + + /*!\brief Calculates the previous hash value via rolling hash. + * \attention This function is only available if `it_t` models std::bidirectional_iterator. + */ + void hash_roll_backward() + //!\cond + requires std::bidirectional_iterator + //!\endcond + { + std::ranges::advance(text_left, -1); + std::ranges::advance(text_right, -1); + + hash_value /= sigma; + hash_value -= to_rank(*(text_right)); + hash_value += to_rank(*(text_left)) * roll_factor; + } +}; + +//!\brief A deduction guide for the view class template. +template +kmer_hash_view(rng_t &&, shape const & shape_) -> kmer_hash_view>; + +// --------------------------------------------------------------------------------------------------------------------- +// kmer_hash_fn (adaptor definition) +// --------------------------------------------------------------------------------------------------------------------- + +//![adaptor_def] +//!\brief views::kmer_hash's range adaptor object type (non-closure). +struct kmer_hash_fn +{ + //!\brief Store the shape and return a range adaptor closure object. 
+ constexpr auto operator()(shape const & shape_) const + { + return adaptor_from_functor{*this, shape_}; + } + + /*!\brief Call the view's constructor with the underlying view and a seqan3::shape as argument. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type + * of the range must model seqan3::semialphabet. + * \param[in] shape_ The seqan3::shape to use for hashing. + * \throws std::invalid_argument if resulting hash values would be too big for a 64 bit integer. + * \returns A range of converted elements. + */ + template + constexpr auto operator()(urng_t && urange, shape const & shape_) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::kmer_hash cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::kmer_hash must model std::ranges::forward_range."); + static_assert(semialphabet>, + "The range parameter to views::kmer_hash must be over elements of seqan3::semialphabet."); + + return kmer_hash_view{std::forward(urange), shape_}; + } +}; +//![adaptor_def] + +} // namespace seqan3::detail + +namespace seqan3::views +{ + +/*!\name Alphabet related views + * \{ + */ + +/*!\brief Computes hash values for each position of a range via a given shape. + * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is + * omitted in pipe notation] + * \param[in] urange The range being processed. [parameter is omitted in pipe notation] + * \param[in] shape The seqan3::shape that determines how to compute the hash value. + * \returns A range of std::size_t where each value is the hash of the resp. k-mer. + * See below for the properties of the returned range. 
+ * \ingroup views + * + * \details + * + * \attention + * For the alphabet size \f$\sigma\f$ of the alphabet of `urange` and the number of 1s \f$s\f$ of `shape` it must hold + * that \f$s \le \frac{64}{\log_2\sigma}\f$, i.e. hashes resulting from the shape/alphabet combination can be represented + * in a `uint64_t`. + * + * ### View properties + * + * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | + * |----------------------------------|:----------------------------------:|:--------------------------------:| + * | std::ranges::input_range | *required* | *preserved* | + * | std::ranges::forward_range | *required* | *preserved* | + * | std::ranges::bidirectional_range | | *preserved* | + * | std::ranges::random_access_range | | *preserved* | + * | std::ranges::contiguous_range | | *lost* | + * | | | | + * | std::ranges::viewable_range | *required* | *guaranteed* | + * | std::ranges::view | | *guaranteed* | + * | std::ranges::sized_range | | *preserved* | + * | std::ranges::common_range | | *preserved* | + * | std::ranges::output_range | | *lost* | + * | seqan3::const_iterable_range | | *preserved* | + * | | | | + * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t | + * + * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. 
+ * + * ### Example + * + * \include test/snippet/search/views/kmer_hash.cpp + * + * \hideinitializer + * + * \stableapi{Since version 3.1.} + */ +inline constexpr auto kmer_hash = detail::kmer_hash_fn{}; + +//!\} + +} // namespace seqan3::views diff --git a/include/seqan3/search/views/minimiser.hpp b/include/seqan3/search/views/minimiser.hpp new file mode 100644 index 0000000000..37b0ebd27e --- /dev/null +++ b/include/seqan3/search/views/minimiser.hpp @@ -0,0 +1,592 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides seqan3::views::minimiser. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace seqan3::detail +{ +// --------------------------------------------------------------------------------------------------------------------- +// minimiser_view class +// --------------------------------------------------------------------------------------------------------------------- + +/*!\brief The type returned by seqan3::views::minimiser. + * \tparam urng1_t The type of the underlying range, must model std::ranges::forward_range, the reference type must + * model std::totally_ordered. The typical use case is that the reference type is the result of + * seqan3::kmer_hash. + * \tparam urng2_t The type of the second underlying range, must model std::ranges::forward_range, the reference type + * must model std::totally_ordered. 
If only one range is provided this defaults to + * std::ranges::empty_view. + * \implements std::ranges::view + * \ingroup views + * + * \details + * + * See seqan3::views::minimiser for a detailed explanation on minimizers. + * + * \note Most members of this class are generated by std::ranges::view_interface which is not yet documented here. + * + * \sa seqan3::views::minimiser + */ +template > +class minimiser_view : public std::ranges::view_interface> +{ +private: + static_assert(std::ranges::forward_range, "The minimiser_view only works on forward_ranges."); + static_assert(std::ranges::forward_range, "The minimiser_view only works on forward_ranges."); + static_assert(std::totally_ordered>, + "The reference type of the underlying range must model std::totally_ordered."); + + //!\brief The default argument of the second range. + using default_urng2_t = std::ranges::empty_view; + + //!\brief Boolean variable, which is true, when second range is not of empty type. + static constexpr bool second_range_is_given = !std::same_as; + + static_assert(!second_range_is_given || std::totally_ordered_with, + std::ranges::range_reference_t>, + "The reference types of the underlying ranges must model std::totally_ordered_with."); + + //!\brief Whether the given ranges are const_iterable + static constexpr bool const_iterable = seqan3::const_iterable_range && + seqan3::const_iterable_range; + + //!\brief The first underlying range. + urng1_t urange1{}; + //!\brief The second underlying range. + urng2_t urange2{}; + + //!\brief The number of values in one window. + size_t window_size{}; + + template + class basic_iterator; + + //!\brief The sentinel type of the minimiser_view. + using sentinel = std::default_sentinel_t; + +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + minimiser_view() = default; //!< Defaulted. + minimiser_view(minimiser_view const & rhs) = default; //!< Defaulted. + minimiser_view(minimiser_view && rhs) = default; //!< Defaulted. 
+ minimiser_view & operator=(minimiser_view const & rhs) = default; //!< Defaulted. + minimiser_view & operator=(minimiser_view && rhs) = default; //!< Defaulted. + ~minimiser_view() = default; //!< Defaulted. + + /*!\brief Construct from a view and a given number of values in one window. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + minimiser_view(urng1_t urange1, size_t const window_size) : + minimiser_view{std::move(urange1), default_urng2_t{}, window_size} + {} + + /*!\brief Construct from a non-view that can be view-wrapped and a given number of values in one window. + * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng1_t. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + template + //!\cond + requires (std::ranges::viewable_range && + std::constructible_from>>) + //!\endcond + minimiser_view(other_urng1_t && urange1, size_t const window_size) : + urange1{std::views::all(std::forward(urange1))}, + urange2{default_urng2_t{}}, + window_size{window_size} + {} + + /*!\brief Construct from two views and a given number of values in one window. + * \param[in] urange1 The first input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. 
+ */ + minimiser_view(urng1_t urange1, urng2_t urange2, size_t const window_size) : + urange1{std::move(urange1)}, + urange2{std::move(urange2)}, + window_size{window_size} + { + if constexpr (second_range_is_given) + { + // Compare the stored views: the same-named parameters were moved from in the initialiser list. + if (std::ranges::distance(this->urange1) != std::ranges::distance(this->urange2)) + throw std::invalid_argument{"The two ranges do not have the same size."}; + } + } + + /*!\brief Construct from two non-views that can be view-wrapped and a given number of values in one window. + * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng1_t. + * \tparam other_urng2_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng2_t. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + template + //!\cond + requires (std::ranges::viewable_range && + std::constructible_from> && + std::ranges::viewable_range && + std::constructible_from>) + //!\endcond + minimiser_view(other_urng1_t && urange1, other_urng2_t && urange2, size_t const window_size) : + urange1{std::views::all(std::forward(urange1))}, + urange2{std::views::all(std::forward(urange2))}, + window_size{window_size} + { + if constexpr (second_range_is_given) + { + // Compare the stored views: the same-named parameters were forwarded (possibly moved from) above. 
+ if (std::ranges::distance(this->urange1) != std::ranges::distance(this->urange2)) + throw std::invalid_argument{"The two ranges do not have the same size."}; + } + } + //!\} + + /*!\name Iterators + * \{ + */ + /*!\brief Returns an iterator to the first element of the range. + * \returns Iterator to the first element. + * + * \details + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * Strong exception guarantee. 
+ */ + basic_iterator begin() + { + return {std::ranges::begin(urange1), + std::ranges::end(urange1), + std::ranges::begin(urange2), + window_size}; + } + + //!\copydoc begin() + basic_iterator begin() const + //!\cond + requires const_iterable + //!\endcond + { + return {std::ranges::cbegin(urange1), + std::ranges::cend(urange1), + std::ranges::cbegin(urange2), + window_size}; + } + + /*!\brief Returns an iterator to the element following the last element of the range. + * \returns Iterator to the end. + * + * \details + * + * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + sentinel end() const + { + return {}; + } + //!\} +}; + +//!\brief Iterator for calculating minimisers. +template +template +class minimiser_view::basic_iterator +{ +private: + //!\brief The sentinel type of the first underlying range. + using urng1_sentinel_t = maybe_const_sentinel_t; + //!\brief The iterator type of the first underlying range. + using urng1_iterator_t = maybe_const_iterator_t; + //!\brief The iterator type of the second underlying range. + using urng2_iterator_t = maybe_const_iterator_t; + + template + friend class basic_iterator; + +public: + /*!\name Associated types + * \{ + */ + //!\brief Type for distances between iterators. + using difference_type = std::ranges::range_difference_t; + //!\brief Value type of this iterator. + using value_type = std::ranges::range_value_t; + //!\brief The pointer type. + using pointer = void; + //!\brief Reference to `value_type`. + using reference = value_type; + //!\brief Tag this class as a forward iterator. + using iterator_category = std::forward_iterator_tag; + //!\brief Tag this class as a forward iterator. + using iterator_concept = iterator_category; + //!\} + + /*!\name Constructors, destructor and assignment + * \{ + */ + basic_iterator() = default; //!< Defaulted. 
+ basic_iterator(basic_iterator const &) = default; //!< Defaulted. + basic_iterator(basic_iterator &&) = default; //!< Defaulted. + basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted. + basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted. + ~basic_iterator() = default; //!< Defaulted. + + //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range. + basic_iterator(basic_iterator const & it) + //!\cond + requires const_range + //!\endcond + : minimiser_value{std::move(it.minimiser_value)}, + urng1_iterator{std::move(it.urng1_iterator)}, + urng1_sentinel{std::move(it.urng1_sentinel)}, + urng2_iterator{std::move(it.urng2_iterator)}, + window_values{std::move(it.window_values)} + {} + + /*!\brief Construct from begin and end iterators of a given range over std::totally_ordered values, and the number + of values per window. + * \param[in] urng1_iterator Iterator pointing to the first position of the first std::totally_ordered range. + * \param[in] urng1_sentinel Iterator pointing to the last position of the first std::totally_ordered range. + * \param[in] urng2_iterator Iterator pointing to the first position of the second std::totally_ordered range. + * \param[in] window_size The number of values in one window. + * + * \details + * + * Looks at the number of values per window in two ranges, returns the smallest between both as minimiser and + * shifts then by one to repeat this action. If a minimiser in consecutive windows is the same, it is returned only + * once. 
+ */ + basic_iterator(urng1_iterator_t urng1_iterator, + urng1_sentinel_t urng1_sentinel, + urng2_iterator_t urng2_iterator, + size_t window_size) : + urng1_iterator{std::move(urng1_iterator)}, + urng1_sentinel{std::move(urng1_sentinel)}, + urng2_iterator{std::move(urng2_iterator)} + { + size_t size = std::ranges::distance(urng1_iterator, urng1_sentinel); + window_size = std::min(window_size, size); + + window_first(window_size); + } + //!\} + + //!\anchor basic_iterator_comparison + //!\name Comparison operators + //!\{ + + //!\brief Compare to another basic_iterator. + friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) + { + return (lhs.urng1_iterator == rhs.urng1_iterator) && + (rhs.urng2_iterator == rhs.urng2_iterator) && + (lhs.window_values.size() == rhs.window_values.size()); + } + + //!\brief Compare to another basic_iterator. + friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs) + { + return !(lhs == rhs); + } + + //!\brief Compare to the sentinel of the minimiser_view. + friend bool operator==(basic_iterator const & lhs, sentinel const &) + { + return lhs.urng1_iterator == lhs.urng1_sentinel; + } + + //!\brief Compare to the sentinel of the minimiser_view. + friend bool operator==(sentinel const & lhs, basic_iterator const & rhs) + { + return rhs == lhs; + } + + //!\brief Compare to the sentinel of the minimiser_view. + friend bool operator!=(sentinel const & lhs, basic_iterator const & rhs) + { + return !(lhs == rhs); + } + + //!\brief Compare to the sentinel of the minimiser_view. + friend bool operator!=(basic_iterator const & lhs, sentinel const & rhs) + { + return !(lhs == rhs); + } + //!\} + + //!\brief Pre-increment. + basic_iterator & operator++() noexcept + { + next_unique_minimiser(); + return *this; + } + + //!\brief Post-increment. + basic_iterator operator++(int) noexcept + { + basic_iterator tmp{*this}; + next_unique_minimiser(); + return tmp; + } + + //!\brief Return the minimiser. 
+    value_type operator*() const noexcept
+    {
+        return minimiser_value;
+    }
+
+private:
+    //!\brief The minimiser value.
+    value_type minimiser_value{};
+
+    //!\brief The offset relative to the beginning of the window where the minimiser value is found.
+    size_t minimiser_position_offset{};
+
+    //!\brief Iterator to the rightmost value of one window.
+    urng1_iterator_t urng1_iterator{};
+    //!\brief Sentinel marking the end of the first underlying range.
+    urng1_sentinel_t urng1_sentinel{};
+    //!\brief Iterator to the rightmost value of one window of the second range.
+    urng2_iterator_t urng2_iterator{};
+
+    //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current minimiser.
+    std::deque window_values{};
+
+    //!\brief Shifts the window until a new minimiser is found or the end of the range is reached.
+    void next_unique_minimiser()
+    {
+        while (!next_minimiser()) {}
+    }
+
+    //!\brief Returns new window value: the current value of the first range, or the smaller of both ranges' values.
+    auto window_value() const
+    {
+        if constexpr (!second_range_is_given)
+            return *urng1_iterator;
+        else
+            return std::min(*urng1_iterator, *urng2_iterator);
+    }
+
+    //!\brief Advances the window to the next position (both underlying iterators move in lockstep).
+    void advance_window()
+    {
+        ++urng1_iterator;
+        if constexpr (second_range_is_given)
+            ++urng2_iterator;
+    }
+
+    /*!\brief Calculates the minimiser for the first window.
+     * \param[in] window_size The number of values in the first window; nothing is done if it is 0.
+     */
+    void window_first(size_t const window_size)
+    {
+        if (window_size == 0u)
+            return;
+
+        // Only advance window_size - 1 times so that the iterators stay on the rightmost value of the window.
+        for (size_t i = 0u; i < window_size - 1u; ++i)
+        {
+            window_values.push_back(window_value());
+            advance_window();
+        }
+        window_values.push_back(window_value());
+        // less_equal (instead of less) lets a later equal value replace an earlier one, i.e. ties go to the rightmost.
+        auto minimiser_it = std::ranges::min_element(window_values, std::less_equal{});
+        minimiser_value = *minimiser_it ;
+        minimiser_position_offset = std::distance(std::begin(window_values), minimiser_it);
+    }
+
+    /*!\brief Calculates the next minimiser value.
+     * \returns True, if new minimiser is found or end is reached. Otherwise returns false.
+     * \details
+     * For the following windows, we remove the first window value (is now not in window_values) and add the new
+     * value that results from the window shifting.
+     */
+    bool next_minimiser()
+    {
+        advance_window();
+        // Reaching the end counts as "found" so that the caller's loop terminates.
+        if (urng1_iterator == urng1_sentinel)
+            return true;
+
+        value_type const new_value = window_value();
+
+        window_values.pop_front();
+        window_values.push_back(new_value);
+
+        // Offset 0 means the previous minimiser was the value that has just been popped: rescan the whole window.
+        if (minimiser_position_offset == 0)
+        {
+            auto minimiser_it = std::ranges::min_element(window_values, std::less_equal{});
+            minimiser_value = *minimiser_it ;
+            minimiser_position_offset = std::distance(std::begin(window_values), minimiser_it);
+            return true;
+        }
+
+        // Only a strictly smaller value replaces the current minimiser; an equal value does not.
+        if (new_value < minimiser_value)
+        {
+            minimiser_value = new_value;
+            minimiser_position_offset = window_values.size() - 1;
+            return true;
+        }
+
+        // The minimiser is unchanged, but it moved one position towards the front of the shifted window.
+        --minimiser_position_offset;
+        return false;
+    }
+};
+
+//!\brief A deduction guide for the view class template.
+template
+minimiser_view(rng1_t &&, size_t const window_size) -> minimiser_view>;
+
+//!\brief A deduction guide for the view class template.
+template
+minimiser_view(rng1_t &&, rng2_t &&, size_t const window_size) -> minimiser_view,
+                                                                                  std::views::all_t>;
+
+// ---------------------------------------------------------------------------------------------------------------------
+// minimiser_fn (adaptor definition)
+// ---------------------------------------------------------------------------------------------------------------------
+
+//![adaptor_def]
+//!\brief views::minimiser's range adaptor object type (non-closure).
+struct minimiser_fn
+{
+    //!\brief Store the number of values in one window and return a range adaptor closure object.
+    constexpr auto operator()(size_t const window_size) const
+    {
+        return adaptor_from_functor{*this, window_size};
+    }
+
+    /*!\brief Call the view's constructor with two arguments: the underlying view and an integer indicating how many
+     *        values one window contains.
+     * \tparam urng1_t The type of the input range to process. Must model std::ranges::viewable_range.
+     * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and
+     *                    std::ranges::forward_range.
+     * \param[in] window_size The number of values in one window.
+     * \throws std::invalid_argument if `window_size` is 1.
+     * \returns A range of converted values.
+     */
+    template
+    constexpr auto operator()(urng1_t && urange1, size_t const window_size) const
+    {
+        static_assert(std::ranges::viewable_range,
+                      "The range parameter to views::minimiser cannot be a temporary of a non-view range.");
+        static_assert(std::ranges::forward_range,
+                      "The range parameter to views::minimiser must model std::ranges::forward_range.");
+
+        // NOTE(review): a window_size of 0 is not rejected here. For a non-empty range the iterator would then call
+        // pop_front() on an empty window when incremented - confirm whether 0 should throw as well.
+        if (window_size == 1) // Would just return urange1 without any changes
+            throw std::invalid_argument{"The chosen window_size is not valid. "
+                                        "Please choose a value greater than 1 or use two ranges."};
+
+        return minimiser_view{urange1, window_size};
+    }
+};
+//![adaptor_def]
+
+} // namespace seqan3::detail
+
+namespace seqan3::views
+{
+
+/*!\name General purpose views
+ * \{
+ */
+
+/*!\brief Computes minimisers for a range of comparable values. A minimiser is the smallest value in a window.
+ * \tparam urng_t The type of the first range being processed. See below for requirements. [template
+ *                parameter is omitted in pipe notation]
+ * \param[in] urange1 The range being processed. [parameter is omitted in pipe notation]
+ * \param[in] window_size The number of values in one window.
+ * \returns A range of std::totally_ordered where each value is the minimal value for one window. See below for the
+ *          properties of the returned range.
+ * \ingroup views
+ *
+ * \details
+ *
+ * A minimiser is the smallest value in a window. For example for the following list of hash values
+ * `[28, 100, 9, 23, 4, 1, 72, 37, 8]` and 4 as `window_size`, the minimiser values are `[9, 4, 1]`.
+ *
+ * The minimiser can be calculated for one given range or for two given ranges, where the minimiser is the smallest
+ * value in both windows. For example for the following list of hash values `[28, 100, 9, 23, 4, 1, 72, 37, 8]` and
+ * `[30, 2, 11, 101, 199, 73, 34, 900]` and 4 as `window_size`, the minimiser values are `[2, 1]` (the position-wise
+ * minima of both lists are `[28, 2, 9, 23, 4, 1, 34, 37]`).
+ *
+ * Note that in the interface with the second underlying range the const-iterable property will only be preserved if
+ * both underlying ranges are const-iterable.
+ *
+ * ### Robust Winnowing
+ *
+ * In case there are multiple minimal values within one window, the minimum and therefore the minimiser is ambiguous.
+ * We choose the rightmost value as the minimiser of the window, and when shifting the window, the minimiser is only
+ * changed if there appears a value that is strictly smaller than the current minimum. This approach is termed
+ * *robust winnowing* by [Jain et al.](https://www.biorxiv.org/content/10.1101/2020.02.11.943241v1.full.pdf)
+ * and is proven to work especially well on repeat regions.
+ * + * ### Example + * + * \include test/snippet/search/views/minimiser.cpp + * + * ### View properties + * + * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | + * |----------------------------------|:----------------------------------:|:--------------------------------:| + * | std::ranges::input_range | *required* | *preserved* | + * | std::ranges::forward_range | *required* | *preserved* | + * | std::ranges::bidirectional_range | | *lost* | + * | std::ranges::random_access_range | | *lost* | + * | std::ranges::contiguous_range | | *lost* | + * | | | | + * | std::ranges::viewable_range | *required* | *guaranteed* | + * | std::ranges::view | | *guaranteed* | + * | std::ranges::sized_range | | *lost* | + * | std::ranges::common_range | | *lost* | + * | std::ranges::output_range | | *lost* | + * | seqan3::const_iterable_range | | *preserved* | + * | | | | + * | std::ranges::range_reference_t | std::totally_ordered | std::totally_ordered | + * + * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. 
+ *
+ * \hideinitializer
+ *
+ * \stableapi{Since version 3.1.}
+ */
+inline constexpr auto minimiser = detail::minimiser_fn{};
+
+//!\}
+
+} // namespace seqan3::views
diff --git a/include/seqan3/search/views/minimiser_hash.hpp b/include/seqan3/search/views/minimiser_hash.hpp
new file mode 100644
index 0000000000..2476920ce9
--- /dev/null
+++ b/include/seqan3/search/views/minimiser_hash.hpp
@@ -0,0 +1,189 @@
+// -----------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
+// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
+// -----------------------------------------------------------------------------------------------------
+
+/*!\file
+ * \author Mitra Darvish
+ * \brief Provides seqan3::views::minimiser_hash.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace seqan3
+{
+//!\brief strong_type for seed.
+struct seed : seqan3::detail::strong_type
+{
+    using seqan3::detail::strong_type::strong_type;
+};
+
+//!\brief strong_type for the window_size.
+struct window_size : seqan3::detail::strong_type
+{
+    using seqan3::detail::strong_type::strong_type;
+};
+} // namespace seqan3
+
+namespace seqan3::detail
+{
+//!\brief seqan3::views::minimiser_hash's range adaptor object type (non-closure).
+struct minimiser_hash_fn
+{
+    /*!\brief Store the shape and the window size and return a range adaptor closure object.
+     * \param[in] shape The seqan3::shape to use for hashing.
+     * \param[in] window_size The window size to use.
+     * \throws std::invalid_argument Not by this overload; the shape/window size check is in the range overload.
+     * \returns A range adaptor closure object.
+     */
+    constexpr auto operator()(shape const & shape, window_size const window_size) const
+    {
+        return seqan3::detail::adaptor_from_functor{*this, shape, window_size};
+    }
+
+    /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object.
+     * \param[in] shape The seqan3::shape to use for hashing.
+     * \param[in] window_size The size of the window.
+     * \param[in] seed The seed to use.
+     * \throws std::invalid_argument Not by this overload; the shape/window size check is in the range overload.
+     * \returns A range adaptor closure object.
+     */
+    constexpr auto operator()(shape const & shape, window_size const window_size, seed const seed) const
+    {
+        return seqan3::detail::adaptor_from_functor{*this, shape, window_size, seed};
+    }
+
+    /*!\brief Call the view's constructor with the underlying view, a seqan3::shape, a window size and a seed.
+     * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type
+     *                   of the range must model seqan3::semialphabet.
+     * \param[in] shape The seqan3::shape to use for hashing.
+     * \param[in] window_size The size of the window.
+     * \param[in] seed The seed to use.
+     * \throws std::invalid_argument if the size of the shape is greater than the `window_size`.
+     * \returns A range of converted elements.
+     */
+    template
+    constexpr auto operator()(urng_t && urange,
+                              shape const & shape,
+                              window_size const window_size,
+                              seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const
+    {
+        static_assert(std::ranges::viewable_range,
+            "The range parameter to views::minimiser_hash cannot be a temporary of a non-view range.");
+        static_assert(std::ranges::forward_range,
+            "The range parameter to views::minimiser_hash must model std::ranges::forward_range.");
+        static_assert(semialphabet>,
+            "The range parameter to views::minimiser_hash must be over elements of seqan3::semialphabet.");
+
+        // Also guarantees that the subtraction in the return statement below cannot wrap around.
+        if (shape.size() > window_size.get())
+            throw std::invalid_argument{"The size of the shape cannot be greater than the window size."};
+
+        // NOTE(review): urange is forwarded twice (once per strand). For an rvalue argument the second use would see
+        // a moved-from object - confirm that this is safe for all supported range types.
+        // Hash values of the forward strand, XOR-ed with the seed.
+        auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape)
+                                                   | std::views::transform([seed] (uint64_t i)
+                                                                           {return i ^ seed.get();});
+
+        // Hash values of the reverse complement, XOR-ed with the seed. The trailing reverse presumably restores the
+        // forward order so that both strands are aligned position by position - TODO confirm.
+        auto reverse_strand = std::forward(urange) | seqan3::views::complement
+                                                   | std::views::reverse
+                                                   | seqan3::views::kmer_hash(shape)
+                                                   | std::views::transform([seed] (uint64_t i)
+                                                                           {return i ^ seed.get();})
+                                                   | std::views::reverse;
+
+        // The minimiser view works on hash values, so its window is given in k-mers rather than in characters.
+        return seqan3::detail::minimiser_view(forward_strand, reverse_strand, window_size.get() - shape.size() + 1);
+    }
+};
+
+} // namespace seqan3::detail
+
+namespace seqan3::views
+{
+
+/*!\name Alphabet related views
+ * \{
+ */
+
+/*!\brief Computes minimisers for a range with a given shape, window size and seed.
+ * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is
+ *                omitted in pipe notation]
+ * \param[in] urange The range being processed. [parameter is omitted in pipe notation]
+ * \param[in] shape The seqan3::shape that determines how to compute the hash value.
+ * \param[in] window_size The window size to use.
+ * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE.
+ * \returns A range of `size_t` where each value is the minimiser of the resp. window.
+ *          See below for the properties of the returned range.
+ * \ingroup views
+ *
+ * \details
+ *
+ * A sequence can be represented by a small number of k-mers (minimisers). For a given shape and window size all k-mers
+ * are determined in the forward strand and the backward strand and only the lexicographically smallest k-mer is
+ * returned for one window. This process is repeated over every possible window of a sequence. If consecutive windows
+ * share a minimiser, it is saved only once.
+ * For example, in the sequence "TAAAGTGCTAAA" for an ungapped shape of length 3 and a window size of 5 the first,
+ * the second and the last window contain the same minimiser "AAA".
+ * Because the minimisers of the first two consecutive windows also share the same position, storing this minimiser
+ * twice is redundant and it is stored only once. The "AAA" minimiser of the last window on the other hand is stored,
+ * since it is located at another position than the previous "AAA" minimiser and hence storing the second
+ * "AAA"-minimiser is not redundant but necessary.
+ *
+ * ### Non-lexicographical Minimisers by skewing the hash value with a seed
+ *
+ * It might happen that a minimiser changes only slightly when sliding the window over the sequence. For instance, when
+ * a minimiser starts with a repetition of A’s, then in the next window it is highly likely that the minimiser will
+ * start with a repetition of A’s as well. Because it is only one A shorter, depending on how long the repetition is
+ * this might go on for multiple window shifts. Saving these only slightly different minimisers makes no sense because
+ * they contain no new information about the underlying sequence.
+ * Additionally, sequences with a repetition of A’s will be seen as more similar to each other than they actually are.
+ * As [Marçais et al.](https://doi.org/10.1093/bioinformatics/btx235) have shown, randomizing the order of the k-mers
+ * can solve this problem. Therefore, a random seed is used to XOR all k-mers, thereby randomizing the
+ * order. The user can change the seed to any other value they think is useful. A seed of 0 yields the
+ * lexicographical order.
+ *
+ * \sa seqan3::views::minimiser
+ *
+ * \attention
+ * Be aware of the requirements of the seqan3::views::kmer_hash view.
+ *
+ * \experimentalapi
+ *
+ * ### View properties
+ *
+ * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) |
+ * |----------------------------------|:----------------------------------:|:--------------------------------:|
+ * | std::ranges::input_range | *required* | *preserved* |
+ * | std::ranges::forward_range | *required* | *preserved* |
+ * | std::ranges::bidirectional_range | | *lost* |
+ * | std::ranges::random_access_range | | *lost* |
+ * | std::ranges::contiguous_range | | *lost* |
+ * | | | |
+ * | std::ranges::viewable_range | *required* | *guaranteed* |
+ * | std::ranges::view | | *guaranteed* |
+ * | std::ranges::sized_range | | *lost* |
+ * | std::ranges::common_range | | *lost* |
+ * | std::ranges::output_range | | *lost* |
+ * | seqan3::const_iterable_range | | *preserved* |
+ * | | | |
+ * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t |
+ *
+ * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties.
+ * + * ### Example + * + * \include test/snippet/search/views/minimiser_hash.cpp + * + * \hideinitializer + * + * \experimentalapi{Experimental since version 3.1.} + */ +inline constexpr auto minimiser_hash = detail::minimiser_hash_fn{}; + +//!\} + +} // namespace seqan3::views diff --git a/test/include/seqan3/test/performance/naive_kmer_hash.hpp b/test/include/seqan3/test/performance/naive_kmer_hash.hpp index 6a2245a507..9a308332d4 100644 --- a/test/include/seqan3/test/performance/naive_kmer_hash.hpp +++ b/test/include/seqan3/test/performance/naive_kmer_hash.hpp @@ -94,7 +94,7 @@ namespace seqan3::views * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. * * ### Example - * \snippet test/snippet/range/views/kmer_hash.cpp usage + * \snippet test/snippet/search/views/kmer_hash.cpp usage * \hideinitializer */ inline auto constexpr naive_kmer_hash = detail::naive_kmer_hash_fn{}; diff --git a/test/include/seqan3/test/performance/naive_minimiser_hash.hpp b/test/include/seqan3/test/performance/naive_minimiser_hash.hpp index 0770f80692..0e0fd2fdf3 100644 --- a/test/include/seqan3/test/performance/naive_minimiser_hash.hpp +++ b/test/include/seqan3/test/performance/naive_minimiser_hash.hpp @@ -20,9 +20,9 @@ #include #include #include -#include -#include #include +#include +#include namespace seqan3::detail { diff --git a/test/performance/range/views/CMakeLists.txt b/test/performance/range/views/CMakeLists.txt index cee25c52c6..839d76e284 100644 --- a/test/performance/range/views/CMakeLists.txt +++ b/test/performance/range/views/CMakeLists.txt @@ -1,8 +1,6 @@ seqan3_benchmark(view_all_benchmark.cpp) seqan3_benchmark(view_drop_benchmark.cpp) seqan3_benchmark(view_drop_view_take_benchmark.cpp) -seqan3_benchmark(view_kmer_hash_benchmark.cpp) -seqan3_benchmark(view_minimiser_hash_benchmark.cpp) seqan3_benchmark(view_take_benchmark.cpp) seqan3_benchmark(view_take_until_benchmark.cpp) 
seqan3_benchmark(view_translate_1D_benchmark.cpp) diff --git a/test/performance/search/views/CMakeLists.txt b/test/performance/search/views/CMakeLists.txt new file mode 100644 index 0000000000..8e50453c19 --- /dev/null +++ b/test/performance/search/views/CMakeLists.txt @@ -0,0 +1,2 @@ +seqan3_benchmark(view_kmer_hash_benchmark.cpp) +seqan3_benchmark(view_minimiser_hash_benchmark.cpp) diff --git a/test/performance/range/views/view_kmer_hash_benchmark.cpp b/test/performance/search/views/view_kmer_hash_benchmark.cpp similarity index 99% rename from test/performance/range/views/view_kmer_hash_benchmark.cpp rename to test/performance/search/views/view_kmer_hash_benchmark.cpp index b9bb306da3..acb776081e 100644 --- a/test/performance/range/views/view_kmer_hash_benchmark.cpp +++ b/test/performance/search/views/view_kmer_hash_benchmark.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/test/performance/range/views/view_minimiser_hash_benchmark.cpp b/test/performance/search/views/view_minimiser_hash_benchmark.cpp similarity index 99% rename from test/performance/range/views/view_minimiser_hash_benchmark.cpp rename to test/performance/search/views/view_minimiser_hash_benchmark.cpp index 907c4f839a..c019dd2e8e 100644 --- a/test/performance/range/views/view_minimiser_hash_benchmark.cpp +++ b/test/performance/search/views/view_minimiser_hash_benchmark.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/test/snippet/search/dream_index/counting_agent.cpp b/test/snippet/search/dream_index/counting_agent.cpp index 63a7e20748..f0da50afaa 100644 --- a/test/snippet/search/dream_index/counting_agent.cpp +++ b/test/snippet/search/dream_index/counting_agent.cpp @@ -1,7 +1,7 @@ #include #include -#include #include +#include using seqan3::operator""_dna4; diff --git a/test/snippet/search/dream_index/interleaved_bloom_filter_clear.cpp 
b/test/snippet/search/dream_index/interleaved_bloom_filter_clear.cpp index b4875192c1..67dd73021b 100644 --- a/test/snippet/search/dream_index/interleaved_bloom_filter_clear.cpp +++ b/test/snippet/search/dream_index/interleaved_bloom_filter_clear.cpp @@ -1,7 +1,7 @@ #include #include -#include #include +#include using seqan3::operator""_dna4; diff --git a/test/snippet/range/views/kmer_hash.cpp b/test/snippet/search/views/kmer_hash.cpp similarity index 92% rename from test/snippet/range/views/kmer_hash.cpp rename to test/snippet/search/views/kmer_hash.cpp index c914bba09d..b3b46f3576 100644 --- a/test/snippet/range/views/kmer_hash.cpp +++ b/test/snippet/search/views/kmer_hash.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using seqan3::operator""_dna4; using seqan3::operator""_shape; diff --git a/test/snippet/range/views/minimiser.cpp b/test/snippet/search/views/minimiser.cpp similarity index 95% rename from test/snippet/range/views/minimiser.cpp rename to test/snippet/search/views/minimiser.cpp index 649c3dc289..85410a7139 100644 --- a/test/snippet/range/views/minimiser.cpp +++ b/test/snippet/search/views/minimiser.cpp @@ -1,8 +1,8 @@ #include #include #include -#include -#include +#include +#include using seqan3::operator""_dna4; using seqan3::operator""_shape; diff --git a/test/snippet/range/views/minimiser_hash.cpp b/test/snippet/search/views/minimiser_hash.cpp similarity index 96% rename from test/snippet/range/views/minimiser_hash.cpp rename to test/snippet/search/views/minimiser_hash.cpp index 17817cc2dc..81ad7e64a8 100644 --- a/test/snippet/range/views/minimiser_hash.cpp +++ b/test/snippet/search/views/minimiser_hash.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using seqan3::operator""_dna4; using seqan3::operator""_shape; diff --git a/test/unit/range/views/CMakeLists.txt b/test/unit/range/views/CMakeLists.txt index 74f150ebe5..dc21fb9584 100644 --- a/test/unit/range/views/CMakeLists.txt +++ b/test/unit/range/views/CMakeLists.txt @@ 
-29,8 +29,5 @@ seqan3_test(translate_test.cpp) seqan3_test(trim_quality_test.cpp) seqan3_test(single_pass_input_test.cpp) seqan3_test(get_test.cpp) -seqan3_test(kmer_hash_test.cpp) seqan3_test(interleave_test.cpp) -seqan3_test(minimiser_test.cpp) -seqan3_test(minimiser_hash_test.cpp) seqan3_test(zip_test.cpp) diff --git a/test/unit/search/views/CMakeLists.txt b/test/unit/search/views/CMakeLists.txt new file mode 100644 index 0000000000..4625270d01 --- /dev/null +++ b/test/unit/search/views/CMakeLists.txt @@ -0,0 +1,3 @@ +seqan3_test (kmer_hash_test.cpp) +seqan3_test (minimiser_hash_test.cpp) +seqan3_test (minimiser_test.cpp) diff --git a/test/unit/range/views/kmer_hash_test.cpp b/test/unit/search/views/kmer_hash_test.cpp similarity index 99% rename from test/unit/range/views/kmer_hash_test.cpp rename to test/unit/search/views/kmer_hash_test.cpp index cf92dbdb5d..22ca42cb9d 100644 --- a/test/unit/range/views/kmer_hash_test.cpp +++ b/test/unit/search/views/kmer_hash_test.cpp @@ -13,12 +13,12 @@ #include #include #include -#include #include #include +#include #include -#include "../iterator_test_template.hpp" +#include "../../range/iterator_test_template.hpp" #include diff --git a/test/unit/range/views/minimiser_hash_test.cpp b/test/unit/search/views/minimiser_hash_test.cpp similarity index 98% rename from test/unit/range/views/minimiser_hash_test.cpp rename to test/unit/search/views/minimiser_hash_test.cpp index 9a1cda1a22..6f5950cc9b 100644 --- a/test/unit/range/views/minimiser_hash_test.cpp +++ b/test/unit/search/views/minimiser_hash_test.cpp @@ -10,13 +10,13 @@ #include #include -#include #include +#include #include #include -#include "../iterator_test_template.hpp" +#include "../../range/iterator_test_template.hpp" using seqan3::operator""_dna4; using seqan3::operator""_shape; diff --git a/test/unit/range/views/minimiser_test.cpp b/test/unit/search/views/minimiser_test.cpp similarity index 98% rename from test/unit/range/views/minimiser_test.cpp rename to 
test/unit/search/views/minimiser_test.cpp index 8d7a895dad..7ab46e02d4 100644 --- a/test/unit/range/views/minimiser_test.cpp +++ b/test/unit/search/views/minimiser_test.cpp @@ -14,14 +14,14 @@ #include #include #include -#include -#include #include +#include +#include #include #include -#include "../iterator_test_template.hpp" +#include "../../range/iterator_test_template.hpp" using seqan3::operator""_dna4; using seqan3::operator""_shape;