Skip to content

Commit

Permalink
[FEATURE] Node access for fm_index_cursor (#2076)
Browse files Browse the repository at this point in the history
* [FEATURE] Suffix array interval of fm_index_cursor
  • Loading branch information
eseiler committed Sep 22, 2020
1 parent 205674c commit ac39faa
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 29 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ The following API changes should be documented as such:
If possible, provide tooling that performs the changes, e.g. a shell-script.
-->

# 3.0.3

## New features

### Search

* The `seqan3::fm_index_cursor` exposes its suffix array interval ([\#2076](https://github.com/seqan/seqan3/pull/2076)).

# 3.0.2

Note that 3.1.0 will be the first API stable release and interfaces in this release might still change.
Expand Down
13 changes: 0 additions & 13 deletions include/seqan3/search/fm_index/detail/fm_index_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,6 @@ struct fm_index_cursor_node
}
//!\endcond
};

// std::tuple get_suffix_array_range(fm_index_cursor<index_t> const & it)
// {
// return {node.lb, node.rb};
// }
//
// std::tuple get_suffix_array_range(bi_fm_index_cursor<index_t> const & it)
// {
// return {node.lb, node.rb};
// }

//!\publicsection

//!\}

}
67 changes: 64 additions & 3 deletions include/seqan3/search/fm_index/fm_index_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,39 @@ namespace seqan3
* \{
*/

//!\brief A half-open interval `[begin_position, end_position)` of suffix array positions.
struct suffix_array_interval
{
    //!\brief First position that belongs to the interval (the "left boundary").
    size_t begin_position;
    //!\brief First position past the interval, i.e. the exclusive "right boundary".
    size_t end_position;

    /*!\name Comparison operators
     * \{
     */
    /*!\brief Checks whether two intervals have identical boundaries.
     * \param[in] lhs The first `seqan3::suffix_array_interval`.
     * \param[in] rhs The `seqan3::suffix_array_interval` to compare against.
     * \returns `true` if both the begin and the end positions match, `false` otherwise.
     */
    friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
    {
        if (lhs.begin_position != rhs.begin_position)
            return false;

        return lhs.end_position == rhs.end_position;
    }

    /*!\brief Checks whether two intervals differ in at least one boundary.
     * \param[in] lhs The first `seqan3::suffix_array_interval`.
     * \param[in] rhs The `seqan3::suffix_array_interval` to compare against.
     * \returns `true` if the begin or the end positions differ, `false` otherwise.
     */
    friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
    {
        return lhs.begin_position != rhs.begin_position || lhs.end_position != rhs.end_position;
    }
    //!\}
};

/*!\brief The SeqAn FM Index Cursor.
* \implements seqan3::fm_index_cursor_specialisation
* \tparam index_t The type of the underlying index; must model seqan3::fm_index_specialisation.
Expand Down Expand Up @@ -168,7 +201,9 @@ class fm_index_cursor
index(&_index),
node({0, _index.index.size() - 1, 0, 0}),
sigma(_index.index.sigma - index_t::text_layout_mode)
{}
{
assert(_index.index.size() != 0);
}
//\}

/*!\brief Compares two cursors.
Expand Down Expand Up @@ -364,7 +399,7 @@ class fm_index_cursor
* \returns `true` if there exists a query in the text where the rightmost character of the query is
* lexicographically larger than the current rightmost character of the query.
*
* Example:
* ### Example
*
* \include test/snippet/search/fm_index_cursor.cpp
*
Expand Down Expand Up @@ -404,7 +439,7 @@ class fm_index_cursor
/*!\brief Outputs the rightmost rank.
* \returns Rightmost rank.
*
* Example:
* ### Example
*
* \include test/snippet/search/fm_index_cursor.cpp
*
Expand All @@ -424,12 +459,38 @@ class fm_index_cursor
return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
}

/*!\brief Provides the half-open suffix array interval of the cursor's current node.
 * \returns A seqan3::suffix_array_interval that contains the half-open interval.
 *
 * ### Example
 *
 * \include test/snippet/search/fm_index_cursor.cpp
 *
 * ### Complexity
 *
 * Constant.
 *
 * ### Exceptions
 *
 * No-throw guarantee.
 */
seqan3::suffix_array_interval suffix_array_interval() const noexcept
{
    assert(index != nullptr);

    // The "+ 1" turns the node's right bound into the exclusive end of the half-open interval.
    return seqan3::suffix_array_interval{node.lb, node.rb + 1};
}

/*!\brief Returns the length of the searched query.
* \if DEV
* Returns the depth of the cursor node in the implicit suffix tree.
* \endif
* \returns Length of query.
*
* ### Example
*
* \include test/snippet/search/fm_index_cursor.cpp
*
* ### Complexity
*
* Constant.
Expand Down
24 changes: 18 additions & 6 deletions test/snippet/search/fm_index_cursor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,27 @@ int main()
auto cur = index.cursor(); // create a cursor
// cur.cycle_back(); // cycle_back on begin() is undefined behaviour!
cur.extend_right("AAC"_dna4); // search the sequence "AAC"
seqan3::debug_stream << cur.path_label(genome) << '\n'; // outputs "AAC"
seqan3::debug_stream << cur.last_rank() << '\n'; // outputs 1
seqan3::debug_stream << cur.path_label(genome) << '\n'; // prints "AAC"
seqan3::debug_stream << cur.last_rank() << '\n'; // prints 1
seqan3::debug_stream << cur.query_length() << '\n'; // prints 3
auto [left_bound, right_bound] = cur.suffix_array_interval(); // Get the half-open suffix array interval.
seqan3::debug_stream << '[' << left_bound
<< ',' << right_bound << ")\n"; // prints "[7,8)"

cur.cycle_back(); // search the sequence "AAT"
seqan3::debug_stream << cur.path_label(genome) << '\n'; // outputs "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // outputs 3
seqan3::debug_stream << cur.path_label(genome) << '\n'; // prints "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // prints 3
seqan3::debug_stream << cur.query_length() << '\n'; // prints 3
auto interval = cur.suffix_array_interval(); // Get the half-open suffix array interval.
seqan3::debug_stream << '[' << interval.begin_position
<< ',' << interval.end_position << ")\n"; // prints "[8,10)"

cur.cycle_back(); // "cur" doesn't change because the rightmost char
// is already the largest dna4 char.
seqan3::debug_stream << cur.path_label(genome) << '\n'; // outputs "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // outputs 3
seqan3::debug_stream << cur.path_label(genome) << '\n'; // prints "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // prints 3
seqan3::debug_stream << cur.query_length() << '\n'; // prints 3
auto && [lb, rb] = cur.suffix_array_interval(); // Get the half-open suffix array interval.
seqan3::debug_stream << '[' << lb
<< ',' << rb << ")\n"; // prints "[8,10)"
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct fm_index_cursor_collection_test : public ::testing::Test
using alphabet_type = typename index_type::alphabet_type;
using text_type = std::vector<alphabet_type>;

static constexpr bool is_bi_fm_index = seqan3::bi_fm_index_specialisation<index_type>;
static constexpr auto convert = seqan3::views::char_to<alphabet_type> | seqan3::views::to<text_type>;

text_type text1{convert(std::string_view{"ACGACG"})};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
// -----------------------------------------------------------------------------------------------------

#include <gtest/gtest.h>

#include <seqan3/std/algorithm>
#include <type_traits>

#include <seqan3/range/views/slice.hpp>
#include <seqan3/search/fm_index/concept.hpp>
#include <seqan3/std/algorithm>
#include <seqan3/test/cereal.hpp>
#include <seqan3/test/expect_range_eq.hpp>

#include <sdsl/csa_wt.hpp>

#include "../helper.hpp"

using sdsl_byte_index_type = sdsl::csa_wt<
Expand Down Expand Up @@ -44,6 +43,10 @@ TYPED_TEST_P(fm_index_cursor_collection_test, ctr)
// custom constructor
TypeParam it0{fm};
EXPECT_EQ(it0.query_length(), 0u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it0.suffix_array_interval() == (seqan3::suffix_array_interval{0u, fm.size()}));
}
EXPECT_EQ(it0.locate().size(), fm.size());

// default construction (does not initialize the cursor)
Expand Down Expand Up @@ -78,6 +81,10 @@ TYPED_TEST_P(fm_index_cursor_collection_test, begin)
{1,4}, {1,5}, {1,6}}));
// one sentinel position included
EXPECT_EQ(it.query_length(), 0u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{0u, fm.size()}));
}
EXPECT_EQ(it.count(), 14u);
}

Expand All @@ -90,11 +97,19 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_range)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 1, 3))); // "CG"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{9u, 13u}));
}
EXPECT_EQ(it.count(), 4u);

EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 1))); // "A"
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 3u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{2u, 5u}));
}
EXPECT_EQ(it.count(), 3u);

// unsuccessful extend_right(range), it remains untouched
Expand All @@ -117,11 +132,19 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_range_empty_text)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 1, 3))); // "CG"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {3,2}, {3,6}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{9u, 13u}));
}
EXPECT_EQ(it.count(), 4u);

EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 1))); // "A"
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {3,2}, {3,6}}));
EXPECT_EQ(it.query_length(), 3u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{2u, 5u}));
}
EXPECT_EQ(it.count(), 3u);

// unsuccessful extend_right(range), it remains untouched
Expand Down Expand Up @@ -157,10 +180,18 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_char)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 1))); // "A"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}, {1,4}, {1,8}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{1u, 5u}));
}

EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 1, 2))); // "C"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{5u, 7u}));
}

// unsuccessful extend_right(char), it remains untouched
TypeParam it_cpy = it;
Expand Down Expand Up @@ -190,10 +221,18 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_range_and_cycle)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 4))); // "ACGA"
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {1,1}}));
EXPECT_EQ(it.query_length(), 4u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{3u, 5u}));
}

EXPECT_TRUE(it.cycle_back());
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,4}}));
EXPECT_EQ(it.query_length(), 4u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{10u, 11u}));
}
}

TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_char_and_cycle)
Expand All @@ -206,11 +245,19 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_char_and_cycle)
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}, {0,4},
{1,4}, {1,8}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{1u, 6u}));
}

EXPECT_TRUE(it.cycle_back());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,5}, {0,7},
{1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{6u, 11u}));
}
}

TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_and_cycle)
Expand All @@ -222,14 +269,25 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_and_cycle)
EXPECT_TRUE(it.extend_right());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}, {1,4}, {1,8}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{1u, 5u}));
}

EXPECT_TRUE(it.cycle_back());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 1u);

if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{5u, 9u}));
}
EXPECT_TRUE(it.extend_right());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{9u, 13u}));
}

// unsuccessful cycle_back(), it remains untouched
TypeParam it_cpy = it;
Expand Down
1 change: 1 addition & 0 deletions test/unit/search/fm_index_cursor/fm_index_cursor_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct fm_index_cursor_test : public ::testing::Test
using alphabet_type = typename index_type::alphabet_type;
using text_type = std::vector<alphabet_type>;

static constexpr bool is_bi_fm_index = seqan3::bi_fm_index_specialisation<index_type>;
static constexpr auto convert = seqan3::views::char_to<alphabet_type> | seqan3::views::to<text_type>;

text_type text1{convert(std::string_view{"ACGACG"})};
Expand Down

0 comments on commit ac39faa

Please sign in to comment.