Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Node access for fm_index_cursor #2076

Merged
merged 2 commits into from
Sep 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ The following API changes should be documented as such:
If possible, provide tooling that performs the changes, e.g. a shell-script.
-->

# 3.0.3

## New features

### Search

* The `seqan3::fm_index_cursor` exposes its suffix array interval ([\#2076](https://github.com/seqan/seqan3/pull/2076)).

# 3.0.2

Note that 3.1.0 will be the first API stable release and interfaces in this release might still change.
Expand Down
13 changes: 0 additions & 13 deletions include/seqan3/search/fm_index/detail/fm_index_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,6 @@ struct fm_index_cursor_node
}
//!\endcond
};

// std::tuple get_suffix_array_range(fm_index_cursor<index_t> const & it)
// {
// return {node.lb, node.rb};
// }
//
// std::tuple get_suffix_array_range(bi_fm_index_cursor<index_t> const & it)
// {
// return {node.lb, node.rb};
// }

//!\publicsection

//!\}

}
67 changes: 64 additions & 3 deletions include/seqan3/search/fm_index/fm_index_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,39 @@ namespace seqan3
* \{
*/

//!\brief A half-open interval `[begin_position, end_position)` of suffix array positions.
struct suffix_array_interval
{
    //!\brief First position that belongs to the interval (the "left boundary").
    size_t begin_position;
    //!\brief Position one past the last member of the interval (the exclusive "right boundary").
    size_t end_position;

    /*!\name Comparison operators
     * \{
     */
    /*!\brief Checks whether two intervals have identical boundaries.
     * \param[in] lhs The first `seqan3::suffix_array_interval`.
     * \param[in] rhs The `seqan3::suffix_array_interval` to compare against.
     * \returns `true` if both the begin and the end positions match, `false` otherwise.
     */
    friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
    {
        // A differing left boundary already decides the comparison.
        if (lhs.begin_position != rhs.begin_position)
            return false;

        return lhs.end_position == rhs.end_position;
    }

    /*!\brief Checks whether two intervals differ in at least one boundary.
     * \param[in] lhs The first `seqan3::suffix_array_interval`.
     * \param[in] rhs The `seqan3::suffix_array_interval` to compare against.
     * \returns `true` if the begin or the end positions differ, `false` otherwise.
     */
    friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
    {
        return lhs.begin_position != rhs.begin_position || lhs.end_position != rhs.end_position;
    }
    //!\}
};

/*!\brief The SeqAn FM Index Cursor.
* \implements seqan3::fm_index_cursor_specialisation
* \tparam index_t The type of the underlying index; must model seqan3::fm_index_specialisation.
Expand Down Expand Up @@ -168,7 +201,9 @@ class fm_index_cursor
index(&_index),
node({0, _index.index.size() - 1, 0, 0}),
sigma(_index.index.sigma - index_t::text_layout_mode)
{}
{
assert(_index.index.size() != 0);
}
//\}

/*!\brief Compares two cursors.
Expand Down Expand Up @@ -364,7 +399,7 @@ class fm_index_cursor
* \returns `true` if there exists a query in the text where the rightmost character of the query is
* lexicographically larger than the current rightmost character of the query.
*
* Example:
* ### Example
*
* \include test/snippet/search/fm_index_cursor.cpp
*
Expand Down Expand Up @@ -404,7 +439,7 @@ class fm_index_cursor
/*!\brief Outputs the rightmost rank.
* \returns Rightmost rank.
*
* Example:
* ### Example
*
* \include test/snippet/search/fm_index_cursor.cpp
*
Expand All @@ -424,12 +459,38 @@ class fm_index_cursor
return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
}

/*!\brief Returns the suffix array interval of the cursor as a half-open range.
 * \returns A seqan3::suffix_array_interval containing the half-open interval.
 *
 * ### Example
 *
 * \include test/snippet/search/fm_index_cursor.cpp
 *
 * ### Complexity
 *
 * Constant.
 *
 * ### Exceptions
 *
 * No-throw guarantee.
 */
seqan3::suffix_array_interval suffix_array_interval() const noexcept
{
    assert(index != nullptr);

    // The node stores an inclusive right boundary; adding one yields the exclusive end position.
    seqan3::suffix_array_interval interval{node.lb, node.rb + 1};
    return interval;
}

/*!\brief Returns the length of the searched query.
* \if DEV
* Returns the depth of the cursor node in the implicit suffix tree.
* \endif
* \returns Length of query.
*
* ### Example
*
* \include test/snippet/search/fm_index_cursor.cpp
*
* ### Complexity
*
* Constant.
Expand Down
24 changes: 18 additions & 6 deletions test/snippet/search/fm_index_cursor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,27 @@ int main()
auto cur = index.cursor(); // create a cursor
// cur.cycle_back(); // cycle_back on begin() is undefined behaviour!
cur.extend_right("AAC"_dna4); // search the sequence "AAC"
seqan3::debug_stream << cur.path_label(genome) << '\n'; // outputs "AAC"
seqan3::debug_stream << cur.last_rank() << '\n'; // outputs 1
seqan3::debug_stream << cur.path_label(genome) << '\n'; // prints "AAC"
seqan3::debug_stream << cur.last_rank() << '\n'; // prints 1
seqan3::debug_stream << cur.query_length() << '\n'; // prints 3
auto [left_bound, right_bound] = cur.suffix_array_interval(); // Get the half-open suffix array interval.
seqan3::debug_stream << '[' << left_bound
<< ',' << right_bound << ")\n"; // prints "[7,8)"

cur.cycle_back(); // search the sequence "AAT"
seqan3::debug_stream << cur.path_label(genome) << '\n'; // outputs "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // outputs 3
seqan3::debug_stream << cur.path_label(genome) << '\n'; // prints "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // prints 3
seqan3::debug_stream << cur.query_length() << '\n'; // prints 3
auto interval = cur.suffix_array_interval(); // Get the half-open suffix array interval.
seqan3::debug_stream << '[' << interval.begin_position
<< ',' << interval.end_position << ")\n"; // prints "[8,10)"

cur.cycle_back(); // "cur" doesn't change because the rightmost char
// is already the largest dna4 char.
seqan3::debug_stream << cur.path_label(genome) << '\n'; // outputs "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // outputs 3
seqan3::debug_stream << cur.path_label(genome) << '\n'; // prints "AAT"
seqan3::debug_stream << cur.last_rank() << '\n'; // prints 3
seqan3::debug_stream << cur.query_length() << '\n'; // prints 3
auto && [lb, rb] = cur.suffix_array_interval(); // Get the half-open suffix array interval.
seqan3::debug_stream << '[' << lb
<< ',' << rb << ")\n"; // prints "[8,10)"
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct fm_index_cursor_collection_test : public ::testing::Test
using alphabet_type = typename index_type::alphabet_type;
using text_type = std::vector<alphabet_type>;

static constexpr bool is_bi_fm_index = seqan3::bi_fm_index_specialisation<index_type>;
static constexpr auto convert = seqan3::views::char_to<alphabet_type> | seqan3::views::to<text_type>;

text_type text1{convert(std::string_view{"ACGACG"})};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
// -----------------------------------------------------------------------------------------------------

#include <gtest/gtest.h>

#include <seqan3/std/algorithm>
#include <type_traits>

#include <seqan3/range/views/slice.hpp>
#include <seqan3/search/fm_index/concept.hpp>
#include <seqan3/std/algorithm>
#include <seqan3/test/cereal.hpp>
#include <seqan3/test/expect_range_eq.hpp>

#include <sdsl/csa_wt.hpp>

#include "../helper.hpp"

using sdsl_byte_index_type = sdsl::csa_wt<
Expand Down Expand Up @@ -44,6 +43,10 @@ TYPED_TEST_P(fm_index_cursor_collection_test, ctr)
// custom constructor
TypeParam it0{fm};
EXPECT_EQ(it0.query_length(), 0u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it0.suffix_array_interval() == (seqan3::suffix_array_interval{0u, fm.size()}));
}
EXPECT_EQ(it0.locate().size(), fm.size());

// default construction (does not initialize the cursor)
Expand Down Expand Up @@ -78,6 +81,10 @@ TYPED_TEST_P(fm_index_cursor_collection_test, begin)
{1,4}, {1,5}, {1,6}}));
// one sentinel position included
EXPECT_EQ(it.query_length(), 0u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{0u, fm.size()}));
}
EXPECT_EQ(it.count(), 14u);
}

Expand All @@ -90,11 +97,19 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_range)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 1, 3))); // "CG"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{9u, 13u}));
}
EXPECT_EQ(it.count(), 4u);

EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 1))); // "A"
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 3u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{2u, 5u}));
}
EXPECT_EQ(it.count(), 3u);

// unsuccessful extend_right(range), it remains untouched
Expand All @@ -117,11 +132,19 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_range_empty_text)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 1, 3))); // "CG"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {3,2}, {3,6}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{9u, 13u}));
}
EXPECT_EQ(it.count(), 4u);

EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 1))); // "A"
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {3,2}, {3,6}}));
EXPECT_EQ(it.query_length(), 3u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{2u, 5u}));
}
EXPECT_EQ(it.count(), 3u);

// unsuccessful extend_right(range), it remains untouched
Expand Down Expand Up @@ -157,10 +180,18 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_char)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 1))); // "A"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}, {1,4}, {1,8}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{1u, 5u}));
}

EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 1, 2))); // "C"
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{5u, 7u}));
}

// unsuccessful extend_right(char), it remains untouched
TypeParam it_cpy = it;
Expand Down Expand Up @@ -190,10 +221,18 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_range_and_cycle)
EXPECT_TRUE(it.extend_right(seqan3::views::slice(this->text1, 0, 4))); // "ACGA"
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {1,1}}));
EXPECT_EQ(it.query_length(), 4u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{3u, 5u}));
}

EXPECT_TRUE(it.cycle_back());
EXPECT_EQ(it.locate(), (std::vector<std::pair<uint64_t, uint64_t>>{{0,4}}));
EXPECT_EQ(it.query_length(), 4u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{10u, 11u}));
}
}

TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_char_and_cycle)
Expand All @@ -206,11 +245,19 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_char_and_cycle)
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}, {0,4},
{1,4}, {1,8}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{1u, 6u}));
}

EXPECT_TRUE(it.cycle_back());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,5}, {0,7},
{1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{6u, 11u}));
}
}

TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_and_cycle)
Expand All @@ -222,14 +269,25 @@ TYPED_TEST_P(fm_index_cursor_collection_test, extend_right_and_cycle)
EXPECT_TRUE(it.extend_right());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,0}, {0,3}, {1,4}, {1,8}}));
EXPECT_EQ(it.query_length(), 1u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{1u, 5u}));
}

EXPECT_TRUE(it.cycle_back());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 1u);

if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{5u, 9u}));
}
EXPECT_TRUE(it.extend_right());
EXPECT_EQ(seqan3::uniquify(it.locate()), (std::vector<std::pair<uint64_t, uint64_t>>{{0,1}, {0,4}, {1,2}, {1,6}}));
EXPECT_EQ(it.query_length(), 2u);
if constexpr (!this->is_bi_fm_index)
{
EXPECT_TRUE(it.suffix_array_interval() == (seqan3::suffix_array_interval{9u, 13u}));
}

// unsuccessful cycle_back(), it remains untouched
TypeParam it_cpy = it;
Expand Down
1 change: 1 addition & 0 deletions test/unit/search/fm_index_cursor/fm_index_cursor_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct fm_index_cursor_test : public ::testing::Test
using alphabet_type = typename index_type::alphabet_type;
using text_type = std::vector<alphabet_type>;

static constexpr bool is_bi_fm_index = seqan3::bi_fm_index_specialisation<index_type>;
static constexpr auto convert = seqan3::views::char_to<alphabet_type> | seqan3::views::to<text_type>;

text_type text1{convert(std::string_view{"ACGACG"})};
Expand Down
Loading