Skip to content

Commit

Permalink
[FEATURE] new seqan3::cigar::operation
Browse files Browse the repository at this point in the history
This commit does multiple things:

* mark entities within seqan3::cigar/::operations stable
* moves extra alphabet seqan3::cigar_op into seqan3::cigar::operation
* adds the actual implementation seqan3::cigar::operation
  (former seqan3::cigar_op) to seqan3::exposition_only::cigar_operation
  • Loading branch information
marehr committed Feb 17, 2021
1 parent 6a72114 commit 9ecaa52
Show file tree
Hide file tree
Showing 15 changed files with 178 additions and 135 deletions.
6 changes: 3 additions & 3 deletions include/seqan3/alphabet/cigar/all.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
*
* CIGAR strings are combinations of count values and CIGAR operations, representing an alignment as a sequence of
* edit operations. This submodule has two different
* alphabets. One is the seqan3::cigar_op alphabet, which is a base seqan3::alphabet implementation. This
* alphabets. One is the seqan3::cigar::operation alphabet, which is a base seqan3::alphabet implementation. This
* contains all valid symbols contained in CIGAR strings. The other alphabet is the seqan3::cigar alphabet, which
* is an alphabet tuple. It combines the seqan3::cigar_op alphabet with a count value,
* is an alphabet tuple. It combines the seqan3::cigar::operation alphabet with a count value,
* such that one can represent an entire CIGAR string with a std::vector of seqan3::cigar values.
*
* The following table outlines the valid characters in the seqan3::cigar_op alphabet.
* The following table outlines the valid characters in the seqan3::cigar::operation alphabet.
*
* \copydoc seqan3::doxygen::cigar_operation_table
*/
63 changes: 57 additions & 6 deletions include/seqan3/alphabet/cigar/cigar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
namespace seqan3
{

/*!\brief The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar_op letter.
/*!\brief The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
* \ingroup cigar
* \implements seqan3::writable_semialphabet
* \implements seqan3::trivially_copyable
Expand All @@ -36,7 +36,7 @@ namespace seqan3
* \details
*
* This semialphabet represents a unit in a CIGAR string, typically found in the
* SAM and BAM formats. It consists of a number and a seqan3::cigar_op symbol.
* SAM and BAM formats. It consists of a number and a seqan3::cigar::operation symbol.
*
* It has a "visual representation", but since this is a string and not a char,
* the type only models seqan3::writable_semialphabet and not
Expand All @@ -45,7 +45,7 @@ namespace seqan3
*
* To avoid confusion between string and char literal, this alphabet has
* no user defined literal operators. Always assign from a pair of
* uint32_t and seqan3::cigar_op.
* uint32_t and seqan3::cigar::operation.
*
* \copydoc seqan3::doxygen::cigar_operation_table
*
Expand All @@ -55,17 +55,48 @@ namespace seqan3
*
* \sa https://samtools.github.io/hts-specs/SAMv1.pdf#page=8
*/
class cigar : public alphabet_tuple_base<cigar, uint32_t, cigar_op>
class cigar : public alphabet_tuple_base<cigar, uint32_t, exposition_only::cigar_operation>
{
private:
//!\brief The base class.
using base_t = alphabet_tuple_base<cigar, uint32_t, cigar_op>;
using base_t = alphabet_tuple_base<cigar, uint32_t, exposition_only::cigar_operation>;

//!\cond \brief Befriend seqan3::alphabet_tuple_base.
friend base_t;
//!\endcond

public:

/*!\brief The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
*
* \details
*
* The CIGAR string can be either basic or extended. The only difference in the extended cigar alphabet is that
* aligned bases are classified as an actual match ('=') or mismatch ('X'). In contrast, the basic cigar alphabet
* only indicated the aligned status with an 'M', without further information if the bases are actually equal or
* not.
*
* The main purpose of the seqan3::cigar::operation alphabet is to be used in the seqan3::cigar
* composition, where a cigar operation is paired with a count value.
*
* \copydoc seqan3::doxygen::cigar_operation_table
*
* Example usage:
* \include test/snippet/alphabet/cigar/cigar_op.cpp
*
* \if DEV
* \note Usually you do not want to manipulate cigar elements and vectors on
* your own but convert an alignment to a cigar and back. See
* seqan3::detail::get_cigar_vector for how to convert two aligned sequences into
* a cigar_vector.
* \endif
*
* \sa https://samtools.github.io/hts-specs/SAMv1.pdf#page=8
*
* \stableapi{Since version 3.1.}
*/
using operation = exposition_only::cigar_operation;

/*!\name Constructors, destructor and assignment
* \{
*/
Expand Down Expand Up @@ -127,7 +158,7 @@ class cigar : public alphabet_tuple_base<cigar, uint32_t, cigar_op>
uint32_t num{};
auto [ ptr, errc ] = std::from_chars(s.data(), s.data() + 10, num);

if ((errc != std::errc{}) || (!char_is_valid_for<cigar_op>(*ptr)) || (*(ptr + 1) != 0))
if ((errc != std::errc{}) || (!char_is_valid_for<operation>(*ptr)) || (*(ptr + 1) != 0))
{
get<0>(*this) = 0;
assign_char_to('P', get<1>(*this));
Expand Down Expand Up @@ -169,4 +200,24 @@ inline debug_stream_type<char_t> & operator<<(debug_stream_type<char_t> & s, cig
return s;
}

// ------------------------------------------------------------------
// literals
// ------------------------------------------------------------------

/*!\name Literals
* \{
*/

/*!\brief The seqan3::cigar::operation char literal.
* \relates seqan3::cigar
* \returns seqan3::cigar::operation
*
* \stableapi{Since version 3.1.}
*/
inline cigar::operation operator""_cigar_operation(char const c) noexcept
{
return cigar::operation{}.assign_char(c);
}
//!\}

} // namespace seqan3
68 changes: 23 additions & 45 deletions include/seqan3/alphabet/cigar/exposition_only/cigar_operation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
// -----------------------------------------------------------------------------------------------------

/*!\file
* \brief Introduces the cigar_op alphabet.
* \brief Introduces the seqan3::exposition_only::cigar_operation alphabet.
* \author Joshua Kim <joshua.kim AT fu-berlin.de>
*/

Expand All @@ -15,14 +15,13 @@
#include <seqan3/alphabet/alphabet_base.hpp>

// ------------------------------------------------------------------
// cigar_op
// cigar_operation
// ------------------------------------------------------------------

namespace seqan3
namespace seqan3::exposition_only
{

/*!\brief The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
* \ingroup cigar
/*!\brief The actual implementation of seqan3::cigar::operation for documentation purpose-only.
* \implements seqan3::writable_alphabet
* \if DEV \implements seqan3::detail::writable_constexpr_alphabet \endif
* \implements seqan3::trivially_copyable
Expand All @@ -31,29 +30,25 @@ namespace seqan3
*
* \details
*
* The CIGAR string can be either basic or extended. The only difference in the
* extended cigar alphabet is that aligned bases are classified as an actual
* match ('=') or mismatch ('X'). In contrast, the basic cigar alphabet
* only indicated the aligned status with an 'M', without further
* information if the bases are actually equal or not.
* \note Please note that this class only exists, because of technically reasons.
* Please always use seqan3::cigar::operation instead of seqan3::exposition_only::cigar_operation.
*
* The main purpose of the seqan3::cigar_op alphabet is to be used in the
* seqan3::cigar composition, where a cigar operation is paired with a count
* value.
* \if DEV
* \note We can't declare seqan3::cigar::operation in-class, because we need to specify the second tuple element of
* seqan3::alphabet_tuple_base before we actually can declare it in-class. This is a trade-off to make
* seqan3::cigar a non-template class.
* \endif
*
* Example usage:
* \include test/snippet/alphabet/cigar/cigar_op.cpp
* \sa seqan3::exposition_only for what exposition-only means.
*
* \note Usually you do not want to manipulate cigar elements and vectors on
* your own but convert an alignment to a cigar and back. See
* seqan3::get_cigar_vector for how to convert two aligned sequences into
* a cigar_vector.
* \noapi{Please always use seqan3::cigar::operation. The API-Stability of all members documented in this class applies
* directly to seqan3::cigar::operation.}
*/
class cigar_op : public alphabet_base<cigar_op, 9, char>
class cigar_operation : public alphabet_base<cigar_operation, 9, char>
{
private:
//!\brief The base class.
using base_t = alphabet_base<cigar_op, 9, char>;
using base_t = alphabet_base<cigar_operation, 9, char>;

//!\cond \brief Befriend seqan3::alphabet_base.
friend base_t;
Expand All @@ -63,12 +58,12 @@ class cigar_op : public alphabet_base<cigar_op, 9, char>
/*!\name Constructors, destructor and assignment
* \{
*/
constexpr cigar_op() noexcept = default; //!< Defaulted.
constexpr cigar_op(cigar_op const &) noexcept = default; //!< Defaulted.
constexpr cigar_op(cigar_op &&) noexcept = default; //!< Defaulted.
constexpr cigar_op & operator=(cigar_op const &) noexcept = default; //!< Defaulted.
constexpr cigar_op & operator=(cigar_op &&) noexcept = default; //!< Defaulted.
~cigar_op() noexcept = default; //!< Defaulted.
constexpr cigar_operation() noexcept = default; //!< Defaulted.
constexpr cigar_operation(cigar_operation const &) noexcept = default; //!< Defaulted.
constexpr cigar_operation(cigar_operation &&) noexcept = default; //!< Defaulted.
constexpr cigar_operation & operator=(cigar_operation const &) noexcept = default; //!< Defaulted.
constexpr cigar_operation & operator=(cigar_operation &&) noexcept = default; //!< Defaulted.
~cigar_operation() noexcept = default; //!< Defaulted.
//!\}

protected:
Expand Down Expand Up @@ -106,21 +101,4 @@ class cigar_op : public alphabet_base<cigar_op, 9, char>
};
};

// ------------------------------------------------------------------
// literals
// ------------------------------------------------------------------

/*!\name Literals
* \{
*/

/*!\brief The seqan3::cigar_op char literal.
* \relates seqan3::cigar_op
* \returns seqan3::cigar_op
*/
inline cigar_op operator""_cigar_op(char const c) noexcept
{
return cigar_op{}.assign_char(c);
}
//!\}
} // namespace seqan3
} // namespace seqan3::exposition_only
41 changes: 23 additions & 18 deletions include/seqan3/io/alignment_file/detail.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ struct view_equality_fn
* \param reference_char The aligned character of the reference to compare.
* \param query_char The aligned character of the query to compare.
* \param extended_cigar Whether to print the extended cigar alphabet or not. See cigar operation.
* \returns A seqan3::cigar_op representing the alignment operation between the two values.
* \returns A seqan3::cigar::operation representing the alignment operation between the two values.
*
* \details
*
Expand Down Expand Up @@ -81,9 +81,9 @@ struct view_equality_fn
* \sa seqan3::aligned_sequence
*/
template <typename reference_char_type, typename query_char_type>
[[nodiscard]] constexpr cigar_op map_aligned_values_to_cigar_op(reference_char_type const reference_char,
query_char_type const query_char,
bool const extended_cigar)
[[nodiscard]] constexpr cigar::operation map_aligned_values_to_cigar_op(reference_char_type const reference_char,
query_char_type const query_char,
bool const extended_cigar)
//!\cond
requires seqan3::detail::weakly_equality_comparable_with<reference_char_type, gap> &&
seqan3::detail::weakly_equality_comparable_with<query_char_type, gap>
Expand All @@ -94,7 +94,7 @@ template <typename reference_char_type, typename query_char_type>
if (extended_cigar && (key == 0)) // in extended format refine the substitution operator to match/mismatch.
key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].

return assign_char_to(operators[key], cigar_op{});
return assign_char_to(operators[key], cigar::operation{});
}

/*!\brief Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two
Expand Down Expand Up @@ -155,18 +155,20 @@ template <seqan3::detail::pairwise_alignment alignment_type>

// Add (S)oft-clipping at the start of the read
if (query_start_pos)
result.emplace_back(query_start_pos, 'S'_cigar_op);
result.emplace_back(query_start_pos, 'S'_cigar_operation);

// Create cigar string from alignment
// -------------------------------------------------------------------------
// initialize first operation and count value:
cigar_op operation{map_aligned_values_to_cigar_op(ref_seq[0], query_seq[0], extended_cigar)};
cigar::operation operation{map_aligned_values_to_cigar_op(ref_seq[0], query_seq[0], extended_cigar)};
uint32_t count{0};

// go through alignment columns
for (auto column : views::zip(ref_seq, query_seq))
{
cigar_op next_op = map_aligned_values_to_cigar_op(std::get<0>(column), std::get<1>(column), extended_cigar);
cigar::operation next_op = map_aligned_values_to_cigar_op(std::get<0>(column),
std::get<1>(column),
extended_cigar);

if (operation == next_op)
{
Expand All @@ -185,7 +187,7 @@ template <seqan3::detail::pairwise_alignment alignment_type>

// Add (S)oft-clipping at the end of the read
if (query_end_pos)
result.emplace_back(query_end_pos, 'S'_cigar_op);
result.emplace_back(query_end_pos, 'S'_cigar_operation);

return result;
}
Expand Down Expand Up @@ -319,34 +321,37 @@ inline void alignment_from_cigar(alignment_type & alignment, std::vector<cigar>
for (auto [cigar_count, cigar_operation] : cigar_vector)
{
// ignore since alignment shall contain sliced sequences
if (('S'_cigar_op == cigar_operation) || ('H'_cigar_op == cigar_operation))
if (('S'_cigar_operation == cigar_operation) || ('H'_cigar_operation == cigar_operation))
continue;

assert(('M'_cigar_op == cigar_operation) || ('='_cigar_op == cigar_operation) ||
('X'_cigar_op == cigar_operation) || ('D'_cigar_op == cigar_operation) ||
('N'_cigar_op == cigar_operation) || ('I'_cigar_op == cigar_operation) ||
('P'_cigar_op == cigar_operation)); // checked during IO
assert(('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
('X'_cigar_operation == cigar_operation) || ('D'_cigar_operation == cigar_operation) ||
('N'_cigar_operation == cigar_operation) || ('I'_cigar_operation == cigar_operation) ||
('P'_cigar_operation == cigar_operation)); // checked during IO

if (('M'_cigar_op == cigar_operation) || ('='_cigar_op == cigar_operation) || ('X'_cigar_op == cigar_operation))
if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
('X'_cigar_operation == cigar_operation))
{
std::ranges::advance(current_ref_pos , cigar_count);
std::ranges::advance(current_read_pos, cigar_count);
}
else if (('D'_cigar_op == cigar_operation) || ('N'_cigar_op == cigar_operation)) // insert gaps into read
else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
{
// insert gaps into read

assert(std::distance(current_read_pos, std::ranges::end(get<1>(alignment))) >= 0);
current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
++current_read_pos;
std::ranges::advance(current_ref_pos , cigar_count);
}
else if (('I'_cigar_op == cigar_operation)) // Insert gaps into ref
else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
{
assert(std::ranges::distance(current_ref_pos, std::ranges::end(get<0>(alignment))) >= 0);
current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
++current_ref_pos;
std::ranges::advance(current_read_pos, cigar_count);
}
else if (('P'_cigar_op == cigar_operation)) // skip padding
else if (('P'_cigar_operation == cigar_operation)) // skip padding
{
current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
++current_ref_pos;
Expand Down
6 changes: 3 additions & 3 deletions include/seqan3/io/alignment_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -763,8 +763,8 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & s
{
tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
cigar_vector.resize(2);
cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_op};
cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_op};
cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
}

std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
Expand Down Expand Up @@ -1125,7 +1125,7 @@ inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint
count = operation_and_count >> 4;

update_alignment_lengths(ref_length, seq_length, operation, count);
operations.emplace_back(count, cigar_op{}.assign_char(operation));
operations.emplace_back(count, cigar::operation{}.assign_char(operation));
--n_cigar_op;
}

Expand Down
6 changes: 3 additions & 3 deletions include/seqan3/io/alignment_file/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,9 @@ inline void format_sam_base::transfer_soft_clipping_to(std::vector<cigar> const
int32_t & sc_end) const
{
// Checks if the given index in the cigar vector is a soft clip.
auto soft_clipping_at = [&] (size_t const index) { return cigar_vector[index] == 'S'_cigar_op; };
auto soft_clipping_at = [&] (size_t const index) { return cigar_vector[index] == 'S'_cigar_operation; };
// Checks if the given index in the cigar vector is a hard clip.
auto hard_clipping_at = [&] (size_t const index) { return cigar_vector[index] == 'H'_cigar_op; };
auto hard_clipping_at = [&] (size_t const index) { return cigar_vector[index] == 'H'_cigar_operation; };
// Checks if the given cigar vector as at least min_size many elements.
auto vector_size_at_least = [&] (size_t const min_size) { return cigar_vector.size() >= min_size; };
// Returns the cigar count of the ith cigar element in the given cigar vector.
Expand Down Expand Up @@ -280,7 +280,7 @@ inline std::tuple<std::vector<cigar>, int32_t, int32_t> format_sam_base::parse_c
throw format_error{"Corrupted cigar string encountered"};

update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
operations.emplace_back(cigar_count, cigar_op{}.assign_char(cigar_operation));
operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
}

return {operations, ref_length, seq_length};
Expand Down
Loading

0 comments on commit 9ecaa52

Please sign in to comment.