Skip to content

Commit

Permalink
[MISC] Remove field::alignment from sam_file_[in/out]out.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Oct 5, 2022
1 parent c20f219 commit 02023da
Show file tree
Hide file tree
Showing 15 changed files with 141 additions and 759 deletions.
70 changes: 0 additions & 70 deletions include/seqan3/io/sam_file/detail/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,6 @@ class format_sam_base
header_type & header,
ref_seqs_type & /*tag*/);

template <typename align_type, typename ref_seqs_type>
void construct_alignment(align_type & align,
std::vector<cigar> & cigar_vector,
[[maybe_unused]] int32_t rid,
[[maybe_unused]] ref_seqs_type & ref_seqs,
[[maybe_unused]] int32_t ref_start,
size_t ref_length);

void transfer_soft_clipping_to(std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end) const;

template <typename stream_view_t>
Expand Down Expand Up @@ -199,68 +191,6 @@ inline void format_sam_base::transfer_soft_clipping_to(std::vector<cigar> const
sc_end = cigar_count_at(second_last_index);
}

/*!\brief Construct the field::alignment depending on the given information.
* \tparam align_type The alignment type.
* \tparam ref_seqs_type The type of reference sequences (might decay to ignore).
* \param[in,out] align The alignment (pair of aligned sequences) to fill.
* \param[in] cigar_vector The cigar information to convert to an alignment.
* \param[in] rid The index of the reference sequence in header.ref_ids().
* \param[in] ref_seqs The reference sequence information.
* \param[in] ref_start The start position of the alignment in the reference sequence.
* \param[in] ref_length The length of the aligned reference sequence.
*/
template <typename align_type, typename ref_seqs_type>
inline void format_sam_base::construct_alignment(align_type & align,
std::vector<cigar> & cigar_vector,
[[maybe_unused]] int32_t rid,
[[maybe_unused]] ref_seqs_type & ref_seqs,
[[maybe_unused]] int32_t ref_start,
size_t ref_length)
{
if (rid > -1 && ref_start > -1 && // read is mapped
!cigar_vector.empty() && // alignment field was not empty
!std::ranges::empty(get<1>(align))) // seq field was not empty
{
if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
{
assert(static_cast<size_t>(ref_start + ref_length) <= std::ranges::size(ref_seqs[rid]));
// copy over unaligned reference sequence part
assign_unaligned(get<0>(align), ref_seqs[rid] | views::slice(ref_start, ref_start + ref_length));
}
else
{
using unaligned_t = std::remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
auto dummy_seq = views::repeat_n(std::ranges::range_value_t<unaligned_t>{}, ref_length)
| std::views::transform(detail::access_restrictor_fn{});
static_assert(std::same_as<unaligned_t, decltype(dummy_seq)>,
"No reference information was given so the type of the first alignment tuple position"
"must have an unaligned sequence type of a dummy sequence ("
"views::repeat_n(dna5{}, size_t{}) | "
"std::views::transform(detail::access_restrictor_fn{}))");

assign_unaligned(get<0>(align), dummy_seq); // assign dummy sequence
}

// insert gaps according to the cigar information
detail::alignment_from_cigar(align, cigar_vector);
}
else // not enough information for an alignment, assign an empty view/dummy_sequence
{
if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>) // reference info given
{
assert(std::ranges::size(ref_seqs) > 0); // we assume that the given ref info is not empty
assign_unaligned(get<0>(align), ref_seqs[0] | views::slice(0, 0));
}
else
{
using unaligned_t = std::remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
assign_unaligned(get<0>(align),
views::repeat_n(std::ranges::range_value_t<unaligned_t>{}, 0)
| std::views::transform(detail::access_restrictor_fn{}));
}
}
}

/*!\brief Reads std::byte fields using std::from_chars.
* \tparam stream_view_t The type of the stream as a view.
*
Expand Down
141 changes: 16 additions & 125 deletions include/seqan3/io/sam_file/format_bam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ class format_bam : private detail::format_sam_base
typename ref_seq_type,
typename ref_id_type,
typename ref_offset_type,
typename align_type,
typename cigar_type,
typename flag_type,
typename mapq_type,
Expand All @@ -98,7 +97,6 @@ class format_bam : private detail::format_sam_base
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
align_type & align,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
Expand All @@ -113,7 +111,6 @@ class format_bam : private detail::format_sam_base
typename id_type,
typename ref_seq_type,
typename ref_id_type,
typename align_type,
typename cigar_type,
typename qual_type,
typename mate_type,
Expand All @@ -128,7 +125,6 @@ class format_bam : private detail::format_sam_base
[[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
[[maybe_unused]] ref_id_type && ref_id,
[[maybe_unused]] std::optional<int32_t> ref_offset,
[[maybe_unused]] align_type && align,
[[maybe_unused]] cigar_type && cigar_vector,
[[maybe_unused]] sam_flag const flag,
[[maybe_unused]] uint8_t const mapq,
Expand Down Expand Up @@ -247,7 +243,6 @@ template <typename stream_type, // constraints checked by file
typename ref_seq_type,
typename ref_id_type,
typename ref_offset_type,
typename align_type,
typename cigar_type,
typename flag_type,
typename mapq_type,
Expand All @@ -269,7 +264,6 @@ format_bam::read_alignment_record(stream_type & stream,
ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
ref_id_type & ref_id,
ref_offset_type & ref_offset,
align_type & align,
cigar_type & cigar_vector,
flag_type & flag,
mapq_type & mapq,
Expand All @@ -290,11 +284,8 @@ format_bam::read_alignment_record(stream_type & stream,

auto stream_view = seqan3::detail::istreambuf(stream);

// these variables need to be stored to compute the ALIGNMENT
[[maybe_unused]] int32_t offset_tmp{};
[[maybe_unused]] int32_t soft_clipping_end{};
[[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
[[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
[[maybe_unused]] int32_t offset_tmp{}; // needed in case the cigar string was stored in the tag dictionary
[[maybe_unused]] int32_t ref_length{0}; // needed in case the cigar string was stored in the tag dictionary

// Header
// -------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -426,11 +417,12 @@ format_bam::read_alignment_record(stream_type & stream,

// read cigar string
// -------------------------------------------------------------------------------------------------------------
if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
if constexpr (!detail::decays_to_ignore_v<cigar_type>)
{
std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
// the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
int32_t seq_length{0};
std::tie(cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
int32_t soft_clipping_end{};
transfer_soft_clipping_to(cigar_vector, offset_tmp, soft_clipping_end);
}
else
{
Expand Down Expand Up @@ -460,56 +452,7 @@ format_bam::read_alignment_record(stream_type & stream,
++std::ranges::begin(stream_view);
};

if constexpr (!detail::decays_to_ignore_v<align_type>)
{
static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
"If you want to read ALIGNMENT but not SEQ, the alignment"
" object must store a sequence container at the second (query) position.");

if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
{
assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;

get<1>(align).reserve(seq_length);

auto tmp_iter = std::ranges::begin(seq_stream);
std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning

if (offset_tmp & 1)
{
get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
++tmp_iter;
--seq_length;
}

for (size_t i = (seq_length / 2); i > 0; --i)
{
get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
++tmp_iter;
}

if (seq_length & 1)
{
get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
++tmp_iter;
--soft_clipping_end;
}

std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
}
else
{
skip_sequence_bytes();
get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
}
}
else
{
skip_sequence_bytes();
}
skip_sequence_bytes();
}
else
{
Expand All @@ -529,14 +472,6 @@ format_bam::read_alignment_record(stream_type & stream,
seq.push_back(from_dna16[to_rank(d)]);
++std::ranges::begin(stream_view);
}

if constexpr (!detail::decays_to_ignore_v<align_type>)
{
assign_unaligned(get<1>(align),
seq
| views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
std::ranges::distance(seq) - soft_clipping_end));
}
}
}

Expand Down Expand Up @@ -570,7 +505,7 @@ format_bam::read_alignment_record(stream_type & stream,

// DONE READING - wrap up
// -------------------------------------------------------------------------------------------------------------
if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
if constexpr (!detail::decays_to_ignore_v<cigar_type>)
{
// Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
// alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
Expand Down Expand Up @@ -603,29 +538,15 @@ format_bam::read_alignment_record(stream_type & stream,
"record.")};

auto cigar_view = std::views::all(std::get<std::string>(it->second));
std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
offset_tmp = soft_clipping_end = 0;
transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
int32_t seq_length{0};
std::tie(cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
offset_tmp = 0;
int32_t soft_clipping_end{};
transfer_soft_clipping_to(cigar_vector, offset_tmp, soft_clipping_end);
tag_dict.erase(it); // remove redundant information

if constexpr (!detail::decays_to_ignore_v<align_type>)
{
assign_unaligned(
get<1>(align),
seq
| views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
std::ranges::distance(seq) - soft_clipping_end));
}
}
}
}

// Alignment object construction
if constexpr (!detail::decays_to_ignore_v<align_type>)
construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM

if constexpr (!detail::decays_to_ignore_v<cigar_type>)
std::swap(cigar_vector, tmp_cigar_vector);
}

//!\copydoc sam_file_output_format::write_alignment_record
Expand All @@ -635,7 +556,6 @@ template <typename stream_type,
typename id_type,
typename ref_seq_type,
typename ref_id_type,
typename align_type,
typename cigar_type,
typename qual_type,
typename mate_type,
Expand All @@ -650,7 +570,6 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
[[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
[[maybe_unused]] ref_id_type && ref_id,
[[maybe_unused]] std::optional<int32_t> ref_offset,
[[maybe_unused]] align_type && align,
[[maybe_unused]] cigar_type && cigar_vector,
[[maybe_unused]] sam_flag const flag,
[[maybe_unused]] uint8_t const mapq,
Expand Down Expand Up @@ -682,16 +601,6 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
"over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
}

static_assert(tuple_like<std::remove_cvref_t<align_type>>,
"The align object must be a std::pair of two ranges whose "
"value_type is comparable to seqan3::gap");

static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2
&& std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>>
&& std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
"The align object must be a std::pair of two ranges whose "
"value_type is comparable to seqan3::gap");

static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
"The qual object must be a std::ranges::forward_range "
"over letters that model seqan3::alphabet.");
Expand Down Expand Up @@ -771,38 +680,20 @@ inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & st
// ---------------------------------------------------------------------
int32_t ref_length{};

// if alignment is non-empty, replace cigar_vector.
// else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
// Compute the ref_length from given cigar_vector which is needed to fill field `bin`.
if (!std::ranges::empty(cigar_vector))
{
int32_t dummy_seq_length{};
for (auto & [count, operation] : cigar_vector)
detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
}
else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
{
ref_length = std::ranges::distance(get<1>(align));

// compute possible distance from alignment end to sequence end
// which indicates soft clipping at the end.
// This should be replaced by a free count_gaps function for
// aligned sequences which is more efficient if possible.
int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};

for (auto chr : get<1>(align))
if (chr == gap{})
++off_end;

off_end -= ref_length;
cigar_vector = detail::get_cigar_vector(align, offset, off_end);
}

if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
{
tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
cigar_vector.resize(2);
cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
cigar_vector[1] = cigar{static_cast<uint32_t>(ref_length), 'N'_cigar_operation};
}

std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
Expand Down
Loading

0 comments on commit 02023da

Please sign in to comment.