diff --git a/Makefile.am b/Makefile.am index ac177889..d7df7a43 100644 --- a/Makefile.am +++ b/Makefile.am @@ -114,6 +114,7 @@ test_scripts/test_roi.log: test_scripts/test_sym.log noinst_LIBRARIES = libdnmtools.a libdnmtools_a_SOURCES = \ + src/common/bam_record_utils.cpp \ src/common/BetaBin.cpp \ src/common/Distro.cpp \ src/common/EmissionDistribution.cpp \ @@ -129,6 +130,8 @@ libdnmtools_a_SOURCES = \ src/common/numerical_utils.cpp libdnmtools_a_SOURCES += \ + src/common/bam_record.hpp \ + src/common/bam_record_utils.hpp \ src/common/BetaBin.hpp \ src/common/Distro.hpp \ src/common/EmissionDistribution.hpp \ diff --git a/src/common/bam_record.hpp b/src/common/bam_record.hpp new file mode 100644 index 00000000..cbe786f8 --- /dev/null +++ b/src/common/bam_record.hpp @@ -0,0 +1,98 @@ +/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith + * + * Authors: Masaru Nakajima and Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* ADS: need to control all the macros from HTSlib pollution. For + functions maybe: + + $ gcc -dM -E sam.h | grep "define [a-z]" | awk '{print $2}' |\ + grep "[(]" | awk -v FS="(" '{print "#undef",$1}' + + This gives about 65 symbols that need to be deleted. For the others + I don't know what to do because some of them have "#define _" which + means they should be system symbols. +*/ + +#ifndef BAM_RECORD_HPP +#define BAM_RECORD_HPP + +#include +#include + +#include // from HTSlib +#include // from HTSlib + +struct bam_rec { + bam_rec() : b{bam_init1()} {} + bam_rec(const bam_rec &other) : b{bam_copy1(bam_init1(), other.b)} {} + bam_rec &operator=(bam_rec rhs) { std::swap(b, rhs.b); return *this; } + ~bam_rec() { if (b != nullptr) bam_destroy1(b); } + bam1_t *b; +}; + +struct bam_infile { + bam_infile(const std::string &fn) : f{hts_open(fn.c_str(), "r")} {} + ~bam_infile() { if (f != nullptr) hts_close(f); } + operator bool() const { return f != nullptr; } + template bool read(T &h, bam_rec &b) { + const int x = sam_read1(f, h.h, b.b); // -1 on EOF; args non-const + if (x < -1) throw std::runtime_error("failed reading bam record"); + return x >= 0; + } + bool is_mapped_reads_file() const { + const htsFormat *fmt = hts_get_format(f); + return fmt->category == sequence_data && + (fmt->format == bam || fmt->format == sam); + } + samFile *f{}; +}; + +struct bam_header { + bam_header() = default; + bam_header(const bam_header &rhs) : h{bam_hdr_dup(rhs.h)} {} + bam_header(bam_infile &in) : h{sam_hdr_read(in.f)} {} + ~bam_header() { if (h != nullptr) bam_hdr_destroy(h); } + operator bool() const { return h != nullptr; } + bool add_pg_line(const std::string cmd, const std::string id, + const std::string vn) { + return sam_hdr_add_line(h, "PG", "ID", id.c_str(), "VN", + vn.c_str(), "CL", cmd.c_str(), nullptr) == 0; + } + std::string tostring() const { return sam_hdr_str(h); } + sam_hdr_t *h{}; +}; + +struct bam_outfile { + bam_outfile(const std::string &fn, const bool fmt = false) : + f{hts_open(fn.c_str(), fmt ? "bw" : "w")} {} + ~bam_outfile() { if (f != nullptr) hts_close(f); } + operator bool() const { return f != nullptr; } + bool write(const bam_header &h, const bam_rec &b) { + return sam_write1(f, h.h, b.b) >= 0; + } + bool write(const bam_header &h) { return sam_hdr_write(f, h.h) == 0; } + htsFile *f{}; +}; + +struct bam_tpool{ + bam_tpool(const size_t n_threads) : tpool{hts_tpool_init(n_threads), 0} {} + ~bam_tpool() { hts_tpool_destroy(tpool.pool); } + template void set_io(const T &bam_file) { + const int ret = hts_set_thread_pool(bam_file.f, &tpool); + if (ret < 0) throw std::runtime_error("failed to set thread pool"); + } + htsThreadPool tpool{}; +}; + +#endif diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp new file mode 100644 index 00000000..adbd089d --- /dev/null +++ b/src/common/bam_record_utils.cpp @@ -0,0 +1,885 @@ +/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith + * + * Authors: Masaru Nakajima and Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "bam_record_utils.hpp" +#include "dnmt_error.hpp" + +#include +#include +#include + +#include + +#include "smithlab_utils.hpp" + +using std::string; +using std::vector; + +/// functions in place of undefd macro +static inline bool +bam_is_rev(const bam1_t *b) { return (b->core.flag & BAM_FREVERSE) != 0; } + +static inline char * +bam_get_qname(const bam1_t *b) { return reinterpret_cast(b->data); } + +static inline uint32_t * +bam_get_cigar(const bam1_t *b) { + return reinterpret_cast(b->data + b->core.l_qname); +} + +static inline uint8_t * +bam_get_seq(const bam1_t *b) { + // start of data + bytes for query/read name+ bytes for cigar + return b->data + b->core.l_qname + (b->core.n_cigar << 2); +} + +static inline uint8_t * +bam_get_qual(const bam1_t *b) { + return b->data + // start of data + b->core.l_qname + // bytes for query name + (b->core.n_cigar << 2) + // bytes for cigar + ((b->core.l_qseq + 1) >> 1); // bytes for packed query/read +} + +static inline uint8_t * +bam_get_aux(const bam1_t *b) { + return b->data + + b->core.l_qname + + (b->core.n_cigar << 2) + + ((b->core.l_qseq + 1) >> 1) + + b->core.l_qseq; +} + +static inline int +bam_get_l_aux(const bam1_t *b) { + return b->l_data - + (b->core.l_qname + + (b->core.n_cigar << 2) + + ((b->core.l_qseq + 1) >> 1) + + b->core.l_qseq); +} + +static inline bool +bam_same_orientation(const bam1_t *a, const bam1_t *b) { + return ((a->core.flag ^ b->core.flag) & BAM_FREVERSE) != 0; +} + +static void +roundup_to_power_of_2(uint32_t &x) { + bool k_high_bit_set = (x >> (sizeof(uint32_t) * 8 - 1)) & 1; + if (x > 0) { + uint8_t size = sizeof(uint32_t); + --x; + x |= x >> (size / 4); + x |= x >> (size / 2); + x |= x >> (size); + x |= x >> (size * 2); + x |= x >> (size * 4); + x += !k_high_bit_set; + } + else x = 0; +} + +static int +sam_realloc_bam_data(bam1_t *b, size_t desired) { + uint32_t new_m_data = desired; + roundup_to_power_of_2(new_m_data); + if (new_m_data < desired) { + errno = ENOMEM; // (from sam.c) Not strictly true but we can't + // store the size + return -1; + } + uint8_t *new_data = nullptr; + if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { + new_data = static_cast(realloc(b->data, new_m_data)); + } + else { + new_data = static_cast(malloc(new_m_data)); + if (new_data != nullptr) { + if (b->l_data > 0) + std::copy_n(new_data, (static_cast(b->l_data) < b->m_data) ? + b->l_data : b->m_data, b->data); + bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); + } + } + if (!new_data) return -1; + b->data = new_data; + b->m_data = new_m_data; + return 0; +} + +// static int +// sam_realloc_bam_data(bam1_t *b, size_t desired) { +// /* returns flag: either 0 for success or -1 for error (unable to +// allocate desired memory) */ +// uint32_t new_m_data = desired; +// roundup_to_power_of_2(new_m_data); +// if (new_m_data < desired) return -1; +// uint8_t *new_data = (uint8_t *)realloc(b->data, new_m_data); +// if (!new_data) return -1; +// // ADS: what would be the state of members below if -1 was returned? +// b->data = new_data; +// b->m_data = new_m_data; +// return 0; +// } + +// static inline void +// bam_copy_core(const bam1_t *a, bam1_t *b) { +// // ADS: prepared for a possibly more efficient block copy to assign +// // all variables at once, like this: b->core = a->core; +// b->core.pos = a->core.pos; +// b->core.tid = a->core.tid; +// b->core.bin = a->core.bin; +// b->core.qual = a->core.qual; +// b->core.l_extranul = a->core.l_extranul; +// b->core.flag = a->core.flag; +// b->core.l_qname = a->core.l_qname; +// b->core.n_cigar = a->core.n_cigar; +// b->core.l_qseq = a->core.l_qseq; +// b->core.mtid = a->core.mtid; +// b->core.mpos = a->core.mpos; +// b->core.isize = a->core.isize; +// } + +static inline void +bam_set1_core(bam1_core_t &core, + const size_t l_qname, const uint16_t flag, const int32_t tid, + const hts_pos_t pos, const uint8_t mapq, const size_t n_cigar, + const int32_t mtid, const hts_pos_t mpos, const hts_pos_t isize, + const size_t l_seq, const size_t qname_nuls) { + /* ADS: These are used in `hts_reg2bin` from `htslib/hts.h` and + likely mean "region to bin" for indexing */ + /* MN: hts_reg2bin categorizes the size of the reference region. + Here, we use the numbers used in htslib/cram/cram_samtools.h */ + static const int min_shift = 14; + static const int n_lvls = 5; + + core.pos = pos; + core.tid = tid; + core.bin = hts_reg2bin(pos, pos + isize, min_shift, n_lvls); + core.qual = mapq; + core.l_extranul = qname_nuls - 1; + core.flag = flag; + core.l_qname = l_qname + qname_nuls; + core.n_cigar = n_cigar; + core.l_qseq = l_seq; + core.mtid = mtid; + core.mpos = mpos; + core.isize = isize; +} + +static inline int +bam_set1_wrapper(bam1_t *bam, + const size_t l_qname, const char *qname, + const uint16_t flag, const int32_t tid, + const hts_pos_t pos, const uint8_t mapq, + const size_t n_cigar, const uint32_t *cigar, + const int32_t mtid, const hts_pos_t mpos, + const hts_pos_t isize, const size_t l_seq, + const size_t l_aux) { + /* This is based on how assignment is done in the `bam_set1` + function defined in `sam.c` from htslib */ + + /* This modification assigns variables of bam1_t struct but not the + * query/read sequence. + * + * many checks in bam_set1 have been removed because they are + * checked in code that calls this function, mostly because the + * values come from valid `bam1_t` instances, so have already been + * validated. + * + * Assumptions: + * cigar: correct format and matching length + * rlen = isize + * qlen = l_seq + * l_qname <= 254 + * HTS_POS_MAX - rlen > pos + * Where HTS_POS_MAX = ((((int64_t)INT_MAX)<<32)|INT_MAX) is the highest + * supported position. + * + * Number of bytes needed for the data is smaller than INT32_MAX + * + * qual = NULL, because we do not keep the quality scores through + * formatting the reads. + */ + + // `qname_nuls` below is the number of '\0' to use to pad the qname + // so that the cigar has 4-byte alignment. + const size_t qname_nuls = 4 - l_qname % 4; + bam_set1_core(bam->core, l_qname, flag, tid, pos, mapq, n_cigar, + mtid, mpos, isize, l_seq, qname_nuls); + + const size_t data_len = + (l_qname + qname_nuls + n_cigar*sizeof(uint32_t) + (l_seq + 1) / 2 + l_seq); + + bam->l_data = data_len; + if (data_len + l_aux > bam->m_data) { + const int ret = sam_realloc_bam_data(bam, data_len + l_aux); + if (ret < 0) + throw dnmt_error(ret, "Failed to allocate memory for BAM record"); + } + auto data_iter = bam->data; + + std::copy_n(qname, l_qname, data_iter); + std::fill_n(data_iter + l_qname, qname_nuls, '\0'); + data_iter += l_qname + qname_nuls; + + // ADS: reinterpret here because we know the cigar is originally an + // array of uint32_t and has been aligned for efficiency + std::copy_n(cigar, n_cigar, reinterpret_cast(data_iter)); + data_iter += n_cigar*sizeof(uint32_t); + + // skipping sequece assignment + data_iter += (l_seq + 1) / 2; + + std::fill(data_iter, data_iter + l_seq, '\xff'); + + return static_cast(data_len); +} + +static inline bool +eats_ref(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; } + +static inline bool +eats_query(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; } + +static inline size_t +bam_get_n_cigar(const bam1_t *b) { return b->core.n_cigar; } + +static inline uint32_t +to_insertion(const uint32_t x) { + return (x & ~BAM_CIGAR_MASK) | BAM_CINS; +} + +static inline void +fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) { + if (n_cigar < 3) return; + // find first non-softclip + auto c_beg = cigar; + auto c_end = cigar + n_cigar; + + while (!eats_ref(*c_beg) && ++c_beg != c_end); + if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); + + while (!eats_ref(*(c_end-1)) && --c_end != c_beg); + if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); + + for (auto c_itr = c_beg; c_itr != c_end; ++c_itr) + if (bam_cigar_op(*c_itr) == BAM_CSOFT_CLIP) + *c_itr = to_insertion(*c_itr); +} + +static inline uint32_t +to_softclip(const uint32_t x) { + return (x & ~BAM_CIGAR_MASK) | BAM_CSOFT_CLIP; +} + +static inline void +fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { + if (n_cigar < 2) return; + + auto c_itr = cigar; + const auto c_end = c_itr + n_cigar; + + for (; !eats_ref(*c_itr) && c_itr != c_end; ++c_itr) + *c_itr = to_softclip(*c_itr); + + if (c_itr == c_end) throw dnmt_error("cigar eats no ref"); + + c_itr = cigar + n_cigar - 1; + for (; !eats_ref(*c_itr) && c_itr != cigar; --c_itr) + *c_itr = to_softclip(*c_itr); +} + +static inline size_t +merge_cigar_ops(const size_t n_cigar, uint32_t *cigar) { + if (n_cigar < 2) return n_cigar; + auto c_itr1 = cigar; + auto c_end = c_itr1 + n_cigar; + auto c_itr2 = c_itr1 + 1; + auto op1 = bam_cigar_op(*c_itr1); + while (c_itr2 != c_end) { + auto op2 = bam_cigar_op(*c_itr2); + if (op1 == op2) { + *c_itr1 = bam_cigar_gen(bam_cigar_oplen(*c_itr1) + + bam_cigar_oplen(*c_itr2), op1); + } + else { + *(++c_itr1) = *c_itr2; + op1 = op2; + } + ++c_itr2; + } + // another increment to move past final "active" element for c_itr1 + ++c_itr1; + return std::distance(cigar, c_itr1); +} + +static inline size_t +correct_cigar(bam1_t *b) { + /* This function will change external insertions into soft clip + operations. Not sure why those would be present. It will also + change internal soft-clip operations into insertions. This could + be needed if soft-clipped ends of reads were moved to the middle + of a merged fragment. Finally, it will collapse adjacent + identical operations. None of this impacts the seq/qual/aux which + get moved as a block */ + + uint32_t *cigar = bam_get_cigar(b); + size_t n_cigar = b->core.n_cigar; + fix_external_insertion(n_cigar, cigar); + fix_internal_softclip(n_cigar, cigar); + + // merge identical adjacent cigar ops and get new number of ops + n_cigar = merge_cigar_ops(n_cigar, cigar); + // difference in bytes to shift the internal data + const size_t delta = (b->core.n_cigar - n_cigar) * sizeof(uint32_t); + if (delta > 0) { // if there is a difference; do the shift + const auto data_end = b->data + b->l_data; // bam_get_aux(b) + bam_get_l_aux(b); + std::copy(bam_get_seq(b), data_end, bam_get_seq(b) - delta); + b->core.n_cigar = n_cigar; // and update number of cigar ops + } + return delta; +} + +size_t correct_cigar(bam_rec &b) { + return correct_cigar(b.b); +} + +static inline hts_pos_t +get_rlen(const bam1_t *b) { // less tedious + return bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); +} + +static inline size_t +get_l_qseq(const bam1_t *b) { return b->core.l_qseq; } + +static inline void +complement_seq(char *first, char *last) { + for (; first != last; ++first) { + assert(valid_base(*first)); + *first = complement(*first); + } +} + +static inline void +reverse(char *a, char *b) { + char *p1, *p2; + for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) { + *p1 ^= *p2; + *p2 ^= *p1; + *p1 ^= *p2; + assert(valid_base(*p1) && valid_base(*p2)); + } +} + +// return value is the number of cigar ops that are fully consumed in +// order to read n_ref, while "partial_oplen" is the number of bases +// that could be taken from the next operation, which might be merged +// with the other read. +static inline uint32_t +get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops, + const uint32_t n_ref_full, uint32_t *partial_oplen) { + // assume: n_ops <= size(cig_in) <= size(cig_out) + size_t rlen = 0; + uint32_t i = 0; + for (i = 0; i < in_ops; ++i) { + if (eats_ref(cig_in[i])) { + if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full) + break; + rlen += bam_cigar_oplen(cig_in[i]); + } + } + *partial_oplen = n_ref_full - rlen; + return i; +} + +/* This table converts 2 bases packed in a byte to their reverse + * complement. The input is therefore a unit8_t representing 2 bases. + * It is assumed that the input uint8_t value is of form "xx" or "x-", + * where 'x' a 4-bit number representing either A, C, G, T, or N and + * '-' is 0000. For example, the ouptut for "AG" is "CT". The format + * "x-" is often used at the end of an odd-length sequence. The + * output of "A-" is "-T", and the output of "C-" is "-G", and so + * forth. The user must handle this case separately. + */ +const uint8_t byte_revcom_table[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248, + 4, 132, 68, 0, 36, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 244, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 130, 66, 0, 34, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 242, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 129, 65, 0, 33, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 241, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 255 +}; + +static inline void +revcom_byte_then_reverse(unsigned char *a, unsigned char *b) { + unsigned char *p1, *p2; + for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) { + *p1 = byte_revcom_table[*p1]; + *p2 = byte_revcom_table[*p2]; + *p1 ^= *p2; + *p2 ^= *p1; + *p1 ^= *p2; + } + if (p1 == p2) *p1 = byte_revcom_table[*p1]; +} + +static inline void +revcomp_seq_by_byte(bam1_t *aln) { + const size_t l_qseq = get_l_qseq(aln); + auto seq = bam_get_seq(aln); + const size_t num_bytes = ceil(l_qseq / 2.0); + auto seq_end = seq + num_bytes; + revcom_byte_then_reverse(seq, seq_end); + if (l_qseq % 2 == 1) { // for odd-length sequences + for (size_t i = 0; i < num_bytes - 1; i++) { + // swap 4-bit chunks within consecutive bytes like this: + // (----aaaa bbbbcccc dddd....) => (aaaabbbb ccccdddd ....) + seq[i] = (seq[i] << 4) | (seq[i + 1] >> 4); + } + seq[num_bytes - 1] <<= 4; + } +} + +// places seq of b at the end of seq of c +// assumes 0 < c_seq_len - b_seq_len <= a_seq_len +// also assumes that c_seq_len has been figured out +// Also assumes the number of bytes allocated to sequence potion of c->data +// has been set to ceil((a_used_len + b_seq_len) / 2.0) where +// a_used_len = c_seq_len - b_seq_len +static inline void +merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) { + // ADS: (todo) need some functions for int_ceil and is_odd + const size_t b_seq_len = get_l_qseq(b); + const size_t c_seq_len = get_l_qseq(c); + const size_t a_used_len = c_seq_len - b_seq_len; + + const bool is_a_odd = a_used_len % 2 == 1; + const bool is_b_odd = b_seq_len % 2 == 1; + const bool is_c_odd = c_seq_len % 2 == 1; + + const size_t a_num_bytes = ceil(a_used_len / 2.0); + const size_t b_num_bytes = ceil(b_seq_len / 2.0); + + const size_t b_offset = is_a_odd && is_b_odd; + + const auto a_seq = bam_get_seq(a); + const auto b_seq = bam_get_seq(b); + auto c_seq = bam_get_seq(c); + + std::copy_n(a_seq, a_num_bytes, c_seq); + if (is_a_odd) { + // c_seq looks like either [ aa aa aa aa ] + // or [ aa aa aa a- ] + c_seq[a_num_bytes - 1] &= 0xf0; + c_seq[a_num_bytes - 1] |= is_b_odd ? + byte_revcom_table[b_seq[b_num_bytes - 1]] : + byte_revcom_table[b_seq[b_num_bytes - 1]] >> 4; + } + if (is_c_odd) { + // c_seq looks like either [ aa aa aa aa ] + // or [ aa aa aa ab ] + for (size_t i = 0; i < b_num_bytes - 1; i++) { + c_seq[a_num_bytes + i] = + (byte_revcom_table[b_seq[b_num_bytes - i - 1]] << 4) | + (byte_revcom_table[b_seq[b_num_bytes - i - 2]] >> 4); + } + c_seq[a_num_bytes + b_num_bytes - 1] = byte_revcom_table[b_seq[0]] << 4; + // Here, c_seq is either [ aa aa aa aa bb bb bb b- ] (a even; b odd) + // or [ aa aa aa ab bb bb bb b- ] (a odd; b odd) + } + else { + for (size_t i = 0; i < b_num_bytes - b_offset; i++) { + c_seq[a_num_bytes + i] = + byte_revcom_table[b_seq[b_num_bytes - i - 1 - b_offset]]; + } + // Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even) + // or [ aa aa aa ab bb bb bb ] (a odd and b odd) + } +} + +static inline void +flip_conversion(bam1_t *aln) { + if (aln->core.flag & BAM_FREVERSE) + aln->core.flag = aln->core.flag & (~BAM_FREVERSE); + else + aln->core.flag = aln->core.flag | BAM_FREVERSE; + + revcomp_seq_by_byte(aln); + + // ADS: don't like *(cv + 1) below, but no HTSlib function for it? + uint8_t *cv = bam_aux_get(aln, "CV"); + if (!cv) throw dnmt_error("bam_aux_get failed for CV"); + *(cv + 1) = 'T'; +} + +void +flip_conversion(bam_rec &aln) { flip_conversion(aln.b); } + +static inline bool +are_mates(const bam1_t *one, const bam1_t *two) { + return one->core.mtid == two->core.tid && + one->core.mpos == two->core.pos && + (one->core.flag & BAM_FREVERSE) != (one->core.flag & BAM_FREVERSE); + // below is a consistency check and should not be necessary + /* && + two->core.mtid == one->core.tid && + two->core.mpos == one->core.pos; */ +} + +static inline int +truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { + + const uint32_t *a_cig = bam_get_cigar(a); + const uint32_t a_ops = a->core.n_cigar; + + uint32_t part_op = 0; + const uint32_t c_cur = + get_full_and_partial_ops(a_cig, a_ops, overlap, &part_op); + + // ADS: hack here because the get_full_and_partial_ops doesn't do + // exactly what is needed for this. + const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0); + + const uint32_t c_ops = c_cur + use_partial; + vector c_cig(c_ops, 0); + + // ADS: replace this with a std::copy + auto c_cig_itr = std::copy(a_cig, a_cig + c_cur, begin(c_cig)); + // ADS: warning, if (use_partial==false), then the amount of part_op + // used below would make no sense. + if (use_partial) + *c_cig_itr = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur])); + /* after this point the cigar is set and should decide everything */ + + const uint32_t c_seq_len = bam_cigar2qlen(c_ops, c_cig.data()); + const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data()); + + // flag only needs to worry about strand and single-end stuff + const uint16_t flag = + a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); + + const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); + int ret = bam_set1_wrapper(c, + a_qname_len, + bam_get_qname(a), + flag, // flags (SR and revcomp info) + a->core.tid, + a->core.pos, + a->core.qual, + c_ops, // merged cigar ops + c_cig.data(), // merged cigar + -1, // (no mate) + -1, // (no mate) + isize, // rlen from new cigar + c_seq_len, // truncated seq length + 8); // enough for the 2 tags? + if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper"); + + const size_t n_bytes_to_copy = (c_seq_len + 1)/2; // compression + std::copy_n(bam_get_seq(a), n_bytes_to_copy, bam_get_seq(c)); + + /* add the tags */ + const int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better here! + // "_udpate" for "int" because it determines the right size + ret = bam_aux_update_int(c, "NM", nm); + if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int"); + + const uint8_t conversion = bam_aux2A(bam_aux_get(a, "CV")); + // "_append" for "char" because there is no corresponding update + ret = bam_aux_append(c, "CV", 'A', 1, &conversion); + if (ret < 0) throw dnmt_error(ret, "bam_aux_append"); + + return ret; +} + +int +truncate_overlap(const bam_rec &a, const uint32_t overlap, bam_rec &c) { + return truncate_overlap(a.b, overlap, c.b); +} + +int +merge_overlap(const bam1_t *a, const bam1_t *b, + const uint32_t head, bam1_t *c) { + assert(head > 0); + + const uint32_t *a_cig = bam_get_cigar(a); + const uint32_t a_ops = a->core.n_cigar; + + const uint32_t *b_cig = bam_get_cigar(b); + const uint32_t b_ops = b->core.n_cigar; + + uint32_t part_op = 0; + uint32_t c_cur = get_full_and_partial_ops(a_cig, a_ops, head, &part_op); + // ADS: hack here because the get_full_and_partial_ops doesn't do + // exactly what is needed for this. + const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0); + + // check if the middle op would be the same + const bool merge_mid = + (use_partial > 0 ? + bam_cigar_op(a_cig[c_cur]) == bam_cigar_op(b_cig[0]) : + bam_cigar_op(a_cig[c_cur - 1]) == bam_cigar_op(b_cig[0])); + + // c_ops: include the prefix of a_cig we need; then add for the + // partial op; subtract for the identical op in the middle; finally + // add the rest of b_cig. + const uint32_t c_ops = c_cur + use_partial - merge_mid + b_ops; + vector c_cig(c_ops, 0); + std::copy(a_cig, a_cig + c_cur, begin(c_cig)); + + if (use_partial) { + c_cig[c_cur] = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur])); + c_cur++; // index of dest for copying b_cig; faciltates corner case + } + // Here we get the length of a's sequence part contribution to c's + // sequence before the possibility of merging the last entry with + // the first entry in b's cigar. This is done with the cigar, so + // everything depends on the "use_partial" + const size_t a_seq_len = bam_cigar2qlen(c_cur, c_cig.data()); + /* ADS: above the return type of bam_cigar2qlen is uint64_t, but + according to the source as of 05/2023 it cannot become + negative; no possible error code returned */ + + if (merge_mid) // update the middle op if it's the same + c_cig[c_cur - 1] = bam_cigar_gen(bam_cigar_oplen(c_cig[c_cur - 1]) + + bam_cigar_oplen(b_cig[0]), + bam_cigar_op(b_cig[0])); + // copy the cigar from b into c + std::copy(b_cig + merge_mid, b_cig + b_ops, begin(c_cig) + c_cur); + /* done with cigar here */ + + const uint32_t c_seq_len = a_seq_len + b->core.l_qseq; + + // get the template length + const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data()); + + // flag only needs to worry about strand and single-end stuff + const uint16_t flag = a->core.flag & (BAM_FREAD1 | + BAM_FREAD2 | + BAM_FREVERSE); + + const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); + int ret = bam_set1_wrapper(c, + a_qname_len, + bam_get_qname(a), + flag, // (no PE; revcomp info) + a->core.tid, + a->core.pos, + a->core.qual, // mapq from "a" (consider update) + c_ops, // merged cigar ops + c_cig.data(), // merged cigar + -1, // (no mate) + -1, // (no mate) + isize, // updated + c_seq_len, // merged sequence length + 8); // enough for 2 tags? + if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper in merge_overlap"); + // Merge the sequences by bytes + merge_by_byte(a, b, c); + + // add the tag for mismatches + const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) + + bam_aux2i(bam_aux_get(b, "NM"))); + ret = bam_aux_update_int(c, "NM", nm); + if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int in merge_overlap"); + + // add the tag for conversion + const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); + ret = bam_aux_append(c, "CV", 'A', 1, &cv); + if (ret < 0) throw dnmt_error(ret, "bam_aux_append in merge_overlap"); + + return ret; +} + +int +merge_overlap(const bam_rec &a, const bam_rec &b, + const uint32_t head, bam_rec &c) { + return merge_overlap(a.b, b.b, head, c.b); +} + +static inline int +merge_non_overlap(const bam1_t *a, const bam1_t *b, + const uint32_t spacer, bam1_t *c) { + /* make the cigar string */ + // collect info about the cigar strings + const uint32_t *a_cig = bam_get_cigar(a); + const uint32_t a_ops = a->core.n_cigar; + const uint32_t *b_cig = bam_get_cigar(b); + const uint32_t b_ops = b->core.n_cigar; + + // allocate the new cigar string + const uint32_t c_ops = a_ops + b_ops + 1; + vector c_cig(c_ops, 0); + + // concatenate the new cigar strings with a "skip" in the middle + auto c_cig_itr = std::copy(a_cig, a_cig + a_ops, begin(c_cig)); + *c_cig_itr++ = bam_cigar_gen(spacer, BAM_CREF_SKIP); + std::copy(b_cig, b_cig + b_ops, c_cig_itr); + /* done with cigars */ + + const size_t a_seq_len = get_l_qseq(a); + const size_t b_seq_len = get_l_qseq(b); + const size_t c_seq_len = a_seq_len + b_seq_len; + + // get the template length from the cigar + const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data()); + + // flag: only need to keep strand and single-end info + const uint16_t flag = a->core.flag & (BAM_FREAD1 | + BAM_FREAD2 | + BAM_FREVERSE); + + const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); + int ret = bam_set1_wrapper(c, + a_qname_len, + bam_get_qname(a), + flag, // flags (no PE; revcomp info) + a->core.tid, + a->core.pos, + a->core.qual, // mapq from a (consider update) + c_ops, // merged cigar ops + c_cig.data(), // merged cigar + -1, // (no mate) + -1, // (no mate) + isize, // TLEN (relative to reference; SAM docs) + c_seq_len, // merged sequence length + 8); // enough for 2 tags of 1 byte value? + if (ret < 0) throw dnmt_error(ret, "bam_set1 in merge_non_overlap"); + + merge_by_byte(a, b, c); + + /* add the tags */ + const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) + + bam_aux2i(bam_aux_get(b, "NM"))); + // "udpate" for "int" because it determines the right size + ret = bam_aux_update_int(c, "NM", nm); + if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_update_int"); + + const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); + // "append" for "char" because there is no corresponding update + ret = bam_aux_append(c, "CV", 'A', 1, &cv); + if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_append"); + + return ret; +} + +int +merge_non_overlap(const bam_rec &a, const bam_rec &b, + const uint32_t spacer, bam_rec &c) { + return merge_non_overlap(a.b, b.b, spacer, c.b); +} + +static inline int +keep_better_end(const bam1_t *a, const bam1_t *b, bam1_t *c) { + const auto a_rl = get_rlen(a); + const auto b_rl = get_rlen(b); + const auto c_rl = std::max(a_rl, b_rl); + c = bam_copy1(c, a_rl >= b_rl ? a : b); + c->core.mtid = -1; + c->core.mpos = -1; + c->core.isize = c_rl; + c->core.flag &= (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); + return 0; +} + +int +keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c) { + return keep_better_end(a.b, b.b, c.b); +} + +// ADS: will move to using this function once it is written +static inline void +standardize_format(const string &input_format, bam1_t *aln) { + int err_code = 0; + + if (input_format == "abismal" || input_format == "walt") return; + + if (input_format == "bsmap") { + // A/T rich; get the ZS tag value + auto zs_tag = bam_aux_get(aln, "ZS"); + if (!zs_tag) throw dnmt_error("bam_aux_get for ZS (invalid bsmap)"); + // ADS: test for errors on the line below + const uint8_t cv = string(bam_aux2Z(zs_tag))[1] == '-' ? 'A' : 'T'; + // get the "mismatches" tag + auto nm_tag = bam_aux_get(aln, "NM"); + if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bsmap)"); + const int64_t nm = bam_aux2i(nm_tag); + + aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize) + + /* add the tags we want */ + // "udpate" for "int" because it determines the right size; even + // though we just deleted all tags, it will add it back here. + err_code = bam_aux_update_int(aln, "NM", nm); + if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int"); + // "append" for "char" because there is no corresponding update + err_code = bam_aux_append(aln, "CV", 'A', 1, &cv); + if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append"); + + if (bam_is_rev(aln)) + revcomp_seq_by_byte(aln); // reverse complement if needed + } + if (input_format == "bismark") { + // ADS: Previously we modified the read names at the first + // underscore. Even if the names are still that way, it should no + // longer be needed since we compare names up to a learned suffix. + + // A/T rich; get the XR tag value + auto xr_tag = bam_aux_get(aln, "XR"); + if (!xr_tag) throw dnmt_error("bam_aux_get for XR (invalid bismark)"); + const uint8_t cv = string(bam_aux2Z(xr_tag)) == "GA" ? 'A' : 'T'; + // get the "mismatches" tag + auto nm_tag = bam_aux_get(aln, "NM"); + if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bismark)"); + const int64_t nm = bam_aux2i(nm_tag); + + aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize) + + /* add the tags we want */ + // "udpate" for "int" because it determines the right size; even + // though we just deleted all tags, it will add it back here. + err_code = bam_aux_update_int(aln, "NM", nm); + if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int"); + // "append" for "char" because there is no corresponding update + err_code = bam_aux_append(aln, "CV", 'A', 1, &cv); + if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append"); + + if (bam_is_rev(aln)) + revcomp_seq_by_byte(aln); // reverse complement if needed + } + + // Be sure this doesn't depend on mapper! Removes the "qual" part of + // the data in a bam1_t struct but does not change its uncompressed + // size. + const auto qs = bam_get_qual(aln); + std::fill(qs, qs + aln->core.l_qseq, '\xff'); // overwrites qseq +} + +void +standardize_format(const string &input_format, bam_rec &aln) { + standardize_format(input_format, aln.b); +} diff --git a/src/common/bam_record_utils.hpp b/src/common/bam_record_utils.hpp new file mode 100644 index 00000000..90291b99 --- /dev/null +++ b/src/common/bam_record_utils.hpp @@ -0,0 +1,184 @@ +/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith + * + * Authors: Masaru Nakajima and Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef BAM_RECORD_UTILS_HPP +#define BAM_RECORD_UTILS_HPP + +#include "bam_record.hpp" + +#include + +#ifdef bam_is_rev +#undef bam_is_rev +#endif + +inline bool +bam_is_rev(const bam_rec &b) { return (b.b->core.flag & BAM_FREVERSE) != 0; } + +#ifdef bam_is_mrev +#undef bam_is_mrev +#endif + +inline bool +bam_is_mrev(const bam_rec &b) { return (b.b->core.flag & BAM_FMREVERSE) != 0; } + +#ifdef bam_get_qname +#undef bam_get_qname +#endif + +inline char * +bam_get_qname(const bam_rec &b) { return reinterpret_cast(b.b->data); } + +#ifdef bam_get_cigar +#undef bam_get_cigar +#endif + +inline uint32_t * +bam_get_cigar(const bam_rec &b) { + // start of data + bytes for query/read name + return reinterpret_cast(b.b->data + b.b->core.l_qname); +} + +#ifdef bam_get_seq +#undef bam_get_seq +#endif + +inline uint8_t * +bam_get_seq(const bam_rec &b) { + // start of data + bytes for cigar + bytes for query/read name + return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2); +} + +#ifdef bam_get_qual +#undef bam_get_qual +#endif + +inline uint8_t * +bam_get_qual(const bam_rec &b) { + return b.b->data + // start of data + b.b->core.l_qname + // bytes for query name + (b.b->core.n_cigar << 2) + // bytes for cigar + ((b.b->core.l_qseq + 1) >> 1); // bytes for packed query/read +} + +#ifdef bam_get_aux +#undef bam_get_aux +#endif + +inline uint8_t * +bam_get_aux(const bam_rec &b) { + return b.b->data + + b.b->core.l_qname + + (b.b->core.n_cigar << 2) + + ((b.b->core.l_qseq + 1) >> 1) + + b.b->core.l_qseq; +} + +#ifdef bam_get_l_aux +#undef bam_get_l_aux +#endif + +inline int +bam_get_l_aux(const bam_rec &b) { + return b.b->l_data - + (b.b->core.l_qname + + (b.b->core.n_cigar << 2) + + ((b.b->core.l_qseq + 1) >> 1) + + b.b->core.l_qseq); +} + +inline bool +bam_same_orientation(const bam_rec &a, const bam_rec &b) { + return ((a.b->core.flag ^ b.b->core.flag) & BAM_FREVERSE) != 0; +} + +int truncate_overlap(const bam_rec &a, const uint32_t overlap, bam_rec &c); + +int merge_overlap(const bam_rec &a, const bam_rec &b, + const uint32_t head, bam_rec &c); + +int merge_non_overlap(const bam_rec &a, const bam_rec &b, + const uint32_t spacer, bam_rec &c); + +int keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c); + +size_t correct_cigar(bam_rec &b); + +void flip_conversion(bam_rec &aln); + +inline bool +is_a_rich(const bam_rec &b) { return bam_aux2A(bam_aux_get(b.b, "CV")) == 'A'; } + +void +standardize_format(const std::string &input_format, bam_rec &aln); + +inline bool +are_mates(const bam_rec &one, const bam_rec &two) { + return one.b->core.mtid == two.b->core.tid && + one.b->core.mpos == two.b->core.pos && + bam_same_orientation(one, two); + // below is a consistency check and should not be necessary + /* && + two->core.mtid == one->core.tid && + two->core.mpos == one->core.pos; */ +} + +inline size_t +get_l_qseq(const bam_rec &b) { return b.b->core.l_qseq; } + +inline size_t +get_n_targets(const bam_header &bh) { return bh.h->n_targets; } + +inline std::string +get_qname(const bam_rec &b) { return bam_get_qname(b); } + +inline int32_t +get_tid(const bam_rec &b) { return b.b->core.tid; } + +inline hts_pos_t +get_pos(const bam_rec &b) { return b.b->core.pos; } + +inline hts_pos_t +get_endpos(const bam_rec &b) { return bam_endpos(b.b); } + +inline bool +precedes_by_start(const bam_rec &a, const bam_rec &b) { + // assumes a.get_tid() <= b.get_tid() + return get_tid(a) == get_tid(b) && get_pos(a) < get_pos(b); +} + +inline bool +precedes_by_end_and_strand(const bam_rec &a, const bam_rec &b) { + const auto end_a = bam_endpos(a.b); + const auto end_b = bam_endpos(b.b); + return end_a < end_b || (end_a == end_b && bam_is_rev(a) < bam_is_rev(b)); +} + +inline bool +equivalent_chrom_and_start(const bam_rec &a, const bam_rec &b) { + return a.b->core.pos == b.b->core.pos && a.b->core.tid == b.b->core.tid; +} + +inline bool +equivalent_end_and_strand(const bam_rec &a, const bam_rec &b) { + return bam_endpos(a.b) == bam_endpos(b.b) && bam_is_rev(a) == bam_is_rev(b); +} + +template int +bam_aux_update_int(bam_rec &b, const char tag[2], T val) { + return bam_aux_update_int(b.b, tag, val); +} + +#endif diff --git a/src/common/dnmt_error.hpp b/src/common/dnmt_error.hpp index 8090ebb6..38a714c1 100644 --- a/src/common/dnmt_error.hpp +++ b/src/common/dnmt_error.hpp @@ -20,6 +20,7 @@ #include #include #include // for int64_t +#include struct dnmt_error: public std::exception { int64_t err; // error possibly from HTSlib diff --git a/src/utils/format-reads.cpp b/src/utils/format-reads.cpp index d82b6d1b..73d82871 100644 --- a/src/utils/format-reads.cpp +++ b/src/utils/format-reads.cpp @@ -40,10 +40,6 @@ #include -// from HTSlib -#include -#include - // from smithlab #include "OptionParser.hpp" #include "smithlab_utils.hpp" @@ -51,6 +47,7 @@ // from dnmtools #include "dnmt_error.hpp" +#include "bam_record_utils.hpp" using std::string; using std::vector; @@ -58,716 +55,19 @@ using std::cerr; using std::endl; using std::string; using std::vector; - - - - -static inline void -roundup_to_power_of_2(uint32_t &x) { - bool k_high_bit_set = (x >> (sizeof(uint32_t) * 8 - 1)) & 1; - if (x > 0) { - uint8_t size = sizeof(uint32_t); - --x; - x |= x >> (size / 4); - x |= x >> (size / 2); - x |= x >> (size); - x |= x >> (size * 2); - x |= x >> (size * 4); - x += !k_high_bit_set; - } - else x = 0; -} - -static int -sam_realloc_bam_data(bam1_t *b, size_t desired) { - /* returns flag: either 0 for success or -1 for error (unable to - allocate desired memory) */ - uint32_t new_m_data = desired; - roundup_to_power_of_2(new_m_data); - if (new_m_data < desired) return -1; - uint8_t *new_data = (uint8_t *)realloc(b->data, new_m_data); - if (!new_data) return -1; - // ADS: what would be the state of members below if -1 was returned? - b->data = new_data; - b->m_data = new_m_data; - return 0; -} - -static inline void -bam_copy_core(const bam1_t *a, bam1_t *b) { - /* ADS: prepared for a possibly more efficient block copy to assign - all variables at once */ - b->core.pos = a->core.pos; - b->core.tid = a->core.tid; - b->core.bin = a->core.bin; - b->core.qual = a->core.qual; - b->core.l_extranul = a->core.l_extranul; - b->core.flag = a->core.flag; - b->core.l_qname = a->core.l_qname; - b->core.n_cigar = a->core.n_cigar; - b->core.l_qseq = a->core.l_qseq; - b->core.mtid = a->core.mtid; - b->core.mpos = a->core.mpos; - b->core.isize = a->core.isize; -} - -static inline void -bam_set1_core(bam1_core_t &core, - const size_t l_qname, const uint16_t flag, const int32_t tid, - const hts_pos_t pos, const uint8_t mapq, const size_t n_cigar, - const int32_t mtid, const hts_pos_t mpos, const hts_pos_t isize, - const size_t l_seq, const size_t qname_nuls) { - /* ADS: These are used in `hts_reg2bin` from `htslib/hts.h` and - likely mean "region to bin" for indexing */ - /* MN: hts_reg2bin categorizes the size of the reference region. - Here, we use the numbers used in htslib/cram/cram_samtools.h */ - static const int min_shift = 14; - static const int n_lvls = 5; - - core.pos = pos; - core.tid = tid; - core.bin = hts_reg2bin(pos, pos + isize, min_shift, n_lvls); - core.qual = mapq; - core.l_extranul = qname_nuls - 1; - core.flag = flag; - core.l_qname = l_qname + qname_nuls; - core.n_cigar = n_cigar; - core.l_qseq = l_seq; - core.mtid = mtid; - core.mpos = mpos; - core.isize = isize; -} - -static int -bam_set1_wrapper(bam1_t *bam, - const size_t l_qname, const char *qname, - const uint16_t flag, const int32_t tid, - const hts_pos_t pos, const uint8_t mapq, - const size_t n_cigar, const uint32_t *cigar, - const int32_t mtid, const hts_pos_t mpos, - const hts_pos_t isize, const size_t l_seq, - const size_t l_aux) { - /* This is based on how assignment is done in the `bam_set1` - function defined in `sam.c` from htslib */ - - /* - * This modification assigns variables of bam1_t struct but not the sequence. - * - * Many checks have been removed because they are checked in code - * that calls this function, mostly because they already come from a - * valid `bam1_t` struct and so the values have been individually - * validated. - * - * Assumptions: - * cigar has been computed and is in the right format - * rlen = isize - * qlen = l_seq - * l_qname <= 254 - * HTS_POS_MAX - rlen > pos - * Where HTS_POS_MAX = ((((int64_t)INT_MAX)<<32)|INT_MAX) is the highest - * supported position. - * - * Number of bytes needed for the data is smaller than INT32_MAX - * - * qual = NULL, because we do not keep the quality scores through - * formatting the reads. - */ - - // `qname_nuls` below is the number of '\0' to use to pad the qname - // so that the cigar has 4-byte alignment. - const size_t qname_nuls = 4 - l_qname % 4; - bam_set1_core(bam->core, l_qname, flag, tid, pos, mapq, n_cigar, - mtid, mpos, isize, l_seq, qname_nuls); - - const size_t data_len = - (l_qname + qname_nuls + n_cigar*sizeof(uint32_t) + (l_seq + 1) / 2 + l_seq); - - bam->l_data = data_len; - if (data_len + l_aux > bam->m_data) { - const int ret = sam_realloc_bam_data(bam, data_len + l_aux); - if (ret < 0) { - throw dnmt_error(ret, "Failed to allocate memory for BAM record"); - } - } - auto data_iter = bam->data; - - std::copy_n(qname, l_qname, data_iter); - std::fill_n(data_iter + l_qname, qname_nuls, '\0'); - data_iter += l_qname + qname_nuls; - - // ADS: reinterpret here because we know the cigar is originally an - // array of uint32_t and has been aligned for efficiency - std::copy_n(cigar, n_cigar, reinterpret_cast(data_iter)); - data_iter += n_cigar * sizeof(uint32_t); - - // skipping sequece assignment - data_iter += (l_seq + 1) / 2; - - std::fill(data_iter, data_iter + l_seq, '\xff'); - - return static_cast(data_len); -} - -static inline bool -eats_ref(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; } - -static inline bool -eats_query(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; } - -static inline size_t -bam_get_n_cigar(const bam1_t *b) { return b->core.n_cigar; } - -static inline uint32_t -to_insertion(const uint32_t x) { - return (x & ~BAM_CIGAR_MASK) | BAM_CINS; -} - -static void -fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) { - if (n_cigar < 3) return; - // find first non-softclip - auto c_beg = cigar; - auto c_end = cigar + n_cigar; - - while (!eats_ref(*c_beg) && ++c_beg != c_end); - if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); - - while (!eats_ref(*(c_end-1)) && --c_end != c_beg); - if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); - - for (auto c_itr = c_beg; c_itr != c_end; ++c_itr) - if (bam_cigar_op(*c_itr) == BAM_CSOFT_CLIP) - *c_itr = to_insertion(*c_itr); -} - -static inline uint32_t -to_softclip(const uint32_t x) { - return (x & ~BAM_CIGAR_MASK) | BAM_CSOFT_CLIP; -} - -static void -fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { - if (n_cigar < 2) return; - - auto c_itr = cigar; - const auto c_end = c_itr + n_cigar; - - for (; !eats_ref(*c_itr) && c_itr != c_end; ++c_itr) - *c_itr = to_softclip(*c_itr); - - if (c_itr == c_end) throw dnmt_error("cigar eats no ref"); - - c_itr = cigar + n_cigar - 1; - for (; !eats_ref(*c_itr) && c_itr != cigar; --c_itr) - *c_itr = to_softclip(*c_itr); -} - -static size_t -merge_cigar_ops(const size_t n_cigar, uint32_t *cigar) { - if (n_cigar < 2) return n_cigar; - auto c_itr1 = cigar; - auto c_end = c_itr1 + n_cigar; - auto c_itr2 = c_itr1 + 1; - auto op1 = bam_cigar_op(*c_itr1); - while (c_itr2 != c_end) { - auto op2 = bam_cigar_op(*c_itr2); - if (op1 == op2) { - *c_itr1 = bam_cigar_gen(bam_cigar_oplen(*c_itr1) + - bam_cigar_oplen(*c_itr2), op1); - } - else { - *(++c_itr1) = *c_itr2; - op1 = op2; - } - ++c_itr2; - } - // another increment to move past final "active" element for c_itr1 - ++c_itr1; - return std::distance(cigar, c_itr1); -} - -static size_t -correct_cigar(bam1_t *b) { - /* This function will change external insertions into soft clip - operations. Not sure why those would be present. It will also - change internal soft-clip operations into insertions. This could - be needed if soft-clipped ends of reads were moved to the middle - of a merged fragment. Finally, it will collapse adjacent - identical operations. None of this impacts the seq/qual/aux which - get moved as a block */ - - uint32_t *cigar = bam_get_cigar(b); - size_t n_cigar = b->core.n_cigar; - fix_external_insertion(n_cigar, cigar); - fix_internal_softclip(n_cigar, cigar); - - // merge identical adjacent cigar ops and get new number of ops - n_cigar = merge_cigar_ops(n_cigar, cigar); - // difference in bytes to shift the internal data - const size_t delta = (b->core.n_cigar - n_cigar) * sizeof(uint32_t); - if (delta > 0) { // if there is a difference; do the shift - auto data_end = bam_get_aux(b) + bam_get_l_aux(b); - std::copy(bam_get_seq(b), data_end, bam_get_seq(b) - delta); - b->core.n_cigar = n_cigar; // and update number of cigar ops - } - return delta; -} - -static inline size_t -get_rlen(const bam1_t *b) { // less tedious - return bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); -} - -static inline size_t -get_qlen(const bam1_t *b) { // less tedious - return b->core.l_qseq; -} - -static inline void -complement_seq(char *first, char *last) { - for (; first != last; ++first) { - assert(valid_base(*first)); - *first = complement(*first); - } -} - -static inline void -reverse(char *a, char *b) { - char *p1, *p2; - for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) { - *p1 ^= *p2; - *p2 ^= *p1; - *p1 ^= *p2; - assert(valid_base(*p1) && valid_base(*p2)); - } -} - - -// return value is the number of cigar ops that are fully consumed in -// order to read n_ref, while "partial_oplen" is the number of bases -// that could be taken from the next operation, which might be merged -// with the other read. -static uint32_t -get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops, - const uint32_t n_ref_full, uint32_t *partial_oplen) { - // assume: n_ops <= size(cig_in) <= size(cig_out) - size_t rlen = 0; - uint32_t i = 0; - for (i = 0; i < in_ops; ++i) { - if (eats_ref(cig_in[i])) { - if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full) - break; - rlen += bam_cigar_oplen(cig_in[i]); - } - } - *partial_oplen = n_ref_full - rlen; - return i; -} - - -/* This table converts 2 bases packed in a byte to their reverse - * complement. The input is therefore a unit8_t representing 2 bases. - * It is assumed that the input uint8_t value is of form "xx" or "x-", - * where 'x' a 4-bit number representing either A, C, G, T, or N and - * '-' is 0000. For example, the ouptut for "AG" is "CT". The format - * "x-" is often used at the end of an odd-length sequence. The - * output of "A-" is "-T", and the output of "C-" is "-G", and so - * forth. The user must handle this case separately. - */ -const uint8_t byte_revcom_table[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248, - 4, 132, 68, 0, 36, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 244, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 2, 130, 66, 0, 34, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 242, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 129, 65, 0, 33, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 241, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 255 -}; - - -static inline void -revcom_byte_then_reverse(unsigned char *a, unsigned char *b) { - unsigned char *p1, *p2; - for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) { - *p1 = byte_revcom_table[*p1]; - *p2 = byte_revcom_table[*p2]; - *p1 ^= *p2; - *p2 ^= *p1; - *p1 ^= *p2; - } - if (p1 == p2) *p1 = byte_revcom_table[*p1]; -} - -static void -revcomp_seq_by_byte(bam1_t *aln) { - const size_t l_qseq = get_qlen(aln); - auto seq = bam_get_seq(aln); - const size_t num_bytes = ceil(l_qseq / 2.0); - auto seq_end = seq + num_bytes; - revcom_byte_then_reverse(seq, seq_end); - if (l_qseq % 2 == 1) { // for odd-length sequences - for (size_t i = 0; i < num_bytes - 1; i++) { - // swap 4-bit chunks within consecutive bytes like this: - // (----aaaa bbbbcccc dddd....) => (aaaabbbb ccccdddd ....) - seq[i] = (seq[i] << 4) | (seq[i + 1] >> 4); - } - seq[num_bytes - 1] <<= 4; - } -} - -// places seq of b at the end of seq of c -// assumes 0 < c_seq_len - b_seq_len <= a_seq_len -// also assumes that c_seq_len has been figured out -// Also assumes the number of bytes allocated to sequence potion of c->data -// has been set to ceil((a_used_len + b_seq_len) / 2.0) where -// a_used_len = c_seq_len - b_seq_len -static void -merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) { - // ADS: (todo) need some functions for int_ceil and is_odd - const size_t b_seq_len = get_qlen(b); - const size_t c_seq_len = get_qlen(c); - const size_t a_used_len = c_seq_len - b_seq_len; - - const bool is_a_odd = a_used_len % 2 == 1; - const bool is_b_odd = b_seq_len % 2 == 1; - const bool is_c_odd = c_seq_len % 2 == 1; - - const size_t a_num_bytes = ceil(a_used_len / 2.0); - const size_t b_num_bytes = ceil(b_seq_len / 2.0); - - const size_t b_offset = is_a_odd && is_b_odd; - - const auto a_seq = bam_get_seq(a); - const auto b_seq = bam_get_seq(b); - auto c_seq = bam_get_seq(c); - - std::copy_n(a_seq, a_num_bytes, c_seq); - if (is_a_odd) { - // c_seq looks like either [ aa aa aa aa ] - // or [ aa aa aa a- ] - c_seq[a_num_bytes - 1] &= 0xf0; - c_seq[a_num_bytes - 1] |= is_b_odd ? - byte_revcom_table[b_seq[b_num_bytes - 1]] : - byte_revcom_table[b_seq[b_num_bytes - 1]] >> 4; - } - if (is_c_odd) { - // c_seq looks like either [ aa aa aa aa ] - // or [ aa aa aa ab ] - for (size_t i = 0; i < b_num_bytes - 1; i++) { - c_seq[a_num_bytes + i] = - (byte_revcom_table[b_seq[b_num_bytes - i - 1]] << 4) | - (byte_revcom_table[b_seq[b_num_bytes - i - 2]] >> 4); - } - c_seq[a_num_bytes + b_num_bytes - 1] = byte_revcom_table[b_seq[0]] << 4; - // Here, c_seq is either [ aa aa aa aa bb bb bb b- ] (a even; b odd) - // or [ aa aa aa ab bb bb bb b- ] (a odd; b odd) - } - else { - for (size_t i = 0; i < b_num_bytes - b_offset; i++) { - c_seq[a_num_bytes + i] = - byte_revcom_table[b_seq[b_num_bytes - i - 1 - b_offset]]; - } - // Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even) - // or [ aa aa aa ab bb bb bb ] (a odd and b odd) - } -} - -static inline bool -is_a_rich(const bam1_t *b) { return bam_aux2A(bam_aux_get(b, "CV")) == 'A'; } - -static inline bool -format_is_bam_or_sam(htsFile *hts) { - const htsFormat *fmt = hts_get_format(hts); - return fmt->category == sequence_data && - (fmt->format == bam || fmt->format == sam); -} - -static void -flip_conversion(bam1_t *aln) { - if (aln->core.flag & BAM_FREVERSE) - aln->core.flag = aln->core.flag & (~BAM_FREVERSE); - else - aln->core.flag = aln->core.flag | BAM_FREVERSE; - - revcomp_seq_by_byte(aln); - - // ADS: don't like *(cv + 1) below, but no HTSlib function for it? - uint8_t *cv = bam_aux_get(aln, "CV"); - if (!cv) throw dnmt_error("bam_aux_get failed for CV"); - *(cv + 1) = 'T'; -} - -static bool -are_mates(const bam1_t *one, const bam1_t *two) { - return one->core.mtid == two->core.tid && - one->core.mpos == two->core.pos && - bam_is_rev(one) != bam_is_rev(two); - // below is a consistency check and should not be necessary - /* && - two->core.mtid == one->core.tid && - two->core.mpos == one->core.pos; */ -} - -static int -truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { - - const uint32_t *a_cig = bam_get_cigar(a); - const uint32_t a_ops = a->core.n_cigar; - - uint32_t part_op = 0; - const uint32_t c_cur = - get_full_and_partial_ops(a_cig, a_ops, overlap, &part_op); - - // ADS: hack here because the get_full_and_partial_ops doesn't do - // exactly what is needed for this. - const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0); - - const uint32_t c_ops = c_cur + use_partial; - uint32_t *c_cig = (uint32_t *)calloc(c_ops, sizeof(uint32_t)); - - // ADS: replace this with a std::copy - memcpy(c_cig, a_cig, c_cur * sizeof(uint32_t)); - // ADS: warning, if !use_partial, the amount of part_op used below - // would make no sense. - if (use_partial) - c_cig[c_cur] = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur])); - /* after this point the cigar is set and should decide everything */ - - const uint32_t c_seq_len = bam_cigar2qlen(c_ops, c_cig); - const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig); - - // flag only needs to worry about strand and single-end stuff - const uint16_t flag = - a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); - - int ret = bam_set1_wrapper(c, - a->core.l_qname - (a->core.l_extranul + 1), - bam_get_qname(a), - flag, // flags (SR and revcomp info) - a->core.tid, - a->core.pos, - a->core.qual, - c_ops, // merged cigar ops - c_cig, // merged cigar - -1, // (no mate) - -1, // (no mate) - isize, // rlen from new cigar - c_seq_len, // truncated seq length - 8); // enough for the 2 tags? - if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper"); - // ADS: might it be better to fill `c->data` directly? - free(c_cig); - - auto c_seq = bam_get_seq(c); - size_t num_bytes_to_copy = (c_seq_len + 1)/2; - std::copy_n(bam_get_seq(a), num_bytes_to_copy, c_seq); - - /* add the tags */ - const int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better here! - // "udpate" for "int" because it determines the right size - ret = bam_aux_update_int(c, "NM", nm); - if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int"); - - const uint8_t conversion = bam_aux2A(bam_aux_get(a, "CV")); - // "append" for "char" because there is no corresponding update - ret = bam_aux_append(c, "CV", 'A', 1, &conversion); - if (ret < 0) throw dnmt_error(ret, "bam_aux_append"); - - return ret; -} - -static int -merge_overlap(const bam1_t *a, const bam1_t *b, - const uint32_t head, bam1_t *c) { - assert(head > 0); - - const uint32_t *a_cig = bam_get_cigar(a); - const uint32_t a_ops = a->core.n_cigar; - - const uint32_t *b_cig = bam_get_cigar(b); - const uint32_t b_ops = b->core.n_cigar; - - uint32_t part_op = 0; - uint32_t c_cur = get_full_and_partial_ops(a_cig, a_ops, head, &part_op); - // ADS: hack here because the get_full_and_partial_ops doesn't do - // exactly what is needed for this. - const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0); - - // check if the middle op would be the same - const bool merge_mid = - (use_partial > 0 ? - bam_cigar_op(a_cig[c_cur]) == bam_cigar_op(b_cig[0]) : - bam_cigar_op(a_cig[c_cur - 1]) == bam_cigar_op(b_cig[0])); - - // c_ops: include the prefix of a_cig we need; then add for the - // partial op; subtract for the identical op in the middle; finally - // add the rest of b_cig. - const uint32_t c_ops = c_cur + use_partial - merge_mid + b_ops; - - uint32_t *c_cig = (uint32_t *)calloc(c_ops, sizeof(uint32_t)); - // std::fill(c_cig, c_cig + c_ops, std::numeric_limits::max()); - memcpy(c_cig, a_cig, c_cur * sizeof(uint32_t)); - - if (use_partial) { - c_cig[c_cur] = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur])); - c_cur++; // index of dest for copying b_cig; faciltates corner case - } - // Here we get the length of a's sequence part contribution to c's - // sequence before the possibility of merging the last entry with - // the first entry in b's cigar. This is done with the cigar, so - // everything depends on the "use_partial" - const size_t a_seq_len = bam_cigar2qlen(c_cur, c_cig); - /* ADS: above the return type of bam_cigar2qlen is uint64_t, but - according to the source as of 05/2023 it cannot become - negative; no possible error code returned */ - - if (merge_mid) // update the middle op if it's the same - c_cig[c_cur - 1] = bam_cigar_gen(bam_cigar_oplen(c_cig[c_cur - 1]) + - bam_cigar_oplen(b_cig[0]), - bam_cigar_op(b_cig[0])); - // copy the cigar from b into c - memcpy(c_cig + c_cur, b_cig + merge_mid, - (b_ops - merge_mid)*sizeof(uint32_t)); - /* done with cigar string here */ - - const uint32_t c_seq_len = a_seq_len + b->core.l_qseq; - - // get the template length - const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig); - - // flag only needs to worry about strand and single-end stuff - uint16_t flag = (a->core.flag & (BAM_FREAD1 | - BAM_FREAD2 | - BAM_FREVERSE)); - - int ret = bam_set1_wrapper(c, - a->core.l_qname - (a->core.l_extranul + 1), - bam_get_qname(a), - flag, // (no PE; revcomp info) - a->core.tid, - a->core.pos, - a->core.qual, // mapq from "a" (consider update) - c_ops, // merged cigar ops - c_cig, // merged cigar - -1, // (no mate) - -1, // (no mate) - isize, // updated - c_seq_len, // merged sequence length - 8); // enough for 2 tags? - free(c_cig); - if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper in merge_overlap"); - // Merge the sequences by bytes - merge_by_byte(a, b, c); - - // add the tag for mismatches - const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) + - bam_aux2i(bam_aux_get(b, "NM"))); - ret = bam_aux_update_int(c, "NM", nm); - if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int in merge_overlap"); - - // add the tag for conversion - const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); - ret = bam_aux_append(c, "CV", 'A', 1, &cv); - if (ret < 0) throw dnmt_error(ret, "bam_aux_append in merge_overlap"); - - return ret; -} - -static int -merge_non_overlap(const bam1_t *a, const bam1_t *b, - const uint32_t spacer, bam1_t *c) { - /* make the cigar string */ - // collect info about the cigar strings - const uint32_t *a_cig = bam_get_cigar(a); - const uint32_t a_ops = a->core.n_cigar; - const uint32_t *b_cig = bam_get_cigar(b); - const uint32_t b_ops = b->core.n_cigar; - // allocate the new cigar string - const uint32_t c_ops = a_ops + b_ops + 1; - uint32_t *c_cig = (uint32_t *)calloc(c_ops, sizeof(uint32_t)); - // concatenate the new cigar strings with a "skip" in the middle - memcpy(c_cig, a_cig, a_ops * sizeof(uint32_t)); - c_cig[a_ops] = bam_cigar_gen(spacer, BAM_CREF_SKIP); - memcpy(c_cig + a_ops + 1, b_cig, b_ops * sizeof(uint32_t)); - /* done with cigars */ - - const size_t a_seq_len = get_qlen(a); - const size_t b_seq_len = get_qlen(b); - const size_t c_seq_len = a_seq_len + b_seq_len; - - // get the template length from the cigar - const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig); - - // flag: only need to keep strand and single-end info - const uint16_t flag = a->core.flag & (BAM_FREAD1 | - BAM_FREAD2 | - BAM_FREVERSE); - - int ret = - bam_set1_wrapper(c, - a->core.l_qname - (a->core.l_extranul + 1), - bam_get_qname(a), - flag, // flags (no PE; revcomp info) - a->core.tid, - a->core.pos, - a->core.qual, // mapq from a (consider update) - c_ops, // merged cigar ops - c_cig, // merged cigar - -1, // (no mate) - -1, // (no mate) - isize, // TLEN (relative to reference; SAM docs) - c_seq_len, // merged sequence length - 8); // enough for 2 tags of 1 byte value? - free(c_cig); - if (ret < 0) throw dnmt_error(ret, "bam_set1 in merge_non_overlap"); - - merge_by_byte(a, b, c); - - /* add the tags */ - const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) + - bam_aux2i(bam_aux_get(b, "NM"))); - // "udpate" for "int" because it determines the right size - ret = bam_aux_update_int(c, "NM", nm); - if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_update_int"); - - const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); - // "append" for "char" because there is no corresponding update - ret = bam_aux_append(c, "CV", 'A', 1, &cv); - if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_append"); - - return ret; -} - -static int -keep_better_end(const bam1_t *a, const bam1_t *b, bam1_t *c) { - c = bam_copy1(c, get_rlen(a) >= get_rlen(b) ? a : b); - c->core.mtid = -1; - c->core.mpos = -1; - c->core.isize = get_rlen(c); - c->core.flag &= (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); - return 0; -} +using std::equal; static size_t merge_mates(const size_t range, - bam1_t *one, bam1_t *two, bam1_t *merged) { + bam_rec &one, bam_rec &two, bam_rec &merged) { if (!are_mates(one, two)) return -std::numeric_limits::max(); // arithmetic easier using base 0 so subtracting 1 from pos - const int one_s = one->core.pos; - const int one_e = bam_endpos(one); - const int two_s = two->core.pos; - const int two_e = bam_endpos(two); + const int one_s = get_pos(one); + const int one_e = get_endpos(one); + const int two_s = get_pos(two); + const int two_e = get_endpos(two); assert(one_s >= 0 && two_s >= 0); const int spacer = two_s - one_e; @@ -853,98 +153,22 @@ merge_mates(const size_t range, return two_e - one_s; } -/********Above are functions for merging pair-end reads********/ - -// ADS: will move to using this function once it is written -static void -standardize_format(const string &input_format, bam1_t *aln) { - int err_code = 0; - - if (input_format == "abismal" || input_format == "walt") return; - - if (input_format == "bsmap") { - // A/T rich; get the ZS tag value - auto zs_tag = bam_aux_get(aln, "ZS"); - if (!zs_tag) throw dnmt_error("bam_aux_get for ZS (invalid bsmap)"); - // ADS: test for errors on the line below - const uint8_t cv = string(bam_aux2Z(zs_tag))[1] == '-' ? 'A' : 'T'; - // get the "mismatches" tag - auto nm_tag = bam_aux_get(aln, "NM"); - if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bsmap)"); - const int64_t nm = bam_aux2i(nm_tag); - - aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize) - - /* add the tags we want */ - // "udpate" for "int" because it determines the right size; even - // though we just deleted all tags, it will add it back here. - err_code = bam_aux_update_int(aln, "NM", nm); - if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int"); - // "append" for "char" because there is no corresponding update - err_code = bam_aux_append(aln, "CV", 'A', 1, &cv); - if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append"); - - if (bam_is_rev(aln)) - revcomp_seq_by_byte(aln); // reverse complement if needed - } - if (input_format == "bismark") { - // ADS: Previously we modified the read names at the first - // underscore. Even if the names are still that way, it should no - // longer be needed since we compare names up to a learned suffix. - - // A/T rich; get the XR tag value - auto xr_tag = bam_aux_get(aln, "XR"); - if (!xr_tag) throw dnmt_error("bam_aux_get for XR (invalid bismark)"); - const uint8_t cv = string(bam_aux2Z(xr_tag)) == "GA" ? 'A' : 'T'; - // get the "mismatches" tag - auto nm_tag = bam_aux_get(aln, "NM"); - if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bismark)"); - const int64_t nm = bam_aux2i(nm_tag); - - aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize) - - /* add the tags we want */ - // "udpate" for "int" because it determines the right size; even - // though we just deleted all tags, it will add it back here. - err_code = bam_aux_update_int(aln, "NM", nm); - if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int"); - // "append" for "char" because there is no corresponding update - err_code = bam_aux_append(aln, "CV", 'A', 1, &cv); - if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append"); - - if (bam_is_rev(aln)) - revcomp_seq_by_byte(aln); // reverse complement if needed - } - // Be sure this doesn't depend on mapper! Removes the "qual" part of - // the data in a bam1_t struct but does not change its uncompressed - // size. - const auto qs = bam_get_qual(aln); - std::fill(qs, qs + aln->core.l_qseq, '\xff'); // deletes qseq -} +/********Above are functions for merging pair-end reads********/ static vector load_read_names(const string &inputfile, const size_t n_reads) { - samFile *hts = hts_open(inputfile.c_str(), "r"); - if (!hts) throw dnmt_error("failed to open file: " + inputfile); + bam_infile hts(inputfile); + if (!hts) throw dnmt_error("failed to open BAM/SAM file: " + inputfile); - sam_hdr_t *hdr = sam_hdr_read(hts); - if (!hdr) throw dnmt_error("failed to read header: " + inputfile); + bam_header hdr(hts); + if (!hdr) throw dnmt_error("failed to get header: " + inputfile); - bam1_t *aln = bam_init1(); + bam_rec aln; vector names; size_t count = 0; - int err_code = 0; - - while ((err_code = sam_read1(hts, hdr, aln)) >= 0 && count++ < n_reads) - names.push_back(string(bam_get_qname(aln))); - // err_core == -1 means EOF - if (err_code < -1) dnmt_error(err_code, "load_read_names:sam_read1"); - - bam_destroy1(aln); - bam_hdr_destroy(hdr); - err_code = hts_close(hts); - if (err_code < 0) throw dnmt_error(err_code, "check_input_file:hts_close"); + while (hts.read(hdr, aln) && count++ < n_reads) + names.push_back(bam_get_qname(aln)); return names; } @@ -1055,65 +279,42 @@ check_sorted(const string &inputfile, const size_t suff_len, size_t n_reads) { return true; } -static bool +static inline void check_input_file(const string &infile) { - // ADS: (below) At some point the errno here is set to 3=ESRCH ("No - // such process"?) when using HTSlib 1.17 on macos ventura - const int prev_errno = errno; - samFile* hts = hts_open(infile.c_str(), "r"); - if (!hts || errno != prev_errno) - throw dnmt_error("error opening: " + infile); - - const htsFormat *fmt = hts_get_format(hts); - if (fmt->category != sequence_data) - throw dnmt_error("not sequence data: " + infile); - if (fmt->format != bam && fmt->format != sam) - throw dnmt_error("not SAM/BAM format: " + infile); - - const int err_code = hts_close(hts); - if (err_code < 0) throw dnmt_error(err_code, "check_input_file:hts_close"); - - return true; + const bam_infile hts(infile); + if (!hts) throw dnmt_error("failed to open file: " + infile); + if (!hts.is_mapped_reads_file()) + throw dnmt_error("not valid SAM/BAM format: " + infile); } static bool check_format_in_header(const string &input_format, const string &inputfile) { - samFile* hts = hts_open(inputfile.c_str(), "r"); + bam_infile hts(inputfile); if (!hts) throw dnmt_error("error opening file: " + inputfile); - - sam_hdr_t *hdr = sam_hdr_read(hts); - if (!hdr) throw dnmt_error("failed to read header: " + inputfile); - - auto begin_hdr = sam_hdr_str(hdr); - auto end_hdr = begin_hdr + std::strlen(begin_hdr); - auto it = std::search(begin_hdr, end_hdr, + const bam_header bh(hts); + if (!bh) throw dnmt_error("failed to read header: " + inputfile); + const string entire_header(bh.tostring()); + auto it = std::search(begin(entire_header), end(entire_header), begin(input_format), end(input_format), [](const unsigned char a, const unsigned char b) { return std::toupper(a) == std::toupper(b); }); - bam_hdr_destroy(hdr); - const int err_code = hts_close(hts); - if (err_code < 0) throw dnmt_error(err_code, "check_format_in_header:hts_close"); - - return it != end_hdr; + return it != end(entire_header); } -static bool -same_name(const bam1_t *a, const bam1_t *b, const size_t suff_len) { +static inline bool +same_name(const bam_rec &a, const bam_rec &b, const size_t suff_len) { // "+ 1" below: extranul counts *extras*; we don't want *any* nulls - const uint16_t a_l = a->core.l_qname - (a->core.l_extranul + 1); - const uint16_t b_l = b->core.l_qname - (b->core.l_extranul + 1); + const uint16_t a_l = a.b->core.l_qname - (a.b->core.l_extranul + 1); + const uint16_t b_l = b.b->core.l_qname - (b.b->core.l_extranul + 1); if (a_l != b_l) return false; assert(a_l > suff_len); return !std::strncmp(bam_get_qname(a), bam_get_qname(b), a_l - suff_len); } -static void -add_pg_line(const string &cmd, sam_hdr_t *hdr) { - int err_code = - sam_hdr_add_line(hdr, "PG", "ID", "DNMTOOLS", "VN", - VERSION, "CL", cmd.c_str(), NULL); - if (err_code) throw dnmt_error(err_code, "failed to add pg header line"); +static inline void +swap(bam_rec &a, bam_rec &b) { + std::swap(a.b, b.b); } static void @@ -1122,99 +323,73 @@ format(const string &cmd, const size_t n_threads, const bool bam_format, const string &input_format, const size_t suff_len, const size_t max_frag_len) { - int err_code = 0; + static const dnmt_error bam_write_err{"error writing bam"}; - // open the hts files; assume already checked - samFile *hts = hts_open(inputfile.c_str(), "r"); - samFile *out = hts_open(outfile.c_str(), bam_format ? "wb" : "w"); + bam_tpool tpool(n_threads); // outer scope: must be destroyed last + { + bam_infile hts(inputfile); // assume already checked + bam_header hdr(hts); + if (!hdr) throw dnmt_error("failed to read header"); - // set the threads - htsThreadPool the_thread_pool{hts_tpool_init(n_threads), 0}; - err_code = hts_set_thread_pool(hts, &the_thread_pool); - if (err_code < 0) throw dnmt_error("error setting threads"); - err_code = hts_set_thread_pool(out, &the_thread_pool); - if (err_code < 0) throw dnmt_error("error setting threads"); + bam_outfile out(outfile, bam_format); + { + bam_header hdr_out(hdr); + if (!hdr_out) throw dnmt_error("failed create header"); + hdr_out.add_pg_line(cmd, "DNMTOOLS", VERSION); + if (!out.write(hdr_out)) throw dnmt_error("failed to write header"); + } - // headers: load the input file's header, and then update to the - // output file's header, then write it and destroy; we will only use - // the input file header. - sam_hdr_t *hdr = sam_hdr_read(hts); - if (!hdr) throw dnmt_error("failed to read header"); - sam_hdr_t *hdr_out = bam_hdr_dup(hdr); - if (!hdr_out) throw dnmt_error("failed create header"); - add_pg_line(cmd, hdr_out); - err_code = sam_hdr_write(out, hdr_out); - if (err_code) throw dnmt_error(err_code, "failed to output header"); + if (n_threads > 1) { + tpool.set_io(hts); + tpool.set_io(out); + } - // now process the reads - bam1_t *aln = bam_init1(); - bam1_t *prev_aln = bam_init1(); - bam1_t *merged = bam_init1(); - bool previous_was_merged = false; + bam_rec aln, prev_aln, merged; + bool previous_was_merged = false; - err_code = sam_read1(hts, hdr, aln); // for EOF, err_code == -1 - if (err_code < -1) throw dnmt_error(err_code, "format:sam_read1"); - const bool empty_reads_file = (err_code == -1); - if (!empty_reads_file) { - standardize_format(input_format, aln); + const bool empty_reads_file = !hts.read(hdr, aln); - std::swap(aln, prev_aln); // start with prev_aln being first read + if (!empty_reads_file) { - while ((err_code = sam_read1(hts, hdr, aln)) >= 0) { standardize_format(input_format, aln); - if (same_name(prev_aln, aln, suff_len)) { - // below: essentially check for dovetail - if (!bam_is_rev(aln)) std::swap(prev_aln, aln); - const size_t frag_len = merge_mates(max_frag_len, prev_aln, aln, merged); - if (frag_len > 0 && frag_len < max_frag_len) { - if (is_a_rich(merged)) flip_conversion(merged); - err_code = sam_write1(out, hdr, merged); - if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1"); + + swap(aln, prev_aln); // start with prev_aln being first read + + while (hts.read(hdr, aln)) { + standardize_format(input_format, aln); + if (same_name(prev_aln, aln, suff_len)) { + // below: essentially check for dovetail + if (!bam_is_rev(aln)) swap(prev_aln, aln); + const size_t frag_len = merge_mates(max_frag_len, prev_aln, aln, merged); + if (frag_len > 0 && frag_len < max_frag_len) { + if (is_a_rich(merged)) flip_conversion(merged); + if (!out.write(hdr, merged)) throw bam_write_err; + } + else { + if (is_a_rich(prev_aln)) flip_conversion(prev_aln); + if (!out.write(hdr, prev_aln)) throw bam_write_err; + if (is_a_rich(aln)) flip_conversion(aln); + if (!out.write(hdr, aln)) throw bam_write_err; + } + previous_was_merged = true; } else { - if (is_a_rich(prev_aln)) flip_conversion(prev_aln); - err_code = sam_write1(out, hdr, prev_aln); - if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1"); - if (is_a_rich(aln)) flip_conversion(aln); - err_code = sam_write1(out, hdr, aln); - if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1"); + if (!previous_was_merged) { + if (is_a_rich(prev_aln)) flip_conversion(prev_aln); + if (!out.write(hdr, prev_aln)) throw bam_write_err; + } + previous_was_merged = false; } - previous_was_merged = true; + swap(prev_aln, aln); } - else { - if (!previous_was_merged) { - if (is_a_rich(prev_aln)) flip_conversion(prev_aln); - err_code = sam_write1(out, hdr, prev_aln); - if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1"); - } - previous_was_merged = false; + if (!previous_was_merged) { + if (is_a_rich(prev_aln)) flip_conversion(prev_aln); + if (!out.write(hdr, prev_aln)) throw bam_write_err; } - std::swap(prev_aln, aln); - } - if (err_code < -1) throw dnmt_error(err_code, "format:sam_read1"); - - if (!previous_was_merged) { - if (is_a_rich(prev_aln)) flip_conversion(prev_aln); - err_code = sam_write1(out, hdr, prev_aln); - if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1"); } } - - // turn off the lights - bam_destroy1(prev_aln); - bam_destroy1(aln); - bam_destroy1(merged); - bam_hdr_destroy(hdr); - bam_hdr_destroy(hdr_out); - err_code = hts_close(hts); - if (err_code < 0) throw dnmt_error(err_code, "format:hts_close"); - err_code = hts_close(out); - if (err_code < 0) throw dnmt_error(err_code, "format:hts_close"); - // do this after the files have been closed - hts_tpool_destroy(the_thread_pool.pool); } - int main_format(int argc, const char **argv) { try { @@ -1324,7 +499,7 @@ int main_format(int argc, const char **argv) { } else if (!check_suff_len(infile, suff_len, n_reads_to_check)) throw dnmt_error("wrong read name suffix length [" + - std::to_string(suff_len) + "] in: " + infile); + std::to_string(suff_len) + "] in: " + infile); if (!check_sorted(infile, suff_len, n_reads_to_check)) throw dnmt_error("mates not consecutive in: " + infile); } diff --git a/src/utils/uniq.cpp b/src/utils/uniq.cpp index 37b1ca08..2ea97e11 100644 --- a/src/utils/uniq.cpp +++ b/src/utils/uniq.cpp @@ -18,7 +18,7 @@ * General Public License for more details. */ -#include // for [u]int[0-9]+_t +#include // for [u]int[0-9]+_t #include #include #include @@ -28,18 +28,13 @@ // generated by autotools #include -// from HTSlib -#include -#include - -// from smithlab #include "GenomicRegion.hpp" #include "OptionParser.hpp" #include "bsutils.hpp" +#include "dnmt_error.hpp" #include "smithlab_os.hpp" #include "smithlab_utils.hpp" - -#include "dnmt_error.hpp" +#include "bam_record_utils.hpp" using std::cerr; using std::endl; @@ -59,69 +54,23 @@ namespace uniq_random { bool initialized = false; std::default_random_engine e; std::uniform_int_distribution di; - void initialize(const size_t the_seed) { e = std::default_random_engine(the_seed); initialized = true; } - int rand() { // ADS: should have same range as ordinary rand() by properties of // std::uniform_int_distribution default constructor. // assert(initialized); return di(e); } -} // namespace uniq_random - -inline bool -format_is_bam_or_sam(htsFile *hts) { - const htsFormat *fmt = hts_get_format(hts); - return fmt->category == sequence_data && - (fmt->format == bam || fmt->format == sam); -} - -inline string -qname(const bam1_t *b) { - return string(bam_get_qname(b)); -} - -inline int32_t -get_tid(bam1_t *b) { - return b->core.tid; -} - -inline size_t -qlen(const bam1_t *r) { - return bam_cigar2qlen(r->core.n_cigar, bam_get_cigar(r)); -} - -inline bool -precedes_by_start(const bam1_t *a, const bam1_t *b) { - return a->core.tid == b->core.tid && a->core.pos < b->core.pos; -} - -inline bool -precedes_by_end_and_strand(const bam1_t *a, const bam1_t *b) { - const size_t end_a = bam_endpos(a), end_b = bam_endpos(b); - return (end_a < end_b || (end_a == end_b && bam_is_rev(a) < bam_is_rev(b))); -} - -inline bool -equivalent_chrom_and_start(const bam1_t *a, const bam1_t *b) { - return a->core.pos == b->core.pos && a->core.tid == b->core.tid; -} - -inline bool -equivalent_end_and_strand(const bam1_t *a, const bam1_t *b) { - return bam_endpos(a) == bam_endpos(b) && bam_is_rev(a) == bam_is_rev(b); -} +} // namespace uniq_random -struct rd_stats { // keep track of good bases/reads in and out +struct rd_stats { // keep track of good bases/reads in and out size_t bases{}; size_t reads{}; - - void update(bam1_t *b) { - bases += qlen(b); + void update(const bam_rec &b) { + bases += get_l_qseq(b); ++reads; } }; @@ -159,24 +108,24 @@ write_hist_output(const vector &hist, const string &histfile) { } /* The "inner" buffer corresponds to all reads sharing chrom, start, - * end and strand, and is a contiguous subset of the "outer" buffer - * that shares the same end and strand. - */ + end and strand, and is a contiguous subset of the "outer" buffer + that shares the same end and strand. */ static void process_inner_buffer(const bool add_dup_count, - const vector::const_iterator it, - const vector::const_iterator jt, sam_hdr_t *hdr, - samFile *out, rd_stats &rs_out, size_t &reads_duped, + const vector::iterator it, + const vector::iterator jt, bam_header &hdr, + bam_outfile &out, rd_stats &rs_out, size_t &reads_duped, vector &hist) { + constexpr char du_tag[2] = {'D', 'U'}; const size_t n_reads = std::distance(it, jt); const size_t selected = uniq_random::rand() % n_reads; if (add_dup_count) { - const int ret = bam_aux_update_int(*(it + selected), "DU", n_reads); + const int ret = bam_aux_update_int(*(it + selected), du_tag, n_reads); if (ret < 0) throw dnmt_error("error adding duplicate count aux field"); } - if (sam_write1(out, hdr, *(it + selected)) < 0) + if (!out.write(hdr, *(it + selected))) throw runtime_error("failed writing bam record"); if (hist.size() <= n_reads) hist.resize(n_reads + 1); hist[n_reads]++; @@ -188,8 +137,8 @@ process_inner_buffer(const bool add_dup_count, and start position. These are gathered and then processed together. */ static void process_buffer(const bool add_dup_count, rd_stats &rs_out, size_t &reads_duped, - vector &hist, vector &buffer, sam_hdr_t *hdr, - samFile *out) { + vector &hist, vector &buffer, bam_header &hdr, + bam_outfile &out) { sort(begin(buffer), end(buffer), precedes_by_end_and_strand); auto it(begin(buffer)); auto jt = it + 1; @@ -201,114 +150,72 @@ process_buffer(const bool add_dup_count, rd_stats &rs_out, size_t &reads_duped, } process_inner_buffer(add_dup_count, it, jt, hdr, out, rs_out, reads_duped, hist); - - // free the bam1_t pointers before clearing the buffer - for (size_t i = 0; i < buffer.size(); ++i) - if (buffer[i] != 0) { - bam_destroy1(buffer[i]); - buffer[i] = 0; - } buffer.clear(); } -static bam1_t * -get_read(samFile *hts, sam_hdr_t *hdr) { - bam1_t *b = bam_init1(); - const int result = sam_read1(hts, hdr, b); - if (result >= 0) return b; - - if (result < -1) - throw runtime_error("error reading file: " + string(hts->fn)); - else // -1 should mean EOF, so we free this read - bam_destroy1(b); - return 0; -} - static void uniq(const bool VERBOSE, const bool add_dup_count, const size_t n_threads, const string &cmd, const string &infile, const string &statfile, const string &histfile, const bool bam_format, const string &outfile) { - // ADS: (below) At some point the errno here is set to 3=ESRCH ("No - // such process"?) when using HTSlib 1.17 on macos ventura - int prev_errno = errno; - samFile *hts = hts_open(infile.c_str(), "r"); - if (!hts || errno != prev_errno) throw dnmt_error("error opening: " + infile); - - htsThreadPool the_thread_pool{hts_tpool_init(n_threads), 0}; - if (hts_set_thread_pool(hts, &the_thread_pool) < 0) - throw runtime_error("error setting threads"); - - if (!format_is_bam_or_sam(hts)) - throw runtime_error("bad file format: " + infile); - - sam_hdr_t *hdr = sam_hdr_read(hts); - if (!hdr) throw runtime_error("failed to read header: " + infile); - - // open the output file - samFile *out = hts_open(outfile.c_str(), bam_format ? "wb" : "w"); - - if (hts_set_thread_pool(out, &the_thread_pool) < 0) - throw runtime_error("error setting threads"); - - // take care of the output file's header - sam_hdr_t *hdr_out = bam_hdr_dup(hdr); - if (sam_hdr_add_line(hdr_out, "PG", "ID", "DNMTOOLS", "VN", VERSION, "CL", - cmd.c_str(), NULL)) - throw runtime_error("failed to format header"); - if (sam_hdr_write(out, hdr_out)) - throw runtime_error("failed to output header"); - bam_hdr_destroy(hdr_out); - // values to tabulate stats; no real cost rd_stats rs_in, rs_out; size_t reads_duped = 0; vector hist; - // try to load the first read - prev_errno = errno; - bam1_t *aln = get_read(hts, hdr); - if (errno != prev_errno) - throw runtime_error("failed parsing read from input file"); + bam_tpool tpool(n_threads); // outer scope: must be destroyed last + { + bam_infile hts(infile); + bam_header hdr(hts); + if (!hdr) throw dnmt_error("failed to read header"); + + bam_outfile out(outfile, bam_format); + { + bam_header hdr_out(hdr); + if (!hdr_out) throw dnmt_error("failed create header"); + hdr_out.add_pg_line(cmd, "DNMTOOLS", VERSION); + if (!out.write(hdr_out)) throw dnmt_error("failed to write header"); + } + + if (n_threads > 1) { + tpool.set_io(hts); + tpool.set_io(out); + } - if (aln) { // data file is not empty + bam_rec aln; + if (hts.read(hdr, aln)) { // valid SAM/BAM can have 0 reads - rs_in.update(aln); // update for the input we just got + rs_in.update(aln); // update stats for input we just got - vector buffer(1, aln); // select output from this buffer + vector buffer(1, aln); // select output from this buffer - // to check that reads are sorted properly - vector chroms_seen(hdr->n_targets, false); - int32_t cur_chrom = get_tid(aln); + // to check that reads are sorted properly + vector chroms_seen(get_n_targets(hdr), false); + int32_t cur_chrom = get_tid(aln); - while ((aln = get_read(hts, hdr))) { - rs_in.update(aln); + while (hts.read(hdr, aln)) { + rs_in.update(aln); - // below works because buffer reset at every new chrom - if (precedes_by_start(aln, buffer[0])) - throw runtime_error("not sorted: " + qname(buffer[0]) + " " + - qname(aln)); + // below works because buffer reset at every new chrom + if (precedes_by_start(aln, buffer[0])) + throw runtime_error("not sorted: " + get_qname(buffer[0]) + " " + + get_qname(aln)); - const int32_t chrom = get_tid(aln); - if (chrom != cur_chrom) { - if (chroms_seen[chrom]) throw runtime_error("input not sorted"); - chroms_seen[chrom] = true; - cur_chrom = chrom; - } + const int32_t chrom = get_tid(aln); + if (chrom != cur_chrom) { + if (chroms_seen[chrom]) throw runtime_error("input not sorted"); + chroms_seen[chrom] = true; + cur_chrom = chrom; + } - if (!equivalent_chrom_and_start(buffer[0], aln)) - process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, - out); - buffer.push_back(aln); + if (!equivalent_chrom_and_start(buffer[0], aln)) + process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, + out); + buffer.push_back(aln); + } + process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, + out); } - process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, out); } - - // remember to turn off the lights - bam_hdr_destroy(hdr); - hts_close(out); - hts_close(hts); - hts_tpool_destroy(the_thread_pool.pool); - // write any additional output requested write_stats_output(rs_in, rs_out, reads_duped, statfile); write_hist_output(hist, histfile); @@ -372,7 +279,7 @@ main_uniq(int argc, const char **argv) { if (leftover_args.size() == 2 && !use_stdout) outfile = leftover_args.back(); else - outfile = string("-"); // so htslib can write to stdout + outfile = string("-"); // so htslib can write to stdout /****************** END COMMAND LINE OPTIONS *****************/ // ADS: Random here is because we choose randomly when keeping one