diff --git a/Makefile.am b/Makefile.am
index ac177889..d7df7a43 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -114,6 +114,7 @@ test_scripts/test_roi.log: test_scripts/test_sym.log
 
 noinst_LIBRARIES = libdnmtools.a
 libdnmtools_a_SOURCES = \
+        src/common/bam_record_utils.cpp \
         src/common/BetaBin.cpp \
         src/common/Distro.cpp \
         src/common/EmissionDistribution.cpp \
@@ -129,6 +130,8 @@ libdnmtools_a_SOURCES = \
         src/common/numerical_utils.cpp
 
 libdnmtools_a_SOURCES += \
+	src/common/bam_record.hpp \
+	src/common/bam_record_utils.hpp \
         src/common/BetaBin.hpp \
         src/common/Distro.hpp \
         src/common/EmissionDistribution.hpp \
diff --git a/src/common/bam_record.hpp b/src/common/bam_record.hpp
new file mode 100644
index 00000000..cbe786f8
--- /dev/null
+++ b/src/common/bam_record.hpp
@@ -0,0 +1,98 @@
+/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith
+ *
+ * Authors: Masaru Nakajima and Andrew D. Smith
+ *
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+/* ADS: need to control all the macros from HTSlib pollution. For
+   functions maybe:
+
+   $ gcc -dM -E sam.h | grep "define [a-z]" | awk '{print $2}' |\
+       grep "[(]" | awk -v FS="(" '{print "#undef",$1}'
+
+   This gives about 65 symbols that need to be deleted. For the others
+   I don't know what to do because some of them have "#define _" which
+   means they should be system symbols.
+*/
+
+#ifndef BAM_RECORD_HPP
+#define BAM_RECORD_HPP
+
+#include <stdexcept>
+#include <string>
+
+#include <htslib/sam.h>  // from HTSlib
+#include <htslib/thread_pool.h>  // from HTSlib
+
+struct bam_rec {
+  bam_rec() : b{bam_init1()} {}
+  bam_rec(const bam_rec &other) : b{bam_copy1(bam_init1(), other.b)} {}
+  bam_rec &operator=(bam_rec rhs) { std::swap(b, rhs.b); return *this; }
+  ~bam_rec() { if (b != nullptr) bam_destroy1(b); }
+  bam1_t *b;
+};
+
+struct bam_infile {
+  bam_infile(const std::string &fn) : f{hts_open(fn.c_str(), "r")} {}
+  ~bam_infile() { if (f != nullptr) hts_close(f); }
+  operator bool() const { return f != nullptr; }
+  template <typename T> bool read(T &h, bam_rec &b) {
+    const int x = sam_read1(f, h.h, b.b);  // -1 on EOF; args non-const
+    if (x < -1) throw std::runtime_error("failed reading bam record");
+    return x >= 0;
+  }
+  bool is_mapped_reads_file() const {
+    const htsFormat *fmt = hts_get_format(f);
+    return fmt->category == sequence_data &&
+      (fmt->format == bam || fmt->format == sam);
+  }
+  samFile *f{};
+};
+
+struct bam_header {
+  bam_header() = default;
+  bam_header(const bam_header &rhs) : h{bam_hdr_dup(rhs.h)} {}
+  bam_header(bam_infile &in) : h{sam_hdr_read(in.f)} {}
+  ~bam_header() { if (h != nullptr) bam_hdr_destroy(h); }
+  operator bool() const { return h != nullptr; }
+  bool add_pg_line(const std::string cmd, const std::string id,
+                   const std::string vn) {
+    return sam_hdr_add_line(h, "PG", "ID", id.c_str(),  "VN",
+                            vn.c_str(), "CL", cmd.c_str(), nullptr) == 0;
+  }
+  std::string tostring() const { return sam_hdr_str(h); }
+  sam_hdr_t *h{};
+};
+
+struct bam_outfile {
+  bam_outfile(const std::string &fn, const bool fmt = false) :
+    f{hts_open(fn.c_str(), fmt ? "bw" : "w")} {}
+  ~bam_outfile() { if (f != nullptr) hts_close(f); }
+  operator bool() const { return f != nullptr; }
+  bool write(const bam_header &h, const bam_rec &b) {
+    return sam_write1(f, h.h, b.b) >= 0;
+  }
+  bool write(const bam_header &h) { return sam_hdr_write(f, h.h) == 0; }
+  htsFile *f{};
+};
+
+struct bam_tpool{
+  bam_tpool(const size_t n_threads) : tpool{hts_tpool_init(n_threads), 0} {}
+  ~bam_tpool() { hts_tpool_destroy(tpool.pool); }
+  template<class T> void set_io(const T &bam_file) {
+    const int ret = hts_set_thread_pool(bam_file.f, &tpool);
+    if (ret < 0) throw std::runtime_error("failed to set thread pool");
+  }
+  htsThreadPool tpool{};
+};
+
+#endif
diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp
new file mode 100644
index 00000000..adbd089d
--- /dev/null
+++ b/src/common/bam_record_utils.cpp
@@ -0,0 +1,885 @@
+/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith
+ *
+ * Authors: Masaru Nakajima and Andrew D. Smith
+ *
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include "bam_record_utils.hpp"
+#include "dnmt_error.hpp"
+
+#include <sstream>
+#include <algorithm>
+#include <string>
+
+#include <htslib/sam.h>
+
+#include "smithlab_utils.hpp"
+
+using std::string;
+using std::vector;
+
+/// functions in place of undefd macro
+static inline bool
+bam_is_rev(const bam1_t *b) { return (b->core.flag & BAM_FREVERSE) != 0; }
+
+static inline char *
+bam_get_qname(const bam1_t *b) { return reinterpret_cast<char*>(b->data); }
+
+static inline uint32_t *
+bam_get_cigar(const bam1_t *b) {
+  return reinterpret_cast<uint32_t*>(b->data + b->core.l_qname);
+}
+
+static inline uint8_t *
+bam_get_seq(const bam1_t *b) {
+  // start of data +  bytes for query/read name+ bytes for cigar
+  return b->data + b->core.l_qname + (b->core.n_cigar << 2);
+}
+
+static inline uint8_t *
+bam_get_qual(const bam1_t *b) {
+  return b->data +  // start of data
+    b->core.l_qname +  // bytes for query name
+    (b->core.n_cigar << 2) +  // bytes for cigar
+    ((b->core.l_qseq + 1) >> 1);  // bytes for packed query/read
+}
+
+static inline uint8_t *
+bam_get_aux(const bam1_t *b) {
+  return b->data +
+    b->core.l_qname +
+    (b->core.n_cigar << 2) +
+    ((b->core.l_qseq + 1) >> 1) +
+    b->core.l_qseq;
+}
+
+static inline int
+bam_get_l_aux(const bam1_t *b) {
+  return b->l_data -
+    (b->core.l_qname +
+     (b->core.n_cigar << 2) +
+     ((b->core.l_qseq + 1) >> 1) +
+     b->core.l_qseq);
+}
+
+static inline bool
+bam_same_orientation(const bam1_t *a, const bam1_t *b) {
+  return ((a->core.flag ^ b->core.flag) & BAM_FREVERSE) != 0;
+}
+
+static void
+roundup_to_power_of_2(uint32_t &x) {
+  bool k_high_bit_set = (x >> (sizeof(uint32_t) * 8 - 1)) & 1;
+  if (x > 0) {
+    uint8_t size = sizeof(uint32_t);
+    --x;
+    x |= x >> (size / 4);
+    x |= x >> (size / 2);
+    x |= x >> (size);
+    x |= x >> (size * 2);
+    x |= x >> (size * 4);
+    x += !k_high_bit_set;
+  }
+  else x = 0;
+}
+
+static int
+sam_realloc_bam_data(bam1_t *b, size_t desired) {
+  uint32_t new_m_data = desired;
+  roundup_to_power_of_2(new_m_data);
+  if (new_m_data < desired) {
+    errno = ENOMEM; // (from sam.c) Not strictly true but we can't
+                    // store the size
+    return -1;
+  }
+  uint8_t *new_data = nullptr;
+  if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
+    new_data = static_cast<uint8_t *>(realloc(b->data, new_m_data));
+  }
+  else {
+    new_data = static_cast<uint8_t *>(malloc(new_m_data));
+    if (new_data != nullptr) {
+      if (b->l_data > 0)
+        std::copy_n(new_data, (static_cast<uint32_t>(b->l_data) < b->m_data) ?
+                    b->l_data : b->m_data, b->data);
+      bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
+    }
+  }
+  if (!new_data) return -1;
+  b->data = new_data;
+  b->m_data = new_m_data;
+  return 0;
+}
+
+// static int
+// sam_realloc_bam_data(bam1_t *b, size_t desired) {
+//   /* returns flag: either 0 for success or -1 for error (unable to
+//      allocate desired memory) */
+//   uint32_t new_m_data = desired;
+//   roundup_to_power_of_2(new_m_data);
+//   if (new_m_data < desired)  return -1;
+//   uint8_t *new_data = (uint8_t *)realloc(b->data, new_m_data);
+//   if (!new_data) return -1;
+//   // ADS: what would be the state of members below if -1 was returned?
+//   b->data = new_data;
+//   b->m_data = new_m_data;
+//   return 0;
+// }
+
+// static inline void
+// bam_copy_core(const bam1_t *a, bam1_t *b) {
+//   // ADS: prepared for a possibly more efficient block copy to assign
+//   // all variables at once, like this: b->core = a->core;
+//   b->core.pos = a->core.pos;
+//   b->core.tid = a->core.tid;
+//   b->core.bin = a->core.bin;
+//   b->core.qual = a->core.qual;
+//   b->core.l_extranul = a->core.l_extranul;
+//   b->core.flag = a->core.flag;
+//   b->core.l_qname = a->core.l_qname;
+//   b->core.n_cigar = a->core.n_cigar;
+//   b->core.l_qseq = a->core.l_qseq;
+//   b->core.mtid = a->core.mtid;
+//   b->core.mpos = a->core.mpos;
+//   b->core.isize = a->core.isize;
+// }
+
+static inline void
+bam_set1_core(bam1_core_t &core,
+              const size_t l_qname, const uint16_t flag, const int32_t tid,
+              const hts_pos_t pos, const uint8_t mapq, const size_t n_cigar,
+              const int32_t mtid, const hts_pos_t mpos, const hts_pos_t isize,
+              const size_t l_seq, const size_t qname_nuls) {
+  /* ADS: These are used in `hts_reg2bin` from `htslib/hts.h` and
+     likely mean "region to bin" for indexing */
+  /* MN: hts_reg2bin categorizes the size of the reference region.
+     Here, we use the numbers used in htslib/cram/cram_samtools.h */
+  static const int min_shift = 14;
+  static const int n_lvls = 5;
+
+  core.pos = pos;
+  core.tid = tid;
+  core.bin = hts_reg2bin(pos, pos + isize, min_shift, n_lvls);
+  core.qual = mapq;
+  core.l_extranul = qname_nuls - 1;
+  core.flag = flag;
+  core.l_qname = l_qname + qname_nuls;
+  core.n_cigar = n_cigar;
+  core.l_qseq = l_seq;
+  core.mtid = mtid;
+  core.mpos = mpos;
+  core.isize = isize;
+}
+
+static inline int
+bam_set1_wrapper(bam1_t *bam,
+                 const size_t l_qname, const char *qname,
+                 const uint16_t flag, const int32_t tid,
+                 const hts_pos_t pos, const uint8_t mapq,
+                 const size_t n_cigar, const uint32_t *cigar,
+                 const int32_t mtid, const hts_pos_t mpos,
+                 const hts_pos_t isize, const size_t l_seq,
+                 const size_t l_aux) {
+  /* This is based on how assignment is done in the `bam_set1`
+     function defined in `sam.c` from htslib */
+
+  /* This modification assigns variables of bam1_t struct but not the
+   * query/read sequence.
+   *
+   * many checks in bam_set1 have been removed because they are
+   * checked in code that calls this function, mostly because the
+   * values come from valid `bam1_t` instances, so have already been
+   * validated.
+   *
+   * Assumptions:
+   * cigar: correct format and matching length
+   * rlen = isize
+   * qlen = l_seq
+   * l_qname <= 254
+   * HTS_POS_MAX - rlen > pos
+   * Where HTS_POS_MAX = ((((int64_t)INT_MAX)<<32)|INT_MAX) is the highest
+   * supported position.
+   *
+   * Number of bytes needed for the data is smaller than INT32_MAX
+   *
+   * qual = NULL, because we do not keep the quality scores through
+   * formatting the reads.
+   */
+
+  // `qname_nuls` below is the number of '\0' to use to pad the qname
+  // so that the cigar has 4-byte alignment.
+  const size_t qname_nuls = 4 - l_qname % 4;
+  bam_set1_core(bam->core, l_qname, flag, tid, pos, mapq, n_cigar,
+                mtid, mpos, isize, l_seq, qname_nuls);
+
+  const size_t data_len =
+    (l_qname + qname_nuls + n_cigar*sizeof(uint32_t) + (l_seq + 1) / 2 + l_seq);
+
+  bam->l_data = data_len;
+  if (data_len + l_aux > bam->m_data) {
+    const int ret = sam_realloc_bam_data(bam, data_len + l_aux);
+    if (ret < 0)
+      throw dnmt_error(ret, "Failed to allocate memory for BAM record");
+  }
+  auto data_iter = bam->data;
+
+  std::copy_n(qname, l_qname, data_iter);
+  std::fill_n(data_iter + l_qname, qname_nuls, '\0');
+  data_iter += l_qname + qname_nuls;
+
+  // ADS: reinterpret here because we know the cigar is originally an
+  // array of uint32_t and has been aligned for efficiency
+  std::copy_n(cigar, n_cigar, reinterpret_cast<uint32_t*>(data_iter));
+  data_iter += n_cigar*sizeof(uint32_t);
+
+  // skipping sequece assignment
+  data_iter += (l_seq + 1) / 2;
+
+  std::fill(data_iter, data_iter + l_seq, '\xff');
+
+  return static_cast<int>(data_len);
+}
+
+static inline bool
+eats_ref(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; }
+
+static inline bool
+eats_query(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; }
+
+static inline size_t
+bam_get_n_cigar(const bam1_t *b) { return b->core.n_cigar; }
+
+static inline uint32_t
+to_insertion(const uint32_t x) {
+  return (x & ~BAM_CIGAR_MASK) | BAM_CINS;
+}
+
+static inline void
+fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) {
+  if (n_cigar < 3) return;
+  // find first non-softclip
+  auto c_beg = cigar;
+  auto c_end = cigar + n_cigar;
+
+  while (!eats_ref(*c_beg) && ++c_beg != c_end);
+  if (c_beg == c_end) throw dnmt_error("cigar eats no ref");
+
+  while (!eats_ref(*(c_end-1)) && --c_end != c_beg);
+  if (c_beg == c_end) throw dnmt_error("cigar eats no ref");
+
+  for (auto c_itr = c_beg; c_itr != c_end; ++c_itr)
+    if (bam_cigar_op(*c_itr) == BAM_CSOFT_CLIP)
+      *c_itr = to_insertion(*c_itr);
+}
+
+static inline uint32_t
+to_softclip(const uint32_t x) {
+  return (x & ~BAM_CIGAR_MASK) | BAM_CSOFT_CLIP;
+}
+
+static inline void
+fix_external_insertion(const size_t n_cigar, uint32_t *cigar) {
+  if (n_cigar < 2) return;
+
+  auto c_itr = cigar;
+  const auto c_end = c_itr + n_cigar;
+
+  for (; !eats_ref(*c_itr) && c_itr != c_end; ++c_itr)
+    *c_itr = to_softclip(*c_itr);
+
+  if (c_itr == c_end) throw dnmt_error("cigar eats no ref");
+
+  c_itr = cigar + n_cigar - 1;
+  for (; !eats_ref(*c_itr) && c_itr != cigar; --c_itr)
+    *c_itr = to_softclip(*c_itr);
+}
+
+static inline size_t
+merge_cigar_ops(const size_t n_cigar, uint32_t *cigar) {
+  if (n_cigar < 2) return n_cigar;
+  auto c_itr1 = cigar;
+  auto c_end = c_itr1 + n_cigar;
+  auto c_itr2 = c_itr1 + 1;
+  auto op1 = bam_cigar_op(*c_itr1);
+  while (c_itr2 != c_end) {
+    auto op2 = bam_cigar_op(*c_itr2);
+    if (op1 == op2) {
+      *c_itr1 = bam_cigar_gen(bam_cigar_oplen(*c_itr1) +
+                                  bam_cigar_oplen(*c_itr2), op1);
+    }
+    else {
+      *(++c_itr1) = *c_itr2;
+      op1 = op2;
+    }
+    ++c_itr2;
+  }
+  // another increment to move past final "active" element for c_itr1
+  ++c_itr1;
+  return std::distance(cigar, c_itr1);
+}
+
+static inline size_t
+correct_cigar(bam1_t *b) {
+  /* This function will change external insertions into soft clip
+     operations. Not sure why those would be present. It will also
+     change internal soft-clip operations into insertions. This could
+     be needed if soft-clipped ends of reads were moved to the middle
+     of a merged fragment. Finally, it will collapse adjacent
+     identical operations. None of this impacts the seq/qual/aux which
+     get moved as a block */
+
+  uint32_t *cigar = bam_get_cigar(b);
+  size_t n_cigar = b->core.n_cigar;
+  fix_external_insertion(n_cigar, cigar);
+  fix_internal_softclip(n_cigar, cigar);
+
+  // merge identical adjacent cigar ops and get new number of ops
+  n_cigar = merge_cigar_ops(n_cigar, cigar);
+  // difference in bytes to shift the internal data
+  const size_t delta = (b->core.n_cigar - n_cigar) * sizeof(uint32_t);
+  if (delta > 0) { // if there is a difference; do the shift
+    const auto data_end = b->data + b->l_data; // bam_get_aux(b) + bam_get_l_aux(b);
+    std::copy(bam_get_seq(b), data_end, bam_get_seq(b) - delta);
+    b->core.n_cigar = n_cigar; // and update number of cigar ops
+  }
+  return delta;
+}
+
+size_t correct_cigar(bam_rec &b) {
+  return correct_cigar(b.b);
+}
+
+static inline hts_pos_t
+get_rlen(const bam1_t *b) { // less tedious
+  return bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
+}
+
+static inline size_t
+get_l_qseq(const bam1_t *b) { return b->core.l_qseq; }
+
+static inline void
+complement_seq(char *first, char *last) {
+  for (; first != last; ++first) {
+    assert(valid_base(*first));
+    *first = complement(*first);
+  }
+}
+
+static inline void
+reverse(char *a, char *b) {
+  char *p1, *p2;
+  for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) {
+    *p1 ^= *p2;
+    *p2 ^= *p1;
+    *p1 ^= *p2;
+    assert(valid_base(*p1) && valid_base(*p2));
+  }
+}
+
+// return value is the number of cigar ops that are fully consumed in
+// order to read n_ref, while "partial_oplen" is the number of bases
+// that could be taken from the next operation, which might be merged
+// with the other read.
+static inline uint32_t
+get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops,
+                         const uint32_t n_ref_full, uint32_t *partial_oplen) {
+  // assume: n_ops <= size(cig_in) <= size(cig_out)
+  size_t rlen = 0;
+  uint32_t i = 0;
+  for (i = 0; i < in_ops; ++i) {
+    if (eats_ref(cig_in[i])) {
+      if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full)
+        break;
+      rlen += bam_cigar_oplen(cig_in[i]);
+    }
+  }
+  *partial_oplen = n_ref_full - rlen;
+  return i;
+}
+
+/* This table converts 2 bases packed in a byte to their reverse
+ * complement. The input is therefore a unit8_t representing 2 bases.
+ * It is assumed that the input uint8_t value is of form "xx" or "x-",
+ * where 'x' a 4-bit number representing either A, C, G, T, or N and
+ * '-' is 0000.  For example, the ouptut for "AG" is "CT". The format
+ * "x-" is often used at the end of an odd-length sequence.  The
+ * output of "A-" is "-T", and the output of "C-" is "-G", and so
+ * forth. The user must handle this case separately.
+ */
+const uint8_t byte_revcom_table[] = {
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248,
+  4, 132, 68, 0, 36, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 244,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  2, 130, 66, 0, 34, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 242,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  1, 129, 65, 0, 33, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 241,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
+ 15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 255
+};
+
+static inline void
+revcom_byte_then_reverse(unsigned char *a, unsigned char *b) {
+  unsigned char *p1, *p2;
+  for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) {
+    *p1 = byte_revcom_table[*p1];
+    *p2 = byte_revcom_table[*p2];
+    *p1 ^= *p2;
+    *p2 ^= *p1;
+    *p1 ^= *p2;
+  }
+  if (p1 == p2) *p1 = byte_revcom_table[*p1];
+}
+
+static inline void
+revcomp_seq_by_byte(bam1_t *aln) {
+  const size_t l_qseq = get_l_qseq(aln);
+  auto seq = bam_get_seq(aln);
+  const size_t num_bytes = ceil(l_qseq / 2.0);
+  auto seq_end = seq + num_bytes;
+  revcom_byte_then_reverse(seq, seq_end);
+  if (l_qseq % 2 == 1) { // for odd-length sequences
+    for (size_t i = 0; i < num_bytes - 1; i++) {
+      // swap 4-bit chunks within consecutive bytes like this:
+      // (----aaaa bbbbcccc dddd....) => (aaaabbbb ccccdddd ....)
+      seq[i] = (seq[i] << 4) | (seq[i + 1] >> 4);
+    }
+    seq[num_bytes - 1] <<= 4;
+  }
+}
+
+// places seq of b at the end of seq of c
+// assumes 0 < c_seq_len - b_seq_len <= a_seq_len
+// also assumes that c_seq_len has been figured out
+// Also assumes the number of bytes allocated to sequence potion of c->data
+// has been set to ceil((a_used_len + b_seq_len) / 2.0) where
+// a_used_len = c_seq_len - b_seq_len
+static inline void
+merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) {
+  // ADS: (todo) need some functions for int_ceil and is_odd
+  const size_t b_seq_len = get_l_qseq(b);
+  const size_t c_seq_len = get_l_qseq(c);
+  const size_t a_used_len = c_seq_len - b_seq_len;
+
+  const bool is_a_odd = a_used_len % 2 == 1;
+  const bool is_b_odd = b_seq_len % 2 == 1;
+  const bool is_c_odd = c_seq_len % 2 == 1;
+
+  const size_t a_num_bytes = ceil(a_used_len / 2.0);
+  const size_t b_num_bytes = ceil(b_seq_len / 2.0);
+
+  const size_t b_offset = is_a_odd && is_b_odd;
+
+  const auto a_seq = bam_get_seq(a);
+  const auto b_seq = bam_get_seq(b);
+  auto c_seq = bam_get_seq(c);
+
+  std::copy_n(a_seq, a_num_bytes, c_seq);
+  if (is_a_odd) {
+    // c_seq looks like either [ aa aa aa aa ]
+    //                      or [ aa aa aa a- ]
+    c_seq[a_num_bytes - 1] &= 0xf0;
+    c_seq[a_num_bytes - 1] |= is_b_odd ?
+        byte_revcom_table[b_seq[b_num_bytes - 1]] :
+        byte_revcom_table[b_seq[b_num_bytes - 1]] >> 4;
+  }
+  if (is_c_odd) {
+    // c_seq looks like either [ aa aa aa aa ]
+    //                      or [ aa aa aa ab ]
+    for (size_t i = 0; i < b_num_bytes - 1; i++) {
+      c_seq[a_num_bytes + i] =
+          (byte_revcom_table[b_seq[b_num_bytes - i - 1]] << 4) |
+          (byte_revcom_table[b_seq[b_num_bytes - i - 2]] >> 4);
+    }
+    c_seq[a_num_bytes + b_num_bytes - 1] = byte_revcom_table[b_seq[0]] << 4;
+    // Here, c_seq is either [ aa aa aa aa bb bb bb b- ] (a even; b odd)
+    //                    or [ aa aa aa ab bb bb bb b- ] (a odd; b odd)
+  }
+  else {
+    for (size_t i = 0; i < b_num_bytes - b_offset; i++) {
+      c_seq[a_num_bytes + i] =
+          byte_revcom_table[b_seq[b_num_bytes - i - 1 - b_offset]];
+    }
+    // Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even)
+    //                    or [ aa aa aa ab bb bb bb    ] (a odd and b odd)
+  }
+}
+
+static inline void
+flip_conversion(bam1_t *aln) {
+  if (aln->core.flag & BAM_FREVERSE)
+    aln->core.flag = aln->core.flag & (~BAM_FREVERSE);
+  else
+    aln->core.flag = aln->core.flag | BAM_FREVERSE;
+
+  revcomp_seq_by_byte(aln);
+
+  // ADS: don't like *(cv + 1) below, but no HTSlib function for it?
+  uint8_t *cv = bam_aux_get(aln, "CV");
+  if (!cv) throw dnmt_error("bam_aux_get failed for CV");
+  *(cv + 1) = 'T';
+}
+
+void
+flip_conversion(bam_rec &aln) { flip_conversion(aln.b); }
+
+static inline bool
+are_mates(const bam1_t *one, const bam1_t *two) {
+  return one->core.mtid == two->core.tid &&
+    one->core.mpos == two->core.pos &&
+    (one->core.flag & BAM_FREVERSE) != (one->core.flag & BAM_FREVERSE);
+  // below is a consistency check and should not be necessary
+  /* &&
+     two->core.mtid == one->core.tid &&
+     two->core.mpos == one->core.pos; */
+}
+
+static inline int
+truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) {
+
+  const uint32_t *a_cig = bam_get_cigar(a);
+  const uint32_t a_ops = a->core.n_cigar;
+
+  uint32_t part_op = 0;
+  const uint32_t c_cur =
+    get_full_and_partial_ops(a_cig, a_ops, overlap, &part_op);
+
+  // ADS: hack here because the get_full_and_partial_ops doesn't do
+  // exactly what is needed for this.
+  const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0);
+
+  const uint32_t c_ops = c_cur + use_partial;
+  vector<uint32_t> c_cig(c_ops, 0);
+
+  // ADS: replace this with a std::copy
+  auto c_cig_itr = std::copy(a_cig, a_cig + c_cur, begin(c_cig));
+  // ADS: warning, if (use_partial==false), then the amount of part_op
+  // used below would make no sense.
+  if (use_partial)
+    *c_cig_itr = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur]));
+  /* after this point the cigar is set and should decide everything */
+
+  const uint32_t c_seq_len = bam_cigar2qlen(c_ops, c_cig.data());
+  const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data());
+
+  // flag only needs to worry about strand and single-end stuff
+  const uint16_t flag =
+      a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE);
+
+  const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1);
+  int ret = bam_set1_wrapper(c,
+                             a_qname_len,
+                             bam_get_qname(a),
+                             flag, // flags (SR and revcomp info)
+                             a->core.tid,
+                             a->core.pos,
+                             a->core.qual,
+                             c_ops,         // merged cigar ops
+                             c_cig.data(),  // merged cigar
+                             -1,            // (no mate)
+                             -1,            // (no mate)
+                             isize,         // rlen from new cigar
+                             c_seq_len,     // truncated seq length
+                             8);            // enough for the 2 tags?
+  if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper");
+
+  const size_t n_bytes_to_copy = (c_seq_len + 1)/2;  // compression
+  std::copy_n(bam_get_seq(a), n_bytes_to_copy, bam_get_seq(c));
+
+  /* add the tags */
+  const int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better here!
+  // "_udpate" for "int" because it determines the right size
+  ret = bam_aux_update_int(c, "NM", nm);
+  if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int");
+
+  const uint8_t conversion = bam_aux2A(bam_aux_get(a, "CV"));
+  // "_append" for "char" because there is no corresponding update
+  ret = bam_aux_append(c, "CV", 'A', 1, &conversion);
+  if (ret < 0) throw dnmt_error(ret, "bam_aux_append");
+
+  return ret;
+}
+
+int
+truncate_overlap(const bam_rec &a, const uint32_t overlap, bam_rec &c) {
+  return truncate_overlap(a.b, overlap, c.b);
+}
+
+int
+merge_overlap(const bam1_t *a, const bam1_t *b,
+              const uint32_t head, bam1_t *c) {
+  assert(head > 0);
+
+  const uint32_t *a_cig = bam_get_cigar(a);
+  const uint32_t a_ops = a->core.n_cigar;
+
+  const uint32_t *b_cig = bam_get_cigar(b);
+  const uint32_t b_ops = b->core.n_cigar;
+
+  uint32_t part_op = 0;
+  uint32_t c_cur = get_full_and_partial_ops(a_cig, a_ops, head, &part_op);
+  // ADS: hack here because the get_full_and_partial_ops doesn't do
+  // exactly what is needed for this.
+  const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0);
+
+  // check if the middle op would be the same
+  const bool merge_mid =
+      (use_partial > 0 ?
+      bam_cigar_op(a_cig[c_cur]) == bam_cigar_op(b_cig[0]) :
+      bam_cigar_op(a_cig[c_cur - 1]) == bam_cigar_op(b_cig[0]));
+
+  // c_ops: include the prefix of a_cig we need; then add for the
+  // partial op; subtract for the identical op in the middle; finally
+  // add the rest of b_cig.
+  const uint32_t c_ops = c_cur + use_partial - merge_mid + b_ops;
+  vector<uint32_t> c_cig(c_ops, 0);
+  std::copy(a_cig, a_cig + c_cur, begin(c_cig));
+
+  if (use_partial) {
+    c_cig[c_cur] = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur]));
+    c_cur++; // index of dest for copying b_cig; faciltates corner case
+  }
+  // Here we get the length of a's sequence part contribution to c's
+  // sequence before the possibility of merging the last entry with
+  // the first entry in b's cigar. This is done with the cigar, so
+  // everything depends on the "use_partial"
+  const size_t a_seq_len = bam_cigar2qlen(c_cur, c_cig.data());
+  /* ADS: above the return type of bam_cigar2qlen is uint64_t, but
+     according to the source as of 05/2023 it cannot become
+     negative; no possible error code returned */
+
+  if (merge_mid) // update the middle op if it's the same
+    c_cig[c_cur - 1] = bam_cigar_gen(bam_cigar_oplen(c_cig[c_cur - 1]) +
+                                     bam_cigar_oplen(b_cig[0]),
+                                     bam_cigar_op(b_cig[0]));
+  // copy the cigar from b into c
+  std::copy(b_cig + merge_mid, b_cig + b_ops, begin(c_cig) + c_cur);
+  /* done with cigar here */
+
+  const uint32_t c_seq_len = a_seq_len + b->core.l_qseq;
+
+  // get the template length
+  const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data());
+
+  // flag only needs to worry about strand and single-end stuff
+  const uint16_t flag = a->core.flag & (BAM_FREAD1 |
+                                        BAM_FREAD2 |
+                                        BAM_FREVERSE);
+
+  const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1);
+  int ret = bam_set1_wrapper(c,
+                             a_qname_len,
+                             bam_get_qname(a),
+                             flag, // (no PE; revcomp info)
+                             a->core.tid,
+                             a->core.pos,
+                             a->core.qual, // mapq from "a" (consider update)
+                             c_ops,        // merged cigar ops
+                             c_cig.data(), // merged cigar
+                             -1,           // (no mate)
+                             -1,           // (no mate)
+                             isize,        // updated
+                             c_seq_len,    // merged sequence length
+                             8);           // enough for 2 tags?
+  if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper in merge_overlap");
+  // Merge the sequences by bytes
+  merge_by_byte(a, b, c);
+
+  // add the tag for mismatches
+  const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) +
+                      bam_aux2i(bam_aux_get(b, "NM")));
+  ret = bam_aux_update_int(c, "NM", nm);
+  if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int in merge_overlap");
+
+  // add the tag for conversion
+  const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV"));
+  ret = bam_aux_append(c, "CV", 'A', 1, &cv);
+  if (ret < 0) throw dnmt_error(ret, "bam_aux_append in merge_overlap");
+
+  return ret;
+}
+
+int
+merge_overlap(const bam_rec &a, const bam_rec &b,
+              const uint32_t head, bam_rec &c) {
+  return merge_overlap(a.b, b.b, head, c.b);
+}
+
+static inline int
+merge_non_overlap(const bam1_t *a, const bam1_t *b,
+                  const uint32_t spacer, bam1_t *c) {
+  /* make the cigar string */
+  // collect info about the cigar strings
+  const uint32_t *a_cig = bam_get_cigar(a);
+  const uint32_t a_ops = a->core.n_cigar;
+  const uint32_t *b_cig = bam_get_cigar(b);
+  const uint32_t b_ops = b->core.n_cigar;
+
+  // allocate the new cigar string
+  const uint32_t c_ops = a_ops + b_ops + 1;
+  vector<uint32_t> c_cig(c_ops, 0);
+
+  // concatenate the new cigar strings with a "skip" in the middle
+  auto c_cig_itr = std::copy(a_cig, a_cig + a_ops, begin(c_cig));
+  *c_cig_itr++ = bam_cigar_gen(spacer, BAM_CREF_SKIP);
+  std::copy(b_cig, b_cig + b_ops, c_cig_itr);
+  /* done with cigars */
+
+  const size_t a_seq_len = get_l_qseq(a);
+  const size_t b_seq_len = get_l_qseq(b);
+  const size_t c_seq_len = a_seq_len + b_seq_len;
+
+  // get the template length from the cigar
+  const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data());
+
+  // flag: only need to keep strand and single-end info
+  const uint16_t flag = a->core.flag & (BAM_FREAD1 |
+                                        BAM_FREAD2 |
+                                        BAM_FREVERSE);
+
+  const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1);
+  int ret = bam_set1_wrapper(c,
+                             a_qname_len,
+                             bam_get_qname(a),
+                             flag, // flags (no PE; revcomp info)
+                             a->core.tid,
+                             a->core.pos,
+                             a->core.qual, // mapq from a (consider update)
+                             c_ops,        // merged cigar ops
+                             c_cig.data(), // merged cigar
+                             -1,           // (no mate)
+                             -1,           // (no mate)
+                             isize,        // TLEN (relative to reference; SAM docs)
+                             c_seq_len,    // merged sequence length
+                             8);           // enough for 2 tags of 1 byte value?
+  if (ret < 0) throw dnmt_error(ret, "bam_set1 in merge_non_overlap");
+
+  merge_by_byte(a, b, c);
+
+  /* add the tags */
+  const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) +
+                      bam_aux2i(bam_aux_get(b, "NM")));
+  // "udpate" for "int" because it determines the right size
+  ret = bam_aux_update_int(c, "NM", nm);
+  if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_update_int");
+
+  const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV"));
+  // "append" for "char" because there is no corresponding update
+  ret = bam_aux_append(c, "CV", 'A', 1, &cv);
+  if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_append");
+
+  return ret;
+}
+
+int
+merge_non_overlap(const bam_rec &a, const bam_rec &b,
+                  const uint32_t spacer, bam_rec &c) {
+  return merge_non_overlap(a.b, b.b, spacer, c.b);
+}
+
+static inline int
+keep_better_end(const bam1_t *a, const bam1_t *b, bam1_t *c) {
+  const auto a_rl = get_rlen(a);
+  const auto b_rl = get_rlen(b);
+  const auto c_rl = std::max(a_rl, b_rl);
+  c = bam_copy1(c, a_rl >= b_rl ? a : b);
+  c->core.mtid = -1;
+  c->core.mpos = -1;
+  c->core.isize = c_rl;
+  c->core.flag &= (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE);
+  return 0;
+}
+
+int
+keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c) {
+  return keep_better_end(a.b, b.b, c.b);
+}
+
+// ADS: will move to using this function once it is written
+static inline void
+standardize_format(const string &input_format, bam1_t *aln) {
+  int err_code = 0;
+
+  if (input_format == "abismal" || input_format == "walt") return;
+
+  if (input_format == "bsmap") {
+    // A/T rich; get the ZS tag value
+    auto zs_tag = bam_aux_get(aln, "ZS");
+    if (!zs_tag) throw dnmt_error("bam_aux_get for ZS (invalid bsmap)");
+    // ADS: test for errors on the line below
+    const uint8_t cv = string(bam_aux2Z(zs_tag))[1] == '-' ? 'A' : 'T';
+    // get the "mismatches" tag
+    auto nm_tag = bam_aux_get(aln, "NM");
+    if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bsmap)");
+    const int64_t nm = bam_aux2i(nm_tag);
+
+    aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize)
+
+    /* add the tags we want */
+    // "udpate" for "int" because it determines the right size; even
+    // though we just deleted all tags, it will add it back here.
+    err_code = bam_aux_update_int(aln, "NM", nm);
+    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int");
+    // "append" for "char" because there is no corresponding update
+    err_code = bam_aux_append(aln, "CV", 'A', 1, &cv);
+    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append");
+
+    if (bam_is_rev(aln))
+      revcomp_seq_by_byte(aln); // reverse complement if needed
+  }
+  if (input_format == "bismark") {
+    // ADS: Previously we modified the read names at the first
+    // underscore. Even if the names are still that way, it should no
+    // longer be needed since we compare names up to a learned suffix.
+
+    // A/T rich; get the XR tag value
+    auto xr_tag = bam_aux_get(aln, "XR");
+    if (!xr_tag) throw dnmt_error("bam_aux_get for XR (invalid bismark)");
+    const uint8_t cv = string(bam_aux2Z(xr_tag)) == "GA" ? 'A' : 'T';
+    // get the "mismatches" tag
+    auto nm_tag = bam_aux_get(aln, "NM");
+    if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bismark)");
+    const int64_t nm = bam_aux2i(nm_tag);
+
+    aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize)
+
+    /* add the tags we want */
+    // "udpate" for "int" because it determines the right size; even
+    // though we just deleted all tags, it will add it back here.
+    err_code = bam_aux_update_int(aln, "NM", nm);
+    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int");
+    // "append" for "char" because there is no corresponding update
+    err_code = bam_aux_append(aln, "CV", 'A', 1, &cv);
+    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append");
+
+    if (bam_is_rev(aln))
+      revcomp_seq_by_byte(aln); // reverse complement if needed
+  }
+
+  // Be sure this doesn't depend on mapper! Removes the "qual" part of
+  // the data in a bam1_t struct but does not change its uncompressed
+  // size.
+  const auto qs = bam_get_qual(aln);
+  std::fill(qs, qs + aln->core.l_qseq, '\xff'); // overwrites qseq
+}
+
+void
+standardize_format(const string &input_format, bam_rec &aln) {
+  standardize_format(input_format, aln.b);
+}
diff --git a/src/common/bam_record_utils.hpp b/src/common/bam_record_utils.hpp
new file mode 100644
index 00000000..90291b99
--- /dev/null
+++ b/src/common/bam_record_utils.hpp
@@ -0,0 +1,184 @@
+/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith
+ *
+ * Authors: Masaru Nakajima and Andrew D. Smith
+ *
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef BAM_RECORD_UTILS_HPP
+#define BAM_RECORD_UTILS_HPP
+
+#include "bam_record.hpp"
+
+#include <string>
+
+#ifdef bam_is_rev
+#undef bam_is_rev
+#endif
+
+inline bool
+bam_is_rev(const bam_rec &b) { return (b.b->core.flag & BAM_FREVERSE) != 0; }
+
+#ifdef bam_is_mrev
+#undef bam_is_mrev
+#endif
+
+inline bool
+bam_is_mrev(const bam_rec &b) { return (b.b->core.flag & BAM_FMREVERSE) != 0; }
+
+#ifdef bam_get_qname
+#undef bam_get_qname
+#endif
+
+inline char *
+bam_get_qname(const bam_rec &b) { return reinterpret_cast<char*>(b.b->data); }
+
+#ifdef bam_get_cigar
+#undef bam_get_cigar
+#endif
+
+inline uint32_t *
+bam_get_cigar(const bam_rec &b) {
+  // start of data + bytes for query/read name
+  return reinterpret_cast<uint32_t*>(b.b->data + b.b->core.l_qname);
+}
+
+#ifdef bam_get_seq
+#undef bam_get_seq
+#endif
+
+inline uint8_t *
+bam_get_seq(const bam_rec &b) {
+  // start of data + bytes for cigar + bytes for query/read name
+  return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2);
+}
+
+#ifdef bam_get_qual
+#undef bam_get_qual
+#endif
+
+inline uint8_t *
+bam_get_qual(const bam_rec &b) {
+  return b.b->data +  // start of data
+    b.b->core.l_qname +  // bytes for query name
+    (b.b->core.n_cigar << 2) +  // bytes for cigar
+    ((b.b->core.l_qseq + 1) >> 1);  // bytes for packed query/read
+}
+
+#ifdef bam_get_aux
+#undef bam_get_aux
+#endif
+
+inline uint8_t *
+bam_get_aux(const bam_rec &b) {
+  return b.b->data +
+    b.b->core.l_qname +
+    (b.b->core.n_cigar << 2) +
+    ((b.b->core.l_qseq + 1) >> 1) +
+    b.b->core.l_qseq;
+}
+
+#ifdef bam_get_l_aux
+#undef bam_get_l_aux
+#endif
+
+inline int
+bam_get_l_aux(const bam_rec &b) {
+  return b.b->l_data -
+    (b.b->core.l_qname +
+     (b.b->core.n_cigar << 2) +
+     ((b.b->core.l_qseq + 1) >> 1) +
+     b.b->core.l_qseq);
+}
+
+inline bool
+bam_same_orientation(const bam_rec &a, const bam_rec &b) {
+  return ((a.b->core.flag ^ b.b->core.flag) & BAM_FREVERSE) != 0;
+}
+
+int truncate_overlap(const bam_rec &a, const uint32_t overlap, bam_rec &c);
+
+int merge_overlap(const bam_rec &a, const bam_rec &b,
+                  const uint32_t head, bam_rec &c);
+
+int merge_non_overlap(const bam_rec &a, const bam_rec &b,
+                      const uint32_t spacer, bam_rec &c);
+
+int keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c);
+
+size_t correct_cigar(bam_rec &b);
+
+void flip_conversion(bam_rec &aln);
+
+inline bool
+is_a_rich(const bam_rec &b) { return bam_aux2A(bam_aux_get(b.b, "CV")) == 'A'; }
+
+void
+standardize_format(const std::string &input_format, bam_rec &aln);
+
+inline bool
+are_mates(const bam_rec &one, const bam_rec &two) {
+  return one.b->core.mtid == two.b->core.tid &&
+    one.b->core.mpos == two.b->core.pos &&
+    bam_same_orientation(one, two);
+  // below is a consistency check and should not be necessary
+  /* &&
+     two->core.mtid == one->core.tid &&
+     two->core.mpos == one->core.pos; */
+}
+
+inline size_t
+get_l_qseq(const bam_rec &b) { return b.b->core.l_qseq; }
+
+inline size_t
+get_n_targets(const bam_header &bh) { return bh.h->n_targets; }
+
+inline std::string
+get_qname(const bam_rec &b) { return bam_get_qname(b); }
+
+inline int32_t
+get_tid(const bam_rec &b) { return b.b->core.tid; }
+
+inline hts_pos_t
+get_pos(const bam_rec &b) { return b.b->core.pos; }
+
+inline hts_pos_t
+get_endpos(const bam_rec &b) { return bam_endpos(b.b); }
+
+inline bool
+precedes_by_start(const bam_rec &a, const bam_rec &b) {
+  // assumes a.get_tid() <= b.get_tid()
+  return get_tid(a) == get_tid(b) && get_pos(a) < get_pos(b);
+}
+
+inline bool
+precedes_by_end_and_strand(const bam_rec &a, const bam_rec &b) {
+  const auto end_a = bam_endpos(a.b);
+  const auto end_b = bam_endpos(b.b);
+  return end_a < end_b || (end_a == end_b && bam_is_rev(a) < bam_is_rev(b));
+}
+
+inline bool
+equivalent_chrom_and_start(const bam_rec &a, const bam_rec &b) {
+  return a.b->core.pos == b.b->core.pos && a.b->core.tid == b.b->core.tid;
+}
+
+inline bool
+equivalent_end_and_strand(const bam_rec &a, const bam_rec &b) {
+  return bam_endpos(a.b) == bam_endpos(b.b) && bam_is_rev(a) == bam_is_rev(b);
+}
+
+template <typename T> int
+bam_aux_update_int(bam_rec &b, const char tag[2], T val) {
+  return bam_aux_update_int(b.b, tag, val);
+}
+
+#endif
diff --git a/src/common/dnmt_error.hpp b/src/common/dnmt_error.hpp
index 8090ebb6..38a714c1 100644
--- a/src/common/dnmt_error.hpp
+++ b/src/common/dnmt_error.hpp
@@ -20,6 +20,7 @@
 #include <string>
 #include <cstring>
 #include <cstdint> // for int64_t
+#include <sstream>
 
 struct dnmt_error: public std::exception {
   int64_t err;        // error possibly from HTSlib
diff --git a/src/utils/format-reads.cpp b/src/utils/format-reads.cpp
index d82b6d1b..73d82871 100644
--- a/src/utils/format-reads.cpp
+++ b/src/utils/format-reads.cpp
@@ -40,10 +40,6 @@
 
 #include <config.h>
 
-// from HTSlib
-#include <htslib/sam.h>
-#include <htslib/thread_pool.h>
-
 // from smithlab
 #include "OptionParser.hpp"
 #include "smithlab_utils.hpp"
@@ -51,6 +47,7 @@
 
 // from dnmtools
 #include "dnmt_error.hpp"
+#include "bam_record_utils.hpp"
 
 using std::string;
 using std::vector;
@@ -58,716 +55,19 @@ using std::cerr;
 using std::endl;
 using std::string;
 using std::vector;
-
-
-
-
-static inline void
-roundup_to_power_of_2(uint32_t &x) {
-  bool k_high_bit_set = (x >> (sizeof(uint32_t) * 8 - 1)) & 1;
-  if (x > 0) {
-    uint8_t size = sizeof(uint32_t);
-    --x;
-    x |= x >> (size / 4);
-    x |= x >> (size / 2);
-    x |= x >> (size);
-    x |= x >> (size * 2);
-    x |= x >> (size * 4);
-    x += !k_high_bit_set;
-  }
-  else x = 0;
-}
-
-static int
-sam_realloc_bam_data(bam1_t *b, size_t desired) {
-  /* returns flag: either 0 for success or -1 for error (unable to
-     allocate desired memory) */
-  uint32_t new_m_data = desired;
-  roundup_to_power_of_2(new_m_data);
-  if (new_m_data < desired)  return -1;
-  uint8_t *new_data = (uint8_t *)realloc(b->data, new_m_data);
-  if (!new_data) return -1;
-  // ADS: what would be the state of members below if -1 was returned?
-  b->data = new_data;
-  b->m_data = new_m_data;
-  return 0;
-}
-
-static inline void
-bam_copy_core(const bam1_t *a, bam1_t *b) {
-  /* ADS: prepared for a possibly more efficient block copy to assign
-     all variables at once */
-  b->core.pos = a->core.pos;
-  b->core.tid = a->core.tid;
-  b->core.bin = a->core.bin;
-  b->core.qual = a->core.qual;
-  b->core.l_extranul = a->core.l_extranul;
-  b->core.flag = a->core.flag;
-  b->core.l_qname = a->core.l_qname;
-  b->core.n_cigar = a->core.n_cigar;
-  b->core.l_qseq = a->core.l_qseq;
-  b->core.mtid = a->core.mtid;
-  b->core.mpos = a->core.mpos;
-  b->core.isize = a->core.isize;
-}
-
-static inline void
-bam_set1_core(bam1_core_t &core,
-              const size_t l_qname, const uint16_t flag, const int32_t tid,
-              const hts_pos_t pos, const uint8_t mapq, const size_t n_cigar,
-              const int32_t mtid, const hts_pos_t mpos, const hts_pos_t isize,
-              const size_t l_seq, const size_t qname_nuls) {
-  /* ADS: These are used in `hts_reg2bin` from `htslib/hts.h` and
-     likely mean "region to bin" for indexing */
-  /* MN: hts_reg2bin categorizes the size of the reference region.
-     Here, we use the numbers used in htslib/cram/cram_samtools.h */
-  static const int min_shift = 14;
-  static const int n_lvls = 5;
-
-  core.pos = pos;
-  core.tid = tid;
-  core.bin = hts_reg2bin(pos, pos + isize, min_shift, n_lvls);
-  core.qual = mapq;
-  core.l_extranul = qname_nuls - 1;
-  core.flag = flag;
-  core.l_qname = l_qname + qname_nuls;
-  core.n_cigar = n_cigar;
-  core.l_qseq = l_seq;
-  core.mtid = mtid;
-  core.mpos = mpos;
-  core.isize = isize;
-}
-
-static int
-bam_set1_wrapper(bam1_t *bam,
-                 const size_t l_qname, const char *qname,
-                 const uint16_t flag, const int32_t tid,
-                 const hts_pos_t pos, const uint8_t mapq,
-                 const size_t n_cigar, const uint32_t *cigar,
-                 const int32_t mtid, const hts_pos_t mpos,
-                 const hts_pos_t isize, const size_t l_seq,
-                 const size_t l_aux) {
-  /* This is based on how assignment is done in the `bam_set1`
-     function defined in `sam.c` from htslib */
-
-  /*
-   * This modification assigns variables of bam1_t struct but not the sequence.
-   *
-   * Many checks have been removed because they are checked in code
-   * that calls this function, mostly because they already come from a
-   * valid `bam1_t` struct and so the values have been individually
-   * validated.
-   *
-   * Assumptions:
-   * cigar has been computed and is in the right format
-   * rlen = isize
-   * qlen = l_seq
-   * l_qname <= 254
-   * HTS_POS_MAX - rlen > pos
-   * Where HTS_POS_MAX = ((((int64_t)INT_MAX)<<32)|INT_MAX) is the highest
-   * supported position.
-   *
-   * Number of bytes needed for the data is smaller than INT32_MAX
-   *
-   * qual = NULL, because we do not keep the quality scores through
-   * formatting the reads.
-   */
-
-  // `qname_nuls` below is the number of '\0' to use to pad the qname
-  // so that the cigar has 4-byte alignment.
-  const size_t qname_nuls = 4 - l_qname % 4;
-  bam_set1_core(bam->core, l_qname, flag, tid, pos, mapq, n_cigar,
-                mtid, mpos, isize, l_seq, qname_nuls);
-
-  const size_t data_len =
-    (l_qname + qname_nuls + n_cigar*sizeof(uint32_t) + (l_seq + 1) / 2 + l_seq);
-
-  bam->l_data = data_len;
-  if (data_len + l_aux > bam->m_data) {
-    const int ret = sam_realloc_bam_data(bam, data_len + l_aux);
-    if (ret < 0) {
-      throw dnmt_error(ret, "Failed to allocate memory for BAM record");
-    }
-  }
-  auto data_iter = bam->data;
-
-  std::copy_n(qname, l_qname, data_iter);
-  std::fill_n(data_iter + l_qname, qname_nuls, '\0');
-  data_iter += l_qname + qname_nuls;
-
-  // ADS: reinterpret here because we know the cigar is originally an
-  // array of uint32_t and has been aligned for efficiency
-  std::copy_n(cigar, n_cigar, reinterpret_cast<uint32_t *>(data_iter));
-  data_iter += n_cigar * sizeof(uint32_t);
-
-  // skipping sequece assignment
-  data_iter += (l_seq + 1) / 2;
-
-  std::fill(data_iter, data_iter + l_seq, '\xff');
-
-  return static_cast<int>(data_len);
-}
-
-static inline bool
-eats_ref(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; }
-
-static inline bool
-eats_query(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; }
-
-static inline size_t
-bam_get_n_cigar(const bam1_t *b) { return b->core.n_cigar; }
-
-static inline uint32_t
-to_insertion(const uint32_t x) {
-  return (x & ~BAM_CIGAR_MASK) | BAM_CINS;
-}
-
-static void
-fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) {
-  if (n_cigar < 3) return;
-  // find first non-softclip
-  auto c_beg = cigar;
-  auto c_end = cigar + n_cigar;
-
-  while (!eats_ref(*c_beg) && ++c_beg != c_end);
-  if (c_beg == c_end) throw dnmt_error("cigar eats no ref");
-
-  while (!eats_ref(*(c_end-1)) && --c_end != c_beg);
-  if (c_beg == c_end) throw dnmt_error("cigar eats no ref");
-
-  for (auto c_itr = c_beg; c_itr != c_end; ++c_itr)
-    if (bam_cigar_op(*c_itr) == BAM_CSOFT_CLIP)
-      *c_itr = to_insertion(*c_itr);
-}
-
-static inline uint32_t
-to_softclip(const uint32_t x) {
-  return (x & ~BAM_CIGAR_MASK) | BAM_CSOFT_CLIP;
-}
-
-static void
-fix_external_insertion(const size_t n_cigar, uint32_t *cigar) {
-  if (n_cigar < 2) return;
-
-  auto c_itr = cigar;
-  const auto c_end = c_itr + n_cigar;
-
-  for (; !eats_ref(*c_itr) && c_itr != c_end; ++c_itr)
-    *c_itr = to_softclip(*c_itr);
-
-  if (c_itr == c_end) throw dnmt_error("cigar eats no ref");
-
-  c_itr = cigar + n_cigar - 1;
-  for (; !eats_ref(*c_itr) && c_itr != cigar; --c_itr)
-    *c_itr = to_softclip(*c_itr);
-}
-
-static size_t
-merge_cigar_ops(const size_t n_cigar, uint32_t *cigar) {
-  if (n_cigar < 2) return n_cigar;
-  auto c_itr1 = cigar;
-  auto c_end = c_itr1 + n_cigar;
-  auto c_itr2 = c_itr1 + 1;
-  auto op1 = bam_cigar_op(*c_itr1);
-  while (c_itr2 != c_end) {
-    auto op2 = bam_cigar_op(*c_itr2);
-    if (op1 == op2) {
-      *c_itr1 = bam_cigar_gen(bam_cigar_oplen(*c_itr1) +
-                                  bam_cigar_oplen(*c_itr2), op1);
-    }
-    else {
-      *(++c_itr1) = *c_itr2;
-      op1 = op2;
-    }
-    ++c_itr2;
-  }
-  // another increment to move past final "active" element for c_itr1
-  ++c_itr1;
-  return std::distance(cigar, c_itr1);
-}
-
-static size_t
-correct_cigar(bam1_t *b) {
-  /* This function will change external insertions into soft clip
-     operations. Not sure why those would be present. It will also
-     change internal soft-clip operations into insertions. This could
-     be needed if soft-clipped ends of reads were moved to the middle
-     of a merged fragment. Finally, it will collapse adjacent
-     identical operations. None of this impacts the seq/qual/aux which
-     get moved as a block */
-
-  uint32_t *cigar = bam_get_cigar(b);
-  size_t n_cigar = b->core.n_cigar;
-  fix_external_insertion(n_cigar, cigar);
-  fix_internal_softclip(n_cigar, cigar);
-
-  // merge identical adjacent cigar ops and get new number of ops
-  n_cigar = merge_cigar_ops(n_cigar, cigar);
-  // difference in bytes to shift the internal data
-  const size_t delta = (b->core.n_cigar - n_cigar) * sizeof(uint32_t);
-  if (delta > 0) { // if there is a difference; do the shift
-    auto data_end = bam_get_aux(b) + bam_get_l_aux(b);
-    std::copy(bam_get_seq(b), data_end, bam_get_seq(b) - delta);
-    b->core.n_cigar = n_cigar; // and update number of cigar ops
-  }
-  return delta;
-}
-
-static inline size_t
-get_rlen(const bam1_t *b) { // less tedious
-  return bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
-}
-
-static inline size_t
-get_qlen(const bam1_t *b) { // less tedious
-  return b->core.l_qseq;
-}
-
-static inline void
-complement_seq(char *first, char *last) {
-  for (; first != last; ++first) {
-    assert(valid_base(*first));
-    *first = complement(*first);
-  }
-}
-
-static inline void
-reverse(char *a, char *b) {
-  char *p1, *p2;
-  for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) {
-    *p1 ^= *p2;
-    *p2 ^= *p1;
-    *p1 ^= *p2;
-    assert(valid_base(*p1) && valid_base(*p2));
-  }
-}
-
-
-// return value is the number of cigar ops that are fully consumed in
-// order to read n_ref, while "partial_oplen" is the number of bases
-// that could be taken from the next operation, which might be merged
-// with the other read.
-static uint32_t
-get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops,
-                         const uint32_t n_ref_full, uint32_t *partial_oplen) {
-  // assume: n_ops <= size(cig_in) <= size(cig_out)
-  size_t rlen = 0;
-  uint32_t i = 0;
-  for (i = 0; i < in_ops; ++i) {
-    if (eats_ref(cig_in[i])) {
-      if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full)
-        break;
-      rlen += bam_cigar_oplen(cig_in[i]);
-    }
-  }
-  *partial_oplen = n_ref_full - rlen;
-  return i;
-}
-
-
-/* This table converts 2 bases packed in a byte to their reverse
- * complement. The input is therefore a unit8_t representing 2 bases.
- * It is assumed that the input uint8_t value is of form "xx" or "x-",
- * where 'x' a 4-bit number representing either A, C, G, T, or N and
- * '-' is 0000.  For example, the ouptut for "AG" is "CT". The format
- * "x-" is often used at the end of an odd-length sequence.  The
- * output of "A-" is "-T", and the output of "C-" is "-G", and so
- * forth. The user must handle this case separately.
- */
-const uint8_t byte_revcom_table[] = {
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248,
-  4, 132, 68, 0, 36, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 244,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  2, 130, 66, 0, 34, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 242,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  1, 129, 65, 0, 33, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 241,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
-  0,   0,  0, 0,  0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0,   0,
- 15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 255
-};
-
-
-static inline void
-revcom_byte_then_reverse(unsigned char *a, unsigned char *b) {
-  unsigned char *p1, *p2;
-  for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) {
-    *p1 = byte_revcom_table[*p1];
-    *p2 = byte_revcom_table[*p2];
-    *p1 ^= *p2;
-    *p2 ^= *p1;
-    *p1 ^= *p2;
-  }
-  if (p1 == p2) *p1 = byte_revcom_table[*p1];
-}
-
-static void
-revcomp_seq_by_byte(bam1_t *aln) {
-  const size_t l_qseq = get_qlen(aln);
-  auto seq = bam_get_seq(aln);
-  const size_t num_bytes = ceil(l_qseq / 2.0);
-  auto seq_end = seq + num_bytes;
-  revcom_byte_then_reverse(seq, seq_end);
-  if (l_qseq % 2 == 1) { // for odd-length sequences
-    for (size_t i = 0; i < num_bytes - 1; i++) {
-      // swap 4-bit chunks within consecutive bytes like this:
-      // (----aaaa bbbbcccc dddd....) => (aaaabbbb ccccdddd ....)
-      seq[i] = (seq[i] << 4) | (seq[i + 1] >> 4);
-    }
-    seq[num_bytes - 1] <<= 4;
-  }
-}
-
-// places seq of b at the end of seq of c
-// assumes 0 < c_seq_len - b_seq_len <= a_seq_len
-// also assumes that c_seq_len has been figured out
-// Also assumes the number of bytes allocated to sequence potion of c->data
-// has been set to ceil((a_used_len + b_seq_len) / 2.0) where
-// a_used_len = c_seq_len - b_seq_len
-static void
-merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) {
-  // ADS: (todo) need some functions for int_ceil and is_odd
-  const size_t b_seq_len = get_qlen(b);
-  const size_t c_seq_len = get_qlen(c);
-  const size_t a_used_len = c_seq_len - b_seq_len;
-
-  const bool is_a_odd = a_used_len % 2 == 1;
-  const bool is_b_odd = b_seq_len % 2 == 1;
-  const bool is_c_odd = c_seq_len % 2 == 1;
-
-  const size_t a_num_bytes = ceil(a_used_len / 2.0);
-  const size_t b_num_bytes = ceil(b_seq_len / 2.0);
-
-  const size_t b_offset = is_a_odd && is_b_odd;
-
-  const auto a_seq = bam_get_seq(a);
-  const auto b_seq = bam_get_seq(b);
-  auto c_seq = bam_get_seq(c);
-
-  std::copy_n(a_seq, a_num_bytes, c_seq);
-  if (is_a_odd) {
-    // c_seq looks like either [ aa aa aa aa ]
-    //                      or [ aa aa aa a- ]
-    c_seq[a_num_bytes - 1] &= 0xf0;
-    c_seq[a_num_bytes - 1] |= is_b_odd ?
-        byte_revcom_table[b_seq[b_num_bytes - 1]] :
-        byte_revcom_table[b_seq[b_num_bytes - 1]] >> 4;
-  }
-  if (is_c_odd) {
-    // c_seq looks like either [ aa aa aa aa ]
-    //                      or [ aa aa aa ab ]
-    for (size_t i = 0; i < b_num_bytes - 1; i++) {
-      c_seq[a_num_bytes + i] =
-          (byte_revcom_table[b_seq[b_num_bytes - i - 1]] << 4) |
-          (byte_revcom_table[b_seq[b_num_bytes - i - 2]] >> 4);
-    }
-    c_seq[a_num_bytes + b_num_bytes - 1] = byte_revcom_table[b_seq[0]] << 4;
-    // Here, c_seq is either [ aa aa aa aa bb bb bb b- ] (a even; b odd)
-    //                    or [ aa aa aa ab bb bb bb b- ] (a odd; b odd)
-  }
-  else {
-    for (size_t i = 0; i < b_num_bytes - b_offset; i++) {
-      c_seq[a_num_bytes + i] =
-          byte_revcom_table[b_seq[b_num_bytes - i - 1 - b_offset]];
-    }
-    // Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even)
-    //                    or [ aa aa aa ab bb bb bb    ] (a odd and b odd)
-  }
-}
-
-static inline bool
-is_a_rich(const bam1_t *b) { return bam_aux2A(bam_aux_get(b, "CV")) == 'A'; }
-
-static inline bool
-format_is_bam_or_sam(htsFile *hts) {
-  const htsFormat *fmt = hts_get_format(hts);
-  return fmt->category == sequence_data &&
-         (fmt->format == bam || fmt->format == sam);
-}
-
-static void
-flip_conversion(bam1_t *aln) {
-  if (aln->core.flag & BAM_FREVERSE)
-    aln->core.flag = aln->core.flag & (~BAM_FREVERSE);
-  else
-    aln->core.flag = aln->core.flag | BAM_FREVERSE;
-
-  revcomp_seq_by_byte(aln);
-
-  // ADS: don't like *(cv + 1) below, but no HTSlib function for it?
-  uint8_t *cv = bam_aux_get(aln, "CV");
-  if (!cv) throw dnmt_error("bam_aux_get failed for CV");
-  *(cv + 1) = 'T';
-}
-
-static bool
-are_mates(const bam1_t *one, const bam1_t *two) {
-  return one->core.mtid == two->core.tid &&
-         one->core.mpos == two->core.pos &&
-         bam_is_rev(one) != bam_is_rev(two);
-  // below is a consistency check and should not be necessary
-  /* &&
-     two->core.mtid == one->core.tid &&
-     two->core.mpos == one->core.pos; */
-}
-
-static int
-truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) {
-
-  const uint32_t *a_cig = bam_get_cigar(a);
-  const uint32_t a_ops = a->core.n_cigar;
-
-  uint32_t part_op = 0;
-  const uint32_t c_cur =
-    get_full_and_partial_ops(a_cig, a_ops, overlap, &part_op);
-
-  // ADS: hack here because the get_full_and_partial_ops doesn't do
-  // exactly what is needed for this.
-  const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0);
-
-  const uint32_t c_ops = c_cur + use_partial;
-  uint32_t *c_cig = (uint32_t *)calloc(c_ops, sizeof(uint32_t));
-
-  // ADS: replace this with a std::copy
-  memcpy(c_cig, a_cig, c_cur * sizeof(uint32_t));
-  // ADS: warning, if !use_partial, the amount of part_op used below
-  // would make no sense.
-  if (use_partial)
-    c_cig[c_cur] = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur]));
-  /* after this point the cigar is set and should decide everything */
-
-  const uint32_t c_seq_len = bam_cigar2qlen(c_ops, c_cig);
-  const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig);
-
-  // flag only needs to worry about strand and single-end stuff
-  const uint16_t flag =
-      a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE);
-
-  int ret = bam_set1_wrapper(c,
-                             a->core.l_qname - (a->core.l_extranul + 1),
-                             bam_get_qname(a),
-                             flag, // flags (SR and revcomp info)
-                             a->core.tid,
-                             a->core.pos,
-                             a->core.qual,
-                             c_ops,     // merged cigar ops
-                             c_cig,     // merged cigar
-                             -1,        // (no mate)
-                             -1,        // (no mate)
-                             isize,     // rlen from new cigar
-                             c_seq_len, // truncated seq length
-                             8);        // enough for the 2 tags?
-  if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper");
-  // ADS: might it be better to fill `c->data` directly?
-  free(c_cig);
-
-  auto c_seq = bam_get_seq(c);
-  size_t num_bytes_to_copy = (c_seq_len + 1)/2;
-  std::copy_n(bam_get_seq(a), num_bytes_to_copy, c_seq);
-
-  /* add the tags */
-  const int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better here!
-  // "udpate" for "int" because it determines the right size
-  ret = bam_aux_update_int(c, "NM", nm);
-  if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int");
-
-  const uint8_t conversion = bam_aux2A(bam_aux_get(a, "CV"));
-  // "append" for "char" because there is no corresponding update
-  ret = bam_aux_append(c, "CV", 'A', 1, &conversion);
-  if (ret < 0) throw dnmt_error(ret, "bam_aux_append");
-
-  return ret;
-}
-
-static int
-merge_overlap(const bam1_t *a, const bam1_t *b,
-              const uint32_t head, bam1_t *c) {
-  assert(head > 0);
-
-  const uint32_t *a_cig = bam_get_cigar(a);
-  const uint32_t a_ops = a->core.n_cigar;
-
-  const uint32_t *b_cig = bam_get_cigar(b);
-  const uint32_t b_ops = b->core.n_cigar;
-
-  uint32_t part_op = 0;
-  uint32_t c_cur = get_full_and_partial_ops(a_cig, a_ops, head, &part_op);
-  // ADS: hack here because the get_full_and_partial_ops doesn't do
-  // exactly what is needed for this.
-  const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0);
-
-  // check if the middle op would be the same
-  const bool merge_mid =
-      (use_partial > 0 ?
-      bam_cigar_op(a_cig[c_cur]) == bam_cigar_op(b_cig[0]) :
-      bam_cigar_op(a_cig[c_cur - 1]) == bam_cigar_op(b_cig[0]));
-
-  // c_ops: include the prefix of a_cig we need; then add for the
-  // partial op; subtract for the identical op in the middle; finally
-  // add the rest of b_cig.
-  const uint32_t c_ops = c_cur + use_partial - merge_mid + b_ops;
-
-  uint32_t *c_cig = (uint32_t *)calloc(c_ops, sizeof(uint32_t));
-  // std::fill(c_cig, c_cig + c_ops, std::numeric_limits<uint32_t>::max());
-  memcpy(c_cig, a_cig, c_cur * sizeof(uint32_t));
-
-  if (use_partial) {
-    c_cig[c_cur] = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur]));
-    c_cur++; // index of dest for copying b_cig; faciltates corner case
-  }
-  // Here we get the length of a's sequence part contribution to c's
-  // sequence before the possibility of merging the last entry with
-  // the first entry in b's cigar. This is done with the cigar, so
-  // everything depends on the "use_partial"
-  const size_t a_seq_len = bam_cigar2qlen(c_cur, c_cig);
-  /* ADS: above the return type of bam_cigar2qlen is uint64_t, but
-     according to the source as of 05/2023 it cannot become
-     negative; no possible error code returned */
-
-  if (merge_mid) // update the middle op if it's the same
-    c_cig[c_cur - 1] = bam_cigar_gen(bam_cigar_oplen(c_cig[c_cur - 1]) +
-                                     bam_cigar_oplen(b_cig[0]),
-                                     bam_cigar_op(b_cig[0]));
-  // copy the cigar from b into c
-  memcpy(c_cig + c_cur, b_cig + merge_mid,
-         (b_ops - merge_mid)*sizeof(uint32_t));
-  /* done with cigar string here */
-
-  const uint32_t c_seq_len = a_seq_len + b->core.l_qseq;
-
-  // get the template length
-  const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig);
-
-  // flag only needs to worry about strand and single-end stuff
-  uint16_t flag = (a->core.flag & (BAM_FREAD1 |
-                                   BAM_FREAD2 |
-                                   BAM_FREVERSE));
-
-  int ret = bam_set1_wrapper(c,
-                             a->core.l_qname - (a->core.l_extranul + 1),
-                             bam_get_qname(a),
-                             flag, // (no PE; revcomp info)
-                             a->core.tid,
-                             a->core.pos,
-                             a->core.qual, // mapq from "a" (consider update)
-                             c_ops,        // merged cigar ops
-                             c_cig,        // merged cigar
-                             -1,           // (no mate)
-                             -1,           // (no mate)
-                             isize,        // updated
-                             c_seq_len,    // merged sequence length
-                             8);           // enough for 2 tags?
-  free(c_cig);
-  if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper in merge_overlap");
-  // Merge the sequences by bytes
-  merge_by_byte(a, b, c);
-
-  // add the tag for mismatches
-  const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) +
-                      bam_aux2i(bam_aux_get(b, "NM")));
-  ret = bam_aux_update_int(c, "NM", nm);
-  if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int in merge_overlap");
-
-  // add the tag for conversion
-  const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV"));
-  ret = bam_aux_append(c, "CV", 'A', 1, &cv);
-  if (ret < 0) throw dnmt_error(ret, "bam_aux_append in merge_overlap");
-
-  return ret;
-}
-
-static int
-merge_non_overlap(const bam1_t *a, const bam1_t *b,
-                  const uint32_t spacer, bam1_t *c) {
-  /* make the cigar string */
-  // collect info about the cigar strings
-  const uint32_t *a_cig = bam_get_cigar(a);
-  const uint32_t a_ops = a->core.n_cigar;
-  const uint32_t *b_cig = bam_get_cigar(b);
-  const uint32_t b_ops = b->core.n_cigar;
-  // allocate the new cigar string
-  const uint32_t c_ops = a_ops + b_ops + 1;
-  uint32_t *c_cig = (uint32_t *)calloc(c_ops, sizeof(uint32_t));
-  // concatenate the new cigar strings with a "skip" in the middle
-  memcpy(c_cig, a_cig, a_ops * sizeof(uint32_t));
-  c_cig[a_ops] = bam_cigar_gen(spacer, BAM_CREF_SKIP);
-  memcpy(c_cig + a_ops + 1, b_cig, b_ops * sizeof(uint32_t));
-  /* done with cigars */
-
-  const size_t a_seq_len = get_qlen(a);
-  const size_t b_seq_len = get_qlen(b);
-  const size_t c_seq_len = a_seq_len + b_seq_len;
-
-  // get the template length from the cigar
-  const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig);
-
-  // flag: only need to keep strand and single-end info
-  const uint16_t flag = a->core.flag & (BAM_FREAD1 |
-                                        BAM_FREAD2 |
-                                        BAM_FREVERSE);
-
-  int ret =
-      bam_set1_wrapper(c,
-                       a->core.l_qname - (a->core.l_extranul + 1),
-                       bam_get_qname(a),
-                       flag, // flags (no PE; revcomp info)
-                       a->core.tid,
-                       a->core.pos,
-                       a->core.qual, // mapq from a (consider update)
-                       c_ops,        // merged cigar ops
-                       c_cig,        // merged cigar
-                       -1,           // (no mate)
-                       -1,           // (no mate)
-                       isize,        // TLEN (relative to reference; SAM docs)
-                       c_seq_len,    // merged sequence length
-                       8);           // enough for 2 tags of 1 byte value?
-  free(c_cig);
-  if (ret < 0) throw dnmt_error(ret, "bam_set1 in merge_non_overlap");
-
-  merge_by_byte(a, b, c);
-
-  /* add the tags */
-  const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) +
-                      bam_aux2i(bam_aux_get(b, "NM")));
-  // "udpate" for "int" because it determines the right size
-  ret = bam_aux_update_int(c, "NM", nm);
-  if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_update_int");
-
-  const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV"));
-  // "append" for "char" because there is no corresponding update
-  ret = bam_aux_append(c, "CV", 'A', 1, &cv);
-  if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_append");
-
-  return ret;
-}
-
-static int
-keep_better_end(const bam1_t *a, const bam1_t *b, bam1_t *c) {
-  c = bam_copy1(c, get_rlen(a) >= get_rlen(b) ? a : b);
-  c->core.mtid = -1;
-  c->core.mpos = -1;
-  c->core.isize = get_rlen(c);
-  c->core.flag &= (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE);
-  return 0;
-}
+using std::equal;
 
 static size_t
 merge_mates(const size_t range,
-            bam1_t *one, bam1_t *two, bam1_t *merged) {
+            bam_rec &one, bam_rec &two, bam_rec &merged) {
 
   if (!are_mates(one, two)) return -std::numeric_limits<int>::max();
 
   // arithmetic easier using base 0 so subtracting 1 from pos
-  const int one_s = one->core.pos;
-  const int one_e = bam_endpos(one);
-  const int two_s = two->core.pos;
-  const int two_e = bam_endpos(two);
+  const int one_s = get_pos(one);
+  const int one_e = get_endpos(one);
+  const int two_s = get_pos(two);
+  const int two_e = get_endpos(two);
   assert(one_s >= 0 && two_s >= 0);
 
   const int spacer = two_s - one_e;
@@ -853,98 +153,22 @@ merge_mates(const size_t range,
 
   return two_e - one_s;
 }
-/********Above are functions for merging pair-end reads********/
-
-// ADS: will move to using this function once it is written
-static void
-standardize_format(const string &input_format, bam1_t *aln) {
-  int err_code = 0;
-
-  if (input_format == "abismal" || input_format == "walt") return;
-
-  if (input_format == "bsmap") {
-    // A/T rich; get the ZS tag value
-    auto zs_tag = bam_aux_get(aln, "ZS");
-    if (!zs_tag) throw dnmt_error("bam_aux_get for ZS (invalid bsmap)");
-    // ADS: test for errors on the line below
-    const uint8_t cv = string(bam_aux2Z(zs_tag))[1] == '-' ? 'A' : 'T';
-    // get the "mismatches" tag
-    auto nm_tag = bam_aux_get(aln, "NM");
-    if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bsmap)");
-    const int64_t nm = bam_aux2i(nm_tag);
-
-    aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize)
-
-    /* add the tags we want */
-    // "udpate" for "int" because it determines the right size; even
-    // though we just deleted all tags, it will add it back here.
-    err_code = bam_aux_update_int(aln, "NM", nm);
-    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int");
-    // "append" for "char" because there is no corresponding update
-    err_code = bam_aux_append(aln, "CV", 'A', 1, &cv);
-    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append");
-
-    if (bam_is_rev(aln))
-      revcomp_seq_by_byte(aln); // reverse complement if needed
-  }
-  if (input_format == "bismark") {
-    // ADS: Previously we modified the read names at the first
-    // underscore. Even if the names are still that way, it should no
-    // longer be needed since we compare names up to a learned suffix.
-
-    // A/T rich; get the XR tag value
-    auto xr_tag = bam_aux_get(aln, "XR");
-    if (!xr_tag) throw dnmt_error("bam_aux_get for XR (invalid bismark)");
-    const uint8_t cv = string(bam_aux2Z(xr_tag)) == "GA" ? 'A' : 'T';
-    // get the "mismatches" tag
-    auto nm_tag = bam_aux_get(aln, "NM");
-    if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bismark)");
-    const int64_t nm = bam_aux2i(nm_tag);
-
-    aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize)
-
-    /* add the tags we want */
-    // "udpate" for "int" because it determines the right size; even
-    // though we just deleted all tags, it will add it back here.
-    err_code = bam_aux_update_int(aln, "NM", nm);
-    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int");
-    // "append" for "char" because there is no corresponding update
-    err_code = bam_aux_append(aln, "CV", 'A', 1, &cv);
-    if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append");
-
-    if (bam_is_rev(aln))
-      revcomp_seq_by_byte(aln); // reverse complement if needed
-  }
 
-  // Be sure this doesn't depend on mapper! Removes the "qual" part of
-  // the data in a bam1_t struct but does not change its uncompressed
-  // size.
-  const auto qs = bam_get_qual(aln);
-  std::fill(qs, qs + aln->core.l_qseq, '\xff'); // deletes qseq
-}
+/********Above are functions for merging pair-end reads********/
 
 static vector<string>
 load_read_names(const string &inputfile, const size_t n_reads) {
-  samFile *hts = hts_open(inputfile.c_str(), "r");
-  if (!hts) throw dnmt_error("failed to open file: " + inputfile);
+  bam_infile hts(inputfile);
+  if (!hts) throw dnmt_error("failed to open BAM/SAM file: " + inputfile);
 
-  sam_hdr_t *hdr = sam_hdr_read(hts);
-  if (!hdr) throw dnmt_error("failed to read header: " + inputfile);
+  bam_header hdr(hts);
+  if (!hdr) throw dnmt_error("failed to get header: " + inputfile);
 
-  bam1_t *aln = bam_init1();
+  bam_rec aln;
   vector<string> names;
   size_t count = 0;
-  int err_code = 0;
-
-  while ((err_code = sam_read1(hts, hdr, aln)) >= 0 && count++ < n_reads)
-    names.push_back(string(bam_get_qname(aln)));
-  // err_core == -1 means EOF
-  if (err_code < -1) dnmt_error(err_code, "load_read_names:sam_read1");
-
-  bam_destroy1(aln);
-  bam_hdr_destroy(hdr);
-  err_code = hts_close(hts);
-  if (err_code < 0) throw dnmt_error(err_code, "check_input_file:hts_close");
+  while (hts.read(hdr, aln) && count++ < n_reads)
+    names.push_back(bam_get_qname(aln));
 
   return names;
 }
@@ -1055,65 +279,42 @@ check_sorted(const string &inputfile, const size_t suff_len, size_t n_reads) {
   return true;
 }
 
-static bool
+static inline void
 check_input_file(const string &infile) {
-  // ADS: (below) At some point the errno here is set to 3=ESRCH ("No
-  // such process"?) when using HTSlib 1.17 on macos ventura
-  const int prev_errno = errno;
-  samFile* hts = hts_open(infile.c_str(), "r");
-  if (!hts || errno != prev_errno)
-    throw dnmt_error("error opening: " + infile);
-
-  const htsFormat *fmt = hts_get_format(hts);
-  if (fmt->category != sequence_data)
-    throw dnmt_error("not sequence data: " + infile);
-  if (fmt->format != bam && fmt->format != sam)
-    throw dnmt_error("not SAM/BAM format: " + infile);
-
-  const int err_code = hts_close(hts);
-  if (err_code < 0) throw dnmt_error(err_code, "check_input_file:hts_close");
-
-  return true;
+  const bam_infile hts(infile);
+  if (!hts) throw dnmt_error("failed to open file: " + infile);
+  if (!hts.is_mapped_reads_file())
+    throw dnmt_error("not valid SAM/BAM format: " + infile);
 }
 
 static bool
 check_format_in_header(const string &input_format, const string &inputfile) {
-  samFile* hts = hts_open(inputfile.c_str(), "r");
+  bam_infile hts(inputfile);
   if (!hts) throw dnmt_error("error opening file: " + inputfile);
-
-  sam_hdr_t *hdr = sam_hdr_read(hts);
-  if (!hdr) throw dnmt_error("failed to read header: " + inputfile);
-
-  auto begin_hdr = sam_hdr_str(hdr);
-  auto end_hdr = begin_hdr + std::strlen(begin_hdr);
-  auto it = std::search(begin_hdr, end_hdr,
+  const bam_header bh(hts);
+  if (!bh) throw dnmt_error("failed to read header: " + inputfile);
+  const string entire_header(bh.tostring());
+  auto it = std::search(begin(entire_header), end(entire_header),
                         begin(input_format), end(input_format),
                         [](const unsigned char a, const unsigned char b) {
                           return std::toupper(a) == std::toupper(b);
                         });
-  bam_hdr_destroy(hdr);
-  const int err_code = hts_close(hts);
-  if (err_code < 0) throw dnmt_error(err_code, "check_format_in_header:hts_close");
-
-  return it != end_hdr;
+  return it != end(entire_header);
 }
 
-static bool
-same_name(const bam1_t *a, const bam1_t *b, const size_t suff_len) {
+static inline bool
+same_name(const bam_rec &a, const bam_rec &b, const size_t suff_len) {
   // "+ 1" below: extranul counts *extras*; we don't want *any* nulls
-  const uint16_t a_l = a->core.l_qname - (a->core.l_extranul + 1);
-  const uint16_t b_l = b->core.l_qname - (b->core.l_extranul + 1);
+  const uint16_t a_l = a.b->core.l_qname - (a.b->core.l_extranul + 1);
+  const uint16_t b_l = b.b->core.l_qname - (b.b->core.l_extranul + 1);
   if (a_l != b_l) return false;
   assert(a_l > suff_len);
   return !std::strncmp(bam_get_qname(a), bam_get_qname(b), a_l - suff_len);
 }
 
-static void
-add_pg_line(const string &cmd, sam_hdr_t *hdr) {
-  int err_code =
-    sam_hdr_add_line(hdr, "PG", "ID", "DNMTOOLS", "VN",
-                     VERSION, "CL", cmd.c_str(), NULL);
-  if (err_code) throw dnmt_error(err_code, "failed to add pg header line");
+static inline void
+swap(bam_rec &a, bam_rec &b) {
+  std::swap(a.b, b.b);
 }
 
 static void
@@ -1122,99 +323,73 @@ format(const string &cmd, const size_t n_threads,
        const bool bam_format, const string &input_format,
        const size_t suff_len, const size_t max_frag_len) {
 
-  int err_code = 0;
+  static const dnmt_error bam_write_err{"error writing bam"};
 
-  // open the hts files; assume already checked
-  samFile *hts = hts_open(inputfile.c_str(), "r");
-  samFile *out = hts_open(outfile.c_str(), bam_format ? "wb" : "w");
+  bam_tpool tpool(n_threads); // outer scope: must be destroyed last
+  {
+    bam_infile hts(inputfile);  // assume already checked
+    bam_header hdr(hts);
+    if (!hdr) throw dnmt_error("failed to read header");
 
-  // set the threads
-  htsThreadPool the_thread_pool{hts_tpool_init(n_threads), 0};
-  err_code = hts_set_thread_pool(hts, &the_thread_pool);
-  if (err_code < 0) throw dnmt_error("error setting threads");
-  err_code = hts_set_thread_pool(out, &the_thread_pool);
-  if (err_code < 0) throw dnmt_error("error setting threads");
+    bam_outfile out(outfile, bam_format);
+    {
+      bam_header hdr_out(hdr);
+      if (!hdr_out) throw dnmt_error("failed create header");
+      hdr_out.add_pg_line(cmd, "DNMTOOLS", VERSION);
+      if (!out.write(hdr_out)) throw dnmt_error("failed to write header");
+    }
 
-  // headers: load the input file's header, and then update to the
-  // output file's header, then write it and destroy; we will only use
-  // the input file header.
-  sam_hdr_t *hdr = sam_hdr_read(hts);
-  if (!hdr) throw dnmt_error("failed to read header");
-  sam_hdr_t *hdr_out = bam_hdr_dup(hdr);
-  if (!hdr_out) throw dnmt_error("failed create header");
-  add_pg_line(cmd, hdr_out);
-  err_code = sam_hdr_write(out, hdr_out);
-  if (err_code) throw dnmt_error(err_code, "failed to output header");
+    if (n_threads > 1) {
+      tpool.set_io(hts);
+      tpool.set_io(out);
+    }
 
-  // now process the reads
-  bam1_t *aln = bam_init1();
-  bam1_t *prev_aln = bam_init1();
-  bam1_t *merged = bam_init1();
-  bool previous_was_merged = false;
+    bam_rec aln, prev_aln, merged;
+    bool previous_was_merged = false;
 
-  err_code = sam_read1(hts, hdr, aln); // for EOF, err_code == -1
-  if (err_code < -1) throw dnmt_error(err_code, "format:sam_read1");
-  const bool empty_reads_file = (err_code == -1);
-  if (!empty_reads_file) {
-  standardize_format(input_format, aln);
+    const bool empty_reads_file = !hts.read(hdr, aln);
 
-    std::swap(aln, prev_aln); // start with prev_aln being first read
+    if (!empty_reads_file) {
 
-    while ((err_code = sam_read1(hts, hdr, aln)) >= 0) {
       standardize_format(input_format, aln);
-      if (same_name(prev_aln, aln, suff_len)) {
-        // below: essentially check for dovetail
-        if (!bam_is_rev(aln)) std::swap(prev_aln, aln);
-        const size_t frag_len = merge_mates(max_frag_len, prev_aln, aln, merged);
-        if (frag_len > 0 && frag_len < max_frag_len) {
-          if (is_a_rich(merged)) flip_conversion(merged);
-          err_code = sam_write1(out, hdr, merged);
-          if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1");
+
+      swap(aln, prev_aln);  // start with prev_aln being first read
+
+      while (hts.read(hdr, aln)) {
+        standardize_format(input_format, aln);
+        if (same_name(prev_aln, aln, suff_len)) {
+          // below: essentially check for dovetail
+          if (!bam_is_rev(aln)) swap(prev_aln, aln);
+          const size_t frag_len = merge_mates(max_frag_len, prev_aln, aln, merged);
+          if (frag_len > 0 && frag_len < max_frag_len) {
+            if (is_a_rich(merged)) flip_conversion(merged);
+            if (!out.write(hdr, merged)) throw bam_write_err;
+          }
+          else {
+            if (is_a_rich(prev_aln)) flip_conversion(prev_aln);
+            if (!out.write(hdr, prev_aln)) throw bam_write_err;
+            if (is_a_rich(aln)) flip_conversion(aln);
+            if (!out.write(hdr, aln)) throw bam_write_err;
+          }
+          previous_was_merged = true;
         }
         else {
-          if (is_a_rich(prev_aln)) flip_conversion(prev_aln);
-          err_code = sam_write1(out, hdr, prev_aln);
-          if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1");
-          if (is_a_rich(aln)) flip_conversion(aln);
-          err_code = sam_write1(out, hdr, aln);
-          if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1");
+          if (!previous_was_merged) {
+            if (is_a_rich(prev_aln)) flip_conversion(prev_aln);
+            if (!out.write(hdr, prev_aln)) throw bam_write_err;
+          }
+          previous_was_merged = false;
         }
-        previous_was_merged = true;
+        swap(prev_aln, aln);
       }
-      else {
-        if (!previous_was_merged) {
-          if (is_a_rich(prev_aln)) flip_conversion(prev_aln);
-          err_code = sam_write1(out, hdr, prev_aln);
-          if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1");
-        }
-        previous_was_merged = false;
+      if (!previous_was_merged) {
+        if (is_a_rich(prev_aln)) flip_conversion(prev_aln);
+        if (!out.write(hdr, prev_aln)) throw bam_write_err;
       }
-      std::swap(prev_aln, aln);
-    }
-    if (err_code < -1) throw dnmt_error(err_code, "format:sam_read1");
-
-    if (!previous_was_merged) {
-      if (is_a_rich(prev_aln)) flip_conversion(prev_aln);
-      err_code = sam_write1(out, hdr, prev_aln);
-      if (err_code < 0) throw dnmt_error(err_code, "format:sam_write1");
     }
   }
-
-  // turn off the lights
-  bam_destroy1(prev_aln);
-  bam_destroy1(aln);
-  bam_destroy1(merged);
-  bam_hdr_destroy(hdr);
-  bam_hdr_destroy(hdr_out);
-  err_code = hts_close(hts);
-  if (err_code < 0) throw dnmt_error(err_code, "format:hts_close");
-  err_code = hts_close(out);
-  if (err_code < 0) throw dnmt_error(err_code, "format:hts_close");
-  // do this after the files have been closed
-  hts_tpool_destroy(the_thread_pool.pool);
 }
 
-
 int main_format(int argc, const char **argv) {
 
   try {
@@ -1324,7 +499,7 @@ int main_format(int argc, const char **argv) {
       }
       else if (!check_suff_len(infile, suff_len, n_reads_to_check))
         throw dnmt_error("wrong read name suffix length [" +
-                      std::to_string(suff_len) + "] in: " + infile);
+                         std::to_string(suff_len) + "] in: " + infile);
       if (!check_sorted(infile, suff_len, n_reads_to_check))
         throw dnmt_error("mates not consecutive in: " + infile);
     }
diff --git a/src/utils/uniq.cpp b/src/utils/uniq.cpp
index 37b1ca08..2ea97e11 100644
--- a/src/utils/uniq.cpp
+++ b/src/utils/uniq.cpp
@@ -18,7 +18,7 @@
  * General Public License for more details.
  */
 
-#include <cstdint> // for [u]int[0-9]+_t
+#include <cstdint>  // for [u]int[0-9]+_t
 #include <iostream>
 #include <random>
 #include <stdexcept>
@@ -28,18 +28,13 @@
 // generated by autotools
 #include <config.h>
 
-// from HTSlib
-#include <htslib/sam.h>
-#include <htslib/thread_pool.h>
-
-// from smithlab
 #include "GenomicRegion.hpp"
 #include "OptionParser.hpp"
 #include "bsutils.hpp"
+#include "dnmt_error.hpp"
 #include "smithlab_os.hpp"
 #include "smithlab_utils.hpp"
-
-#include "dnmt_error.hpp"
+#include "bam_record_utils.hpp"
 
 using std::cerr;
 using std::endl;
@@ -59,69 +54,23 @@ namespace uniq_random {
   bool initialized = false;
   std::default_random_engine e;
   std::uniform_int_distribution<int> di;
-
   void initialize(const size_t the_seed) {
     e = std::default_random_engine(the_seed);
     initialized = true;
   }
-
   int rand() {
     // ADS: should have same range as ordinary rand() by properties of
     // std::uniform_int_distribution default constructor.
     // assert(initialized);
     return di(e);
   }
-} // namespace uniq_random
-
-inline bool
-format_is_bam_or_sam(htsFile *hts) {
-  const htsFormat *fmt = hts_get_format(hts);
-  return fmt->category == sequence_data &&
-         (fmt->format == bam || fmt->format == sam);
-}
-
-inline string
-qname(const bam1_t *b) {
-  return string(bam_get_qname(b));
-}
-
-inline int32_t
-get_tid(bam1_t *b) {
-  return b->core.tid;
-}
-
-inline size_t
-qlen(const bam1_t *r) {
-  return bam_cigar2qlen(r->core.n_cigar, bam_get_cigar(r));
-}
-
-inline bool
-precedes_by_start(const bam1_t *a, const bam1_t *b) {
-  return a->core.tid == b->core.tid && a->core.pos < b->core.pos;
-}
-
-inline bool
-precedes_by_end_and_strand(const bam1_t *a, const bam1_t *b) {
-  const size_t end_a = bam_endpos(a), end_b = bam_endpos(b);
-  return (end_a < end_b || (end_a == end_b && bam_is_rev(a) < bam_is_rev(b)));
-}
-
-inline bool
-equivalent_chrom_and_start(const bam1_t *a, const bam1_t *b) {
-  return a->core.pos == b->core.pos && a->core.tid == b->core.tid;
-}
-
-inline bool
-equivalent_end_and_strand(const bam1_t *a, const bam1_t *b) {
-  return bam_endpos(a) == bam_endpos(b) && bam_is_rev(a) == bam_is_rev(b);
-}
+}  // namespace uniq_random
 
-struct rd_stats { // keep track of good bases/reads in and out
+struct rd_stats {  // keep track of good bases/reads in and out
   size_t bases{};
   size_t reads{};
-
-  void update(bam1_t *b) {
-    bases += qlen(b);
+  void update(const bam_rec &b) {
+    bases += get_l_qseq(b);
     ++reads;
   }
 };
@@ -159,24 +108,24 @@ write_hist_output(const vector<size_t> &hist, const string &histfile) {
 }
 
 /* The "inner" buffer corresponds to all reads sharing chrom, start,
- * end and strand, and is a contiguous subset of the "outer" buffer
- * that shares the same end and strand.
- */
+   end and strand, and is a contiguous subset of the "outer" buffer
+   that shares the same end and strand. */
 static void
 process_inner_buffer(const bool add_dup_count,
-                     const vector<bam1_t *>::const_iterator it,
-                     const vector<bam1_t *>::const_iterator jt, sam_hdr_t *hdr,
-                     samFile *out, rd_stats &rs_out, size_t &reads_duped,
+                     const vector<bam_rec>::iterator it,
+                     const vector<bam_rec>::iterator jt, bam_header &hdr,
+                     bam_outfile &out, rd_stats &rs_out, size_t &reads_duped,
                      vector<size_t> &hist) {
+  constexpr char du_tag[2] = {'D', 'U'};
   const size_t n_reads = std::distance(it, jt);
   const size_t selected = uniq_random::rand() % n_reads;
 
   if (add_dup_count) {
-    const int ret = bam_aux_update_int(*(it + selected), "DU", n_reads);
+    const int ret = bam_aux_update_int(*(it + selected), du_tag, n_reads);
     if (ret < 0) throw dnmt_error("error adding duplicate count aux field");
   }
 
-  if (sam_write1(out, hdr, *(it + selected)) < 0)
+  if (!out.write(hdr, *(it + selected)))
     throw runtime_error("failed writing bam record");
   if (hist.size() <= n_reads) hist.resize(n_reads + 1);
   hist[n_reads]++;
@@ -188,8 +137,8 @@ process_inner_buffer(const bool add_dup_count,
    and start position. These are gathered and then processed together. */
 static void
 process_buffer(const bool add_dup_count, rd_stats &rs_out, size_t &reads_duped,
-               vector<size_t> &hist, vector<bam1_t *> &buffer, sam_hdr_t *hdr,
-               samFile *out) {
+               vector<size_t> &hist, vector<bam_rec> &buffer, bam_header &hdr,
+               bam_outfile &out) {
   sort(begin(buffer), end(buffer), precedes_by_end_and_strand);
   auto it(begin(buffer));
   auto jt = it + 1;
@@ -201,114 +150,72 @@ process_buffer(const bool add_dup_count, rd_stats &rs_out, size_t &reads_duped,
     }
   process_inner_buffer(add_dup_count, it, jt, hdr, out, rs_out, reads_duped,
                        hist);
-
-  // free the bam1_t pointers before clearing the buffer
-  for (size_t i = 0; i < buffer.size(); ++i)
-    if (buffer[i] != 0) {
-      bam_destroy1(buffer[i]);
-      buffer[i] = 0;
-    }
   buffer.clear();
 }
 
-static bam1_t *
-get_read(samFile *hts, sam_hdr_t *hdr) {
-  bam1_t *b = bam_init1();
-  const int result = sam_read1(hts, hdr, b);
-  if (result >= 0) return b;
-
-  if (result < -1)
-    throw runtime_error("error reading file: " + string(hts->fn));
-  else // -1 should mean EOF, so we free this read
-    bam_destroy1(b);
-  return 0;
-}
-
 static void
 uniq(const bool VERBOSE, const bool add_dup_count, const size_t n_threads,
      const string &cmd, const string &infile, const string &statfile,
      const string &histfile, const bool bam_format, const string &outfile) {
-  // ADS: (below) At some point the errno here is set to 3=ESRCH ("No
-  // such process"?) when using HTSlib 1.17 on macos ventura
-  int prev_errno = errno;
-  samFile *hts = hts_open(infile.c_str(), "r");
-  if (!hts || errno != prev_errno) throw dnmt_error("error opening: " + infile);
-
-  htsThreadPool the_thread_pool{hts_tpool_init(n_threads), 0};
-  if (hts_set_thread_pool(hts, &the_thread_pool) < 0)
-    throw runtime_error("error setting threads");
-
-  if (!format_is_bam_or_sam(hts))
-    throw runtime_error("bad file format: " + infile);
-
-  sam_hdr_t *hdr = sam_hdr_read(hts);
-  if (!hdr) throw runtime_error("failed to read header: " + infile);
-
-  // open the output file
-  samFile *out = hts_open(outfile.c_str(), bam_format ? "wb" : "w");
-
-  if (hts_set_thread_pool(out, &the_thread_pool) < 0)
-    throw runtime_error("error setting threads");
-
-  // take care of the output file's header
-  sam_hdr_t *hdr_out = bam_hdr_dup(hdr);
-  if (sam_hdr_add_line(hdr_out, "PG", "ID", "DNMTOOLS", "VN", VERSION, "CL",
-                       cmd.c_str(), NULL))
-    throw runtime_error("failed to format header");
-  if (sam_hdr_write(out, hdr_out))
-    throw runtime_error("failed to output header");
-  bam_hdr_destroy(hdr_out);
-
   // values to tabulate stats; no real cost
   rd_stats rs_in, rs_out;
   size_t reads_duped = 0;
   vector<size_t> hist;
 
-  // try to load the first read
-  prev_errno = errno;
-  bam1_t *aln = get_read(hts, hdr);
-  if (errno != prev_errno)
-    throw runtime_error("failed parsing read from input file");
+  bam_tpool tpool(n_threads);  // outer scope: must be destroyed last
+  {
+    bam_infile hts(infile);
+    bam_header hdr(hts);
+    if (!hdr) throw dnmt_error("failed to read header");
+
+    bam_outfile out(outfile, bam_format);
+    {
+      bam_header hdr_out(hdr);
+      if (!hdr_out) throw dnmt_error("failed create header");
+      hdr_out.add_pg_line(cmd, "DNMTOOLS", VERSION);
+      if (!out.write(hdr_out)) throw dnmt_error("failed to write header");
+    }
+
+    if (n_threads > 1) {
+      tpool.set_io(hts);
+      tpool.set_io(out);
+    }
 
-  if (aln) { // data file is not empty
+    bam_rec aln;
+    if (hts.read(hdr, aln)) {  // valid SAM/BAM can have 0 reads
 
-    rs_in.update(aln); // update for the input we just got
+      rs_in.update(aln);  // update stats for input we just got
 
-    vector<bam1_t *> buffer(1, aln); // select output from this buffer
+      vector<bam_rec> buffer(1, aln);  // select output from this buffer
 
-    // to check that reads are sorted properly
-    vector<bool> chroms_seen(hdr->n_targets, false);
-    int32_t cur_chrom = get_tid(aln);
+      // to check that reads are sorted properly
+      vector<bool> chroms_seen(get_n_targets(hdr), false);
+      int32_t cur_chrom = get_tid(aln);
 
-    while ((aln = get_read(hts, hdr))) {
-      rs_in.update(aln);
+      while (hts.read(hdr, aln)) {
+        rs_in.update(aln);
 
-      // below works because buffer reset at every new chrom
-      if (precedes_by_start(aln, buffer[0]))
-        throw runtime_error("not sorted: " + qname(buffer[0]) + " " +
-                            qname(aln));
+        // below works because buffer reset at every new chrom
+        if (precedes_by_start(aln, buffer[0]))
+          throw runtime_error("not sorted: " + get_qname(buffer[0]) + " " +
+                              get_qname(aln));
 
-      const int32_t chrom = get_tid(aln);
-      if (chrom != cur_chrom) {
-        if (chroms_seen[chrom]) throw runtime_error("input not sorted");
-        chroms_seen[chrom] = true;
-        cur_chrom = chrom;
-      }
+        const int32_t chrom = get_tid(aln);
+        if (chrom != cur_chrom) {
+          if (chroms_seen[chrom]) throw runtime_error("input not sorted");
+          chroms_seen[chrom] = true;
+          cur_chrom = chrom;
+        }
 
-      if (!equivalent_chrom_and_start(buffer[0], aln))
-        process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr,
-                       out);
-      buffer.push_back(aln);
+        if (!equivalent_chrom_and_start(buffer[0], aln))
+          process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr,
+                         out);
+        buffer.push_back(aln);
+      }
+      process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr,
+                     out);
     }
-    process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, out);
   }
-
-  // remember to turn off the lights
-  bam_hdr_destroy(hdr);
-  hts_close(out);
-  hts_close(hts);
-  hts_tpool_destroy(the_thread_pool.pool);
-
   // write any additional output requested
   write_stats_output(rs_in, rs_out, reads_duped, statfile);
   write_hist_output(hist, histfile);
@@ -372,7 +279,7 @@ main_uniq(int argc, const char **argv) {
     if (leftover_args.size() == 2 && !use_stdout)
       outfile = leftover_args.back();
     else
-      outfile = string("-"); // so htslib can write to stdout
+      outfile = string("-");  // so htslib can write to stdout
     /****************** END COMMAND LINE OPTIONS *****************/
 
     // ADS: Random here is because we choose randomly when keeping one