diff --git a/docs/content/states.md b/docs/content/states.md index 7ada4e6b..65009066 100644 --- a/docs/content/states.md +++ b/docs/content/states.md @@ -16,9 +16,9 @@ the format used by `amrfinder` and `amrtester`. The epiread format consists of three columns. The first column is the chromosome name for the mapped read, the second is the "index" of the -first CpG in the read. This index is a number (starting with 0) to -indicate which CpG sites in the chromosome corresponds to the first -CpG site in the read. These are not nucleotide positions in the +first CpG in the read. The index `x` indicates that the first CpG site +in the read corresponds to the `x`'th (starting from 0) CpG site in +the chromosome. Therefore, these are not nucleotide positions in the genome. The final column in the epiread format is the sequence of methylation states within the read. This sequence of states is composed of 3 possible letters: C if the corresponding letter at that diff --git a/src/analysis/methstates.cpp b/src/analysis/methstates.cpp index 649ff934..2f060cd8 100644 --- a/src/analysis/methstates.cpp +++ b/src/analysis/methstates.cpp @@ -32,6 +32,8 @@ #include "sam_record.hpp" #include "cigar_utils.hpp" +#include "bam_record_utils.hpp" + using std::string; using std::vector; using std::cout; @@ -40,6 +42,7 @@ using std::endl; using std::unordered_map; using std::unordered_set; using std::runtime_error; +using std::lower_bound; static const char b2c[] = "TNGNNNCNNNNNNNNNNNNA"; @@ -80,18 +83,17 @@ is_cpg(const string &s, const size_t idx) { return s[idx] == 'C' && s[idx + 1] == 'G'; } - static void -collect_cpgs(const string &s, unordered_map &cpgs) { +collect_cpgs(const string &s, + vector &cpgs) { cpgs.clear(); const size_t lim = s.length() - 1; - size_t cpg_count = 0; - for (size_t i = 0; i < lim; ++i) - if (is_cpg(s, i)) - cpgs[i] = cpg_count++; + for (size_t i = 0; i < lim; ++i) { + if (is_cpg(s, i)) + cpgs.push_back(i); + } } - // void // local_apply_cigar(const string &cigar, string &to_inflate, // const char inflation_symbol) { @@ -127,56 +129,54 @@ collect_cpgs(const string &s, unordered_map &cpgs) { // to_inflate.swap(inflated_seq); // } - static bool -convert_meth_states_pos(const string &chrom, - const unordered_map &cpgs, - const sam_rec &aln, - size_t &start_pos, string &states) { +convert_meth_states_pos(const vector &cpgs, + const bam_header &hdr, + const bam_rec &aln, + size_t &first_cpg_index, string &states) { + states.clear(); - const size_t width = cigar_rseq_ops(aln.cigar); - const size_t offset = aln.pos - 1; // SAM format convention + const size_t seq_start = get_pos(aln); + const size_t width = rlen_from_cigar(aln); + const size_t seq_end = seq_start + width; - string the_seq(aln.seq); - apply_cigar(aln.cigar, the_seq); - if (the_seq.size() != width) - throw runtime_error("bad sam record format: " + aln.tostring()); + string seq_str; + get_seq_str(aln, seq_str); + apply_cigar(aln, seq_str, 'N'); - const auto beg_chrom = begin(chrom); - const auto end_chrom = end(chrom); - auto chrom_itr = beg_chrom + offset; - auto first_cpg = end_chrom; + if (seq_str.size() != width) { + throw runtime_error("bad sam record format: " + to_string(hdr, aln)); + } - // ADS: below the "-1" in the std::min is to ensure dinuc exists - const auto beg_seq = begin(the_seq); - const auto seq_lim = beg_seq + std::min(width, chrom.length() - 1 - offset); + // get the first cpg site equal to or large than seq_start + auto cpg_itr = lower_bound(begin(cpgs), end(cpgs), seq_start); + auto first_cpg_itr = end(cpgs); - for (auto seq_itr = beg_seq; seq_itr != seq_lim; ++chrom_itr, ++seq_itr) { - if (*chrom_itr == 'C' && *(chrom_itr + 1) == 'G') { - const char x = *seq_itr; + if (cpg_itr == end(cpgs)) { + return false; + } else { + for (; cpg_itr != end(cpgs) && *cpg_itr < seq_end; cpg_itr++) { + const char x = seq_str[*cpg_itr - seq_start]; states += (x == 'C') ? 'C' : ((x == 'T') ? 'T' : 'N'); - if (first_cpg == end_chrom) - first_cpg = chrom_itr; + if (first_cpg_itr == end(cpgs)) + first_cpg_itr = cpg_itr; } } - if (first_cpg != end_chrom) { - const auto the_cpg = cpgs.find(distance(beg_chrom, first_cpg)); - if (the_cpg == end(cpgs)) - throw runtime_error("cannot locate site on positive strand: " + - aln.tostring()); - start_pos = the_cpg->second; + if (first_cpg_itr != end(cpgs)) { + first_cpg_index = distance(begin(cpgs), first_cpg_itr); } + return states.find_first_of("CT") != string::npos; } static bool -convert_meth_states_neg(const string &chrom, - const unordered_map &cpgs, - const sam_rec &aln, - size_t &start_pos, string &states) { +convert_meth_states_neg(const vector &cpgs, + const bam_header &hdr, + const bam_rec &aln, + size_t &first_cpg_index, string &states) { /* ADS: the "revcomp" on the read sequence is needed for the cigar to be applied, since the cigar is relative to the genome coordinates and not the read's sequence. But the read sequence @@ -187,46 +187,47 @@ convert_meth_states_neg(const string &chrom, states.clear(); - const size_t width = cigar_rseq_ops(aln.cigar); - const size_t offset = aln.pos - 1; // SAM format convention - - string the_seq; - the_seq.resize(aln.seq.length()); - revcomp_copy(begin(aln.seq), end(aln.seq), begin(the_seq)); - apply_cigar(aln.cigar, the_seq); - if (the_seq.size() != width) - throw runtime_error("bad sam record format: " + aln.tostring()); - - const auto beg_chrom = begin(chrom); - const auto end_chrom = end(chrom); - auto chrom_itr = beg_chrom + ((offset > 0) ? offset - 1 : 0); - auto first_cpg = end_chrom; - - // ADS: should apply_cigar make the std::min below irrelevant? - const auto beg_seq = begin(the_seq); - const auto seq_lim = beg_seq + std::min(width, chrom.length() - offset); - auto seq_itr = beg_seq + ((offset > 0) ? 0 : 1); - - for (; seq_itr != seq_lim; ++chrom_itr, ++seq_itr) { - if (*chrom_itr == 'C' && *(chrom_itr + 1) == 'G') { - const char x = *seq_itr; + const size_t seq_start = get_pos(aln); + const size_t width = rlen_from_cigar(aln); + const size_t seq_end = seq_start + width; + + string orig_seq; + get_seq_str(aln, orig_seq); + + string seq_str; + seq_str.resize(orig_seq.size()); + revcomp_copy(begin(orig_seq), end(orig_seq), begin(seq_str)); + apply_cigar(aln, seq_str, 'N'); + + if (seq_str.size() != width) { + throw runtime_error("bad sam record format: " + to_string(hdr, aln)); + } + + // get the first cpg site equal to or large than seq_start - 1 + // the -1 is because we look for G in the read corresponding to a + // CpG in chromosome, which are indexed in cpgs based on the position of C + auto cpg_itr = lower_bound(begin(cpgs), end(cpgs), + seq_start > 0 ? seq_start - 1 : 0); + auto first_cpg_itr = end(cpgs); + + if (cpg_itr == end(cpgs)) { + return false; + } else { + for (; cpg_itr != end(cpgs) && *cpg_itr < seq_end - 1; cpg_itr++) { + const char x = seq_str[*cpg_itr - seq_start + 1]; states += (x == 'G') ? 'C' : ((x == 'A') ? 'T' : 'N'); - if (first_cpg == end_chrom) - first_cpg = chrom_itr; + if (first_cpg_itr == end(cpgs)) + first_cpg_itr = cpg_itr; } } - if (first_cpg != end_chrom) { - const auto the_cpg = cpgs.find(distance(beg_chrom, first_cpg)); - if (the_cpg == end(cpgs)) - throw runtime_error("cannot locate site on negative strand: " + - aln.tostring()); - start_pos = the_cpg->second; + if (first_cpg_itr != end(cpgs)) { + first_cpg_index = distance(begin(cpgs), first_cpg_itr); } + return states.find_first_of("CT") != string::npos; } - static void get_chrom(const string &chrom_name, const vector &all_chroms, @@ -307,9 +308,12 @@ main_methstates(int argc, const char **argv) { if (VERBOSE) cerr << "n_chroms: " << all_chroms.size() << endl; - SAMReader in(mapped_reads_file); + bam_infile in(mapped_reads_file); if (!in) throw runtime_error("cannot open input file " + mapped_reads_file); + bam_header hdr(in); + if (!hdr) + throw runtime_error("cannot read heade" + mapped_reads_file); std::ofstream of; if (!outfile.empty()) of.open(outfile.c_str()); @@ -317,7 +321,8 @@ main_methstates(int argc, const char **argv) { // for the current chrom, this maps cpg index to cpg position in // the chrom - unordered_map cpgs; + //unordered_map cpgs; + vector cpgs; unordered_set chroms_seen; string chrom_name; @@ -325,16 +330,17 @@ main_methstates(int argc, const char **argv) { // iterate over records/reads in the SAM file, sequentially // processing each before considering the next - sam_rec aln; - while (in >> aln) { + bam_rec aln; + while (in.read(hdr, aln)) { // get the correct chrom if it has changed - if (aln.rname != chrom_name) { + if (string(sam_hdr_tid2name(hdr, aln)) != chrom_name) { + chrom_name = sam_hdr_tid2name(hdr, aln); + // make sure all reads from same chrom are contiguous in the file - if (chroms_seen.find(aln.rname) != end(chroms_seen)) + if (chroms_seen.find(chrom_name) != end(chroms_seen)) throw runtime_error("chroms out of order (check SAM file sorted)"); - chrom_name = aln.rname; if (VERBOSE) cerr << "processing " << chrom_name << endl; @@ -342,16 +348,16 @@ main_methstates(int argc, const char **argv) { collect_cpgs(chrom, cpgs); } - size_t start_pos = std::numeric_limits::max(); + size_t first_cpg_index = std::numeric_limits::max(); string seq; - const bool has_cpgs = check_flag(aln, samflags::read_rc) ? - convert_meth_states_neg(chrom, cpgs, aln, start_pos, seq) : - convert_meth_states_pos(chrom, cpgs, aln, start_pos, seq); + const bool has_cpgs = bam_is_rev(aln) ? + convert_meth_states_neg(cpgs, hdr, aln, first_cpg_index, seq) : + convert_meth_states_pos(cpgs, hdr, aln, first_cpg_index, seq); if (has_cpgs) - out << aln.rname << '\t' - << start_pos << '\t' + out << sam_hdr_tid2name(hdr, aln) << '\t' + << first_cpg_index << '\t' << seq << '\n'; } } diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp index adbd089d..37412c7a 100644 --- a/src/common/bam_record_utils.cpp +++ b/src/common/bam_record_utils.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -26,6 +27,10 @@ using std::string; using std::vector; +using std::stringstream; +using std::runtime_error; +using std::to_string; + /// functions in place of undefd macro static inline bool @@ -249,11 +254,6 @@ bam_set1_wrapper(bam1_t *bam, return static_cast(data_len); } -static inline bool -eats_ref(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; } - -static inline bool -eats_query(const uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; } static inline size_t bam_get_n_cigar(const bam1_t *b) { return b->core.n_cigar; } @@ -270,10 +270,10 @@ fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) { auto c_beg = cigar; auto c_end = cigar + n_cigar; - while (!eats_ref(*c_beg) && ++c_beg != c_end); + while (!cigar_eats_ref(*c_beg) && ++c_beg != c_end); if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); - while (!eats_ref(*(c_end-1)) && --c_end != c_beg); + while (!cigar_eats_ref(*(c_end-1)) && --c_end != c_beg); if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); for (auto c_itr = c_beg; c_itr != c_end; ++c_itr) @@ -293,13 +293,13 @@ fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { auto c_itr = cigar; const auto c_end = c_itr + n_cigar; - for (; !eats_ref(*c_itr) && c_itr != c_end; ++c_itr) + for (; !cigar_eats_ref(*c_itr) && c_itr != c_end; ++c_itr) *c_itr = to_softclip(*c_itr); if (c_itr == c_end) throw dnmt_error("cigar eats no ref"); c_itr = cigar + n_cigar - 1; - for (; !eats_ref(*c_itr) && c_itr != cigar; --c_itr) + for (; !cigar_eats_ref(*c_itr) && c_itr != cigar; --c_itr) *c_itr = to_softclip(*c_itr); } @@ -396,7 +396,7 @@ get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops, size_t rlen = 0; uint32_t i = 0; for (i = 0; i < in_ops; ++i) { - if (eats_ref(cig_in[i])) { + if (cigar_eats_ref(cig_in[i])) { if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full) break; rlen += bam_cigar_oplen(cig_in[i]); @@ -883,3 +883,67 @@ void standardize_format(const string &input_format, bam_rec &aln) { standardize_format(input_format, aln.b); } + +// used in methstates +void +apply_cigar(const bam_rec &aln, string &to_inflate, + const char inflation_symbol) { + + string inflated_seq; + stringstream ss_cigar; + size_t i = 0; + auto to_inflate_beg = std::begin(to_inflate); + + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = beg_cig + get_n_cigar(aln); + + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const auto op = bam_cigar_op(*c_itr); + const auto n = bam_cigar_oplen(*c_itr); + ss_cigar << n << op; + + if (cigar_eats_ref(op) && cigar_eats_query(op)) { + inflated_seq.append(to_inflate_beg + i, to_inflate_beg + i + n); + i += n; + } + else if (cigar_eats_query(op)) { + // no addition of symbols to query + i += n; + } + else if (cigar_eats_ref(op)) { + inflated_seq.append(n, inflation_symbol); + // no increment of index within query + } + } + + // sum of total M/I/S/=/X/N operations must equal length of seq + const size_t orig_len = to_inflate.length(); + if (i != orig_len) + throw runtime_error("inconsistent number of qseq ops in cigar: " + + to_inflate + " " + ss_cigar.str() + " " + + to_string(i) + " " + + to_string(orig_len)); + to_inflate.swap(inflated_seq); +} + +void +get_seq_str(const bam_rec & aln, string &seq_str) { + size_t qlen = static_cast(get_l_qseq(aln)); + seq_str.resize(qlen); + auto seq = bam_get_seq(aln); + for (size_t i = 0; i < qlen ; i++) { + seq_str[i] = seq_nt16_str[bam_seqi(seq, i)]; + } +} + +string +to_string(const bam_header &hdr, const bam_rec &aln) { + kstring_t ks = { 0, 0, NULL }; + int ret = sam_format1(hdr.h, aln.b, &ks); + if (ret < 0) { + runtime_error("Can't format record: " + to_string(hdr, aln)); + } + + return string(ks.s); +} + diff --git a/src/common/bam_record_utils.hpp b/src/common/bam_record_utils.hpp index 97510cee..c273fbe6 100644 --- a/src/common/bam_record_utils.hpp +++ b/src/common/bam_record_utils.hpp @@ -142,6 +142,16 @@ is_a_rich(const bam_rec &b) { return bam_aux2A(bam_aux_get(b.b, "CV")) == 'A'; } void standardize_format(const std::string &input_format, bam_rec &aln); + +void +apply_cigar(const bam_rec &aln, std::string &to_inflate, + const char inflation_symbol); + + +void +get_seq_str(const bam_rec & aln, std::string &seq_str); + + inline bool are_mates(const bam_rec &one, const bam_rec &two) { return one.b->core.mtid == two.b->core.tid && @@ -222,7 +232,17 @@ sam_hdr_tid2name(const bam_header &hdr, const int32_t tid) { return std::string(sam_hdr_tid2name(hdr.h, tid)); } +inline std::string +sam_hdr_tid2name(const bam_header &hdr, const bam_rec &aln) { + return std::string(sam_hdr_tid2name(hdr.h, aln.b->core.tid)); +} +std::string +to_string(const bam_header &hdr, const bam_rec &aln); + +inline size_t rlen_from_cigar(const bam_rec &aln) { + return bam_cigar2rlen(get_n_cigar(aln), bam_get_cigar(aln)); +} #endif