From 8705df087411d1891cfae666ade4e2eb75b2963e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 3 Oct 2023 16:19:53 -0700 Subject: [PATCH 1/7] guessprotocol: improvements for speed and gathering all summary info into one struct --- src/utils/guessprotocol.cpp | 162 +++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 60 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index 584475d2..4a91faf8 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -1,7 +1,7 @@ /* guessprotocol: a program for guessing whether a wgbs protocol is * original, pbat or random pbat * - * Copyright (C) 2019-2022 + * Copyright (C) 2019-2023 * * Authors: Andrew D. Smith * @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include "OptionParser.hpp" #include "smithlab_utils.hpp" @@ -35,6 +37,43 @@ using std::cerr; using std::endl; using std::min; using std::runtime_error; +using std::array; + +static string +guess_protocol(const double fraction_t_rich) { + if (fraction_t_rich >= 0.8) { + return "original"; + } + if (fraction_t_rich <= 0.2) { + return "pbat"; + } + if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6) { + return "random"; + } + return "inconclusive"; +} + +struct guessprotocol_summary { + string protocol; + string layout; + uint64_t n_t_rich_reads{}; + uint64_t n_reads{}; + double fraction_t_rich_reads{}; + + void evaluate() { + fraction_t_rich_reads = static_cast(n_t_rich_reads)/n_reads; + protocol = guess_protocol(fraction_t_rich_reads); + } + + string tostring() const { + std::ostringstream oss; + oss << "protocol: " << protocol << '\n' + << "fraction_t_rich_reads: " << fraction_t_rich_reads << '\n' + << "t_rich_reads: " << n_t_rich_reads << '\n' + << "reads_examined: " << n_reads; + return oss.str(); + } +}; // store each read from one end struct FASTQRecord { @@ -47,46 +86,49 @@ struct FASTQRecord { static bool mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends const FASTQRecord &a, const FASTQRecord &b) { - assert(to_ignore_at_end < a.name.length()); - return equal(begin(a.name), end(a.name) - to_ignore_at_end, begin(b.name)); + assert(to_ignore_at_end < std::size(a.name)); + return equal(cbegin(a.name), cend(a.name) - to_ignore_at_end, + cbegin(b.name)); } // Read 4 lines one time from fastq and fill in the FASTQRecord structure static std::istream& operator>>(std::istream& s, FASTQRecord &r) { - if (getline(s, r.name)) { + constexpr auto n_error_codes = 5u; + enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual }; + static const array error_msg = { + runtime_error(""), + runtime_error("failed to parse fastq name line"), + runtime_error("failed to parse fastq sequence line"), + runtime_error("failed to parse fastq plus line"), + runtime_error("failed to parse fastq qual line") + }; - if (r.name.empty() || r.name[0] != '@') - throw std::runtime_error("bad name line: " + r.name); + err_code ec = err_code::none; - r.name = r.name.substr(1, r.name.find_first_of(' ')); + getline(s, r.name); - if (!getline(s, r.seq)) - throw runtime_error("failed to read expected seq line"); + if (r.name.empty() || r.name[0] != '@') + ec = err_code::bad_name; - string tmp; - if (!getline(s, tmp)) - throw runtime_error("failed to read expected + line"); + r.name.resize(r.name.find_first_of(' ')); + const auto nm_sz = r.name.find_first_of(' '); + copy(cbegin(r.name) + 1, cbegin(r.name) + nm_sz, begin(r.name)); - if (!getline(s, tmp)) - throw runtime_error("failed to read expected score line"); - } - return s; -} + if (!getline(s, r.seq)) + ec = err_code::bad_seq; + string tmp; + if (!getline(s, tmp)) + ec = err_code::bad_plus; -static string -guess_protocol(const double fraction_t_rich) { - if (fraction_t_rich >= 0.8) { - return "original"; - } - if (fraction_t_rich <= 0.2) { - return "pbat"; - } - if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6) { - return "random"; - } - return "inconclusive"; + if (!getline(s, tmp)) + ec = err_code::bad_qual; + + if (ec != err_code::none) + throw error_msg[ec]; + + return s; } int @@ -94,17 +136,23 @@ main_guessprotocol(int argc, const char **argv) { try { + constexpr auto description = "guess bisulfite protocol for a library"; + + string outfile; size_t reads_to_check = 1000000; size_t name_suffix_len = 0; + namespace fs = std::filesystem; + const string cmd_name = std::filesystem::path(argv[0]).filename(); + /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), - "guess whether protocol is ordinary, pbat or random", + OptionParser opt_parse(cmd_name, description, " []"); opt_parse.add_opt("nreads", 'n', "number of reads in initial check", false, reads_to_check); opt_parse.add_opt("ignore", 'i', "length of read name suffix " "to ignore when matching", false, name_suffix_len); + opt_parse.add_opt("output", 'o', "output file name", false, outfile); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { @@ -123,8 +171,11 @@ main_guessprotocol(int argc, const char **argv) { const vector reads_files(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ + guessprotocol_summary summary; if (reads_files.size() == 2) { - // Input: paired-end reads with end1 and end2 + summary.layout = "paired"; + + // input: paired-end reads with end1 and end2 std::ifstream in1(reads_files.front()); if (!in1) throw runtime_error("cannot open input file: " + reads_files.front()); @@ -133,12 +184,10 @@ main_guessprotocol(int argc, const char **argv) { if (!in2) throw runtime_error("cannot open input file: " + reads_files.back()); - size_t n_pairs = 0; - size_t t_rich_pairs = 0; - FASTQRecord end_one, end_two; - while (in1 >> end_one && in2 >> end_two && n_pairs < reads_to_check) { - ++n_pairs; + while (in1 >> end_one && in2 >> end_two && + summary.n_reads < reads_to_check) { + summary.n_reads++; // two reads should be in paired-ends if (!mates(name_suffix_len, end_one, end_two)) @@ -162,46 +211,39 @@ main_guessprotocol(int argc, const char **argv) { const double t_rich_count = (end_one_t + end_two_a); const double pbat_count = (end_one_a + end_two_t); - t_rich_pairs += (t_rich_count > pbat_count); + summary.n_t_rich_reads += (t_rich_count > pbat_count); } - const double fraction_t_rich = static_cast(t_rich_pairs)/n_pairs; - cout << guess_protocol(fraction_t_rich) << '\t' - << "fraction_t_rich=" << fraction_t_rich << '\t' - << "t_rich_pairs=" << t_rich_pairs << '\t' - << "pairs_examined=" << n_pairs << endl; } - else { // if (reads_files.size() == 1) - // Input: single-end reads + else { + summary.layout = "single"; + + // input: single-end reads std::ifstream in(reads_files.front()); if (!in) throw runtime_error("cannot open input file: " + reads_files.front()); - size_t n_reads = 0; - size_t t_rich_reads = 0; - FASTQRecord r; - while (in >> r && n_reads < reads_to_check) { - ++n_reads; + while (in >> r && summary.n_reads < reads_to_check) { + summary.n_reads++; const double a = (count(begin(r.seq), end(r.seq), 'A') + count(begin(r.seq), end(r.seq), 'C')); const double t = (count(begin(r.seq), end(r.seq), 'T') + count(begin(r.seq), end(r.seq), 'G')); - t_rich_reads += (t > a); + summary.n_t_rich_reads += (t > a); } - const double fraction_t_rich = static_cast(t_rich_reads)/n_reads; - cout << guess_protocol(fraction_t_rich) << '\t' - << "fraction_t_rich=" << fraction_t_rich << '\t' - << "t_rich_reads=" << t_rich_reads << '\t' - << "reads_examined=" << n_reads << endl; } + + summary.evaluate(); + if (!outfile.empty()) { + std::ofstream out(outfile); + if (!out) throw runtime_error("failed to open output file: " + outfile); + out << summary.tostring() << endl; + } + else cout << summary.tostring() << endl; } catch (const runtime_error &e) { cerr << e.what() << endl; return EXIT_FAILURE; } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; - return EXIT_FAILURE; - } return EXIT_SUCCESS; } From dfe7d65f5b5ec5ffb05cbbb4c1db9c8338acba01 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 3 Oct 2023 18:56:00 -0700 Subject: [PATCH 2/7] guessprotocol: major updates adding a probability model to the criteria --- src/utils/guessprotocol.cpp | 235 +++++++++++++++++++++++------------- 1 file changed, 148 insertions(+), 87 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index 4a91faf8..f1d42c97 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -16,61 +16,111 @@ * General Public License for more details. */ -#include -#include -#include -#include -#include +#include + #include -#include #include +#include +#include +#include #include +#include +#include +#include #include "OptionParser.hpp" -#include "smithlab_utils.hpp" +#include "numerical_utils.hpp" #include "smithlab_os.hpp" +#include "smithlab_utils.hpp" -using std::string; -using std::vector; -using std::cout; +using std::array; using std::cerr; +using std::cout; using std::endl; using std::min; using std::runtime_error; -using std::array; +using std::string; +using std::vector; -static string -guess_protocol(const double fraction_t_rich) { - if (fraction_t_rich >= 0.8) { - return "original"; - } - if (fraction_t_rich <= 0.2) { - return "pbat"; +using bamxx::bgzf_file; + +constexpr int nuc_to_idx[] = { + /* 0*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 16*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 32*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 48*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 64*/ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + /* 80*/ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 96*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /*112*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /*128*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +struct nucleotide_model { + vector pr{}; + vector lpr{}; + double bisulfite_conversion_rate{}; + bool is_t_rich{}; + + nucleotide_model(const vector &bc, const double conv_rate, + const bool itr) + : pr{bc}, bisulfite_conversion_rate{conv_rate}, is_t_rich{itr} { + auto nuc_from = is_t_rich ? 1 : 2; + auto nuc_to = is_t_rich ? 3 : 0; + pr[nuc_to] += bisulfite_conversion_rate * pr[nuc_from]; + pr[nuc_from] *= (1.0 - bisulfite_conversion_rate); + assert(reduce(cbegin(pr), cend(pr), 0.0) == 1.0); + lpr.resize(std::size(pr)); + transform(cbegin(pr), cend(pr), begin(lpr), + [](const double x) { return log(x); }); } - if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6) { - return "random"; + + double operator()(const string &s) const { + return accumulate( + cbegin(s), cend(s), 0.0, + [&](const double x, const char c) { return x + lpr[nuc_to_idx[c]]; }); + }; + + string tostring() const { + std::ostringstream oss; + oss << "pr:\n"; + for (auto i : pr) oss << i << '\n'; + oss << "log pr:\n"; + for (auto i : lpr) oss << i << '\n'; + oss << bisulfite_conversion_rate << '\n' << is_t_rich; + return oss.str(); } - return "inconclusive"; -} +}; struct guessprotocol_summary { string protocol; string layout; - uint64_t n_t_rich_reads{}; + double n_reads_wgbs{}; uint64_t n_reads{}; - double fraction_t_rich_reads{}; + double wgbs_fraction{}; void evaluate() { - fraction_t_rich_reads = static_cast(n_t_rich_reads)/n_reads; - protocol = guess_protocol(fraction_t_rich_reads); + const auto frac = n_reads_wgbs / n_reads; + protocol = "inconclusive"; + if (frac > 0.8) protocol = "wgbs"; + if (frac < 0.2) protocol = "pbat"; + if (frac > 0.4 && frac < 0.6) protocol = "rpbat"; + wgbs_fraction = frac; } string tostring() const { std::ostringstream oss; oss << "protocol: " << protocol << '\n' - << "fraction_t_rich_reads: " << fraction_t_rich_reads << '\n' - << "t_rich_reads: " << n_t_rich_reads << '\n' - << "reads_examined: " << n_reads; + << "wgbs_fraction: " << wgbs_fraction << '\n' + << "n_reads_wgbs: " << n_reads_wgbs << '\n' + << "n_reads: " << n_reads; return oss.str(); } }; @@ -92,13 +142,14 @@ mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends } // Read 4 lines one time from fastq and fill in the FASTQRecord structure -static std::istream& -operator>>(std::istream& s, FASTQRecord &r) { +static bgzf_file & +operator>>(bgzf_file &s, FASTQRecord &r) { constexpr auto n_error_codes = 5u; + enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual }; + static const array error_msg = { - runtime_error(""), - runtime_error("failed to parse fastq name line"), + runtime_error(""), runtime_error("failed to parse fastq name line"), runtime_error("failed to parse fastq sequence line"), runtime_error("failed to parse fastq plus line"), runtime_error("failed to parse fastq qual line") @@ -106,27 +157,22 @@ operator>>(std::istream& s, FASTQRecord &r) { err_code ec = err_code::none; - getline(s, r.name); + if (!getline(s, r.name)) return s; - if (r.name.empty() || r.name[0] != '@') - ec = err_code::bad_name; + if (r.name.empty() || r.name[0] != '@') ec = err_code::bad_name; - r.name.resize(r.name.find_first_of(' ')); - const auto nm_sz = r.name.find_first_of(' '); - copy(cbegin(r.name) + 1, cbegin(r.name) + nm_sz, begin(r.name)); + const auto nm_end = r.name.find_first_of(" \t"); + const auto nm_sz = (nm_end == string::npos ? r.name.size() : nm_end) - 1; + r.name.erase(copy_n(cbegin(r.name) + 1, nm_sz, begin(r.name)), cend(r.name)); - if (!getline(s, r.seq)) - ec = err_code::bad_seq; + if (!getline(s, r.seq)) ec = err_code::bad_seq; string tmp; - if (!getline(s, tmp)) - ec = err_code::bad_plus; + if (!getline(s, tmp)) ec = err_code::bad_plus; - if (!getline(s, tmp)) - ec = err_code::bad_qual; + if (!getline(s, tmp)) ec = err_code::bad_qual; - if (ec != err_code::none) - throw error_msg[ec]; + if (ec != err_code::none) throw error_msg[ec]; return s; } @@ -136,11 +182,16 @@ main_guessprotocol(int argc, const char **argv) { try { + static const vector human_base_comp = {0.2, 0.3, 0.3, 0.2}; + static const vector flat_base_comp = {0.25, 0.25, 0.25, 0.25}; + constexpr auto description = "guess bisulfite protocol for a library"; + bool verbose; string outfile; size_t reads_to_check = 1000000; size_t name_suffix_len = 0; + double bisulfite_conversion_rate = 0.98; namespace fs = std::filesystem; const string cmd_name = std::filesystem::path(argv[0]).filename(); @@ -152,7 +203,12 @@ main_guessprotocol(int argc, const char **argv) { false, reads_to_check); opt_parse.add_opt("ignore", 'i', "length of read name suffix " "to ignore when matching", false, name_suffix_len); + opt_parse.add_opt("bisulfite", 'b', "bisulfite conversion rate", + false, bisulfite_conversion_rate); opt_parse.add_opt("output", 'o', "output file name", false, outfile); + opt_parse.add_opt("verbose", 'v', + "report available information during the run", + false, verbose); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { @@ -171,72 +227,77 @@ main_guessprotocol(int argc, const char **argv) { const vector reads_files(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ + auto base_comp = human_base_comp; + nucleotide_model t_rich_model(base_comp, bisulfite_conversion_rate, true); + nucleotide_model a_rich_model(base_comp, bisulfite_conversion_rate, false); + guessprotocol_summary summary; + summary.layout = reads_files.size() == 2 ? "paired" : "single"; + + if (verbose) { + if (reads_files.size() == 2) + cerr << "data layout: " + << "paired" << '\n' + << "read1 file: " << reads_files.front() << '\n' + << "read2 file: " << reads_files.back() << '\n'; + else + cerr << "data layout: " + << "single" << '\n' + << "read file: " << reads_files.front() << '\n'; + cerr << "reads to check: " << reads_to_check << '\n' + << "read name suffix length: " << name_suffix_len << '\n' + << "bisulfite conversion: " << bisulfite_conversion_rate << '\n'; + } + if (reads_files.size() == 2) { - summary.layout = "paired"; // input: paired-end reads with end1 and end2 - std::ifstream in1(reads_files.front()); + bgzf_file in1(reads_files.front(), "r"); if (!in1) - throw runtime_error("cannot open input file: " + reads_files.front()); + throw runtime_error("cannot open file: " + reads_files.front()); - std::ifstream in2(reads_files.back()); + bgzf_file in2(reads_files.back(), "r"); if (!in2) - throw runtime_error("cannot open input file: " + reads_files.back()); + throw runtime_error("cannot open file: " + reads_files.back()); - FASTQRecord end_one, end_two; - while (in1 >> end_one && in2 >> end_two && - summary.n_reads < reads_to_check) { + FASTQRecord r1, r2; + while (in1 >> r1 && in2 >> r2 && summary.n_reads < reads_to_check) { summary.n_reads++; - // two reads should be in paired-ends - if (!mates(name_suffix_len, end_one, end_two)) - throw runtime_error("expected mates, got: " + - end_one.name + " and " + end_two.name); - - const double end_one_a = - count(begin(end_one.seq), end(end_one.seq), 'A') + - count(begin(end_one.seq), end(end_one.seq), 'C'); - const double end_one_t = - count(begin(end_one.seq), end(end_one.seq), 'T') + - count(begin(end_one.seq), end(end_one.seq), 'G'); - - const double end_two_a = - count(begin(end_two.seq), end(end_two.seq), 'A') + - count(begin(end_two.seq), end(end_two.seq), 'C'); - const double end_two_t = - count(begin(end_two.seq), end(end_two.seq), 'T') + - count(begin(end_two.seq), end(end_two.seq), 'G'); - - const double t_rich_count = (end_one_t + end_two_a); - const double pbat_count = (end_one_a + end_two_t); - - summary.n_t_rich_reads += (t_rich_count > pbat_count); + if (!mates(name_suffix_len, r1, r2)) + throw runtime_error("expected mates: " + r1.name + ", " + r2.name); + + const double ta = t_rich_model(r1.seq) + a_rich_model(r2.seq); + const double at = a_rich_model(r1.seq) + t_rich_model(r2.seq); + + const auto prob_read1_t_rich = exp(ta - log_sum_log(ta, at)); + summary.n_reads_wgbs += prob_read1_t_rich; } } else { - summary.layout = "single"; // input: single-end reads - std::ifstream in(reads_files.front()); + bgzf_file in(reads_files.front(), "r"); if (!in) - throw runtime_error("cannot open input file: " + reads_files.front()); + throw runtime_error("cannot open file: " + reads_files.front()); FASTQRecord r; while (in >> r && summary.n_reads < reads_to_check) { summary.n_reads++; - const double a = (count(begin(r.seq), end(r.seq), 'A') + - count(begin(r.seq), end(r.seq), 'C')); - const double t = (count(begin(r.seq), end(r.seq), 'T') + - count(begin(r.seq), end(r.seq), 'G')); - summary.n_t_rich_reads += (t > a); + + const double t = t_rich_model(r.seq); + const double a = a_rich_model(r.seq); + + const auto prob_t_rich = exp(t - log_sum_log(t, a)); + summary.n_reads_wgbs += prob_t_rich; } } summary.evaluate(); + if (!outfile.empty()) { std::ofstream out(outfile); - if (!out) throw runtime_error("failed to open output file: " + outfile); + if (!out) throw runtime_error("failed to open: " + outfile); out << summary.tostring() << endl; } else cout << summary.tostring() << endl; From 26319e2d98404cf602f5fdc800075b68288aedcb Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 3 Oct 2023 19:26:40 -0700 Subject: [PATCH 3/7] guessprotocol: adding the option to use human genome base composition in model --- src/utils/guessprotocol.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index f1d42c97..2112fbf0 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -182,12 +182,13 @@ main_guessprotocol(int argc, const char **argv) { try { - static const vector human_base_comp = {0.2, 0.3, 0.3, 0.2}; + static const vector human_base_comp = {0.295, 0.205, 0.205, 0.295}; static const vector flat_base_comp = {0.25, 0.25, 0.25, 0.25}; constexpr auto description = "guess bisulfite protocol for a library"; bool verbose; + bool use_human; string outfile; size_t reads_to_check = 1000000; size_t name_suffix_len = 0; @@ -205,6 +206,8 @@ main_guessprotocol(int argc, const char **argv) { "to ignore when matching", false, name_suffix_len); opt_parse.add_opt("bisulfite", 'b', "bisulfite conversion rate", false, bisulfite_conversion_rate); + opt_parse.add_opt("human", 'H', "assume human genome", + false, use_human); opt_parse.add_opt("output", 'o', "output file name", false, outfile); opt_parse.add_opt("verbose", 'v', "report available information during the run", @@ -227,7 +230,9 @@ main_guessprotocol(int argc, const char **argv) { const vector reads_files(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ - auto base_comp = human_base_comp; + auto base_comp = flat_base_comp; + if (use_human) base_comp = human_base_comp; + nucleotide_model t_rich_model(base_comp, bisulfite_conversion_rate, true); nucleotide_model a_rich_model(base_comp, bisulfite_conversion_rate, false); From ec041aea64bd52e99c9580856a6b2b41ee2f43f7 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 3 Oct 2023 19:43:40 -0700 Subject: [PATCH 4/7] guessprotocol: adding an include for std::array; left out as some compilers seem to have it through another header --- src/utils/guessprotocol.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index 2112fbf0..d20e714b 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -1,7 +1,7 @@ /* guessprotocol: a program for guessing whether a wgbs protocol is - * original, pbat or random pbat + * wgbs, pbat or random pbat * - * Copyright (C) 2019-2023 + * Copyright (C) 2019-2023 Andrew D. Smith * * Authors: Andrew D. Smith * @@ -27,6 +27,7 @@ #include #include #include +#include #include "OptionParser.hpp" #include "numerical_utils.hpp" @@ -83,9 +84,11 @@ struct nucleotide_model { } double operator()(const string &s) const { - return accumulate( - cbegin(s), cend(s), 0.0, - [&](const double x, const char c) { return x + lpr[nuc_to_idx[c]]; }); + return accumulate(cbegin(s), cend(s), 0.0, + [&](const double x, const char c) { + const auto i = nuc_to_idx[c]; + return i == 4 ? x : x + lpr[i]; + }); }; string tostring() const { From 974329677b1df785b283e2f172536474bde794c3 Mon Sep 17 00:00:00 2001 From: masarunakajima Date: Wed, 4 Oct 2023 12:21:00 -0700 Subject: [PATCH 5/7] added documentation for the summary parameters --- src/utils/guessprotocol.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index d20e714b..e3d9fe48 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -103,10 +103,18 @@ struct nucleotide_model { }; struct guessprotocol_summary { + // protocol is the guessed protocol (wgbs, pbat, rpbat, or inconclusive) + // based on the content of the reads. string protocol; + // layout indicates whether the reads are paired or single-ended. string layout; + // n_reads_wgbs is the average number of reads (for single-ended reads) or + // read pairs (for paired reads) where read1 is T-rich. double n_reads_wgbs{}; + // n_reads is the number of evaluated reads or read pairs. uint64_t n_reads{}; + // wgbs_fraction is the probability that a read (for single-ended reads) or + // the read1 of a read pair (for paired reads) is T-rich. double wgbs_fraction{}; void evaluate() { From d542ccbb21965b7cb781e4ca6d059bd61662e543 Mon Sep 17 00:00:00 2001 From: masarunakajima Date: Wed, 4 Oct 2023 14:01:27 -0700 Subject: [PATCH 6/7] updated protocol guessing criteria and added confidence field --- src/utils/guessprotocol.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index e3d9fe48..df87a70c 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -86,7 +86,7 @@ struct nucleotide_model { double operator()(const string &s) const { return accumulate(cbegin(s), cend(s), 0.0, [&](const double x, const char c) { - const auto i = nuc_to_idx[c]; + const auto i = nuc_to_idx[static_cast(c)]; return i == 4 ? x : x + lpr[i]; }); }; @@ -106,6 +106,9 @@ struct guessprotocol_summary { // protocol is the guessed protocol (wgbs, pbat, rpbat, or inconclusive) // based on the content of the reads. string protocol; + // confidence indicates the level of confidence in the guess for the + // protocol. + string confidence; // layout indicates whether the reads are paired or single-ended. string layout; // n_reads_wgbs is the average number of reads (for single-ended reads) or @@ -118,17 +121,21 @@ struct guessprotocol_summary { double wgbs_fraction{}; void evaluate() { - const auto frac = n_reads_wgbs / n_reads; - protocol = "inconclusive"; - if (frac > 0.8) protocol = "wgbs"; - if (frac < 0.2) protocol = "pbat"; - if (frac > 0.4 && frac < 0.6) protocol = "rpbat"; + const auto frac = n_reads_wgbs / std::max(1ul, n_reads); + if (frac <= 0.01) {protocol = "pbat"; confidence = "high";} + else if (frac <= 0.1) {protocol = "pbat"; confidence = "low";} + else if (frac <= 0.2) {protocol = "rpbat"; confidence = "low";} + else if (frac <= 0.8) {protocol = "rpbat"; confidence = "high";} + else if (frac <= 0.9) {protocol = "rpbat"; confidence = "low";} + else if (frac <= 0.99) {protocol = "wgbs"; confidence = "low";} + else {protocol = "wgbs"; confidence = "high";} wgbs_fraction = frac; } string tostring() const { std::ostringstream oss; oss << "protocol: " << protocol << '\n' + << "confidence: " << confidence << '\n' << "wgbs_fraction: " << wgbs_fraction << '\n' << "n_reads_wgbs: " << n_reads_wgbs << '\n' << "n_reads: " << n_reads; From 11483a976dc2432d62f9b18e80bd116d0d05e1be Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Wed, 4 Oct 2023 14:25:43 -0700 Subject: [PATCH 7/7] guessprotocol: updating the decision critiera and adding confidence output --- src/utils/guessprotocol.cpp | 42 ++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index d20e714b..d7f68584 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -103,7 +103,15 @@ struct nucleotide_model { }; struct guessprotocol_summary { + static constexpr auto wgbs_cutoff_confident = 0.99; + static constexpr auto wgbs_cutoff_unconfident = 0.9; + static constexpr auto rpbat_cutoff_confident_high = 0.8; + static constexpr auto rpbat_cutoff_confident_low = 0.2; + static constexpr auto pbat_cutoff_unconfident = 0.1; + static constexpr auto pbat_cutoff_confident = 0.01; + string protocol; + string confidence; string layout; double n_reads_wgbs{}; uint64_t n_reads{}; @@ -112,15 +120,43 @@ struct guessprotocol_summary { void evaluate() { const auto frac = n_reads_wgbs / n_reads; protocol = "inconclusive"; - if (frac > 0.8) protocol = "wgbs"; - if (frac < 0.2) protocol = "pbat"; - if (frac > 0.4 && frac < 0.6) protocol = "rpbat"; + + // assigning wgbs (near one) + if (frac > wgbs_cutoff_confident) { + protocol = "wgbs"; + confidence = "high"; + } + else if (frac > wgbs_cutoff_unconfident) { + protocol = "wgbs"; + confidence = "low"; + } + // assigning pbat (near zero) + else if (frac < pbat_cutoff_confident) { + protocol = "pbat"; + confidence = "high"; + } + else if (frac < pbat_cutoff_unconfident) { + protocol = "pbat"; + confidence = "low"; + } + // assigning rpbat (towards middle) + else if (frac > rpbat_cutoff_confident_low && + frac < rpbat_cutoff_confident_high) { + protocol = "rpbat"; + confidence = "high"; + } + else { + protocol = "rpbat"; + confidence = "low"; + } + wgbs_fraction = frac; } string tostring() const { std::ostringstream oss; oss << "protocol: " << protocol << '\n' + << "confidence: " << confidence << '\n' << "wgbs_fraction: " << wgbs_fraction << '\n' << "n_reads_wgbs: " << n_reads_wgbs << '\n' << "n_reads: " << n_reads;