Skip to content

Commit

Permalink
add option --report-fdr to print FDR per match
Browse files Browse the repository at this point in the history
  • Loading branch information
RuoshiZhang committed Dec 9, 2021
1 parent 125affa commit 1cd3047
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 8 deletions.
6 changes: 6 additions & 0 deletions src/commons/LocalParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class LocalParameters : public Parameters {
PARAMETER(PARAM_REVERSE_SETDB)
PARAMETER(PARAM_EXTRACTORF_SPACER)
PARAMETER(PARAM_FDR_CUTOFF)
PARAMETER(PARAM_REPORT_FDR)
PARAMETER(PARAM_TAX_FDR_CUTOFF)
PARAMETER(PARAM_FORMAT_TYPE)
PARAMETER(PARAM_REPORT_PAM)
Expand All @@ -51,6 +52,7 @@ class LocalParameters : public Parameters {
int extractorfsSpacer;
int reverseSetDb;
float fdrCutoff;
int reportFdr;
float taxFdrCutoff;
int formatType;
int reportPam;
Expand All @@ -66,6 +68,7 @@ class LocalParameters : public Parameters {
PARAM_REVERSE_SETDB(PARAM_REVERSE_SETDB_ID,"--reverse-setdb", "Create reversed setdb", "Create additional setDB with reversed fragments to compute under null [0,1]", typeid(int), (void *) &reverseSetDb, "^[0-1]{1}$"),
PARAM_EXTRACTORF_SPACER(PARAM_EXTRACTORF_SPACER_ID,"--extractorf-spacer", "Extract orfs from spacers", "change parameter settings for extractorfs when createsetdb for spacer db [0,1]", typeid(int), (void *) &extractorfsSpacer, "^[0-1]{1}$"),
PARAM_FDR_CUTOFF(PARAM_FDR_CUTOFF_ID,"--fdr", "FDR cutoff", "FDR cutoff for filtering matches [0.0, 1.0]", typeid(float), (void *) &fdrCutoff, "^0(\\.[0-9]+)?|1(\\.0+)?$"),
PARAM_REPORT_FDR(PARAM_REPORT_FDR_ID,"--report-fdr", "Report FDR", "Report FDR of matches", typeid(int), (void *) &reportFdr, "^[0-1]{1}$"),
PARAM_TAX_FDR_CUTOFF(PARAM_TAX_FDR_CUTOFF_ID,"--tax-fdr", "Taxonomy FDR cutoff", "FDR cutoff for taxonomy report [0.0, 1.0]", typeid(float), (void *) &taxFdrCutoff, "^0(\\.[0-9]+)?|1(\\.0+)?$"),
PARAM_FORMAT_TYPE(PARAM_FORMAT_TYPE_ID,"--fmt", "Output format", "0: short (only matches)\n1: long (matches and hits)\n2: long with nucleotide alignment", typeid(int), (void *) &formatType, "^[0-2]{1}$"),
PARAM_REPORT_PAM(PARAM_REPORT_PAM_ID,"--report-pam", "Report PAM", "Report protospacer adjacent motifs up and downstream of hits", typeid(int), (void *) &reportPam, "^[0-1]{1}$"),
Expand All @@ -83,11 +86,13 @@ class LocalParameters : public Parameters {
downloaddb.push_back(&PARAM_V);

filtermatchbyfdr.push_back(&PARAM_FDR_CUTOFF);
filtermatchbyfdr.push_back(&PARAM_REPORT_FDR);
filtermatchbyfdr.push_back(&PARAM_COMPRESSED);
filtermatchbyfdr.push_back(&PARAM_THREADS);
filtermatchbyfdr.push_back(&PARAM_V);

summarizeresults.push_back(&PARAM_FORMAT_TYPE);
summarizeresults.push_back(&PARAM_REPORT_FDR);
summarizeresults.push_back(&PARAM_LCA_RANKS);
summarizeresults.push_back(&PARAM_TAXON_ADD_LINEAGE);
summarizeresults.push_back(&PARAM_COMPRESSED);
Expand Down Expand Up @@ -150,6 +155,7 @@ class LocalParameters : public Parameters {
extractorfsSpacer = 0;
reverseSetDb = 1;
fdrCutoff = 0.05;
reportFdr = 0;
taxFdrCutoff = 0.02;
formatType = 1;
reportPam = 1;
Expand Down
2 changes: 1 addition & 1 deletion src/spacepharer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
const char* binary_name = "spacepharer";
const char* tool_name = "SpacePHARER";
const char* tool_introduction =
"SpacePHARER (CRISPR Spacer Phage-Host Pair Finder) is a sensitive and fast tool for de novo prediction of phage-host relationships via identifying phage genomes that match CRISPR spacers in genomic or metagenomic data.\n\nPlease cite: R. Zhang et al. SpacePHARER: Sensitive identification of phages from CRISPR spacers in prokaryotic hosts. biorxiv, doi:10.1101/2020.05.15.090266 (2020).";
"SpacePHARER (CRISPR Spacer Phage-Host Pair Finder) is a sensitive and fast tool for de novo prediction of phage-host relationships via identifying phage genomes that match CRISPR spacers in genomic or metagenomic data.\n\nPlease cite: R. Zhang et al. SpacePHARER: Sensitive identification of phages from CRISPR spacers in prokaryotic hosts. Bioinformatics, doi:10.1093/bioinformatics/btab222 (2021).";
const char* main_author = "Ruoshi Zhang <ruoshi.zhang@mpibpc.mpg.de>";
const char* show_extended_help = "1";
const char* show_bash_info = NULL;
Expand Down
56 changes: 50 additions & 6 deletions src/util/FilterMatchbyFdr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ int filtermatchbyfdr(int argc, const char **argv, const Command& command) {
negScoreDb.open(DBReader<unsigned int>::LINEAR_ACCCESS);

double threshold = 0;
std::vector<double> fdrList;
std::vector<double> uniqueScoreList;
if (negScoreDb.getSize() > 0) {
progress.reset(negScoreDb.getSize());
std::vector<double> negToSort;
Expand Down Expand Up @@ -93,7 +95,7 @@ int filtermatchbyfdr(int argc, const char **argv, const Command& command) {
SORT_PARALLEL(negToSort.rbegin(), negToSort.rend());

//cumsum for TP+FP(pos_counter) and FP(neg_counter) and x = FP/nNeg, y = (TP + FP)/nPos
std::vector<double> uniqueScoreList;

std::vector<double> x;
std::vector<double> y;
size_t cnt = 0;
Expand Down Expand Up @@ -152,9 +154,14 @@ int filtermatchbyfdr(int argc, const char **argv, const Command& command) {
}
}

if (i < 2) {

if (i < 2){
if(par.fdrCutoff < 1){
Debug(Debug::WARNING) << "Combined score list too short. Using threshold " << threshold << "\n";
} else {
Debug(Debug::WARNING) << "FDR cutoff is set to 1. Printing all matches. \n";
}
threshold = posToSort.back();
Debug(Debug::WARNING) << "Combined score list too short. Using threshold " << threshold << "\n";
} else {
size_t j = idxList[i-2];
double TPFP = y[j];
Expand All @@ -172,8 +179,38 @@ int filtermatchbyfdr(int argc, const char **argv, const Command& command) {
Debug(Debug::INFO) << "Combined score threshold is " << threshold << " with FDR of " << par.fdrCutoff << ".\n";
Debug(Debug::INFO) << y[j]* posToSort.size() << " matches passed combined score threshold.\n";
}


if(par.reportFdr){
for (size_t j = 0; j < idxList[0]; j++){
fdrList.push_back(0.0);
}
for(size_t i = 0; i < idxList.size()-1; i++){
double TPFP = y[idxList[i]];
double FP = x[idxList[i]]* pi0;
for(size_t j = idxList[i]; j < idxList[i+1]; j++){
if (std::isinf(slopeList[i])){
fdrList.push_back(x[idxList[i]]* pi0 /y[idxList[i]]);
} else {
TPFP += (x[j] - x[j-1]) * slopeList[i];
FP += (x[j] - x[j-1]) * pi0;
fdrList.push_back(FP / TPFP);
}
}
}
}
} else {
Debug(Debug::WARNING) << "Combined score list of control set is empty\n";
Debug(Debug::WARNING) << "Combined score list of control set is empty. Printing all matches\n";
if(par.reportFdr){
for(size_t i = 0; i < posToSort.size();i++){
double s = DBL_MIN;
if(s < posToSort[i]){
uniqueScoreList.push_back(posToSort[i]);
fdrList.push_back(0.0);
s = posToSort[i];
}
}
}
threshold = posToSort.back();
}
negScoreDb.close();
Expand Down Expand Up @@ -202,10 +239,17 @@ int filtermatchbyfdr(int argc, const char **argv, const Command& command) {
EXIT(EXIT_FAILURE);
}
double score = strtod(entry[1], NULL);
const char* start = data;
//const char* start = data;
data = Util::skipLine(data);
if(score >= threshold) {
buffer.append(start, data - start);
buffer.append(entry[0], entry[3] - entry[0]);
if(par.reportFdr){
buffer.append("\t");
std::vector<double>::iterator it = std::find(uniqueScoreList.begin(),uniqueScoreList.end(), score);
int idx = std::distance(uniqueScoreList.begin(), it);
buffer.append(SSTR(fdrList[idx]));
}
buffer.append("\n");
}
}
writer.writeData(buffer.c_str(), buffer.length(), posScoreDb.getDbKey(id), thread_idx);
Expand Down
12 changes: 11 additions & 1 deletion src/util/SummarizeResults.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ int summarizeresults(int argc, const char **argv, const Command& command) {
std::string cScore;
cScore.reserve(255);

std::string fdr;
fdr.reserve(255);

std::string buffer;
buffer.reserve(1024 * 1024);

Expand Down Expand Up @@ -87,6 +90,9 @@ int summarizeresults(int argc, const char **argv, const Command& command) {
// call before getWordsOfLine or entry is overwritten
if (lineCount == 0) {
cScore.assign(entry[1], entry[2] - entry[1]);
if(par.reportFdr){
fdr.assign(entry[3], entry[4] - entry[3]);
}
}

columns = Util::getWordsOfLine(alnCurrent, entry, 255);
Expand All @@ -103,7 +109,11 @@ int summarizeresults(int argc, const char **argv, const Command& command) {
buffer.append(entry[3], entry[4] - entry[3] - 1);
buffer.append("\t");
buffer.append(cScore);
//buffer.append("\t");
if(par.reportFdr){
buffer.append(fdr);
buffer.append("\t");
}

if (t != NULL) {
taxId = Util::fast_atoi<TaxID>(entry[12]);
}
Expand Down

0 comments on commit 1cd3047

Please sign in to comment.