Permalink
Browse files

Merge pull request #148 from RuoshiZhang/master

Fix multihit besthitperset p value and result merging
  • Loading branch information...
milot-mirdita committed Jan 10, 2019
2 parents 3c54ceb + 5a34f46 commit d5b86506a4c2520c67fb16ed71b9c4ad05a10f78
Showing with 37 additions and 31 deletions.
  1. +2 −2 src/commons/Parameters.cpp
  2. +33 −27 src/multihit/besthitperset.cpp
  3. +2 −2 src/util/mergeresultsbyset.cpp
@@ -186,7 +186,7 @@ Parameters::Parameters():
PARAM_COMPUTE_POSITIONS(PARAM_COMPUTE_POSITIONS_ID, "--compute-positions", "Compute Positions", "Add the positions of he hit on the target genome", typeid(std::string), (void*) &compPos, ""),
PARAM_TRANSITIVE_REPLACE(PARAM_TRANSITIVE_REPLACE_ID, "--transitive-replace", "Replace transitively", "Replace cluster name in a search file by all genes in this cluster", typeid(std::string), (void*) &clusterFile, ""),
// besthitperset
PARAM_SIMPLE_BEST_HIT(PARAM_SIMPLE_BEST_HIT_ID, "--simple-best-hit", "Use Simple Best Hit", "Update the e-value by the best p-value", typeid(bool), (void*) &simpleBestHit, ""),
PARAM_SIMPLE_BEST_HIT(PARAM_SIMPLE_BEST_HIT_ID, "--simple-best-hit", "Use Simple Best Hit", "Update the p-value by a single best hit, or by best and second best hits", typeid(bool), (void*) &simpleBestHit, ""),
PARAM_ALPHA(PARAM_ALPHA_ID, "--alpha", "Alpha", "Set alpha for combining p-values during aggregation", typeid(float), (void*) &alpha, ""),
PARAM_SHORT_OUTPUT(PARAM_SHORT_OUTPUT_ID, "--short-output", "Short output", "The output database will contain only the spread p-value", typeid(bool), (void*) &shortOutput, ""),
// concatdb
@@ -1533,7 +1533,7 @@ void Parameters::setDefaults() {


//besthitperset
simpleBestHit = false;
simpleBestHit = true;
alpha = 1;
shortOutput = false;

@@ -30,49 +30,55 @@ public :
std::string buffer;
buffer.reserve(1024);

double bestScore = 0;
double secondBestScore = 0;
double bestScore = -DBL_MAX;
double secondBestScore = -DBL_MAX;
double bestEval = DBL_MAX;

double correctedPval = 0;

// Look for the lowest p-value and retain only this line
// dataToAggregate = [nbrTargetGene][Field of result]
size_t targetId = targetSizeReader->getId(targetSetKey);
if (targetId == UINT_MAX) {
Debug(Debug::ERROR) << "Invalid target size database key " << targetSetKey << ".\n";
EXIT(EXIT_FAILURE);
}
char *data = targetSizeReader->getData(targetId, thread_idx);
unsigned int nbrGenes = Util::fast_atoi<unsigned int>(data);

std::vector<std::string> *bestEntry;
for (size_t i = 0; i < dataToAggregate.size(); i++) {
double score = strtod(dataToAggregate[i][1].c_str(), NULL);
double eval = strtod(dataToAggregate[i][3].c_str(), NULL);

if (score > bestScore) {
secondBestScore = bestScore;
bestScore = score;
if (simpleBestHitMode == false) {
bestEntry = &dataToAggregate[i];
}
double pval = eval/nbrGenes;
//prevent log(0)
if (pval == 0) {
pval = DBL_MIN;
}

if (simpleBestHitMode == true && bestEval > eval) {
double score = -log(pval);

//if only one hit use simple best hit
if(simpleBestHitMode ||dataToAggregate.size() < 2) {
bestEval = eval;
bestEntry = &dataToAggregate[i];
}
else {
if (score >= bestScore) {
secondBestScore = bestScore;
bestScore = score;
bestEntry = &dataToAggregate[i];
}
else if (score > secondBestScore) {
secondBestScore = score;
}
}
}

size_t targetId = targetSizeReader->getId(targetSetKey);
if (targetId == UINT_MAX) {
Debug(Debug::ERROR) << "Invalid target size database key " << targetSetKey << ".\n";
EXIT(EXIT_FAILURE);
}
char *data = targetSizeReader->getData(targetId, thread_idx);
unsigned int nbrGenes = Util::fast_atoi<unsigned int>(data);

if (simpleBestHitMode) {
correctedPval = bestEval / nbrGenes;
} else {
// if no second hit is available, update pvalue with fake hit
if (dataToAggregate.size() < 2) {
secondBestScore = 2.0 * log((nbrGenes + 1) / 2) / log(2.0);
}
correctedPval = pow(2.0, secondBestScore / 2 - bestScore / 2);
if (simpleBestHitMode ||dataToAggregate.size() < 2) {
correctedPval = 1 - exp(-bestEval);
}
else {
correctedPval = exp(secondBestScore - bestScore);
}

// Aggregate the full line into string
@@ -44,9 +44,9 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) {
data = Util::skipLine(data);
}
dbw.writeData(buffer.c_str(), buffer.length(), setReader.getDbKey(i), thread_idx);
buffer.clear();
}
buffer.clear();
};
}
dbw.close();
resultReader.close();
setReader.close();

0 comments on commit d5b8650

Please sign in to comment.