Skip to content

Commit

Permalink
Rework truncation logic. Realloc memory if needed to avoid truncation…
Browse files Browse the repository at this point in the history
… of results.
  • Loading branch information
martin-steinegger committed Dec 20, 2022
1 parent 7b95387 commit ed4c55f
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 19 deletions.
7 changes: 7 additions & 0 deletions src/prefiltering/CacheFriendlyOperations.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ struct __attribute__((__packed__)) CounterResult {
return false;
}

static bool sortScore(const CounterResult &first, const CounterResult &second) {
if (first.count > second.count)
return true;
if (second.count > first.count)
return false;
return false;
}
};

template<unsigned int BINSIZE>
Expand Down
10 changes: 4 additions & 6 deletions src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,6 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
size_t resSize = 0;
size_t realResSize = 0;
size_t diagonalOverflow = 0;
size_t trancatedCounter = 0;
size_t totalQueryDBSize = querySize;

size_t localThreads = 1;
Expand Down Expand Up @@ -809,7 +808,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
std::string result;
result.reserve(1000000);

#pragma omp for schedule(dynamic, 1) reduction (+: kmersPerPos, resSize, dbMatches, doubleMatches, querySeqLenSum, diagonalOverflow, trancatedCounter)
#pragma omp for schedule(dynamic, 1) reduction (+: kmersPerPos, resSize, dbMatches, doubleMatches, querySeqLenSum, diagonalOverflow)
for (size_t id = queryFrom; id < queryFrom + querySize; id++) {
progress.updateProgress();
// get query sequence
Expand Down Expand Up @@ -875,7 +874,6 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
doubleMatches += matcher.getStatistics()->doubleMatches;
querySeqLenSum += seq.L;
diagonalOverflow += matcher.getStatistics()->diagonalOverflow;
trancatedCounter += matcher.getStatistics()->truncated;
resSize += resultSize;
realResSize += std::min(resultSize, maxResListLen);
reslens[thread_idx]->emplace_back(resultSize);
Expand All @@ -888,7 +886,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
dbMatches / totalQueryDBSize,
doubleMatches / totalQueryDBSize,
querySeqLenSum, diagonalOverflow,
resSize / totalQueryDBSize, trancatedCounter);
resSize / totalQueryDBSize);

size_t empty = 0;
for (size_t id = 0; id < querySize; id++) {
Expand Down Expand Up @@ -958,7 +956,6 @@ void Prefiltering::printStatistics(const statistics_t &stats, std::list<int> **r
Debug(Debug::INFO) << "\n" << stats.kmersPerPos << " k-mers per position\n";
Debug(Debug::INFO) << stats.dbMatches << " DB matches per sequence\n";
Debug(Debug::INFO) << stats.diagonalOverflow << " overflows\n";
Debug(Debug::INFO) << stats.truncated << " queries produce too many hits (truncated result)\n";
Debug(Debug::INFO) << stats.resultsPassedPrefPerSeq << " sequences passed prefiltering per query sequence";
if (stats.resultsPassedPrefPerSeq > maxResults)
Debug(Debug::WARNING) << " (ATTENTION: max. " << maxResults
Expand Down Expand Up @@ -1075,7 +1072,8 @@ size_t Prefiltering::estimateMemoryConsumption(int split, size_t dbSize, size_t
// This memory is an approx. for Countint32Array and QueryTemplateLocalFast
size_t threadSize = threads * (
(dbSizeSplit * 2 * sizeof(IndexEntryLocal)) // databaseHits in QueryMatcher
+ (dbSizeSplit * sizeof(CounterResult)) // databaseHits in QueryMatcher
+ (dbSizeSplit * 1.5 * sizeof(CounterResult)) // databaseHits in QueryMatcher
// 1.5 is a security factor
+ (maxResListLen * sizeof(hit_t))
+ (dbSizeSplit * 2 * sizeof(CounterResult) * 2) // BINS * binSize, (binSize = dbSize * 2 / BINS)
// 2 is a security factor the size can increase during run
Expand Down
35 changes: 27 additions & 8 deletions src/prefiltering/QueryMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,23 +169,33 @@ std::pair<hit_t*, size_t> QueryMatcher::matchQuery(Sequence *querySeq, unsigned
}else{
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(resultReadPos, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false);
}
stats->truncated = 0;
}else{
//Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n";
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(resultReadPos, resultSize, identityId, diagonalThr, ungappedAlignment, false);
stats->truncated = 1;
size_t resultPos = 0;
for (size_t i = 0; i < resultSize; i++) {
resultReadPos[resultPos].id = resultReadPos[i].id;
resultReadPos[resultPos].diagonal = resultReadPos[i].diagonal;
resultReadPos[resultPos].count = resultReadPos[i].count;
resultPos += (resultReadPos[i].count >= diagonalThr);
}
SORT_SERIAL(resultReadPos, resultReadPos + resultPos, CounterResult::sortScore);
queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(resultReadPos, resultPos, identityId, diagonalThr, ungappedAlignment, false);
}
}else{
unsigned int thr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery);
thr = std::max(minDiagScoreThr, thr);
if(resultSize < foundDiagonalsSize / 2) {
int elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize, thr, foundDiagonals, resultSize);
queryResult = getResult<KMER_SCORE>(foundDiagonals + resultSize, elementsCntAboveDiagonalThr, identityId, thr, ungappedAlignment, false);
stats->truncated = 0;
}else{
// Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n";
queryResult = getResult<KMER_SCORE>(foundDiagonals, resultSize, identityId, thr, ungappedAlignment, false);
stats->truncated = 1;
size_t resultPos = 0;
for (size_t i = 0; i < resultSize; i++) {
foundDiagonals[resultPos].id = foundDiagonals[i].id;
foundDiagonals[resultPos].diagonal = foundDiagonals[i].diagonal;
foundDiagonals[resultPos].count = foundDiagonals[i].count;
resultPos += (foundDiagonals[i].count >= thr);
}
SORT_SERIAL(foundDiagonals, foundDiagonals + resultPos, CounterResult::sortScore);
queryResult = getResult<KMER_SCORE>(foundDiagonals, resultPos, identityId, thr, ungappedAlignment, false);
}
}
if(queryResult.second > 1){
Expand Down Expand Up @@ -266,6 +276,15 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) {
// detected overflow while matching
if ((sequenceHits + seqListSize) >= lastSequenceHit) {
stats->diagonalOverflow = true;
// realloc foundDiagonals if only 10% of memory left
if((foundDiagonalsSize - overflowHitCount) < 0.1 * foundDiagonalsSize){
foundDiagonalsSize *= 1.5;
foundDiagonals = (CounterResult*) realloc(foundDiagonals, foundDiagonalsSize * sizeof(CounterResult));
if(foundDiagonals == NULL){
Debug(Debug::ERROR) << "Out of memory in QueryMatcher::match\n";
EXIT(EXIT_FAILURE);
}
}
// last pointer
indexPointer[current_i + 1] = sequenceHits;
//std::cout << "Overflow in i=" << indexStart << std::endl;
Expand Down
8 changes: 3 additions & 5 deletions src/prefiltering/QueryMatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,14 @@ struct statistics_t{
size_t querySeqLen;
size_t diagonalOverflow;
size_t resultsPassedPrefPerSeq;
size_t truncated;
statistics_t() : kmersPerPos(0.0) , dbMatches(0) , doubleMatches(0), querySeqLen(0), diagonalOverflow(0), resultsPassedPrefPerSeq(0), truncated(0) {};
statistics_t() : kmersPerPos(0.0) , dbMatches(0) , doubleMatches(0), querySeqLen(0), diagonalOverflow(0), resultsPassedPrefPerSeq(0) {};
statistics_t(double kmersPerPos, size_t dbMatches,
size_t doubleMatches, size_t querySeqLen, size_t diagonalOverflow, size_t resultsPassedPrefPerSeq, size_t truncated) : kmersPerPos(kmersPerPos),
size_t doubleMatches, size_t querySeqLen, size_t diagonalOverflow, size_t resultsPassedPrefPerSeq) : kmersPerPos(kmersPerPos),
dbMatches(dbMatches),
doubleMatches(doubleMatches),
querySeqLen(querySeqLen),
diagonalOverflow(diagonalOverflow),
resultsPassedPrefPerSeq(resultsPassedPrefPerSeq),
truncated(truncated){};
resultsPassedPrefPerSeq(resultsPassedPrefPerSeq){};
};

struct hit_t {
Expand Down

0 comments on commit ed4c55f

Please sign in to comment.