Skip to content
Permalink
Browse files

Fix max seq issues in prefilter

  • Loading branch information
martin-steinegger committed Feb 7, 2020
1 parent 8bcd7c7 commit 1ac1e68669284c203cc5aa26cc823f71daf39a79
@@ -243,6 +243,10 @@ void Sequence::mapSequence(size_t id, unsigned int dbKey, std::pair<const unsign
|| Parameters::isEqualDbtype( this->seqType,Parameters::DBTYPE_NUCLEOTIDES)
|| Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ)){
this->L = data.second;
if(this->L >= static_cast<int>(maxLen)){
numSequence = static_cast<unsigned char *>(realloc(numSequence, this->L+1));
maxLen = this->L;
}
memcpy(this->numSequence, data.first, this->L);
} else {
Debug(Debug::ERROR) << "Invalid sequence type!\n";
@@ -109,7 +109,8 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
generator->setDivideStrategy(s.profile_matrix);
}

unsigned int *buffer = new unsigned int[seq->getMaxLen()];
unsigned int *buffer = static_cast<unsigned int*>(malloc(seq->getMaxLen() * sizeof(unsigned int)));
unsigned int bufferSize = seq->getMaxLen();
#pragma omp for schedule(dynamic, 100) reduction(+:totalKmerCount, maskedResidues)
for (size_t id = dbFrom; id < dbTo; id++) {
progress.updateProgress();
@@ -119,7 +120,10 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
unsigned int qKey = dbr->getDbKey(id);

s.mapSequence(id - dbFrom, qKey, seqData, dbr->getSeqLen(id));

if(s.getMaxLen() >= bufferSize ){
buffer = static_cast<unsigned int*>(realloc(buffer, s.getMaxLen() * sizeof(unsigned int)));
bufferSize = seq->getMaxLen();
}
// count similar or exact k-mers based on sequence type
if (isProfile) {
// Find out if we should also mask profiles
@@ -162,7 +166,7 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
}
}

delete[] buffer;
free(buffer);

if (generator != NULL) {
delete generator;
@@ -215,8 +219,8 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
#endif
Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getSpacedKmerPattern());
Indexer idxer(static_cast<unsigned int>(indexTable->getAlphabetSize()), seq->getKmerSize());
IndexEntryLocalTmp *buffer = new IndexEntryLocalTmp[seq->getMaxLen()];

IndexEntryLocalTmp *buffer = static_cast<IndexEntryLocalTmp *>(malloc( seq->getMaxLen() * sizeof(IndexEntryLocalTmp)));
size_t bufferSize = seq->getMaxLen();
KmerGenerator *generator = NULL;
if (isProfile) {
generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr);
@@ -231,18 +235,18 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
unsigned int qKey = dbr->getDbKey(id);
if (isProfile) {
s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id));
indexTable->addSimilarSequence(&s, generator, &idxer);
indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer);
} else {
s.mapSequence(id - dbFrom, qKey, sequenceLookup->getSequence(id - dbFrom));
indexTable->addSequence(&s, &idxer, buffer, kmerThr, idScoreLookup);
indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup);
}
}

if (generator != NULL) {
delete generator;
}

delete [] buffer;
free(buffer);
}
if(idScoreLookup!=NULL){
delete[] idScoreLookup;
@@ -16,7 +16,7 @@
#include "MathUtil.h"
#include "KmerGenerator.h"
#include "Parameters.h"

#include <stdlib.h>
#include <algorithm>

// IndexEntryLocal is an entry with position and seqId for a kmer
@@ -132,17 +132,10 @@ class IndexTable {
size_t countKmer = 0;
bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) ||
Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS));
const unsigned char xIndex = s->subMat->aa2num[static_cast<int>('X')];
while(s->hasNextKmer()){
const unsigned char * kmer = s->nextKmer();
if(removeX){
int xCount = 0;
for(int pos = 0; pos < kmerSize; pos++){
xCount += (kmer[pos] == xIndex);
}
if(xCount > 0){
continue;
}
if(removeX && s->kmerContainsX()){
continue;
}
if(threshold > 0){
int score = 0;
@@ -301,8 +294,7 @@ class IndexTable {

// FUNCTIONS TO OVERWRITE
// add k-mers of the sequence to the index table
void addSimilarSequence(Sequence* s, KmerGenerator* kmerGenerator, Indexer * idxer) {
std::vector<IndexEntryLocalTmp> buffer;
void addSimilarSequence(Sequence* s, KmerGenerator* kmerGenerator, IndexEntryLocalTmp ** buffer, size_t &bufferSize, Indexer * idxer) {
// iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy)
s->resetCurrPos();
idxer->reset();
@@ -316,48 +308,47 @@ class IndexTable {
// if region got masked do not add kmer
if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0)
continue;
buffer.push_back(IndexEntryLocalTmp(kmerIdx,s->getId(), s->getCurrentPosition()));
(*buffer)[kmerPos].kmer = kmerIdx;
(*buffer)[kmerPos].seqId = s->getId();
(*buffer)[kmerPos].position_j = s->getCurrentPosition();
kmerPos++;
}
if(kmerPos >= bufferSize){
*buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2));
bufferSize = bufferSize*2;
}
}

if(kmerPos>1){
std::sort(buffer.begin(), buffer.end(), IndexEntryLocalTmp::comapreByIdAndPos);
std::sort(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
}
unsigned int prevKmer = UINT_MAX;
for(size_t pos = 0; pos < buffer.size(); pos++){
unsigned int kmerIdx = buffer[pos].kmer;
for(size_t pos = 0; pos < kmerPos; pos++){
unsigned int kmerIdx = (*buffer)[pos].kmer;
if(kmerIdx != prevKmer){
size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
IndexEntryLocal *entry = &entries[offset];
entry->seqId = buffer[pos].seqId;
entry->position_j = buffer[pos].position_j;
entry->seqId = (*buffer)[pos].seqId;
entry->position_j = (*buffer)[pos].position_j;
}
prevKmer = kmerIdx;
}
}

// add k-mers of the sequence to the index table
void addSequence (Sequence* s, Indexer * idxer,
IndexEntryLocalTmp * buffer,
IndexEntryLocalTmp ** buffer, size_t bufferSize,
int threshold, char * diagonalScore){
// iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy)
s->resetCurrPos();
idxer->reset();
size_t kmerPos = 0;
bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) ||
Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS));
const unsigned char xIndex = s->subMat->aa2num[static_cast<int>('X')];
while (s->hasNextKmer()){
const unsigned char * kmer = s->nextKmer();
if(removeX){
int xCount = 0;
for(int pos = 0; pos < kmerSize; pos++){
xCount += (kmer[pos] == xIndex);
}
if(xCount > 0){
continue;
}
if(removeX && s->kmerContainsX()){
continue;
}
if(threshold > 0) {
int score = 0;
@@ -373,24 +364,28 @@ class IndexTable {
if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0)
continue;

buffer[kmerPos].kmer = kmerIdx;
buffer[kmerPos].seqId = s->getId();
buffer[kmerPos].position_j = s->getCurrentPosition();
(*buffer)[kmerPos].kmer = kmerIdx;
(*buffer)[kmerPos].seqId = s->getId();
(*buffer)[kmerPos].position_j = s->getCurrentPosition();
kmerPos++;
if(kmerPos >= bufferSize){
*buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2));
bufferSize = bufferSize*2;
}
}

if(kmerPos>1){
std::sort(buffer, buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
std::sort(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
}

unsigned int prevKmer = UINT_MAX;
for(size_t pos = 0; pos < kmerPos; pos++){
unsigned int kmerIdx = buffer[pos].kmer;
unsigned int kmerIdx = (*buffer)[pos].kmer;
if(kmerIdx != prevKmer){
size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
IndexEntryLocal *entry = &entries[offset];
entry->seqId = buffer[pos].seqId;
entry->position_j = buffer[pos].position_j;
entry->seqId = (*buffer)[pos].seqId;
entry->position_j = (*buffer)[pos].position_j;
}
prevKmer = kmerIdx;
}
@@ -814,10 +814,9 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif
Sequence seq(maxSeqLen, querySeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern);

Sequence seq(qdbr->getMaxSeqLen(), querySeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern);
QueryMatcher matcher(indexTable, sequenceLookup, kmerSubMat, ungappedSubMat,
kmerThr, kmerSize, dbSize, maxSeqLen, maxResListLen, aaBiasCorrection,
kmerThr, kmerSize, dbSize, qdbr->getMaxSeqLen(), maxResListLen, aaBiasCorrection,
diagonalScoring, minDiagScoreThr, takeOnlyBestKmer);

if (seq.profile_matrix != NULL) {

0 comments on commit 1ac1e68

Please sign in to comment.
You can’t perform that action at this time.