Skip to content
Permalink
Browse files

DBReader::getAminoAcidDBSize returns the actual number of sequence re…

…sidues or profile columns.

Previously it would add 2 bytes per entry for sequences (\n\0) or for 22 additional bytes per column for profiles (and another \0 at the end).

This only has an effect on the E-value computation of profile searches that get swapped (slice search and target profile search), iterative profile search are not affected by the 22 additonal bytes.
  • Loading branch information...
milot-mirdita committed Mar 13, 2019
1 parent 6cbc8a7 commit 0e1e001de3c87ff6430b22bb4bfa28314c8f8aa1
@@ -206,7 +206,7 @@ void Alignment::run(const unsigned int mpiRank, const unsigned int mpiNumProc,

size_t dbFrom = 0;
size_t dbSize = 0;
Util::decomposeDomainByAminoAcid(prefdbr->getAminoAcidDBSize(), prefdbr->getSeqLens(),
Util::decomposeDomainByAminoAcid(prefdbr->getDataSize(), prefdbr->getSeqLens(),
prefdbr->getSize(), mpiRank, mpiNumProc, &dbFrom, &dbSize);

Debug(Debug::INFO) << "Compute split from " << dbFrom << " to " << (dbFrom + dbSize) << "\n";
@@ -22,16 +22,16 @@ template <typename T>
DBReader<T>::DBReader(const char* dataFileName_, const char* indexFileName_, int threads, int dataMode) :
threads(threads), dataMode(dataMode), dataFileName(strdup(dataFileName_)),
indexFileName(strdup(indexFileName_)), size(0), dataFiles(NULL), dataSizeOffset(NULL), dataFileCnt(0),
totalDataSize(0), aaDbSize(0), lastKey(T()), closed(1), dbtype(Parameters::DBTYPE_GENERIC_DB),
totalDataSize(0), dataSize(0), lastKey(T()), closed(1), dbtype(Parameters::DBTYPE_GENERIC_DB),
compressedBuffers(NULL), compressedBufferSizes(NULL), index(NULL), seqLens(NULL), id2local(NULL), local2id(NULL),
dataMapped(false), accessType(0), externalData(false), didMlock(false)
{}

template <typename T>
DBReader<T>::DBReader(DBReader<T>::Index *index, unsigned int *seqLens, size_t size, size_t aaDbSize, T lastKey,
DBReader<T>::DBReader(DBReader<T>::Index *index, unsigned int *seqLens, size_t size, size_t dataSize, T lastKey,
int dbType, unsigned int maxSeqLen, int threads) :
threads(threads), dataMode(USE_INDEX), dataFileName(NULL), indexFileName(NULL),
size(size), dataFiles(NULL), dataSizeOffset(NULL), dataFileCnt(0), totalDataSize(0), aaDbSize(aaDbSize), lastKey(lastKey),
size(size), dataFiles(NULL), dataSizeOffset(NULL), dataFileCnt(0), totalDataSize(0), dataSize(dataSize), lastKey(lastKey),
maxSeqLen(maxSeqLen), closed(1), dbtype(dbType), compressedBuffers(NULL), compressedBufferSizes(NULL), index(index), sortedByOffset(true),
seqLens(seqLens), id2local(NULL), local2id(NULL), dataMapped(false), accessType(NOSORT), externalData(true), didMlock(false)
{}
@@ -73,7 +73,6 @@ void DBReader<T>::mlock(){
}
}


template <typename T>
void DBReader<T>::printMagicNumber(){
Debug(Debug::INFO) << magicBytes << "\n";
@@ -97,9 +96,6 @@ template <typename T> DBReader<T>::~DBReader(){
}
}




template <typename T> bool DBReader<T>::open(int accessType){
// count the number of entries
this->accessType = accessType;
@@ -168,10 +164,10 @@ template <typename T> bool DBReader<T>::open(int accessType){
}

// init seq lens array and dbKey mapping
aaDbSize = 0;
dataSize = 0;
for (size_t i = 0; i < size; i++){
unsigned int size = seqLens[i];
aaDbSize += size;
dataSize += size;
}
}

@@ -489,6 +485,17 @@ template <typename T> char* DBReader<T>::getDataCompressed(size_t id, int thrIdx
return compressedBuffers[thrIdx];
}

template <typename T> size_t DBReader<T>::getAminoAcidDBSize() {
checkClosed();
if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
// Get the actual profile column without the null byte per entry
return (dataSize / Sequence::PROFILE_READIN_SIZE) - size;
} else {
// Get the actual number of residues witout \n\0 per entry
return dataSize - (2 * size);
}
}

template <typename T> char* DBReader<T>::getData(size_t id, int thrIdx){
if(compression == COMPRESSED){
return getDataCompressed(id, thrIdx);
@@ -722,7 +729,7 @@ template <typename T> size_t DBReader<T>::getDataOffset(T i) {

template <>
size_t DBReader<unsigned int>::indexMemorySize(const DBReader<unsigned int> &idx) {
size_t memSize = // size + aaDbSize
size_t memSize = // size + dataSize
2 * sizeof(size_t)
// maxSeqLen + lastKey + dbtype
+ 3 * sizeof(unsigned int)
@@ -740,7 +747,7 @@ char* DBReader<unsigned int>::serialize(const DBReader<unsigned int> &idx) {
char* p = data;
memcpy(p, &idx.size, sizeof(size_t));
p += sizeof(size_t);
memcpy(p, &idx.aaDbSize, sizeof(size_t));
memcpy(p, &idx.dataSize, sizeof(size_t));
p += sizeof(size_t);
memcpy(p, &idx.lastKey, sizeof(unsigned int));
p += sizeof(unsigned int);
@@ -760,7 +767,7 @@ DBReader<unsigned int> *DBReader<unsigned int>::unserialize(const char* data, in
const char* p = data;
size_t size = *((size_t*)p);
p += sizeof(size_t);
size_t aaDbSize = *((size_t*)p);
size_t dataSize = *((size_t*)p);
p += sizeof(size_t);
unsigned int lastKey = *((unsigned int*)p);
p += sizeof(unsigned int);
@@ -772,7 +779,7 @@ DBReader<unsigned int> *DBReader<unsigned int>::unserialize(const char* data, in
p += size * sizeof(DBReader<unsigned int>::Index);
unsigned int *seqLens = (unsigned int *)p;

return new DBReader<unsigned int>(idx, seqLens, size, aaDbSize, lastKey, dbType, maxSeqLen, threads);
return new DBReader<unsigned int>(idx, seqLens, size, dataSize, lastKey, dbType, maxSeqLen, threads);
}

template <typename T>
@@ -44,7 +44,9 @@ class DBReader {

const char* getIndexFileName() { return indexFileName; }

size_t getAminoAcidDBSize(){ return aaDbSize; }
size_t getAminoAcidDBSize();

size_t getDataSize() { return dataSize; }

char* getData(size_t id, int thrIdx);

@@ -267,8 +269,8 @@ class DBReader {
std::vector<std::string> dataFileNames;


// amino acid size
size_t aaDbSize;
// summed up size of all entries
size_t dataSize;
// Last Key in Index
T lastKey;
// max seqLen
@@ -712,7 +712,7 @@ bool Prefiltering::runSplit(DBReader<unsigned int>* qdbr, const std::string &res

// create index table based on split parameter
if (splitMode == Parameters::TARGET_DB_SPLIT) {
Util::decomposeDomainByAminoAcid(tdbr->getAminoAcidDBSize(), tdbr->getSeqLens(), tdbr->getSize(),
Util::decomposeDomainByAminoAcid(tdbr->getDataSize(), tdbr->getSeqLens(), tdbr->getSize(),
split, splitCount, &dbFrom, &dbSize);
if (dbSize == 0) {
return false;
@@ -737,7 +737,7 @@ bool Prefiltering::runSplit(DBReader<unsigned int>* qdbr, const std::string &res

getIndexTable(split, dbFrom, dbSize);
} else if (splitMode == Parameters::QUERY_DB_SPLIT) {
Util::decomposeDomainByAminoAcid(qdbr->getAminoAcidDBSize(), qdbr->getSeqLens(), qdbr->getSize(),
Util::decomposeDomainByAminoAcid(qdbr->getDataSize(), qdbr->getSeqLens(), qdbr->getSize(),
split, splitCount, &queryFrom, &querySize);
if (querySize == 0) {
return false;
@@ -322,7 +322,7 @@ int doExtract(Parameters &par, const unsigned int mpiRank, const unsigned int mp

size_t dbFrom = 0;
size_t dbSize = 0;
Util::decomposeDomainByAminoAcid(reader.getAminoAcidDBSize(), reader.getSeqLens(), reader.getSize(),
Util::decomposeDomainByAminoAcid(reader.getDataSize(), reader.getSeqLens(), reader.getSize(),
mpiRank, mpiNumProc, &dbFrom, &dbSize);
std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(par.db3, par.db3Index, mpiRank);

@@ -45,7 +45,7 @@ int extractframes(int argc, const char **argv, const Command& command) {
#endif
size_t querySize = 0;
size_t queryFrom = 0;
Util::decomposeDomainByAminoAcid(reader.getAminoAcidDBSize(), reader.getSeqLens(), reader.getSize(),
Util::decomposeDomainByAminoAcid(reader.getDataSize(), reader.getSeqLens(), reader.getSize(),
thread_idx, par.threads, &queryFrom, &querySize);
if (querySize == 0) {
queryFrom = 0;
@@ -51,7 +51,7 @@ int extractorfs(int argc, const char **argv, const Command& command) {
#endif
size_t querySize = 0;
size_t queryFrom = 0;
Util::decomposeDomainByAminoAcid(reader.getAminoAcidDBSize(), reader.getSeqLens(), reader.getSize(),
Util::decomposeDomainByAminoAcid(reader.getDataSize(), reader.getSeqLens(), reader.getSize(),
thread_idx, par.threads, &queryFrom, &querySize);
if (querySize == 0) {
queryFrom = 0;
@@ -374,7 +374,7 @@ int result2msa(Parameters &par, const unsigned int mpiRank, const unsigned int m

size_t dbFrom = 0;
size_t dbSize = 0;
Util::decomposeDomainByAminoAcid(qDbr->getAminoAcidDBSize(), qDbr->getSeqLens(), qDbr->getSize(),
Util::decomposeDomainByAminoAcid(qDbr->getDataSize(), qDbr->getSeqLens(), qDbr->getSize(),
mpiRank, mpiNumProc, &dbFrom, &dbSize);
qDbr->close();
delete qDbr;
@@ -293,7 +293,7 @@ int computeProfileProfile(Parameters &par,const unsigned int mpiRank, const unsi

size_t dbFrom = 0;
size_t dbSize = 0;
Util::decomposeDomainByAminoAcid(qDbr->getAminoAcidDBSize(), qDbr->getSeqLens(), qDbr->getSize(),
Util::decomposeDomainByAminoAcid(qDbr->getDataSize(), qDbr->getSeqLens(), qDbr->getSize(),
mpiRank, mpiNumProc, &dbFrom, &dbSize);
qDbr->close();
delete qDbr;
@@ -34,10 +34,8 @@ int splitdb(int argc, const char **argv, const Command& command) {

size_t startIndex = 0;
size_t domainSize = 0;

if(par.splitAA) {
Util::decomposeDomainByAminoAcid(dbr.getAminoAcidDBSize(), sizes, size, split, par.split, &startIndex,
&domainSize);
if (par.splitAA) {
Util::decomposeDomainByAminoAcid(dbr.getDataSize(), sizes, size, split, par.split, &startIndex, &domainSize);
} else {
Util::decomposeDomain(size, split, par.split, &startIndex, &domainSize);
}
@@ -44,7 +44,7 @@ int splitsequence(int argc, const char **argv, const Command& command) {
#endif
size_t querySize = 0;
size_t queryFrom = 0;
Util::decomposeDomainByAminoAcid(reader.getAminoAcidDBSize(), reader.getSeqLens(), reader.getSize(),
Util::decomposeDomainByAminoAcid(reader.getDataSize(), reader.getSeqLens(), reader.getSize(),
thread_idx, par.threads, &queryFrom, &querySize);
if (querySize == 0) {
queryFrom = 0;
@@ -108,7 +108,7 @@ int doSummarize(Parameters &par, const unsigned int mpiRank, const unsigned int

size_t dbFrom = 0;
size_t dbSize = 0;
Util::decomposeDomainByAminoAcid(reader.getAminoAcidDBSize(), reader.getSeqLens(), reader.getSize(),
Util::decomposeDomainByAminoAcid(reader.getDataSize(), reader.getSeqLens(), reader.getSize(),
mpiRank, mpiNumProc, &dbFrom, &dbSize);
std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(par.db2, par.db2Index, mpiRank);

@@ -184,7 +184,7 @@ int doAnnotate(Parameters &par, const unsigned int mpiRank, const unsigned int m

size_t dbFrom = 0;
size_t dbSize = 0;
Util::decomposeDomainByAminoAcid(reader.getAminoAcidDBSize(), reader.getSeqLens(), reader.getSize(),
Util::decomposeDomainByAminoAcid(reader.getDataSize(), reader.getSeqLens(), reader.getSize(),
mpiRank, mpiNumProc, &dbFrom, &dbSize);
std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(par.db3, par.db3Index, mpiRank);

0 comments on commit 0e1e001

Please sign in to comment.
You can’t perform that action at this time.