Skip to content
Permalink
Browse files

Merge branch master

  • Loading branch information...
RuoshiZhang committed Sep 3, 2019
2 parents d7a7edb + 2f8093d commit b984c952144635c899f596ae42f4995d6bb321ed
Showing with 651 additions and 469 deletions.
  1. +1 −1 azure-pipelines.yml
  2. +24 −9 cmake/FindASan.cmake
  3. +15 −4 data/searchslicedtargetprofile.sh
  4. +15 −10 src/alignment/Alignment.cpp
  5. +2 −2 src/alignment/CompressedA3M.cpp
  6. +1 −1 src/alignment/MultipleAlignment.cpp
  7. +3 −4 src/alignment/rescorediagonal.cpp
  8. +2 −2 src/clustering/ClusteringAlgorithms.cpp
  9. +4 −4 src/commons/DBConcat.cpp
  10. +58 −76 src/commons/DBReader.cpp
  11. +41 −12 src/commons/DBReader.h
  12. +13 −13 src/commons/DBWriter.cpp
  13. +2 −2 src/commons/DBWriter.h
  14. +2 −4 src/commons/Orf.cpp
  15. +3 −0 src/commons/Parameters.cpp
  16. +5 −1 src/commons/Parameters.h
  17. +22 −21 src/commons/Sequence.cpp
  18. +5 −5 src/commons/Sequence.h
  19. +0 −56 src/commons/Util.cpp
  20. +1 −3 src/commons/Util.h
  21. +2 −2 src/linclust/kmermatcher.cpp
  22. +220 −0 src/multihit/resultsbyset.cpp
  23. +7 −15 src/prefiltering/IndexBuilder.cpp
  24. +12 −11 src/prefiltering/Prefiltering.cpp
  25. +3 −3 src/prefiltering/PrefilteringIndexReader.cpp
  26. +6 −4 src/prefiltering/ungappedprefilter.cpp
  27. +1 −1 src/taxonomy/addtaxonomy.cpp
  28. +1 −1 src/taxonomy/filtertaxdb.cpp
  29. +1 −1 src/taxonomy/lca.cpp
  30. +3 −4 src/test/TestAlignment.cpp
  31. +2 −2 src/test/TestAlignmentPerformance.cpp
  32. +3 −4 src/test/TestAlignmentTraceback.cpp
  33. +1 −1 src/test/TestBestAlphabet.cpp
  34. +1 −1 src/test/TestCompositionBias.cpp
  35. +5 −5 src/test/TestDBReader.cpp
  36. +2 −2 src/test/TestDBReaderIndexSerialization.cpp
  37. +1 −1 src/test/TestDBReaderZstd.cpp
  38. +7 −7 src/test/TestDiagonalScoring.cpp
  39. +3 −3 src/test/TestDiagonalScoringPerformance.cpp
  40. +1 −1 src/test/TestKmerGenerator.cpp
  41. +1 −1 src/test/TestKmerNucl.cpp
  42. +2 −2 src/test/TestKmerScore.cpp
  43. +2 −2 src/test/TestKsw2.cpp
  44. +5 −5 src/test/TestMultipleAlignment.cpp
  45. +2 −2 src/test/TestProfileAlignment.cpp
  46. +4 −4 src/test/TestSequenceIndex.cpp
  47. +1 −1 src/test/TestTanTan.cpp
  48. +9 −9 src/test/TestUtil.cpp
  49. +2 −2 src/util/alignall.cpp
  50. +2 −2 src/util/alignbykmer.cpp
  51. +2 −2 src/util/apply.cpp
  52. +3 −3 src/util/clusthash.cpp
  53. +1 −2 src/util/compress.cpp
  54. +7 −7 src/util/convertalignments.cpp
  55. +1 −2 src/util/convertca3m.cpp
  56. +2 −3 src/util/convertprofiledb.cpp
  57. +2 −2 src/util/countkmer.cpp
  58. +1 −2 src/util/createseqfiledb.cpp
  59. +2 −2 src/util/createsubdb.cpp
  60. +6 −6 src/util/createtsv.cpp
  61. +5 −4 src/util/expandaln.cpp
  62. +5 −6 src/util/extractdomains.cpp
  63. +2 −3 src/util/extractframes.cpp
  64. +3 −6 src/util/extractorfs.cpp
  65. +3 −3 src/util/filterdb.cpp
  66. +1 −1 src/util/gff2db.cpp
  67. +2 −6 src/util/maskbygff.cpp
  68. +2 −1 src/util/masksequence.cpp
  69. +3 −4 src/util/msa2profile.cpp
  70. +3 −3 src/util/offsetalignment.cpp
  71. +1 −1 src/util/profile2cs.cpp
  72. +2 −1 src/util/profile2pssm.cpp
  73. +1 −1 src/util/profile2seq.cpp
  74. +2 −2 src/util/proteinaln2nucl.cpp
  75. +8 −12 src/util/result2msa.cpp
  76. +3 −7 src/util/result2pp.cpp
  77. +8 −15 src/util/result2profile.cpp
  78. +1 −1 src/util/result2repseq.cpp
  79. +3 −4 src/util/reverseseq.cpp
  80. +4 −6 src/util/splitdb.cpp
  81. +26 −16 src/util/splitsequence.cpp
  82. +1 −1 src/util/summarizeresult.cpp
  83. +2 −3 src/util/summarizetabs.cpp
  84. +1 −1 src/util/translateaa.cpp
  85. +1 −1 src/util/translatenucs.cpp
@@ -80,7 +80,7 @@ jobs:
SIMD: 'AVX2'
STATIC: 0
MPI: 0
BUILD_TYPE: ASan
BUILD_TYPE: ASanOpt

steps:
- checkout: self
@@ -47,19 +47,12 @@ else(NOT ADDRESS_SANITIZER_FLAG)
set(HAVE_ADDRESS_SANITIZER FALSE)
endif()

check_cxx_compiler_flag("-Og" HAVE_OPTIMIZE_DEBUG)
if(HAVE_OPTIMIZE_DEBUG)
set(OPTIMIZE_DEBUG_FLAG "-Og")
else()
set(OPTIMIZE_DEBUG_FLAG "-O0")
endif()

set(HAVE_ADDRESS_SANITIZER TRUE)

set(CMAKE_C_FLAGS_ASAN "${OPTIMIZE_DEBUG_FLAG} -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
set(CMAKE_C_FLAGS_ASAN "-O0 -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
CACHE STRING "Flags used by the C compiler during ASan builds."
FORCE)
set(CMAKE_CXX_FLAGS_ASAN "${OPTIMIZE_DEBUG_FLAG} -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
set(CMAKE_CXX_FLAGS_ASAN "-O0 -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
CACHE STRING "Flags used by the C++ compiler during ASan builds."
FORCE)
set(CMAKE_EXE_LINKER_FLAGS_ASAN "${ADDRESS_SANITIZER_FLAG}"
@@ -73,3 +66,25 @@ mark_as_advanced(CMAKE_C_FLAGS_ASAN
CMAKE_CXX_FLAGS_ASAN
CMAKE_EXE_LINKER_FLAGS_ASAN
CMAKE_SHARED_LINKER_FLAGS_ASAN)


check_cxx_compiler_flag("-Og" HAVE_OPTIMIZE_DEBUG)
if(HAVE_OPTIMIZE_DEBUG)
set(CMAKE_C_FLAGS_ASANOPT "-Og -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
CACHE STRING "Flags used by the C compiler during ASan Optimized builds."
FORCE)
set(CMAKE_CXX_FLAGS_ASANOPT "-Og -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
CACHE STRING "Flags used by the C++ compiler during ASan Optimized builds."
FORCE)
set(CMAKE_EXE_LINKER_FLAGS_ASANOPT "${ADDRESS_SANITIZER_FLAG}"
CACHE STRING "Flags used for linking binaries during ASan Optimized builds."
FORCE)
set(CMAKE_SHARED_LINKER_FLAGS_ASANOPT "${ADDRESS_SANITIZER_FLAG}"
CACHE STRING "Flags used by the shared libraries linker during ASan Optimized builds."
FORCE)

mark_as_advanced(CMAKE_C_FLAGS_ASANOPT
CMAKE_CXX_FLAGS_ASANOPT
CMAKE_EXE_LINKER_FLAGS_ASANOPT
CMAKE_SHARED_LINKER_FLAGS_ASANOPT)
endif()
@@ -60,10 +60,12 @@ FIRST_INDEX_LINE=1
NUM_PROFS_IN_STEP=1
NUM_PREF_RESULTS_IN_ALL_PREV_STEPS=0

STEP=0

while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
if [ -f "${TMP_PATH}/aln.checkpoint" ]; then
if [ -f "${TMP_PATH}/aln_${STEP}.checkpoint" ]; then
# restore values from previous run, in case it was aborted
read -r FIRST_INDEX_LINE NUM_PREF_RESULTS_IN_ALL_PREV_STEPS < "${TMP_PATH}/aln.checkpoint"
read -r FIRST_INDEX_LINE NUM_PREF_RESULTS_IN_ALL_PREV_STEPS < "${TMP_PATH}/aln_${STEP}.checkpoint"
fi

# predict NUM_SEQS_THAT_SATURATE as the average number of prefilter results per profile in previous steps
@@ -159,9 +161,11 @@ while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
|| fail "mvdb died"
fi

STEP="$((STEP+1))"
# update for the next step
rm -f "${TMP_PATH}/pref.done" "${TMP_PATH}/aln.done" "${TMP_PATH}/aln_swap.done"
printf "%d\\t%s\\n" "${FIRST_INDEX_LINE}" "${NUM_PREF_RESULTS_IN_ALL_PREV_STEPS}" > "${TMP_PATH}/aln.checkpoint"
printf "%d\\t%s\\n" "${FIRST_INDEX_LINE}" "${NUM_PREF_RESULTS_IN_ALL_PREV_STEPS}" > "${TMP_PATH}/aln_${STEP}.checkpoint"

done

# keep only the top max-seqs hits according to the default alignment sorting criteria
@@ -176,6 +180,13 @@ if [ -n "$REMOVE_TMP" ]; then
"$MMSEQS" rmdb "${TMP_PATH}/aln_merged" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${PROFILEDB}" ${VERBOSITY_PAR}
rm -f "${TMP_PATH}/aln.checkpoint" "${PROFILEDB}.meta"
CURR_STEP=0
while [ "${CURR_STEP}" -le "${STEP}" ]; do
if [ -f "${TMP_PATH}/aln_${CURR_STEP}.checkpoint" ]; then
rm -f "${TMP_PATH}/aln_${CURR_STEP}.checkpoint"
fi
CURR_STEP="$((CURR_STEP+1))"
done
rm -f "${PROFILEDB}.meta"
rm -f "$TMP_PATH/searchslicedtargetprofile.sh"
fi
@@ -216,8 +216,7 @@ void Alignment::run(const unsigned int mpiRank, const unsigned int mpiNumProc,

size_t dbFrom = 0;
size_t dbSize = 0;
Util::decomposeDomainByAminoAcid(prefdbr->getDataSize(), prefdbr->getSeqLens(),
prefdbr->getSize(), mpiRank, mpiNumProc, &dbFrom, &dbSize);
prefdbr->decomposeDomainByAminoAcid( mpiRank, mpiNumProc, &dbFrom, &dbSize);

Debug(Debug::INFO) << "Compute split from " << dbFrom << " to " << (dbFrom + dbSize) << "\n";
std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(outDB, outDBIndex, mpiRank);
@@ -300,13 +299,14 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
unsigned int queryDbKey = prefdbr->getDbKey(id);
// only load query data if data != \0
if(*data != '\0'){
char *querySeqData = qdbr->getDataByDBKey(queryDbKey, thread_idx);
size_t qId = qdbr->getId(queryDbKey);
char *querySeqData = qdbr->getData(qId, thread_idx);
if (querySeqData == NULL) {
Debug(Debug::ERROR) << "Query sequence " << queryDbKey
<< " is required in the prefiltering, but is not contained in the query sequence database.\nPlease check your database.\n";
EXIT(EXIT_FAILURE);
}
qSeq.mapSequence(id, queryDbKey, querySeqData);
qSeq.mapSequence(qId, queryDbKey, querySeqData, qdbr->getSeqLen(qId));
matcher.initQuery(&qSeq);
}

@@ -329,13 +329,14 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
isReverse = (reversePrefilterResult) ? (hit.prefScore < 0) ? true : false : false;
diagonal = static_cast<short>(hit.diagonal);
}
size_t dbId = tdbr->getId(dbKey);
char *dbSeqData = tdbr->getData(dbId, thread_idx);

char *dbSeqData = tdbr->getDataByDBKey(dbKey, thread_idx);
if (dbSeqData == NULL) {
Debug(Debug::ERROR) << "Sequence " << dbKey <<" is required in the prefiltering, but is not contained in the target sequence database!\nPlease check your database.\n";
EXIT(EXIT_FAILURE);
}
dbSeq.mapSequence(static_cast<size_t>(-1), dbKey, dbSeqData);
dbSeq.mapSequence(dbId, dbKey, dbSeqData, tdbr->getSeqLen(dbId));
// check if the sequences could pass the coverage threshold
if(Util::canBeCovered(canCovThr, covMode, static_cast<float>(qSeq.L), static_cast<float>(dbSeq.L)) == false) {
rejected++;
@@ -374,12 +375,14 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
if (realign == true) {
realigner->initQuery(&qSeq);
for (size_t result = 0; result < swResults.size(); result++) {
char *dbSeqData = tdbr->getDataByDBKey(swResults[result].dbKey, thread_idx);
size_t dbId = tdbr->getId(swResults[result].dbKey);
char *dbSeqData = tdbr->getData(dbId, thread_idx);
if (dbSeqData == NULL) {
Debug(Debug::ERROR) << "Sequence " << swResults[result].dbKey <<" is required in the prefiltering, but is not contained in the target sequence database!\nPlease check your database.\n";
EXIT(EXIT_FAILURE);
}
dbSeq.mapSequence(static_cast<size_t>(-1), swResults[result].dbKey, dbSeqData);
dbSeq.mapSequence(static_cast<size_t>(-1), swResults[result].dbKey, dbSeqData,
tdbr->getSeqLen(dbId));
const bool isIdentity = (queryDbKey == swResults[result].dbKey && (includeIdentity || sameQTDB)) ? true : false;
Matcher::result_t res = realigner->getSWResult(&dbSeq, INT_MAX, false, covMode, covThr, FLT_MAX,
Matcher::SCORE_COV_SEQID, seqIdMode, isIdentity);
@@ -476,12 +479,14 @@ void Alignment::computeAlternativeAlignment(unsigned int queryDbKey, Sequence &d
if (isIdentity == true) {
continue;
}
char *dbSeqData = tdbr->getDataByDBKey(swResults[i].dbKey, thread_idx);
size_t dbId = tdbr->getId(swResults[i].dbKey);
char *dbSeqData = tdbr->getData(dbId, thread_idx);
if (dbSeqData == NULL) {
Debug(Debug::ERROR) << "Sequence " << swResults[i].dbKey <<" is required in the prefiltering, but is not contained in the target sequence database!\nPlease check your database.\n";
EXIT(EXIT_FAILURE);
}
dbSeq.mapSequence(static_cast<size_t>(-1), swResults[i].dbKey, dbSeqData);

dbSeq.mapSequence(dbId, swResults[i].dbKey, dbSeqData, tdbr->getSeqLen(dbId));
for (int pos = swResults[i].dbStartPos; pos < swResults[i].dbEndPos; ++pos) {
dbSeq.int_sequence[pos] = xIndex;
}
@@ -201,11 +201,11 @@ void CompressedA3M::extractMatcherResults(unsigned int &key, std::vector<Matcher
match.dbKey = sequenceReader.getDbKey(entryIndex);
if (isFirst) {
key = match.dbKey;
qLen = sequenceReader.getSeqLens(entryIndex) - 2;
qLen = sequenceReader.getSeqLen(entryIndex);
match.qLen = match.dbLen = qLen;
} else {
match.qLen = qLen;
match.dbLen = sequenceReader.getSeqLens(entryIndex) - 2;
match.dbLen = sequenceReader.getSeqLen(entryIndex);
}

unsigned short int startPos;
@@ -225,7 +225,7 @@ MultipleAlignment::MSAResult MultipleAlignment::computeMSA(Sequence *centerSeq,
char ** msaSequence = new char *[edgeSeqs.size() + 1];
for(size_t i = 0; i <= edgeSeqs.size(); i++){
// FIXME: in deletion case, the msa could become even larger than maxSeqLen
msaSequence[i] = initX(noDeletionMSA ? centerSeq->L : maxSeqLen);
msaSequence[i] = initX(noDeletionMSA ? centerSeq->L + 1: maxSeqLen + 1);
}

if(edgeSeqs.size() != alignmentResults.size()){
@@ -142,7 +142,7 @@ int doRescorediagonal(Parameters &par,
if(*data != '\0'){
queryId = qdbr->getId(queryKey);
querySeq = qdbr->getData(queryId, thread_idx);
queryLen = std::max(0, static_cast<int>(qdbr->getSeqLens(queryId)) - 2);
queryLen = static_cast<int>(qdbr->getSeqLen(queryId));
if(queryLen > queryRevSeqLen){
delete [] queryRevSeq;
queryRevSeq = new char[queryLen];
@@ -183,7 +183,7 @@ int doRescorediagonal(Parameters &par,
unsigned int targetId = tdbr->getId(results[entryIdx].seqId);
const bool isIdentity = (queryId == targetId && (par.includeIdentity || sameQTDB)) ? true : false;
char *targetSeq = tdbr->getData(targetId, thread_idx);
int dbLen = std::max(0, static_cast<int>(tdbr->getSeqLens(targetId)) - 2);
int dbLen = static_cast<int>(tdbr->getSeqLen(targetId));

float queryLength = static_cast<float>(queryLen);
float targetLength = static_cast<float>(dbLen);
@@ -360,8 +360,7 @@ int rescorediagonal(int argc, const char **argv, const Command &command) {
size_t dbFrom = 0;
size_t dbSize = 0;

Util::decomposeDomainByAminoAcid(resultReader.getDataSize(), resultReader.getSeqLens(), resultReader.getSize(),
MMseqsMPI::rank, MMseqsMPI::numProc, &dbFrom, &dbSize);
resultReader.decomposeDomainByAminoAcid(MMseqsMPI::rank, MMseqsMPI::numProc, &dbFrom, &dbSize);
std::string outfile = par.db4;
std::string outfileIndex = par.db4Index;
std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(outfile, outfileIndex, MMseqsMPI::rank);
@@ -55,7 +55,7 @@ std::unordered_map<unsigned int, std::vector<unsigned int>> ClusteringAlgorithm
#pragma omp for schedule(dynamic, 10)
for (size_t i = 0; i < alnDbr->getSize(); i++) {
const char *data = alnDbr->getData(i, thread_idx);
const size_t dataSize = alnDbr->getSeqLens(i);
const size_t dataSize = alnDbr->getEntryLen(i);
elementCount += Util::countLines(data, dataSize);
}
}
@@ -410,7 +410,7 @@ void ClusteringAlgorithms::readInClusterData(unsigned int **elementLookupTable,
const unsigned int clusterId = seqDbr->getDbKey(i);
const size_t alnId = alnDbr->getId(clusterId);
const char *data = alnDbr->getData(alnId, thread_idx);
const size_t dataSize = alnDbr->getSeqLens(alnId);
const size_t dataSize = alnDbr->getEntryLen(alnId);
elementOffsets[i] = Util::countLines(data, dataSize);
}
}
@@ -74,10 +74,10 @@ void DBConcat::concat(bool write) {

if (write) {
char *data = dbA.getData(id, thread_idx);
size_t dataSizeA = dbA.getSeqLens(id) - 1;
size_t dataSizeA = dbA.getEntryLen(id) - 1;
if(takeLargerEntry == true) {
size_t idB = dbB.getId(newKey);
size_t dataSizeB = dbB.getSeqLens(idB)-1;
size_t dataSizeB = dbB.getEntryLen(idB)-1;
if(dataSizeA >= dataSizeB){
concatWriter->writeData(data, dataSizeA, newKey, thread_idx);
}
@@ -112,10 +112,10 @@ void DBConcat::concat(bool write) {

if (write) {
char *data = dbB.getData(id, thread_idx);
size_t dataSizeB = dbB.getSeqLens(id) - 1;
size_t dataSizeB = dbB.getEntryLen(id) - 1;
if(takeLargerEntry){
size_t idB = dbA.getId(newKey);
size_t dataSizeA = dbA.getSeqLens(idB)-1;
size_t dataSizeA = dbA.getEntryLen(idB)-1;
if(dataSizeB > dataSizeA) {
concatWriter->writeData(data, dataSizeB, newKey, thread_idx);
}

0 comments on commit b984c95

Please sign in to comment.
You can’t perform that action at this time.