diff --git a/data/hybridassembledb.sh b/data/hybridassembledb.sh index 5ce6ae0a..4aaaa7df 100644 --- a/data/hybridassembledb.sh +++ b/data/hybridassembledb.sh @@ -141,10 +141,9 @@ if notExists "${RESULT_NUCL}_only_assembled_h.dbtype"; then ln -s "${TMP_PATH}/nucl_6f_start_long_h.dbtype" "${RESULT_NUCL}_only_assembled_h.dbtype" fi - if notExists "${RESULT_NUCL}.merged.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" concatdbs "${INPUT}" "${RESULT_NUCL}_only_assembled" "${RESULT_NUCL}.merged" \ + "$MMSEQS" concatdbs "${RESULT_NUCL}_only_assembled" "${INPUT}" "${RESULT_NUCL}.merged" \ || fail "Concat hybridassemblies and reads died" fi diff --git a/data/nuclassembledb.sh b/data/nuclassembledb.sh index 56390482..fec4bc7a 100644 --- a/data/nuclassembledb.sh +++ b/data/nuclassembledb.sh @@ -104,7 +104,7 @@ while [ $STEP -lt $NUM_IT ]; do # 3. Assemble if notExists "${TMP_PATH}/assembly_${STEP}.done"; then # shellcheck disable=SC2086 - "$MMSEQS" assembleresults "$INPUT" "${TMP_PATH}/aln_${STEP}" "${TMP_PATH}/assembly_${STEP}" ${ASSEMBLE_RESULT_PAR} \ + "$MMSEQS" nuclassembleresults "$INPUT" "${TMP_PATH}/aln_${STEP}" "${TMP_PATH}/assembly_${STEP}" ${ASSEMBLE_RESULT_PAR} \ || fail "Assembly step died" touch "${TMP_PATH}/assembly_${STEP}.done" deleteIncremental "$PREV_ASSEMBLY" diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index ce04c7ee..4ac5b682 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -12,6 +12,7 @@ extern int nuclassembledb(int argc, const char** argv, const Command &command); extern int hybridassembledb(int argc, const char** argv, const Command &command); extern int assembleresult(int argc, const char** argv, const Command &command); extern int hybridassembleresults(int argc, const char** argv, const Command &command); +extern int nuclassembleresult(int argc, const char** argv, const Command &command); extern int filternoncoding(int argc, const char** argv, const Command &command); extern int mergereads(int argc, const char** argv, const Command &command); extern int findassemblystart(int argc, const char** argv, const Command &command); diff --git a/src/assembler/CMakeLists.txt b/src/assembler/CMakeLists.txt index a1e07d08..79a39596 100644 --- a/src/assembler/CMakeLists.txt +++ b/src/assembler/CMakeLists.txt @@ -1,5 +1,6 @@ set(assembler_source_files assembler/assembleresult.cpp + assembler/nuclassembleresult.cpp assembler/hybridassembleresult.cpp assembler/findassemblystart.cpp assembler/filternoncoding.cpp diff --git a/src/assembler/cyclecheck.cpp b/src/assembler/cyclecheck.cpp index 9e096ec2..21488df2 100644 --- a/src/assembler/cyclecheck.cpp +++ b/src/assembler/cyclecheck.cpp @@ -1,6 +1,8 @@ /* * Written by Annika Seidel * detect circular fragments + * + * constraint: fragments contain redundant parts at most 3 times */ #include "DBReader.h" @@ -72,13 +74,21 @@ int cyclecheck(int argc, const char **argv, const Command& command) { #ifdef OPENMP thread_idx = (unsigned int) omp_get_thread_num(); #endif + /* 1. split sequence in 3 parts and extract kmers from each part (frontKmers, middleKmers and backKmers) + * 2. find kmermatches between 1 third and 2 third, 1 third and 3 third, 2 third and 3 third + * 3. sum up kmermatches for diagonalbands + * 4. find longest diagonal (= smallest diag index) which fullfill threeshold + */ + Indexer indexer(subMat->alphabetSize - 1, kmerSize); Sequence seq(par.maxSeqLen, seqType, subMat, kmerSize, false, false); - kmerSeqPos *frontKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 2 + 1]; + kmerSeqPos *frontKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 3 + 1]; Util::checkAllocation(frontKmers, "Can not allocate memory"); - kmerSeqPos *backKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 2 + 1]; + kmerSeqPos *middleKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 3 + 1]; + Util::checkAllocation(middleKmers, "Can not allocate memory"); + kmerSeqPos *backKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 3 + 1]; Util::checkAllocation(backKmers, "Can not allocate memory"); - unsigned int *diagHits = new unsigned int[par.maxSeqLen / 2 + 1]; + unsigned int *diagHits = new unsigned int[(par.maxSeqLen / 3) * 2 + 1]; #pragma omp for schedule(dynamic, 100) for (size_t id = 0; id < seqDbr->getSize(); id++) { @@ -98,78 +108,120 @@ int cyclecheck(int argc, const char **argv, const Command& command) { //TODO: try spaced kmers? //TODO: limit the number of kmers in the first half of the sequence? only first 15%? - /* extract front and back kmers */ - unsigned int frontKmersCount = 0; - while (seq.hasNextKmer() && frontKmersCount < seqLen / 2 + 1) { - - const unsigned char *kmer = seq.nextKmer(); - - uint64_t kmerIdx = indexer.int2index(kmer, 0, kmerSize); - (frontKmers + frontKmersCount)->kmer = kmerIdx; - (frontKmers + frontKmersCount)->pos = seq.getCurrentPosition(); - - frontKmersCount++; - } - - unsigned int backKmersCount = 0; + /* extract kmers */ + unsigned int frontKmersCount = 0, middleKmersCount = 0, backKmersCount = 0; + unsigned int thirdSeqLen = seqLen / 3; while (seq.hasNextKmer()) { + unsigned int pos = seq.getCurrentPosition(); const unsigned char *kmer = seq.nextKmer(); - uint64_t kmerIdx = indexer.int2index(kmer, 0, kmerSize); - (backKmers + backKmersCount)->kmer = kmerIdx; - (backKmers + backKmersCount)->pos = seq.getCurrentPosition(); - backKmersCount++; + if (pos < thirdSeqLen + 1) { + (frontKmers + frontKmersCount)->kmer = kmerIdx; + (frontKmers + frontKmersCount)->pos = seq.getCurrentPosition(); + frontKmersCount++; + } + else if (pos < 2 * thirdSeqLen + 1) { + (middleKmers + middleKmersCount)->kmer = kmerIdx; + (middleKmers + middleKmersCount)->pos = seq.getCurrentPosition(); + middleKmersCount++; + } + else // pos > 2* thirdSeqLen + { + (backKmers + backKmersCount)->kmer = kmerIdx; + (backKmers + backKmersCount)->pos = seq.getCurrentPosition(); + backKmersCount++; + } } std::sort(frontKmers, frontKmers + frontKmersCount, kmerSeqPos::compareByKmer); + std::sort(middleKmers, middleKmers + middleKmersCount, kmerSeqPos::compareByKmer); std::sort(backKmers, backKmers + backKmersCount, kmerSeqPos::compareByKmer); - /* calculate front-back-kmermatches */ + /* calculate front-back-kmermatches and front-middle-kmermatches */ unsigned int kmermatches = 0; - std::fill(diagHits, diagHits + seqLen / 2 + 1, 0); + std::fill(diagHits, diagHits + 2*thirdSeqLen + 1, 0); unsigned int idx = 0; unsigned int jdx = 0; - while (idx < frontKmersCount && jdx < backKmersCount) { + unsigned int kdx = 0; - if (frontKmers[idx].kmer < backKmers[jdx].kmer) + while(idx < frontKmersCount && (jdx < backKmersCount || kdx < middleKmersCount) ) { + + size_t kmerIdx = frontKmers[idx].kmer; + unsigned int pos = frontKmers[idx].pos; + + while (jdx < backKmersCount && backKmers[jdx].kmer < kmerIdx) { + jdx++; + } + while (kdx < middleKmersCount && middleKmers[kdx].kmer < kmerIdx) { + kdx++; + } + + while (jdx < backKmersCount && kmerIdx == backKmers[jdx].kmer) { + int diag = backKmers[jdx].pos - pos; + if (diag >= static_cast(seqLen / 3)) { + diagHits[diag - seqLen / 3]++; + kmermatches++; + } + jdx++; + } + + while (kdx < middleKmersCount && kmerIdx == middleKmers[kdx].kmer) { + int diag = middleKmers[kdx].pos - pos; + if (diag >= static_cast(seqLen / 3)) { + diagHits[diag - seqLen / 3]++; + kmermatches++; + } + kdx++; + } + + idx++; + while (idx < frontKmersCount && kmerIdx == frontKmers[idx].kmer) { idx++; - else if (frontKmers[idx].kmer > backKmers[jdx].kmer) + } + } + + /* calculate middle-back-kmermatches */ + jdx = 0, kdx = 0; + while (kdx < middleKmersCount && jdx < backKmersCount) { + + if (middleKmers[kdx].kmer < backKmers[jdx].kmer) + kdx++; + else if (middleKmers[kdx].kmer > backKmers[jdx].kmer) jdx++; else { - size_t kmerIdx = frontKmers[idx].kmer; - unsigned int pos = frontKmers[idx].pos; + size_t kmerIdx = middleKmers[kdx].kmer; + unsigned int pos = middleKmers[kdx].pos; while (jdx < backKmersCount && kmerIdx == backKmers[jdx].kmer) { int diag = backKmers[jdx].pos - pos; - if (diag >= static_cast(seqLen / 2)) { + if (diag >= static_cast(seqLen / 3)) { //diag = abs(diag); - diagHits[diag - seqLen / 2]++; + diagHits[diag - seqLen / 3]++; kmermatches++; } jdx++; } - while (idx < frontKmersCount && kmerIdx == frontKmers[idx].kmer) { - idx++; + while (kdx < middleKmersCount && kmerIdx == middleKmers[kdx].kmer) { + kdx++; } } } - /* calculate maximal hit rate on diagonal bands */ - int splitDiagonal = -1; - float maxDiagbandHitRate = 0.0; + /* calculate hit rate on diagonal bands */ + unsigned int splitDiagonal = 0; if (kmermatches > 0) { - for (unsigned int d = 0; d < seqLen / 2; d++) { + for (unsigned int d = 0; d < 2 * thirdSeqLen; d++) { if (diagHits[d] != 0) { - unsigned int diag = d + seqLen / 2; + unsigned int diag = d + thirdSeqLen; unsigned int diaglen = seqLen - diag; unsigned int gapwindow = diaglen * 0.01; unsigned int lower = std::max(0, static_cast(d - gapwindow)); - unsigned int upper = std::min(d + gapwindow, seqLen / 2); + unsigned int upper = std::min(d + gapwindow, 2 * thirdSeqLen); unsigned int diagbandHits = 0; for (size_t i = lower; i <= upper; i++) { @@ -178,16 +230,16 @@ int cyclecheck(int argc, const char **argv, const Command& command) { } float diagbandHitRate = static_cast(diagbandHits) / (diaglen - kmerSize + 1); - if (diagbandHitRate > maxDiagbandHitRate) { - maxDiagbandHitRate = diagbandHitRate; + if (diagbandHitRate > HIT_RATE_THRESHOLD) { splitDiagonal = diag; + break; } } } } - if (maxDiagbandHitRate >= HIT_RATE_THRESHOLD) { + if (splitDiagonal != 0) { unsigned int len = seqDbr->getEntryLen(id)-1; std::string seq; @@ -203,6 +255,7 @@ int cyclecheck(int argc, const char **argv, const Command& command) { } delete[] diagHits; delete[] frontKmers; + delete[] middleKmers; delete[] backKmers; } diff --git a/src/assembler/hybridassembleresult.cpp b/src/assembler/hybridassembleresult.cpp index 45f1aa85..86c323b1 100644 --- a/src/assembler/hybridassembleresult.cpp +++ b/src/assembler/hybridassembleresult.cpp @@ -22,7 +22,7 @@ class CompareResultBySeqId { public: - bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) { + /*bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) { if(r1.seqId < r2.seqId ) return true; if(r2.seqId < r1.seqId ) @@ -37,6 +37,40 @@ class CompareResultBySeqId { return false; return false; + }*/ + bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) { + unsigned int mm_count1 = (1 - r1.seqId) * r1.alnLength + 0.5; + unsigned int mm_count2 = (1 - r2.seqId) * r2.alnLength + 0.5; + + unsigned int alpha1 = mm_count1 + 1; + unsigned int alpha2 = mm_count2 + 1; + unsigned int beta1 = r1.alnLength - mm_count1 + 1; + unsigned int beta2 = r2.alnLength - mm_count2 + 1; + + //double c=(std::tgamma(beta1+beta2)*std::tgamma(alpha1+beta2))/(std::tgamma(alpha1+beta1+beta2)*std::tgamma(beta1)); + double log_c = (std::lgamma(beta1+beta2)+std::lgamma(alpha1+beta1))-(std::lgamma(alpha1+beta1+beta2)+std::lgamma(beta1)); + + //double r = 1.0; // r_0 =1 + double log_r = 0.0; + double p = 0.0; + for (size_t idx = 0; idx < alpha2; idx++) { + + p += exp(log_r + log_c); + //r *= ((alpha1+idx)*(beta2+idx))/((idx+1)*(idx+alpha1+beta1+beta2)); + log_r = log(alpha1+idx)+log(beta2+idx)-(log(idx+1) + log(idx+alpha1+beta1+beta2)) + log_r; + } + //p *= c; + + if (p < 0.45) + return true; + if (p > 0.55) + return false; + if (r1.dbLen - r1.alnLength < r2.dbLen - r2.alnLength) + return true; + if (r1.dbLen - r1.alnLength > r2.dbLen - r2.alnLength) + return false; + + return true; } }; @@ -122,6 +156,8 @@ int dohybridassembleresult(LocalParameters &par) { bool queryCouldBeExtendedLeft = false; bool queryCouldBeExtendedRight = false; for (size_t alnIdx = 0; alnIdx < nuclAlignments.size(); alnIdx++) { + if(nuclAlignments[alnIdx].seqId < par.seqIdThr) + continue; alnQueue.push(nuclAlignments[alnIdx]); if (nuclAlignments.size() > 1) { size_t id = nuclSequenceDbr->getId(nuclAlignments[alnIdx].dbKey); diff --git a/src/assembler/nuclassembleresult.cpp b/src/assembler/nuclassembleresult.cpp new file mode 100644 index 00000000..87004d7b --- /dev/null +++ b/src/assembler/nuclassembleresult.cpp @@ -0,0 +1,403 @@ +#include "LocalParameters.h" +#include "DistanceCalculator.h" +#include "Matcher.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "Debug.h" +#include "Util.h" +#include "MathUtil.h" + +#include +#include +#include +#include + +#ifdef OPENMP +#include +#endif + +class CompareNuclResultByScore { +public: + /*bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) { + if(r1.score < r2.score ) + return true; + if(r2.score < r1.score ) + return false; + if(r1.alnLength < r2.alnLength ) + return true; + if(r2.alnLength < r1.alnLength ) + return false; + if(r1.dbKey > r2.dbKey ) + return true; + if(r2.dbKey > r1.dbKey ) + return false; + return false; + }*/ + bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) { + unsigned int mm_count1 = (1 - r1.seqId) * r1.alnLength + 0.5; + unsigned int mm_count2 = (1 - r2.seqId) * r2.alnLength + 0.5; + + unsigned int alpha1 = mm_count1 + 1; + unsigned int alpha2 = mm_count2 + 1; + unsigned int beta1 = r1.alnLength - mm_count1 + 1; + unsigned int beta2 = r2.alnLength - mm_count2 + 1; + + //double c=(std::tgamma(beta1+beta2)*std::tgamma(alpha1+beta2))/(std::tgamma(alpha1+beta1+beta2)*std::tgamma(beta1)); + double log_c = (std::lgamma(beta1+beta2)+std::lgamma(alpha1+beta1))-(std::lgamma(alpha1+beta1+beta2)+std::lgamma(beta1)); + + //double r = 1.0; // r_0 =1 + double log_r = 0.0; + double p = 0.0; + for (size_t idx = 0; idx < alpha2; idx++) { + + p += exp(log_r + log_c); + //r *= ((alpha1+idx)*(beta2+idx))/((idx+1)*(idx+alpha1+beta1+beta2)); + log_r = log(alpha1+idx)+log(beta2+idx)-(log(idx+1) + log(idx+alpha1+beta1+beta2)) + log_r; + } + //p *= c; + + if (p < 0.45) + return true; + if (p > 0.55) + return false; + if (r1.dbLen - r1.alnLength < r2.dbLen - r2.alnLength) + return true; + if (r1.dbLen - r1.alnLength > r2.dbLen - r2.alnLength) + return false; + + return true; + + } +}; + +typedef std::priority_queue , CompareNuclResultByScore> QueueByScoreNucl; +Matcher::result_t selectNuclFragmentToExtend(QueueByScoreNucl &alignments, + unsigned int queryKey) { + // results are ordered by score + while (alignments.empty() == false){ + Matcher::result_t res = alignments.top(); + alignments.pop(); + size_t dbKey = res.dbKey; + const bool notRightStartAndLeftStart = !(res.dbStartPos == 0 && res.qStartPos == 0 ); + const bool rightStart = res.dbStartPos == 0 && (res.dbEndPos != static_cast(res.dbLen)-1); + const bool leftStart = res.qStartPos == 0 && (res.qEndPos != static_cast(res.qLen)-1); + const bool isNotIdentity = (dbKey != queryKey); + + if ((rightStart || leftStart) && notRightStartAndLeftStart && isNotIdentity){ + return res; + } + } + return Matcher::result_t(UINT_MAX,0,0,0,0,0,0,0,0,0,0,0,0,""); +} + +inline char* getNuclRevFragment(const char* fragment, size_t fragLen, NucleotideMatrix *nuclMatrix) +{ + char *fragmentRev = new char[fragLen]; + for (int pos = fragLen - 1; pos > -1; pos--) { + int res = nuclMatrix->aa2num[static_cast(fragment[pos])]; + char revRes = nuclMatrix->num2aa[nuclMatrix->reverseResidue(res)]; + fragmentRev[(fragLen - 1) - pos] = (revRes == 'X')? 'N' : revRes; + } + return fragmentRev; +} + +inline void updateNuclAlignment(Matcher::result_t &tmpAlignment, DistanceCalculator::LocalAlignment &alignment, + const char *querySeq, size_t querySeqLen, const char *tSeq, size_t tSeqLen) { + + int qStartPos, qEndPos, dbStartPos, dbEndPos; + int diag = alignment.diagonal; + int dist = std::max(abs(diag), 0); + + if (diag >= 0) { + qStartPos = alignment.startPos + dist; + qEndPos = alignment.endPos + dist; + dbStartPos = alignment.startPos; + dbEndPos = alignment.endPos; + } else { + qStartPos = alignment.startPos; + qEndPos = alignment.endPos; + dbStartPos = alignment.startPos + dist; + dbEndPos = alignment.endPos + dist; + } + + int idCnt = 0; + for(int i = qStartPos; i < qEndPos; i++){ + idCnt += (querySeq[i] == tSeq[dbStartPos+(i-qStartPos)]) ? 1 : 0; + } + float seqId = static_cast(idCnt) / (static_cast(qEndPos) - static_cast(qStartPos)); + + tmpAlignment.seqId = seqId; + tmpAlignment.qLen = querySeqLen; + tmpAlignment.dbLen = tSeqLen; + + tmpAlignment.alnLength = alignment.diagonalLen; + float scorePerCol = static_cast(alignment.score ) / static_cast(tmpAlignment.alnLength + 0.5); + tmpAlignment.score = static_cast(scorePerCol*100); + + tmpAlignment.qStartPos = qStartPos; + tmpAlignment.qEndPos = qEndPos; + tmpAlignment.dbStartPos = dbStartPos; + tmpAlignment.dbEndPos = dbEndPos; + +} + +int doNuclAssembly(LocalParameters &par) { + DBReader *sequenceDbr = new DBReader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + sequenceDbr->open(DBReader::NOSORT); + + DBReader * alnReader = new DBReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + alnReader->open(DBReader::NOSORT); + + DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, sequenceDbr->getDbtype()); + resultWriter.open(); + + int seqType = sequenceDbr->getDbtype(); + BaseMatrix *subMat; + if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES)) { + subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0); + } else { + subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0); + } + + SubstitutionMatrix::FastMatrix fastMatrix = SubstitutionMatrix::createAsciiSubMat(*subMat); + EvalueComputation evaluer(sequenceDbr->getAminoAcidDBSize(), subMat); + + unsigned char * wasExtended = new unsigned char[sequenceDbr->getSize()]; + std::fill(wasExtended, wasExtended+sequenceDbr->getSize(), 0); + Debug::Progress progress(sequenceDbr->getSize()); +#pragma omp parallel + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); +#endif + + std::vector alignments; + alignments.reserve(300); + bool *useReverse = new bool[sequenceDbr->getSize()]; + std::fill(useReverse, useReverse+sequenceDbr->getSize(), false); +#pragma omp for schedule(dynamic, 100) + for (size_t id = 0; id < sequenceDbr->getSize(); id++) { + progress.updateProgress(); + + unsigned int queryKey = sequenceDbr->getDbKey(id); + char *querySeq = sequenceDbr->getData(id, thread_idx); + unsigned int querySeqLen = sequenceDbr->getSeqLen(id); + std::string query(querySeq, querySeqLen); // no /n/0 + + char *alnData = alnReader->getDataByDBKey(queryKey, thread_idx); + alignments.clear(); + Matcher::readAlignmentResults(alignments, alnData); + + bool queryCouldBeExtended = false; + QueueByScoreNucl alnQueue; + + for (size_t alnIdx = 0; alnIdx < alignments.size(); alnIdx++) { + + int rawScore = static_cast(evaluer.computeRawScoreFromBitScore(alignments[alnIdx].score) + 0.5); + float scorePerCol = static_cast(rawScore) / static_cast(alignments[alnIdx].alnLength + 0.5); + + //float alnLen = static_cast(alignments[alnIdx].alnLength); + //float ids = static_cast(alignments[alnIdx].seqId) * alnLen; + //alignments[alnIdx].seqId = ids / (alnLen + 0.5); + alignments[alnIdx].score = static_cast(scorePerCol*100); + + if (seqType == Parameters::DBTYPE_NUCLEOTIDES) { + if (alignments[alnIdx].qStartPos > alignments[alnIdx].qEndPos) { + useReverse[sequenceDbr->getId(alignments[alnIdx].dbKey)] = true; + + std::swap(alignments[alnIdx].qStartPos, alignments[alnIdx].qEndPos); + unsigned int dbStartPos = alignments[alnIdx].dbStartPos; + alignments[alnIdx].dbStartPos = alignments[alnIdx].dbLen - alignments[alnIdx].dbEndPos - 1; + alignments[alnIdx].dbEndPos= alignments[alnIdx].dbLen - dbStartPos - 1; + + } else { + useReverse[sequenceDbr->getId(alignments[alnIdx].dbKey)] = false; + } + } + + alnQueue.push(alignments[alnIdx]); + if (alignments.size() > 1) + __sync_or_and_fetch(&wasExtended[sequenceDbr->getId(alignments[alnIdx].dbKey)], + static_cast(0x40)); + } + + std::vector tmpAlignments; + tmpAlignments.reserve(alignments.size()); + while (!alnQueue.empty()) { + + unsigned int leftQueryOffset = 0; + unsigned int rightQueryOffset = 0; + tmpAlignments.clear(); + Matcher::result_t besttHitToExtend; + while ((besttHitToExtend = selectNuclFragmentToExtend(alnQueue, queryKey)).dbKey != UINT_MAX) { + + unsigned int targetId = sequenceDbr->getId(besttHitToExtend.dbKey); + if (targetId == UINT_MAX) { + Debug(Debug::ERROR) << "Could not find targetId " << besttHitToExtend.dbKey + << " in database " << sequenceDbr->getDataFileName() << "\n"; + EXIT(EXIT_FAILURE); + } + char *targetSeq = sequenceDbr->getData(targetId, thread_idx); + unsigned int targetSeqLen = sequenceDbr->getSeqLen(targetId) ; + + // check if alignment still make sense (can extend the query) + if (besttHitToExtend.dbStartPos == 0) { + if ((targetSeqLen - (besttHitToExtend.dbEndPos + 1)) <= rightQueryOffset) { + continue; + } + } else if (besttHitToExtend.qStartPos == 0) { + if (besttHitToExtend.dbStartPos <= static_cast(leftQueryOffset)) { + continue; + } + } + __sync_or_and_fetch(&wasExtended[targetId], static_cast(0x10)); + + unsigned int dbStartPos = besttHitToExtend.dbStartPos; + unsigned int dbEndPos = besttHitToExtend.dbEndPos; + unsigned int qStartPos = besttHitToExtend.qStartPos; + unsigned int qEndPos = besttHitToExtend.qEndPos; + if (dbStartPos == 0 && qEndPos == (querySeqLen - 1)) { + //right extension + + if(rightQueryOffset > 0) { + tmpAlignments.push_back(besttHitToExtend); + continue; + } + + unsigned int fragLen = targetSeqLen - (dbEndPos + 1); + std::string fragment; + if (useReverse[targetId]) { + char *cfragment = getNuclRevFragment(targetSeq, fragLen, (NucleotideMatrix *) subMat); + fragment = std::string(cfragment, fragLen); + delete[] cfragment; + } + else + fragment = std::string(targetSeq + dbEndPos + 1, fragLen); + + query += fragment; + rightQueryOffset += fragLen; + //update that dbKey was used in assembly + __sync_or_and_fetch(&wasExtended[targetId], static_cast(0x80)); + + } + else if (qStartPos == 0 && dbEndPos == (targetSeqLen - 1)) { + //left extension + + if(leftQueryOffset > 0) { + tmpAlignments.push_back(besttHitToExtend); + continue; + } + + unsigned int fragLen = dbStartPos; + if (query.size() + fragLen >= par.maxSeqLen) { + Debug(Debug::WARNING) << "Ignore extension because of length limitation for sequence: " \ + << queryKey << ". Max length allowed would be " << par.maxSeqLen << "\n"; + break; + } + + std::string fragment; + if (useReverse[targetId]) { + char *cfragment = getNuclRevFragment(targetSeq + (targetSeqLen - dbStartPos), fragLen, (NucleotideMatrix *) subMat); + fragment = std::string(cfragment, fragLen); + delete[] cfragment; + } + else + fragment = std::string(targetSeq, fragLen); + + query = fragment + query; + leftQueryOffset += fragLen; + //update that dbKey was used in assembly + __sync_or_and_fetch(&wasExtended[targetId], static_cast(0x80)); + } + + } + + if (leftQueryOffset > 0 || rightQueryOffset > 0) + queryCouldBeExtended = true; + + if (!alnQueue.empty()) + break; + + querySeqLen = query.length(); + querySeq = (char *) query.c_str(); + + // update alignments + for(size_t alnIdx = 0; alnIdx < tmpAlignments.size(); alnIdx++) { + + unsigned int tId = sequenceDbr->getId(tmpAlignments[alnIdx].dbKey); + unsigned int tSeqLen = sequenceDbr->getSeqLen(tId); + char *tSeq = sequenceDbr->getData(tId, thread_idx); + if (useReverse[tId]) + tSeq = getNuclRevFragment(tSeq, tSeqLen, (NucleotideMatrix *) subMat); + + int qStartPos = tmpAlignments[alnIdx].qStartPos; + int dbStartPos = tmpAlignments[alnIdx].dbStartPos; + int diag = (qStartPos + leftQueryOffset) - dbStartPos; + + DistanceCalculator::LocalAlignment alignment = DistanceCalculator::ungappedAlignmentByDiagonal( + querySeq, querySeqLen, tSeq, tSeqLen, + diag, fastMatrix.matrix, par.rescoreMode); + + updateNuclAlignment(tmpAlignments[alnIdx], alignment, querySeq, querySeqLen, tSeq, tSeqLen); + + // refill queue + if(tmpAlignments[alnIdx].seqId >= par.seqIdThr) + alnQueue.push(tmpAlignments[alnIdx]); + } + } + + if (queryCouldBeExtended) { + query.push_back('\n'); + __sync_or_and_fetch(&wasExtended[id], static_cast(0x20)); + resultWriter.writeData(query.c_str(), query.size(), queryKey, thread_idx); + } + + } + } // end parallel + +// add sequences that are not yet assembled +#pragma omp parallel for schedule(dynamic, 10000) + for (size_t id = 0; id < sequenceDbr->getSize(); id++) { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); +#endif + //bool couldExtend = (wasExtended[id] & 0x10); + bool isNotContig = !(wasExtended[id] & 0x20); + //bool wasNotUsed = !(wasExtended[id] & 0x40); + //bool wasNotExtended = !(wasExtended[id] & 0x80); + //bool wasUsed = (wasExtended[id] & 0x40); + //if(isNotContig && wasNotExtended ){ + if (isNotContig){ + char *querySeqData = sequenceDbr->getData(id, thread_idx); + resultWriter.writeData(querySeqData, sequenceDbr->getEntryLen(id)-1, sequenceDbr->getDbKey(id), thread_idx); + } + } + + // cleanup + resultWriter.close(true); + alnReader->close(); + delete [] wasExtended; + delete alnReader; + delete [] fastMatrix.matrix; + delete [] fastMatrix.matrixData; + sequenceDbr->close(); + delete sequenceDbr; + Debug(Debug::INFO) << "\nDone.\n"; + + return EXIT_SUCCESS; +} + +int nuclassembleresult(int argc, const char **argv, const Command& command) { + LocalParameters& par = LocalParameters::getLocalInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + + MMseqsMPI::init(argc, argv); + + // never allow deletions + par.allowDeletion = false; + Debug(Debug::INFO) << "Compute assembly.\n"; + return doNuclAssembly(par); +} + diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 527a0c7d..411dc118 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -193,7 +193,7 @@ class LocalParameters : public Parameters { clustSeqIdThr = 0.97; clustCovThr = 0.99; minContigLen = 1000; - chopCycle = false; + chopCycle = true; cycleCheck = true; multiNumIterations = MultiParam(12,20); diff --git a/src/plass.cpp b/src/plass.cpp index ec022362..7eda0c1c 100644 --- a/src/plass.cpp +++ b/src/plass.cpp @@ -75,6 +75,14 @@ std::vector commands = { {"nuclAlnResult", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, {"nuclAssembly", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::nuclDb }, {"aaAssembly", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::aaDb }}}, + {"nuclassembleresults", nuclassembleresult, &localPar.assembleresults, COMMAND_HIDDEN, + "Extending representative sequence to the left and right side using ungapped alignments.", + NULL, + "Annika Seidel >", + " ", + CITATION_PLASS, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::nuclDb }, + {"alnResult", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, + {"reprSeqDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}}, {"findassemblystart", findassemblystart, &localPar.onlythreads, COMMAND_HIDDEN, "Compute consensus based new * stop before M amino acid", NULL, diff --git a/src/workflow/HybridassembleDB.cpp b/src/workflow/HybridassembleDB.cpp index 51cb8413..54fed9d5 100644 --- a/src/workflow/HybridassembleDB.cpp +++ b/src/workflow/HybridassembleDB.cpp @@ -9,9 +9,9 @@ void setHybridAssemblerWorkflowDefaults(LocalParameters *p) { - p->multiNumIterations = MultiParam(12,20); + p->multiNumIterations = MultiParam(5,5); p->multiKmerSize = MultiParam(14,22); - p->multiSeqIdThr = MultiParam(0.97,0.97); + p->multiSeqIdThr = MultiParam(0.97,0.99); p->alphabetSize = MultiParam(13,5); p->orfMinLength = 45; @@ -99,6 +99,7 @@ int hybridassembledb(int argc, const char **argv, const Command &command) { par.filenames.pop_back(); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + cmd.addVariable("REMOVE_INCREMENTAL_TMP", par.deleteFilesInc ? "TRUE" : NULL); cmd.addVariable("RUNNER", par.runner.c_str()); // set values for protein level assembly @@ -138,6 +139,7 @@ int hybridassembledb(int argc, const char **argv, const Command &command) { cmd.addVariable("UNGAPPED_ALN_PAR", par.createParameterString(par.rescorediagonal).c_str()); // # 3. Assembly: Extend by left and right extension + par.seqIdThr = par.multiSeqIdThr.nucleotides; cmd.addVariable("ASSEMBLE_RESULT_PAR", par.createParameterString(par.assembleresults).c_str()); // set mandatory values for nucleotide level assembly step when calling nucleassemble from hybridassemble diff --git a/src/workflow/Hybridassembler.cpp b/src/workflow/Hybridassembler.cpp index e2eb1f81..14b01661 100644 --- a/src/workflow/Hybridassembler.cpp +++ b/src/workflow/Hybridassembler.cpp @@ -16,9 +16,9 @@ namespace hybridassembler { void setEasyHybridAssemblerWorkflowDefaults(LocalParameters *p) { //p->createdbMode = Parameters::SEQUENCE_SPLIT_MODE_SOFT; - p->multiNumIterations = MultiParam(12,20); + p->multiNumIterations = MultiParam(5,5); p->multiKmerSize = MultiParam(14,22); - p->multiSeqIdThr = MultiParam(0.97,0.97); + p->multiSeqIdThr = MultiParam(0.97,0.99); p->alphabetSize = MultiParam(13,5); p->orfMinLength = 45; diff --git a/src/workflow/NuclassembleDB.cpp b/src/workflow/NuclassembleDB.cpp index 352d3aff..b84d56a7 100644 --- a/src/workflow/NuclassembleDB.cpp +++ b/src/workflow/NuclassembleDB.cpp @@ -23,7 +23,7 @@ void setNuclAssemblerWorkflowDefaults(LocalParameters *p) { p->maskMode = 0; p->numIterations = 12; p->rescoreMode = Parameters::RESCORE_MODE_GLOBAL_ALIGNMENT; - p->seqIdThr = 0.97; + p->seqIdThr = 0.99; p->spacedKmer = false; } diff --git a/src/workflow/Nuclassembler.cpp b/src/workflow/Nuclassembler.cpp index 635bb32d..1736422f 100644 --- a/src/workflow/Nuclassembler.cpp +++ b/src/workflow/Nuclassembler.cpp @@ -27,7 +27,7 @@ void setEasyNuclAssemblerWorkflowDefaults(LocalParameters *p) { p->maskMode = 0; p->numIterations = 12; p->rescoreMode = Parameters::RESCORE_MODE_GLOBAL_ALIGNMENT; - p->seqIdThr = 0.97; + p->seqIdThr = 0.99; p->spacedKmer = false; }