Skip to content

Commit

Permalink
Merge branch 'hybridassembler' of https://github.com/soedinglab/plass
Browse files Browse the repository at this point in the history
  • Loading branch information
AnnSeidel committed Aug 13, 2020
2 parents 41d03ca + 39c9b0a commit c4aaa98
Show file tree
Hide file tree
Showing 13 changed files with 555 additions and 52 deletions.
3 changes: 1 addition & 2 deletions data/hybridassembledb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -141,10 +141,9 @@ if notExists "${RESULT_NUCL}_only_assembled_h.dbtype"; then
ln -s "${TMP_PATH}/nucl_6f_start_long_h.dbtype" "${RESULT_NUCL}_only_assembled_h.dbtype"
fi


if notExists "${RESULT_NUCL}.merged.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "${INPUT}" "${RESULT_NUCL}_only_assembled" "${RESULT_NUCL}.merged" \
"$MMSEQS" concatdbs "${RESULT_NUCL}_only_assembled" "${INPUT}" "${RESULT_NUCL}.merged" \
|| fail "Concat hybridassemblies and reads died"
fi

Expand Down
2 changes: 1 addition & 1 deletion data/nuclassembledb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ while [ $STEP -lt $NUM_IT ]; do
# 3. Assemble
if notExists "${TMP_PATH}/assembly_${STEP}.done"; then
# shellcheck disable=SC2086
"$MMSEQS" assembleresults "$INPUT" "${TMP_PATH}/aln_${STEP}" "${TMP_PATH}/assembly_${STEP}" ${ASSEMBLE_RESULT_PAR} \
"$MMSEQS" nuclassembleresults "$INPUT" "${TMP_PATH}/aln_${STEP}" "${TMP_PATH}/assembly_${STEP}" ${ASSEMBLE_RESULT_PAR} \
|| fail "Assembly step died"
touch "${TMP_PATH}/assembly_${STEP}.done"
deleteIncremental "$PREV_ASSEMBLY"
Expand Down
1 change: 1 addition & 0 deletions src/LocalCommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ extern int nuclassembledb(int argc, const char** argv, const Command &command);
extern int hybridassembledb(int argc, const char** argv, const Command &command);
extern int assembleresult(int argc, const char** argv, const Command &command);
extern int hybridassembleresults(int argc, const char** argv, const Command &command);
extern int nuclassembleresult(int argc, const char** argv, const Command &command);
extern int filternoncoding(int argc, const char** argv, const Command &command);
extern int mergereads(int argc, const char** argv, const Command &command);
extern int findassemblystart(int argc, const char** argv, const Command &command);
Expand Down
1 change: 1 addition & 0 deletions src/assembler/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(assembler_source_files
assembler/assembleresult.cpp
assembler/nuclassembleresult.cpp
assembler/hybridassembleresult.cpp
assembler/findassemblystart.cpp
assembler/filternoncoding.cpp
Expand Down
135 changes: 94 additions & 41 deletions src/assembler/cyclecheck.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
/*
* Written by Annika Seidel <annika.seidel@mpibpc.mpg.de>
* detect circular fragments
*
* constraint: fragments contain redundant parts at most 3 times
*/

#include "DBReader.h"
Expand Down Expand Up @@ -72,13 +74,21 @@ int cyclecheck(int argc, const char **argv, const Command& command) {
#ifdef OPENMP
thread_idx = (unsigned int) omp_get_thread_num();
#endif
/* 1. split sequence in 3 parts and extract kmers from each part (frontKmers, middleKmers and backKmers)
* 2. find kmermatches between 1 third and 2 third, 1 third and 3 third, 2 third and 3 third
* 3. sum up kmermatches for diagonalbands
* 4. find longest diagonal (= smallest diag index) which fullfill threeshold
*/

Indexer indexer(subMat->alphabetSize - 1, kmerSize);
Sequence seq(par.maxSeqLen, seqType, subMat, kmerSize, false, false);
kmerSeqPos *frontKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 2 + 1];
kmerSeqPos *frontKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 3 + 1];
Util::checkAllocation(frontKmers, "Can not allocate memory");
kmerSeqPos *backKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 2 + 1];
kmerSeqPos *middleKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 3 + 1];
Util::checkAllocation(middleKmers, "Can not allocate memory");
kmerSeqPos *backKmers = new(std::nothrow) kmerSeqPos[par.maxSeqLen / 3 + 1];
Util::checkAllocation(backKmers, "Can not allocate memory");
unsigned int *diagHits = new unsigned int[par.maxSeqLen / 2 + 1];
unsigned int *diagHits = new unsigned int[(par.maxSeqLen / 3) * 2 + 1];

#pragma omp for schedule(dynamic, 100)
for (size_t id = 0; id < seqDbr->getSize(); id++) {
Expand All @@ -98,78 +108,120 @@ int cyclecheck(int argc, const char **argv, const Command& command) {
//TODO: try spaced kmers?
//TODO: limit the number of kmers in the first half of the sequence? only first 15%?

/* extract front and back kmers */
unsigned int frontKmersCount = 0;
while (seq.hasNextKmer() && frontKmersCount < seqLen / 2 + 1) {

const unsigned char *kmer = seq.nextKmer();

uint64_t kmerIdx = indexer.int2index(kmer, 0, kmerSize);
(frontKmers + frontKmersCount)->kmer = kmerIdx;
(frontKmers + frontKmersCount)->pos = seq.getCurrentPosition();

frontKmersCount++;
}

unsigned int backKmersCount = 0;
/* extract kmers */
unsigned int frontKmersCount = 0, middleKmersCount = 0, backKmersCount = 0;
unsigned int thirdSeqLen = seqLen / 3;
while (seq.hasNextKmer()) {

unsigned int pos = seq.getCurrentPosition();
const unsigned char *kmer = seq.nextKmer();

uint64_t kmerIdx = indexer.int2index(kmer, 0, kmerSize);
(backKmers + backKmersCount)->kmer = kmerIdx;
(backKmers + backKmersCount)->pos = seq.getCurrentPosition();

backKmersCount++;
if (pos < thirdSeqLen + 1) {
(frontKmers + frontKmersCount)->kmer = kmerIdx;
(frontKmers + frontKmersCount)->pos = seq.getCurrentPosition();
frontKmersCount++;
}
else if (pos < 2 * thirdSeqLen + 1) {
(middleKmers + middleKmersCount)->kmer = kmerIdx;
(middleKmers + middleKmersCount)->pos = seq.getCurrentPosition();
middleKmersCount++;
}
else // pos > 2* thirdSeqLen
{
(backKmers + backKmersCount)->kmer = kmerIdx;
(backKmers + backKmersCount)->pos = seq.getCurrentPosition();
backKmersCount++;
}
}

std::sort(frontKmers, frontKmers + frontKmersCount, kmerSeqPos::compareByKmer);
std::sort(middleKmers, middleKmers + middleKmersCount, kmerSeqPos::compareByKmer);
std::sort(backKmers, backKmers + backKmersCount, kmerSeqPos::compareByKmer);

/* calculate front-back-kmermatches */
/* calculate front-back-kmermatches and front-middle-kmermatches */
unsigned int kmermatches = 0;
std::fill(diagHits, diagHits + seqLen / 2 + 1, 0);
std::fill(diagHits, diagHits + 2*thirdSeqLen + 1, 0);

unsigned int idx = 0;
unsigned int jdx = 0;
while (idx < frontKmersCount && jdx < backKmersCount) {
unsigned int kdx = 0;

if (frontKmers[idx].kmer < backKmers[jdx].kmer)
while(idx < frontKmersCount && (jdx < backKmersCount || kdx < middleKmersCount) ) {

size_t kmerIdx = frontKmers[idx].kmer;
unsigned int pos = frontKmers[idx].pos;

while (jdx < backKmersCount && backKmers[jdx].kmer < kmerIdx) {
jdx++;
}
while (kdx < middleKmersCount && middleKmers[kdx].kmer < kmerIdx) {
kdx++;
}

while (jdx < backKmersCount && kmerIdx == backKmers[jdx].kmer) {
int diag = backKmers[jdx].pos - pos;
if (diag >= static_cast<int>(seqLen / 3)) {
diagHits[diag - seqLen / 3]++;
kmermatches++;
}
jdx++;
}

while (kdx < middleKmersCount && kmerIdx == middleKmers[kdx].kmer) {
int diag = middleKmers[kdx].pos - pos;
if (diag >= static_cast<int>(seqLen / 3)) {
diagHits[diag - seqLen / 3]++;
kmermatches++;
}
kdx++;
}

idx++;
while (idx < frontKmersCount && kmerIdx == frontKmers[idx].kmer) {
idx++;
else if (frontKmers[idx].kmer > backKmers[jdx].kmer)
}
}

/* calculate middle-back-kmermatches */
jdx = 0, kdx = 0;
while (kdx < middleKmersCount && jdx < backKmersCount) {

if (middleKmers[kdx].kmer < backKmers[jdx].kmer)
kdx++;
else if (middleKmers[kdx].kmer > backKmers[jdx].kmer)
jdx++;
else {
size_t kmerIdx = frontKmers[idx].kmer;
unsigned int pos = frontKmers[idx].pos;
size_t kmerIdx = middleKmers[kdx].kmer;
unsigned int pos = middleKmers[kdx].pos;
while (jdx < backKmersCount && kmerIdx == backKmers[jdx].kmer) {

int diag = backKmers[jdx].pos - pos;
if (diag >= static_cast<int>(seqLen / 2)) {
if (diag >= static_cast<int>(seqLen / 3)) {
//diag = abs(diag);
diagHits[diag - seqLen / 2]++;
diagHits[diag - seqLen / 3]++;
kmermatches++;
}
jdx++;
}
while (idx < frontKmersCount && kmerIdx == frontKmers[idx].kmer) {
idx++;
while (kdx < middleKmersCount && kmerIdx == middleKmers[kdx].kmer) {
kdx++;
}
}

}

/* calculate maximal hit rate on diagonal bands */
int splitDiagonal = -1;
float maxDiagbandHitRate = 0.0;
/* calculate hit rate on diagonal bands */
unsigned int splitDiagonal = 0;

if (kmermatches > 0) {
for (unsigned int d = 0; d < seqLen / 2; d++) {
for (unsigned int d = 0; d < 2 * thirdSeqLen; d++) {
if (diagHits[d] != 0) {
unsigned int diag = d + seqLen / 2;
unsigned int diag = d + thirdSeqLen;
unsigned int diaglen = seqLen - diag;
unsigned int gapwindow = diaglen * 0.01;
unsigned int lower = std::max(0, static_cast<int>(d - gapwindow));
unsigned int upper = std::min(d + gapwindow, seqLen / 2);
unsigned int upper = std::min(d + gapwindow, 2 * thirdSeqLen);
unsigned int diagbandHits = 0;

for (size_t i = lower; i <= upper; i++) {
Expand All @@ -178,16 +230,16 @@ int cyclecheck(int argc, const char **argv, const Command& command) {
}

float diagbandHitRate = static_cast<float>(diagbandHits) / (diaglen - kmerSize + 1);
if (diagbandHitRate > maxDiagbandHitRate) {
maxDiagbandHitRate = diagbandHitRate;
if (diagbandHitRate > HIT_RATE_THRESHOLD) {
splitDiagonal = diag;
break;
}
}

}
}

if (maxDiagbandHitRate >= HIT_RATE_THRESHOLD) {
if (splitDiagonal != 0) {

unsigned int len = seqDbr->getEntryLen(id)-1;
std::string seq;
Expand All @@ -203,6 +255,7 @@ int cyclecheck(int argc, const char **argv, const Command& command) {
}
delete[] diagHits;
delete[] frontKmers;
delete[] middleKmers;
delete[] backKmers;
}

Expand Down
38 changes: 37 additions & 1 deletion src/assembler/hybridassembleresult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

class CompareResultBySeqId {
public:
bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) {
/*bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) {
if(r1.seqId < r2.seqId )
return true;
if(r2.seqId < r1.seqId )
Expand All @@ -37,6 +37,40 @@ class CompareResultBySeqId {
return false;
return false;
}*/
bool operator() (const Matcher::result_t & r1,const Matcher::result_t & r2) {
unsigned int mm_count1 = (1 - r1.seqId) * r1.alnLength + 0.5;
unsigned int mm_count2 = (1 - r2.seqId) * r2.alnLength + 0.5;

unsigned int alpha1 = mm_count1 + 1;
unsigned int alpha2 = mm_count2 + 1;
unsigned int beta1 = r1.alnLength - mm_count1 + 1;
unsigned int beta2 = r2.alnLength - mm_count2 + 1;

//double c=(std::tgamma(beta1+beta2)*std::tgamma(alpha1+beta2))/(std::tgamma(alpha1+beta1+beta2)*std::tgamma(beta1));
double log_c = (std::lgamma(beta1+beta2)+std::lgamma(alpha1+beta1))-(std::lgamma(alpha1+beta1+beta2)+std::lgamma(beta1));

//double r = 1.0; // r_0 =1
double log_r = 0.0;
double p = 0.0;
for (size_t idx = 0; idx < alpha2; idx++) {

p += exp(log_r + log_c);
//r *= ((alpha1+idx)*(beta2+idx))/((idx+1)*(idx+alpha1+beta1+beta2));
log_r = log(alpha1+idx)+log(beta2+idx)-(log(idx+1) + log(idx+alpha1+beta1+beta2)) + log_r;
}
//p *= c;

if (p < 0.45)
return true;
if (p > 0.55)
return false;
if (r1.dbLen - r1.alnLength < r2.dbLen - r2.alnLength)
return true;
if (r1.dbLen - r1.alnLength > r2.dbLen - r2.alnLength)
return false;

return true;
}
};

Expand Down Expand Up @@ -122,6 +156,8 @@ int dohybridassembleresult(LocalParameters &par) {
bool queryCouldBeExtendedLeft = false;
bool queryCouldBeExtendedRight = false;
for (size_t alnIdx = 0; alnIdx < nuclAlignments.size(); alnIdx++) {
if(nuclAlignments[alnIdx].seqId < par.seqIdThr)
continue;
alnQueue.push(nuclAlignments[alnIdx]);
if (nuclAlignments.size() > 1) {
size_t id = nuclSequenceDbr->getId(nuclAlignments[alnIdx].dbKey);
Expand Down

0 comments on commit c4aaa98

Please sign in to comment.