Skip to content
Permalink
Browse files

changed skipping logic for sequences with repeated kmers: skip repeat…

…ed kmer when looking for matches instead of skipping whole sequence, replaced skipNRepeatKmer parameter by ignore-multi-kmer parameter with new logic
  • Loading branch information...
AnnSeidel committed Nov 6, 2019
1 parent 84e5309 commit 5ae5503a923ec4338464d09cc1ff4bc3680727da
Showing with 32 additions and 30 deletions.
  1. +4 −4 src/commons/Parameters.cpp
  2. +2 −2 src/commons/Parameters.h
  3. +26 −24 src/linclust/kmermatcher.cpp
@@ -134,7 +134,7 @@ Parameters::Parameters():
PARAM_KMER_PER_SEQ(PARAM_KMER_PER_SEQ_ID, "--kmer-per-seq", "K-mers per sequence", "kmer per sequence", typeid(int), (void*) &kmersPerSequence, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR),
PARAM_KMER_PER_SEQ_SCALE(PARAM_KMER_PER_SEQ_SCALE_ID, "--kmer-per-seq-scale", "scale k-mers per sequence", "scale kmer per sequence based on sequence length as kmer-per-seq val + scale x seqlen", typeid(float), (void*) &kmersPerSequenceScale, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_EXPERT),
PARAM_INCLUDE_ONLY_EXTENDABLE(PARAM_INCLUDE_ONLY_EXTENDABLE_ID, "--include-only-extendable", "Include only extendable", "Include only extendable", typeid(bool), (void*) &includeOnlyExtendable, "", MMseqsParameter::COMMAND_CLUSTLINEAR),
PARAM_SKIP_N_REPEAT_KMER(PARAM_SKIP_N_REPEAT_KMER_ID, "--skip-n-repeat-kmer", "Skip sequence with n repeating k-mers", "Skip sequence with >= n exact repeating k-mers", typeid(int), (void*) &skipNRepeatKmer, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
PARAM_IGNORE_MULTI_KMER(PARAM_IGNORE_MULTI_KMER_ID, "--ignore-multi-kmer", "Skip repeating k-mers", "Skip kmers occuring multiple times (>=2)", typeid(bool), (void*) &ignoreMultiKmer, "", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
PARAM_HASH_SHIFT(PARAM_HASH_SHIFT_ID, "--hash-shift", "Shift hash", "Shift k-mer hash", typeid(int), (void*) &hashShift, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
PARAM_PICK_N_SIMILAR(PARAM_HASH_SHIFT_ID, "--pick-n-sim-kmer", "Add N similar to search", "adds N similar to search", typeid(int), (void*) &pickNbest, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
PARAM_ADJUST_KMER_LEN(PARAM_ADJUST_KMER_LEN_ID, "--adjust-kmer-len", "Adjust k-mer length", "adjust k-mer length based on specificity (only for nucleotides)", typeid(bool), (void*) &adjustKmerLength, "", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
@@ -644,7 +644,7 @@ Parameters::Parameters():
kmerindexdb.push_back(&PARAM_MIN_SEQ_ID);
kmerindexdb.push_back(&PARAM_ADJUST_KMER_LEN);
kmerindexdb.push_back(&PARAM_SPLIT_MEMORY_LIMIT);
kmerindexdb.push_back(&PARAM_SKIP_N_REPEAT_KMER);
kmerindexdb.push_back(&PARAM_IGNORE_MULTI_KMER);
kmerindexdb.push_back(&PARAM_ALPH_SIZE);
kmerindexdb.push_back(&PARAM_MAX_SEQ_LEN);
kmerindexdb.push_back(&PARAM_MASK_RESIDUES);
@@ -796,7 +796,7 @@ Parameters::Parameters():
kmermatcher.push_back(&PARAM_HASH_SHIFT);
kmermatcher.push_back(&PARAM_SPLIT_MEMORY_LIMIT);
kmermatcher.push_back(&PARAM_INCLUDE_ONLY_EXTENDABLE);
kmermatcher.push_back(&PARAM_SKIP_N_REPEAT_KMER);
kmermatcher.push_back(&PARAM_IGNORE_MULTI_KMER);
kmermatcher.push_back(&PARAM_THREADS);
kmermatcher.push_back(&PARAM_COMPRESSED);
kmermatcher.push_back(&PARAM_V);
@@ -1973,7 +1973,7 @@ void Parameters::setDefaults() {
kmersPerSequence = 21;
kmersPerSequenceScale = 0.0;
includeOnlyExtendable = false;
skipNRepeatKmer = 0;
ignoreMultiKmer = false;
hashShift = 5;
pickNbest = 1;
adjustKmerLength = false;
@@ -447,7 +447,7 @@ class Parameters {
int kmersPerSequence;
float kmersPerSequenceScale;
bool includeOnlyExtendable;
int skipNRepeatKmer;
bool ignoreMultiKmer;
int hashShift;
int pickNbest;
int adjustKmerLength;
@@ -720,7 +720,7 @@ class Parameters {
PARAMETER(PARAM_KMER_PER_SEQ)
PARAMETER(PARAM_KMER_PER_SEQ_SCALE)
PARAMETER(PARAM_INCLUDE_ONLY_EXTENDABLE)
PARAMETER(PARAM_SKIP_N_REPEAT_KMER)
PARAMETER(PARAM_IGNORE_MULTI_KMER)
PARAMETER(PARAM_HASH_SHIFT)
PARAMETER(PARAM_PICK_N_SIMILAR)
PARAMETER(PARAM_ADJUST_KMER_LEN)
@@ -307,25 +307,6 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, D
std::sort(kmers, kmers + seqKmerCount, SequencePosition::compareByScore);
}
}
size_t kmerConsidered = std::min(static_cast<int>(chooseTopKmer - 1 + (chooseTopKmerScale * seq.L)), seqKmerCount);
if(par.skipNRepeatKmer > 0 ){
size_t prevKmer = SIZE_T_MAX;
kmers[seqKmerCount].kmer=SIZE_T_MAX;
int repeatKmerCnt = 0;
for (int topKmer = 0; topKmer < seqKmerCount; topKmer++) {
size_t kmerCurr = (kmers + topKmer)->kmer;
size_t kmerNext = (kmers + topKmer + 1)->kmer;
if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
kmerCurr = BIT_SET(kmerCurr, 63);
kmerNext = BIT_SET(kmerNext, 63);
}
repeatKmerCnt += (kmerCurr == kmerNext || kmerCurr == prevKmer);
prevKmer = kmerCurr;
}
if(repeatKmerCnt >= par.skipNRepeatKmer){
kmerConsidered = 0;
}
}

// add k-mer to represent the identity
//TODO, how to handle this in reverse?
@@ -341,12 +322,33 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, D
bufferPos = 0;
}
}
for (size_t topKmer = 0; topKmer < kmerConsidered; topKmer++) {
size_t kmer = (kmers + topKmer)->kmer;
if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
kmer = BIT_SET(kmer, 63);

size_t kmersToConsider = std::min(static_cast<int>(chooseTopKmer - 1 + (chooseTopKmerScale * seq.L)), seqKmerCount);
size_t kmersConsidered = 0;

size_t prevKmer = SIZE_T_MAX;
kmers[seqKmerCount].kmer = SIZE_T_MAX;
for (size_t topKmer = 0; topKmer < static_cast<size_t>(seqKmerCount) &&
kmersConsidered < kmersToConsider; topKmer++) {

size_t kmerCurr = (kmers + topKmer)->kmer;
size_t kmerNext = (kmers + topKmer + 1)->kmer;

if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
kmerCurr = BIT_SET(kmerCurr, 63);
kmerNext = BIT_SET(kmerNext, 63);
}
size_t splitIdx = kmer % splits;

bool repeatedKmer = (kmerCurr == kmerNext || kmerCurr == prevKmer);
prevKmer = kmerCurr;

/* skip repeated kmer */
if (par.ignoreMultiKmer > 0 && repeatedKmer) {
continue;
}

kmersConsidered++;
size_t splitIdx = kmerCurr % splits;
if (splitIdx != split) {
continue;
}

0 comments on commit 5ae5503

Please sign in to comment.
You can’t perform that action at this time.