Skip to content
Permalink
Browse files

revised and updated hybridassembler parameter

  • Loading branch information
AnnSeidel committed Feb 19, 2020
1 parent 89692a8 commit 8e0b20de5eba39d62c5f035729dac6cd2269bdd2
@@ -39,7 +39,8 @@ class LocalParameters : public Parameters {
int filterProteins;
int deleteFilesInc;
int minContigLen;
float clustThr;
float clustSeqIdThr;
float clustCovThr;
float proteinFilterThreshold;
bool cycleCheck;
bool chopCycle;
@@ -53,7 +54,8 @@ class LocalParameters : public Parameters {
PARAMETER(PARAM_PROTEIN_FILTER_THRESHOLD)
PARAMETER(PARAM_DELETE_TMP_INC)
PARAMETER(PARAM_MIN_CONTIG_LEN)
PARAMETER(PARAM_CLUST_THR)
PARAMETER(PARAM_CLUST_MIN_SEQ_ID_THR)
PARAMETER(PARAM_CLUST_C)
PARAMETER(PARAM_CYCLE_CHECK)
PARAMETER(PARAM_CHOP_CYCLE)
PARAMETER(PARAM_MULTI_NUM_ITERATIONS)
@@ -71,13 +73,14 @@ class LocalParameters : public Parameters {
multiSeqIdThr(FLT_MAX,FLT_MAX),
PARAM_FILTER_PROTEINS(PARAM_FILTER_PROTEINS_ID,"--filter-proteins", "Filter Proteins", "filter proteins by a neural network [0,1]",typeid(int), (void *) &filterProteins, "^[0-1]{1}$"),
PARAM_PROTEIN_FILTER_THRESHOLD(PARAM_PROTEIN_FILTER_THRESHOLD_ID,"--protein-filter-threshold", "Protein Filter Threshold", "filter proteins lower than threshold [0.0,1.0]",typeid(float), (void *) &proteinFilterThreshold, "^0(\\.[0-9]+)?|1(\\.0+)?$"),
PARAM_DELETE_TMP_INC(PARAM_DELETE_TMP_INC_ID,"--delete-tmp-inc", "Delete temporary files incremental", "delete temporary files incremental [0,1]",typeid(int), (void *) &deleteFilesInc, "^[0-1]{1}$"),
PARAM_DELETE_TMP_INC(PARAM_DELETE_TMP_INC_ID,"--delete-tmp-inc", "Delete temporary files incremental", "Delete temporary files incremental [0,1]",typeid(int), (void *) &deleteFilesInc, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
PARAM_MIN_CONTIG_LEN(PARAM_MIN_CONTIG_LEN_ID, "--min-contig-len", "Minimum contig length", "Minimum length of assembled contig to output", typeid(int), (void *) &minContigLen, "^[1-9]{1}[0-9]*$"),
PARAM_CLUST_THR(PARAM_CLUST_THR_ID,"--clust-thr", "Clustering threshold","Threshold to reduce redundancy in assembly by applying the linclust algorithm (clustering threshold) (range 0.0-1.0)",typeid(float), (void *) &clustThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"),
PARAM_CLUST_MIN_SEQ_ID_THR(PARAM_CLUST_MIN_SEQ_ID_THR_ID,"--clust-min-seq-id", "Clustering seq. id. threshold","Seq. id. threshold passed to linclust algorithm to reduce redundancy in assembly (range 0.0-1.0)",typeid(float), (void *) &clustSeqIdThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_CLUST),
PARAM_CLUST_C(PARAM_CLUST_C_ID,"--clust-min-cov", "Clustering coverage threshold","Coverage threshold passed to linclust algorithm to reduce redundancy in assembly (range 0.0-1.0)",typeid(float), (void *) &clustCovThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_CLUST),
PARAM_CYCLE_CHECK(PARAM_CYCLE_CHECK_ID,"--cycle-check", "Check for circular sequences", "Check for circular sequences (avoid infinite extension of circular or long repeated regions) ",typeid(bool), (void *) &cycleCheck, "", MMseqsParameter::COMMAND_MISC | MMseqsParameter::COMMAND_EXPERT),
PARAM_CHOP_CYCLE(PARAM_CHOP_CYCLE_ID,"--chop-cycle", "Chop Cycle", "Remove superfluous part of circular fragments",typeid(bool), (void *) &chopCycle, "", MMseqsParameter::COMMAND_MISC | MMseqsParameter::COMMAND_EXPERT),
PARAM_MULTI_NUM_ITERATIONS(PARAM_MULTI_NUM_ITERATIONS_ID, "--num-iterations", "Number of assembly iteration","Number of assembly iterations performed on protein level,nucleotide level [1, inf]",typeid(MultiParam<int>),(void *) &multiNumIterations, ""),
PARAM_MULTI_K(PARAM_MULTI_K_ID, "-k", "k-mer length", "k-mer length (0: automatically set to optimum)", typeid(MultiParam<int>), (void *) &multiKmerSize, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
PARAM_CHOP_CYCLE(PARAM_CHOP_CYCLE_ID,"--chop-cycle", "Chop Cycle", "Remove superfluous part of circular fragments (see --cycle-check)",typeid(bool), (void *) &chopCycle, "", MMseqsParameter::COMMAND_MISC | MMseqsParameter::COMMAND_EXPERT),
PARAM_MULTI_NUM_ITERATIONS(PARAM_MULTI_NUM_ITERATIONS_ID, "--num-iterations", "Number of assembly iterations","Number of assembly iterations performed on nucleotide level,protein level (range 1-inf)",typeid(MultiParam<int>),(void *) &multiNumIterations, ""),
PARAM_MULTI_K(PARAM_MULTI_K_ID, "-k", "k-mer length", "k-mer length (0: automatically set to optimum)", typeid(MultiParam<int>), (void *) &multiKmerSize, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
PARAM_MULTI_MIN_SEQ_ID(PARAM_MULTI_MIN_SEQ_ID_ID, "--min-seq-id", "Seq. id. threshold", "Overlap sequence identity threshold [0.0, 1.0]", typeid(MultiParam<float>), (void *) &multiSeqIdThr, "", MMseqsParameter::COMMAND_ALIGN),
PARAM_MULTI_MIN_ALN_LEN(PARAM_MULTI_MIN_ALN_LEN_ID, "--min-aln-len", "Min alignment length", "Minimum alignment length (range 0-INT_MAX)", typeid(MultiParam<int>), (void *) &multiAlnLenThr, "", MMseqsParameter::COMMAND_ALIGN)

@@ -111,6 +114,7 @@ class LocalParameters : public Parameters {

//reduceredundancy (subset of clustering parameters which have to be adjusted)
reduceredundancy.push_back(&PARAM_ALPH_SIZE);
reduceredundancy.push_back(&PARAM_CLUSTER_MODE);
reduceredundancy.push_back(&PARAM_K);
reduceredundancy.push_back(&PARAM_KMER_PER_SEQ);
reduceredundancy.push_back(&PARAM_KMER_PER_SEQ_SCALE);
@@ -123,7 +127,7 @@ class LocalParameters : public Parameters {
reduceredundancy.push_back(&PARAM_GAP_OPEN);
reduceredundancy.push_back(&PARAM_GAP_EXTEND);
reduceredundancy.push_back(&PARAM_ZDROP);
reduceredundancy.push_back(&PARAM_CLUSTER_MODE);


// assembledb workflow
assembleDBworkflow = combineList(rescorediagonal, kmermatcher);
@@ -141,44 +145,35 @@ class LocalParameters : public Parameters {
assemblerworkflow = combineList(assembleDBworkflow, createdb);

// nucl assembledb workflow
nuclassembleDBworkflow = combineList(rescorediagonal, kmermatcher);
nuclassembleDBworkflow = combineList(nuclassembleDBworkflow, assembleresults);
nuclassembleDBworkflow = combineList(nuclassembleDBworkflow, cyclecheck);

nuclassembleDBworkflow.push_back(&PARAM_CYCLE_CHECK);
nuclassembleDBworkflow.push_back(&PARAM_MIN_CONTIG_LEN);
nuclassembleDBworkflow.push_back(&PARAM_NUM_ITERATIONS);
nuclassembleDBworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
nuclassembleDBworkflow.push_back(&PARAM_DELETE_TMP_INC);
nuclassembleDBworkflow.push_back(&PARAM_RUNNER);

nuclassembleDBworkflow = combineList(nuclassembleDBworkflow, kmermatcher);
nuclassembleDBworkflow = combineList(nuclassembleDBworkflow, rescorediagonal);
nuclassembleDBworkflow = combineList(nuclassembleDBworkflow, assembleresults);
nuclassembleDBworkflow = combineList(nuclassembleDBworkflow, cyclecheck);

// easynuclassembleworkflow
nuclassemblerworkflow = combineList(nuclassembleDBworkflow, createdb);

// hybridassembleresults
hybridassembleresults.push_back(&PARAM_MIN_SEQ_ID);
hybridassembleresults.push_back(&PARAM_MAX_SEQ_LEN);
hybridassembleresults.push_back(&PARAM_RESCORE_MODE);
hybridassembleresults.push_back(&PARAM_THREADS);
hybridassembleresults.push_back(&PARAM_V);
hybridassembleresults.push_back(&PARAM_RESCORE_MODE);


// hybridassembledbworkflow
hybridassembleDBworkflow.push_back(&PARAM_CLUST_THR);
hybridassembleDBworkflow.push_back(&PARAM_CYCLE_CHECK);
hybridassembleDBworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
hybridassembleDBworkflow.push_back(&PARAM_DELETE_TMP_INC);
hybridassembleDBworkflow.push_back(&PARAM_RUNNER);
hybridassembleDBworkflow.push_back(&PARAM_THREADS);
hybridassembleDBworkflow.push_back(&PARAM_V);

hybridassembleDBworkflow = combineList(hybridassembleDBworkflow, extractorfs);
hybridassembleDBworkflow = combineList(hybridassembleDBworkflow, hybridassembleresults);
hybridassembleDBworkflow = combineList(extractorfs, hybridassembleresults);
hybridassembleDBworkflow = combineList(hybridassembleDBworkflow, nuclassembleDBworkflow);
hybridassembleDBworkflow = combineList(hybridassembleDBworkflow, reduceredundancy);
hybridassembleDBworkflow.push_back(&PARAM_CLUST_MIN_SEQ_ID_THR);
hybridassembleDBworkflow.push_back(&PARAM_CLUST_C);

// hybridassembledbworkflow special: replace with MultiParam to make aa and nucl values independent
// hybridassembledbworkflow special parameter: replace with MultiParam to make aa and nucl values independent
hybridassembleDBworkflow = removeParameter(hybridassembleDBworkflow, PARAM_K);
hybridassembleDBworkflow.push_back(&PARAM_MULTI_K);
hybridassembleDBworkflow = removeParameter(hybridassembleDBworkflow, PARAM_MIN_SEQ_ID);
@@ -194,7 +189,8 @@ class LocalParameters : public Parameters {
filterProteins = 1;
deleteFilesInc = 1;
proteinFilterThreshold = 0.2;
clustThr = 0.97;
clustSeqIdThr = 0.97;
clustCovThr = 0.99;
minContigLen = 1000;
chopCycle = false;
cycleCheck = true;
@@ -51,7 +51,7 @@ int easyassembler(int argc, const char **argv, const Command &command) {
par.overrideParameterDescription(par.PARAM_NUM_ITERATIONS, "Number of assembly iterations [1, inf]", NULL, 0);
par.overrideParameterDescription(par.PARAM_E, "Extend sequences if the E-value is below [0.0, inf]", NULL, 0);

par.PARAM_COV_MODE.addCategory(MMseqsParameter::COMMAND_HIDDEN);
par.PARAM_COV_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_C.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ID_OFFSET.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_CONTIG_END_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
@@ -15,7 +15,7 @@ void setHybridAssemblerWorkflowDefaults(LocalParameters *p) {
p->alphabetSize = MultiParam<int>(13,5);

p->orfMinLength = 45;
p->covThr = 0.0;
p->covThr = 0.00;
p->evalThr = 0.00001;
p->maskMode = 0;
p->kmersPerSequence = 60;
@@ -31,6 +31,7 @@ void setHybridAssemblerWorkflowDefaults(LocalParameters *p) {
p->chopCycle = true;

//cluster defaults
p->covThr = 0.99;
p->clusteringMode = 2;
p->gapOpen = 5;
p->gapExtend = 2;
@@ -41,31 +42,43 @@ int hybridassembledb(int argc, const char **argv, const Command &command) {
LocalParameters &par = LocalParameters::getLocalInstance();
setHybridAssemblerWorkflowDefaults(&par);

par.overrideParameterDescription(par.PARAM_MIN_SEQ_ID, "Overlap sequence identity threshold [0.0, 1.0]", NULL, 0);
// par.overrideParameterDescription(par.PARAM_ORF_MIN_LENGTH, "Min codons in orf", "minimum codon number in open reading frames", 0);
// par.overrideParameterDescription(par.PARAM_NUM_ITERATIONS, "Number of assembly iterations [1, inf]", NULL, 0);
par.overrideParameterDescription(par.PARAM_E, "Extend sequences if the E-value is below [0.0, inf]", NULL, 0);

par.PARAM_COV_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_C.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.overrideParameterDescription(par.PARAM_MULTI_MIN_SEQ_ID, "Overlap sequence identity threshold (range 0.0-1.0)", NULL, 0);
par.overrideParameterDescription(par.PARAM_E, "Extend sequences if the E-value is below (range 0.0-inf)", NULL, 0);
par.overrideParameterDescription(par.PARAM_GAP_OPEN, "Gap open cost (only for clustering", NULL, 0);
par.overrideParameterDescription(par.PARAM_GAP_EXTEND, "Gap extend cost (only for clustering)", NULL, 0);
par.overrideParameterDescription(par.PARAM_ZDROP, "Maximal allowed difference between score values before alignment is truncated (only for clustering)", NULL, 0);

// hide this parameters (changing them would lead to unexpected behavior)
par.PARAM_C.replaceCategory(MMseqsParameter::COMMAND_HIDDEN);
par.PARAM_CONTIG_END_MODE.replaceCategory(MMseqsParameter::COMMAND_HIDDEN);
par.PARAM_CONTIG_START_MODE.replaceCategory(MMseqsParameter::COMMAND_HIDDEN);
par.PARAM_ORF_MAX_GAP.replaceCategory(MMseqsParameter::COMMAND_HIDDEN);
par.PARAM_ORF_START_MODE.replaceCategory(MMseqsParameter::COMMAND_HIDDEN);
par.PARAM_WRAPPED_SCORING.replaceCategory(MMseqsParameter::COMMAND_HIDDEN);

// expert
par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_CLUSTER_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_DB_TYPE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ID_OFFSET.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_CONTIG_END_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_CONTIG_START_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ORF_MAX_GAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ORF_START_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_KMER_PER_SEQ_SCALE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ORF_MAX_LENGTH.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ORF_MIN_LENGTH.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ORF_FORWARD_FRAMES.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_ORF_REVERSE_FRAMES.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_SEQ_ID_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_SEQ_ID_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_SORT_RESULTS.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_TRANSLATION_TABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_TRANSLATE.addCategory(MMseqsParameter::COMMAND_EXPERT);
par.PARAM_USE_ALL_TABLE_STARTS.addCategory(MMseqsParameter::COMMAND_EXPERT);

par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
CommandCaller cmd;


std::string tmpDir = par.filenames.back();
std::string hash = SSTR(par.hashParameter(par.filenames, *command.params));
if (par.reuseLatest) {
@@ -88,7 +101,6 @@ int hybridassembledb(int argc, const char **argv, const Command &command) {
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
cmd.addVariable("RUNNER", par.runner.c_str());


// set values for protein level assembly
par.numIterations = par.multiNumIterations.aminoacids;
par.kmerSize = par.multiKmerSize.aminoacids;
@@ -111,6 +123,11 @@ int hybridassembledb(int argc, const char **argv, const Command &command) {
par.orfMaxGaps = 0;
cmd.addVariable("EXTRACTORFS_START_PAR", par.createParameterString(par.extractorfs).c_str());

// force parameters for assembly steps
par.covThr = 0.0;
par.seqIdMode = 0;
par.includeOnlyExtendable = true;

// # 1. Finding exact $k$-mer matches.
cmd.addVariable("KMERMATCHER_PAR", par.createParameterString(par.kmermatcher).c_str());

@@ -132,9 +149,9 @@ int hybridassembledb(int argc, const char **argv, const Command &command) {
cmd.addVariable("NUCL_ASM_PAR", par.createParameterString(par.nuclassembleDBworkflow).c_str());

// set mandatory values for redundancy reduction
par.seqIdThr = par.clustThr;
par.seqIdThr = par.clustSeqIdThr;
par.covMode = 1;
par.covThr = 0.99;
par.covThr = par.clustCovThr;
par.wrappedScoring = true;
par.ignoreMultiKmer = true;
cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.reduceredundancy).c_str());

0 comments on commit 8e0b20d

Please sign in to comment.
You can’t perform that action at this time.