Skip to content
Permalink
Browse files

added deleteIncremental logic for nucliotide assembler, some further …

…cleanups
  • Loading branch information
AnnSeidel committed Nov 27, 2019
1 parent 2e860f5 commit 987a932f72e440145030dad886f6d6c7e008d27d
Showing with 71 additions and 40 deletions.
  1. +61 −27 data/nuclassembler.sh
  2. +0 −6 src/assembler/cyclecheck.cpp
  3. +1 −0 src/commons/LocalParameters.h
  4. +7 −7 src/plass.cpp
  5. +2 −0 src/workflow/Nuclassembler.cpp
@@ -6,39 +6,57 @@ fail() {
exit 1
}

deleteIncremental() {
if [ -n "$REMOVE_INCREMENTAL_TMP" ] && [ -n "$1" ]; then
"$MMSEQS" rmdb "$1"
fi
}

notExists() {
[ ! -f "$1" ]
}

cyclecheck() {
if [ -n "$CALL_CYCLE_CHECK" ]; then
if notExists "${1}_cycle"; then
if notExists "${1}_cycle.done"; then
# shellcheck disable=SC2086
"$MMSEQS" cyclecheck "$1" "${1}_cycle" ${CYCLE_CHECK_PAR} \
|| fail "Cycle check step died"
fi

if [ -s "${1}_cycle" ]; then

if notExists "${1}_noneCycle"; then
awk 'NR==FNR { a[$1]=$0; next } !($1 in a) {print $0}' "${1}_cycle.index" \
"${1}.index" > "${1}_noneCycle.index"
ln -s "$1" "${1}_noneCycle"
ln -s "${1}.dbtype" "${1}_noneCycle.dbtype"
fi
if [ -s "${1}_cycle" ]; then

if [ -z "$RESULT_CYC" ]; then
RESULT_CYC="${1}_cycle"
else
if notExists "${1}_cycle__all"; then
if notExists "${1}_noneCycle"; then
awk 'NR==FNR { a[$1]=$0; next } !($1 in a) {print $0}' "${1}_cycle.index" \
"${1}.index" > "${1}_noneCycle.index"
ln -s "$1" "${1}_noneCycle"
ln -s "${1}.dbtype" "${1}_noneCycle.dbtype"
fi

if [ -z "$PREV_CYCLE_ALL" ]; then
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "${RESULT_CYC}" "${1}_cycle" "${1}_cycle_all" --preserve-keys
"$MMSEQS" mvdb "${1}_cycle" "${1}_cycle_all"
else
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "${PREV_CYCLE_ALL}" "${1}_cycle" "${1}_cycle_all" --preserve-keys
fi
RESULT_CYC="${1}_cycle_all"

else
ln -s "$1" "${1}_noneCycle"
ln -s "${1}.index" "${1}_noneCycle.index"
ln -s "${1}.dbtype" "${1}_noneCycle.dbtype"
fi
touch "${1}_cycle.done"
deleteIncremental "$PREV_CYCLE"
PREV_CYCLE="${1}_cycle"
fi

PREV_ASSEMBLY="${1}_noneCycle"
if [ -s "${1}_cycle_all" ]; then
deleteIncremental "${PREV_CYCLE_ALL}"
PREV_CYCLE_ALL="${1}_cycle_all"
fi

PREV_ASSEMBLY="${1}_noneCycle"
fi
}

@@ -73,40 +91,51 @@ while [ $STEP -lt $NUM_IT ]; do
echo "STEP: $STEP"

# 1. Finding exact $k$-mer matches.
if notExists "${TMP_PATH}/pref_$STEP"; then
if notExists "${TMP_PATH}/pref_${STEP}.done"; then
# shellcheck disable=SC2086
"$MMSEQS" kmermatcher "$INPUT" "${TMP_PATH}/pref_$STEP" ${KMERMATCHER_PAR} \
"$MMSEQS" kmermatcher "$INPUT" "${TMP_PATH}/pref_${STEP}" ${KMERMATCHER_PAR} \
|| fail "Kmer matching step died"
deleteIncremental "$PREV_KMER_PREF"
touch "${TMP_PATH}/pref_${STEP}.done"
PREV_KMER_PREF="${TMP_PATH}/pref_${STEP}"
fi

# 2. Ungapped alignment
if notExists "${TMP_PATH}/aln_$STEP"; then
if notExists "${TMP_PATH}/aln_${STEP}.done"; then
# shellcheck disable=SC2086
"$MMSEQS" rescorediagonal "$INPUT" "$INPUT" "${TMP_PATH}/pref_$STEP" "${TMP_PATH}/aln_$STEP" ${UNGAPPED_ALN_PAR} \
"$MMSEQS" rescorediagonal "$INPUT" "$INPUT" "${TMP_PATH}/pref_${STEP}" "${TMP_PATH}/aln_${STEP}" ${UNGAPPED_ALN_PAR} \
|| fail "Ungapped alignment step died"
touch "${TMP_PATH}/aln_${STEP}.done"
deleteIncremental "$PREV_ALN"
PREV_ALN="${TMP_PATH}/aln_${STEP}"
fi

# 3. Assemble
if notExists "${TMP_PATH}/assembly_$STEP"; then
if notExists "${TMP_PATH}/assembly_${STEP}.done"; then
# shellcheck disable=SC2086
"$MMSEQS" assembleresults "$INPUT" "${TMP_PATH}/aln_$STEP" "${TMP_PATH}/assembly_$STEP" ${ASSEMBLE_RESULT_PAR} \
"$MMSEQS" assembleresults "$INPUT" "${TMP_PATH}/aln_${STEP}" "${TMP_PATH}/assembly_${STEP}" ${ASSEMBLE_RESULT_PAR} \
|| fail "Assembly step died"
touch "${TMP_PATH}/assembly_${STEP}.done"
deleteIncremental "$PREV_ASSEMBLY"
deleteIncremental "$PREV_ASSEMBLY_STEP"
fi
PREV_ASSEMBLY="${TMP_PATH}/assembly_$STEP"

PREV_ASSEMBLY="${TMP_PATH}/assembly_${STEP}"
PREV_ASSEMBLY_STEP="${TMP_PATH}/assembly_${STEP}"
cyclecheck "${PREV_ASSEMBLY}"

INPUT="${PREV_ASSEMBLY}"
STEP="$((STEP+1))"
done
STEP="$((STEP-1))"
RESULT="${TMP_PATH}/assembly_${STEP}"

if [ -n "$RESULT_CYC" ]; then
if [ -n "$PREV_CYCLE_ALL" ]; then

RESULT="${TMP_PATH}/assembly_merged"
if notExists "${TMP_PATH}/assembly_merged"; then
# shellcheck disable=SC2086
"$MMSEQS" concatdbs "${PREV_ASSEMBLY}" "${RESULT_CYC}" "${TMP_PATH}/assembly_merged" --preserve-keys
"$MMSEQS" concatdbs "${PREV_ASSEMBLY}" "${PREV_CYCLE_ALL}" "${TMP_PATH}/assembly_merged" --preserve-keys
fi
fi

@@ -149,8 +178,13 @@ fi

if notExists "${TMP_PATH}/assembly_final_rep_h"; then
# shellcheck disable=SC2086
"$MMSEQS" createhdb "${TMP_PATH}/assembly_final_rep" "${RESULT_CYC}" "${TMP_PATH}/assembly_final_rep" ${VERBOSITY_PAR} \
|| fail "createhdb failed"
if notExists "${PREV_CYCLE_ALL}";then
"$MMSEQS" createhdb "${TMP_PATH}/assembly_final_rep" "${TMP_PATH}/assembly_final_rep" ${VERBOSITY_PAR} \
|| fail "createhdb failed"
else
"$MMSEQS" createhdb "${TMP_PATH}/assembly_final_rep" "${PREV_CYCLE_ALL}" "${TMP_PATH}/assembly_final_rep" ${VERBOSITY_PAR} \
|| fail "createhdb failed"
fi
fi

if notExists "${TMP_PATH}/assembly_final_rep.fasta"; then
@@ -155,11 +155,6 @@ int cyclecheck(int argc, const char **argv, const Command& command) {
}

}
//std:: cout << "number of kmermatches " << kmermatches << std::endl;
/*for (size_t i=0; i < seqLen/2; i++) {
if (diagHits[i] != 0)
std:: cout << id << "\t" << seqLen << "\t" << i+seqLen/2 << "\t" << diagHits[i] << std::endl;
}*/

/* calculate maximal hit rate on diagonal bands */
int splitDiagonal = -1;
@@ -214,6 +209,5 @@ int cyclecheck(int argc, const char **argv, const Command& command) {
delete seqDbr;

return EXIT_SUCCESS;
//TODO: split main in functionsq

}
@@ -118,6 +118,7 @@ class LocalParameters : public Parameters {
nuclassemblerworkflow.push_back(&PARAM_CLUST_THR);
nuclassemblerworkflow.push_back(&PARAM_NUM_ITERATIONS);
nuclassemblerworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
nuclassemblerworkflow.push_back(&PARAM_DELETE_TMP_INC);
nuclassemblerworkflow.push_back(&PARAM_RUNNER);
nuclassemblerworkflow = combineList(nuclassemblerworkflow, kmermatcher);
nuclassemblerworkflow = combineList(nuclassemblerworkflow, rescorediagonal);
@@ -20,22 +20,22 @@ std::vector<struct Command> commands = {
"<i:fast(a|q)File[.gz]> | <i:fastqFile1_1[.gz] ... <i:fastqFileN_1[.gz] <i:fastqFile1_2[.gz] ... <i:fastqFileN_2[.gz]> <o:fastaFile> <tmpDir>",
CITATION_PLASS, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
{"nuclassemble", nuclassembler, &localPar.nuclassemblerworkflow, COMMAND_MAIN,
"Assemble nucleotide sequences by iterative greedy overlap assembly. (under development)",
"Assemble nucleotide sequences by iterative greedy overlap assembly (under development). Extends sequence to the left and right using ungapped alignments.",
"Annika Seidel & Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
"Assemble nucleotide sequences by iterative greedy overlap assembly. (experimental",
"Assemble nucleotide sequences by iterative greedy overlap assembly (experimental). Extends sequence to the left and right using ungapped alignments.",
"Annika Seidel <annika.seidel@mpibpc.mpg.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
"<i:fast(a|q)File[.gz]> | <i:fastqFile1_1[.gz] ... <i:fastqFileN_1[.gz] <i:fastqFile1_2[.gz] ... <i:fastqFileN_2[.gz]> <o:fastaFile> <tmpDir>",
CITATION_PLASS, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
{"hybridassemble", hybridassembler, &localPar.hybridassemblerworkflow, COMMAND_HIDDEN,
"Assemble nucleotide sequences by iterative greedy overlap assembly using protein and nucleotide information. (under development)",
"Assemble nucleotide sequences by iterative greedy overlap assembly using protein and nucleotide information (under development)."
"Assemble nucleotide sequences by iterative greedy overlap assembly using protein and nucleotide information. (experimental)",
"Assemble nucleotide sequences by iterative greedy overlap assembly using protein and nucleotide information (experimental)."
"Extends sequence to the left and right using ungapped alignments on proteins followed by further processing using ungapped alignments on nucleotide level.",
"Annika Seidel <annika.seidel@mpibpc.mpg.de> & Martin Steinegger ",
"Annika Seidel <annika.seidel@mpibpc.mpg.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
"<i:fast(a|q)File[.gz]> | <i:fastqFile1_1[.gz] ... <i:fastqFileN_1[.gz] <i:fastqFile1_2[.gz] ... <i:fastqFileN_2[.gz]> <o:fastaFile> <tmpDir>",
CITATION_PLASS, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
{"assembleresults", assembleresult, &localPar.assembleresults, COMMAND_HIDDEN,
"Extending representative sequence to the left and right side using ungapped alignments.",
NULL,
"Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
"Annika Seidel <annika.seidel@mpibpc.mpg.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
"<i:sequenceDB> <i:alnResult> <o:reprSeqDB>",
CITATION_PLASS, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"alnResult", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
@@ -87,6 +87,8 @@ int nuclassembler(int argc, const char **argv, const Command &command) {
par.filenames.pop_back();

cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
cmd.addVariable("REMOVE_INCREMENTAL_TMP", par.deleteFilesInc ? "TRUE" : NULL);

cmd.addVariable("RUNNER", par.runner.c_str());
cmd.addVariable("NUM_IT", SSTR(par.numIterations).c_str());
// # 1. Finding exact $k$-mer matches.

0 comments on commit 987a932

Please sign in to comment.
You can’t perform that action at this time.