Permalink
Browse files

Merge branch 'master' of https://github.com/soedinglab/MMseqs2

  • Loading branch information...
martin-steinegger committed Oct 11, 2018
2 parents 18b2e68 + d95c0d6 commit 8306271e1e0d40f05963d6a7d10533767914cdca
View
@@ -42,7 +42,7 @@ set(COMPILED_RESOURCES
libPure_blosum62_255.lib
libPure_blosum62_32.lib
libPolished_8.lib
searchslicemodetargetprofile.sh
searchslicedtargetprofile.sh
cs219.lib
)
@@ -0,0 +1,176 @@
#!/bin/sh -e
fail() {
echo "Error: $1"
exit 1
}
notExists() {
[ ! -f "$1" ]
}
hasCommand () {
command -v "$1" >/dev/null 2>&1 || { echo "Please make sure that $1 is in \$PATH."; exit 1; }
}
abspath() {
if [ -d "$1" ]; then
(cd "$1"; pwd)
elif [ -f "$1" ]; then
if [ -z "${1##*/*}" ]; then
echo "$(cd "${1%/*}"; pwd)/${1##*/}"
else
echo "$(pwd)/$1"
fi
elif [ -d "$(dirname "$1")" ]; then
echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
fi
}
hasCommand awk
hasCommand wc
hasCommand join
hasCommand sort
# check amount of input variables
[ "$#" -ne 4 ] && echo "Please provide <queryDB> <targetDB> <resultDB> <tmpDir>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ ! -f "$2" ] && echo "$2 not found!" && exit 1;
[ -f "$3" ] && echo "$3 exists already!" && exit 1;
[ ! -d "$4" ] && echo "TMP directory $4 not found!" && mkdir -p "$4";
INPUT="$1"
TARGET="$(abspath "$2")"
RESULT="$3"
TMP_PATH="$4"
PROFILEDB="${TMP_PATH}/profileDB"
if notExists "${TMP_PATH}/profileDB"; then
# symlink the profile DB that can be reduced at every iteration the search
ln -s "${TARGET}" "${PROFILEDB}"
ln -s "${TARGET}.dbtype" "${PROFILEDB}.dbtype"
sort -k1,1 "${TARGET}.index" > "${PROFILEDB}.index"
echo "${AVAIL_DISK}" > "${PROFILEDB}.params"
else
read -r AVAIL_DISK < "${PROFILEDB}.params"
fi
# echo call to trim whitespace wc produces
# shellcheck disable=SC2046,SC2005
NUM_PROFILES="$(echo $(wc -l < "${PROFILEDB}.index"))"
OFFSET=0
STEP=0
# MAX_STEPS is set by the workflow
# shellcheck disable=SC2153
while [ "${STEP}" -lt "${MAX_STEPS}" ] && [ "${NUM_PROFILES}" -gt 0 ]; do
if [ -f "${TMP_PATH}/aln_${STEP}.checkpoint" ]; then
# restore values from previous run, in case it was aborted
read -r NUM_PROFILES OFFSET < "${TMP_PATH}/aln_${STEP}.checkpoint"
continue
fi
# Compute the max number of sequence according to the number of profiles
# 90 bytes/query-result line max.
MAX_SEQS="$((1024*(AVAIL_DISK/NUM_PROFILES)/90))"
# No more disk space allowed to use
if [ "${MAX_SEQS}" -eq 0 ]; then
break
fi
# Max result to go into the prefilter list
SEARCH_LIM="$((OFFSET+MAX_SEQS))"
if notExists "${TMP_PATH}/pref.done"; then
# shellcheck disable=SC2086
${RUNNER} "$MMSEQS" prefilter "${PROFILEDB}" "${INPUT}" "${TMP_PATH}/pref" \
--max-seqs "${SEARCH_LIM}" --offset-result "${OFFSET}" ${PREFILTER_PAR} \
|| fail "prefilter died"
touch "${TMP_PATH}/pref.done"
fi
if notExists "${TMP_PATH}/pref_count" || notExists "${TMP_PATH}/pref_keep.list"; then
# shellcheck disable=SC2086
"$MMSEQS" result2stats "$PROFILEDB" "$INPUT" "${TMP_PATH}/pref" "${TMP_PATH}/pref_count" \
--stat linecount ${THREADS_PAR} \
|| fail "result2stats died"
fi
if notExists "${TMP_PATH}/pref_keep.list"; then
# remove profiles that do not provide more hits
# shellcheck disable=SC2086
"$MMSEQS" filterdb "${TMP_PATH}/pref_count" "${TMP_PATH}/pref_keep" \
--filter-column 1 --comparison-operator ge --comparison-value "${MAX_SEQS}" ${THREADS_PAR} \
|| fail "filterdb died"
rm -f "${TMP_PATH}/pref_count" "${TMP_PATH}/pref_count.index"
awk '$3 > 1 { print $1 }' "${TMP_PATH}/pref_keep.index" | sort -k1,1 > "${TMP_PATH}/pref_keep.list"
rm -f "${TMP_PATH}/pref_keep" "${TMP_PATH}/pref_keep.index"
fi
if notExists "${TMP_PATH}/aln.done"; then
# shellcheck disable=SC2086
${RUNNER} "$MMSEQS" align "${PROFILEDB}" "${INPUT}" "${TMP_PATH}/pref" "${TMP_PATH}/aln" ${ALIGNMENT_PAR} \
|| fail "align died"
rm -f "${TMP_PATH}/pref" "${TMP_PATH}/pref.index"
touch "${TMP_PATH}/aln.done"
fi
if notExists "${TMP_PATH}/aln_swap.done"; then
# note: we recover the right evalue, since it is computed according to the original target db
# shellcheck disable=SC2086
"$MMSEQS" swapresults "${TARGET}" "${INPUT}" "${TMP_PATH}/aln" "${TMP_PATH}/aln_swap" ${SWAP_PAR} \
|| fail "swapresults died"
rm -f "${TMP_PATH}/aln" "${TMP_PATH}/aln.index"
touch "${TMP_PATH}/aln_swap.done"
fi
MERGED="${TMP_PATH}/aln_swap"
if [ -f "${TMP_PATH}/aln_merged" ]; then
# shellcheck disable=SC2086
"$MMSEQS" mergedbs "${INPUT}" "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged" "${TMP_PATH}/aln_swap" ${VERBOSITY_PAR} \
|| fail "mergedbs died"
mv -f "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged.index"
mv -f "${TMP_PATH}/aln_merged_new.index" "${TMP_PATH}/aln_merged.index"
rm -f "${TMP_PATH}/aln_swap" "${TMP_PATH}/aln_swap.index"
MERGED="${TMP_PATH}/aln_merged"
fi
# keep only the top max-seqs hits according to the default alignment sorting criteria
# shellcheck disable=SC2086
"$MMSEQS" sortresult "${MERGED}" "${TMP_PATH}/aln_merged_trunc" ${SORTRESULT_PAR} \
|| fail "sortresult died"
mv -f "${TMP_PATH}/aln_merged_trunc" "${TMP_PATH}/aln_merged"
mv -f "${TMP_PATH}/aln_merged_trunc.index" "${TMP_PATH}/aln_merged.index"
join "${TMP_PATH}/pref_keep.list" "${PROFILEDB}.index" > "${PROFILEDB}.index.tmp"
mv -f "${PROFILEDB}.index.tmp" "${PROFILEDB}.index"
# keep for the prefilter only the next hits
OFFSET="$SEARCH_LIM"
# shellcheck disable=SC2046,SC2005
NUM_PROFILES="$(echo $(wc -l < "${PROFILEDB}.index"))"
rm -f "${TMP_PATH}/pref.done" "${TMP_PATH}/aln.done" "${TMP_PATH}/aln_swap.done" "${TMP_PATH}/pref_keep.list"
printf "%d\\t%d\\n" "${NUM_PROFILES}" "${OFFSET}" > "${TMP_PATH}/aln_${STEP}.checkpoint"
STEP="$((STEP+1))"
done
mv -f "${TMP_PATH}/aln_merged" "${RESULT}"
mv -f "${TMP_PATH}/aln_merged.index" "${RESULT}.index"
if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files"
while [ "$STEP" -lt "$MAX_STEPS" ]; do
if [ -f "${TMP_PATH}/aln_${STEP}.checkpoint" ]; then
rm -f "${TMP_PATH}/aln_${STEP}.checkpoint"
fi
done
rm -f "${PROFILEDB}" "${PROFILEDB}.index" "${PROFILEDB}.dbtype" "${PROFILEDB}.params"
rm -f "$TMP_PATH/searchslicedtargetprofile.sh"
fi

This file was deleted.

Oops, something went wrong.
@@ -15,6 +15,7 @@ extern int combinepvalperset(int argc, const char **argv, const Command &command
extern int concatdbs(int argc, const char **argv, const Command& command);
extern int convert2fasta(int argc, const char **argv, const Command& command);
extern int convertalignments(int argc, const char **argv, const Command& command);
extern int convertca3m(int argc, const char **argv, const Command& command);
extern int convertkb(int argc, const char **argv, const Command& command);
extern int convertmsa(int argc, const char **argv, const Command& command);
extern int convertprofiledb(int argc, const char **argv, const Command& command);
@@ -28,6 +29,7 @@ extern int diffseqdbs(int argc, const char **argv, const Command& command);
extern int easycluster(int argc, const char **argv, const Command& command);
extern int easylinclust(int argc, const char **argv, const Command& command);
extern int easysearch(int argc, const char **argv, const Command& command);
extern int expandaln(int argc, const char **argv, const Command& command);
extern int extractalignedregion(int argc, const char **argv, const Command& command);
extern int extractdomains(int argc, const char **argv, const Command& command);
extern int extractorfs(int argc, const char **argv, const Command& command);
@@ -65,6 +67,7 @@ extern int search(int argc, const char **argv, const Command& command);
extern int sequence2profile(int argc, const char **argv, const Command& command);
extern int shellcompletion(int argc, const char **argv, const Command& command);
extern int shellcompletion(int argc, const char **argv, const Command& command);
extern int sortresult(int argc, const char **argv, const Command& command);
extern int splitdb(int argc, const char **argv, const Command& command);
extern int subtractdbs(int argc, const char **argv, const Command& command);
extern int suffixid(int argc, const char **argv, const Command& command);
@@ -364,7 +364,8 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
res.dbcov = 1.0f;
res.seqId = 1.0f;
}
if(checkCriteriaAndAddHitToList(res, isIdentity, swResults)){
if(checkCriteria(res, isIdentity, evalThr, seqIdThr, covMode, covThr)){
swResults.emplace_back(res);
passedNum++;
totalPassedNum++;
rejected = 0;
@@ -407,7 +408,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
}
}
// put the contents of the swResults list into ffindex DB
// put the contents of the swResults list into a result DB
for (size_t result = 0; result < swResults.size(); result++) {
size_t len = Matcher::resultToBuffer(buffer, swResults[result], addBacktrace);
alnResultsOutString.append(buffer, len);
@@ -490,7 +491,7 @@ size_t Alignment::estimateHDDMemoryConsumption(int dbSize, int maxSeqs) {
}
bool Alignment::checkCriteriaAndAddHitToList(Matcher::result_t &res, bool isIdentity, std::vector<Matcher::result_t> &swHits){
bool Alignment::checkCriteria(Matcher::result_t &res, bool isIdentity, double evalThr, double seqIdThr, int covMode, float covThr) {
const bool evalOk = (res.eval <= evalThr); // -e
const bool seqIdOK = (res.seqId >= seqIdThr); // --min-seq-id
const bool covOK = Util::hasCoverage(covThr, covMode, res.qcov, res.dbcov);
@@ -503,7 +504,6 @@ bool Alignment::checkCriteriaAndAddHitToList(Matcher::result_t &res, bool isIden
covOK
))
{
swHits.push_back(res);
return true;
} else {
return false;
@@ -529,8 +529,9 @@ void Alignment::computeAlternativeAlignment(unsigned int queryDbKey, Sequence &d
for (int altAli = 0; altAli < altAlignment && nextAlignment; altAli++) {
Matcher::result_t res = matcher.getSWResult(&dbSeq, INT_MAX, covMode, covThr, evalThr, swMode,
seqIdMode, isIdentity);
nextAlignment = checkCriteriaAndAddHitToList(res, isIdentity, swResults);
nextAlignment = checkCriteria(res, isIdentity, evalThr, seqIdThr, covMode, covThr);
if (nextAlignment == true) {
swResults.emplace_back(res);
for (int pos = res.dbStartPos; pos < res.dbEndPos; pos++) {
dbSeq.int_sequence[pos] = xIndex;
}
Oops, something went wrong.

0 comments on commit 8306271

Please sign in to comment.