Permalink
Browse files

Workflow for sliced rev profile search

  • Loading branch information...
ClovisG committed Sep 14, 2018
1 parent 8c45f81 commit 587b47d67463d282ee85b5a8cfe5926e9a2f4c3c
View
@@ -40,3 +40,4 @@ BacteriaBench/
EliTestFiles/
NCBI_Bacteria_Orfs/
build.old/
.vscode/c_cpp_properties.json
View
@@ -0,0 +1,21 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
"version": "2.0.0",
"tasks": [
{
"label": "build",
"type": "shell",
"command": "cd build && make -j 4",
"args": [],
"group": {
"kind": "build",
"isDefault": true
},
"presentation": {
"reveal": "silent"
},
"problemMatcher": "$msCompile"
}
]
}
View
@@ -42,6 +42,7 @@ set(COMPILED_RESOURCES
libPure_blosum62_255.lib
libPure_blosum62_32.lib
libPolished_8.lib
searchslicemodetargetprofile.sh
)
set(GENERATED_OUTPUT_HEADERS "")
@@ -0,0 +1,125 @@
#!/bin/sh -ex
fail() {
echo "Error: $1"
exit 1
}
notExists() {
[ ! -f "$1" ]
}
hasCommand () {
command -v "$1" >/dev/null 2>&1 || { echo "Please make sure that $1 is in \$PATH."; exit 1; }
}
hasCommand awk
hasCommand join
hasCommand sort
# check amount of input variables
[ "$#" -ne 4 ] && echo "Please provide <queryDB> <targetDB> <resultDB> <tmpDir>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ ! -f "$2" ] && echo "$2 not found!" && exit 1;
[ -f "$3" ] && echo "$3 exists already!" && exit 1;
[ ! -d "$4" ] && echo "TMP directory $4 not found!" && mkdir -p "$4";
INPUT="$1"
RESULTS="$3"
TMP_PATH="$4"
# Copy of the profile DB that can be reduced as the search processes
ln -sf $(realpath $2) $TMP_PATH/profileDB
sort -k1,1 $2.index > $TMP_PATH/profileDB.index
ln -sf $(realpath $2).dbtype $TMP_PATH/profileDB.dbtype
PROFILEDB=$TMP_PATH/profileDB
FULLPROFILEDB=$TMP_PATH/fullProfileDB
ln -sf $(realpath $2) $FULLPROFILEDB
ln -sf $(realpath $2.index) $FULLPROFILEDB.index
ln -sf $(realpath $2.dbtype) $FULLPROFILEDB.dbtype
offset=0 #start with the first result
nProfiles=$(awk '{n=n+1}END{print n}' $PROFILEDB.index)
nSequences=$(awk '{n=n+1}END{print n}' $INPUT.index)
STEP=0
while [ $STEP -lt $MAX_STEPS ] && [ $nProfiles -gt 0 ];
do
#MEMORY_FOR_SWAPPING=$(free|grep Mem|awk '{print $4}') #1000000 #10000000
# compute the max number of sequence that are reasonable to swap
# according to the number of profiles
# 90 bytes/query-result line max.
MAX_SEQS=$(awk -v mem=$AVAIL_MEM -v nProfiles=$nProfiles 'BEGIN{printf("%.0f\n", 1024*(mem/nProfiles)/90);}')
# Max result to go in the prefilter list
SEARCH_LIM=$(awk -v offset=$offset -v maxSeqs=$MAX_SEQS 'BEGIN{print offset+maxSeqs}')
#echo "Memory for swapping: $MEMORY_FOR_SWAPPING" >> $TMP_PATH/log.txt
#echo "Current iteration: searching for $MAX_SEQS sequences using $nProfiles profiles with an offset of $offset in the results" >> $TMP_PATH/log.txt
#rm -f $TMP_PATH/aln_.* $TMP_PATH/pref_* $TMP_PATH/searchOut.notSwapped.$nProfiles* #$TMP_PATH/searchOut.current*
if notExists "${TMP_PATH}/pref_notSwapped.$offset"; then
"$MMSEQS" prefilter $PROFILEDB $INPUT $TMP_PATH/pref_notSwapped.$offset --max-seqs $SEARCH_LIM \
--offset-result $offset ${PREFILTER_PAR} \
|| fail "Pref died"
fi
if notExists "${TMP_PATH}/pref.$offset"; then
"$MMSEQS" swapresults $PROFILEDB $INPUT $TMP_PATH/pref_notSwapped.$offset $TMP_PATH/pref.$offset \
|| fail "Swapping died"
fi
# note here : we recover the right evalue, since it is computed according to the target db
# which is the full profiledb
if notExists "${TMP_PATH}/aln.$offset"; then
"$MMSEQS" align $INPUT $FULLPROFILEDB $TMP_PATH/pref.$offset $TMP_PATH/aln.$offset ${ALIGNMENT_PAR} \
|| fail "Alignment died"
fi
if [ -f $TMP_PATH/searchOut ]; then
mmseqs mergedbs $INPUT $TMP_PATH/searchOut.new "${TMP_PATH}/aln.$offset" $TMP_PATH/searchOut
else
cp "${TMP_PATH}/aln.$offset" $TMP_PATH/searchOut.new
cp "${TMP_PATH}/aln.$offset.index" $TMP_PATH/searchOut.new.index
fi
"$MMSEQS" filterdb $TMP_PATH/searchOut.new $TMP_PATH/searchOut.new.sorted --sort-entries 1 --filter-column 4 $COMMONS \
|| fail "Filter (sorting) died"
rm -f $TMP_PATH/searchOut.new{,.index}
"$MMSEQS" filterdb $TMP_PATH/searchOut.new.sorted $TMP_PATH/searchOut.new.sorted.trunc --extract-lines $MAX_RESULTS_PER_QUERY $COMMONS \
|| fail "Filter (extract lines) died"
rm -f $TMP_PATH/searchOut.new.sorted{,.index}
mv -f $TMP_PATH/searchOut.new.sorted.trunc $TMP_PATH/searchOut
mv -f $TMP_PATH/searchOut.new.sorted.trunc.index $TMP_PATH/searchOut.index
# now remove the profiles that reached their eval threshold
notExists $TMP_PATH/searchOut.$offset.count && "$MMSEQS" result2stats $PROFILEDB $INPUT $TMP_PATH/pref_notSwapped.$offset $TMP_PATH/searchOut.$offset.count --stat linecount $COMMONS
notExists $TMP_PATH/searchOut.$offset.toKeep && "$MMSEQS" filterdb $TMP_PATH/searchOut.$offset.count $TMP_PATH/searchOut.$offset.toKeep --filter-column 1 --comparison-operator ge --comparison-value $MAX_SEQS $COMMONS
sort -k1,1 $TMP_PATH/searchOut.$offset.toKeep.index >$TMP_PATH/searchOut.$offset.toKeep.keys
join $TMP_PATH/searchOut.$offset.toKeep.keys $PROFILEDB.index >$PROFILEDB.index.tmp
mv -f $PROFILEDB.index.tmp $PROFILEDB.index # reduce the profile DB
offset=$SEARCH_LIM # keep for the prefilter only the next hits
nProfiles=$(awk '{n=n+1}END{print n}' $PROFILEDB.index)
STEP=$((STEP+1))
done
# Save the results
mv -f $TMP_PATH/searchOut $RESULTS
mv -f $TMP_PATH/searchOut.index $RESULTS.index
# Clean up
#rm -f $TMP_PATH/searchOut.toKeep $TMP_PATH/searchOut.toKeep.index
#rm -f $TMP_PATH/searchOut.count $TMP_PATH/searchOut.count.index
#rm -f $TMP_PATH/aln_4.* $TMP_PATH/pref_4* $TMP_PATH/searchOut.notSwapped* $TMP_PATH/searchOut.current*
#rm $TMP_PATH/profileDB*
@@ -127,6 +127,7 @@ Parameters::Parameters():
PARAM_NUM_ITERATIONS(PARAM_NUM_ITERATIONS_ID, "--num-iterations", "Number search iterations","Search iterations",typeid(int),(void *) &numIterations, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE),
PARAM_START_SENS(PARAM_START_SENS_ID, "--start-sens", "Start sensitivity","start sensitivity",typeid(float),(void *) &startSens, "^[0-9]*(\\.[0-9]+)?$"),
PARAM_SENS_STEPS(PARAM_SENS_STEPS_ID, "--sens-steps", "Search steps","Search steps performed from --start-sense and -s.",typeid(int),(void *) &sensSteps, "^[1-9]{1}$"),
PARAM_SLICE_SEARCH(PARAM_SLICE_SEARCH_ID, "--slice-search", "Run a seq-profile search in slice mode","For bigger profile DB, run iteratively the search by greedily swapping the search results.",typeid(bool),(void *) &sliceSearch, ""),
// easysearch
PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query.", typeid(bool), (void*)&greedyBestHits, ""),
// Orfs
@@ -670,6 +671,7 @@ Parameters::Parameters():
searchworkflow.push_back(PARAM_NUM_ITERATIONS);
searchworkflow.push_back(PARAM_START_SENS);
searchworkflow.push_back(PARAM_SENS_STEPS);
searchworkflow.push_back(PARAM_SLICE_SEARCH);
searchworkflow.push_back(PARAM_RUNNER);
searchworkflow.push_back(PARAM_REMOVE_TMP_FILES);
@@ -1174,6 +1176,7 @@ void Parameters::setDefaults() {
numIterations = 1;
startSens = 4;
sensSteps = 1;
sliceSearch = false;
greedyBestHits = false;
View
@@ -9,6 +9,9 @@
#include <typeinfo>
#include "Command.h"
#define USE_ONLY_SET_PARAMETERS true // when createParameterString, generates
// flags only for the param set by the user
#define PARAMETER(x) const static int x##_ID = __COUNTER__; \
MMseqsParameter x;
@@ -229,6 +232,7 @@ class Parameters {
int numIterations;
float startSens;
int sensSteps;
bool sliceSearch;
// easysearch
bool greedyBestHits;
@@ -540,6 +544,7 @@ class Parameters {
PARAMETER(PARAM_NUM_ITERATIONS)
PARAMETER(PARAM_START_SENS)
PARAMETER(PARAM_SENS_STEPS)
PARAMETER(PARAM_SLICE_SEARCH)
// easysearch
PARAMETER(PARAM_GREEDY_BEST_HITS)
View
@@ -355,7 +355,7 @@ void Util::checkAllocation(void *pointer, std::string message) {
}
size_t Util::getPageSize() {
return sysconf(_SC_PAGE_SIZE);
return sysconf(_SC_PAGE_SIZE); // in bytes
}
size_t Util::getTotalMemoryPages() {
@@ -370,7 +370,7 @@ size_t Util::getTotalMemoryPages() {
return phys_pages;
}
size_t Util::getTotalSystemMemory()
size_t Util::getTotalSystemMemory() // in bytes
{
// check for real physical memory
long pages = getTotalMemoryPages();
View
@@ -6,6 +6,7 @@
#include "Parameters.h"
#include "searchtargetprofile.sh.h"
#include "searchslicemodetargetprofile.sh.h"
#include "blastpgp.sh.h"
#include "translated_search.sh.h"
#include "blastp.sh.h"
@@ -127,7 +128,26 @@ int search(int argc, const char **argv, const Command& command) {
cmd.addVariable("RUNNER", par.runner.c_str());
cmd.addVariable("ALIGNMENT_DB_EXT", targetDbType == Sequence::PROFILE_STATE_SEQ ? ".255" : "");
if (targetDbType == Sequence::HMM_PROFILE) {
if (targetDbType == Sequence::HMM_PROFILE && par.sliceSearch) {
cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter,USE_ONLY_SET_PARAMETERS).c_str());
cmd.addVariable("MAX_STEPS", std::to_string(30).c_str());
cmd.addVariable("MAX_RESULTS_PER_QUERY", std::to_string(par.maxResListLen).c_str());
size_t memoryLimit = static_cast<size_t>(Util::getTotalSystemMemory() * 0.9);
cmd.addVariable("AVAIL_MEM", std::to_string(memoryLimit/1024).c_str());
cmd.addVariable("COMMONS", (std::string("--threads ") + std::to_string(par.threads)).c_str());
if (isUngappedMode) {
par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());
par.rescoreMode = originalRescoreMode;
} else {
cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.align).c_str());
}
cmd.addVariable("SWAP_PAR", par.createParameterString(par.swapresult).c_str());
FileUtil::writeFile(tmpDir + "/searchslicemodetargetprofile.sh", searchslicemodetargetprofile_sh, searchslicemodetargetprofile_sh_len);
program=std::string(tmpDir + "/searchslicemodetargetprofile.sh");
} else if (targetDbType == Sequence::HMM_PROFILE) {
cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
// we need to align all hits in case of target Profile hits
size_t maxResListLen = par.maxResListLen;

0 comments on commit 587b47d

Please sign in to comment.