-
Notifications
You must be signed in to change notification settings - Fork 190
/
searchslicedtargetprofile.sh
executable file
·176 lines (147 loc) · 6.2 KB
/
searchslicedtargetprofile.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/bin/sh -e
fail() {
echo "Error: $1"
exit 1
}
notExists() {
[ ! -f "$1" ]
}
hasCommand () {
command -v "$1" >/dev/null 2>&1 || { echo "Please make sure that $1 is in \$PATH."; exit 1; }
}
abspath() {
if [ -d "$1" ]; then
(cd "$1"; pwd)
elif [ -f "$1" ]; then
if [ -z "${1##*/*}" ]; then
echo "$(cd "${1%/*}"; pwd)/${1##*/}"
else
echo "$(pwd)/$1"
fi
elif [ -d "$(dirname "$1")" ]; then
echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
fi
}
hasCommand awk
hasCommand wc
hasCommand join
hasCommand sort
# check amount of input variables
[ "$#" -ne 4 ] && echo "Please provide <queryDB> <targetDB> <resultDB> <tmpDir>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ ! -f "$2" ] && echo "$2 not found!" && exit 1;
[ -f "$3" ] && echo "$3 exists already!" && exit 1;
[ ! -d "$4" ] && echo "TMP directory $4 not found!" && mkdir -p "$4";
INPUT="$1"
TARGET="$(abspath "$2")"
RESULT="$3"
TMP_PATH="$4"
PROFILEDB="${TMP_PATH}/profileDB"
if notExists "${TMP_PATH}/profileDB"; then
# symlink the profile DB that can be reduced at every iteration the search
ln -s "${TARGET}" "${PROFILEDB}"
ln -s "${TARGET}.dbtype" "${PROFILEDB}.dbtype"
sort -k1,1 "${TARGET}.index" > "${PROFILEDB}.index"
echo "${AVAIL_DISK}" > "${PROFILEDB}.params"
else
read -r AVAIL_DISK < "${PROFILEDB}.params"
fi
# echo call to trim whitespace wc produces
# shellcheck disable=SC2046,SC2005
NUM_PROFILES="$(echo $(wc -l < "${PROFILEDB}.index"))"
OFFSET=0
STEP=0
# MAX_STEPS is set by the workflow
# shellcheck disable=SC2153
while [ "${STEP}" -lt "${MAX_STEPS}" ] && [ "${NUM_PROFILES}" -gt 0 ]; do
if [ -f "${TMP_PATH}/aln_${STEP}.checkpoint" ]; then
# restore values from previous run, in case it was aborted
read -r NUM_PROFILES OFFSET < "${TMP_PATH}/aln_${STEP}.checkpoint"
continue
fi
# Compute the max number of sequence according to the number of profiles
# 90 bytes/query-result line max.
MAX_SEQS="$((1024*(AVAIL_DISK/NUM_PROFILES)/90))"
# No more disk space allowed to use
if [ "${MAX_SEQS}" -eq 0 ]; then
break
fi
# Max result to go into the prefilter list
SEARCH_LIM="$((OFFSET+MAX_SEQS))"
if notExists "${TMP_PATH}/pref.done"; then
# shellcheck disable=SC2086
${RUNNER} "$MMSEQS" prefilter "${PROFILEDB}" "${INPUT}" "${TMP_PATH}/pref" \
--max-seqs "${SEARCH_LIM}" --offset-result "${OFFSET}" ${PREFILTER_PAR} \
|| fail "prefilter died"
touch "${TMP_PATH}/pref.done"
fi
if notExists "${TMP_PATH}/pref_count" || notExists "${TMP_PATH}/pref_keep.list"; then
# shellcheck disable=SC2086
"$MMSEQS" result2stats "$PROFILEDB" "$INPUT" "${TMP_PATH}/pref" "${TMP_PATH}/pref_count" \
--stat linecount ${THREADS_PAR} \
|| fail "result2stats died"
fi
if notExists "${TMP_PATH}/pref_keep.list"; then
# remove profiles that do not provide more hits
# shellcheck disable=SC2086
"$MMSEQS" filterdb "${TMP_PATH}/pref_count" "${TMP_PATH}/pref_keep" \
--filter-column 1 --comparison-operator ge --comparison-value "${MAX_SEQS}" ${THREADS_PAR} \
|| fail "filterdb died"
rm -f "${TMP_PATH}/pref_count" "${TMP_PATH}/pref_count.index"
awk '$3 > 1 { print $1 }' "${TMP_PATH}/pref_keep.index" | sort -k1,1 > "${TMP_PATH}/pref_keep.list"
rm -f "${TMP_PATH}/pref_keep" "${TMP_PATH}/pref_keep.index"
fi
if notExists "${TMP_PATH}/aln.done"; then
# shellcheck disable=SC2086
${RUNNER} "$MMSEQS" align "${PROFILEDB}" "${INPUT}" "${TMP_PATH}/pref" "${TMP_PATH}/aln" ${ALIGNMENT_PAR} \
|| fail "align died"
rm -f "${TMP_PATH}/pref" "${TMP_PATH}/pref.index"
touch "${TMP_PATH}/aln.done"
fi
if notExists "${TMP_PATH}/aln_swap.done"; then
# note: we recover the right evalue, since it is computed according to the original target db
# shellcheck disable=SC2086
"$MMSEQS" swapresults "${TARGET}" "${INPUT}" "${TMP_PATH}/aln" "${TMP_PATH}/aln_swap" ${SWAP_PAR} \
|| fail "swapresults died"
rm -f "${TMP_PATH}/aln" "${TMP_PATH}/aln.index"
touch "${TMP_PATH}/aln_swap.done"
fi
MERGED="${TMP_PATH}/aln_swap"
if [ -f "${TMP_PATH}/aln_merged" ]; then
# shellcheck disable=SC2086
"$MMSEQS" mergedbs "${INPUT}" "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged" "${TMP_PATH}/aln_swap" ${VERBOSITY_PAR} \
|| fail "mergedbs died"
mv -f "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged.index"
mv -f "${TMP_PATH}/aln_merged_new.index" "${TMP_PATH}/aln_merged.index"
rm -f "${TMP_PATH}/aln_swap" "${TMP_PATH}/aln_swap.index"
MERGED="${TMP_PATH}/aln_merged"
fi
# keep only the top max-seqs hits according to the default alignment sorting criteria
# shellcheck disable=SC2086
"$MMSEQS" sortresult "${MERGED}" "${TMP_PATH}/aln_merged_trunc" ${SORTRESULT_PAR} \
|| fail "sortresult died"
mv -f "${TMP_PATH}/aln_merged_trunc" "${TMP_PATH}/aln_merged"
mv -f "${TMP_PATH}/aln_merged_trunc.index" "${TMP_PATH}/aln_merged.index"
join "${TMP_PATH}/pref_keep.list" "${PROFILEDB}.index" > "${PROFILEDB}.index.tmp"
mv -f "${PROFILEDB}.index.tmp" "${PROFILEDB}.index"
# keep for the prefilter only the next hits
OFFSET="$SEARCH_LIM"
# shellcheck disable=SC2046,SC2005
NUM_PROFILES="$(echo $(wc -l < "${PROFILEDB}.index"))"
rm -f "${TMP_PATH}/pref.done" "${TMP_PATH}/aln.done" "${TMP_PATH}/aln_swap.done" "${TMP_PATH}/pref_keep.list"
printf "%d\\t%d\\n" "${NUM_PROFILES}" "${OFFSET}" > "${TMP_PATH}/aln_${STEP}.checkpoint"
STEP="$((STEP+1))"
done
mv -f "${TMP_PATH}/aln_merged" "${RESULT}"
mv -f "${TMP_PATH}/aln_merged.index" "${RESULT}.index"
if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files"
while [ "$STEP" -lt "$MAX_STEPS" ]; do
if [ -f "${TMP_PATH}/aln_${STEP}.checkpoint" ]; then
rm -f "${TMP_PATH}/aln_${STEP}.checkpoint"
fi
done
rm -f "${PROFILEDB}" "${PROFILEDB}.index" "${PROFILEDB}.dbtype" "${PROFILEDB}.params"
rm -f "$TMP_PATH/searchslicedtargetprofile.sh"
fi