Permalink
Browse files

Add merge reads and create workflow to support paired end and single …

…read file input
  • Loading branch information...
martin-steinegger committed Aug 2, 2018
1 parent b720fb3 commit 21eec96161823b55b2ee7a882c851d26a0993595
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 2.8.9 FATAL_ERROR)
project(plass CXX)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/lib/mmseqs/cmake")

set(FRAMEWORK_ONLY 1 CACHE INTERNAL "" FORCE)
@@ -9,4 +9,6 @@ add_subdirectory(lib/mmseqs)
add_subdirectory(data)
include_directories(lib)
add_subdirectory(lib/kerasify)
add_subdirectory(lib/flash)

add_subdirectory(src)
@@ -10,29 +10,25 @@ notExists() {
[ ! -f "$1" ]
}

abspath() {
if [ -d "$1" ]; then
(cd "$1"; pwd)
elif [ -f "$1" ]; then
if [[ $1 == */* ]]; then
echo "$(cd "${1%/*}"; pwd)/${1##*/}"
else
echo "$(pwd)/$1"
fi
elif [ -d $(dirname "$1") ]; then
echo "$(cd $(dirname "$1"); pwd)/$(basename "$1")"
fi
}

# check amount of input variables
[ "$#" -ne 3 ] && echo "Please provide <sequenceDB> <outDB> <tmp>" && exit 1;
[ ! -n "${READ_FILES}" ] && echo "Please provide ${READ_FILES}" && exit 1;
[ ! -n ${OUT_FILE} ] && echo "Please provide ${OUT_FILE}" && exit 1;
[ ! -n ${TMP_PATH} ] && echo "Please provide ${TMP_PATH}" && exit 1;

# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ -f "$2" ] && echo "$2 exists already!" && exit 1;
[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p $3;
[ -f "${OUT_FILE}" ] && echo "${OUT_FILE} exists already!" && exit 1;
[ ! -d "${TMP_PATH}" ] && echo "tmp directory ${TMP_PATH} not found!" && mkdir -p ${TMP_PATH};

if notExists "${TMP_PATH}/nucl_reads"; then
if [ ${PAIRED_END} -eq 1 ]; then
echo "PAIRED END MODE"
$MMSEQS mergereads $READ_FILES "${TMP_PATH}/nucl_reads"
else
$MMSEQS createdb $READ_FILES "${TMP_PATH}/nucl_reads"
fi
fi

INPUT="$(abspath "$1")"
TMP_PATH="$(abspath "$3")"
INPUT="${TMP_PATH}/nucl_reads"
if notExists "${TMP_PATH}/nucl_6f_start"; then
$MMSEQS extractorfs "${INPUT}" "${TMP_PATH}/nucl_6f_start" --contig-start-mode 1 --contig-end-mode 0 --orf-start-mode 0 --min-length 30 --max-length 45 --max-gaps 0 \
|| fail "extractorfs start step died"
@@ -59,6 +55,13 @@ if notExists "${TMP_PATH}/aa_6f_start_long"; then
$MMSEQS concatdbs "${TMP_PATH}/aa_6f_long" "${TMP_PATH}/aa_6f_start" "${TMP_PATH}/aa_6f_start_long" \
|| fail "concatdbs start long step died"
fi

# $MMSEQS concatdbs "${TMP_PATH}/aa_6f_start" "${TMP_PATH}/aa_6f_end" "${TMP_PATH}/aa_6f_start_end"
if notExists "${TMP_PATH}/aa_6f_start_long_h"; then
$MMSEQS concatdbs "${TMP_PATH}/aa_6f_long_h" "${TMP_PATH}/aa_6f_start_h" "${TMP_PATH}/aa_6f_start_long_h" \
|| fail "concatdbs start long step died"
fi

INPUT="${TMP_PATH}/aa_6f_start_long"
STEP=0
if [ -z "$NUM_IT" ]; then
@@ -81,7 +84,7 @@ while [ $STEP -lt $NUM_IT ]; do
fi

if [ $STEP -eq 0 ]; then
if notExists "${TMP_PATH}/corrected_reads"; then
if notExists "${TMP_PATH}/corrected_seqs"; then
$MMSEQS findassemblystart "$INPUT" "${TMP_PATH}/aln_$STEP" "${TMP_PATH}/corrected_seqs" \
|| fail "Findassemblystart alignment step died"
fi
@@ -108,12 +111,35 @@ done
STEP=$(($STEP-1))

# post processing
if notExists "${TMP_PATH}/assembly_${STEP}_filtered"; then
$MMSEQS filternoncoding "${TMP_PATH}/assembly_${STEP}" "${TMP_PATH}/assembly_${STEP}_filtered" \
|| fail "Filter protein with NN step died"
RESULT="${TMP_PATH}/assembly_${STEP}"
if [ ${PROTEIN_FILTER} -eq 1 ]; then
RESULT="${TMP_PATH}/assembly_${STEP}_filtered"
if notExists "${TMP_PATH}/assembly_${STEP}_filtered"; then
$MMSEQS filternoncoding "${TMP_PATH}/assembly_${STEP}" "${TMP_PATH}/assembly_${STEP}_filtered" \
|| fail "Filter protein with NN step died"
fi
fi
mv -f "${TMP_PATH}/assembly_${STEP}_filtered" "$2" || fail "Could not move result to $2"
mv -f "${TMP_PATH}/assembly_${STEP}_filtered.index" "$2.index" || fail "Could not move result to $2.index"


# select only assembled sequences
awk 'NR == FNR { f[$1] = $0; next } $1 in f { print f[$1], $0 }' ${RESULT}.index ${TMP_PATH}/aa_6f_start_long.index > ${RESULT}_tmp.index
awk '$3 > $6 { print }' ${RESULT}_tmp.index > ${RESULT}_only_assembled.index
mv ${RESULT}.index ${RESULT}_old.index
mv ${RESULT}_only_assembled.index ${RESULT}.index

# create fasta output
if notExists "${RESULT}_h"; then
ln -s ${TMP_PATH}/aa_6f_start_long_h "${RESULT}_h"
fi
if notExists "${RESULT}_h.index"; then
ln -s ${TMP_PATH}/aa_6f_start_long_h.index "${RESULT}_h.index"
fi

if notExists "${RESULT}.fasta"; then
$MMSEQS convert2fasta "${RESULT}" "${RESULT}.fasta"
fi

mv -f "${RESULT}.fasta" "$OUT_FILE" || fail "Could not move result to $OUT_FILE"

if [ -n "$REMOVE_TMP" ]; then
echo "Removing temporary files"
@@ -0,0 +1,2 @@
add_library(flash combine_reads.h combine_reads.cpp read.h read.cpp util.h util.cpp)
mmseqs_setup_derived_target(flash)
Oops, something went wrong.

0 comments on commit 21eec96

Please sign in to comment.