Permalink
Browse files

Add createtaxdb

  • Loading branch information...
Martin Steinegger Martin Steinegger
Martin Steinegger authored and Martin Steinegger committed Nov 7, 2018
1 parent d27b052 commit 3265a001da7fc13c09daef39c58128d31c6086a7
Showing with 123 additions and 2 deletions.
  1. +1 −0 data/CMakeLists.txt
  2. +66 −0 data/createtaxdb.sh
  3. +1 −0 src/CommandDeclarations.h
  4. +8 −2 src/mmseqs.cpp
  5. +1 −0 src/taxonomy/CMakeLists.txt
  6. +46 −0 src/taxonomy/createtaxdb.cpp
@@ -12,6 +12,7 @@ set(COMPILED_RESOURCES
update_clustering.sh
searchtargetprofile.sh
createindex.sh
createtaxdb.sh
translated_search.sh
taxonomy.sh
multihitdb.sh
@@ -0,0 +1,66 @@
#!/bin/sh -e
# notExists <path>
# Succeeds (status 0) when <path> is NOT an existing regular file,
# fails (status 1) when it is.
notExists() {
    if [ -f "$1" ]; then
        return 1
    fi
    return 0
}
# hasCommand <name>
# Verifies that <name> can be resolved by the shell (via `command -v`).
# If it cannot, prints a hint and terminates the whole script with status 1.
hasCommand() {
    if ! command -v "$1" >/dev/null 2>&1; then
        echo "Please make sure that $1 is in \$PATH."
        exit 1
    fi
}
# Usage:
#   createtaxdb.sh <sequenceDB> <tmpDir>                                    (DOWNLOAD_DATA=1)
#   createtaxdb.sh <sequenceDB> <taxMappingFile> <ncbi-taxdump-folder> <tmpDir>  (DOWNLOAD_DATA=0)
#
# Environment:
#   DOWNLOAD_DATA  "1" to fetch the NCBI taxdump and the UniProt id mapping,
#                  anything else (or unset) to use the files given as arguments.
#   REMOVE_TMP     when non-empty, temporary files in <tmpDir> are deleted at the end.
#
# Produces <sequenceDB>_names.dmp, _nodes.dmp, _merged.dmp, _delnodes.dmp and _mapping.
notExists "$1" && echo "$1 not found!" && exit 1;

hasCommand wget
hasCommand awk
hasCommand zcat
hasCommand touch
hasCommand tar

TAXDBNAME="$1"
MAPPINGFILE="$2"
NCBITAXINFO="$3"
# The tmp folder is always the last argument: $4 in the four-argument form,
# $2 in the two-argument (download) form.
TMP_PATH="${4:-$2}"
# Default to "no download" so the numeric tests below never see an empty string.
DOWNLOAD_DATA="${DOWNLOAD_DATA:-0}"

if [ "$DOWNLOAD_DATA" -eq "1" ]; then
    # Download NCBI taxon information.
    # The checkpoint must be looked up in TMP_PATH: in download mode there is
    # no $4, so "$4/..." would resolve to the filesystem root and never match.
    if notExists "${TMP_PATH}/ncbi_download.complete"; then
        echo "Download taxdump.tar.gz"
        wget -nv -O - "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" \
            | tar -C "${TMP_PATH}" -xzf - names.dmp nodes.dmp merged.dmp delnodes.dmp
        touch "${TMP_PATH}/ncbi_download.complete"
    fi
    NCBITAXINFO="${TMP_PATH}"

    # Download the latest UniProt ID mapping to extract taxon identifiers
    if notExists "${TMP_PATH}/mapping_download.complete"; then
        echo "Download idmapping.dat.gz"
        URL="ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz"
        wget -nv -O - "$URL" | zcat | awk '$2 == "NCBI_TaxID" {print $1"\t"$3 }' > "${TMP_PATH}/taxidmapping"
        touch "${TMP_PATH}/mapping_download.complete"
    fi
    MAPPINGFILE="${TMP_PATH}/taxidmapping"
fi

# create mapping: join the accession->taxon table with the DB lookup file
# (first file fills f[accession] = taxon, second file emits "<col1>\t<taxon>").
if notExists "${TMP_PATH}/targetDB_mapping.complete"; then
    awk 'NR == FNR { f[$1] = $2 } $2 in f { print $1"\t"f[$2] }' \
        "$MAPPINGFILE" "${TAXDBNAME}.lookup" > "${TMP_PATH}/targetDB_mapping"
    touch "${TMP_PATH}/targetDB_mapping.complete"
fi

# finalize database
# Copy instead of move: the *.complete markers above promise that the files
# they guard still exist in TMP_PATH on a re-run, and a user-supplied taxdump
# folder must not be emptied as a side effect.
cp -f "${NCBITAXINFO}/names.dmp"    "${TAXDBNAME}_names.dmp"
cp -f "${NCBITAXINFO}/nodes.dmp"    "${TAXDBNAME}_nodes.dmp"
cp -f "${NCBITAXINFO}/merged.dmp"   "${TAXDBNAME}_merged.dmp"
cp -f "${NCBITAXINFO}/delnodes.dmp" "${TAXDBNAME}_delnodes.dmp"
cp -f "${TMP_PATH}/targetDB_mapping" "${TAXDBNAME}_mapping"

if [ -n "$REMOVE_TMP" ]; then
    echo "Remove temporary files"
    rm -f "${TMP_PATH}/names.dmp" "${TMP_PATH}/nodes.dmp" "${TMP_PATH}/merged.dmp" "${TMP_PATH}/delnodes.dmp"
    rm -f "${TMP_PATH}/taxidmapping"
    if [ "$DOWNLOAD_DATA" -eq "1" ]; then
        # Both download checkpoints go away together with the downloaded data.
        rm -f "${TMP_PATH}/ncbi_download.complete" "${TMP_PATH}/mapping_download.complete"
    fi
    rm -f "${TMP_PATH}/targetDB_mapping" "${TMP_PATH}/targetDB_mapping.complete"
    # Only ever delete a script copy inside the tmp folder, never one in the
    # caller's working directory.
    rm -f "${TMP_PATH}/createtaxdb.sh"
fi
@@ -81,6 +81,7 @@ extern int summerizeresultsbyset(int argc, const char **argv, const Command &com
// Entry points of individual command-line modules. All of them share the
// signature (argc, argv, Command descriptor) and return an int status;
// e.g. taxonomy and createtaxdb are referenced by name from the `commands` table.
extern int swapdb(int argc, const char **argv, const Command& command);
extern int swapresults(int argc, const char **argv, const Command& command);
extern int taxonomy(int argc, const char **argv, const Command& command);
extern int createtaxdb(int argc, const char **argv, const Command& command);
extern int translateaa(int argc, const char **argv, const Command& command);
extern int translatenucs(int argc, const char **argv, const Command& command);
extern int tsv2db(int argc, const char **argv, const Command& command);
@@ -116,10 +116,16 @@ std::vector<struct Command> commands = {
{"taxonomy", taxonomy, &par.taxonomy, COMMAND_TAXONOMY,
"Compute taxonomy and lowest common ancestor for each sequence.",
NULL,
"Milot Mirdita <milot@mirdita.de>",
"<i:queryDB> <i:targetDB> <i:targetTaxonMapping> <i:NcbiTaxdmpDir> <o:taxaDB> <tmpDir>",
"Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
"<i:queryDB> <i:targetDB> <o:taxaDB> <tmpDir>",
CITATION_MMSEQS2
},
{"createtaxdb", createtaxdb, &par.onlyverbosity, COMMAND_TAXONOMY,
"Annotates a sequence database with NCBI taxonomy information",
"Annotates a sequence database with NCBI taxonomy information. The program will download the Uniprot taxMappingFile and ncbi-taxdump-folder if only two arguments are passed.",
"Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
"<i:sequenceDB> [<i:taxMappingFile> <i:ncbi-taxdump-folder>] <tmpDir>",
CITATION_MMSEQS2},
{"search-2m", search2m, &par.taxonomy, COMMAND_TAXONOMY,
"Compute taxonomy and lowest common ancestor for each sequence.",
NULL,
@@ -9,5 +9,6 @@ set(taxonomy_source_files
taxonomy/addtaxonomy.cpp
taxonomy/NcbiTaxonomy.cpp
taxonomy/filtertaxdb.cpp
taxonomy/createtaxdb.cpp
PARENT_SCOPE
)
@@ -0,0 +1,46 @@
#include <CommandCaller.h>
#include "NcbiTaxonomy.h"
#include "Parameters.h"
#include "DBWriter.h"
#include "FileUtil.h"
#include "Debug.h"
#include "Util.h"
#include "createtaxdb.sh.h"
#ifdef OPENMP
#include <omp.h>
#endif
int createtaxdb(int argc, const char **argv, const Command& command) {
Parameters& par = Parameters::getInstance();
par.parseParameters(argc, argv, command, 1, true, Parameters::PARSE_VARIADIC);
std::string tmp = par.filenames.back();
if (FileUtil::directoryExists(tmp.c_str())==false){
Debug(Debug::INFO) << "Tmp " << tmp << " folder does not exist or is not a directory.\n";
if (FileUtil::makeDir(tmp.c_str()) == false){
Debug(Debug::ERROR) << "Could not create tmp folder " << tmp << ".\n";
EXIT(EXIT_FAILURE);
} else {
Debug(Debug::INFO) << "Created dir " << tmp << "\n";
}
}
CommandCaller cmd;
cmd.addVariable("TMP_PATH", tmp.c_str());
if(par.filenames.size() == 4) {
cmd.addVariable("DOWNLOAD_DATA", "0");
}else if(par.filenames.size() == 2) {
cmd.addVariable("DOWNLOAD_DATA", "1");
}
FileUtil::writeFile(tmp + "/createindex.sh", createtaxdb_sh, createtaxdb_sh_len);
std::string program(tmp + "/createindex.sh");
cmd.execProgram(program.c_str(), par.filenames);
return EXIT_SUCCESS;
}

0 comments on commit 3265a00

Please sign in to comment.