Skip to content
Permalink
Browse files

Add a mode to createdb to support soft linking data instead of copy

  • Loading branch information
martin-steinegger committed Nov 11, 2019
1 parent 5ae5503 commit 11e2736028cc12a6523b507b061158903f8de317
@@ -14,7 +14,7 @@ INPUT="$INPUT"

if notExists "${TMP_PATH}/query.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_PAR} \
"$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_QUERY_PAR} \
|| fail "query createdb died"
fi

@@ -10,7 +10,7 @@ notExists() {

if notExists "${TMP_PATH}/query.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_PAR} \
"$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_QUERY_PAR} \
|| fail "query createdb died"
fi

@@ -41,6 +41,8 @@
typedef struct __kstream_t { \
char *buf; \
int begin, end, is_eof; \
size_t cur_buf_pos; \
size_t newline; \
type_t f; \
} kstream_t;

@@ -71,6 +73,7 @@
if (ks->is_eof && ks->begin >= ks->end) return -1; \
if (ks->begin >= ks->end) { \
ks->begin = 0; \
ks->cur_buf_pos += ks->end; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
if (ks->end == 0) { ks->is_eof = 1; return -1;} \
if (ks->end == -1) { ks->is_eof = 1; return -3;}\
@@ -102,14 +105,15 @@ typedef struct __kstring_t {
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
ks->begin = 0; \
ks->cur_buf_pos += ks->end; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
if (ks->end == 0) { ks->is_eof = 1; break; } \
if (ks->end == -1) { ks->is_eof = 1; return -3; } \
} else break; \
} \
if (delimiter == KS_SEP_LINE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == '\n') break; \
if (ks->buf[i] == '\n') { ks->newline+=(append == 1); break; } \
} else if (delimiter > KS_SEP_MAX) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == delimiter) break; \
@@ -158,6 +162,9 @@ typedef struct __kstring_t {
{ \
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
s->f = ks_init(fd); \
s->f->end=0; \
s->f->cur_buf_pos=0; \
s->f->newline=0; \
return s; \
} \
SCOPE void kseq_destroy(kseq_t *ks) \
@@ -179,11 +186,15 @@ typedef struct __kstring_t {
{ \
int c,r; \
kstream_t *ks = seq->f; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
ks->newline = 0; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
seq->offset = ks->cur_buf_pos + ks->begin; \
if (c < 0) return c; /* end of file or error*/ \
seq->last_char = c; \
} /* else: the first header char has been read in the previous call */ \
} else{ /* else: the first header char has been read in the previous call */ \
seq->offset = ks->cur_buf_pos + ks->begin; \
} \
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
@@ -192,11 +203,14 @@ typedef struct __kstring_t {
seq->seq.s = (char*)malloc(seq->seq.m); \
} \
while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
if (c == '\n') continue; /* skip empty lines */ \
if (c == '\n') { ks->newline++; continue; } /* skip empty lines */ \
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
} \
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
seq->multiline = (ks->newline > 1); \
if (c == '>' || c == '@') { /* the first header char has been read */ \
seq->last_char = c; \
} \
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
seq->seq.m = seq->seq.l + 2; \
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
@@ -221,6 +235,8 @@ typedef struct __kstring_t {
typedef struct { \
kstring_t name, comment, seq, qual; \
int last_char; \
size_t offset; \
bool multiline; \
kstream_t *f; \
} kseq_t;

@@ -162,7 +162,7 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,

ksw_extz2_sse(0, queryRevLenToAlign, querySeqRevAlign + qStartRev, targetSeqObj->L - tStartRev, targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, 40, flag, &ez);

int qStartPos = querySeqObj->L - ( qStartRev + ez.max_q ) -1 ;
int qStartPos = querySeqObj->L - ( qStartRev + ez.max_q ) -1;
int tStartPos = targetSeqObj->L - ( tStartRev + ez.max_t ) -1;

int alignFlag = 0;
@@ -69,6 +69,9 @@ class DBWriter {

static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& lookupFile, int sortMode = DBReader<unsigned int>::SORT_BY_ID_OFFSET);

bool isClosed(){
return closed;
}
private:
size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx);
void writeThreadBuffer(unsigned int idx, size_t dataSize);
@@ -139,6 +139,19 @@ size_t FileUtil::getFreeSpace(const char *path) {
return stat.f_bfree * stat.f_frsize;
}


std::string FileUtil::getRealPathFromSymLink(const std::string path){
char *p = realpath(path.c_str(), NULL);
if (p == NULL) {
Debug(Debug::ERROR) << "Could not get path of " << path << "!\n";
EXIT(EXIT_FAILURE);
}

std::string name(p);
free(p);
return name;
}

std::string FileUtil::getHashFromSymLink(const std::string path){
char *p = realpath(path.c_str(), NULL);
if (p == NULL) {
@@ -24,6 +24,8 @@ class FileUtil {

static void deleteTempFiles(const std::list<std::string> &tmpFiles);

static std::string getRealPathFromSymLink(const std::string path);

static std::string getHashFromSymLink(const std::string path);

static void* mmapFile(FILE * file, size_t *dataSize);
@@ -19,7 +19,8 @@ bool KSeqFile::ReadEntry() {
int result = KSEQFILE::kseq_read(s);
if (result < 0)
return false;

entry.offset = s->offset;
entry.multiline = s->multiline;
entry.name = s->name;
entry.comment = s->comment;
entry.sequence = s->seq;
@@ -63,6 +64,8 @@ bool KSeqGzip::ReadEntry() {
entry.comment = s->comment;
entry.sequence = s->seq;
entry.qual = s->qual;
entry.offset = -1;
entry.multiline = s->multiline;

return true;
}
@@ -105,6 +108,8 @@ bool KSeqBzip::ReadEntry() {
entry.comment = s->comment;
entry.sequence = s->seq;
entry.qual = s->qual;
entry.offset = -1;
entry.multiline = s->multiline;

return true;
}
@@ -12,6 +12,8 @@ class KSeqWrapper {
kstring_t sequence;
kstring_t comment;
kstring_t qual;
size_t offset;
bool multiline;
} entry;

virtual bool ReadEntry() = 0;
@@ -91,7 +91,7 @@ Parameters::Parameters():
PARAM_DB_OUTPUT(PARAM_DB_OUTPUT_ID, "--db-output", "Database output", "Output a result db instead of a text file", typeid(bool), (void*) &dbOut, "", MMseqsParameter::COMMAND_EXPERT),
// --include-only-extendablediagonal
PARAM_RESCORE_MODE(PARAM_RESCORE_MODE_ID,"--rescore-mode", "Rescore mode", "Rescore diagonal with: 0: Hamming distance, 1: local alignment (score only), 2: local alignment, 3: global alignment or 4: longest alignment fullfilling window quality criterion", typeid(int), (void *) &rescoreMode, "^[0-4]{1}$"),
PARAM_WRAPPED_SCORING(PARAM_WRAPPED_SCORING_ID,"--wrapped-scoring", "Allow wrapped scoring","Double the (nucleotide) query sequence during the scoring process to allow wrapped diagonal scoring around end and start", typeid(bool), (void *) &wrappedScoring, "", MMseqsParameter::COMMAND_MISC),
PARAM_WRAPPED_SCORING(PARAM_WRAPPED_SCORING_ID,"--wrapped-scoring", "Allow wrapped scoring","Double the (nucleotide) query sequence during the scoring process to allow wrapped diagonal scoring around end and start", typeid(bool), (void *) &wrappedScoring, "", MMseqsParameter::COMMAND_MISC|MMseqsParameter::COMMAND_EXPERT),
PARAM_FILTER_HITS(PARAM_FILTER_HITS_ID,"--filter-hits", "Remove hits by seq. id. and coverage", "filter hits by seq.id. and coverage", typeid(bool), (void *) &filterHits, "", MMseqsParameter::COMMAND_EXPERT),
PARAM_SORT_RESULTS(PARAM_SORT_RESULTS_ID, "--sort-results", "Sort results", "Sort results: 0: no sorting, 1: sort by evalue (Alignment) or seq.id. (Hamming)", typeid(int), (void *) &sortResults, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
// result2msa
@@ -169,12 +169,12 @@ Parameters::Parameters():
PARAM_USE_HEADER(PARAM_USE_HEADER_ID,"--use-fasta-header", "Use fasta header", "use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers",typeid(bool),(void *) &useHeader, ""),
PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "numeric ids in index file are offset by this value ",typeid(int),(void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"),
PARAM_DB_TYPE(PARAM_DB_TYPE_ID,"--dbtype", "Database type", "Database type 0: auto, 1: amino acid 2: nucleotides",typeid(int),(void *) &dbType, "[0-2]{1}"),
PARAM_DONT_SPLIT_SEQ_BY_LEN(PARAM_DONT_SPLIT_SEQ_BY_LEN_ID,"--dont-split-seq-by-len", "Split seq. by length", "Dont split sequences by --max-seq-len",typeid(bool),(void *) &splitSeqByLen, ""),
PARAM_DONT_SHUFFLE(PARAM_DONT_SHUFFLE_ID,"--dont-shuffle", "Do not shuffle input database", "Do not shuffle input database",typeid(bool),(void *) &shuffleDatabase, ""),
PARAM_CREATEDB_MODE(PARAM_CREATEDB_MODE_ID, "--createdb-mode", "Createdb mode", "createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q)",typeid(int),(void *) &createdbMode, "^[0-1]{1}$"),
PARAM_SHUFFLE(PARAM_SHUFFLE_ID,"--shuffle", "Shuffle input database", "Shuffle input database",typeid(bool),(void *) &shuffleDatabase, ""),
PARAM_USE_HEADER_FILE(PARAM_USE_HEADER_FILE_ID, "--use-header-file", "Use ffindex header", "use the ffindex header file instead of the body to map the entry keys",typeid(bool),(void *) &useHeaderFile, ""),
// splitsequence
PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "overlap between sequences",typeid(int),(void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"),
PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "sequence split mode 0: soft link data write new index, 1: copy data",typeid(int),(void *) &sequenceSplitMode, "^[0-1]{1}$"),
PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "sequence split mode 0: copy data, 1: soft link data and write new index,",typeid(int),(void *) &sequenceSplitMode, "^[0-1]{1}$"),
// gff2db
PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID,"--gff-type", "GFF type", "type in the GFF file to filter by",typeid(std::string),(void *) &gffType, ""),
// translatenucs
@@ -254,7 +254,7 @@ Parameters::Parameters():
PARAM_LCA_MODE(PARAM_LCA_MODE_ID, "--lca-mode", "LCA mode", "LCA Mode 1: Single Search LCA , 2: 2bLCA, 3: approx. 2bLCA, 4: top hit", typeid(int), (void*) &taxonomySearchMode, "^[1-4]{1}$"),
PARAM_TAX_OUTPUT_MODE(PARAM_TAX_OUTPUT_MODE_ID, "--tax-output-mode", "Taxonomy output mode", "0: output LCA, 1: output alignment", typeid(int), (void*) &taxonomyOutpuMode, "^[0-1]{1}$"),
// createsubdb
PARAM_SUBDB_MODE(PARAM_SUBDB_MODE_ID, "--subdb-mode", "Subdb mode", "LCA Mode 0: copy data 1: soft link data", typeid(int), (void*) &subDbMode, "^[0-1]{1}$")
PARAM_SUBDB_MODE(PARAM_SUBDB_MODE_ID, "--subdb-mode", "Subdb mode", "Subdb mode 0: copy data 1: soft link data and write index", typeid(int), (void*) &subDbMode, "^[0-1]{1}$")
{
if (instance) {
Debug(Debug::ERROR) << "Parameter instance already exists!\n";
@@ -658,9 +658,9 @@ Parameters::Parameters():
// create db
createdb.push_back(&PARAM_MAX_SEQ_LEN);
createdb.push_back(&PARAM_DONT_SPLIT_SEQ_BY_LEN);
createdb.push_back(&PARAM_DB_TYPE);
createdb.push_back(&PARAM_DONT_SHUFFLE);
createdb.push_back(&PARAM_SHUFFLE);
createdb.push_back(&PARAM_CREATEDB_MODE);
createdb.push_back(&PARAM_ID_OFFSET);
createdb.push_back(&PARAM_COMPRESSED);
createdb.push_back(&PARAM_V);
@@ -1042,6 +1042,7 @@ Parameters::Parameters():
easytaxonomy = combineList(taxonomy, addtaxonomy);
easytaxonomy = combineList(easytaxonomy, convertalignments);
easytaxonomy = combineList(easytaxonomy, createtsv);
easytaxonomy = combineList(easytaxonomy, createdb);
// multi hit db
multihitdb = combineList(createdb, extractorfs);
@@ -1827,7 +1828,7 @@ void Parameters::setDefaults() {
searchType = SEARCH_TYPE_AUTO;

// createdb
splitSeqByLen = true;
createdbMode = SEQUENCE_SPLIT_MODE_HARD;
shuffleDatabase = true;

// format alignment
@@ -214,8 +214,8 @@ class Parameters {
static const int SEQ_ID_LONG = 2;

// seq. split mode
static const int SEQUENCE_SPLIT_MODE_SOFT = 0;
static const int SEQUENCE_SPLIT_MODE_HARD = 1;
static const int SEQUENCE_SPLIT_MODE_HARD = 0;
static const int SEQUENCE_SPLIT_MODE_SOFT = 1;

// rescorediagonal
static const int RESCORE_MODE_HAMMING = 0;
@@ -459,12 +459,13 @@ class Parameters {
// createdb
int identifierOffset;
int dbType;
bool splitSeqByLen;
int createdbMode;
bool shuffleDatabase;

// splitsequence
int sequenceOverlap;
int sequenceSplitMode;

// convert2fasta
bool useHeaderFile;

@@ -761,8 +762,8 @@ class Parameters {
PARAMETER(PARAM_USE_HEADER) // also used by extractorfs
PARAMETER(PARAM_ID_OFFSET) // same
PARAMETER(PARAM_DB_TYPE)
PARAMETER(PARAM_DONT_SPLIT_SEQ_BY_LEN)
PARAMETER(PARAM_DONT_SHUFFLE)
PARAMETER(PARAM_CREATEDB_MODE)
PARAMETER(PARAM_SHUFFLE)

// convert2fasta
PARAMETER(PARAM_USE_HEADER_FILE)
@@ -494,7 +494,6 @@ void Sequence::mapSequence(const char * sequence, unsigned int dataLen){
this->L = l;
}


void Sequence::printPSSM(){
printf("Query profile of sequence %d\n", dbKey);
printf("Pos ");
@@ -16,9 +16,9 @@

#include "simd.h"
#include "MemoryMapped.h"

#include <algorithm>
#include <sys/mman.h>
#include <fstream> // std::ifstream

#ifdef OPENMP
#include <omp.h>
@@ -244,7 +244,9 @@ std::pair<ssize_t,ssize_t> Util::getFastaHeaderPosition(const std::string& heade
}


std::string Util::parseFastaHeader(const std::string& header) {
std::string Util::parseFastaHeader(const char * headerPtr) {
size_t len = Util::skipNoneWhitespace(headerPtr);
std::string header(headerPtr, len);
std::pair<ssize_t, ssize_t> pos = Util::getFastaHeaderPosition(header);
if(pos.first == -1 && pos.second == -1)
return "";
@@ -462,6 +464,35 @@ int Util::omp_thread_count() {
return n;
}

std::map<unsigned int, std::string> Util::readLookup(const std::string& file, const bool removeSplit) {
std::map<unsigned int, std::string> mapping;
if (file.length() > 0) {
std::ifstream mappingStream(file);
if (mappingStream.fail()) {
Debug(Debug::ERROR) << "File " << file << " not found!\n";
EXIT(EXIT_FAILURE);
}

std::string line;
while (std::getline(mappingStream, line)) {
std::vector<std::string> split = Util::split(line, "\t");
unsigned int id = strtoul(split[0].c_str(), NULL, 10);

std::string& name = split[1];

size_t pos;
if (removeSplit && (pos = name.find_last_of('_')) != std::string::npos) {
name = name.substr(0, pos);
}

mapping.emplace(id, name);
}
}

return mapping;
}


std::string Util::removeWhiteSpace(std::string in) {
in.erase(std::remove_if(in.begin(), in.end(), isspace), in.end());
return in;
@@ -6,7 +6,7 @@
#include <cstring>
#include <vector>
#include <limits>

#include <map>
#include "MMseqsMPI.h"

#ifndef EXIT
@@ -240,8 +240,8 @@ class Util {
}


static std::pair<ssize_t,ssize_t> getFastaHeaderPosition(const std::string& header);
static std::string parseFastaHeader(const std::string& header);
static std::pair<ssize_t,ssize_t> getFastaHeaderPosition(const std::string & header);
static std::string parseFastaHeader(const char * header);

static inline char toUpper(char character){
character += ('a' <= character && character <= 'z') ? ('A' - 'a') : 0;
@@ -310,6 +310,9 @@ class Util {

static std::string removeWhiteSpace(std::string in);

static std::map<unsigned int, std::string> readLookup(const std::string& lookupFile,
const bool removeSplit = false);

static bool canBeCovered(const float covThr, const int covMode, float queryLength, float targetLength);

static bool hasCoverage(float covThr, int covMode, float queryCov, float targetCov);
@@ -168,7 +168,7 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, D
size_t start = (i * flushSize);
size_t bucketSize = std::min(seqDbr.getSize() - (i * flushSize), flushSize);

#pragma omp for schedule(dynamic, 100)
#pragma omp for schedule(dynamic, 10)
for (size_t id = start; id < (start + bucketSize); id++) {
progress.updateProgress();
seq.mapSequence(id, seqDbr.getDbKey(id), seqDbr.getData(id, thread_idx), seqDbr.getSeqLen(id));

0 comments on commit 11e2736

Please sign in to comment.
You can’t perform that action at this time.