From dd800f961efaa87dc4737c72885b0162c1ce35af Mon Sep 17 00:00:00 2001 From: Hannes Hauswedell Date: Wed, 23 Nov 2016 15:18:22 +0100 Subject: [PATCH] [FEATURE] expose macro to use 32bit as position type for proteins --- src/CMakeLists.txt | 26 ++++++++++++++++---------- src/lambda.hpp | 29 +++++++++++++++++++++++++++-- src/lambda_indexer.cpp | 5 +++-- src/lambda_indexer.hpp | 10 ++++++++-- src/options.hpp | 4 ++++ 5 files changed, 58 insertions(+), 16 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 846bf9d52..74975b41f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -83,11 +83,12 @@ message ("\n${ColourBold}Build configuration${ColourReset}") message (STATUS "LAMBDA version is: ${SEQAN_APP_VERSION}") -option (LAMBDA_FASTBUILD "Build only blastp and blastx modes (speeds up build)." OFF) -option (LAMBDA_NATIVE_BUILD "Architecture-specific optimizations, i.e. g++ -march=native." ON) -option (LAMBDA_STATIC_BUILD "Include all libraries in the binaries." OFF) -option (LAMBDA_MMAPPED_DB "Use mmapped access to the database." OFF) -option (LAMBDA_LINGAPS_OPT "Add optimized codepaths for linear gap costs (inc. bin size and compile time)." OFF) +option (LAMBDA_FASTBUILD "Build only blastp and blastx modes (speeds up build)." OFF) +option (LAMBDA_LINGAPS_OPT "Add optimized codepaths for linear gap costs (increases bin size and compile time)." OFF) +option (LAMBDA_LONG_PROTEIN_SUBJ_SEQS "Make max protein sequence length == 4.3billion instead of 65,535. INVALIDATES INDEXS!" OFF) +option (LAMBDA_MMAPPED_DB "Use mmapped access to the database." OFF) +option (LAMBDA_NATIVE_BUILD "Architecture-specific optimizations, i.e. g++ -march=native." ON) +option (LAMBDA_STATIC_BUILD "Include all libraries in the binaries." 
OFF) if (LAMBDA_FASTBUILD) add_definitions (-DFASTBUILD=1) @@ -125,12 +126,17 @@ if (LAMBDA_LINGAPS_OPT) add_definitions (-DLAMBDA_LINGAPS_OPT=1) endif () +if (LAMBDA_LONG_PROTEIN_SUBJ_SEQS) + add_definitions (-DLAMBDA_LONG_PROTEIN_SUBJ_SEQS=1) +endif () + message(STATUS "The following options are selected for the build:") -message( " LAMBDA_FASTBUILD ${LAMBDA_FASTBUILD}") -message( " LAMBDA_LINGAPS_OPT ${LAMBDA_LINGAPS_OPT}") -message( " LAMBDA_MMAPPED_DB ${LAMBDA_MMAPPED_DB}") -message( " LAMBDA_NATIVE_BUILD ${LAMBDA_NATIVE_BUILD}") -message( " LAMBDA_STATIC_BUILD ${LAMBDA_STATIC_BUILD}") +message( " LAMBDA_FASTBUILD ${LAMBDA_FASTBUILD}") +message( " LAMBDA_LINGAPS_OPT ${LAMBDA_LINGAPS_OPT}") +message( " LAMBDA_LONG_PROTEIN_SUBJ_SEQS ${LAMBDA_LONG_PROTEIN_SUBJ_SEQS}") +message( " LAMBDA_MMAPPED_DB ${LAMBDA_MMAPPED_DB}") +message( " LAMBDA_NATIVE_BUILD ${LAMBDA_NATIVE_BUILD}") +message( " LAMBDA_STATIC_BUILD ${LAMBDA_STATIC_BUILD}") message(STATUS "Run 'cmake -LH' to get a comment on each option.") message(STATUS "Remove CMakeCache.txt and re-run cmake with -DOPTIONNAME=ON|OFF to change an option.") diff --git a/src/lambda.hpp b/src/lambda.hpp index 76b4fe369..054c97152 100644 --- a/src/lambda.hpp +++ b/src/lambda.hpp @@ -128,7 +128,15 @@ template ())) @@ -160,13 +168,30 @@ validateIndexOptions(LambdaOptions const & options) { buffer.clear(); readIndexOption(buffer, "genetic_code", options); - unsigned long b = 0; + b = 0; if ((!lexicalCast(b, buffer)) || (b != static_cast(options.geneticCode))) { std::cerr << "WARNING: The codon translation table used during indexing and during search are different. 
" "This is not a problem per se, but is likely not what you want.\n\n"; } } + + buffer.clear(); + readIndexOption(buffer, "subj_seq_len_bits", options); + b = 0; + if ((!lexicalCast(b, buffer)) || (b != static_cast(sizeof(SizeTypePos_) * 8))) + { + #ifdef LAMBDA_LONG_PROTEIN_SUBJ_SEQS + std::cerr << "ERROR: Your lambda executable was built with LAMBDA_LONG_PROTEIN_SUBJ_SEQS,\n" + " but the index was created by an executable that was built without it.\n"; + #else + std::cerr << "ERROR: Your lambda executable was built without LAMBDA_LONG_PROTEIN_SUBJ_SEQS,\n" + " but the index was created by an executable that was built with it.\n"; + #endif + std::cerr << " You need to recreate the index or rebuild Lambda.\n"; + return -1; + } + return 0; } diff --git a/src/lambda_indexer.cpp b/src/lambda_indexer.cpp index ee5a752ba..6c8ac21f9 100644 --- a/src/lambda_indexer.cpp +++ b/src/lambda_indexer.cpp @@ -197,7 +197,7 @@ realMain(LambdaIndexerOptions const & options, dumpTranslatedSeqs(translatedSeqs, options); // see if final sequence set actually fits into index - if (!checkIndexSize(translatedSeqs)) + if (!checkIndexSize(translatedSeqs, BlastProgramSelector

())) return -1; if (options.dbIndexType == DbIndexType::FM_INDEX) @@ -246,7 +246,8 @@ realMain(LambdaIndexerOptions const & options, { options.indexDir + "/option:alph_original", std::string(_alphName(OrigSubjAlph

())) }, { options.indexDir + "/option:alph_translated", std::string(_alphName(TransAlph

())) }, { options.indexDir + "/option:alph_reduced", std::string(_alphName(TRedAlph())) }, - { options.indexDir + "/option:genetic_code", std::to_string(options.geneticCode) } + { options.indexDir + "/option:genetic_code", std::to_string(options.geneticCode) }, + { options.indexDir + "/option:subj_seq_len_bits", std::to_string(sizeof(SizeTypePos_) * 8)}, }) { std::ofstream f{std::get<0>(s).c_str(), std::ios_base::out | std::ios_base::binary}; diff --git a/src/lambda_indexer.hpp b/src/lambda_indexer.hpp index 9eff8c779..79bd75827 100644 --- a/src/lambda_indexer.hpp +++ b/src/lambda_indexer.hpp @@ -276,9 +276,9 @@ dumpTranslatedSeqs(TCDStringSet> const & translatedSeqs, // Function loadSubj() // -------------------------------------------------------------------------- -template +template inline bool -checkIndexSize(TCDStringSet> const & seqs) +checkIndexSize(TCDStringSet> const & seqs, BlastProgramSelector

const &) { using SAV = typename SAValue>>::Type; uint64_t curNumSeq = length(seqs); @@ -303,6 +303,12 @@ checkIndexSize(TCDStringSet> const & seqs) std::cerr << "Too long sequences to be indexed:\n " << "length" << maxLen << " present in file, but only " << maxLenSeq << " supported by index.\n"; + #ifndef LAMBDA_LONG_PROTEIN_SUBJ_SEQS + if (p != BlastProgram::BLASTN) + std::cerr << "You can recompile Lambda and add -DLAMBDA_LONG_PROTEIN_SUBJ_SEQS=1 to activate\n" + "support for longer protein sequences.\n"; + #endif + return false; } return true; diff --git a/src/options.hpp b/src/options.hpp index e92adb017..17d52e845 100644 --- a/src/options.hpp +++ b/src/options.hpp @@ -54,7 +54,11 @@ using SizeTypeNum_ = uint32_t; template struct SizeTypePosMeta_ { +#ifdef LAMBDA_LONG_PROTEIN_SUBJ_SEQS + using Type = uint32_t; +#else using Type = uint16_t; +#endif }; template <>