Permalink
Browse files

transnucl memory bugfix

  • Loading branch information...
Enywook committed Apr 17, 2018
1 parent 44471cc commit 99be556f8df7877b711566b74dfbc97a46946538
View
@@ -32,3 +32,11 @@ src/workflow/time_test
build/
.idea/
cmake-build-*/
BenchmarkingDatas/
examples/
TestPhages/
temp/
BacteriaBench/
EliTestFiles/
NCBI_Bacteria_Orfs/
build.old/
View
@@ -31,7 +31,7 @@ add_library(mmseqs-framework
${util_header_files}
${util_source_files}
${workflow_source_files}
)
util/Aggregation.cpp util/Aggregation.h)
target_include_directories(mmseqs-framework PUBLIC ${CMAKE_BINARY_DIR}/generated)
target_include_directories(mmseqs-framework PUBLIC ${PROJECT_BINARY_DIR}/generated)
@@ -64,5 +64,6 @@ extern int proteinaln2nucl(int argc, const char **argv, const Command& command);
extern int apply(int argc, const char **argv, const Command& command);
extern int versionstring(int argc, const char **argv, const Command& command);
extern int shellcompletion(int argc, const char **argv, const Command& command);
extern int aggregate(int argc, const char **argv, const Command& command);
#endif
View
@@ -1,15 +1,11 @@
#include "Parameters.h"
#include "Sequence.h"
#include "Debug.h"
#include "Util.h"
#include <unistd.h>
#include "DistanceCalculator.h"
#include <iomanip>
#include <regex.h>
#include <string>
#include <sstream>
#include <climits>
#ifdef __CYGWIN__
#include <sys/cygwin.h>
@@ -64,9 +60,11 @@ Parameters::Parameters():
PARAM_CLUSTER_MODE(PARAM_CLUSTER_MODE_ID,"--cluster-mode", "Cluster mode", "0: Setcover, 1: connected component, 2: Greedy clustering by sequence length 3: Greedy clustering by sequence length (low mem)",typeid(int), (void *) &clusteringMode, "[0-3]{1}$", MMseqsParameter::COMMAND_CLUST),
PARAM_CLUSTER_STEPS(PARAM_CLUSTER_STEPS_ID,"--cluster-steps", "Cascaded clustering steps", "cascaded clustering steps from 1 to -s",typeid(int), (void *) &clusterSteps, "^[1-9]{1}$", MMseqsParameter::COMMAND_CLUST|MMseqsParameter::COMMAND_EXPERT),
PARAM_CASCADED(PARAM_CASCADED_ID,"--single-step-clustering", "Single step clustering", "switches from cascaded to simple clustering workflow",typeid(bool), (void *) &cascaded, "", MMseqsParameter::COMMAND_CLUST),
//affinity clustering
// affinity clustering
PARAM_MAXITERATIONS(PARAM_MAXITERATIONS_ID,"--max-iterations", "Max depth connected component", "maximum depth of breadth first search in connected component",typeid(int), (void *) &maxIteration, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUST|MMseqsParameter::COMMAND_EXPERT),
PARAM_SIMILARITYSCORE(PARAM_SIMILARITYSCORE_ID,"--similarity-type", "Similarity type", "type of score used for clustering [1:2]. 1=alignment score. 2=sequence identity ",typeid(int),(void *) &similarityScoreType, "^[1-2]{1}$", MMseqsParameter::COMMAND_CLUST|MMseqsParameter::COMMAND_EXPERT),
// merge Clusters
PARAM_BY_DB(PARAM_BY_DB_ID, "--by-db", "Merge by DB", "Merge results by key column in DB", typeid(std::string), (void *) &DBfile, ""),
// logging
PARAM_V(PARAM_V_ID,"-v", "Verbosity","verbosity level: 0=nothing, 1: +errors, 2: +warnings, 3: +info",typeid(int), (void *) &verbosity, "^[0-3]{1}$", MMseqsParameter::COMMAND_COMMON),
// create profile (HMM)
@@ -157,6 +155,8 @@ Parameters::Parameters():
PARAM_SORT_ENTRIES(PARAM_SORT_ENTRIES_ID, "--sort-entries", "Sort entries", "Sort column set by --filter-column, by 0) no sorting, 1) increasing, 2) decreasing or 3) random shuffle.", typeid(int), (void *) &sortEntries, "^[1-9]{1}[0-9]*$"),
PARAM_BEATS_FIRST(PARAM_BEATS_FIRST_ID, "--beats-first", "Beats first", "Filter by comparing each entry to the first entry.", typeid(bool), (void*) &beatsFirst, ""),
PARAM_JOIN_DB(PARAM_JOIN_DB_ID, "--join-db","join to DB", "Join another database entry with respect to the database identifier in the chosen column", typeid(std::string), (void*) &joinDB, ""),
//aggregate
PARAM_MODE(PARAM_MODE_ID, "--mode", "Aggregation Mode", "Choose wich of aggregation to launch : bestHit/pval", typeid(std::string), (void*) &mode, ""),
// concatdb
PARAM_PRESERVEKEYS(PARAM_PRESERVEKEYS_ID,"--preserve-keys", "Preserve the keys", "the keys of the two DB should be distinct, and they will be preserved in the concatenation.",typeid(bool), (void *) &preserveKeysB, ""),
//diff
@@ -243,6 +243,10 @@ Parameters::Parameters():
clust.push_back(PARAM_SIMILARITYSCORE);
clust.push_back(PARAM_THREADS);
//mergeClusters
mergeclusters.push_back(PARAM_THREADS) ;
mergeclusters.push_back(PARAM_BY_DB) ;
// find orf
onlyverbosity.push_back(PARAM_V);
@@ -378,6 +382,8 @@ Parameters::Parameters():
msa2profile.push_back(PARAM_THREADS);
msa2profile.push_back(PARAM_V);
//mergeclusters
// profile2pssm
profile2pssm.push_back(PARAM_SUB_MAT);
profile2pssm.push_back(PARAM_MAX_SEQ_LEN);
@@ -474,6 +480,9 @@ Parameters::Parameters():
filterDb.push_back(PARAM_INCLUDE_IDENTITY);
filterDb.push_back(PARAM_JOIN_DB);
//aggregate
aggregate.push_back(PARAM_MODE) ;
aggregate.push_back(PARAM_THREADS) ;
onlythreads.push_back(PARAM_THREADS);
onlythreads.push_back(PARAM_V);
@@ -515,6 +524,7 @@ Parameters::Parameters():
// mergedbs
mergedbs.push_back(PARAM_BY_DB) ;
mergedbs.push_back(PARAM_MERGE_PREFIXES);
mergedbs.push_back(PARAM_V);
@@ -1079,6 +1089,9 @@ void Parameters::setDefaults() {
// Clustering workflow
removeTmpFiles = false;
//mergeclusters
DBfile="" ;
// convertprofiledb
profileMode = PROFILE_MODE_HMM;
@@ -1163,6 +1176,7 @@ void Parameters::setDefaults() {
// rescorediagonal
rescoreMode = Parameters::RESCORE_MODE_HAMMING;
filterHits = false;
// filterDb
filterColumn = 1;
filterColumnRegex = "^.*$";
View
@@ -209,6 +209,9 @@ class Parameters {
int maxIteration; // Maximum depth of breadth first search in connected component
int similarityScoreType; // Type of score to use for reassignment 1=alignment score. 2=coverage 3=sequence identity 4=E-value 5= Score per Column
//mergecluster
std::string DBfile ;
//extractorfs
int orfMinLength;
int orfMaxLength;
@@ -314,6 +317,9 @@ class Parameters {
bool beatsFirst;
std::string joinDB;
//aggregate
std::string mode ;
// mergedbs
std::string mergePrefixes;
@@ -424,6 +430,9 @@ class Parameters {
PARAMETER(PARAM_MAXITERATIONS)
PARAMETER(PARAM_SIMILARITYSCORE)
//Merge Clusters
PARAMETER(PARAM_BY_DB)
// logging
PARAMETER(PARAM_V)
std::vector<MMseqsParameter> clust;
@@ -542,6 +551,9 @@ class Parameters {
PARAMETER(PARAM_BEATS_FIRST)
PARAMETER(PARAM_JOIN_DB)
//aggregate
PARAMETER(PARAM_MODE)
// concatdb
PARAMETER(PARAM_PRESERVEKEYS)
@@ -612,6 +624,7 @@ class Parameters {
std::vector<MMseqsParameter> clusteringWorkflow;
std::vector<MMseqsParameter> clusterUpdateSearch;
std::vector<MMseqsParameter> clusterUpdateClust;
std::vector<MMseqsParameter> mergeclusters ;
std::vector<MMseqsParameter> clusterUpdate;
std::vector<MMseqsParameter> translatenucs;
std::vector<MMseqsParameter> swapresult;
@@ -634,7 +647,7 @@ class Parameters {
std::vector<MMseqsParameter> taxonomy;
std::vector<MMseqsParameter> profile2pssm;
std::vector<MMseqsParameter> profile2cs;
std::vector<MMseqsParameter> aggregate ;
std::vector<MMseqsParameter> combineList(const std::vector<MMseqsParameter> &par1,
const std::vector<MMseqsParameter> &par2);
View
@@ -113,7 +113,7 @@ std::vector<struct Command> commands = {
"Milot Mirdita <milot@mirdita.de>",
"<i:sequenceDB> <i:clusterDB> <o:fastaDB>",
CITATION_MMSEQS2},
{"mergeclusters", mergeclusters, &par.onlythreads, COMMAND_CLUSTER,
{"mergeclusters", mergeclusters, &par.mergeclusters, COMMAND_CLUSTER,
"Merge multiple cluster DBs into single cluster DB",
NULL,
"Maria Hauser & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
@@ -193,7 +193,13 @@ std::vector<struct Command> commands = {
"Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
"<i:sequenceDB> <o:resultDB> <i:resultDB1> ... <i:resultDBn>",
CITATION_MMSEQS2},
{"splitdb", splitdb, &par.splitdb, COMMAND_DB,
{"aggregate", aggregate, &par.aggregate, COMMAND_DB,
"Aggregate search results by genomes",
NULL,
"Clovis Norroy <clovis.norroy@gmail.com>",
"<i:searchResults> <o:aggregated_file>",
CITATION_MMSEQS2},
{"splitdb", splitdb, &par.splitdb, COMMAND_DB,
"Split a mmseqs DB into multiple DBs",
NULL,
"Milot Mirdita <milot@mirdita.de>",
View
@@ -1,4 +1,5 @@
set(util_source_files
util/aggregate.cpp
util/apply.cpp
util/clusthash.cpp
util/convert2fasta.cpp
View
@@ -2,6 +2,9 @@
#include <list>
#include <sstream>
#include <itoa.h>
#include <algorithm>
#include <mutex>
#include "omptl/omptl_algorithm"
#include "Debug.h"
#include "DBReader.h"
@@ -14,8 +17,7 @@
#endif
void mergeClusteringResults(std::string seqDB, std::string outDB, std::list<std::string> cluSteps, int threads)
{
void mergeClusteringResults(std::string seqDB, std::string outDB, std::list<std::string> cluSteps, int threads){
// open the sequence database
// it will serve as the reference for sequence indexes
std::string seqDBIndex = seqDB + ".index";
@@ -138,19 +140,72 @@ void mergeClusteringResults(std::string seqDB, std::string outDB, std::list<std:
delete mergedClustering[i];
}
delete[] mergedClustering;
Debug(Debug::WARNING) << "...done.\n";
}
void mergeSearchWithClustersResults(const std::string &searchResults, const std::string &outDB,
const std::string &geneToQuerySetLink, int threads){
std::string searchResultsIndex = searchResults + ".index";
DBReader<unsigned int> searchResultsDB(searchResults.c_str(), searchResultsIndex.c_str());
searchResultsDB.open(DBReader<unsigned int>::NOSORT);
std::string geneToQuerySetLinkIndex = geneToQuerySetLink + ".index";
DBReader<unsigned int> geneToQuerySetLinkDB(geneToQuerySetLink.c_str(), geneToQuerySetLinkIndex.c_str());
geneToQuerySetLinkDB.open(DBReader<unsigned int>::NOSORT);
std::string outDBIndex = outDB + ".index";
auto* dbw = new DBWriter(outDB.c_str(), outDBIndex.c_str(), static_cast<unsigned int>(threads));
dbw->open();
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < geneToQuerySetLinkDB.getSize(); i++){
std::string buffer;
std::stringstream querySetGeneList(geneToQuerySetLinkDB.getData(i));
std::string Gene;
int thread_idx = 0;
#ifdef OPENMP
thread_idx = omp_get_thread_num();
#endif
// go through the sequences in the cluster and add them to the initial clustering
while (std::getline(querySetGeneList, Gene)){
auto key = (unsigned int) strtoul(Gene.c_str(), nullptr, 10);
char *data = searchResultsDB.getDataByDBKey(key);
buffer.append(data);
}
dbw->writeData(buffer.c_str(), buffer.length(), geneToQuerySetLinkDB.getDbKey(i),
static_cast<unsigned int>(thread_idx));
}
dbw->close();
delete dbw;
searchResultsDB.close();
geneToQuerySetLinkDB.close();
Debug(Debug::WARNING) << "...done.\n";
}
int mergeclusters(int argc, const char **argv, const Command& command) {
Parameters& par = Parameters::getInstance();
par.parseParameters(argc, argv, command, 4, true, Parameters::PARSE_VARIADIC);
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, 3, true, true);
#ifdef OPENMP
omp_set_num_threads(par.threads);
#endif
std::list<std::string> clusterings;
for(size_t i = 2; i < par.filenames.size(); i++){
clusterings.emplace_back(par.filenames[i]);
for (int i = 2; i < argc; i++) {
clusterings.push_back(std::string(argv[i]));
}
if (!par.DBfile.empty()){
mergeSearchWithClustersResults(par.db1, par.db2, par.DBfile, par.threads);
}
mergeClusteringResults(par.db1, par.db2, clusterings, par.threads);
else
mergeClusteringResults(par.db1, par.db2, clusterings, par.threads);
return 0;
}
View
@@ -35,8 +35,11 @@ int translatenucs(int argc, const char **argv, const Command& command) {
TranslateNucl translateNucl(static_cast<TranslateNucl::GenCode>(par.translationTable));
#pragma omp parallel
{
char* aa = new char[par.maxSeqLen/3 + 3 + 1];
int thread_idx = 0;
char* aa = new char[par.maxSeqLen + 3 + 1];
#pragma omp for schedule(dynamic, 5)
for (size_t i = 0; i < entries; ++i) {
int thread_idx = 0;
#ifdef OPENMP
thread_idx = omp_get_thread_num();
#endif
@@ -47,7 +50,7 @@ int translatenucs(int argc, const char **argv, const Command& command) {
char* data = reader.getData(i);
bool addStopAtStart = false;
bool addStopAtEnd = false;
if (addOrfStop == true) {
if(addOrfStop == true){
char* headData = header->getDataByDBKey(key);
char * entry[255];
size_t columns = Util::getWordsOfLine(headData, entry, 255);
@@ -89,6 +92,13 @@ int translatenucs(int argc, const char **argv, const Command& command) {
Debug(Debug::WARNING) << "Nucleotide sequence entry " << key << " length (" << length << ") is too short. Skipping entry.\n";
continue;
}
if(length > 3*par.maxSeqLen) {
Debug(Debug::WARNING) << "Nucleotide sequence entry " << key << " length (" << length << ") is too long. Trimming entry.\n";
length = 3*par.maxSeqLen;
}
// std::cout << data << std::endl;
char * writeAA;
if (addStopAtStart) {

0 comments on commit 99be556

Please sign in to comment.