Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Woosub-Kim committed Jan 30, 2024
2 parents da825d5 + 852434a commit f690b9d
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 26 deletions.
5 changes: 4 additions & 1 deletion src/commons/LocalParameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ LocalParameters::LocalParameters() :
PARAM_FILE_EXCLUDE(PARAM_FILE_EXCLUDE_ID, "--file-exclude", "File Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &fileExclude, "^.*$"),
PARAM_INDEX_EXCLUDE(PARAM_INDEX_EXCLUDE_ID, "--index-exclude", "Index Exclusion", "Exclude parts of the index:\n0: Full index\n1: Exclude k-mer index (for use with --prefilter-mode 1)\n2: Exclude C-alpha coordinates (for use with --sort-by-structure-bits 0)\nFlags can be combined bit wise", typeid(int), (void *) &indexExclude, "^[0-3]{1}$", MMseqsParameter::COMMAND_EXPERT),
PARAM_COMPLEX_REPORT_MODE(PARAM_COMPLEX_REPORT_MODE_ID, "--complex-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &complexReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN)
PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN),
PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$")
{
PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id";
PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$";
Expand Down Expand Up @@ -78,6 +79,7 @@ LocalParameters::LocalParameters() :
structurecreatedb.push_back(&PARAM_WRITE_LOOKUP);
structurecreatedb.push_back(&PARAM_TAR_INCLUDE);
structurecreatedb.push_back(&PARAM_TAR_EXCLUDE);
structurecreatedb.push_back(&PARAM_INPUT_FORMAT);
// protein chain only
structurecreatedb.push_back(&PARAM_FILE_INCLUDE);
structurecreatedb.push_back(&PARAM_FILE_EXCLUDE);
Expand Down Expand Up @@ -209,6 +211,7 @@ LocalParameters::LocalParameters() :
maskLowerCaseMode = 1;
coordStoreMode = COORD_STORE_MODE_CA_DIFF;
clusterSearch = 0;
inputFormat = 0; // auto detect
fileInclude = ".*";
fileExclude = "^$";
dbSuffixList = "_h,_ss,_ca";
Expand Down
2 changes: 2 additions & 0 deletions src/commons/LocalParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ class LocalParameters : public Parameters {
PARAMETER(PARAM_INDEX_EXCLUDE)
PARAMETER(PARAM_COMPLEX_REPORT_MODE)
PARAMETER(PARAM_EXPAND_COMPLEX_EVALUE)
PARAMETER(PARAM_INPUT_FORMAT)

int prefMode;
float tmScoreThr;
Expand All @@ -133,6 +134,7 @@ class LocalParameters : public Parameters {
int indexExclude;
int complexReportMode;
double eValueThrExpandComplex;
int inputFormat;

static std::vector<int> getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner,
Expand Down
77 changes: 62 additions & 15 deletions src/strucclustutils/GemmiWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,23 @@ std::unordered_map<std::string, int> getEntityTaxIDMapping(gemmi::cif::Document&
return entity_to_taxid;
}

bool GemmiWrapper::load(std::string & filename){
if (gemmi::iends_with(filename, ".fcz")) {
GemmiWrapper::Format mapFormat(gemmi::CoorFormat format) {
switch (format) {
case gemmi::CoorFormat::Pdb:
return GemmiWrapper::Format::Pdb;
case gemmi::CoorFormat::Mmcif:
return GemmiWrapper::Format::Mmcif;
case gemmi::CoorFormat::Mmjson:
return GemmiWrapper::Format::Mmjson;
case gemmi::CoorFormat::ChemComp:
return GemmiWrapper::Format::ChemComp;
default:
return GemmiWrapper::Format::Unknown;
}
}

bool GemmiWrapper::load(const std::string& filename, Format format) {
if ((format == Format::Foldcomp) || (format == Format::Detect && gemmi::iends_with(filename, ".fcz"))) {
std::ifstream in(filename, std::ios::binary);
if (!in) {
return false;
Expand All @@ -77,22 +92,30 @@ bool GemmiWrapper::load(std::string & filename){
#else
gemmi::BasicInput infile(filename);
#endif
gemmi::CoorFormat format = gemmi::coor_format_from_ext(infile.basepath());
if (format == Format::Detect) {
format = mapFormat(gemmi::coor_format_from_ext(infile.basepath()));
}
gemmi::Structure st;
std::unordered_map<std::string, int> entity_to_tax_id;
switch (format) {
case gemmi::CoorFormat::Mmcif: {
case Format::Mmcif: {
gemmi::cif::Document doc = gemmi::cif::read(infile);
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure(doc);
break;
}
case gemmi::CoorFormat::Mmjson:
st = gemmi::make_structure(gemmi::cif::read_mmjson(infile));
case Format::Mmjson: {
gemmi::cif::Document doc = gemmi::cif::read_mmjson(infile);
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure(doc);
break;
case gemmi::CoorFormat::ChemComp:
st = gemmi::make_structure_from_chemcomp_doc(gemmi::cif::read(infile));
}
case Format::ChemComp: {
gemmi::cif::Document doc = gemmi::cif::read(infile);
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure_from_chemcomp_doc(doc);
break;
}
default:
st = gemmi::read_pdb(infile);
}
Expand All @@ -112,8 +135,8 @@ struct OneShotReadBuf : public std::streambuf
}
};

bool GemmiWrapper::loadFromBuffer(const char * buffer, size_t bufferSize, const std::string& name) {
if (bufferSize > MAGICNUMBER_LENGTH && strncmp(buffer, MAGICNUMBER, MAGICNUMBER_LENGTH) == 0) {
bool GemmiWrapper::loadFromBuffer(const char * buffer, size_t bufferSize, const std::string& name, GemmiWrapper::Format format) {
if ((format == Format::Foldcomp) || (format == Format::Detect && (bufferSize > MAGICNUMBER_LENGTH && strncmp(buffer, MAGICNUMBER, MAGICNUMBER_LENGTH) == 0))) {
OneShotReadBuf buf((char *) buffer, bufferSize);
std::istream istr(&buf);
if (!istr) {
Expand All @@ -127,21 +150,45 @@ bool GemmiWrapper::loadFromBuffer(const char * buffer, size_t bufferSize, const
#else
gemmi::BasicInput infile(name);
#endif
gemmi::CoorFormat format = gemmi::coor_format_from_ext(infile.basepath());
if (format == Format::Detect) {
format = mapFormat(gemmi::coor_format_from_ext(infile.basepath()));
}

gemmi::Structure st;
std::unordered_map<std::string, int> entity_to_tax_id;
switch (format) {
case gemmi::CoorFormat::Pdb:
case Format::Pdb:
st = gemmi::pdb_impl::read_pdb_from_stream(gemmi::MemoryStream(buffer, bufferSize), name, gemmi::PdbReadOptions());
break;
case gemmi::CoorFormat::Mmcif: {
case Format::Mmcif: {
gemmi::cif::Document doc = gemmi::cif::read_memory(buffer, bufferSize, name.c_str());
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure(doc);
break;
}
case gemmi::CoorFormat::Unknown:
case gemmi::CoorFormat::Detect:
case Format::Mmjson: {
char* bufferCopy = (char*)malloc(bufferSize + 1 * sizeof(char));
if (bufferCopy == NULL) {
return false;
}
if (memcpy(bufferCopy, buffer, bufferSize) == NULL) {
free(bufferCopy);
return false;
}
bufferCopy[bufferSize] = '\0';
gemmi::cif::Document doc = gemmi::cif::read_mmjson_insitu(bufferCopy, bufferSize, name);
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure(doc);
free(bufferCopy);
break;
}
case Format::ChemComp: {
gemmi::cif::Document doc = gemmi::cif::read_memory(buffer, bufferSize, name.c_str());
entity_to_tax_id = getEntityTaxIDMapping(doc);
st = gemmi::make_structure_from_chemcomp_doc(doc);
break;
}
default:
return false;
}
updateStructure((void*) &st, name, entity_to_tax_id);
Expand Down
14 changes: 12 additions & 2 deletions src/strucclustutils/GemmiWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,21 @@

class GemmiWrapper {
public:
enum class Format {
Detect = 0,
Pdb,
Mmcif,
Mmjson,
ChemComp,
Foldcomp,
Unknown
};

GemmiWrapper();

bool loadFromBuffer(const char * buffer, size_t bufferSize, const std::string& name);
bool loadFromBuffer(const char * buffer, size_t bufferSize, const std::string& name, Format format = Format::Detect);

bool load(std::string & filename);
bool load(const std::string& filename, Format format = Format::Detect);

std::pair<size_t, size_t> nextChain();

Expand Down
18 changes: 10 additions & 8 deletions src/strucclustutils/structcreatedb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,8 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
bool needsReorderingAtTheEnd = false;
size_t needToWriteModel = 0;

const int inputFormat = par.inputFormat;

// Process tar files!
for(size_t i = 0; i < tarFiles.size(); i++) {
mtar_t tar;
Expand Down Expand Up @@ -365,7 +367,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
}
#endif

#pragma omp parallel default(none) shared(tar, par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter, std::cerr, std::cout) num_threads(localThreads) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel)
#pragma omp parallel default(none) shared(tar, par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter, std::cerr, std::cout, inputFormat) num_threads(localThreads) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel)
{
unsigned int thread_idx = 0;
#ifdef OPENMP
Expand Down Expand Up @@ -491,7 +493,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
} else {
pdbFile.append(dataBuffer, tarHeader.size);
}
if (readStructure.loadFromBuffer(pdbFile.c_str(), pdbFile.size(), name) == false) {
if (readStructure.loadFromBuffer(pdbFile.c_str(), pdbFile.size(), name, (GemmiWrapper::Format)inputFormat) == false) {
incorrectFiles++;
continue;
}
Expand All @@ -515,7 +517,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {


//===================== single_process ===================//__110710__//
#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, looseFiles, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel)
#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, looseFiles, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter, inputFormat) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel)
{
unsigned int thread_idx = 0;
#ifdef OPENMP
Expand All @@ -535,7 +537,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
for (size_t i = 0; i < looseFiles.size(); i++) {
progress.updateProgress();

if(readStructure.load(looseFiles[i]) == false){
if(readStructure.load(looseFiles[i], (GemmiWrapper::Format)inputFormat) == false){
incorrectFiles++;
continue;
}
Expand Down Expand Up @@ -570,7 +572,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
filter = parts[2][0];
}
progress.reset(SIZE_MAX);
#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, gcsPaths, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, client, bucket_name, filter, mappingWriter) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel)
#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, gcsPaths, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, client, bucket_name, filter, mappingWriter) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel, inputFormat)
{
StructureTo3Di structureTo3Di;
PulchraWrapper pulchra;
Expand Down Expand Up @@ -600,7 +602,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
Debug(Debug::ERROR) << reader.status().message() << "\n";
} else {
std::string contents{std::istreambuf_iterator<char>{reader}, {}};
if (readStructure.loadFromBuffer(contents.c_str(), contents.size(), obj_name) == false) {
if (readStructure.loadFromBuffer(contents.c_str(), contents.size(), obj_name, inputFormat) == false) {
incorrectFiles++;
} else {
__sync_add_and_fetch(&needToWriteModel, (readStructure.modelCount > 1));
Expand All @@ -624,7 +626,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
DBReader<unsigned int> reader(dbs[i].c_str(), (dbs[i]+".index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_LOOKUP);
reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
progress.reset(reader.getSize());
#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, reader, mappingWriter) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel)
#pragma omp parallel default(none) shared(par, torsiondbw, hdbw, cadbw, aadbw, mat, progress, globalCnt, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, reader, mappingWriter, inputFormat) reduction(+:incorrectFiles, tooShort, notProtein, needToWriteModel)
{
StructureTo3Di structureTo3Di;
PulchraWrapper pulchra;
Expand All @@ -651,7 +653,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) {
size_t lookupId = reader.getLookupIdByKey(reader.getDbKey(i));
std::string name = reader.getLookupEntryName(lookupId);

if (readStructure.loadFromBuffer(data, len, name) == false) {
if (readStructure.loadFromBuffer(data, len, name, (GemmiWrapper::Format)inputFormat) == false) {
incorrectFiles++;
} else {
__sync_add_and_fetch(&needToWriteModel, (readStructure.modelCount > 1));
Expand Down

0 comments on commit f690b9d

Please sign in to comment.