Skip to content

Commit

Permalink
Add in place and multithread support for more of the MolStandardize c…
Browse files Browse the repository at this point in the history
…ode (#6970)
  • Loading branch information
greglandrum committed Dec 12, 2023
1 parent c9a46b3 commit c7c9ad3
Show file tree
Hide file tree
Showing 13 changed files with 966 additions and 122 deletions.
81 changes: 58 additions & 23 deletions Code/GraphMol/MolStandardize/Fragment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,10 @@ void FragmentRemover::removeInPlace(RWMol &mol) {
mol.commitBatchEdit();
}

bool isOrganic(const ROMol &frag) {
bool isOrganic(const ROMol &mol, const std::vector<int> &indices) {
// Returns true if fragment contains at least one carbon atom.
for (const auto at : frag.atoms()) {
if (at->getAtomicNum() == 6) {
for (auto idx : indices) {
if (mol.getAtomWithIdx(idx)->getAtomicNum() == 6) {
return true;
}
}
Expand All @@ -179,57 +179,84 @@ LargestFragmentChooser::LargestFragmentChooser(
countHeavyAtomsOnly = other.countHeavyAtomsOnly;
}

ROMol *LargestFragmentChooser::choose(const ROMol &mol) {
ROMol *LargestFragmentChooser::choose(const ROMol &mol) const {
auto res = new RWMol(mol);
chooseInPlace(*res);
// resanitize the molecule
MolOps::sanitizeMol(*res);

return static_cast<ROMol *>(res);
}

void LargestFragmentChooser::chooseInPlace(RWMol &mol) const {
BOOST_LOG(rdInfoLog) << "Running LargestFragmentChooser\n";

if (!mol.getNumAtoms()) {
return new ROMol(mol);
return;
}

std::vector<std::vector<int>> frags;
MolOps::getMolFrags(mol, frags);
if (frags.size() == 1) {
// nothing to do
return;
}
std::vector<boost::shared_ptr<ROMol>> frags = MolOps::getMolFrags(mol);

LargestFragmentChooser::Largest l;

for (const auto &frag : frags) {
std::string smiles = MolToSmiles(*frag);
SmilesWriteParams ps;
int bestFragment = -1;
for (auto fidx = 0u; fidx < frags.size(); ++fidx) {
const auto &frag = frags[fidx];
std::string smiles = MolFragmentToSmiles(mol, ps, frag);
BOOST_LOG(rdInfoLog) << "Fragment: " << smiles << "\n";
bool organic = isOrganic(*frag);
bool organic = isOrganic(mol, frag);
if (this->preferOrganic) {
// Skip this fragment if not organic and we already have an organic
// fragment as the largest so far
if (l.Fragment != nullptr && l.Organic && !organic) {
if (bestFragment >= 0 && l.Organic && !organic) {
continue;
}
// Reset largest if it wasn't organic and this fragment is organic
// if largest and organic and not largest['organic']:
if (l.Fragment != nullptr && organic && !l.Organic) {
l.Fragment = nullptr;
if (bestFragment >= 0 && organic && !l.Organic) {
bestFragment = -1;
}
}
unsigned int numatoms = 0;
if (this->useAtomCount) {
for (const auto at : frag->atoms()) {
for (const auto idx : frag) {
++numatoms;
if (!this->countHeavyAtomsOnly) {
numatoms += at->getTotalNumHs();
numatoms += mol.getAtomWithIdx(idx)->getTotalNumHs();
}
}
// Skip this fragment if fewer atoms than the largest
if (l.Fragment != nullptr && (numatoms < l.NumAtoms)) {
if (bestFragment >= 0 && (numatoms < l.NumAtoms)) {
continue;
}
}

// Skip this fragment if equal number of atoms but weight is lower
double weight = Descriptors::calcExactMW(*frag);
if (l.Fragment != nullptr &&
(!this->useAtomCount || numatoms == l.NumAtoms) &&
double weight = 0.0;
for (auto idx : frag) {
const auto atom = mol.getAtomWithIdx(idx);
// it's not important to be perfect here
weight += 100 * atom->getAtomicNum() + atom->getIsotope() -
atom->getFormalCharge() * .1;
if (!this->countHeavyAtomsOnly) {
weight += atom->getTotalNumHs();
}
}

if (bestFragment >= 0 && (!this->useAtomCount || numatoms == l.NumAtoms) &&
(weight < l.Weight)) {
continue;
}

// Skip this fragment if equal number of atoms and equal weight but smiles
// comes last alphabetically
if (l.Fragment != nullptr &&
(!this->useAtomCount || numatoms == l.NumAtoms) &&
if (bestFragment >= 0 && (!this->useAtomCount || numatoms == l.NumAtoms) &&
(weight == l.Weight) && (smiles > l.Smiles)) {
continue;
}
Expand All @@ -238,13 +265,21 @@ ROMol *LargestFragmentChooser::choose(const ROMol &mol) {
<< numatoms << ")\n";
// Otherwise this is the largest so far
l.Smiles = smiles;
l.Fragment = frag;
bestFragment = fidx;
l.NumAtoms = numatoms;
l.Weight = weight;
l.Organic = organic;
}

return new ROMol(*(l.Fragment));
mol.beginBatchEdit();
for (auto fi = 0; fi < static_cast<int>(frags.size()); ++fi) {
if (fi == bestFragment) {
continue;
}
for (auto i : frags[fi]) {
mol.removeAtom(i);
}
}
mol.commitBatchEdit();
}

LargestFragmentChooser::Largest::Largest() : Smiles(""), Fragment(nullptr) {}
Expand Down
3 changes: 2 additions & 1 deletion Code/GraphMol/MolStandardize/Fragment.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ class RDKIT_MOLSTANDARDIZE_EXPORT LargestFragmentChooser {
LargestFragmentChooser(const LargestFragmentChooser &other);
~LargestFragmentChooser() = default;

ROMol *choose(const ROMol &mol);
ROMol *choose(const ROMol &mol) const;
void chooseInPlace(RWMol &mol) const;
struct Largest {
Largest();
Largest(std::string &smiles, boost::shared_ptr<ROMol> fragment,
Expand Down
169 changes: 124 additions & 45 deletions Code/GraphMol/MolStandardize/MolStandardize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,16 +182,47 @@ void cleanupInPlace(std::vector<RWMol *> &mols, int numThreads,
mols, numThreads, params);
}

void tautomerParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(mol, params);
}

canonicalTautomerInPlace(mol, params);
cleanupInPlace(mol, params);
}

void tautomerParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
tautomerParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}

RWMol *tautomerParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
tautomerParentInPlace(*res, params, skip_standardize);
return res.release();
}

void fragmentParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
fragmentParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}
void fragmentParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(*res, params);
cleanupInPlace(mol, params);
}

std::unique_ptr<RWMol> ct{canonicalTautomer(res.get(), params)};
cleanupInPlace(*ct, params);
return ct.release();
LargestFragmentChooser lfragchooser(params.preferOrganic);
lfragchooser.chooseInPlace(mol);
}

// Return the fragment parent of a given molecule.
Expand All @@ -200,65 +231,109 @@ RWMol *tautomerParent(const RWMol &mol, const CleanupParameters &params,
RWMol *fragmentParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
if (!skip_standardize) {
cleanupInPlace(*res, params);
}
LargestFragmentChooser lfragchooser(params.preferOrganic);
return static_cast<RWMol *>(lfragchooser.choose(*res));
fragmentParentInPlace(*res, params, skip_standardize);
return res.release();
}

RWMol *stereoParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
RWMol *res = new RWMol(mol);
void stereoParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
stereoParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}
void stereoParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(*res, params);
cleanupInPlace(mol, params);
}

MolOps::removeStereochemistry(*res);
return res;
MolOps::removeStereochemistry(mol);
}
RWMol *stereoParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
stereoParentInPlace(*res, params, skip_standardize);
return res.release();
}
void isotopeParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
isotopeParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}

RWMol *isotopeParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
RWMol *res = new RWMol(mol);
void isotopeParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(*res, params);
cleanupInPlace(mol, params);
}

for (auto atom : res->atoms()) {
for (auto atom : mol.atoms()) {
atom->setIsotope(0);
}
return res;
}
RWMol *isotopeParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
isotopeParentInPlace(*res, params, skip_standardize);
return res.release();
}

void chargeParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
chargeParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}
void chargeParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
fragmentParentInPlace(mol, params, skip_standardize);
Uncharger uncharger(params.doCanonical);
uncharger.unchargeInPlace(mol);
cleanupInPlace(mol, params);
}
RWMol *chargeParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
// Return the charge parent of a given molecule.
// The charge parent is the uncharged version of the fragment parent.
std::unique_ptr<RWMol> res{new RWMol(mol)};
chargeParentInPlace(*res, params, skip_standardize);
return res.release();
}

std::unique_ptr<RWMol> fragparent{
fragmentParent(mol, params, skip_standardize)};
void superParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(mol, params);
}
// we can skip fragmentParent since the chargeParent takes care of that
chargeParentInPlace(mol, params, true);
isotopeParentInPlace(mol, params, true);
stereoParentInPlace(mol, params, true);
tautomerParentInPlace(mol, params, true);
cleanupInPlace(mol, params);
}

Uncharger uncharger(params.doCanonical);
uncharger.unchargeInPlace(*fragparent);
cleanupInPlace(*fragparent, params);
return fragparent.release();
void superParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
superParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}

RWMol *superParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res;
if (!skip_standardize) {
res.reset(cleanup(mol, params));
} else {
res.reset(new RWMol(mol));
}
// we can skip fragmentParent since the chargeParent takes care of that
res.reset(chargeParent(*res, params, true));
res.reset(isotopeParent(*res, params, true));
res.reset(stereoParent(*res, params, true));
res.reset(tautomerParent(*res, params, true));
return cleanup(*res, params);
std::unique_ptr<RWMol> res{new RWMol(mol)};
superParentInPlace(*res, params, skip_standardize);
return res.release();
}

RWMol *normalize(const RWMol *mol, const CleanupParameters &params) {
Expand All @@ -281,7 +356,7 @@ void normalizeInPlace(RWMol &mol, const CleanupParameters &params) {
void normalizeInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params) {
std::unique_ptr<Normalizer> normalizer{normalizerFromParams(params)};
auto sfunc = [&](RWMol &m, const CleanupParameters &) {
auto sfunc = [&normalizer](RWMol &m, const CleanupParameters &) {
normalizer->normalizeInPlace(m);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
Expand All @@ -291,10 +366,10 @@ void reionizeInPlace(RWMol &mol, const CleanupParameters &params) {
std::unique_ptr<Reionizer> reionizer{reionizerFromParams(params)};
reionizer->reionizeInPlace(mol);
}
void reionizeInPlace(std::vector<RWMol *> &mols,int numThreads,
void reionizeInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params) {
std::unique_ptr<Reionizer> reionizer{reionizerFromParams(params)};
auto sfunc = [&](RWMol &m, const CleanupParameters &) {
auto sfunc = [&reionizer](RWMol &m, const CleanupParameters &) {
reionizer->reionizeInPlace(m);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
Expand All @@ -311,10 +386,10 @@ void removeFragmentsInPlace(RWMol &mol, const CleanupParameters &params) {
remover->removeInPlace(mol);
}

void removeFragmentsInPlace(std::vector<RWMol *> &mols,int numThreads,
void removeFragmentsInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params) {
std::unique_ptr<FragmentRemover> remover{fragmentRemoverFromParams(params)};
auto sfunc = [&](RWMol &m, const CleanupParameters &) {
auto sfunc = [&remover](RWMol &m, const CleanupParameters &) {
remover->removeInPlace(m);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
Expand All @@ -325,6 +400,10 @@ RWMol *canonicalTautomer(const RWMol *mol, const CleanupParameters &params) {
std::unique_ptr<TautomerEnumerator> te{tautomerEnumeratorFromParams(params)};
return static_cast<RWMol *>(te->canonicalize(*mol));
}
void canonicalTautomerInPlace(RWMol &mol, const CleanupParameters &params) {
std::unique_ptr<TautomerEnumerator> te{tautomerEnumeratorFromParams(params)};
te->canonicalizeInPlace(mol);
}

std::string standardizeSmiles(const std::string &smiles) {
std::unique_ptr<RWMol> mol{SmilesToMol(smiles, 0, false)};
Expand Down

0 comments on commit c7c9ad3

Please sign in to comment.