Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add in place and multithread support for more of the MolStandardize code #6970

Merged
merged 8 commits into from
Dec 12, 2023
81 changes: 58 additions & 23 deletions Code/GraphMol/MolStandardize/Fragment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,10 @@ void FragmentRemover::removeInPlace(RWMol &mol) {
mol.commitBatchEdit();
}

bool isOrganic(const ROMol &frag) {
bool isOrganic(const ROMol &mol, const std::vector<int> &indices) {
// Returns true if fragment contains at least one carbon atom.
for (const auto at : frag.atoms()) {
if (at->getAtomicNum() == 6) {
for (auto idx : indices) {
if (mol.getAtomWithIdx(idx)->getAtomicNum() == 6) {
return true;
}
}
Expand All @@ -179,57 +179,84 @@ LargestFragmentChooser::LargestFragmentChooser(
countHeavyAtomsOnly = other.countHeavyAtomsOnly;
}

ROMol *LargestFragmentChooser::choose(const ROMol &mol) {
ROMol *LargestFragmentChooser::choose(const ROMol &mol) const {
auto res = new RWMol(mol);
chooseInPlace(*res);
// resanitize the molecule
MolOps::sanitizeMol(*res);

return static_cast<ROMol *>(res);
}

void LargestFragmentChooser::chooseInPlace(RWMol &mol) const {
BOOST_LOG(rdInfoLog) << "Running LargestFragmentChooser\n";

if (!mol.getNumAtoms()) {
return new ROMol(mol);
return;
}

std::vector<std::vector<int>> frags;
MolOps::getMolFrags(mol, frags);
if (frags.size() == 1) {
// nothing to do
return;
}
std::vector<boost::shared_ptr<ROMol>> frags = MolOps::getMolFrags(mol);

LargestFragmentChooser::Largest l;

for (const auto &frag : frags) {
std::string smiles = MolToSmiles(*frag);
SmilesWriteParams ps;
int bestFragment = -1;
for (auto fidx = 0u; fidx < frags.size(); ++fidx) {
const auto &frag = frags[fidx];
std::string smiles = MolFragmentToSmiles(mol, ps, frag);
BOOST_LOG(rdInfoLog) << "Fragment: " << smiles << "\n";
bool organic = isOrganic(*frag);
bool organic = isOrganic(mol, frag);
if (this->preferOrganic) {
// Skip this fragment if not organic and we already have an organic
// fragment as the largest so far
if (l.Fragment != nullptr && l.Organic && !organic) {
if (bestFragment >= 0 && l.Organic && !organic) {
continue;
}
// Reset largest if it wasn't organic and this fragment is organic
// if largest and organic and not largest['organic']:
if (l.Fragment != nullptr && organic && !l.Organic) {
l.Fragment = nullptr;
if (bestFragment >= 0 && organic && !l.Organic) {
bestFragment = -1;
}
}
unsigned int numatoms = 0;
if (this->useAtomCount) {
for (const auto at : frag->atoms()) {
for (const auto idx : frag) {
++numatoms;
if (!this->countHeavyAtomsOnly) {
numatoms += at->getTotalNumHs();
numatoms += mol.getAtomWithIdx(idx)->getTotalNumHs();
}
}
// Skip this fragment if fewer atoms than the largest
if (l.Fragment != nullptr && (numatoms < l.NumAtoms)) {
if (bestFragment >= 0 && (numatoms < l.NumAtoms)) {
continue;
}
}

// Skip this fragment if equal number of atoms but weight is lower
double weight = Descriptors::calcExactMW(*frag);
if (l.Fragment != nullptr &&
(!this->useAtomCount || numatoms == l.NumAtoms) &&
double weight = 0.0;
for (auto idx : frag) {
const auto atom = mol.getAtomWithIdx(idx);
// it's not important to be perfect here
weight += 100 * atom->getAtomicNum() + atom->getIsotope() -
atom->getFormalCharge() * .1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Making weight depend on formal charge looks a bit of a hack, and could lead to unintentional ties (10 formal charges count like one neutron). Wouldn't it be better to leave formal charge out of the weight computation and rather add it as an additional compare criterion?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is clearly a heuristic (sounds nicer than "hack") and it doesn't have to be exactly correct, just reproducible.

Plus, how often are you going to have a molecule with two fragments which have the same number of atoms and the same atoms but which differ by one neutron and a net formal charge difference of 10?

I think it's a theoretical risk. :-)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a hack :-) but I am fine with it.

if (!this->countHeavyAtomsOnly) {
weight += atom->getTotalNumHs();
}
}

if (bestFragment >= 0 && (!this->useAtomCount || numatoms == l.NumAtoms) &&
(weight < l.Weight)) {
continue;
}

// Skip this fragment if equal number of atoms and equal weight but smiles
// comes last alphabetically
if (l.Fragment != nullptr &&
(!this->useAtomCount || numatoms == l.NumAtoms) &&
if (bestFragment >= 0 && (!this->useAtomCount || numatoms == l.NumAtoms) &&
(weight == l.Weight) && (smiles > l.Smiles)) {
continue;
}
Expand All @@ -238,13 +265,21 @@ ROMol *LargestFragmentChooser::choose(const ROMol &mol) {
<< numatoms << ")\n";
// Otherwise this is the largest so far
l.Smiles = smiles;
l.Fragment = frag;
bestFragment = fidx;
l.NumAtoms = numatoms;
l.Weight = weight;
l.Organic = organic;
}

return new ROMol(*(l.Fragment));
mol.beginBatchEdit();
for (auto fi = 0; fi < static_cast<int>(frags.size()); ++fi) {
if (fi == bestFragment) {
continue;
}
for (auto i : frags[fi]) {
mol.removeAtom(i);
}
}
mol.commitBatchEdit();
}

LargestFragmentChooser::Largest::Largest() : Smiles(""), Fragment(nullptr) {}
Expand Down
3 changes: 2 additions & 1 deletion Code/GraphMol/MolStandardize/Fragment.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ class RDKIT_MOLSTANDARDIZE_EXPORT LargestFragmentChooser {
LargestFragmentChooser(const LargestFragmentChooser &other);
~LargestFragmentChooser() = default;

ROMol *choose(const ROMol &mol);
ROMol *choose(const ROMol &mol) const;
void chooseInPlace(RWMol &mol) const;
struct Largest {
Largest();
Largest(std::string &smiles, boost::shared_ptr<ROMol> fragment,
Expand Down
169 changes: 124 additions & 45 deletions Code/GraphMol/MolStandardize/MolStandardize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,16 +182,47 @@ void cleanupInPlace(std::vector<RWMol *> &mols, int numThreads,
mols, numThreads, params);
}

void tautomerParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(mol, params);
}

canonicalTautomerInPlace(mol, params);
cleanupInPlace(mol, params);
}

void tautomerParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
tautomerParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}

RWMol *tautomerParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
tautomerParentInPlace(*res, params, skip_standardize);
return res.release();
}

void fragmentParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
fragmentParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}
void fragmentParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(*res, params);
cleanupInPlace(mol, params);
}

std::unique_ptr<RWMol> ct{canonicalTautomer(res.get(), params)};
cleanupInPlace(*ct, params);
return ct.release();
LargestFragmentChooser lfragchooser(params.preferOrganic);
lfragchooser.chooseInPlace(mol);
}

// Return the fragment parent of a given molecule.
Expand All @@ -200,65 +231,109 @@ RWMol *tautomerParent(const RWMol &mol, const CleanupParameters &params,
RWMol *fragmentParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
if (!skip_standardize) {
cleanupInPlace(*res, params);
}
LargestFragmentChooser lfragchooser(params.preferOrganic);
return static_cast<RWMol *>(lfragchooser.choose(*res));
fragmentParentInPlace(*res, params, skip_standardize);
return res.release();
}

RWMol *stereoParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
RWMol *res = new RWMol(mol);
void stereoParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
stereoParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}
void stereoParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(*res, params);
cleanupInPlace(mol, params);
}

MolOps::removeStereochemistry(*res);
return res;
MolOps::removeStereochemistry(mol);
}
RWMol *stereoParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
stereoParentInPlace(*res, params, skip_standardize);
return res.release();
}
void isotopeParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
isotopeParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}

RWMol *isotopeParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
RWMol *res = new RWMol(mol);
void isotopeParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(*res, params);
cleanupInPlace(mol, params);
}

for (auto atom : res->atoms()) {
for (auto atom : mol.atoms()) {
atom->setIsotope(0);
}
return res;
}
RWMol *isotopeParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res{new RWMol(mol)};
isotopeParentInPlace(*res, params, skip_standardize);
return res.release();
}

void chargeParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
chargeParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}
void chargeParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
fragmentParentInPlace(mol, params, skip_standardize);
Uncharger uncharger(params.doCanonical);
uncharger.unchargeInPlace(mol);
cleanupInPlace(mol, params);
}
RWMol *chargeParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
// Return the charge parent of a given molecule.
// The charge parent is the uncharged version of the fragment parent.
std::unique_ptr<RWMol> res{new RWMol(mol)};
chargeParentInPlace(*res, params, skip_standardize);
return res.release();
}

std::unique_ptr<RWMol> fragparent{
fragmentParent(mol, params, skip_standardize)};
void superParentInPlace(RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
if (!skip_standardize) {
cleanupInPlace(mol, params);
}
// we can skip fragmentParent since the chargeParent takes care of that
chargeParentInPlace(mol, params, true);
isotopeParentInPlace(mol, params, true);
stereoParentInPlace(mol, params, true);
tautomerParentInPlace(mol, params, true);
cleanupInPlace(mol, params);
}

Uncharger uncharger(params.doCanonical);
uncharger.unchargeInPlace(*fragparent);
cleanupInPlace(*fragparent, params);
return fragparent.release();
void superParentInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params,
bool skip_standardize) {
auto sfunc = [skip_standardize](RWMol &m, const CleanupParameters &ps) {
superParentInPlace(m, ps, skip_standardize);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
}

RWMol *superParent(const RWMol &mol, const CleanupParameters &params,
bool skip_standardize) {
std::unique_ptr<RWMol> res;
if (!skip_standardize) {
res.reset(cleanup(mol, params));
} else {
res.reset(new RWMol(mol));
}
// we can skip fragmentParent since the chargeParent takes care of that
res.reset(chargeParent(*res, params, true));
res.reset(isotopeParent(*res, params, true));
res.reset(stereoParent(*res, params, true));
res.reset(tautomerParent(*res, params, true));
return cleanup(*res, params);
std::unique_ptr<RWMol> res{new RWMol(mol)};
superParentInPlace(*res, params, skip_standardize);
return res.release();
}

RWMol *normalize(const RWMol *mol, const CleanupParameters &params) {
Expand All @@ -281,7 +356,7 @@ void normalizeInPlace(RWMol &mol, const CleanupParameters &params) {
void normalizeInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params) {
std::unique_ptr<Normalizer> normalizer{normalizerFromParams(params)};
auto sfunc = [&](RWMol &m, const CleanupParameters &) {
auto sfunc = [&normalizer](RWMol &m, const CleanupParameters &) {
normalizer->normalizeInPlace(m);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
Expand All @@ -291,10 +366,10 @@ void reionizeInPlace(RWMol &mol, const CleanupParameters &params) {
std::unique_ptr<Reionizer> reionizer{reionizerFromParams(params)};
reionizer->reionizeInPlace(mol);
}
void reionizeInPlace(std::vector<RWMol *> &mols,int numThreads,
void reionizeInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params) {
std::unique_ptr<Reionizer> reionizer{reionizerFromParams(params)};
auto sfunc = [&](RWMol &m, const CleanupParameters &) {
auto sfunc = [&reionizer](RWMol &m, const CleanupParameters &) {
reionizer->reionizeInPlace(m);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
Expand All @@ -311,10 +386,10 @@ void removeFragmentsInPlace(RWMol &mol, const CleanupParameters &params) {
remover->removeInPlace(mol);
}

void removeFragmentsInPlace(std::vector<RWMol *> &mols,int numThreads,
void removeFragmentsInPlace(std::vector<RWMol *> &mols, int numThreads,
const CleanupParameters &params) {
std::unique_ptr<FragmentRemover> remover{fragmentRemoverFromParams(params)};
auto sfunc = [&](RWMol &m, const CleanupParameters &) {
auto sfunc = [&remover](RWMol &m, const CleanupParameters &) {
remover->removeInPlace(m);
};
standardizeMultipleMolsInPlace(sfunc, mols, numThreads, params);
Expand All @@ -325,6 +400,10 @@ RWMol *canonicalTautomer(const RWMol *mol, const CleanupParameters &params) {
std::unique_ptr<TautomerEnumerator> te{tautomerEnumeratorFromParams(params)};
return static_cast<RWMol *>(te->canonicalize(*mol));
}
void canonicalTautomerInPlace(RWMol &mol, const CleanupParameters &params) {
std::unique_ptr<TautomerEnumerator> te{tautomerEnumeratorFromParams(params)};
te->canonicalizeInPlace(mol);
}

std::string standardizeSmiles(const std::string &smiles) {
std::unique_ptr<RWMol> mol{SmilesToMol(smiles, 0, false)};
Expand Down