-
Notifications
You must be signed in to change notification settings - Fork 8
/
fragment.py
161 lines (126 loc) · 4.88 KB
/
fragment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import logging
from collections import Counter
import pandas as pd
import rdkit
import rdkit.Chem
import rdkit.Chem.AllChem
def fragment_iterator(smiles, skip_warnings=False, skip_rings=False):
mol_stereo = enumerate_stereocenters(smiles)
if (mol_stereo["atom_unassigned"] != 0) or (mol_stereo["bond_unassigned"] != 0):
logging.warning(f"Molecule {smiles} has undefined stereochemistry")
if skip_warnings:
return
mol = rdkit.Chem.MolFromSmiles(smiles)
mol = rdkit.Chem.rdmolops.AddHs(mol)
rdkit.Chem.Kekulize(mol, clearAromaticFlags=True)
for bond in mol.GetBonds():
if skip_rings and bond.IsInRing():
continue
if bond.GetBondTypeAsDouble() > 1.9999:
continue
try:
# Use RDkit to break the given bond
mh = rdkit.Chem.RWMol(mol)
a1 = bond.GetBeginAtomIdx()
a2 = bond.GetEndAtomIdx()
mh.RemoveBond(a1, a2)
mh.GetAtomWithIdx(a1).SetNoImplicit(True)
mh.GetAtomWithIdx(a2).SetNoImplicit(True)
# Call SanitizeMol to update radicals
rdkit.Chem.SanitizeMol(mh)
# Convert the two molecules into a SMILES string
fragmented_smiles = rdkit.Chem.MolToSmiles(mh)
# Split fragment and canonicalize
split_smiles = fragmented_smiles.split(".")
if len(split_smiles) == 2:
frag1, frag2 = sorted(split_smiles)
elif len(split_smiles) == 1:
frag1, frag2 = split_smiles[0], ""
else:
raise ValueError("Too many fragments")
frag1 = canonicalize_smiles(frag1)
frag2 = canonicalize_smiles(frag2)
# Stoichiometry check
assert (
count_atom_types(frag1) + count_atom_types(frag2)
) == count_atom_types(smiles), "Error with {}; {}; {}".format(
frag1, frag2, smiles
)
# Check introduction of new stereocenters
is_valid_stereo = check_stereocenters(frag1) and check_stereocenters(frag2)
yield pd.Series(
{
"molecule": smiles,
"bond_index": bond.GetIdx(),
"bond_type": get_bond_type(bond),
"fragment1": frag1,
"fragment2": frag2,
"is_valid_stereo": is_valid_stereo,
}
)
except ValueError:
logging.error(
"Fragmentation error with {}, bond {}".format(smiles, bond.GetIdx())
)
continue
def count_atom_types(smiles):
""" Return a dictionary of each atom type in the given fragment or molecule
"""
mol = rdkit.Chem.MolFromSmiles(smiles, sanitize=True)
mol = rdkit.Chem.rdmolops.AddHs(mol)
return Counter([atom.GetSymbol() for atom in mol.GetAtoms()])
def canonicalize_smiles(smiles):
""" Return a consistent SMILES representation for the given molecule """
mol = rdkit.Chem.MolFromSmiles(smiles)
return rdkit.Chem.MolToSmiles(mol)
def enumerate_stereocenters(smiles):
""" Returns a count of both assigned and unassigned stereocenters in the
given molecule """
mol = rdkit.Chem.MolFromSmiles(smiles)
rdkit.Chem.FindPotentialStereoBonds(mol)
stereocenters = rdkit.Chem.FindMolChiralCenters(mol, includeUnassigned=True)
stereobonds = [
bond
for bond in mol.GetBonds()
if bond.GetStereo() is not rdkit.Chem.rdchem.BondStereo.STEREONONE
]
atom_assigned = len([center for center in stereocenters if center[1] != "?"])
atom_unassigned = len([center for center in stereocenters if center[1] == "?"])
bond_assigned = len(
[
bond
for bond in stereobonds
if bond.GetStereo() is not rdkit.Chem.rdchem.BondStereo.STEREOANY
]
)
bond_unassigned = len(
[
bond
for bond in stereobonds
if bond.GetStereo() is rdkit.Chem.rdchem.BondStereo.STEREOANY
]
)
return pd.Series(
{
"atom_assigned": atom_assigned,
"atom_unassigned": atom_unassigned,
"bond_assigned": bond_assigned,
"bond_unassigned": bond_unassigned,
}
)
def check_stereocenters(smiles):
"""Check the given SMILES string to determine whether accurate
enthalpies can be calculated with the given stereochem information
"""
stereocenters = enumerate_stereocenters(smiles)
if stereocenters["bond_unassigned"] > 0:
return False
max_unassigned = 1 if stereocenters["atom_assigned"] == 0 else 1
if stereocenters["atom_unassigned"] <= max_unassigned:
return True
else:
return False
def get_bond_type(bond):
return "{}-{}".format(
*tuple(sorted((bond.GetBeginAtom().GetSymbol(), bond.GetEndAtom().GetSymbol())))
)