In [21]:
import pandas as pd
from hydra import initialize, compose
from pathlib import Path
from ergochemics.draw import draw_molecule
from ergochemics.mapping import rc_to_nest
from IPython.display import SVG
from rdkit import Chem
from rdkit.Chem import AllChem
from cgr.ml import scrub_anonymous_template_atoms
from itertools import accumulate
from collections import defaultdict, Counter
import networkx as nx

In [3]:
# Compose the Hydra "filepaths" config; `cfg` provides the data directories
# (cfg.processed_data, cfg.raw_data) that the loading cells below read from.
with initialize(config_path="../configs/filepaths", version_base=None):
    cfg = compose(config_name="filepaths")

In [18]:
# Load both rule tables from the processed-data directory named in the config.
mechinfo_rules = pd.read_csv(Path(cfg.processed_data) / "mechinformed_rules.csv")
dt01_rules = pd.read_csv(Path(cfg.processed_data) / "mechinferred_dt_01_rules.csv")

# Preview the first rows of the mechinformed rules.
mechinfo_rules.head()

Unnamed: 0,id,smarts,entry_id,mechanism_id
0,0,[C&D3&v4&H1&+0&!R&z1:1]-[C&D3&v4&H0&+0&!R&z2:2...,"[1, 1]","[1, 1]"
1,1,[C&D3&v4&H0&+0&R&z2:1](=[O&D1&v2&H0&+0&!R:2])-...,"[2, 2, 210, 257, 258]","[2, 1, 1, 1, 1]"
2,2,([C&D3&v4&H0&+0&!R&z2:1](=[O&D1&v2&H0&+0&!R:2]...,"[2, 2, 210, 257, 258]","[2, 1, 1, 1, 1]"
3,3,[N&D3&v3&H0&+0&R:1]1-[C&D2&v4&H1&+0&R&z1:2]=[C...,[3],[1]
4,4,[O&D1&v2&H1&+0&!R:1]-[c&D3&v4&H0&+0&R&z1:2]1:[...,[3],[1]


In [14]:
def contains_split_mol(rxn_sma: str) -> bool:
    '''
    Checks whether the template counts of a parsed reaction SMARTS disagree
    with the number of '.'-separated pieces in its text.

    Args
    ----
    rxn_sma : str
        Reaction SMARTS of the form 'reactants>>products'

    Returns
    -------
    bool
        True if, on either side, the number of templates reported by the
        parsed reaction differs from (number of '.' characters + 1);
        False if both sides agree.
        NOTE(review): presumably a mismatch means several '.'-separated
        fragments were grouped into one template with parentheses -- confirm.
    '''
    parsed = AllChem.ReactionFromSmarts(rxn_sma)
    reactant_side, product_side = rxn_sma.split('>>')

    # Every '.' separates two pieces, so k dots imply k + 1 pieces
    expected_rcts = reactant_side.count('.') + 1
    expected_pdts = product_side.count('.') + 1

    counts_agree = (
        parsed.GetNumReactantTemplates() == expected_rcts
        and parsed.GetNumProductTemplates() == expected_pdts
    )
    return not counts_agree

In [16]:
# Flag each mechinformed rule with contains_split_mol, then tally the flags.
mechinfo_rules['contains_split_mol'] = mechinfo_rules['smarts'].map(contains_split_mol)
mechinfo_rules['contains_split_mol'].value_counts()

contains_split_mol
False    781
True     118
Name: count, dtype: int64

In [17]:
# Show only the rules that were flagged as containing a split molecule.
mechinfo_rules[mechinfo_rules['contains_split_mol']]

Unnamed: 0,id,smarts,entry_id,mechanism_id,contains_split_mol
1,1,[C&D3&v4&H0&+0&R&z2:1](=[O&D1&v2&H0&+0&!R:2])-...,"[2, 2, 210, 257, 258]","[2, 1, 1, 1, 1]",True
2,2,([C&D3&v4&H0&+0&!R&z2:1](=[O&D1&v2&H0&+0&!R:2]...,"[2, 2, 210, 257, 258]","[2, 1, 1, 1, 1]",True
15,15,([C&D2&v4&H2&+0&!R&z1:1]-[O&D2&v2&H0&+0&!R:2]....,[9],[1],True
16,16,([O&D1&v2&H1&+0&!R:1].[O&D1&v2&H1&+0&!R:2]).[P...,[9],[1],True
23,23,[C&D3&v4&H0&+0&R&z2:1]-[N&D3&v3&H0&+0&R:2].[O&...,"[15, 258]","[1, 2]",True
...,...,...,...,...,...
798,798,([O&D1&v2&H0&+0&!R:1]=[C&D3&v4&H0&+0&R&z2:2]-[...,"[861, 861]","[1, 1]",True
849,849,([C&D2&v4&H2&+0&!R&z0:1]-[C&D2&v4&H2&+0&!R&z0:...,[969],[2],True
850,850,([C&D2&v4&H1&+0&!R&z0:1]=[C&D2&v4&H1&+0&!R&z0:...,[969],[2],True
871,871,[C&D4&v4&H0&+0&R&z2:1](-[O&D1&v2&H1&+0&!R:2])(...,[983],[1],True


In [19]:
# Same check for the dt_01 rules: flag each rule, then tally the flags.
dt01_rules['contains_split_mol'] = dt01_rules['smarts'].map(contains_split_mol)
dt01_rules['contains_split_mol'].value_counts()

contains_split_mol
False    13892
True       352
Name: count, dtype: int64

In [20]:
# Load the mechanism-labelled reactions from the raw-data directory.
mm = pd.read_parquet(Path(cfg.raw_data) / "distilled_mech_reactions.parquet")

# Pass both atom-index columns through rc_to_nest; the preview below shows
# them as nested tuples afterwards.
mm["reaction_center"] = mm["reaction_center"].map(rc_to_nest)
mm["mech_atoms"] = mm["mech_atoms"].map(rc_to_nest)
mm.head()

Unnamed: 0,entry_id,mechanism_id,smarts,am_smarts,reaction_center,mech_atoms,enzyme_name,uniprot_id,ec,reported_direction
0,1,1,NC(CCC(=O)O)C(=O)O>>NC(CCC(=O)O)C(=O)O,[NH2:26][CH:25]([CH2:27][CH2:31][C:28](=[O:29]...,"(((7, 8, 9),), ((7, 8, 9),))","(((1,),), ((1,),))",glutamate racemase,P56868,5.1.1.3,True
1,1,1,NC(CCC(=O)O)C(=O)O>>NC(CCC(=O)O)C(=O)O,[NH2:26][CH:25]([CH2:27][CH2:31][C:28](=[O:29]...,"(((7, 8, 9),), ((7, 8, 9),))","(((1,),), ((1,),))",glutamate racemase,P56868,5.1.1.3,False
2,2,2,*C(=O)NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O>>*C(=O)NC...,[*:33][C:32](=[O:35])[NH:30][CH:16]1[C:17](=[O...,"(((5, 7), (0,)), ((5, 7, 9),))","(((6,), ()), ((6,),))",beta-lactamase (Class A),P62593,3.5.2.6,True
3,2,2,*C(=O)NC(C(=O)O)C1NC(C(=O)O)C(C)(C)S1>>*C(=O)N...,[*:33][C:32](=[O:35])[NH:30][CH:16]([C:17](=[O...,"(((5, 7, 9),), ((5, 7), (0,)))","(((6,),), ((6,), ()))",beta-lactamase (Class A),P62593,3.5.2.6,False
4,2,1,*C(=O)NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O>>*C(=O)NC...,[*:51][C:50](=[O:52])[NH:48][CH:33]1[C:35](=[O...,"(((5, 7), (0,)), ((5, 7, 9),))","(((6,), ()), ((6,),))",beta-lactamase (Class A),P62593,3.5.2.6,True


In [22]:
def contains_split_rc(rxn: str, atoms_to_include: list[list[int]], reaction_center: list[list[int]]) -> bool:
    '''
    Checks whether any reactant's reaction center is split into more than
    one connected piece.

    For each LHS molecule, the molecular graph is restricted to that
    molecule's reaction-center atoms. The reaction center counts as split
    when this induced subgraph has more than one connected component.

    Args
    ----
    rxn : str
        Reaction string of the form 'lhs>>rhs' with molecules separated
        by '.'. Only the LHS molecules are parsed.
    atoms_to_include : Iterable[Iterable[Iterable[int]]]
        Accepted so existing callers keep working; it does not influence
        the result.
    reaction_center : Iterable[Iterable[Iterable[int]]]
        Reaction-center atom indices, indexed as [side][molecule]. Only
        element 0 (reactants / lhs of rxn) is used; its entries are paired
        positionally with the LHS molecules.

    Returns
    -------
    bool
        True if at least one LHS molecule has a reaction center whose atoms
        form more than one connected component, False otherwise.

    Raises
    ------
    ValueError
        If rxn does not split into exactly two sides on '>>', or if an LHS
        molecule that needs checking cannot be parsed.
    '''
    lhs_smiles, _ = rxn.split('>>') # Unpacking enforces exactly two sides
    lhs_rcs = reaction_center[0]

    for smiles, rc in zip(lhs_smiles.split('.'), lhs_rcs):
        rc_aidxs = sorted(set(rc)) # Deduplicate; fixed order for indexing
        if len(rc_aidxs) < 2:
            continue # Zero or one atom can never be disconnected

        lmol = Chem.MolFromSmiles(smiles)
        if lmol is None:
            raise ValueError(f"Could not parse reactant SMILES: {smiles!r}")

        A = Chem.GetAdjacencyMatrix(lmol)
        A_rc = A[rc_aidxs, :][:, rc_aidxs] # Subgraph induced on rc atoms

        if nx.number_connected_components(nx.from_numpy_array(A_rc)) > 1:
            return True

    return False

In [23]:
# Evaluate contains_split_rc once per reaction by walking the three input
# columns in lockstep, then tally the flags.
mm['contains_split_rc'] = [
    contains_split_rc(rxn_smiles, mech_atoms, rc)
    for rxn_smiles, mech_atoms, rc in zip(
        mm['smarts'], mm['mech_atoms'], mm['reaction_center']
    )
]
mm['contains_split_rc'].value_counts()

contains_split_rc
False    1236
True      110
Name: count, dtype: int64