In [32]:
from hydra import initialize, compose
from pathlib import Path
import pandas as pd
from ergochemics.draw import draw_reaction, draw_molecule
from ergochemics.mapping import rc_to_nest
from IPython.display import SVG
import json
from rdkit import Chem
from rdkit.Chem import AllChem

# Compose the Hydra "filepaths" config. Later cells read their data locations
# from it (cfg.raw_data, cfg.rules, cfg.processed_data).
# NOTE(review): config_path is a relative path, so this presumably only resolves
# when the notebook runs from the directory that contains ./conf — confirm.
with initialize(version_base=None, config_path="./conf/filepaths"):
    cfg = compose(config_name="filepaths")

In [2]:
# Load the mapped reaction set (a JSON object keyed by reaction id) and the
# rc_plus_0 rule table.
rhea_path = Path(cfg.raw_data) / "pathway" / "sprhea_240310_v3_mapped_no_subunits.json"

# Pin the encoding: JSON is UTF-8, and without this the read depends on the
# platform's default locale encoding.
with open(rhea_path, 'r', encoding="utf-8") as f:
    rhea = json.load(f)

# Total number of reactions.
print(len(rhea))
# Number of reactions that have a min rule assigned; counted directly instead
# of materialising a filtered copy of the dict just to take its length.
print(sum(v['min_rule'] is not None for v in rhea.values()))

min_rules = pd.read_csv(Path(cfg.rules) / "rc_plus_0_rules.csv", sep=',')

18954
13114


In [3]:
# To pull out the rule of interest: start from a Rhea id, find the reaction
# entry that lists it, then find the row of `min_rules` that references that
# reaction's min rule.

rhea_id = 25250

# First reaction whose 'rhea_ids' contains the query id. Failing loudly here
# avoids a NameError on a fresh kernel (or, worse, silently reusing a stale
# `rxn_id` left over from an earlier run) when the id is absent.
rxn_id = next((k for k, v in rhea.items() if rhea_id in v['rhea_ids']), None)
if rxn_id is None:
    raise LookupError(f"Rhea id {rhea_id} not found in any entry of `rhea`")
print(rxn_id)

ni_id = rhea[rxn_id]['min_rule']
print(ni_id)

if ni_id is None:
    # Some reactions have no min rule (see the count printed when `rhea` is
    # loaded); there is nothing to look up in that case.
    print(f"Reaction {rxn_id} has no min rule")
else:
    # NOTE(review): if 'ni_ids' comes back from the CSV as a plain string, this
    # `in` is a substring test rather than list membership — confirm, and
    # parse the column first if exact matching is required.
    for _, rule_row in min_rules.iterrows():
        if ni_id in rule_row['ni_ids']:
            print(rule_row['id'])
            break
    else:
        # Make the "no match" case visible instead of printing nothing.
        print(f"No row of `min_rules` references {ni_id}")

641
rule0273
837


In [3]:
# Load the reactions mapped against the rc_plus_1 rule set and convert each
# 'template_aidxs' entry with rc_to_nest as part of the load.
fn = "mapped_sprhea_240310_v3_mapped_no_subunits_x_rc_plus_1_rules.parquet"
rc1_path = Path(cfg.processed_data) / "pathway" / fn
df = pd.read_parquet(rc1_path).assign(
    template_aidxs=lambda frame: frame["template_aidxs"].apply(rc_to_nest)
)

# Row count, then that count as a percentage of all entries in `rhea`.
n_mapped = len(df)
print(n_mapped)
print(f"{n_mapped * 100 / len(rhea):.1f}%")
df.head()

12268
64.7%


Unnamed: 0,rxn_id,smarts,am_smarts,rule,template_aidxs,rule_id
0,1,CC(O)C(O)C(O)C(O)C(=O)O>>CC(O)C(O)CC(=O)C(=O)O.O,[CH3:8][CH:6]([OH:9])[CH:1]([OH:7])[CH:2]([OH:...,[C&D3&v4&H1&0*&!R&z1:1]-[C&D3&v4&H1&0*&!R&z1:2...,"(((3, 5, 6, 7, 8, 9),), ((3, 5, 6, 7, 8), (0,)))",1
1,10,CCCCCCCCCCCC(=O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc...,[CH3:14][CH2:13][CH2:12][CH2:11][CH2:10][CH2:9...,[C&D2&v4&H2&0*&!R&z0:1]-[C&D3&v4&H0&0*&!R&z2:2...,"(((10, 11, 12, 13, 14), (0, 1, 2, 3, 4)), ((10...",2
2,100,*C(=O)OCC(COP(=O)(O)OC1C(O)C(OP(=O)(O)O)C(O)C(...,[*:30][C:27](=[O:31])[O:25][CH2:23][CH:22]([CH...,[*:1]-[O&D2&v2&H0&0*&!R:2]-[P&D4&v5&H0&0*&!R:3...,"(((15, 16, 17, 18, 19, 20), (0,)), ((15, 16), ...",3698
3,10003,O=C(O)CCCCCCCCCCCCCCC(=O)O.O=P(O)(O)O.Nc1ncnc2...,[O:2]=[C:3]([OH:1])[CH2:5][CH2:6][CH2:7][CH2:8...,[O&D1&v2&H0&0*&!R:1]=[C&D3&v4&H0&0*&!R&z2:2](-...,"(((0, 1, 2, 3), (0, 1, 2, 3, 4), (19, 21)), ((...",6
4,10008,O=[N+]([O-])c1ccc(OP(=O)(O)O)cc1.O>>O=[N+]([O-...,[O:9]=[N+:8]([O-:10])[c:7]1[cH:5][cH:3][c:1]([...,[*:1]-[O&D2&v2&H0&0*&!R:2]-[P&D4&v5&H0&0*&!R:3...,"(((6, 7, 8, 9, 10, 11), (0,)), ((6, 7), (0, 1,...",3698


In [4]:
# Load the reactions mapped against the rc_plus_0 rule set and report how many
# rows the table has.
fn = "mapped_sprhea_240310_v3_mapped_no_subunits_x_rc_plus_0_rules.parquet"
rc0_path = Path(cfg.processed_data).joinpath("pathway", fn)
rc0_maps = pd.read_parquet(rc0_path)
print(len(rc0_maps))

13851


In [8]:
# Reaction ids present in the rc_plus_0 mapping (`rc0_maps`) but absent from
# the rc_plus_1 mapping (`df`). Sorted so the list order is the same on every
# kernel run; iterating a bare set gives an arbitrary order.
missed = sorted(set(rc0_maps["rxn_id"]) - set(df["rxn_id"]))

In [97]:
# Parse a single atom SMARTS and print the query tree RDKit builds for it, to
# check how each primitive in the pattern is interpreted.
query_atom = Chem.AtomFromSmarts('[N&D3&v3&H0&R&0*:10]')
print(query_atom.DescribeQuery())

AtomAnd
  AtomAnd
    AtomAnd
      AtomAnd
        AtomAnd
          AtomType 7 = val
          AtomExplicitDegree 3 = val
        AtomTotalValence 3 = val
      AtomHCount 0 = val
    AtomInNRings -1 = val
  AtomIsotope 0 = val

