In [26]:
import json
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from src.utils import construct_sparse_adj_mat
from pathlib import Path

In [4]:
# Staged train/validation table on the cluster scratch area.
# NOTE(review): the path naming suggests fold 0 of a 3-fold "rcmcs" split — confirm.
fn = "/home/stef/quest_data/hiec/scratch/sprhea_v3_folded_pt_ns/rcmcs/3fold/train_val_0.parquet"
staged_on_quest = pd.read_parquet(path=fn)

In [29]:
# Sanity-check the staged reaction centers: each molecule in the reaction SMARTS is
# paired (in order, left side then right side) with one array of reaction-center atom
# indices, and every index must address an atom of that molecule.
#
# Offending reactions are collected in `issues`, keyed by reaction_idx, as
#   (smarts, [max reaction-center index per molecule], [atom count per molecule]).
issues = {}
for _, row in staged_on_quest.iterrows():
    sma = row['smarts']
    lhs, rhs = [side.split('.') for side in sma.split('>>')]
    # MolFromSmiles returns None (rather than raising) when RDKit cannot parse a string.
    mols = [Chem.MolFromSmiles(s) for s in lhs + rhs]
    l_rc, r_rc = [list(elt) for elt in row['reaction_center']]
    rcs = l_rc + r_rc

    # Reaction-center indices are 0-based (a one-atom molecule carries index 0 in this
    # data), so the valid range is 0 .. n_atoms - 1 and an index equal to n_atoms is
    # already out of range -- hence `>=`, not `>`.
    out_of_range = any(
        m is None or rc.max() >= m.GetNumAtoms()
        for m, rc in zip(mols, rcs)
    )
    if out_of_range:
        issues[row['reaction_idx']] = (
            sma,
            [rc.max() for rc in rcs],
            # None stands in for the atom count of a molecule that failed to parse.
            [None if m is None else m.GetNumAtoms() for m in mols],
        )

In [30]:
# Show the flagged reactions: reaction_idx -> (smarts, rc index maxima, atom counts).
issues

{4236: ('[OH2:10].[O:1]([P:2](=[O:3])([OH:4])[O:5][P:6](=[O:7])([OH:8])[OH:9])[P:11]([O:12][CH2:15][CH:16]1[CH:17]([OH:20])[CH2:19][CH:21]([n:22]2[cH:23][n:25][c:27]3[c:24]2[n:26][c:28]([NH2:31])[n:30][c:29]3[O:32][CH3:33])[O:18]1)(=[O:13])[OH:14]>>[OH:1][P:2](=[O:3])([OH:4])[O:5][P:6](=[O:7])([OH:8])[OH:9].[OH:10][P:11]([O:12][CH2:15][CH:16]1[CH:17]([OH:20])[CH2:19][CH:21]([n:22]2[cH:23][n:25][c:27]3[c:24]2[n:26][c:28]([NH2:31])[n:30][c:29]3[O:32][CH3:33])[O:18]1)(=[O:13])[OH:14]',
  [np.int64(0), np.int64(22), np.int64(21), np.int64(2)],
  [1, 32, 9, 24]),
 1399: ('[CH2:49]1[N:50]([CH3:75])[c:62]2[c:56]([cH:61][cH:64][c:67]3[c:65]2[cH:68][c:70]2[c:71]([cH:69]3)[O:73][CH2:74][O:72]2)-[c:53]2[c:51]1[c:52]1[c:54]([cH:58][c:57]2[O:63][CH3:66])[O:59][CH2:60][O:55]1.[c:1]1([C:7]([NH2:8])=[O:9])[cH:2][n+:3]([CH:10]2[O:11][CH:13]([CH2:16][O:18][P:19](=[O:20])([OH:21])[O:22][P:23](=[O:24])([OH:25])[O:26][CH2:27][CH:28]3[O:29][CH:31]([n:34]4[cH:36][n:39][c:41]5[c:37]4[n:40][cH:45][n:47][c:46]5

In [21]:
# Reaction records, loaded from JSON into `sprhea`.
sprhea = json.loads(
    Path("/home/stef/quest_data/hiec/data/sprhea/v3_folded_pt_ns.json").read_text()
)

# Sparse adjacency matrix plus its two index lookups, built from the CSV of the same
# dataset by the project helper.
adj_csv = Path("/home/stef/quest_data/hiec/data/sprhea/v3_folded_pt_ns.csv")
adj, idx_sample, idx_feature = construct_sparse_adj_mat(adj_csv)

Constructing v3_folded_pt_ns sparse adjacency matrix


In [25]:
# Translate feature (column) position 440 into its sprhea key, then pull that record.
feature_key = idx_feature[440]
entry = sprhea[feature_key]
entry

{'smarts': 'CCCCCCCCCCCCCCCCOCC(COP(=O)(O)OCC[N+](C)(C)C)OC(=O)CCC.O>>CCCC(=O)O.CCCCCCCCCCCCCCCCOCC(O)COP(=O)(O)OCC[N+](C)(C)C',
 'min_rules': ['rule0007', 'rule0006'],
 'rcs': [[[18, 31], [0]], [[3, 5], [19]]],
 'enzymes': [{'uniprot_id': 'Q8VDG7',
   'sequence': 'MGAGQSVCFPPISGPHHIGCTDVMEGHSLEGSLFRLFYPCQASEKCEQPLWIPRYEYSMGLADYLQYNKRWVGLLFNVGIGSCRLPVSWNGPFKAKESGYPLIILSHGLGGFRASYSAFCMELASRGFVVAAVEHRDQSAAATYFCKPTSQESSPAESLEEEWLPFRRIKEGEKEFHVRNPQVHQRVKECVRVLRILQDASAGKTVVNVFPGGLDLMTLKGSIDRNRVAVMGHSFGGATAVLALTQEVQFRCAIALDAWMFPLERDFYPKARGPVFFINVEKFQTVESVNLMKKICAQHEQSRIVTVLGAVHRSQTDFAFVTGNLIAKFFSSNSRGTLDPYESQEVMVRAILAFLQKHLDLKEDYDQWSSFIEGVGPSLIQGAPHYLSSL',
   'existence': 'Evidence at protein level',
   'reviewed': 'reviewed',
   'ec': '2.3.1.149; 3.1.1.47',
   'organism': 'Mus musculus (Mouse)',
   'name': 'Platelet-activating factor acetylhydrolase 2, cytoplasmic (EC 3.1.1.47) (PAF:lysophospholipid transacetylase) (PAF:sphingosine transacetylase) (Platelet-activating factor acetyltransfera

In [32]:
# Print each molecule with its atom count, first for the plain reaction string and
# then for the atom-mapped one, so the molecule order of the two can be compared.
for smarts_key in ('smarts', 'am_smarts'):
    for side in entry[smarts_key].split('>>'):
        for s in side.split('.'):
            m = Chem.MolFromSmiles(s)
            print(f"{s} {m.GetNumAtoms()}")

CCCCCCCCCCCCCCCCOCC(COP(=O)(O)OCC[N+](C)(C)C)OC(=O)CCC 37
O 1
CCCC(=O)O 6
CCCCCCCCCCCCCCCCOCC(O)COP(=O)(O)OCC[N+](C)(C)C 32
[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:15][CH2:17][CH2:19][CH2:23][CH2:24][CH2:25][CH2:26][CH2:27][CH2:28][CH2:29][CH2:30][CH2:31][CH3:32])([CH2:4][O:6][P:8](=[O:10])([OH:11])[O:12][CH2:14][CH2:16][N+:18]([CH3:20])([CH3:21])[CH3:22])[O:33][C:34](=[O:35])[CH2:36][CH2:37][CH3:38] 37
[OH2:2] 1
[CH:1]([OH:2])([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:15][CH2:17][CH2:19][CH2:23][CH2:24][CH2:25][CH2:26][CH2:27][CH2:28][CH2:29][CH2:30][CH2:31][CH3:32])[CH2:4][O:6][P:8](=[O:10])([OH:11])[O:12][CH2:14][CH2:16][N+:18]([CH3:20])([CH3:21])[CH3:22] 32
[OH:33][C:34](=[O:35])[CH2:36][CH2:37][CH3:38] 6


In [33]:
# Re-check every flagged reaction against the 'smarts' string stored in sprhea
# (instead of the staged one): pair the recorded per-molecule reaction-center maxima
# with the molecules parsed from that string and report any index that is still out
# of range.
for idx in issues:
    smarts = sprhea[idx_feature[idx]]['smarts']
    lhs, rhs = [side.split('.') for side in smarts.split('>>')]
    mols = [Chem.MolFromSmiles(s) for s in lhs + rhs]
    for rc_max, mol in zip(issues[idx][1], mols):
        # MolFromSmiles returns None for strings RDKit cannot parse; report instead of
        # crashing on mol.GetNumAtoms().
        if mol is None:
            print(f"idx {idx} with smarts {smarts} has issue: RDKit could not parse one of its molecules")
            continue
        n_atoms = mol.GetNumAtoms()
        # Indices are 0-based, so an index equal to the atom count is out of range.
        if rc_max >= n_atoms:
            print(f"idx {idx} with smarts {smarts} has issue: {rc_max} >= {n_atoms} for {Chem.MolToSmiles(mol)}")

In [24]:
# All staged rows that belong to reaction_idx 440 (the reaction inspected above).
is_rxn_440 = staged_on_quest['reaction_idx'] == 440
staged_on_quest[is_rxn_440]

Unnamed: 0,protein_idx,reaction_idx,pid,rid,protein_embedding,smarts,reaction_center,y
948,558,440,Q8VDG7,9888,"[0.07856997, 0.24768348, 0.07567654, 0.0232599...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",1
949,559,440,P79106,9888,"[0.09574465, 0.23234762, 0.058149666, 0.032877...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",1
950,560,440,Q99487,9888,"[0.092274666, 0.23454094, 0.05176036, 0.043576...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",1
951,561,440,P83006,9888,"[0.10112702, 0.24632981, 0.06338388, 0.0236249...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",1
10903,9971,440,P0DUN3,9888,"[0.16980408, 0.30399475, 0.009269477, -0.06162...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",0
...,...,...,...,...,...,...,...,...
93470,6924,440,A4TSJ5,9888,"[0.010831217, 0.090564646, 0.0048261955, 0.137...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",0
95428,333,440,Q9ERS7,9888,"[-0.052899245, 0.29523712, -0.069401555, 0.042...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",0
96850,3589,440,P27038,9888,"[0.07959237, 0.18772636, 0.08321738, -0.025589...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",0
97345,18501,440,Q9N0A4,9888,"[0.032802057, 0.27320108, 0.026131785, 0.06960...",[CH:1]([CH2:3][O:5][CH2:7][CH2:9][CH2:13][CH2:...,"[[[18, 31], [0]], [[3, 5], [19]]]",0


In [1]:
from drfp import DrfpEncoder

# Two example reaction SMILES used to try out the DRFP encoder.
example_rxn_1 = "CO.O[C@@H]1CCNC1.[C-]#[N+]CC(=O)OC>>[C-]#[N+]CC(=O)N1CC[C@@H](O)C1"
example_rxn_2 = "CCOC(=O)C(CC)c1cccnc1.Cl.O>>CCC(C(=O)O)c1cccnc1"
rxn_smiles = [example_rxn_1, example_rxn_2]

# One fingerprint per input reaction.
fps = DrfpEncoder.encode(rxn_smiles)

In [2]:
# Display the encoded fingerprints (one array per reaction in rxn_smiles).
fps

[array([0, 0, 0, ..., 0, 0, 0], shape=(2048,), dtype=uint8),
 array([0, 0, 0, ..., 0, 0, 0], shape=(2048,), dtype=uint8)]

In [11]:
# Does atom mapping change the DRFP fingerprint? Encode the same transformation once
# with atom-map numbers and once without, then compare the fingerprints element-wise.
am_rxn = "[C:1][C:2]>>[C:1]=[C:2]"
rxn = "CC>>C=C"

am_fp, fp = (DrfpEncoder.encode(smi) for smi in (am_rxn, rxn))
(am_fp[0] == fp[0]).all()

np.False_

In [1]:
# Scratch check: built-in max() on a plain Python list.
foo = list(range(1, 4))
max(foo)

3