In [None]:
'''
Try to get all atom mappings from metAMDB
and match them up with reactions I have in my known reactions
swissprot-rhea file
'''

In [1]:
import os
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
import rdkit
print(rdkit.__version__)

from src.utils import save_json, get_compound_hash, get_reaction_hash, postsanitize_smiles, neutralise_charges, smarts_to_sub_smiles, sub_smiles_to_smarts
from src.post_processing import Enzyme, DatabaseEntry
from collections import Counter, defaultdict
import json

def sanitize(reactants, products):
    return postsanitize_smiles(reactants)[0], postsanitize_smiles(products)[0]

def neutralize(reactants, products):
    reactants = [Chem.MolToSmiles(neutralise_charges(Chem.MolFromSmiles(elt, sanitize=False))) for elt in reactants]
    products = [Chem.MolToSmiles(neutralise_charges(Chem.MolFromSmiles(elt, sanitize=False))) for elt in products]
    return reactants, products

def get_rhash(reactants, products):
    reactants_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in reactants]).items())
    products_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in products]).items())
    rhash = get_reaction_hash(reactants_hash_stoich, products_hash_stoich)
    return rhash

def read_rxn_file_custom(fn, sanitize=True):
    with open("../data/all_aams/46.rxn", 'r') as f:
        br = f.read()

    am_numbers_seen = []
    lhs = True
    reactants = []
    products= []
    for block in br.split("$MOL\n")[1:]:
        mol = Chem.MolFromMolBlock(block, sanitize=sanitize)

        if lhs: # If still processing reactants
            # Check if any repeated atom map numbers
            for atom in mol.GetAtoms():
                am_number = atom.GetAtomMapNum()
                if am_number in am_numbers_seen:
                    lhs = False # Move onto rhs
                    break
                else:
                    am_numbers_seen.append(am_number)
        
        if lhs:
            reactants.append(Chem.MolToSmarts(mol))

        else:
            products.append(Chem.MolToSmarts(mol))

    smarts = ".".join(reactants) + '>>' + ".".join(products)
    return smarts

def rxn_to_smiles(rxn):
    rs = [Chem.MolToSmiles(r) for r in rxn.GetReactants()]
    ps = [Chem.MolToSmiles(p) for p in rxn.GetProducts()]
    smiles = ".".join(rs) + ">>" + ".".join(ps)
    return smiles

2023.03.2


In [2]:
# Load am reactions

rxns = []
rxns_deamapped = []
bad_rxns = []
custom_rxns = []
dir = "../data/all_aams"
for i,fn in enumerate(sorted(os.listdir(dir), key=lambda x : int(x.split(".")[0]))):
    try:
        rxn = Chem.rdChemReactions.ReactionFromRxnFile(f"{dir}/{fn}", sanitize=True)
        rxn_dam = Chem.rdChemReactions.ReactionFromRxnFile(f"{dir}/{fn}", sanitize=True)
    except:
        try:
            smarts = read_rxn_file_custom(f"{dir}/{fn}")
            rxn = AllChem.ReactionFromSmarts(smarts)
            custom_rxns.append(rxn)
        except:
            bad_rxns.append(fn)
        else:
            rxns.append(rxn)
            rxns_deamapped.append(rxn)
    else:
        rxns.append(rxn)
        rxns_deamapped.append(rxn_dam)
    


[17:40:20] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 13 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 10 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 22 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26 N, 4, is greater than permitted
[17:40:20] Explicit valence for atom # 26

In [3]:
# Strip atom map numbers
for rxn in rxns_deamapped:

    for r in rxn.GetReactants():
        for atom in r.GetAtoms():
            atom.SetAtomMapNum(0)

    for p in rxn.GetProducts():
        for atom in p.GetAtoms():
            atom.SetAtomMapNum(0)


In [4]:
assert len(rxns) == len(rxns_deamapped)
n_rxns = len(rxns)

In [5]:
# Get reaction smarts
smarts = []
deamapped_smarts = []
bad_rxns = []
bad_dam = []

for i in range(n_rxns):
    try:
        sma = AllChem.ReactionToSmiles(rxns[i])
        dam_sma = AllChem.ReactionToSmiles(rxns_deamapped[i])
    except:
        bad_rxns.append(i)
    else:
        smarts.append(sma)
        deamapped_smarts.append(dam_sma)

In [6]:
assert len(smarts) == len(deamapped_smarts)

In [7]:
len(smarts), len(deamapped_smarts)

(49928, 49928)

In [8]:
# Sanitize and remove stereochem from deamapped smarts

san_smarts_metamdb = []
rhashes_metamdb = []
for sma in deamapped_smarts:
    reactants, products = smarts_to_sub_smiles(sma)
    reactants = [elt for elt in reactants if elt != '[H+]'] # Remove protons
    products = [elt for elt in products if elt != '[H+]']
    reactants, products = sanitize(reactants, products)
    reactants, products = neutralize(reactants, products)
    san_sma = ".".join(reactants) + '>>' + ".".join(products)
    rhash = get_rhash(reactants, products)

    san_smarts_metamdb.append(san_sma)
    rhashes_metamdb.append(rhash)


[17:41:47] Can't kekulize mol.  Unkekulized atoms: 16 20 23 24 25 26 27 28 30 31
[17:41:49] Can't kekulize mol.  Unkekulized atoms: 16 20 23 24 25 26 27 28 30 31
[17:41:55] Can't kekulize mol.  Unkekulized atoms: 1 5 6 10 11 12
[17:41:55] Can't kekulize mol.  Unkekulized atoms: 1 5 6 10 11 12
[17:41:56] Can't kekulize mol.  Unkekulized atoms: 16 20 23 24 25 26 27 28 30 31
[17:41:58] Can't kekulize mol.  Unkekulized atoms: 16 20 23 24 25 26 27 28 30 31
[17:41:59] Can't kekulize mol.  Unkekulized atoms: 2 5 6 7 8 9 10 12 13 24
[17:42:04] Can't kekulize mol.  Unkekulized atoms: 2 4 5 6 7 8
[17:42:04] Can't kekulize mol.  Unkekulized atoms: 1 5 6 7 8 9
[17:42:04] Can't kekulize mol.  Unkekulized atoms: 3 5 6 12
[17:42:05] Can't kekulize mol.  Unkekulized atoms: 1 5 6 7 8 9
[17:42:06] Can't kekulize mol.  Unkekulized atoms: 16 20 23 24 25 26 27 28 30 31
[17:42:06] Can't kekulize mol.  Unkekulized atoms: 2 5 6 7 8 9 10 12 13 24
[17:42:07] Can't kekulize mol.  Unkekulized atoms: 16 20 23 24 2

In [9]:
# Load known reactions
with open("../data/mapping/known_rxns_swissprot_enzymes_240310.json", 'r') as f:
    known_rxns = json.load(f)

# Check for hash matches
n_kr = len(known_rxns)
matches = defaultdict(set)
for i, rhash in enumerate(rhashes_metamdb):
    if rhash in known_rxns:
        matches[rhash].add(smarts[i])

print(f"{len(matches) / n_kr}")

0.1916887449710529
