# Reaction-UniProt data cleanup

In [1]:
%load_ext autoreload
%autoreload 2
from src.utils import load_json, save_json, get_compound_hash, get_reaction_hash
from minedatabase.utils import postsanitize_smiles, neutralise_charges
import pandas as pd
import csv
from collections import defaultdict
from collections import namedtuple, Counter
from rdkit import Chem
import numpy as np

In [2]:
# Lightweight record types: a (db, id) pair and an (id, sequence) pair.
DatabaseId = namedtuple("DatabaseId", ["db", "id"])
Enzyme = namedtuple("Enzyme", ["id", "sequence"])

def extract_info(entry):
    """Convert one raw reaction entry into (reaction hash, UniProt ids, reaction SMILES).

    Args:
        entry: Sequence with 2 or 3 items. The first two are mappings whose
            values are the reactant and product SMILES strings (only the values
            are used). The optional third item is a comma-separated string of
            UniProt ids; an empty string means "no ids".

    Returns:
        Tuple ``(rhash, uniprot, sma)``:
            rhash   -- value returned by ``get_reaction_hash`` for the
                       (compound hash, count) pairs of both sides.
            uniprot -- list of UniProt id strings (possibly empty).
            sma     -- reaction string of the form ``r1.r2>>p1.p2`` built from
                       the sanitized, neutralized SMILES, each side sorted.

    Raises:
        ValueError: If ``entry`` has neither 2 nor 3 items.
    """
    # Pull entry and uniprot ids
    if len(entry) == 3:
        rdict, pdict, uniprot = entry
        uniprot = [] if uniprot == '' else uniprot.split(',')
    elif len(entry) == 2:
        rdict, pdict = entry
        uniprot = []
    else:
        # Fail early with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Expected a reaction entry with 2 or 3 items, got {len(entry)}"
        )

    # Smiles
    reactants, products = list(rdict.values()), list(pdict.values())
    reactants, products = sanitize(reactants, products)
    reactants, products = neutralize(reactants, products)
    sma = ".".join(sorted(reactants)) + ">>" + ".".join(sorted(products)) # Sort for consistent ordering of subs

    # Inchi (smiles if pathological) hashes
    # Counter collapses repeated compounds into (hash, count) pairs.
    reactants_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in reactants]).items())
    products_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in products]).items())
    rhash = get_reaction_hash(reactants_hash_stoich, products_hash_stoich)

    return rhash, uniprot, sma
    
def sanitize(reactants, products):
    """Run ``postsanitize_smiles`` on each side and keep element 0 of each result.

    Returns a ``(reactants, products)`` tuple of the two selected results.
    """
    sanitized_reactants = postsanitize_smiles(reactants)[0]
    sanitized_products = postsanitize_smiles(products)[0]
    return sanitized_reactants, sanitized_products

def neutralize(reactants, products):
    """Return charge-neutralized SMILES for every reactant and product.

    Each SMILES is parsed with RDKit (sanitization disabled), passed through
    ``neutralise_charges`` and written back out as SMILES.

    Returns a ``(reactants, products)`` tuple of two new lists.
    """
    def _neutral_smiles(smiles):
        # Parse without sanitization, neutralize, then re-serialize.
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        return Chem.MolToSmiles(neutralise_charges(mol))

    neutral_reactants = list(map(_neutral_smiles, reactants))
    neutral_products = list(map(_neutral_smiles, products))
    return neutral_reactants, neutral_products

In [12]:
# Add info from JNI's "uniprot" jsons

brenda_uniprot = load_json('../data/mapping/brenda_uniprot.json')
mc_uniprot = load_json('../data/mapping/metacyc_uniprot.json')
# Two-level mapping: reaction hash -> "<db>:<reaction key>" -> record.
# Missing records are created on first access with empty defaults.
combo_dataset = defaultdict(lambda : defaultdict(lambda : {'smarts':None, 'uniprot_ids':[]}))
bad_reactions = []

# Add metacyc first
for reaction_key, reaction_entry in mc_uniprot.items():
    rhash, uniprot_ids, smarts = extract_info(reaction_entry)
    record = combo_dataset[rhash][f"metacyc:{reaction_key}"]
    record['uniprot_ids'] = uniprot_ids
    record['smarts'] = smarts

# Add brenda. There are multiple reactions per entry
# so we have to iterate through them all
for k, v in brenda_uniprot.items():
    n_reactions = len(v)
    # Zero-pad the index to the number of decimal digits in n_reactions.
    # len(str(...)) is exact, whereas a floating-point log10 can round at
    # powers of ten and is undefined for an empty list.
    n_digits = len(str(n_reactions))

    for i, reaction_entry in enumerate(v):
        reaction_key = f"{k}_{i:0{n_digits}}"
        rhash, uniprot_ids, smarts = extract_info(reaction_entry)
        record = combo_dataset[rhash][f"brenda:{reaction_key}"]
        record['uniprot_ids'] = uniprot_ids
        record['smarts'] = smarts

[00:29:46] Explicit valence for atom # 26 O, 3, is greater than permitted
[00:29:46] Explicit valence for atom # 26 O, 3, is greater than permitted
[00:29:46] Explicit valence for atom # 26 O, 3, is greater than permitted
[00:29:46] Explicit valence for atom # 26 O, 3, is greater than permitted
[00:29:47] Explicit valence for atom # 16 N, 4, is greater than permitted
[00:29:47] Explicit valence for atom # 16 N, 4, is greater than permitted
[00:29:48] Explicit valence for atom # 9 N, 4, is greater than permitted
[00:29:48] Explicit valence for atom # 9 N, 4, is greater than permitted
[00:29:48] Explicit valence for atom # 9 N, 4, is greater than permitted
[00:29:48] Explicit valence for atom # 9 N, 4, is greater than permitted
[00:29:49] Explicit valence for atom # 16 N, 4, is greater than permitted
[00:29:49] Explicit valence for atom # 25 N, 4, is greater than permitted
[00:29:49] Explicit valence for atom # 25 N, 4, is greater than permitted
[00:29:49] Explicit valence for atom # 16 

In [18]:
# Reaction hashes to spot-check in the combined dataset.
test = [
    'R990062b1988c86d1788aab855eb0df7750b9bf7acbb248c6dd00eb47c8b54645',
    'R1ca59f9a2867ce1705f95de667a054f208b7b6dba57a695ff50a3caba7ede5ac',
    'R5b17a55315f739ed8f272010497ddc0c37aa25cc08d0bdcaeec288027e858ef0',
    'R6626fae30c91234170d590ff079cdcf4a41332a68b4169e0c36925ae5f496a33',
    'R24a34a7ccc0c4dbb87b9527f1d2d5b11a688379acb615383f35ed291e52ec7a8',
    'R005c025192f48b62dedd9e39c427109396fbce25b151c2ef7063f9c61ae8e76a',
    'R597e29a7deabe2bca8775aae13b963da21b46949cedc69e14b8f98b4a7b1cea7',
    'Rbe31215472e97731f6ca6aac6210c8842f28caa396d89a597fff9d2ab0cda7b5',
    'Rc9089af4bbea206d489119f5552563239510f97fcb66c348d5bfcc9a9f9b6f23',
    'R1990c25752953f62d36f0043f21484bad084bda667ed6e8e93e1c3a6fa3b448e',
    'Rcea5ce4a6b69d13df46f6cc5606580e0427f61ca2e146336a446d6c744b6ffb1',
    'R18118a0354016fe8fb6381a635e1a75c51955e9602a87911ca0d32e65d80790f',
    'R2562a1bebff80cd5ef88e0b3be7f3bce18e01f7f781a275579cf70f1c0f885c6',
    'R7faff8144ef65afd9b0972fa167b27e086800ce2cbc9972558722f572e874b2a',
    'Rcf6131392ecec55b16fbc4d3e0ab5aa43a807bcaf9570b26758ffe235a38106f',
    'R2f1225e5a411a049ca96bead24f0df0c2c3f64c7cf1defcd1d6f778a08f0b014',
    'R3422c75c7f445fb66cde2b9705f265f618c38b34e1f785c9ed38dd19d8ef43ab',
    'R0baba3e49d7fdfd788b8e5a0a29dc0adde930b40eaefeab6e4c5358785b5f120',
    'R80b64be81be0f6131ef5c7a653fb189af5b884cb35bec0754de947fcab190a24',
    'R60882074f917b0b652359e728a089487cf4a1003a8657d4d490dde7448368370',
    'Re81ebb3e4ba09a713693c65d6c244613575fd5cda10f11c4ffc6564534790e5d',
    'R5e8a5f40641db9bf1cbe81ce54dcb4f304a26f2c46c2c543c8ab543a9d0a3c3e',
    'Rc4a36c5d9838b94f303932aa8fe7c73cafd53773e1bb02e60bb6a8c91688c593',
]

# combo_dataset is a defaultdict, so indexing a hash that is absent would
# silently insert an empty entry into the dataset. .get() reads without
# mutating and prints None for hashes that are not present.
for elt in test:
    print(combo_dataset.get(elt))

defaultdict(<function <lambda>.<locals>.<lambda> at 0x7fd4b3e41e60>, {'metacyc:MALONATE-COA-TRANSFERASE-RXN_reverse': {'smarts': 'CC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O.O=C(O)CC(=O)O>>CC(=O)O.CC(C)(COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O)C(O)C(=O)NCCC(=O)NCCSC(=O)CC(=O)O', 'uniprot_ids': []}, 'brenda:2.8.3.3_1': {'smarts': 'CC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O.O=C(O)CC(=O)O>>CC(=O)O.CC(C)(COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O)C(O)C(=O)NCCC(=O)NCCSC(=O)CC(=O)O', 'uniprot_ids': []}})
defaultdict(<function <lambda>.<locals>.<lambda> at 0x7fd4b3517170>, {'metacyc:2.8.3.10-RXN_reverse': {'smarts': 'CC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O.O=C(O)CC(O)(CC(=O)O)C(=O)O>>CC(=O)O.CC(C)(COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O)C(O)C(=O)NCCC(=O)NCCSC(=O)CC(O)(CC(=O)O)C(=O)O', 

In [26]:
# Inspect one reaction. .get() is used instead of indexing because
# combo_dataset is a defaultdict: indexing a missing hash would insert an
# empty entry into the dataset as a side effect.
combo_dataset.get('R5b17a55315f739ed8f272010497ddc0c37aa25cc08d0bdcaeec288027e858ef0')

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'metacyc:RXN-1082_reverse': {'smarts': 'CC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O.O=C(O)CCC(O)C(=O)O>>CC(=O)O.CC(C)(COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O)C(O)C(=O)NCCC(=O)NCCSC(=O)C(O)CCC(=O)O',
              'uniprot_ids': ['Q59111'],
              'imt_rules': ['rule0152_2']},
             'brenda:2.8.3.12_00': {'smarts': 'CC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O.O=C(O)CCC(O)C(=O)O>>CC(=O)O.CC(C)(COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O)C(O)C(=O)NCCC(=O)NCCSC(=O)C(O)CCC(=O)O',
              'uniprot_ids': ['Q59111'],
              'imt_rules': ['rule0152_2']},
             'kegg:R04000': {'smarts': 'CC(=O)SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32)C(O)C1OP(=O)(O)O.O=C(O)CCC(O)C(=O)O>>CC(=O)O.CC(C)(COP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N)ncnc32

In [23]:
# Merge in JNI's reaction jsons

db_names = ["mc_v21", "brenda", "kegg"]
mc_rxns, brenda_rxns, kegg_rxns = (
    load_json(f"../data/mapping/{elt}_as_is.json") for elt in db_names
)
db_new_names_to_rxns = dict(metacyc=mc_rxns, brenda=brenda_rxns, kegg=kegg_rxns)

for new_name, rxns in db_new_names_to_rxns.items():
    for reaction_key, reaction_entry in rxns.items():
        rhash, uniprot_ids, smarts = extract_info(reaction_entry)
        provenance_key = f"{new_name}:{reaction_key}"

        # Records already present for this hash are left untouched; only
        # new (hash, source) combinations are added.
        if rhash in combo_dataset and provenance_key in combo_dataset[rhash]:
            continue

        record = combo_dataset[rhash][provenance_key]
        record['uniprot_ids'] = uniprot_ids
        record['smarts'] = smarts

[01:11:38] Explicit valence for atom # 26 O, 3, is greater than permitted
[01:11:38] Explicit valence for atom # 26 O, 3, is greater than permitted
[01:11:38] Explicit valence for atom # 26 O, 3, is greater than permitted
[01:11:38] Explicit valence for atom # 26 O, 3, is greater than permitted
[01:11:39] Explicit valence for atom # 16 N, 4, is greater than permitted
[01:11:40] Explicit valence for atom # 16 N, 4, is greater than permitted
[01:11:40] Explicit valence for atom # 9 N, 4, is greater than permitted
[01:11:40] Explicit valence for atom # 9 N, 4, is greater than permitted
[01:11:40] Explicit valence for atom # 9 N, 4, is greater than permitted
[01:11:40] Explicit valence for atom # 9 N, 4, is greater than permitted
[01:11:41] Explicit valence for atom # 16 N, 4, is greater than permitted
[01:11:41] Explicit valence for atom # 25 N, 4, is greater than permitted
[01:11:41] Explicit valence for atom # 25 N, 4, is greater than permitted
[01:11:41] Explicit valence for atom # 16 

In [25]:
# Add in mapped rules
df_opt = pd.read_csv('../data/mapping/intermediate_rules_optimized_rxns.tsv', sep='\t')
db_ids_to_rules = defaultdict(list)
rules_to_db_ids = defaultdict(list)
nrxns = 0
# "Comments" holds a ';'-separated list of database ids for the rule in "Name".
for rule, comments in zip(df_opt["Name"], df_opt["Comments"]):
    db_ids = comments.split(';')
    nrxns += len(db_ids)
    for elt in db_ids:
        db_ids_to_rules[elt].append(rule)

# Attach the mapped rules to every record. .get() keeps the lookup from
# adding keys to db_ids_to_rules for ids that have no rule.
for entry in combo_dataset.values():
    for db_id, record in entry.items():
        record['imt_rules'] = db_ids_to_rules.get(db_id, [])

In [27]:
# Number of source records per reaction hash, and their total.
degen = [len(records) for records in combo_dataset.values()]
ct_combo = sum(degen)

print(ct_combo)
print(sum(degen) / len(degen))

# Database ids that have at least one mapped rule.
map_ct = sum(1 for rules in db_ids_to_rules.values() if len(rules) > 0)

print(map_ct)
print(len(combo_dataset))

91034
1.773539324748193
65819
51329


In [28]:
# Write the combined dataset to disk.
save_json(
    combo_dataset,
    "../data/mapping/known_rxns_w_provenance_all_info_jni.json",
)