# Reaction-UniProt data cleanup

In [None]:
'''
TODO: 

    - Fix up cpd hash
    - Fix up reaction hash
    - Bring in sanitization and neutralization
    - Take as input brenda and mc uniprot, fold entries in on 
    unique hash id - count how many entries, how many unique reactions
'''

'\nTODO: \n\n    - Fix up cpd hash\n    - Fix up reaction hash\n    - Bring in sanitization and neutralization\n    - Take as input brenda and mc uniprot, fold entries in on \n    unique hash id - count how many entries, how many unique reactions\n'

In [1]:
%load_ext autoreload
%autoreload 2
from src.utils import load_json, save_json, get_compound_hash, get_reaction_hash
from minedatabase.utils import postsanitize_smiles, neutralise_charges
import pandas as pd
import csv
from collections import defaultdict
from collections import namedtuple, Counter
from rdkit import Chem
import numpy as np

In [2]:
# Lightweight record types: a (database, identifier) pair and an
# (identifier, sequence) pair.
DatabaseId = namedtuple("DatabaseId", ["db", "id"])
Enzyme = namedtuple("Enzyme", ["id", "sequence"])

def extract_info(entry):
    """Turn one raw reaction entry into its hash, UniProt ids and reaction string.

    Args:
        entry: A 2- or 3-item sequence. Items 0 and 1 are mappings whose
            values are the reactant and product SMILES (the keys are not
            used). The optional item 2 is a comma-separated string of
            UniProt ids; an empty value means "no ids".

    Returns:
        A tuple ``(rhash, uniprot, sma)`` where ``rhash`` is the value
        returned by ``get_reaction_hash``, ``uniprot`` is a list of id
        strings (possibly empty) and ``sma`` is ``"<reactants>>><products>"``
        with each side sorted and dot-joined.

    Raises:
        ValueError: If ``entry`` does not contain exactly 2 or 3 items.
    """
    # Pull entry and uniprot ids
    n_fields = len(entry)
    if n_fields == 3:
        rdict, pdict, uniprot = entry
        uniprot = uniprot.split(',') if uniprot else []
    elif n_fields == 2:
        rdict, pdict = entry
        uniprot = []
    else:
        # Fail loudly here; otherwise rdict/pdict would be unbound below and
        # surface as a confusing UnboundLocalError.
        raise ValueError(
            f"Reaction entry must have 2 or 3 items, got {n_fields}: {entry!r}"
        )

    # Smiles
    reactants, products = list(rdict.values()), list(pdict.values())
    reactants, products = sanitize(reactants, products)
    reactants, products = neutralize(reactants, products)
    # Sort for consistent ordering of substrates on each side
    sma = ".".join(sorted(reactants)) + ">>" + ".".join(sorted(products))

    # Inchi (smiles if pathological) hashes, paired with how many times each
    # compound occurs on its side of the reaction.
    reactants_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in reactants]).items())
    products_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in products]).items())
    rhash = get_reaction_hash(reactants_hash_stoich, products_hash_stoich)

    return rhash, uniprot, sma
    
def sanitize(reactants, products):
    """Run ``postsanitize_smiles`` on each side of a reaction.

    Only the first element of what ``postsanitize_smiles`` returns is kept
    for each side. Returns a ``(reactants, products)`` tuple.
    """
    clean_reactants = postsanitize_smiles(reactants)[0]
    clean_products = postsanitize_smiles(products)[0]
    return clean_reactants, clean_products

def neutralize(reactants, products):
    """Pass every SMILES on both sides of a reaction through ``neutralise_charges``.

    Each string is parsed with RDKit sanitization switched off, handed to
    ``neutralise_charges`` and written back out as SMILES. Returns a
    ``(reactants, products)`` tuple of new lists.
    """
    def _neutral_smiles(smiles):
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        return Chem.MolToSmiles(neutralise_charges(mol))

    neutral_reactants = [_neutral_smiles(smi) for smi in reactants]
    neutral_products = [_neutral_smiles(smi) for smi in products]
    return neutral_reactants, neutral_products

In [3]:
# Add info from JNI's "uniprot" jsons

brenda_uniprot = load_json('../data/mapping/brenda_uniprot.json')
mc_uniprot = load_json('../data/mapping/metacyc_uniprot.json')
# combo_dataset[reaction_hash][db_id] -> {'smarts': ..., 'uniprot_ids': [...]}
combo_dataset = defaultdict(lambda : defaultdict(lambda : {'smarts':None, 'uniprot_ids':[]}))
bad_reactions = []

# Add metacyc first
for reaction_key, reaction_entry in mc_uniprot.items():
    rhash, uniprot_ids, smarts = extract_info(reaction_entry)
    record = combo_dataset[rhash][f"metacyc:{reaction_key}"]
    record['uniprot_ids'] = uniprot_ids
    record['smarts'] = smarts

# Zero-pad width of the per-entry reaction index in brenda keys. This was
# previously recomputed on every iteration as floor(log10(25)) + 1, which is
# always 2; it is kept fixed so the generated keys do not change.
n_digits = 2

# Add brenda. There are multiple reactions per entry
# so we have to iterate through them all
for k, v in brenda_uniprot.items():
    for i, reaction_entry in enumerate(v):
        reaction_key = f"{k}_{i:0{n_digits}}"
        rhash, uniprot_ids, smarts = extract_info(reaction_entry)
        record = combo_dataset[rhash][f"brenda:{reaction_key}"]
        record['uniprot_ids'] = uniprot_ids
        record['smarts'] = smarts

[19:38:48] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:38:48] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:38:48] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:38:48] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:38:48] Explicit valence for atom # 16 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 16 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 16 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 25 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 25 N, 4, is greater than permitted
[19:38:49] Explicit valence for atom # 16 

In [4]:
# Merge in JNI's reaction jsons

db_names = ["mc_v21", "brenda", "kegg"]
mc_rxns, brenda_rxns, kegg_rxns = [load_json(f"../data/mapping/{elt}_as_is.json") for elt in db_names]
db_new_names_to_rxns = {"metacyc":mc_rxns, "brenda":brenda_rxns, "kegg":kegg_rxns}

for new_name, rxns in db_new_names_to_rxns.items():
    for reaction_key, reaction_entry in rxns.items():
        rhash, uniprot_ids, smarts = extract_info(reaction_entry)
        db_id = f"{new_name}:{reaction_key}"
        # Indexing the defaultdict creates the per-hash dict when the hash is new.
        known_ids = combo_dataset[rhash]

        # Only add db ids not already recorded under this hash. The earlier
        # nested check ran only when the hash was new (so it was always true)
        # and the else-branch overwrote unconditionally, replacing records
        # loaded before this cell, including their uniprot ids.
        if db_id in known_ids:
            continue

        record = known_ids[db_id]
        record['uniprot_ids'] = uniprot_ids
        record['smarts'] = smarts

[19:43:19] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:43:19] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:43:19] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:43:19] Explicit valence for atom # 26 O, 3, is greater than permitted
[19:43:19] Explicit valence for atom # 16 N, 4, is greater than permitted
[19:43:19] Explicit valence for atom # 16 N, 4, is greater than permitted
[19:43:19] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:43:19] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:43:19] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:43:19] Explicit valence for atom # 9 N, 4, is greater than permitted
[19:43:20] Explicit valence for atom # 16 N, 4, is greater than permitted
[19:43:20] Explicit valence for atom # 25 N, 4, is greater than permitted
[19:43:20] Explicit valence for atom # 25 N, 4, is greater than permitted
[19:43:20] Explicit valence for atom # 16 

In [34]:
# Add in mapped rules
# Each row pairs a rule name ("Name") with a ';'-separated list of database
# ids ("Comments"); invert that into db id -> list of rule names.
df_opt = pd.read_csv('../data/mapping/intermediate_rules_optimized_rxns.tsv', sep='\t')
db_ids_to_rules = defaultdict(list)
rules_to_db_ids = defaultdict(list)
nrxns = 0
for rule_name, comment_field in zip(df_opt["Name"], df_opt["Comments"]):
    mapped_ids = comment_field.split(';')
    nrxns += len(mapped_ids)
    for mapped_id in mapped_ids:
        db_ids_to_rules[mapped_id].append(rule_name)

# Tag every known reaction record with the rules mapped to its db id
# (an empty list when none were mapped).
for provenance in combo_dataset.values():
    for db_id, record in provenance.items():
        record['imt_rules'] = db_ids_to_rules.get(db_id, [])

In [57]:
# Spot-check one merged entry: the 31st reaction hash in insertion order.
list(combo_dataset.values())[30]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'metacyc:6PFRUCTPHOS-RXN': {'smarts': 'Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O.O=P(O)(O)OCC1OC(O)(COP(=O)(O)O)C(O)C1O>>Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O.O=P(O)(O)OCC1OC(O)(CO)C(O)C1O',
              'uniprot_ids': [],
              'imt_rules': ['rule0014_08']},
             'metacyc:TAGAKIN-RXN': {'smarts': 'Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O.O=P(O)(O)OCC1OC(O)(COP(=O)(O)O)C(O)C1O>>Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O.O=P(O)(O)OCC1OC(O)(CO)C(O)C1O',
              'uniprot_ids': [],
              'imt_rules': ['rule0014_08']},
             'brenda:2.7.1.11_05': {'smarts': 'Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O.O=P(O)(O)OCC1OC(O)(COP(=O)(O)O)C(O)C1O>>Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O.O=P(O)(O)OCC1OC(O)(CO)C(O)C1O',
              'uniprot_ids': [],
              'imt_rules': ['rule0014_08']},
             'brenda:2.7.1

In [44]:
# Number of db ids recorded under each unique reaction hash
degen = [len(provenance) for provenance in combo_dataset.values()]
# Total number of (hash, db id) records
ct_combo = sum(degen)

print(ct_combo)
print(sum(degen) / len(degen))

# Number of db ids that have at least one mapped rule
map_ct = sum(1 for rules in db_ids_to_rules.values() if len(rules) > 0)

print(map_ct)
print(len(combo_dataset))

108454
2.1129186230006427
65819
51329


In [58]:
# Write the merged reaction dataset (hash -> db id -> record) to disk.
save_json(combo_dataset , "../data/mapping/known_rxns_w_provenance_all_info_jni.json")