# Make rule 2 uniprot dict

In [14]:
%load_ext autoreload
%autoreload 2
from src.utils import load_json, get_compound_hash, get_reaction_hash
from minedatabase.utils import postsanitize_smiles, neutralise_charges
import pandas as pd
import csv
from collections import defaultdict

In [21]:
brenda_uniprot = load_json('../data/mapping/brenda_uniprot.json')
mc_uniprot = load_json('../data/mapping/metacyc_uniprot.json')
key_repeats = set(brenda_uniprot.keys()) & set(mc_uniprot.keys())
print(len(key_repeats))
rxn2uniprot = defaultdict(list)

# Add metacyc first
for k,v in mc_uniprot.items():
    uniprot_str = v[-1]
    if uniprot_str != '':
        rxn2uniprot[k] += uniprot_str.split(',')

# Add brenda. There are multiple reactions per entry
# so we have to iterate through them all
for k, v in brenda_uniprot.items():
    n_reactions = len(v)
    n_digits = n_reactions // 10 + 1

    for i in range(n_reactions):
        reaction_key = f"{k}_{i:0{n_digits}}"
        uniprot_str = v[i][-1]
        if uniprot_str != '':
            uniprot_ids = uniprot_str.split(',')
            rxn2uniprot[reaction_key] += uniprot_ids

0
33712


In [62]:
# Load rules
rules_path = '../data/rules/JN3604IMT_rules.tsv'
rule_df = pd.read_csv(rules_path, delimiter='\t')
rule_df.set_index('Name', inplace=True)

# Load mapping
rxn2rule = {}
db_names = ['_mc_v21', '_brenda', '_kegg']
suffix = '_imt_rules_enforce_cof.csv'
for name in db_names:
    mapping_path = '../data/mapping/mapping' + name + suffix
    with open(mapping_path, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) == 1:
                rxn2rule[row[0]] = []
            else:
                rxn2rule[row[0]] = row[1:]

# Make rule2rxn
rule2rxn = defaultdict(list)
for k,v in rxn2rule.items():
    for elt in v:
        rule2rxn[elt].append(k)

# Load all known reaction json entries into dict
known_rxns = {}
pref = '../data/mapping/'
suffs = ['mc_v21_as_is.json', 'brenda_as_is.json', 'kegg_as_is.json']
for elt in suffs:
    known_rxns.update(load_json(pref + elt))

In [65]:
rule2uniprot = defaultdict(list)
for rule in rule_df.index:
    rxns = rule2rxn[rule]
    uniprot_ids = []
    for rxn in rxns:
        uniprot_ids += rxn2uniprot[rxn]

    rule2uniprot[rule] += uniprot_ids

In [None]:
sorted([(len(rule2uniprot[k]), k) for k in rule_df.index], reverse=True)

# Reaction-enzyme data clean up

In [None]:
'''
TODO: 

    - Fix up cpd hash
    - Fix up reaction hash
    - Bring in sanitization and neutralization
    - Take as input brenda and mc uniprot, fold entries in on 
    unique hash id - count how many entries, how many unique reactions
'''

In [68]:
%load_ext autoreload
%autoreload 2
from src.utils import load_json, get_compound_hash, get_reaction_hash
from minedatabase.utils import postsanitize_smiles, neutralise_charges
import pandas as pd
import csv
from collections import defaultdict
from collections import namedtuple, Counter
from rdkit import Chem

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [105]:
DatabaseId = namedtuple("DatabaseId", "db id")
Enzyme = namedtuple("Enzyme", "id sequence")

def extract_info(entry):
    rdict, pdict, uniprot = entry
    reactants, products = list(rdict.values()), list(pdict.values())
    reactants, products = sanitize(reactants, products)
    reactants, products = neutralize(reactants, products)
    reactants_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in reactants]))
    products_hash_stoich = list(Counter([get_compound_hash(elt)[0] for elt in products]))
    rhash = get_reaction_hash(reactants_hash_stoich, products_hash_stoich)
    uniprot = uniprot.split(',')
    return rhash, uniprot
    
def sanitize(reactants, products):
    return postsanitize_smiles(reactants)[0], postsanitize_smiles(products)[0]

def neutralize(reactants, products):
    reactants = [Chem.MolToSmiles(neutralise_charges(Chem.MolFromSmiles(elt, sanitize=False))) for elt in reactants]
    products = [Chem.MolToSmiles(neutralise_charges(Chem.MolFromSmiles(elt, sanitize=False))) for elt in products]
    return reactants, products

In [110]:
brenda_uniprot = load_json('../data/mapping/brenda_uniprot.json')
mc_uniprot = load_json('../data/mapping/metacyc_uniprot.json')
combo_dataset = defaultdict(lambda : defaultdict(lambda : list))
bad_reactions = []

# Add metacyc first
for reaction_key, reaction_entry in mc_uniprot.items():
    rhash, uniprot_ids = extract_info(reaction_entry)
    combo_dataset[rhash][f"metacyc:{reaction_key}"] = uniprot_ids

# Add brenda. There are multiple reactions per entry
# so we have to iterate through them all
for k, v in brenda_uniprot.items():
    n_reactions = len(v)
    n_digits = n_reactions // 10 + 1

    for i in range(n_reactions):
        reaction_key = f"{k}_{i:0{n_digits}}"
        reaction_entry = v[i]
        rhash, uniprot_ids = extract_info(reaction_entry)
        combo_dataset[rhash][f"brenda:{reaction_key}"] = uniprot_ids

[15:54:51] Explicit valence for atom # 26 O, 3, is greater than permitted
[15:54:51] Explicit valence for atom # 26 O, 3, is greater than permitted
[15:54:51] Explicit valence for atom # 26 O, 3, is greater than permitted
[15:54:51] Explicit valence for atom # 26 O, 3, is greater than permitted
[15:54:51] Explicit valence for atom # 16 N, 4, is greater than permitted
[15:54:51] Explicit valence for atom # 16 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 9 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 9 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 9 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 9 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 16 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 25 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 25 N, 4, is greater than permitted
[15:54:52] Explicit valence for atom # 16 

In [112]:
print(len(combo_dataset), len(mc_uniprot) + len(brenda_uniprot))

19671 26474


In [109]:
combo_dataset

defaultdict(<function __main__.<lambda>()>,
            {'Re699206523c93afd277d90c19eabd26461d8015849ee637d10a6c628eb808165': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'metacyc:RXN-12199': ['A0JND9',
                           'B3A0N5',
                           'E0D877',
                           'F4JSH1',
                           'O18956',
                           'O75355',
                           'O80612',
                           'O93295',
                           'O96559',
                           'P40009',
                           'P49961',
                           'P50635',
                           'P55772',
                           'P80595',
                           'P97687',
                           'Q2QYE1',
                           'Q5DRK1',
                           'Q5MY95',
                           'Q6NQA8',
                           'Q6Z4P2',
                           'Q8H1D8',
              

In [80]:
from rdkit.Chem import AllChem
inchi = AllChem.MolToInchi(AllChem.MolFromSmiles(reactants[2], sanitize=False))
compound = inchi.rsplit("-", 3 - 1)[0]
compound

'InChI=1S/H3O4P/c1'

In [89]:
mol = Chem.MolFromSmiles(products[0], sanitize=False)
inchi = AllChem.MolToInchi(mol)

[15:33:44] Can't kekulize mol.  Unkekulized atoms: 3 5 6 12


KekulizeException: Can't kekulize mol.  Unkekulized atoms: 3 5 6 12

In [67]:
get_compound_hash(reactants[0])[0]

'C1f96bafc8e1f5cceccd44de5cef3adfa3b2533b7'

In [46]:
Chem.MolFromSmiles('CC(=O)NC1CCN2O[Fe]345(ON(CCCC6NC(=O)C(CCCN(O3)C(=O~4)/C=C(\\C)CCOC(=O)C1)NC6=O)C(/C=C(/C)CCO)=O~5)~O=C2/C=C(\\C)CCO')

[14:31:09] Explicit valence for atom # 26 O, 3, is greater than permitted
