In [17]:
from src.utils import load_json
import pandas as pd
import csv
from collections import defaultdict

In [60]:
brenda_uniprot = load_json('../data/mapping/brenda_uniprot.json')
mc_uniprot = load_json('../data/mapping/metacyc_uniprot.json')
key_repeats = set(brenda_uniprot.keys()) & set(mc_uniprot.keys())
print(len(key_repeats))
rxn2uniprot = defaultdict(list)

# Add metacyc first
for k,v in mc_uniprot.items():
    uniprot_str = v[-1]
    if uniprot_str != '':
        rxn2uniprot[k] += uniprot_str.split(',')

# Add brenda. There are multiple reactions per entry
# so we have to iterate through them all
for k, v in brenda_uniprot.items():
    n_reactions = len(v)
    n_digits = n_reactions // 10 + 1

    for i in range(n_reactions):
        reaction_key = f"k_{i:0{n_digits}}"
        uniprot_str = v[i][-1]
        if uniprot_str != '':
            uniprot_ids = uniprot_str.split(',')
            rxn2uniprot[reaction_key] += uniprot_ids

0


In [62]:
# Load rules
rules_path = '../data/rules/JN3604IMT_rules.tsv'
rule_df = pd.read_csv(rules_path, delimiter='\t')
rule_df.set_index('Name', inplace=True)

# Load mapping
rxn2rule = {}
db_names = ['_mc_v21', '_brenda', '_kegg']
suffix = '_imt_rules_enforce_cof.csv'
for name in db_names:
    mapping_path = '../data/mapping/mapping' + name + suffix
    with open(mapping_path, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) == 1:
                rxn2rule[row[0]] = []
            else:
                rxn2rule[row[0]] = row[1:]

# Make rule2rxn
rule2rxn = defaultdict(list)
for k,v in rxn2rule.items():
    for elt in v:
        rule2rxn[elt].append(k)

# Load all known reaction json entries into dict
known_rxns = {}
pref = '../data/mapping/'
suffs = ['mc_v21_as_is.json', 'brenda_as_is.json', 'kegg_as_is.json']
for elt in suffs:
    known_rxns.update(load_json(pref + elt))

In [65]:
rule2uniprot = defaultdict(list)
for rule in rule_df.index:
    rxns = rule2rxn[rule]
    uniprot_ids = []
    for rxn in rxns:
        uniprot_ids += rxn2uniprot[rxn]

    rule2uniprot[rule] += uniprot_ids

In [71]:
sorted([(len(rule2uniprot[k]), k) for k in rule_df.index], reverse=True)

[(5393, 'rule0349_1'),
 (5297, 'rule0087_1'),
 (5138, 'rule0086_1'),
 (3325, 'rule0001_22'),
 (2929, 'rule0001_15'),
 (2561, 'rule0006_179'),
 (2314, 'rule0194_2'),
 (2241, 'rule0029_5'),
 (2223, 'rule0081_5'),
 (2223, 'rule0080_6'),
 (2180, 'rule0029_1'),
 (2135, 'rule0071_5'),
 (1976, 'rule0099_1'),
 (1976, 'rule0098_1'),
 (1948, 'rule0006_174'),
 (1941, 'rule0007_098'),
 (1812, 'rule0169_1'),
 (1796, 'rule0014_20'),
 (1768, 'rule0006_108'),
 (1643, 'rule0015_18'),
 (1568, 'rule0129_09'),
 (1558, 'rule0098_3'),
 (1496, 'rule0020_02'),
 (1471, 'rule0006_098'),
 (1454, 'rule0038_4'),
 (1453, 'rule0268_2'),
 (1453, 'rule0267_2'),
 (1442, 'rule0155_1'),
 (1428, 'rule0015_34'),
 (1428, 'rule0014_29'),
 (1422, 'rule0033_05'),
 (1418, 'rule0034_06'),
 (1356, 'rule0028_47'),
 (1335, 'rule0302_1'),
 (1324, 'rule0028_42'),
 (1315, 'rule0015_16'),
 (1295, 'rule0348_1'),
 (1295, 'rule0347_1'),
 (1285, 'rule0303_2'),
 (1179, 'rule0070_4'),
 (1173, 'rule0097_3'),
 (1170, 'rule0195_4'),
 (1166, 'ru