In [17]:
import csv
import json
import numpy as np

Count the number of reactants prescribed by a SMARTS string.

In [16]:
rules_path = 'minimal1224_all_uniprot.tsv'

# Read in rules, keeping the first three tab-separated columns of every row.
# Later code passes column 2 to count_reactants() and splits column 1 on ';'
# to obtain the expected number of reactants.
rules = []
# newline='' hands line-ending handling to the csv module, as its docs require
with open(rules_path, 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader, None) # Skip header (no-op when the file is empty)
    for row in reader:
        if not row: # csv.reader yields [] for blank lines; skip rather than crash
            continue
        # Rows with fewer than three columns still fail loudly here
        rules.append([row[0], row[1], row[2]])

def count_reactants(rule_smarts):
    '''
    Counts number of reactants in a SMARTS-
    encoded operator

    Reactants are the '.'-separated components on the left-hand
    side of '>>'. Pieces of a single compound may also be separated
    by '.', in which case they are wrapped in parentheses, e.g.
    '(A.B).C' describes two reactants. Such dots are recognised by
    tracking parenthesis depth: only a '.' found outside of every
    pair of parentheses separates two reactants.

    Args:
        rule_smarts (str): Reaction SMARTS of the form
            'reactants>>products'. If '>>' is absent the whole
            string is treated as the reactant side.

    Returns:
        int: Number of non-empty reactants. 0 if the reactant side
            is empty.
    '''
    reactants = rule_smarts.split('>>')[0].strip()

    n_reactants = 0
    depth = 0 # Current parenthesis nesting level
    has_content = False # Whether the current reactant has any characters yet

    for char in reactants:
        if char == '.' and depth == 0:
            # Top-level dot closes the current reactant.
            # Empty pieces (e.g. from '..') are not counted.
            if has_content:
                n_reactants += 1
            has_content = False
            continue

        if char == '(':
            depth += 1
        elif char == ')':
            # Clamp so a stray ')' cannot hide later top-level dots
            depth = max(depth - 1, 0)

        has_content = True

    # Account for the final reactant, which has no trailing dot
    if has_content:
        n_reactants += 1

    return n_reactants

# Reactant count parsed from the SMARTS (column 2) for every rule
reactant_counts = [count_reactants(rule[2]) for rule in rules]
# Expected count: number of ';'-separated entries in column 1
ground_truth = [len(rule[1].split(';')) for rule in rules]

# Indices of rules where the parsed count disagrees with the expected one
err_idx = [
    idx
    for idx, (counted, expected) in enumerate(zip(reactant_counts, ground_truth))
    if counted != expected
]

print(err_idx)

[]


In [23]:
pks_rxns_path = 'rxn_dict_metacyc_ids.json'
metacyc_rxns_path = 'metacyc_all_reactions_smiles_221214.json'

# Load the polyketide reactions
with open(pks_rxns_path, 'r') as pks_file:
    pk_rxn_dict = json.load(pks_file)

# Preview the first polyketide entry (nothing is printed if the dict is empty)
first_pk = next(iter(pk_rxn_dict.items()), None)
if first_pk is not None:
    pk_id, pk_rxn = first_pk
    print(pk_id, pk_rxn)

# Load all metacyc reactions
with open(metacyc_rxns_path, 'r') as metacyc_file:
    mc_rxn_dict = json.load(metacyc_file)

# Preview the first metacyc entry, showing only element 0 of its value
first_mc = next(iter(mc_rxn_dict.items()), None)
if first_mc is not None:
    mc_id, mc_rxn = first_mc
    print(mc_id, mc_rxn[0])

# Total number of metacyc reactions
print(len(mc_rxn_dict))

|4-COUMARATE--COA-LIGASE-RXN| [{'|COUMARATE|': 'C(=O)([O-])/C=C/C1(\\C=C/C(/O)=C\\C=1)', '|CO-A|': 'CC(C)(COP([O-])(=O)OP(OC[C@H]3(O[C@@H](N1(C2(\\N=C/N=C(C(\\N=C/1)=2)/N)))[C@H](O)[C@H](OP([O-])(=O)[O-])3))(=O)[O-])[C@@H](O)C(=O)NCCC(=O)NCCS', '|ATP|': 'C(OP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])[C@H]3(O[C@@H](N1(C2(\\C(\\N=C/1)=C(N)/N=C\\N=2)))[C@H](O)[C@H](O)3)'}, {'|P-COUMAROYL-COA|': 'CC(C)(COP(OP([O-])(OC[C@@H]1([C@@H](OP([O-])([O-])=O)[C@@H](O)[C@@H](O1)N2(C3(\\N=C/N=C(C(\\N=C/2)=3)/N))))=O)([O-])=O)[C@@H](O)C(=O)NCCC(=O)NCCSC(=O)/C=C/C4(\\C=C/C(/O)=C\\C=4)', '|AMP|': 'C(OP(=O)([O-])[O-])[C@H]3(O[C@@H](N1(C2(\\C(\\N=C/1)=C(N)/N=C\\N=2)))[C@H](O)[C@H](O)3)', '|PPI|': 'O=P(O)(OP([O-])([O-])=O)[O-]'}]
|RXN-16531| {'|ACETYL-COA|': 'CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(OP([O-])(OC[C@@H]1([C@@H](OP([O-])([O-])=O)[C@@H](O)[C@@H](O1)N2(C3(\\N=C/N=C(C(\\N=C/2)=3)/N))))=O)([O-])=O', '|MALONYL-COA|': 'CC(C)([C@@H](O)C(=O)NCCC(=O)NCCSC(=O)CC(=O)[O-])COP(=O)(OP(=O)(OC[C@@H]1([C@@H](OP([