In [11]:
%load_ext autoreload
%autoreload 2

from src.rxn_ctr_mcs import *
from src.utils import load_json, rxn_entry_to_smarts
from src.pathway_utils import get_reverse_paths_to_starting, create_graph_from_pickaxe
from src.post_processing import *

from minedatabase.pickaxe import Pickaxe
from minedatabase.utils import get_compound_hash

from rdkit.Chem import AllChem

import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import pandas as pd
import csv
import os
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Expansion parameters and input location

generations = 4  # NOTE(review): presumably matches the "gen_4" in the file name below -- confirm
expansion_dir = '../data/raw_expansions/'
fn = "succinate_to_mvacid_gen_4_tan_sample_n_samples_1000.pk" # Expansion file name
path = f"{expansion_dir}{fn}"

# Restore the pickled expansion into a fresh Pickaxe object
pk = Pickaxe()
pk.load_pickled_pickaxe(path)

----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading ../data/raw_expansions/succinate_to_mvacid_gen_4_tan_sample_n_samples_1000.pk pickled data.
Loaded 89758 compounds
Loaded 106853 reactions
Loaded 3604 operators
Loaded 1 targets
Took 5.896747827529907


In [3]:
# Create the initial graph and sort its nodes into starting compounds
# and nodes that carry no "Type" attribute at all.

DG, rxn, edge = create_graph_from_pickaxe(pk, "Biology")
starting_nodes = []
bad_nodes = []  # nodes whose attribute dict has no "Type" key
for n in DG.nodes():
    try:
        node_type = DG.nodes()[n]["Type"]
    except KeyError:
        # Only a missing "Type" marks a node as bad; any other exception
        # (including KeyboardInterrupt) now propagates instead of being
        # silently swallowed by a bare `except:`.
        bad_nodes.append(n)
        continue
    if node_type == "Starting Compound":
        starting_nodes.append(n)

RDKit ERROR: [10:39:11] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[10:39:11] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [10:39:12] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
[10:39:12] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: 
RDKit ERROR: [10:39:19] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[10:39:19] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [10:39:20] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
[10:39:20] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: 
RDKit ERROR: [10:39:20] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[10:39:20] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [10:39:20] Can't kekulize mol.  Unkek

In [4]:
# Get pathways
# NOTE(review): max_depth is twice the generation count, presumably because
# graph paths alternate compound and reaction nodes -- confirm against
# create_graph_from_pickaxe.
max_depth = generations * 2
paths = []

# Specify Targets / Starting Cpds
target_smi = pk.target_smiles[0]
target_cids = [get_compound_hash(smi)[0] for smi in pk.target_smiles]
starting_cpds = [get_compound_hash(val["SMILES"])[0] for val in pk.compounds.values() if val["Type"].startswith("Start")]

# Loop through targets and get pathways from targets to starting compounds
for this_target in target_cids:
    this_paths = get_reverse_paths_to_starting(DG, begin_node=this_target, end_nodes=starting_cpds, max_depth=max_depth)
    if not this_paths:
        continue

    # Reverse each path and keep every second node (indices 1, 3, ...),
    # which are used below as keys into pk.reactions.
    # De-duplicate with dict.fromkeys rather than set(): it keeps first-seen
    # order, so `paths` no longer depends on per-process string hash
    # randomization and the notebook is reproducible across kernel restarts.
    this_paths = list(dict.fromkeys(
        tuple([*reversed(ind_path)][1::2]) for ind_path in this_paths
    ))

    for elt in this_paths:
        if not elt:
            # A path with no reaction node has no first reaction to inspect;
            # previously this raised IndexError on elt[0].
            continue
        # Record one pathway per reactant of the first reaction that is a
        # starting compound (the last item of each reactant entry is its id).
        for r in pk.reactions[elt[0]]["Reactants"]:
            if r[-1] in starting_cpds:
                paths.append(pathway(rhashes=elt, starter_hash=r[-1], target_hash=this_target))

In [5]:
# Build the predicted-reaction dictionary, one entry per distinct reaction hash

pred_rxns = {}
# Repeat counter: a hash only appears here once it has been seen a second
# time, at which point its count becomes 2 (the default of 1 plus one).
degen_rhashes = defaultdict(lambda : 1)
all_rhashes = (rhash for this_path in paths for rhash in this_path.rhashes)
for rhash in all_rhashes:
    if rhash in pred_rxns:
        degen_rhashes[rhash] += 1
        continue
    pred_rxns[rhash] = reaction(rhash, rxn_hash_2_rxn_sma(rhash, pk))


In [6]:
# Group the pathways under (starter name, target name) keys
# Mapping used for a different pair of targets:
# target_id_2_name = {'Cec77ea281f69ca989bbef0a4c7794128a196c716': '2-methyleneglutarate',
#                     'C3a5b833f1a5abe2063ff8b22706fabb74097bead': '4-methyleneglutamate'}
target_id_2_name = {'C6ec1611229ff4fc7a19244967c7716266fc021a1': 'mvacid'}

st_paths = defaultdict(list)
for this_path in paths:
    target_name = target_id_2_name[this_path.target]
    starter_name = pk.compounds[this_path.starter]["ID"]
    pair = (starter_name, target_name)
    st_paths[pair].append(this_path)

# Report how many pathways were found for each pair
for pair, pair_paths in st_paths.items():
    print(pair, len(pair_paths))

('succinate', 'mvacid') 41


In [7]:
# Load in IMT rule mapping

# Load rules, indexed by rule name (chained instead of inplace=True so the
# cell produces the indexed frame in one step)
rules_path = '../src/rules/JN3604IMT_rules.tsv'
rule_df = pd.read_csv(rules_path, delimiter='\t').set_index('Name')

# Load mapping: first CSV column is the reaction id, the remaining columns
# are the rules mapped to it (none when the row has a single column).
rxn2rule = {}
db_names = ['_mc_v21', '_brenda', '_kegg']
suffix = '_imt_rules_enforce_cof.csv'
for name in db_names:
    mapping_path = '../data/mapping/mapping' + name + suffix
    # newline='' is what the csv module requires for files handed to csv.reader
    with open(mapping_path, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # Blank lines come back as []; row[0] used to raise IndexError
                continue
            rxn2rule[row[0]] = row[1:]  # [] for a single-column row

# Make rule2rxn: invert rxn2rule into rule -> list of reaction ids
rule2rxn = {}
for k, v in rxn2rule.items():
    for elt in v:
        rule2rxn.setdefault(elt, []).append(k)

# Load all known reaction json entries into dict (later files override
# earlier ones on key collisions, as dict.update does)
known_rxns = {}
pref = '../data/mapping/'
suffs = ['mc_v21_as_is.json', 'brenda_as_is.json', 'kegg_as_is.json']
for elt in suffs:
    known_rxns.update(load_json(pref + elt))

In [8]:
# Populate reaction objects in rxn dict w/ known reactions
#
# Each predicted reaction gets `known_rxns`: a list of
# [prc_mcs placeholder (None), reaction SMARTS, known reaction id] entries,
# one per distinct known reaction mapped to any of its operators.

for k, v in pred_rxns.items():
    this_known_rxns = []
    # sorted() fixes the iteration order of the operators.
    # NOTE(review): assumes operator names are mutually comparable
    # (e.g. all strings) -- confirm.
    for elt in sorted(pk.reactions[k]["Operators"]):
        for this_id in rule2rxn.get(elt, []):
            this_sma = rxn_entry_to_smarts(known_rxns[this_id])
            this_known_rxns.append((None, this_sma, this_id))

    # De-duplicate with dict.fromkeys instead of set(): first-seen order is
    # kept, so the per-entry index `z` used by later cells is reproducible
    # across kernel restarts rather than dependent on hash randomization.
    v.known_rxns = [list(elt) for elt in dict.fromkeys(this_known_rxns)]


In [9]:
# Populate pred_rxns, known rxn prc-mcs slot
#
# For every (predicted reaction, known reaction) pair: atom-map both
# reactions, extract reaction-center sub-molecules, align the substrates,
# and store the result of get_prc_mcs in slot 0 of the known-reaction entry.
# Pairs that fail any step are skipped and keep None in that slot.


def _side_counts(rxn_sma):
    """Return the number of '.'-separated species on each side of a 'lhs>>rhs' string."""
    return tuple(len(side.split('.')) for side in rxn_sma.split('>>'))


# enumerate over items() instead of rebuilding list(pred_rxns.keys()) on
# every iteration; only list elements are mutated, never the dict itself.
for x, (h, test_rxn) in enumerate(pred_rxns.items()):
    rxn_sma1 = test_rxn.smarts
    n_known = len(test_rxn.known_rxns)
    a = 0 # Number of known rxns analyzed
    for z, kr in enumerate(test_rxn.known_rxns):

        rxn_sma2 = kr[1]

        # Catch stoichiometry mismatches stemming from pickaxe, early post-processing
        if _side_counts(rxn_sma2) != _side_counts(rxn_sma1):
            print(x, z, 'stoich_error')
            continue

        # Skip pred reactions that trigger RXNMapper atom mapping errors.
        # `except Exception` (not a bare except) so that interrupting the
        # kernel still stops this long-running loop.
        try:
            am_rxn_smarts = [atom_map(rxn_sma1), atom_map(rxn_sma2)] # Atom map
        except Exception:
            continue

        # Construct reaction objects
        rxns = []
        for elt in am_rxn_smarts:
            temp = AllChem.ReactionFromSmarts(elt, useSmiles=True)
            temp.Initialize()
            rxns.append(temp)

        rc_atoms = [elt.GetReactingAtoms() for elt in rxns] # Get reaction center atom idxs

        # Construct rxn ctr mol objs
        rcs = []
        for i, t_rxn in enumerate(rxns):
            temp = []
            for j, t_mol in enumerate(t_rxn.GetReactants()):
                temp.append(get_sub_mol(t_mol, rc_atoms[i][j]))
            rcs.append(temp)

        # Align substrates of the 2 reactions
        rc_idxs = [] # Each element: (idx for rxn 1, idx for rxn 2)
        remaining = [list(range(len(elt))) for elt in rcs]
        while remaining[0] and remaining[1]:
            idx_pair = align_substrates(rcs, remaining)

            if idx_pair is None:
                break

            rc_idxs.append(idx_pair)
            remaining[0].remove(idx_pair[0])
            remaining[1].remove(idx_pair[1])

        # Skip if you haven't aligned every reactant pred to known
        if len(rc_idxs) < len(rxn_sma1.split('>>')[0].split('.')):
            continue

        rxns = align_atom_map_nums(rxns, rcs, rc_idxs, rc_atoms)

        # Compute MCS seeded by reaction center
        prc_mcs = get_prc_mcs(rxns, rcs, rc_idxs, rc_atoms)
        kr[0] = prc_mcs # kr is pred_rxns[h].known_rxns[z], so this updates pred_rxns

        a += 1

    # Report the fraction of known reactions analyzed. The count comes from
    # len(known_rxns) rather than the loop variable `z`, which is undefined
    # (or stale from the previous reaction) when there are no known reactions.
    print(x, ':', a / n_known if n_known else 0.0, 'out of', n_known)

Token indices sequence length is longer than the specified maximum sequence length for this model (1465 > 512). Running this sequence through the model will result in indexing errors


0 : 0.15 out of 40
1 : 0.6086956521739131 out of 23
2 : 1.0 out of 24
3 : 1.0 out of 18
4 : 1.0 out of 24
5 : 0.6086956521739131 out of 23
6 : 0.13636363636363635 out of 88
7 : 1.0 out of 18
8 : 0.6086956521739131 out of 23
9 : 1.0 out of 9
10 : 1.0 out of 18
11 : 1.0 out of 9
12 : 0.15 out of 40
13 : 1.0 out of 20
14 : 0.13636363636363635 out of 88
15 : 0.9310344827586207 out of 29
16 : 1.0 out of 20
17 : 0.13636363636363635 out of 88
18 : 0.15 out of 40
19 : 0.723404255319149 out of 47
20 : 0.0 out of 9
21 : 0.15 out of 40
22 : 0.0 out of 9
23 : 0.0 out of 88
24 : 0.0 out of 9
25 : 0.13636363636363635 out of 88
26 : 0.15 out of 40
27 : 0.13636363636363635 out of 88
28 : 0.0 out of 40
29 : 0.15 out of 40
30 : 0.723404255319149 out of 47
31 : 0.9310344827586207 out of 29
32 : 0.13636363636363635 out of 88


In [12]:
# Save reactions dict and paths list (ultimately will replace with expansion object)
#
# Both outputs are pickled into save_dir, named after the expansion file `fn`.

save_dir = '../data/processed_expansions/'
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError after the expensive cells above have already run.
os.makedirs(save_dir, exist_ok=True)

rxns_fn = 'predicted_reactions_' + fn
paths_fn = 'paths_' + fn
rxns_path = os.path.join(save_dir, rxns_fn)
paths_path = os.path.join(save_dir, paths_fn)

with open(rxns_path, 'wb') as f:
    pickle.dump(pred_rxns, f)

with open(paths_path, 'wb') as f:
    pickle.dump(paths, f)