# New code

## Imports and definitions

In [82]:
import os
CWD = os.path.abspath("")
os.chdir(CWD)
from collections import defaultdict
import pandas as pd
import csv
from utils import load_json, save_json, sanitize

from minedatabase.pickaxe import Pickaxe
from minedatabase.utils import get_compound_hash

from rdkit.Chem import CanonSmiles, Draw
from rdkit import Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
from pathway_utils import get_stoich_pk, get_reverse_paths_to_starting
import numpy as np
import PIL
from pathway_utils import create_graph_from_pickaxe

In [83]:
# Define classes for pathway and reaction entries

class pathway:
    def __init__(self, rhashes, starter_hash=None, target_hash=None, prc_mcs=None, dG=None):
        self.starter = starter_hash
        self.target = target_hash
        self.rhashes = rhashes # Tuple of reaction hash ids
        self.prc_mcs = prc_mcs # Peri-rxn-ctr MCS score ave over reactions
        self.dG = dG # Placeholder for thermo

class reaction:
    def __init__(self, rid, smarts, rules=[], known_rxns=[]):
        self.rid = rid
        self.smarts = smarts
        self.rules = rules
        self.known_rxns = known_rxns

def rxn_hash_2_rxn_sma(rhash, pk):
    '''
    Make reaction smarts string for
    reaction indexed by rhash in a pk
    object
    '''
    rxn_stoich = get_stoich_pk(rhash, pk)
    products = ".".join([".".join([smi]*stoich) for smi, stoich in rxn_stoich.items() if stoich >= 0])
    reactants = ".".join([".".join([smi]*abs(stoich)) for smi, stoich in rxn_stoich.items() if stoich <= 0])
    rxn_sma = ">>".join([reactants, products])
    return rxn_sma

def make_rxn_sma(rxn_entry):
    '''
    Convert our standard rxn json
    entry into a reaction smarts
    '''
    reactants = sanitize(list(rxn_entry[0].values()))
    products = sanitize(list(rxn_entry[1].values()))
    sma = ".".join(reactants) + ">>" + ".".join(products)
    return sma

# Pathway drawing functions

def draw_rxn(rxn_sma):
    return Draw.ReactionToImage(
        AllChem.ReactionFromSmarts(rxn_sma, useSmiles=True),
        subImgSize=(200, 200), useSVG=False, drawOptions=None, returnPNG=False
    )

def get_concat_h(im1, im2):
    dst = PIL.Image.new('RGB', (im1.width + im2.width, max(im1.height, im2.height)))
    dst.paste(im1, (0, 0))
    dst.paste(im2, (im1.width, 0))
    return dst

def get_concat_v(im1, im2):
    dst = PIL.Image.new('RGB', (max(im1.width, im2.width), im1.height + im2.height))
    dst.paste(im1, (0, 0))
    dst.paste(im2, (0, im1.height))
    return dst

def draw_pathway(pred_known_pairs):
    for i, elt in enumerate(pred_known_pairs):
        if i == 0:
            img = get_concat_h(*elt)
        else:
            img = get_concat_v(img, get_concat_h(*elt))

    return img

## Main

In [100]:
# Set params

expansion_dir = '../../data_from_quest/bottle/'
fn = "methylene_molecules_to_mvacid_gen_4_tan_sample_1_n_samples_1000.pk" # Expansion file name
generations = 4

# Load results
pk = Pickaxe()
path = expansion_dir + fn
pk.load_pickled_pickaxe(path)

----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading ../../data_from_quest/bottle/methylene_molecules_to_mvacid_gen_4_tan_sample_1_n_samples_1000.pk pickled data.
Loaded 146255 compounds
Loaded 155116 reactions
Loaded 3604 operators
Loaded 1 targets
Took 8.9239182472229


In [101]:
# Create the initial graph

DG, rxn, edge = create_graph_from_pickaxe(pk, "Biology")
starting_nodes = []
bad_nodes = []
for n in DG.nodes():
    try:
        if DG.nodes()[n]["Type"] == "Starting Compound":
            starting_nodes.append(n)
    except:
        bad_nodes.append(n)

RDKit ERROR: [11:52:10] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[11:52:10] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [11:52:10] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[11:52:10] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [11:52:11] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[11:52:11] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [11:52:15] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[11:52:15] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [11:52:22] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[11:52:22] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [11:52:24] Can't kekulize mol.  Unkek

In [102]:
# Get pathways
max_depth = generations * 2
paths = []

# Specify Targets / Starting Cpds
target_smi = pk.target_smiles[0]
target_cids = [get_compound_hash(smi)[0] for smi in pk.target_smiles]
starting_cpds = [get_compound_hash(val["SMILES"])[0] for val in pk.compounds.values() if val["Type"].startswith("Start")]

# Loop through targets and get pathways from targets to starting compounds
for this_target in target_cids:
    this_paths = get_reverse_paths_to_starting(DG, begin_node=this_target, end_nodes=starting_cpds, max_depth=max_depth)
    # If we find paths then reverse those paths and assign to a dictionary
    if this_paths:
        this_paths = list(set([tuple(path[1::2]) for path in [[*reversed(ind_path)] for ind_path in this_paths]]))
        for elt in this_paths:
            for r in pk.reactions[elt[0]]["Reactants"]:
                if r[-1] in starting_cpds:
                    paths.append(pathway(rhashes=elt, starter_hash=r[-1], target_hash=this_target)) 

In [103]:
# Make predicted reaction dict

pred_rxns = {}
degen_rhashes = defaultdict(lambda : 1)
for elt in paths:
    for this_rhash in elt.rhashes:
        if this_rhash not in pred_rxns:
            rxn_sma = rxn_hash_2_rxn_sma(this_rhash, pk)
            pred_rxns[this_rhash] = reaction(this_rhash, rxn_sma)
        else:
            degen_rhashes[this_rhash] += 1


RDKit ERROR: [11:55:41] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
[11:55:41] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: 
RDKit ERROR: [11:55:41] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
[11:55:41] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: 
RDKit ERROR: [11:55:42] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
[11:55:42] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: 
RDKit ERROR: [11:55:42] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[11:55:42] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34

RDKit ERROR: [11:55:43] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34
RDKit ERROR: 
[11:55:43] Can't kekulize mol.  Unkekulized atoms: 19 23 26 27 28 29 30 31 33 34



In [104]:
# Put paths in dict indexed by starter-target pairs
# target_id_2_name = {'Cec77ea281f69ca989bbef0a4c7794128a196c716': '2-methyleneglutarate',
#                     'C3a5b833f1a5abe2063ff8b22706fabb74097bead': '4-methyleneglutamate'}
target_id_2_name = {'C6ec1611229ff4fc7a19244967c7716266fc021a1': 'mvacid'}

st_paths = defaultdict(list)
for elt in paths:
    t_name = target_id_2_name[elt.target]
    s_name = pk.compounds[elt.starter]["ID"]
    st_paths[(s_name, t_name)].append(elt)

for k,v in st_paths.items():
    print(k, len(v))

('2-methyleneglutarate', 'mvacid') 21443
('4-methylene-l-glutarate', 'mvacid') 991


In [105]:
# Load in IMT rule mapping

# Load rules
rules_path = '../src/rules/JN3604IMT_rules.tsv'
rule_df = pd.read_csv(rules_path, delimiter='\t')
rule_df.set_index('Name', inplace=True)

# Load mapping
rxn2rule = {}
db_names = ['_mc_v21', '_brenda', '_kegg']
suffix = '_imt_rules_enforce_cof.csv'
for name in db_names:
    mapping_path = '../data/mapping/mapping' + name + suffix
    with open(mapping_path, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) == 1:
                rxn2rule[row[0]] = []
            else:
                rxn2rule[row[0]] = row[1:]

# Make rule2rxn
rule2rxn = {}
for k,v in rxn2rule.items():
    for elt in v:
        if elt not in rule2rxn:
            rule2rxn[elt] = [k]
        else:
            rule2rxn[elt].append(k)

# Load all known reaction json entries into dict
known_rxns = {}
pref = '../data/mapping/'
suffs = ['mc_v21_as_is.json', 'brenda_as_is.json', 'kegg_as_is.json']
for elt in suffs:
    known_rxns.update(load_json(pref + elt))

In [106]:
# Populate reaction objects in rxn dict w/ known reactions

for k, v in pred_rxns.items():
    this_rules = list(pk.reactions[k]["Operators"])
    this_known_rxns = []
    for elt in this_rules:
        if elt in rule2rxn:
            this_rxn_ids = rule2rxn[elt]
            for this_id in this_rxn_ids:
                this_sma = make_rxn_sma(known_rxns[this_id])
                this_known_rxns.append((-1, this_sma, this_id))
    
    v.known_rxns = list(set(this_known_rxns))


In [115]:
pred_known_pairs = []
example_path = st_paths[('2-methyleneglutarate', 'mvacid')][3].rhashes
rng = np.random.default_rng(seed=1208)
for elt in example_path:
    this_pred = pred_rxns[elt]
    this_idx = rng.integers(0, len(this_pred.known_rxns) - 1)
    pred_img = draw_rxn(this_pred.smarts)
    known_img = draw_rxn(this_pred.known_rxns[this_idx][1])
    pred_known_pairs.append([pred_img, known_img])

path_img = draw_pathway(pred_known_pairs)
path_img.save('../artifacts/test_concat.png')
    

In [116]:
print(this_pred.smarts)
print(this_pred.known_rxns[this_idx][1])

C=C(CCC(O)O)C(=O)O.NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)C=CC1>>C=C(CCCO)C(=O)O.NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)c1.O
CC(=CCNc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O)CO.NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)C=CC1>>CC(C)=CCNc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O.NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)c1.O


In [120]:
foo = get_concat_v(*pred_known_pairs[-1])
foo.save('../artifacts/example_pred_known_pair.png')

In [121]:
this_pred.known_rxns[this_idx]

(-1,
 'CC(=CCNc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O)CO.NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)C=CC1>>CC(C)=CCNc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O.NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)c1.O',
 'RXN-4304_reverse')

In [92]:
for elt in pred_known_pairs:
    print(elt[0].width, elt[1].width)

for elt in pred_known_pairs:    
    print(elt[0].height, elt[1].height)

800 800
800 800
800 800
1600 1600
200 200
200 200
200 200
200 200
