In [1]:
import rdkit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import json
import ast
import itertools
from rdkit import Chem
from rdkit.Chem import AllChem
import copy

### Silence non-critical RDKit warnings to minimize unnecessary outputs
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

## Joseph's code

In [3]:
class MapRules:
    """Map intended reactants to database reactants described by the same reaction operators."""

    def __init__(self, atom_mapping_path=None, rules_path=None, molfiles_path=None, seed_dict=None,
                 cofactor_list_path=None, cofactor_pair_path=None, cofactors=True, start_name=None, start_smiles=None):
        self.atom_mapping_path = atom_mapping_path
        self.rules = pd.read_csv(rules_path, sep='\t', index_col=0)
        self.molfiles_path = molfiles_path
        if self.atom_mapping_path:
            self.reaction_df = self._create_metacyc_info(self.atom_mapping_path)
        if cofactors:
            self.cofactor_name_dict, self.cofactor_list_dict, self.cofactor_pair_dict = get_cofactors(cofactor_list_path, cofactor_pair_path)
        else:
            self.cofactor_name_dict = {}
            self.cofactor_list_dict = {}
            self.cofactor_pair_dict = {}
        self.seed_dict = seed_dict
        if start_name:
            self.start_name = start_name
        if start_smiles:
            self.start_smiles = start_smiles

    def _create_metacyc_info(self, atom_mapping_path):
        """Read metacyc atom mapping information and return as a dataframe, also takes in reaction.dat information"""

        # read reactions.dat, group info for each reaction
        with open(atom_mapping_path, 'r') as file:
            reactions = file.read().split('//')

        # read reactions.dat info for each reaction
        info_df = pd.DataFrame()

        for entry in reactions:

            # storing info for this reaction
            info_dict = {}

            # for each line
            lines = entry.split('\n')
            for line in lines:
                try:
                    line = line.split(' - ', 1)
                    if (line[0] not in info_dict) & (line[1] != 'PROTON') & (line[1] != '|Acceptor|') \
                            & (line[1] != '|Donor-H2|') & (line[1] != '|ETR-Quinones|') & (line[1] != '|ETR-Quinols|'):
                        # if left or right in atom-mappings
                        if line[0] == 'LEFT' or line[0] == 'RIGHT':
                            try:
                                if line[1] + ' ' in info_dict['ATOM-MAPPINGS']:
                                    info_dict[line[0]] = line[1]
                            except KeyError:
                                pass
                        else:
                            info_dict[line[0]] = line[1]
                except:
                    continue

            # process reaction information
            try:
                unique_id = info_dict['UNIQUE-ID']
            except KeyError:
                continue

            try:
                reaction_direction = info_dict['REACTION-DIRECTION'].replace('PHYSIOL-', '').replace('IRREVERSIBLE-',
                                                                                                     '')
                left = info_dict['LEFT'].lstrip('|').rstrip('|') + ' '
                right = info_dict['RIGHT'].lstrip('|').rstrip('|') + ' '
            except KeyError:
                reaction_direction = ''
                left = ''
                right = ''

            try:
                ec = info_dict['EC-NUMBER'].lstrip('EC-')
            except KeyError:
                ec = ''

            # add information to info_df
            info_add = pd.DataFrame([left, right, reaction_direction, ec], index=['Left', 'Right', 'Direction', 'EC'],
                                    columns=[unique_id])
            info_df = pd.concat([info_df, info_add], axis=1, sort=True)

        # read atom mapping, group info for each reaction
        with open(atom_mapping_path, 'r') as file:
            metacyc = file.read().split('//')

        # read atom mapping info for each reaction
        reaction_df = pd.DataFrame()
        for entry in metacyc:

            # storing info for this reaction
            reaction_dict = {}

            # for each line
            lines = entry.split('\n')
            for line in lines:
                try:
                    line = line.split(' - ', 1)
                    reaction_dict[line[0]] = line[1]
                except:
                    continue

            # solve for MetaCyc 19.0 ((
            try:
                am = reaction_dict['ATOM-MAPPINGS'].lstrip('(').split('(')
                am = '('.join(am[1:])

                if not am:  # skip if no am
                    continue

                # parse am, reactants, products
                temp = am.split(')) ((')
                product_temp = '(' + temp[1].rstrip(')') + ')'

                temp = temp[0].split(') (')
                am_temp = temp[0]
                reactant_temp = '(' + ') ('.join(temp[1:]).replace('(', '', 1).replace('(', '', 1) + ')'

                # duplicate mols
                if '((' in reactant_temp:  # reactant side
                    name_edit = reactant_temp
                    name_edit = name_edit.split('((')

                    for i in range(1, len(name_edit)):  # edit each duplicate mol
                        name_edit[i] = name_edit[i].replace(' ', ':', 1).replace(')', '', 1)

                    reactant_temp = '('.join(name_edit)

                if '((' in product_temp:  # product side
                    name_edit = product_temp
                    name_edit = name_edit.split('((')

                    for i in range(1, len(name_edit)):  # edit each duplicate mol
                        name_edit[i] = name_edit[i].replace(' ', ':', 1).replace(')', '', 1)

                    product_temp = '('.join(name_edit)

                reactant_temp = ''.join(re.findall(r'[A-Za-z0-9_:()\- ]', reactant_temp))
                product_temp = ''.join(re.findall(r'[A-Za-z0-9_:()\- ]', product_temp))

            except KeyError:
                continue

            # determine if UNIQUE-ID in atom-mapping is in reactions.dat
            try:
                info_dict = info_df[reaction_dict['UNIQUE-ID']]
            except KeyError:
                direction = '+-'
                ec = ''
                reaction_add = pd.DataFrame(
                    [reactant_temp, product_temp, am_temp, direction, ec],
                    index=['Reactants', 'Products', 'AM', 'Direction', 'EC'], columns=[reaction_dict['UNIQUE-ID']])
                reaction_df = pd.concat([reaction_df, reaction_add], axis=1, sort=True)
                continue

            # process direction info
            direction = info_dict.Direction
            ec = info_dict.EC

            if direction != '':  # if direction info is provided
                if direction == 'REVERSIBLE':  # if reversible
                    direction = '+-'
                elif direction == 'LEFT-TO-RIGHT':  # if left to right
                    if (info_dict.Left in reactant_temp) & (info_dict.Right in product_temp):
                        direction = '+'
                    elif (info_dict.Right in reactant_temp) & (info_dict.Left in product_temp):
                        direction = '-'
                elif direction == 'RIGHT-TO-LEFT':  # if right to left
                    if (info_dict.Left in reactant_temp) & (info_dict.Right in product_temp):
                        direction = '-'
                    elif (info_dict.Right in reactant_temp) & (info_dict.Left in product_temp):
                        direction = '+'
                else:
                    direction = '+-'
            else:  # if direction info not provided, default reversible
                direction = '+-'

            reaction_add = pd.DataFrame(
                [reactant_temp, product_temp, am_temp, direction, ec],
                index=['Reactants', 'Products', 'AM', 'Direction', 'EC'], columns=[reaction_dict['UNIQUE-ID']])
            reaction_df = pd.concat([reaction_df, reaction_add], axis=1, sort=True)

        print('Finished reading MetaCyc information.')

        return reaction_df

    def _read_cofactor_info(self, cofactor_list_path, cofactor_pair_path):
        """Read in cofactor information, store so that reactants can be accurately mapped later
        :return: cofactor_name_dict (all cofactors)
        :rtype: {name in rxn rule: [names of molfile]}
        :return: cofactor_list_dict (cofactor list)
        :rtype: {cofactor: [names of molfile]}
        :return: cofactor_pair_dict (cofactor pair)
        :rtype: {cofactor1___cofactor2: [name of molfile1___name of molfile2]}
        """

        cofactor_name_dict = {}
        cofactor_list_dict = {}
        cofactor_pair_dict = {}

        cofactor_list_df = pd.read_csv(cofactor_list_path)
        cofactor_pair_df = pd.read_csv(cofactor_pair_path)

        # for each list
        for i, current_df in cofactor_list_df.iterrows():
            try:
                cofactor_name_dict[current_df['replacement']].append(current_df['molfile'])
            except KeyError:
                cofactor_name_dict[current_df['replacement']] = [current_df['molfile']]
            try:
                cofactor_list_dict[current_df['replacement']].append(current_df['molfile'])
            except KeyError:
                cofactor_list_dict[current_df['replacement']] = [current_df['molfile']]

        # for each pair
        for i, current_df in cofactor_pair_df.iterrows():
            try:
                cofactor_name_dict[current_df['reactant_replacement']].append(current_df['reactant_molfile'])
            except KeyError:
                cofactor_name_dict[current_df['reactant_replacement']] = [current_df['reactant_molfile']]
            try:
                cofactor_pair_dict[
                    current_df['reactant_replacement'] + '___' + current_df['product_replacement']].append(
                    current_df['reactant_molfile'] + '___' + current_df['product_molfile'])
            except KeyError:
                cofactor_pair_dict[
                    current_df['reactant_replacement'] + '___' + current_df['product_replacement']] =\
                    [current_df['reactant_molfile'] + '___' + current_df['product_molfile']]

        # unique molfile name
        for k, v in cofactor_name_dict.items():
            cofactor_name_dict[k] = sorted(list(set(v)))
        for k, v in cofactor_pair_dict.items():
            cofactor_pair_dict[k] = sorted(list(set(v)))

        return cofactor_name_dict, cofactor_list_dict, cofactor_pair_dict

    def map_pickaxe_rules(self, lhs_dict, rhs_dict, rule_current, return_reaction_center=False):

        rxn_df = self.rules.loc[rule_current.split(';')[0]]
        rule = rxn_df['SMARTS']
        reactants = rxn_df['Reactants']
        products = rxn_df['Products']

        # remove cofactor, sanitize mols
        lhs_list, rhs_list = self._process_substrates(lhs_dict, rhs_dict, rule_current)

        # match index with pickaxe
        match_index = self._map_rules(rule, lhs_list, rhs_list, reactants, products, return_reaction_center)

        return match_index

    def _map_rules(self, rule, lhs, rhs, reactants, products, return_reaction_center):
        """Operator mapping"""

        rxn = Chem.rdChemReactions.ReactionFromSmarts(rule)
        reactants = reactants.split(';')
        cofactor_index_reactants = [i for i, r in enumerate(reactants) if r != 'Any']

        products = products.split(';')
        cofactor_index_products = [i for i, p in enumerate(products) if p != 'Any']

        # if number of reactants does not match reactant template
        if len(lhs) > reactants.count('Any'):
            repetitive_mols = set(lhs).intersection(set(rhs))

            while repetitive_mols:
                lhs.remove(sorted(repetitive_mols)[0])
                rhs.remove(sorted(repetitive_mols)[0])
                repetitive_mols = set(lhs).intersection(set(rhs))

        lhs_set = set()
        for lhs_perm in itertools.permutations(lhs):
            lhs_set.add(lhs_perm)

        for lhs_perm in lhs_set:
            lhs_temp = list(lhs_perm)

            for c in cofactor_index_reactants:
                if self.molfiles_path:
                    lhs_temp[c:c] = [Chem.MolToSmiles(Chem.MolFromMolFile(os.path.sep.join([self.molfiles_path, self.cofactor_name_dict[reactants[c]] + '.mol'])))]
                elif self.seed_dict:
                    lhs_temp[c:c] = [self.seed_dict[self.cofactor_name_dict[reactants[c]]]]

            # pruned MetaCyc
            try:
                lhs_tuple = tuple([Chem.MolFromSmiles(i) for i in lhs_temp])
                outputs = rxn.RunReactants(lhs_tuple)
            except:
                try:
                    lhs_tuple = tuple([Chem.MolFromSmiles(i, sanitize=False) for i in lhs_temp])
                    outputs = rxn.RunReactants(lhs_tuple)
                except:
                    continue

            # # pickaxe
            # lhs_tuple_list = []
            # for i in lhs_temp:
            #     try:
            #         temp_mol = Chem.MolFromSmiles(i)
            #         temp_mol = AllChem.AddHs(temp_mol)
            #         AllChem.Kekulize(temp_mol, clearAromaticFlags=True)
            #     except:
            #         temp_mol = Chem.MolFromSmiles(i, sanitize=False)
            #     lhs_tuple_list.append(temp_mol)
            # lhs_tuple = tuple(lhs_tuple_list)
            # outputs = rxn.RunReactants(lhs_tuple)

            for rxn_output in outputs:

                rhs_run = [Chem.MolToSmiles(rhs_mols) for rhs_mols in rxn_output]
                rhs_list = copy.deepcopy(rhs_run)

                for c in cofactor_index_products:
                    rhs_list.remove(rhs_run[c])

                # for all tautomer possibilities of clean rhs
                for rhs in postsanitize_smiles(rhs):
                    rhs = list(rhs)

                    for rhs_list in postsanitize_smiles(rhs_list):

                        # pruned MetaCyc
                        if sorted(list(rhs_list)) == sorted(rhs):
                            # lhs_index = [int(np.where(np.argsort(lhs) == i)[0]) for i in np.argsort(lhs_perm)]
                            # rhs_index = [int(np.where(np.argsort(rhs) == i)[0]) for i in np.argsort(rhs_list)]
                            lhs_index = [lhs.index(i) for i in lhs_perm]
                            rhs_index = [rhs.index(i) for i in rhs_list]

                            # return atom index of reaction center
                            if return_reaction_center:

                                # try to append lhs reactants
                                lhs_mols = []
                                for l in lhs_perm:
                                    lhs_mols.append(Chem.MolFromSmiles(l))
                                    if not lhs_mols[-1]:
                                        lhs_mols[-1] = Chem.MolFromSmiles(l, sanitize=False)

                                smarts_list, _ = get_smarts(rule)
                                smarts_list = [s for i, s in enumerate(smarts_list)
                                               if i not in cofactor_index_reactants]

                                # possible reaction center
                                temp_lhs_match = [Chem.MolFromSmiles(l, sanitize=False).GetSubstructMatches(
                                    Chem.MolFromSmarts(smarts_list[i])) for i, l in enumerate(lhs_perm)]
                                reaction_center_set = [set(itertools.chain(*l)) for l in temp_lhs_match]
                                lhs_all_matches = itertools.product(*temp_lhs_match)

                                # for all possible reaction centers
                                for lhs_match in lhs_all_matches:

                                    # iterate over all reactants
                                    for l_idx, match in enumerate(lhs_match):
                                        for protect in reaction_center_set[l_idx] - set(match):
                                            lhs_mols[l_idx].GetAtomWithIdx(protect).SetProp('_protected', '1')

                                    # add cofactors
                                    lhs_temp_mol = list(lhs_mols)
                                    for c in cofactor_index_reactants:
                                        if self.molfiles_path:
                                            lhs_temp_mol[c:c] = [Chem.MolFromMolFile(os.path.sep.join(
                                                [self.molfiles_path, self.cofactor_name_dict[reactants[c]] + '.mol']))]
                                        elif self.seed_dict:
                                            lhs_temp_mol[c:c] = [Chem.MolFromSmiles(
                                                self.seed_dict[self.cofactor_name_dict[reactants[c]]])]

                                    # for all possible reaction outcomes
                                    for rhs_rxn in rxn.RunReactants(tuple(lhs_temp_mol)):

                                        for rhs_smiles in postsanitize_smiles([Chem.MolToSmiles(r) for r in rhs_rxn]):

                                            # found match
                                            if tuple(r for i, r in enumerate(rhs_smiles)
                                                     if i not in cofactor_index_products) == rhs_list:
                                                return lhs_index, rhs_index, list(lhs_match)

                                    # else remove protection
                                    for l_idx, match in enumerate(lhs_match):
                                        for deprotect in reaction_center_set[l_idx] - set(match):
                                            lhs_mols[l_idx].GetAtomWithIdx(deprotect).ClearProp('_protected')

                            else:
                                return lhs_index, rhs_index

                        # # pickaxe
                        # if sorted(list(rhs_list)) == sorted(rhs):
                        #     lhs_index = [int(np.where(np.argsort(lhs) == i)[0]) for i in np.argsort(lhs_perm)]
                        #     rhs_index = [int(np.where(np.argsort(rhs) == i)[0]) for i in np.argsort(rhs_list)]
                        #     return lhs_index, rhs_index

        return None, None

    def _post_process(self, enzyme_list):
        """Post processing"""

        enzyme_list_temp = []

        non_orphan_flag = False

        for e in enzyme_list:
            if 'ENZRXN' in e:
                non_orphan_flag = True
                continue

        if non_orphan_flag:
            for e in enzyme_list:
                if 'ENZRXN' in e:
                    enzyme_list_temp.append(e)
            return enzyme_list_temp
        else:
            return enzyme_list

    def _process_substrates(self, lhs_dict_temp, rhs_dict_temp, rule_current):
        """Process substrates"""

        # check cofactor designation
        rule_reactant_names = self.rules.loc[rule_current, 'Reactants']
        rule_product_names = self.rules.loc[rule_current, 'Products']

        reactant_names, product_names = label_cofactor(sorted(lhs_dict_temp), sorted(rhs_dict_temp), self.cofactor_list_dict, self.cofactor_pair_dict)

        if sorted(rule_reactant_names.split(';')) != sorted(reactant_names.split(';')) \
                or sorted(rule_product_names.split(';')) != sorted(product_names.split(';')):
            raise ValueError('Cofactor designation error.')

        # create list from dict
        lhs_list = [lhs_dict_temp[k] for i, k in enumerate(sorted([k for k in lhs_dict_temp]))
                    if reactant_names.split(';')[i] == 'Any']
        rhs_list = [rhs_dict_temp[k] for i, k in enumerate(sorted([k for k in rhs_dict_temp]))
                    if product_names.split(';')[i] == 'Any']

        # sanitize
        for i, m in enumerate(lhs_list):
            try:
                temp_mol = Chem.MolFromSmiles(m)
                Chem.rdmolops.RemoveStereochemistry(temp_mol)
                lhs_list[i] = Chem.MolToSmiles(temp_mol)
            except:
                temp_mol = Chem.MolFromSmiles(m, sanitize=False)
                Chem.rdmolops.RemoveStereochemistry(temp_mol)
                lhs_list[i] = Chem.MolToSmiles(temp_mol)

        for i, m in enumerate(rhs_list):
            try:
                temp_mol = Chem.MolFromSmiles(m)
                Chem.rdmolops.RemoveStereochemistry(temp_mol)
                rhs_list[i] = Chem.MolToSmiles(temp_mol)
            except:
                temp_mol = Chem.MolFromSmiles(m, sanitize=False)
                Chem.rdmolops.RemoveStereochemistry(temp_mol)
                rhs_list[i] = Chem.MolToSmiles(temp_mol)

        return lhs_list, rhs_list
    
def get_cofactors(input_cofactor_list_path, input_cofactor_pair_path):
    """Get cofactor list & pairs"""

    # cofactor to cpd id dict
    cofactor_name_dict = {}

    # cofactor list name designation
    cofactor_list_dict = {}
    for k, v in pd.read_csv(input_cofactor_list_path, sep='\t', index_col=0).iterrows():
        cofactor_list_dict[k.upper()] = v['replacement']
        if v['replacement'] not in cofactor_name_dict:
            cofactor_name_dict[v['replacement']] = k

    # cofactor pair name designation
    cofactor_pair_dict = {}
    with open(input_cofactor_pair_path) as f:
        cofactor_pair_read_json = json.loads(f.read())
    for k, v in cofactor_pair_read_json.items():
        for pair in v:
            cofactor_pair_dict[(pair[1].upper(), pair[2].upper())] = k
            cofactor_pair_dict[(pair[2].upper(), pair[1].upper())] = '%s,%s' % (k.split(',')[1], k.split(',')[0])
            if k.split(',')[0] not in cofactor_name_dict:
                cofactor_name_dict[k.split(',')[0]] = pair[1]
                cofactor_name_dict[k.split(',')[1]] = pair[2]

    return cofactor_name_dict, cofactor_list_dict, cofactor_pair_dict

def label_cofactor(input_reactant_molfile, input_product_molfile, cofactor_list, cofactor_pair):  # label cofactors
    """Label cofactors & cofator pairs for product & reactant names"""

    # reactant & product names
    reactant_molfile = [':'.join(m.upper().split(':')[0:max(1, len(m.split(':')) - 1)]) for m in input_reactant_molfile]
    product_molfile = [':'.join(m.upper().split(':')[0:max(1, len(m.split(':')) - 1)]) for m in input_product_molfile]

    # new substrate labels
    reactant_names = ['Any'] * len(reactant_molfile)
    product_names = ['Any'] * len(product_molfile)

    # get cofactor pairs
    for i_lhs, lhs in enumerate(reactant_molfile):
        for i_rhs, rhs in enumerate(product_molfile):

            # skip if already assigned
            if product_names[i_rhs] != 'Any':
                continue

            # assign cofactor pair designation
            try:
                temp_pair = cofactor_pair[(lhs, rhs)]
                reactant_names[i_lhs] = temp_pair.split(',')[0]
                product_names[i_rhs] = temp_pair.split(',')[1]
                break
            except KeyError:
                continue

        # assign cofactor list if no cofactor pair assigned
        if reactant_names[i_lhs] == 'Any':
            try:
                reactant_names[i_lhs] = cofactor_list[lhs]
            except KeyError:
                continue

    # assign cofactor list for rhs
    for i_rhs, rhs in enumerate(product_molfile):

        # assign cofactor list if no cofactor pair assigned
        if product_names[i_rhs] == 'Any':
            try:
                product_names[i_rhs] = cofactor_list[rhs]
            except KeyError:
                continue

    return ';'.join(reactant_names), ';'.join(product_names)

def postsanitize_smiles(smiles_list):
    """Postsanitize smiles after running SMARTS.
    :returns tautomer list of list of smiles"""

    sanitized_list = []
    # tautomer_smarts = '[#6:1]1:[#6:2]:[#7H1X3:3]:[#6:4]:[#7H0X2:5]:1>>[#6:1]1:[#6:2]:[#7H0X2:3]:[#6:4]:[#7H1X3:5]:1'
    tautomer_smarts = '[#7H1X3&a:1]:[#6&a:2]:[#7H0X2&a:3]>>[#7H0X2:1]:[#6:2]:[#7H1X3:3]'

    for s in smiles_list:

        temp_mol = Chem.MolFromSmiles(s, sanitize=False)

        # # pickaxe
        # temp_mol = Chem.rdmolops.RemoveHs(temp_mol)

        aromatic_bonds = [i.GetIdx() for i in temp_mol.GetBonds() if i.GetBondType() == Chem.rdchem.BondType.AROMATIC]

        for i in temp_mol.GetBonds():
            if i.GetBondType() == Chem.rdchem.BondType.UNSPECIFIED:
                i.SetBondType(Chem.rdchem.BondType.SINGLE)

        try:
            Chem.SanitizeMol(temp_mol)
            Chem.rdmolops.RemoveStereochemistry(temp_mol)
            temp_smiles = Chem.MolToSmiles(temp_mol)

        except Exception as msg:
            if 'Can\'t kekulize mol' in str(msg):
                # unkekulized_indices = [int(i) for i in str(msg).split('Unkekulized atoms: ')[1].split('.')[0].rstrip(' \n').split(' ')]
                pyrrole_indices = [i[0] for i in temp_mol.GetSubstructMatches(Chem.MolFromSmarts('n'))]

                # indices to sanitize
                # for s_i in set(unkekulized_indices).intersection(set(pyrrole_indices)):
                for s_i in pyrrole_indices:
                    temp_mol = Chem.MolFromSmiles(s, sanitize=False)
                    if temp_mol.GetAtomWithIdx(s_i).GetNumExplicitHs() == 0:
                        temp_mol.GetAtomWithIdx(s_i).SetNumExplicitHs(1)
                    elif temp_mol.GetAtomWithIdx(s_i).GetNumExplicitHs() == 1:
                        temp_mol.GetAtomWithIdx(s_i).SetNumExplicitHs(0)
                    try:
                        Chem.SanitizeMol(temp_mol)

                        processed_pyrrole_indices = [i[0] for i in
                                                     temp_mol.GetSubstructMatches(Chem.MolFromSmarts('n'))]
                        processed_aromatic_bonds = [i.GetIdx() for i in
                                                    temp_mol.GetBonds() if i.GetBondType() == Chem.rdchem.BondType.AROMATIC]
                        if processed_pyrrole_indices != pyrrole_indices or aromatic_bonds != processed_aromatic_bonds:
                            continue

                        Chem.rdmolops.RemoveStereochemistry(temp_mol)
                        temp_smiles = Chem.MolToSmiles(temp_mol)
                        break
                    except:
                        continue
                if 'temp_smiles' not in vars():
                    Chem.rdmolops.RemoveStereochemistry(temp_mol)
                    temp_smiles = Chem.MolToSmiles(temp_mol)
                    sanitized_list.append([temp_smiles])
                    continue
            else:
                Chem.rdmolops.RemoveStereochemistry(temp_mol)
                temp_smiles = Chem.MolToSmiles(temp_mol)
                sanitized_list.append([temp_smiles])
                continue
        rxn = AllChem.ReactionFromSmarts(tautomer_smarts)

        try:
            tautomer_mols = rxn.RunReactants((Chem.MolFromSmiles(temp_smiles), ))
        except:
            try:
                tautomer_mols = rxn.RunReactants((Chem.MolFromSmiles(temp_smiles, sanitize=False),))
            except:
                continue

        tautomer_smiles = [Chem.MolToSmiles(m[0]) for m in tautomer_mols]
        sanitized_list.append(sorted(set(tautomer_smiles + [temp_smiles])))

    return list(itertools.product(*sanitized_list))

## Initialize Joseph's reaction mapper

In [4]:
cpd_path = '../04_map_rxn_rules/brenda_neutralize.tsv'
rules_path = '../04_map_rxn_rules/minimal1224_all_uniprot.tsv'
cofactor_list_path = '../04_map_rxn_rules/cofactor_list_alldb.tsv'
cofactor_pair_path = '../04_map_rxn_rules/cofactor_pair_alldb.json'
SEED_neutralized = '../04_map_rxn_rules/SEED_neutralized.tsv'
db_cpd_dict = {k: v['smiles'] for k, v in pd.read_csv(cpd_path, sep='\t', index_col=0).iterrows()}
mapper = MapRules(rules_path=rules_path, cofactor_list_path=cofactor_list_path, cofactor_pair_path=cofactor_pair_path, seed_dict=db_cpd_dict)

# Yash

#### Read in SI table and SEED table with SMILE strings and aliases as pandas Dataframes

In [5]:
supp_table_06_df = pd.read_csv("../01_raw_data/supp_table_06.csv")
supp_table_07_df = pd.read_csv("../01_raw_data/supp_table_07.csv")
supp_table_08_df = pd.read_csv("../01_raw_data/supp_table_08.csv")

table06_reactions = supp_table_06_df['predicted reaction (entire reaction information with cofactors, conditions, etc.)']
table07_reactions = supp_table_07_df['predicted reactant pair']
table08_reactions = supp_table_08_df['example reaction with this atom difference']

SEED_SMILES_aliases_df = pd.read_csv("../03_processed_data/SEED_smiles_and_aliases_cleaned.csv",low_memory=False)
SEED_neutralized_df = pd.read_csv("../06_SEED_neutralized/seed_neutralize.tsv",delimiter='\t')

### generate SEED cleaned

In [6]:
SEED_df = pd.read_csv("../02_SEED_database/SEED.csv",low_memory=False)

In [7]:
SEED_df.columns

Index(['id', 'abbreviation', 'name', 'formula', 'mass', 'source', 'inchikey',
       'charge', 'is_core', 'is_obsolete', 'linked_compound', 'is_cofactor',
       'deltag', 'deltagerr', 'pka', 'pkb', 'abstract_compound',
       'comprised_of', 'aliases', 'smiles', 'notes'],
      dtype='object')

In [8]:
def clean_aliases(aliases_list_raw):
    cleaned_aliases = []
    for name in aliases_list_raw:
        if '|' in name: # remove vertical pipe if present and take string left of pipe
            cleaned_aliases.append(name.split('|')[0].strip().lower())
        else: 
            cleaned_aliases.append(name.strip().lower()) 
    cleaned_aliases[0] = cleaned_aliases[0].lstrip('name: ')
    return cleaned_aliases

In [9]:
def extract_aliases_from_SEED(SEED_df):
    all_cleaned_aliases = [] # initialize empty list to store lists of cleaned aliases from each entry of SEED
    
    for i in range(0,SEED_df.shape[0]): # iterate through all entries of SEED
        
        cpd_name = SEED_df["name"][i] # extract compound name first
        
        if type(cpd_name) is not float: # if compound name is valid
            cpd_name = cpd_name.lower() # convert compound name to lower case
            try:
                cpd_aliases_list_raw = SEED_df["aliases"][i].split(';') # extract aliases associated with compound
                cpd_cleaned_aliases = list(np.unique(clean_aliases(cpd_aliases_list_raw))) # clean the aliases
                if cpd_name in cpd_cleaned_aliases: # if compound name is already present in cleaned aliases list
                    all_cleaned_aliases.append(cpd_cleaned_aliases) # add cleaned aliases list to main list
                else: # if compound name is not already present in cleaned aliases list
                    cpd_cleaned_aliases.append(cpd_name) # add compound name is cleaned aliases list first
                    all_cleaned_aliases.append(cpd_cleaned_aliases) # then add cleaned aliases list to main list
            except AttributeError: # if there are no aliases at all
                all_cleaned_aliases.append(cpd_name) # add only the compound name to main aliases list
        
        elif type(cpd_name) is float: # if compound name is not valid
            cpd_aliases_list_raw = SEED_df["aliases"][i].split(';') # extract aliases associated with compound
            cpd_cleaned_aliases = list(np.unique(clean_aliases(cpd_aliases_list_raw))) # clean the aliases
            all_cleaned_aliases.append(cpd_cleaned_aliases) # add cleaned aliases to main list
    return all_cleaned_aliases

In [10]:
all_cleaned_aliases = extract_aliases_from_SEED(SEED_df)

In [11]:
all_cleaned_aliases

[['c01328',
  'h20',
  'h2o',
  'h3o+',
  'ho-',
  'hydrogen oxide',
  'hydroxide',
  'hydroxide ion',
  'hydroxyl',
  'hydroxyl ion',
  'oh',
  'oh-',
  'oh1',
  'oxonium',
  'water'],
 ["adenosine 5'-triphosphate",
  "adenosine-5'-triphosphate",
  'adenosine-triphosphate',
  'adenylpyrophosphate',
  'tp',
  'atp'],
 ['beta-nad+',
  'beta-nicotinamide adenine dinucleotide',
  'coenzyme i',
  'diphosphopyridine nucleotide',
  'diphosphopyridine nucleotide oxidized',
  'dpn',
  'dpn+',
  'dpn-ox',
  'nad',
  'nad+',
  'nad-ox',
  'nad-oxidized',
  'nadide',
  'nicotinamide adenine dinucleotide',
  'nicotinamide adenine dinucleotide oxidized',
  'nicotinamideadeninedinucleotide'],
 ['beta-nadh',
  'dihydrodiphosphopyridine nucleotide',
  'dihydronicotinamide adenine dinucleotide',
  'diphosphopyridine nucleotide reduced',
  'dpnh',
  'nad-reduced',
  'nadh',
  'nadh+h+',
  'nadh2',
  'nicotinamide adenine dinucleotide - reduced',
  'nicotinamide adenine dinucleotide reduced',
  'nicotina

In [12]:
SEED_df["cleaned aliases"] = all_cleaned_aliases

In [13]:
SEED_SMILES_aliases_df = SEED_df.loc[:,['id','smiles','cleaned aliases']]

In [14]:
SEED_SMILES_aliases_df

Unnamed: 0,id,smiles,cleaned aliases
0,cpd00001,O,"[c01328, h20, h2o, h3o+, ho-, hydrogen oxide, ..."
1,cpd00002,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,"[adenosine 5'-triphosphate, adenosine-5'-triph..."
2,cpd00003,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,"[beta-nad+, beta-nicotinamide adenine dinucleo..."
3,cpd00004,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,"[beta-nadh, dihydrodiphosphopyridine nucleotid..."
4,cpd00005,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,"[beta-nadph, dihydronicotinamide adenine dinuc..."
...,...,...,...
33987,cpd37298,CC(C)=CCC/C(C)=C/CC/C(C)=C/CC/C(C)=C\CC/C(C)=C...,"[tacyc: cpd-175, cpd-175]"
33988,cpd37299,CCCCCCCCCCCCCCCCCCCCCCCCCCC(C(=O)OC[C@H]1O[C@H...,"[racyc: cpd1g-1344, cpd1g-1344]"
33989,cpd37300,NC(=O)CCC(=O)Nc1ncn([C@@H]2O[C@H](COP(=O)([O-]...,"[kegg: c04811, c04811]"
33990,cpd37301,CC1=C(CCC(=O)[O-])C2=Cc3c(CCC(=O)[O-])c(C)c4n3...,"[heme c, heme_c]"


### lookup

In [15]:
def lookup_SMILES_from_alias(SEED_SMILES_aliases_df,cpd_name):
    """
    Get SMILES string and ID of a compound from its name
    """
    count = 0
    idx = []
    for i in range(0,SEED_SMILES_aliases_df.shape[0]):
        aliases_list = SEED_SMILES_aliases_df['cleaned aliases'][i]
        for alias in aliases_list:
            if cpd_name.lower() == alias.strip():
                idx.append(i)
                count += 1
    
    if count != 0: # if compound is located
        cpd_smiles = SEED_SMILES_aliases_df["smiles"].iloc[idx[0]]
        cpd_id = SEED_SMILES_aliases_df["id"].iloc[idx[0]]
    
    elif count == 0: # if compound is not located:
        cpd_smiles = 'None'
        cpd_id = 'None'
        
    return cpd_smiles,cpd_id,count

In [16]:
lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'atp')

('Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])O)[C@@H](O)[C@H]1O',
 'cpd00002',
 1)

In [18]:
lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'3-hydroxypivalic acid')

('None', 'None', 0)

### process reactions from table 6

In [19]:
def process_reactions(reaction_string):
    reactants_list = [] # initialize empty list to store all reactants
    products_list = [] # initialize empty list to store all products
    
    reactants_w_enzymes = reaction_string.split('<=>')[0] # extract all items on LHS of reaction
    products_w_enzymes = reaction_string.split('<=>')[1] # extract all items on RHS of reaction
    
    # Extract reactants individually from LHS
    delimiter = ';'
    num_occurrences = reactants_w_enzymes.count(delimiter)
    reactants = reactants_w_enzymes.split(delimiter,maxsplit=num_occurrences)[num_occurrences]
        
    # Append each individual reactant to reactant list
    delimiter = ' + '
    num_reactants = reactants.count(delimiter)
    for reactant in reactants.split(delimiter,maxsplit=num_reactants):
        reactants_list.append(reactant.lower().strip())
    
    if '[' in products_w_enzymes: # this is the case in table 6
        products = products_w_enzymes.split('[')[0]
    elif ')' in products_w_enzymes: # this is the case in table 8
        products = products_w_enzymes.strip(')')

    num_products = products.count(delimiter)
    for product in products.split(delimiter,maxsplit=num_products):
        products_list.append(product.strip().strip('\n').lower())
        
    if '' in reactants_list: 
        reactants_list.remove('')
    
    if '' in products_list:
        products_list.remove('')
        
    return reactants_list, products_list

In [20]:
def rxn_dict_frm_table(table08_reactions):
    
    rxn_count = 0 # initialize a reaction counter
    rxn_dict = {} # initialize a reaction dictionary to store all reactions in the table
    
    for i in range(0,len(table08_reactions)):
        rxn_count += 1 # increase reaction counter by 1
        
        reactant_dict = {}
        product_dict = {}
        
        reactants_list, products_list = process_reactions(table08_reactions[i])
        
        for reactant in reactants_list:
            cpd_smiles,cpd_id,cpd_count = lookup_SMILES_from_alias(SEED_SMILES_aliases_df,reactant)
            
            if type(cpd_smiles) != float:
                # Neutralize any reactants
                if '+' in cpd_smiles or '-' in cpd_smiles:
                    cpd_smiles = SEED_neutralized_df[SEED_neutralized_df['cpd']==cpd_id]['smiles'].iloc[0]
    
                # Remove hydrogen ions from reactants side
                if cpd_id != 'cpd00067' and cpd_id != 'None':
                    reactant_dict.update({cpd_id:cpd_smiles})
                    
            elif type(cpd_smiles) == float:
                reactant_dict.update({cpd_id:'None'})

        for product in products_list:
            cpd_smiles,cpd_id,cpd_count = lookup_SMILES_from_alias(SEED_SMILES_aliases_df,product)
            
            if type(cpd_smiles) != float:
                # Neutralize any products
                if '+' in cpd_smiles or '-' in cpd_smiles:
                    cpd_smiles = SEED_neutralized_df[SEED_neutralized_df['cpd']==cpd_id]['smiles'].iloc[0]

                # Remove hydrogen ions from products side
                if cpd_id != 'cpd00067' and cpd_id != 'None':
                    product_dict.update({cpd_id:cpd_smiles})    
            
            elif type(cpd_smiles) == float:
                reactant_dict.update({cpd_id:'None'})
        
        product_reactant_list = [reactant_dict,product_dict]
        
        # Update reaction dictionary with the reaction
        rxn_dict.update({"R%i"%rxn_count:product_reactant_list})
        
    return rxn_dict 

In [21]:
rxn_dict = rxn_dict_frm_table(table06_reactions)

In [27]:
rxn_dict["R1"]

[{'cpd00249': 'O=c1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)[nH]1',
  'cpd00001': 'O'},
 {'cpd00092': 'O=c1cc[nH]c(=O)[nH]1',
  'cpd00105': 'OC[C@H]1OC(O)[C@H](O)[C@@H]1O'}]

## Yash rule mapping

In [23]:
def map_rxn_to_rule(lhs_rhs):
    count = 0
    for i in range(0,1225):

        count += 1
        
        if count < 10:
            rule = 'rule000%i' % count
        elif count >= 10 and count < 100:
            rule = 'rule00%i' % count
        elif count >= 100 and count < 1000:
            rule = 'rule0%i' % count
        else:
            rule = 'rule%i' % count

        try:
            mapping_object = mapper.map_pickaxe_rules(lhs_rhs[0], lhs_rhs[1], rule)
            if mapping_object == (None,None):
                pass
            else:
                break # mapped
            #if mapping_object[0][0] == 0 and mapping_object[1][0] == 0:
            #    break
            #elif mapping_object == (None,None):
            #    pass
        except :#or TypeError:  # exception for specific error type
            pass

    return rule

In [25]:
SEED_SMILES_aliases_df[SEED_SMILES_aliases_df["smiles"]=="C=C(C)CCO"]

Unnamed: 0,id,smiles,cleaned aliases


In [None]:
substrate_cpd_ID = "cpd35276"
substrate_cpd_SMILES = "CC(C)(O)CCO"
product_cpd_ID = 
product_cpd_SMILES = 

In [16]:
for key in list(rxn_dict.keys()):
    
    lhs_rhs = rxn_dict[key]
    rule = map_rxn_to_rule(lhs_rhs)
    print(rule)

rule0013
rule0099
rule0016
rule0015
rule0015
rule0024
rule1225
rule0002
rule0002
rule0002
rule0002
rule0498
rule0002
rule0002
rule0002
rule0002
rule0018
rule0002
rule0002
rule0018
rule1225
rule0013
rule0013
rule1225
rule0195
rule0084
rule0035
rule1225
rule0012
rule0042
rule0042
rule0013
rule0012
rule0042
rule0042
rule0013
rule0067
rule0067
rule0067
rule0067


In [17]:
mapped_rxns = {}
unmapped_rxns = {}

for key in list(rxn_dict.keys()):
    
    lhs_rhs = rxn_dict[key]
    rule = map_rxn_to_rule(lhs_rhs)
    
    if rule != "rule1225": # reaction is mapped
        lhs_rhs.append(rule)
        if lhs_rhs not in mapped_rxns.values(): # ensure duplicates are not added again
            mapped_rxns.update({key:lhs_rhs})
    
    elif rule == "rule1225": # reaction is not mapped
        if lhs_rhs not in unmapped_rxns.values():
            unmapped_rxns.update({key:lhs_rhs})

In [18]:
print(len(mapped_rxns),len(unmapped_rxns))

23 3


In [19]:
unmapped_rxns

{'R7': [{'cpd19405': 'CC(C)=CCCC1=CC(c2cc(O)ccc2O)OC1'},
  {'cpd19404': 'CC(C)=CCCC1=CC(C2=CC(=O)C=CC2=O)OC1'}],
 'R21': [{'cpd10567': 'O=CCBr',
   'cpd00042': 'N[C@@H](CCC(=O)N[C@@H](CS)C(=O)NCC(=O)O)C(=O)O'},
  {'cpd10568': 'N[C@@H](CCC(=O)N[C@@H](CSCC=O)C(=O)NCC(=O)O)C(=O)O',
   'cpd00966': 'Br'}],
 'R28': [{'cpd00062': 'O=c1ccn([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]2O)c(=O)[nH]1',
   'cpd00001': 'O'},
  {'cpd00091': 'O=c1ccn([C@@H]2O[C@H](COP(=O)(O)O)[C@@H](O)[C@H]2O)c(=O)[nH]1',
   'cpd00012': 'O=P(O)(O)OP(=O)(O)O'}]}

In [20]:
with open ("./table06_mapped_rxns.json","w") as f:
    f.write(json.dumps(mapped_rxns))
with open ("./table06_unmapped_rxns.json","w") as r:
    r.write(json.dumps(unmapped_rxns))

### tests

In [21]:
### Table 6 tests
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'uridine') == ('O=c1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)[nH]1', 'cpd00249', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'H2O') == ('O', 'cpd00001', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Uracil') == ('O=c1cc[nH]c(=O)[nH]1', 'cpd00092', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'D-Ribose') == ('OC[C@H]1OC(O)[C@H](O)[C@@H]1O', 'cpd00105', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Orthophosphate') == ('O=P([O-])([O-])O', 'cpd00009', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'alpha-D-Ribose 1-phosphate') == ('O=P([O-])([O-])O[C@H]1O[C@H](CO)[C@@H](O)[C@H]1O', 'cpd00475', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Pyridoxal Phosphate') == ('Cc1ncc(COP(=O)([O-])[O-])c(C=O)c1O', 'cpd00016', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Pyridoxal') == ('Cc1ncc(CO)c(C=O)c1O', 'cpd00215', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'ATP') == ('Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])O)[C@@H](O)[C@H]1O','cpd00002',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'ADP') == ('Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])O)[C@@H](O)[C@H]1O','cpd00008',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'L-Glutamate') == ('[NH3+][C@@H](CCC(=O)[O-])C(=O)[O-]', 'cpd00023', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'L-Glutamyl 5-phosphate') == ('[NH3+][C@@H](CCC(=O)OP(=O)([O-])[O-])C(=O)[O-]', 'cpd02097', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'4-Aminobutanoate') == ('[NH3+]CCCC(=O)[O-]', 'cpd00281', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'CO2') == ('O=C=O', 'cpd00011', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Dihydroshikonofuran') == ('CC(C)=CCCC1=CC(c2cc(O)ccc2O)OC1', 'cpd19405', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Dihydroechinofuran') == ('CC(C)=CCCC1=CC(C2=CC(=O)C=CC2=O)OC1', 'cpd19404', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'(R)-3-Hydroxybutanoate') == ('C[C@@H](O)CC(=O)[O-]', 'cpd00797', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'NAD+') == ('NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1','cpd00003',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'NADH') == ('NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1','cpd00004',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'H+') == ('[H+]', 'cpd00067', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'acetoacetate') == ('CC(=O)CC(=O)[O-]', 'cpd00142', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'(S)-3-Hydroxyisobutyrate') == ('C[C@H](CO)C(=O)[O-]', 'cpd19043', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'(R)-3-Hydroxybutanoate') == ('C[C@@H](O)CC(=O)[O-]', 'cpd00797', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'(S)-Methylmalonate Semialdehyde') == ('C[C@@H](C=O)C(=O)[O-]', 'cpd19044', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'2-Hydroxyglutarate') == ('O=C([O-])CCC(O)C(=O)[O-]', 'cpd01709', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'FAD') == ('Cc1cc2nc3c(=O)[n-]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)c2cc1C','cpd00015',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'FADH2') == ('Cc1cc2c(cc1C)N(C[C@H](O)[C@H](O)[C@H](O)COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n3cnc4c(N)ncnc43)[C@H](O)[C@@H]1O)c1[nH]c(=O)[nH]c(=O)c1N2','cpd00982',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'2-Oxoglutarate') == ('O=C([O-])CCC(=O)C(=O)[O-]', 'cpd00024', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'2-Dehydro-3-deoxy-D-xylonate') == ('O=C([O-])C(=O)C[C@H](O)CO', 'cpd00520', 3)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'5-Hydroxy-2,4-dioxopentanoate') == ('O=C(CO)CC(=O)C(=O)[O-]', 'cpd12723', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Phenyllactate') == ('O=C([O-])[C@H](O)Cc1ccccc1', 'cpd03331', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Phenylpyruvate') == ('O=C([O-])C(=O)Cc1ccccc1', 'cpd00143', 2)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'NADP+') == ('NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)c1','cpd00006',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'NADPH') == ('NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1','cpd00005',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'3-(2-Hydroxyphenyl)propanoate') == ('O=C([O-])CCc1ccccc1O', 'cpd00882', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'trans-2-Hydroxycinnamate') == ('O=C([O-])/C=C/c1ccccc1O', 'cpd01224', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'3-(2-Hydroxyphenyl)propanoate') == ('O=C([O-])CCc1ccccc1O', 'cpd00882', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'2-Bromoacetaldehyde') == ('O=CCBr', 'cpd10567', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Glutathione') == ('[NH3+][C@@H](CCC(=O)N[C@@H](CS)C(=O)NCC(=O)[O-])C(=O)[O-]', 'cpd00042', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Cys-Gly') == ('[NH3+][C@@H](CS)C(=O)NCC(=O)[O-]', 'cpd01017', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Hydrobromic acid') == ('[Br-]', 'cpd00966', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'2-Bromoacetaldehyde') == ('O=CCBr', 'cpd10567', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'UMP') == ('O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])[O-])[C@@H](O)[C@H]2O)c(=O)[nH]1','cpd00091',2)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Diphosphate') == ('O=P([O-])([O-])OP(=O)([O-])O', 'cpd00012', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Acetate') == ('CC(=O)[O-]', 'cpd00029', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'UTP') == ('O=c1ccn([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])O)[C@@H](O)[C@H]2O)c(=O)[nH]1','cpd00062',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"3'-UMP") == ('O=c1ccn([C@@H]2O[C@H](CO)[C@@H](OP(=O)([O-])[O-])[C@H]2O)c(=O)[nH]1','cpd00989',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,'Pseudouridine') == ('O=c1[nH]cc([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)[nH]1', 'cpd01405', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"Pseudouridine 5'-phosphate") == ('O=c1[nH]cc([C@@H]2O[C@H](COP(=O)([O-])[O-])[C@@H](O)[C@H]2O)c(=O)[nH]1','cpd00859',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"D-Ribose 5-phosphate") == ('O=P([O-])([O-])OC[C@H]1OC(O)[C@H](O)[C@@H]1O', 'cpd00101', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"4-Acetamidobutanoate") == ('CC(=O)NCCCC(=O)[O-]', 'cpd01889', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"Acetyl-CoA") == ('CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)([O-])[O-]','cpd00022',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"CoA") == ('CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)([O-])[O-])[C@@H](O)C(=O)NCCC(=O)NCCS','cpd00010',2)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"L-Phenylalanine") == ('[NH3+][C@@H](Cc1ccccc1)C(=O)[O-]', 'cpd00066', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"D-Phenylalanine") == ('[NH3+][C@H](Cc1ccccc1)C(=O)[O-]', 'cpd01526', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"N-Acetyl-D-Phenylalanine") == ('CC(=O)N[C@H](Cc1ccccc1)C(=O)[O-]', 'cpd03338', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"N-Acetyl-L-Phenylalanine") == ('CC(=O)N[C@@H](Cc1ccccc1)C(=O)[O-]', 'cpd02218', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"4-Acetamidobutanoate") == ('CC(=O)NCCCC(=O)[O-]', 'cpd01889', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"6-Acetamido-2-oxohexanoate") == ('CC(=O)NCCCCC(=O)C(=O)[O-]', 'cpd03291', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"AMP") == ('Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])[O-])[C@@H](O)[C@H]1O','cpd00018',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"citrate") == ('O=C([O-])CC(O)(CC(=O)[O-])C(=O)[O-]', 'cpd00137', 1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"N2-citryl-N6-acetyl-N6-hydroxy-L-lysine") == ('CC(=O)N(O)CCCC[C@H](NC(=O)CC(O)(CC(=O)[O-])C(=O)[O-])C(=O)[O-]','cpd24157',1)
assert lookup_SMILES_from_alias(SEED_SMILES_aliases_df,"N6-acetyl-N6-hydroxy-L-lysine") == ('CC(=O)N(O)CCCC[C@H]([NH3+])C(=O)[O-]', 'cpd02456', 1)

### Table 8 tests
