In [1]:
import pandas as pd
import sys
sys.path.append('/home/stef/Tb/reaction_mapping')
from utils import save_json, load_json, sanitize, neutralize_atoms
from collections import defaultdict
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
m2s = Chem.MolToSmiles
s2m = Chem.MolFromSmiles

In [2]:
# Read the two rule tables and my cofactor label files

min_rules_path = '../mapping/minimal1224_all_uniprot.tsv'
imt_rules_path = '../mapping/JN3604IMT_rules.tsv'
paired_labels_path = 'cofactor_pair_alldb.json'
unpaired_labels_path = 'cofactor_list_alldb.tsv'

# Both rule tables are tab-separated and are indexed by their 'Name' column
min_df = pd.read_csv(min_rules_path, sep='\t').set_index('Name')
imt_df = pd.read_csv(imt_rules_path, sep='\t').set_index('Name')

# My own labels: a JSON of paired cofactors and a TSV of unpaired ones
paired_cofactors = load_json(paired_labels_path)
unpaired_df = pd.read_csv(unpaired_labels_path, sep='\t')

# Compare cofactor labels: my list, min rules, imt rules

In [None]:
def _rule_labels(rules_df):
    """Return the set of lowercased labels found in a rule table.

    Every entry of the "Reactants" and "Products" columns is split on ';'
    and each piece is lowercased before being added to the set.
    """
    return {
        label.lower()
        for column in ("Reactants", "Products")
        for entry in rules_df[column].to_list()
        for label in entry.split(';')
    }


# Labels used by each rule set
min_labels = _rule_labels(min_df)
imt_labels = _rule_labels(imt_df)

# Assemble my set: keys of the paired dict are split on ',' into single labels
paired_labels = {
    label.lower()
    for key in paired_cofactors.keys()
    for label in key.split(',')
}
unpaired_labels = {label.lower() for label in unpaired_df['replacement'].to_list()}

my_set = paired_labels | unpaired_labels

The min rules use a few cofactor labels that the imt rules do not; every imt label also appears in the min rules.

In [26]:
# Labels present in the min rules but absent from the imt rules
min_labels - imt_labels

{'co', 'formyl_acceptor_cof', 'formyl_donor_cof', 'hi'}

In [27]:
# Labels present in the imt rules but absent from the min rules
imt_labels - min_labels

set()

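Apart from `any`, I am not missing any labels from either the min or imt rules. (`any` appears in both rule sets but not in my lists; presumably it is a generic placeholder rather than a cofactor.)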

In [28]:
# Labels used by the min rules that are missing from my paired/unpaired lists
min_labels - my_set

{'any'}

In [29]:
# Labels used by the imt rules that are missing from my paired/unpaired lists
imt_labels - my_set

{'any'}

My lists contain cofactors that the min and imt rules do not use. These need to be removed from the lookup tables generated for each rule set.

In [30]:
# Labels in my lists that the min rules never use
my_set - min_labels

{'ascorbate_cof', 'ascorbate_radical_cof', 'co3', 'h+', 'hbr'}

In [31]:
# Labels in my lists that the imt rules never use
my_set - imt_labels

{'ascorbate_cof',
 'ascorbate_radical_cof',
 'co',
 'co3',
 'formyl_acceptor_cof',
 'formyl_donor_cof',
 'h+',
 'hbr',
 'hi'}

# Generate smiles to cofactor class look up tables for template matching during mapping

In [39]:

# Build SMILES -> unpaired cofactor label look up tables:
#   1. one table with every label from my unpaired list,
#   2. one per rule set (min / imt) with the labels that rule set lacks removed.
seed2smi = load_json('seed_id_to_smiles.json')
seed2aliases = load_json('seed_id_to_aliases.json')

# Make dict of unpaired_cofactors label2name
unpaired_cofactors = defaultdict(list)
for _, row in unpaired_df.iterrows():
    unpaired_cofactors[row["replacement"]].append(row["molfile"])

smi2unpaired_cof = {}
# Try with seed ids first
for label, names in unpaired_cofactors.items():
    for name in names:
        if name[:3] == 'cpd' and name in seed2smi:
            smi = m2s(neutralize_atoms(s2m(seed2smi[name])))
            smi2unpaired_cof[sanitize([smi])[0]] = label

# Finish with aliases if necessary (only for labels that got no SMILES above)
for label, names in unpaired_cofactors.items():
    if label not in smi2unpaired_cof.values():
        for name in names:
            name = name.lower()
            for seed_id, aliases in seed2aliases.items():
                # An alias hit is only usable when that seed id also has a
                # SMILES entry; without this guard seed2smi[seed_id] raises
                # KeyError for ids that are present in the alias table only.
                if name in aliases and seed_id in seed2smi:
                    smi = m2s(neutralize_atoms(s2m(seed2smi[seed_id])))
                    smi2unpaired_cof[sanitize([smi])[0]] = label

save_json(smi2unpaired_cof, '../mapping/smi2unpaired_cof.json')

# Labels in my list that the respective rule set does not use
rm_from_min = ['CO3', 'H+', 'HBr']
rm_from_imt = rm_from_min + ['HI', 'CO']

# SMILES keys whose label has to be dropped for each rule set
rm_smi_from_min = [smi for smi, label in smi2unpaired_cof.items() if label in rm_from_min]
rm_smi_from_imt = [smi for smi, label in smi2unpaired_cof.items() if label in rm_from_imt]

# Rule-set specific copies of the table without the dropped labels
smi2unpaired_cof_min = {
    smi: label for smi, label in smi2unpaired_cof.items() if label not in rm_from_min
}
smi2unpaired_cof_imt = {
    smi: label for smi, label in smi2unpaired_cof.items() if label not in rm_from_imt
}

save_json(smi2unpaired_cof_min, '../mapping/smi2unpaired_cof_min.json')
save_json(smi2unpaired_cof_imt, '../mapping/smi2unpaired_cof_imt.json')



In [40]:
# New smiles-based cof look up for paired cofactors

manual_found = pd.read_csv('manual_cofactors.csv', sep=',').set_index("Name")


def _find_cofactor_smiles(name):
    """Return a SMILES string for one cofactor entry, or None if none is found.

    Lookup order:
      * names starting with 'cpd' (seed ids): seed2smi, then the manual list;
      * other names listed in the manual list: the manual list only;
      * everything else: the first seed id whose aliases contain the
        lowercased name and that has an entry in seed2smi.

    The manual list is always queried with the lowercased name.
    """
    key = name.lower()

    if name[:3] == 'cpd':  # If is a seed id, search seed2smi
        if name in seed2smi:
            return seed2smi[name]
        # `and` short-circuits, so .loc is only evaluated for keys that exist
        if key in manual_found.index and not pd.isna(manual_found.loc[key, 'Smiles']):
            return manual_found.loc[key, 'Smiles']
        return None

    if key in manual_found.index:  # If not, try manual list
        smiles = manual_found.loc[key, 'Smiles']
        return None if pd.isna(smiles) else smiles

    # Otherwise search aliases on modelseed; stop at the first hit
    for seed_id, aliases in seed2aliases.items():
        if key in aliases and seed_id in seed2smi:
            return seed2smi[seed_id]
    return None


def _pair_key(smiles_pair):
    """Return the comma-joined key for a list of SMILES after clean-up."""
    neutral = [m2s(neutralize_atoms(s2m(smi))) for smi in smiles_pair]  # Neutralize atoms
    return ','.join(sanitize(neutral))  # Canonicalize and remove stereochemistry


smi2paired_cof = {}
missing_cofactors = []  # Missing compounds
for k, v in paired_cofactors.items():
    for pair in v:
        temp_pair = []
        for elt in pair[1:]:  # first element of each entry is skipped
            smiles = _find_cofactor_smiles(elt)
            if smiles is None:
                missing_cofactors.append(elt.lower())
            else:
                temp_pair.append(smiles)

        # Only keep entries for which both members were resolved
        if len(temp_pair) == 2:
            pair_key = _pair_key(temp_pair)
            if pair_key not in smi2paired_cof:
                smi2paired_cof[pair_key] = [k]

a_few_missing_pairs = [['Nc1nc(O)nc2c1ncn2[C@H]1C[C@H](O)[C@@H](COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])O)O1', 'Nc1nc(O)nc2c1ncn2[C@H]1C[C@H](O)[C@@H](COP(=O)([O-])[O-])O1'],
                       ['*C/C(C)=C/Cc1c(C)c(O)c2ccccc2c1O', '*C/C(C)=C/CC1=C(C)C(=O)c2ccccc2C1=O'],
                       ['Cc1cc2nc3c(nc(=O)[n-]c3=O)n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C', 'Cc1cc2NC3C(=O)NC(=O)N=C3N(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C']
                        ]

afmp_classes = [['PYROPHOSPHATE_DONOR_CoF,PYROPHOSPHATE_ACCEPTOR_CoF'],
                ["Ubiquinols_CoF,Ubiquinones_CoF"],
                ["FAD_CoF,FADH2_CoF"]
                ]

# Add the hand-written pairs unless the same key already exists
for extra_pair, extra_classes in zip(a_few_missing_pairs, afmp_classes):
    pair_key = _pair_key(extra_pair)
    if pair_key not in smi2paired_cof:
        smi2paired_cof[pair_key] = extra_classes

# save_json(smi2paired_cof, '../mapping/smi2paired_cof.json')

# Pair classes in my list that the respective rule set does not use
rm_from_min = ["ASCORBATE_RADICAL_CoF,ASCORBATE_CoF"]
rm_from_imt = rm_from_min + ["FORMYL_DONOR_CoF,FORMYL_ACCEPTOR_CoF"]

# Values of smi2paired_cof are lists of class labels, so test each label in
# the list; comparing the list itself against the strings never matches.
rm_smi_from_min = [
    smi for smi, labels in smi2paired_cof.items()
    if any(label in rm_from_min for label in labels)
]
rm_smi_from_imt = [
    smi for smi, labels in smi2paired_cof.items()
    if any(label in rm_from_imt for label in labels)
]

# Rule-set specific copies of the table without the dropped pair classes
smi2paired_cof_min = {
    smi: labels for smi, labels in smi2paired_cof.items() if smi not in rm_smi_from_min
}
smi2paired_cof_imt = {
    smi: labels for smi, labels in smi2paired_cof.items() if smi not in rm_smi_from_imt
}

save_json(smi2paired_cof_min, '../mapping/smi2paired_cof_min.json')
save_json(smi2paired_cof_imt, '../mapping/smi2paired_cof_imt.json')