In [1]:
%load_ext autoreload
%autoreload 2
from src.chem_draw import draw_rxn_svg, draw_pwy_svg
from src.utils import sort_x_by_y, ensure_dirs, load_json
from svgutils import compose as sc
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os
import subprocess
import pandas as pd
from collections import defaultdict

In [2]:
# Expansion parameters: starter set, target set, number of generations
starters, targets, generations = '2mg', 'mvacid', 2

# Where the processed expansions live and which one to open
expansion_dir = '../data/processed_expansions/'
fn = f"{starters}_to_{targets}_gen_{generations}_tan_sample_1_n_samples_1000.pkl" # Expansion file name

# Deserialize the processed expansion object.
# NOTE: pickle.load can execute arbitrary code; only open files produced by this project.
with open(os.path.join(expansion_dir, fn), 'rb') as pkl_file:
    pe = pickle.load(pkl_file)


In [55]:
# For every predicted reaction, print the enzymes list of each of its analogues
for pr in pe.predicted_reactions.values():
    print([analogue.enzymes for analogue in pr.analogues])

[[], [Enzyme(uniprot_id='E9RFS9', sequence=None), Enzyme(uniprot_id='E9RFT0', sequence=None), Enzyme(uniprot_id='E9RFT1', sequence=None)], [], [Enzyme(uniprot_id='G1UBD1', sequence=None), Enzyme(uniprot_id='Q8KQF0', sequence=None), Enzyme(uniprot_id='Q607G3', sequence=None)], [], [], [], [Enzyme(uniprot_id='Q8KQE6', sequence=None), Enzyme(uniprot_id='Q8KQE7', sequence=None), Enzyme(uniprot_id='Q8KQE8', sequence=None), Enzyme(uniprot_id='Q8KQE9', sequence=None), Enzyme(uniprot_id='Q8KQF0', sequence=None), Enzyme(uniprot_id='Q8KQF0', sequence=None)], [], [], [Enzyme(uniprot_id='G1UBD1', sequence=None), Enzyme(uniprot_id='Q8KQF0', sequence=None), Enzyme(uniprot_id='Q607G3', sequence=None)], [], [Enzyme(uniprot_id='Q59723', sequence=None)], [Enzyme(uniprot_id='E9RFS9', sequence=None), Enzyme(uniprot_id='E9RFT0', sequence=None), Enzyme(uniprot_id='E9RFT1', sequence=None)], [Enzyme(uniprot_id='Q936S9', sequence=None), Enzyme(uniprot_id='Q936T1', sequence=None), Enzyme(uniprot_id='Q936T2', se

In [58]:
# Spot-check: enzymes of the second analogue of the first predicted reaction
pr = [*pe.predicted_reactions.values()][0]
kr = pr.analogues[1]
kr.enzymes

[Enzyme(uniprot_id='E9RFS9', sequence=None),
 Enzyme(uniprot_id='E9RFT0', sequence=None),
 Enzyme(uniprot_id='E9RFT1', sequence=None)]

In [54]:
# Per (starter, target) pair, tally the pathways in which every reaction step
# has at least one known reaction whose first element is not None
# (presumably the mcs values -- confirm against the known_rxns layout).
# Relies on `paths` and `pred_rxns` already existing in the kernel.

for st_pair, pair_paths in paths.items():
    n_paths_w_full_info = 0
    for pwy in pair_paths:
        # Evaluate every step before deciding, so each rhash is looked up
        steps_have_info = [
            any(known[0] is not None for known in pred_rxns[rhash].known_rxns)
            for rhash in pwy.rhashes
        ]
        if all(steps_have_info):
            n_paths_w_full_info += 1

    print(st_pair, f"{n_paths_w_full_info} / {len(pair_paths)} paths w/ full info")


('fumarate', 'mvacid') 2 / 2 paths w/ full info


In [55]:
# Have each predicted reaction sort its own known reactions
# (by average, per the original note; the sort key could be changed)
for predicted_rxn in pred_rxns.values():
    predicted_rxn.sort_known_rxns()

In [56]:
# Let every path compute its prc mcs attribute from the predicted reactions
# (an average over known reactions, per the original note)
for pair_paths in paths.values():
    for p in pair_paths:
        p.compute_mean_prc_mcs(pred_rxns)

In [None]:
# Read in UniProt ids and build a map: reaction key -> list of UniProt ids
# TODO: move up in pipeline

brenda_uniprot = load_json('../data/mapping/brenda_uniprot.json')
mc_uniprot = load_json('../data/mapping/metacyc_uniprot.json')

# Keys present in both sources (count printed as a sanity check)
key_repeats = brenda_uniprot.keys() & mc_uniprot.keys()
print(len(key_repeats))
rxn2uniprot = defaultdict(list)

# MetaCyc goes in first: the last field of each entry is a comma-separated id string
for mc_key, mc_entry in mc_uniprot.items():
    ids_field = mc_entry[-1]
    if ids_field != '':
        rxn2uniprot[mc_key].extend(ids_field.split(','))

# BRENDA holds several reactions per entry; each one is stored under
# "<entry key>_<zero-padded position>"
for brenda_key, reactions in brenda_uniprot.items():
    n_reactions = len(reactions)
    # NOTE(review): pad width is n // 10 + 1, which is not the digit count of
    # the largest index -- presumably it mirrors how the keys were generated
    # upstream; confirm before changing.
    n_digits = n_reactions // 10 + 1

    for position, reaction in enumerate(reactions):
        ids_field = reaction[-1]
        if ids_field != '':
            rxn2uniprot[f"{brenda_key}_{position:0{n_digits}}"].extend(ids_field.split(','))

print(len(rxn2uniprot))

In [58]:
# Filter pathways by mdf and min prc_mcs, rank them by mean prc_mcs, and export
# one SVG per pathway, one concatenated PDF per (starter, target) pair, and one
# XLSX workbook with a sheet per pair.
# Depends on `paths`, `pred_rxns`, `rxn2uniprot`, `fn` and `generations`
# already existing in the kernel.

min_thresh = 0.1
sheets = []
sheetnames = []

# Output names derive from the expansion file name minus its extension.
# (`fn[:-3]` only stripped "pkl" and left a trailing "." in every path,
# e.g. "<name>..xlsx".)
run_name = os.path.splitext(fn)[0]

for st_pair in paths.keys():
    pair_paths = paths[st_pair]
    pair_name = '_'.join(st_pair)

    # TODO: write mdf getter that returns -np.inf to make this kind of logic easier
    # Keep paths that have an mdf, with mdf > 0 and min mcs above the threshold
    filtered_idxs = [
        i for i, pwy in enumerate(pair_paths)
        if pwy.mdf and pwy.mdf > 0 and pwy.min_mcs() > min_thresh
    ]
    if not filtered_idxs:
        continue

    # Rank the surviving paths by mean prc_mcs, best first
    mean_prc_mcs = [pair_paths[idx].mean_mcs() for idx in filtered_idxs]
    mean_sorted_idxs, mean_prc_mcs = sort_x_by_y(filtered_idxs, mean_prc_mcs, reverse=True)
    print(f"{st_pair} {len(filtered_idxs)} paths with mdf > 0 and min mcs > {min_thresh}")

    # One file stem per pathway: "<rank>_<index into paths[st_pair]>".
    # Shared by the svg, pdf and concatenation steps so they cannot drift apart.
    file_stems = [f"{i:03}_{pwy_idx}" for i, pwy_idx in enumerate(mean_sorted_idxs)]

    # Generate pwy svgs & csv
    print("Generating svgs & csv")
    headers = ['starter', 'target', 'mdf', 'mcs'] + [f"uniprot_{g+1}" for g in range(generations)]
    to_df = {k: [] for k in headers}
    pwy_svg_outdir = f"../artifacts/pwy_svgs/{run_name}/{pair_name}/"
    ensure_dirs(pwy_svg_outdir)

    for stem, pwy_idx in zip(file_stems, mean_sorted_idxs):
        this_path = pair_paths[pwy_idx]
        this_rhashes = this_path.rhashes

        sma_hash_pairs = []
        for r, this_rhash in enumerate(this_rhashes):
            this_rxn = pred_rxns[this_rhash]
            # known_rxns[0] is used as the best known reaction
            # (presumably sorted by an earlier cell -- confirm)
            best_kr = this_rxn.known_rxns[0]
            sma_hash_pairs.append([(this_rxn.smarts, this_rhash), (best_kr[1], hash(best_kr[1]))])

            # Uniprot ids
            # TODO make simple by moving uniprot up in pp
            # and write getters / standard return values
            uniprot_str_for_csv = ''
            if best_kr[0]:
                best_mcs = sum(best_kr[0]) / len(best_kr[0])
                for kr in this_rxn.known_rxns:
                    # Skip entries without values (None or empty) so the
                    # average below cannot divide by zero
                    if not kr[0]:
                        continue
                    # List every known reaction that ties with the best mean
                    if sum(kr[0]) / len(kr[0]) == best_mcs:
                        # .get avoids inserting empty entries into a defaultdict
                        uniprot_ids = rxn2uniprot.get(kr[2], [])
                        uniprot_str_for_csv += f"{kr[2]}: [{', '.join(uniprot_ids)}] | "

            to_df[f"uniprot_{r+1}"].append(uniprot_str_for_csv) # Add uniprots to csv

        # Pathways can have fewer steps than generations; pad the remaining
        # uniprot columns so every column has one value per row
        for o in range(len(this_rhashes), generations):
            to_df[f"uniprot_{o+1}"].append('')

        draw_pwy_svg(sma_hash_pairs, pwy_svg_outdir + f"{stem}.svg")

        # Add info to csv
        to_df['starter'].append(st_pair[0])
        to_df['target'].append(st_pair[1])
        to_df['mdf'].append(this_path.mdf)
        to_df['mcs'].append(this_path.mean_mcs())

    # Make df for this st pair
    sheets.append(pd.DataFrame(to_df))
    sheetnames.append(pair_name)

    # Convert svgs to pdfs
    print("Generating pdfs")
    tmp_pdf_outdir = f"../artifacts/tmp_pdfs/{run_name}/{pair_name}/"
    ensure_dirs(tmp_pdf_outdir)
    for stem in file_stems:
        cmd = ["inkscape", f"--export-pdf={tmp_pdf_outdir}{stem}.pdf", f"{pwy_svg_outdir}{stem}.svg"]
        # Report failures instead of ignoring them, but keep going (best effort)
        if subprocess.run(cmd).returncode != 0:
            print(f"Warning: inkscape failed for {stem}.svg")

    # Concatenate pdfs
    print("Concatenating pdfs")
    pwy_pdf_outdir = f"../artifacts/pwy_pdfs/{run_name}/"
    ensure_dirs(pwy_pdf_outdir)
    cat_pdf_fn = pwy_pdf_outdir + pair_name + '.pdf'
    # file_stems is already in rank order; a lexicographic sort would
    # misplace ranks >= 1000 ("1000_..." sorts before "101_...")
    individual_pwys = [f"{tmp_pdf_outdir}{stem}.pdf" for stem in file_stems]

    cmd = ["pdfunite", *individual_pwys, cat_pdf_fn]
    if subprocess.run(cmd).returncode != 0:
        print(f"Warning: pdfunite failed for {cat_pdf_fn}")

# Concatenate sheets into xlsx
if len(sheets) > 0:
    print("Saving xlsx")
    xls_outdir = "../artifacts/pwy_xls/"
    ensure_dirs(xls_outdir)
    # Context manager saves and closes the workbook; ExcelWriter.save() was
    # removed in pandas 2.0
    with pd.ExcelWriter(f"{xls_outdir}{run_name}.xlsx") as writer: # Arbitrary output name
        for sheetname, df in zip(sheetnames, sheets):
            df.to_excel(writer, sheet_name=sheetname)

# Remove intermediate artifacts (contents only; the directories themselves stay)
dirs = ['../artifacts/' + elt for elt in ['tmp_pdfs', 'rxn_svgs', 'pwy_svgs', 'mol_svgs']]
for elt in dirs:
    # shell=True is needed for the glob; pass the command as a single string
    subprocess.run(f"rm -r {elt}/*", shell=True)

('fumarate', 'mvacid') 2 paths with mdf > 0 and min mcs > 0.1
Generating svgs & csv
Generating pdfs


Gtk-Message: 11:53:10.920: Failed to load module "gail"
Gtk-Message: 11:53:10.920: Failed to load module "atk-bridge"








































Gtk-Message: 11:53:11.546: Failed to load module "gail"
Gtk-Message: 11:53:11.546: Failed to load module "atk-bridge"










































Concatenating pdfs
Saving xlsx


In [59]:
# not 'et' in ['l']

In [60]:
# # Single st_pair
# from rdkit import Chem
# from rdkit.Chem import AllChem
# st_pair = ('succinate', 'hopa')
# min_thresh = 0.05

# # TODO: write mdf getter that returns -np.inf to make this kind of logic easier 
# # Remove zero mins
# filtered_idxs = []
# for i in range(len(paths[st_pair])):
#     if paths[st_pair][i].mdf:
#         if (paths[st_pair][i].mdf > 0) & (paths[st_pair][i].min_mcs() > min_thresh):
#             filtered_idxs.append(i)

# # filtered_idxs = [i for i in range(len(paths[st_pair])) if (paths[st_pair][i].mdf > 0) & (paths[st_pair][i].min_mcs() > min_thresh)]

# if len(filtered_idxs) > 0:
#     # Get mean prc_mcs of non-zero min paths
#     mean_prc_mcs = []
#     for idx in filtered_idxs:
#         mean_prc_mcs.append(paths[st_pair][idx].mean_mcs())

#     mean_sorted_idxs, mean_prc_mcs = sort_x_by_y(filtered_idxs, mean_prc_mcs, reverse=True)
#     print(f"{st_pair} {len(filtered_idxs)} paths with mdf > 0 and min mcs > {min_thresh}")