In [60]:
%load_ext autoreload
%autoreload 2
from src.chem_draw import draw_rxn_svg, draw_pwy_svg
from src.utils import sort_x_by_y, ensure_dirs, load_json
from svgutils import compose as sc
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os
import subprocess
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [158]:
# Expansion parameters: starter set, target set, number of generations
starters = 'ccm_v0'
targets = 'mvacid'
generations = 4

# Directory holding the processed expansion pickles
expansion_dir = '../data/processed_expansions/'

# Expansion file name; shared suffix of the reactions and paths pickles
fn = f"{starters}_to_{targets}_gen_{generations}_tan_sample_1_n_samples_1000.pk"

# Locations of the predicted-reactions pickle and the paths pickle
rxns_path = f"{expansion_dir}predicted_reactions_{fn}"
paths_path = f"{expansion_dir}paths_{fn}"

In [159]:
# Unpickle the predicted reactions (pred_rxns) and the pathways (paths).
# NOTE: pickle.load can execute arbitrary code, so these files should only
# ever be ones produced by this project's own pipeline.
with open(rxns_path, 'rb') as rxns_file:
    pred_rxns = pickle.load(rxns_file)

with open(paths_path, 'rb') as paths_file:
    paths = pickle.load(paths_file)

In [160]:
# For every key of `paths`, report how many of its pathways have "full info":
# each reaction hash in the pathway maps to a predicted reaction with at
# least one known reaction whose first element is not None.

for pair, pair_paths in paths.items():
    n_full = 0
    for pwy in pair_paths:
        # One flag per reaction step. The list is built completely before
        # all() is applied, so every step's predicted reaction is looked up.
        step_flags = [
            any(known[0] is not None for known in pred_rxns[rhash].known_rxns)
            for rhash in pwy.rhashes
        ]
        n_full += int(all(step_flags))

    print(pair, f"{n_full} / {len(pair_paths)} paths w/ full info")


('fumarate', 'mvacid') 414 / 557 paths w/ full info
('succinate', 'mvacid') 12 / 17 paths w/ full info
('acetate', 'mvacid') 2 / 2 paths w/ full info
('pyruvate', 'mvacid') 3 / 3 paths w/ full info


In [161]:
# Have every predicted reaction order its own known reactions; the sort
# criterion lives in sort_known_rxns() (by average, per the original note,
# which also suggests the sort key could be changed)
for rxn in pred_rxns.values():
    rxn.sort_known_rxns()

In [162]:
# Ask each pathway to compute its mean prc mcs from the predicted
# reactions (per the original note: an average over known reactions)
for pair_paths in paths.values():
    for pwy in pair_paths:
        pwy.compute_mean_prc_mcs(pred_rxns)

In [163]:
# Read in uniprot id mappings from the BRENDA and MetaCyc json files
# TODO: move up in pipeline

brenda_uniprot = load_json('../data/mapping/brenda_uniprot.json')
mc_uniprot = load_json('../data/mapping/metacyc_uniprot.json')

# Keys present in both mappings; the count is printed as a sanity check
key_repeats = brenda_uniprot.keys() & mc_uniprot.keys()
print(len(key_repeats))

# Merge the two mappings; on a shared key the BRENDA entry overrides MetaCyc
uniprot = dict(mc_uniprot)
uniprot.update(brenda_uniprot)

0


In [164]:
# First filter by mdf and min prc_mcs then sort by mean prc_mcs.
#
# For every (starter, target) pair with at least one pathway passing the
# filter, this cell:
#   1. draws one svg per pathway, ranked by descending mean mcs, and collects
#      one table row per pathway (starter, target, mdf, mcs, uniprot ids);
#   2. converts the svgs drawn in this run to pdfs with inkscape;
#   3. concatenates those pdfs, in rank order, with pdfunite.
# Finally every per-pair table is written as a sheet of one xlsx workbook.
#
# Only files produced by this run are converted and concatenated, so stale
# svgs / pdfs left in the output directories by earlier runs (e.g. with a
# different threshold) cannot leak into the concatenated pdf.

min_thresh = 0.05
sheets = []
sheetnames = []

for st_pair in paths.keys():
    pair_paths = paths[st_pair]
    pair_label = '_'.join(st_pair)

    # TODO: write mdf getter that returns -np.inf to make this kind of logic easier
    # Keep pathways whose mdf is truthy (skips e.g. None) and positive, and
    # whose min mcs exceeds the threshold
    filtered_idxs = [
        i for i, path in enumerate(pair_paths)
        if path.mdf and path.mdf > 0 and path.min_mcs() > min_thresh
    ]

    if not filtered_idxs:
        continue

    # Rank the surviving pathways by mean prc_mcs, highest first
    mean_prc_mcs = [pair_paths[idx].mean_mcs() for idx in filtered_idxs]
    mean_sorted_idxs, mean_prc_mcs = sort_x_by_y(filtered_idxs, mean_prc_mcs, reverse=True)
    print(f"{st_pair} {len(filtered_idxs)} paths with mdf > 0 and min mcs > {min_thresh}")

    # Generate pwy svgs & csv
    print("Generating svgs & csv")
    headers = ['starter', 'target', 'mdf', 'mcs'] + [f"uniprot_{g+1}" for g in range(generations)]
    to_df = {k: [] for k in headers}
    pwy_svg_outdir = f"../artifacts/pwy_svgs/{fn[:-3]}/{pair_label}/"
    ensure_dirs(pwy_svg_outdir)

    svg_names = []  # svg file names drawn in this run, in rank order
    for i, pwy_idx in enumerate(mean_sorted_idxs):
        this_path = pair_paths[pwy_idx]
        this_rhashes = this_path.rhashes

        # svg (mainly): pair each predicted reaction with its first known reaction
        sma_hash_pairs = []
        for r, this_rhash in enumerate(this_rhashes):
            this_rxn = pred_rxns[this_rhash]
            first_known = this_rxn.known_rxns[0][1]
            sma_hash_pairs.append([(this_rxn.smarts, this_rhash), (first_known, hash(first_known))])

            # Uniprot ids of every known reaction that has a mapping
            # NOTE(review): entries are concatenated with no separator —
            # confirm whether a delimiter is intended
            this_uniprot = ''
            for kr in this_rxn.known_rxns:
                if kr[2] in uniprot:
                    this_uniprot += uniprot[kr[2]][-1]

            to_df[f"uniprot_{r+1}"].append(this_uniprot)  # Add uniprots to csv

        # Pathways can have fewer steps than `generations`; pad the unused
        # uniprot columns so every column keeps one entry per pathway
        for o in range(len(this_rhashes), generations):
            to_df[f"uniprot_{o+1}"].append('')

        svg_name = f"{i:03}_{pwy_idx}.svg"
        draw_pwy_svg(sma_hash_pairs, pwy_svg_outdir + svg_name)
        svg_names.append(svg_name)

        # Add info to csv
        to_df['starter'].append(st_pair[0])
        to_df['target'].append(st_pair[1])
        to_df['mdf'].append(this_path.mdf)
        to_df['mcs'].append(this_path.mean_mcs())

    # Make df for this st pair
    sheets.append(pd.DataFrame(to_df))
    sheetnames.append(pair_label)

    # Convert svgs to pdfs
    # NOTE(review): --export-pdf is the Inkscape 0.9x option; Inkscape 1.x
    # expects --export-filename — confirm against the installed version
    print("Generating pdfs")
    tmp_pdf_outdir = f"../artifacts/tmp_pdfs/{fn[:-3]}/{pair_label}/"
    ensure_dirs(tmp_pdf_outdir)
    individual_pwys = []  # pdfs successfully produced in this run, in rank order
    for svg_name in svg_names:
        pdf_path = f"{tmp_pdf_outdir}{svg_name[:-3]}pdf"
        cmd = ["inkscape", f"--export-pdf={pdf_path}", f"{pwy_svg_outdir}{svg_name}"]
        result = subprocess.run(cmd)
        if result.returncode == 0:
            individual_pwys.append(pdf_path)
        else:
            # Report the failure instead of silently ignoring it, but keep going
            print(f"WARNING: inkscape exited with code {result.returncode} for {svg_name}")

    # Concatenate pdfs
    print("Concatenating pdfs")
    pwy_pdf_outdir = f"../artifacts/pwy_pdfs/{fn[:-3]}/"
    ensure_dirs(pwy_pdf_outdir)
    cat_pdf_fn = pwy_pdf_outdir + pair_label + '.pdf'
    if individual_pwys:
        result = subprocess.run(["pdfunite", *individual_pwys, cat_pdf_fn])
        if result.returncode != 0:
            print(f"WARNING: pdfunite exited with code {result.returncode} for {cat_pdf_fn}")
    else:
        print(f"WARNING: no pdfs were generated for {st_pair}; nothing to concatenate")

# Concatenate sheets into xls
print("Saving xlsx")
if len(sheets) > 0:
    xls_outdir = "../artifacts/pwy_xls/"
    ensure_dirs(xls_outdir)
    # The context manager saves and closes the workbook on exit, including
    # on error; ExcelWriter.save() no longer exists in pandas >= 2.0
    with pd.ExcelWriter(f"{xls_outdir}{fn[:-3]}" + '.xlsx') as writer:  # Arbitrary output name
        for sheetname, df in zip(sheetnames, sheets):
            df.to_excel(writer, sheet_name=sheetname)

('fumarate', 'mvacid') 93 paths with mdf > 0 and min mcs > 0.05
Generating svgs & csv
Generating pdfs


Gtk-Message: 21:31:37.395: Failed to load module "gail"
Gtk-Message: 21:31:37.395: Failed to load module "atk-bridge"







































Gtk-Message: 21:31:38.586: Failed to load module "gail"
Gtk-Message: 21:31:38.586: Failed to load module "atk-bridge"







































Gtk-Message: 21:31:39.768: Failed to load module "gail"
Gtk-Message: 21:31:39.768: Failed to load module "atk-bridge"







































Gtk-Message: 21:31:41.321: Failed to load module "gail"
Gtk-Message: 21:31:41.322: Failed to load module "atk-bridge"







































Gtk-Message: 21:31:42.501: Failed to load module "gail"
Gtk-Message: 21:31:42.501: Failed to load module "atk-bridge"







































Gtk-Message: 21:31:43.627: Failed to load module "gail"
Gtk-Message: 21:31:43.627: Failed to load module "atk-bridge"







































Gtk-Message: 21:31:44.989: Failed to load module "gail"
Gt

Concatenating pdfs
('pyruvate', 'mvacid') 2 paths with mdf > 0 and min mcs > 0.05
Generating svgs & csv
Generating pdfs


Gtk-Message: 21:34:34.572: Failed to load module "gail"
Gtk-Message: 21:34:34.573: Failed to load module "atk-bridge"







































Gtk-Message: 21:34:35.884: Failed to load module "gail"
Gtk-Message: 21:34:35.884: Failed to load module "atk-bridge"









































Concatenating pdfs
Saving xlsx


In [165]:
# Concatenate sheets into xls
# Sheet names are taken from `sheetnames`, which is appended to together with
# `sheets`. Indexing paths.keys() by sheet position would mislabel sheets
# whenever a (starter, target) pair was skipped for having no pathway pass
# the filter, because `sheets` then has fewer entries than `paths`.
print("Saving xlsx")
if len(sheets) > 0:
    xls_outdir = "../artifacts/pwy_xls/"
    ensure_dirs(xls_outdir)
    # The context manager saves and closes the workbook on exit, including
    # on error; ExcelWriter.save() no longer exists in pandas >= 2.0
    with pd.ExcelWriter(f"{xls_outdir}{fn[:-3]}" + '.xlsx') as writer:  # Arbitrary output name
        for sheetname, df in zip(sheetnames, sheets):
            df.to_excel(writer, sheet_name=sheetname)

Saving xlsx
