In [69]:
%load_ext autoreload
%autoreload 2
from src.chem_draw import draw_pwy_svg
from src.utils import ensure_dirs
import pickle
import subprocess
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
def thresholded_uniprot_strings(path_enzymes, enzyme_validation_threshold):
    """Build one comma-separated UniProt ID string per reaction of a path.

    Args:
        path_enzymes: Iterable with one entry per reaction. Each entry is
            either a collection of enzyme objects exposing ``uniprot_id``
            and ``validation_score``, or a falsy value (empty / ``None``).
        enzyme_validation_threshold: Inclusive lower bound on
            ``validation_score`` for an enzyme to be listed.

    Returns:
        A list aligned with ``path_enzymes``. Falsy entries map to ``None``;
        every other entry maps to the ", "-joined IDs of the enzymes that
        meet the threshold (an empty string when none of them do).
    """
    def _join_passing_ids(enzymes):
        passing_ids = (
            enzyme.uniprot_id
            for enzyme in enzymes
            if enzyme.validation_score >= enzyme_validation_threshold
        )
        return ", ".join(passing_ids)

    return [
        _join_passing_ids(enzymes) if enzymes else None
        for enzymes in path_enzymes
    ]

In [71]:
# Params: these three values select which expansion file is loaded
starters = '2mg'
targets = 'mvacid'
generations = 2

expansion_dir = '../data/processed_expansions/'
# Expansion file name
fn = f"{starters}_to_{targets}_gen_{generations}_tan_sample_1_n_samples_1000.pkl"

# Load processed expansions
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open(f"{expansion_dir}{fn}", 'rb') as pkl_file:
    pe = pickle.load(pkl_file)


In [72]:
# Show which (starter, target) pairs the loaded expansion holds
print(pe.starter_target_pairs)

{('2mg', 'mvacid')}


In [100]:
# Starter / target pair whose pathways are pulled from the expansion
starter = '2mg'
target = 'mvacid'

# Also reused later as the cutoff for which UniProt IDs get listed per reaction
enzyme_validation_threshold = 0.9

# NOTE(review): presumably sort keys in priority order and per-key lower
# bounds, respectively -- confirm against pe.get_paths_w_st
sort_by = ['enzyme_validation', 'prc_mcs']
filter_by = {'mdf':0, 'enzyme_validation':enzyme_validation_threshold}

paths = pe.get_paths_w_st(
    starter=starter,
    target=target,
    sort_by=sort_by,
    filter_by=filter_by,
    reduce_predicted_reactions='min',
)

# Number of paths returned for this pair
print(len(paths))

3


In [101]:
# Export every selected path for this starter-target pair as:
#   * one SVG per path (later converted to PDF and merged into one PDF), and
#   * one spreadsheet sheet with a row per path.
st_pair = (starter, target)
st_label = "_".join(st_pair)

# Expansion file name without its extension. Slicing off the last three
# characters (fn[:-3]) only removed "pkl" and left the "." behind, which
# produced directories ending in "." and a workbook named "<name>..xlsx".
expansion_name = fn.rsplit(".", 1)[0]

sheets = []
sheetnames = []

# Generate pwy svgs & csv
print("Generating svgs & csv")
headers = (
    ['id', 'starter', 'target', 'mdf', 'pwy_ave_mcs']
    + [f"mcs_rxn_{g+1}" for g in range(generations)]
    + [f"uniprot_rxn_{g+1}" for g in range(generations)]
)
pwy_svg_outdir = f"../artifacts/pwy_svgs/{expansion_name}/{st_label}/"
ensure_dirs(pwy_svg_outdir)

rows_for_sheet = []
for path in paths:
    up_strings = thresholded_uniprot_strings(pe.get_path_enzymes(path), enzyme_validation_threshold)
    prc_mcs = pe.get_path_prc_mcs(path) # Top-analogue prc_mcs for each predicted reaction in path
    n_rxns = len(path.reaction_ids)

    # Average prc_mcs over the path; None rather than a ZeroDivisionError
    # if a path reports no values.
    ave_prc_mcs = sum(prc_mcs) / len(prc_mcs) if len(prc_mcs) > 0 else None

    # One row per path. Paths shorter than `generations` are padded with None
    # so every row lines up with `headers`.
    row = [path.id, path.starter, path.target, path.mdf, ave_prc_mcs]
    row += [prc_mcs[g] if g < n_rxns else None for g in range(generations)]
    row += [up_strings[g] if g < n_rxns else None for g in range(generations)]

    # Draw pathway svg: each predicted reaction is paired with its top analogue
    sma_hash_pairs = []
    for prid in path.reaction_ids:
        pr = pe.predicted_reactions[prid]
        analogue = pr.top_analogue()['analogue']
        sma_hash_pairs.append([(pr.smarts, prid), (analogue.smarts, analogue.id)])

    outpath = pwy_svg_outdir + f"{path.id:04}.svg"
    draw_pwy_svg(sma_hash_pairs, path.id, outpath)

    rows_for_sheet.append(row)

# Make df for this st pair
sheets.append(pd.DataFrame(rows_for_sheet, columns=headers).set_index('id'))
sheetnames.append(st_label)

# Convert svgs to pdfs
print("Generating pdfs")
tmp_pdf_outdir = f"../artifacts/tmp_pdfs/{expansion_name}/{st_label}/"
ensure_dirs(tmp_pdf_outdir)
for path in paths:
    # NOTE(review): --export-pdf is the pre-1.0 Inkscape flag; newer releases
    # use --export-filename. Confirm against the installed version.
    cmd = ["inkscape", f"--export-pdf={tmp_pdf_outdir}{path.id:04}.pdf", f"{pwy_svg_outdir}{path.id:04}.svg"]
    result = subprocess.run(cmd)
    if result.returncode != 0:
        # Report the failure instead of ignoring it; the run continues as before.
        print(f"WARNING: inkscape exited with code {result.returncode} for path {path.id}")

# Concatenate pdfs
print("Concatenating pdfs")
pwy_pdf_outdir = f"../artifacts/pwy_pdfs/{expansion_name}/"
ensure_dirs(pwy_pdf_outdir)
cat_pdf_fn = pwy_pdf_outdir + st_label + '.pdf'
individual_pwys = [f"{tmp_pdf_outdir}{path.id:04}.pdf" for path in paths]

cmd = ["pdfunite", *individual_pwys, cat_pdf_fn]
result = subprocess.run(cmd)
if result.returncode != 0:
    print(f"WARNING: pdfunite exited with code {result.returncode}; {cat_pdf_fn} may be missing")

# Concatenate sheets into xls
print("Saving xlsx")
pwy_xls_outdir = "../artifacts/pwy_xls/"
ensure_dirs(pwy_xls_outdir) # Same treatment as the other output directories
# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas >= 2.0.
with pd.ExcelWriter(f"{pwy_xls_outdir}{expansion_name}" + '.xlsx') as writer: # Arbitrary output name
    # The loop variable stays named `df` because the next cell previews it.
    for i, df in enumerate(sheets):
        df.to_excel(writer, sheet_name=sheetnames[i])

# Remove stuff
# NOTE: this empties the intermediate artifact directories entirely,
# including the SVGs generated above and any left by earlier runs.
dirs = ['../artifacts/' + elt for elt in ['tmp_pdfs', 'rxn_svgs', 'pwy_svgs', 'mol_svgs']]
for elt in dirs:
    subprocess.run([f"rm -r {elt}/*"], shell=True)

Generating svgs & csv
Generating pdfs


Failed to get connection
** (org.inkscape.Inkscape:294784): CRITICAL **: 11:21:19.161: dbus_g_proxy_new_for_name: assertion 'connection != NULL' failed

** (org.inkscape.Inkscape:294784): CRITICAL **: 11:21:19.161: dbus_g_proxy_call: assertion 'DBUS_IS_G_PROXY (proxy)' failed

** (org.inkscape.Inkscape:294784): CRITICAL **: 11:21:19.161: dbus_g_connection_register_g_object: assertion 'connection != NULL' failed
Failed to get connection
** (org.inkscape.Inkscape:294807): CRITICAL **: 11:21:19.564: dbus_g_proxy_new_for_name: assertion 'connection != NULL' failed

** (org.inkscape.Inkscape:294807): CRITICAL **: 11:21:19.564: dbus_g_proxy_call: assertion 'DBUS_IS_G_PROXY (proxy)' failed

** (org.inkscape.Inkscape:294807): CRITICAL **: 11:21:19.564: dbus_g_connection_register_g_object: assertion 'connection != NULL' failed
Failed to get connection
** (org.inkscape.Inkscape:294831): CRITICAL **: 11:21:19.936: dbus_g_proxy_new_for_name: assertion 'connection != NULL' failed

** (org.inkscape.

Concatenating pdfs
Saving xlsx


In [102]:
# Preview the most recently built sheet. Reading it from `sheets` avoids
# depending on a loop variable that leaked out of the export cell.
sheets[-1].head()

Unnamed: 0_level_0,starter,target,mdf,pwy_ave_mcs,mcs_rxn_1,mcs_rxn_2,uniprot_rxn_1,uniprot_rxn_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7,2mg,mvacid,55.004919,0.0,0.0,0.0,"Q7WKM5, A3Q5P5, A1SMQ5, Q2S3H9, A1BE51, Q5LCW4...",Q8WVX9
15,2mg,mvacid,9.351731,0.0,0.0,0.0,"Q9JID6, P33121, P97524, O14975, P18163, P41216...",Q8WVX9
2,2mg,mvacid,45.789952,0.355392,0.710784,0.0,B3EY95,Q8WVX9
