In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout

from jasyntho import SynthTree
from jasyntho.extract import ExtractReaction


def extract_subgraph(graph, start_node):
    """Return a standalone copy of the part of ``graph`` reachable from ``start_node``.

    The node set comes from ``nx.bfs_tree(graph, start_node)``. The subgraph
    induced on those nodes is copied, so the result can be edited without
    touching ``graph``.
    """
    bfs_nodes = set(nx.bfs_tree(graph, start_node))
    induced_view = graph.subgraph(bfs_nodes)
    return induced_view.copy()


def plot_graph(G):
    """Draw ``G`` with a Graphviz ``dot`` layout and display the figure.

    Opens a new 10x7 matplotlib figure, positions the nodes with
    ``graphviz_layout(G, prog="dot")``, draws labelled nodes with arrows and
    calls ``plt.show()``. Nothing is returned.
    """
    # The figure handle is not needed afterwards; nx.draw is called without an
    # explicit axis, so no reference to the new figure is kept here.
    plt.figure(figsize=(10, 7))
    pos = graphviz_layout(G, prog="dot")
    nx.draw(G, pos, with_labels=True, arrows=True)
    plt.show()


def update_graph(G, new_edges, rm_edges, rm_nodes, names_mapping):
    """Apply a set of manual corrections to a graph and return the result.

    The corrections are applied in this order:

    1. relabel nodes with ``names_mapping`` (old label -> new label);
    2. add one edge per pair in ``new_edges``;
    3. remove one edge per pair in ``rm_edges``;
    4. remove every node listed in ``rm_nodes``;
    5. remove every key of ``SynthTree.get_reach_subgraphs`` whose subgraph
       contains exactly one node.

    Each pair is indexed as ``pair[0], pair[1]`` and passed in that order to
    ``add_edge`` / ``remove_edge``. The returned object is the graph produced
    by ``nx.relabel_nodes``, not the ``G`` argument itself.
    """
    curated = nx.relabel_nodes(G, names_mapping)

    # Edge edits: additions first, removals second.
    for pair in new_edges:
        curated.add_edge(pair[0], pair[1])
    for pair in rm_edges:
        curated.remove_edge(pair[0], pair[1])

    # Explicitly discarded nodes.
    for node in rm_nodes:
        curated.remove_node(node)

    # Drop heads whose reachable subgraph is a single node. The size is
    # checked at iteration time, exactly as the loop walks the mapping.
    for head, reachable in SynthTree.get_reach_subgraphs(curated).items():
        if len(reachable) != 1:
            continue
        curated.remove_node(head)
    return curated


async def extract_tree(path, model="gpt-3.5-turbo"):
    """Build a ``SynthTree`` from ``path`` and run reaction extraction on it.

    Args:
        path: Directory handed to ``SynthTree.from_dir``.
        model: Model name passed to ``ExtractReaction``.

    Returns:
        The tree, with ``rxn_extract``, ``raw_prods`` and ``products`` set and
        ``partition()`` already called on it.
    """
    tree = SynthTree.from_dir(path)
    tree.rxn_extract = ExtractReaction(model=model)

    tree.raw_prods = await tree.async_extract_rss(mode="vision")
    # Keep only the extracted products that are not empty.
    tree.products = [p for p in tree.raw_prods if not p.isempty()]

    # The return value of partition() was bound to an unused local before;
    # the call itself is kept. NOTE(review): presumably this is what populates
    # `tree.full_g`, which later cells read -- confirm against SynthTree.
    tree.partition()
    return tree

  from .autonotebook import tqdm as notebook_tqdm


# ja074300t

In [None]:
path = "../benchmark/papers/ja074300t"
tree = await extract_tree(path)

In [None]:
fg = tree.full_g.copy()

names_mapping = {
    "SI–1 2": "SI-1",
    "oxazolidinone-lactol 70": "70",
    "(Z)-114": "114",
    "Aldehyde": "SI-30",
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ("58", "(-)-36"),
    ("58", "57"),
    ("58", "DME"),
    ("57", "EtOAc"),
    ("98", "93"),
    ("98", "93"),
    ("98", "sodium chlorite"),
    ("98", "MeCN"),
    ("98", "sodium phosphate"),
    ("98", "TEMPO"),
    ("98", "MTBE"),
    ("98", "MTBE"),
    ("99", "98"),
    ("99", "MeI"),
    ("99", "potassium carbonate"),
    ("99", "DMF"),
    ("SI-17", "SI-16"),
    ("SI-17", "THF"),
    ("SI-17", "NH4Cl"),
    ("SI-17", "4-pentenyl-1-magnesium bromide"),
    ("101", "SI-17"),
    ("101", "DMSO"),
    ("101", "Et3N"),
    ("101", "oxalyl chloride"),
    ("SI-18", "101"),
    ("SI-18", "102"),
    ("SI-18", "N-potassiumhexamethyldisilazane"),
    ("103", "HF"),
    ("103", "SI-18"),
    ("113", "(E)-107"),
    ("113", "(Z)-107"),
    ("118", "117"),
    ("118", "116"),
    ("122", "SI-23"),
    ("123", "122"),
    ("SI-24", "123"),
    ("124", "SI-24"),
]

rm_edges = [
    ("101", "SI-16"),
    ("101", "Mg"),
    ("101", "5-bromo-1-pentene"),
]

rm_nodes = ["OOOTBSOTIPS1163", "TBSSI-223", "NNOTIPSOOOTBSSI-23", "17b"]

# gt head nodes
gthn = ["32", "75", "70", "100", "105", "114", "1", "131", "132", "SI-29"]

import os
import pickle

# Apply the manual corrections declared above before partitioning and saving.
# Without this call the mapping/edge/node lists are dead code and the pickle
# below would contain the raw extracted graph instead of the ground truth.
fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     plot_graph(g)
#     print(g.nodes)
# if len(g) > 1:
#     if not any(map(lambda x: x in g, gthn)):
#         plot_graph(g)
#         print(g.nodes)


with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

# jacs.0c00308

In [None]:
path = "../benchmark/papers/jacs.0c00308"
tree = await extract_tree(path)

In [None]:
import os

fg = tree.full_g.copy()

names_mapping = {
    "epoxide 25": "25",
    "cyclohexadienone 26": "26",
    "tetracycle 22": "22",
    "diol 24": "24",
    "diketone 13": "13",
    "enone 23": "23",
    "alcohol 27": "27",
    "spirolleycle 30": "30",
    "corresponding ketone": "18",
    "4,4-dimethyl cyclopentanone 37": "37",
    "Acetonide 38": "38",
    "tetracyclic diketones C7-deoxy-13": "C7-deoxy-13",
    "MOM ether S-6": "S-6",
    "vinylphenol C7-deoxy-14": "C7-deoxy-14",
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ("S-3", "IBX"),
    ("S-3", "DMSO"),
    ("S-3", "NaOH"),
    ("S-1", "16"),
    ("S-2", "BzOH"),
    ("S-2", "PPh3"),
    ("S-2", "DIAD"),
    ("S-2", "p-TsOH•H2O"),
    ("15", "H2NNHCONH2•HCl"),
    ("15", "Pb(OAc)4"),
    ("15", "Pd/BaSO4"),
    ("15", "SiO2"),
    ("C7-epi-18", "15"),
    ("C7-epi-18", "17"),
    ("C7-epi-18", "n-BuLi"),
    ("18", "C7-epi-18"),
    ("C7-deoxy-C8,C13-diepi-13", "C7-deoxy-14"),
    ("C7-deoxy-C8,C13-diepi-13", "HFIP"),
    ("C7-deoxy-C8,C13-diepi-13", "PIFA"),
    ("C7-deoxy-C8,C13-diepi-13", "Na2S2O3"),
    ("S-6", "S-5"),
    ("S-6", "NaHMDS"),
    ("37", "33"),
    ("39", "38"),
    ("S-10", "39"),
    ("S-9", "39"),
    ("41", "39"),
    ("7", "42"),
    ("7", "VO(acac)2"),
    ("7", "4A molecular sieve"),
    ("7", "TBHP"),
    ("7", "NaBH4"),
]

rm_edges = [
    ("S-2", "S-3"),
    ("S-1", "S-2"),
    ("S-1", "S-3"),
    ("S-3", "S-1"),
    ("15", "S-1"),
    ("S-6", "18"),
    ("38", "39"),
    ("7", "1"),
    ("7", "5"),
    ("7", "2"),
    ("7", "10"),
]

rm_nodes = ["17b", "9", "5", "4", "2", "10", "11", "P1", "18a", "S4", "8"]

# gt head nodes
gthn = [
    "C7-deoxy-13",
    "C7,C8,C13-triepi-13",
    "C7-deoxy-C8,C13-diepi-13",
    "C14-epi-27",
    "21",
    "30",
    "34",
    "35",
    "36",
    "S-8",
    "S-9",
    "S-10",
    "S-11",
    "46",
    "7",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     if len(g) > 1:
#         # if not any(map(lambda x: x in g, gthn)):
#             plot_graph(g)
#             print(g.nodes)


import pickle

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

# jacs.0c00363

In [5]:
path = "../benchmark/papers/jacs.0c00363"
tree = await extract_tree(path)

Finished processing batch. Cost: 0.019095
Finished processing batch. Cost: 0.02968
Finished processing batch. Cost: 0.030175000000000004
Finished processing batch. Cost: 0.030520000000000002
Finished processing batch. Cost: 0.03175
Finished processing batch. Cost: 0.030160000000000003
Finished processing batch. Cost: 0.031120000000000002
Finished processing batch. Cost: 0.030070000000000003
Finished processing batch. Cost: 0.029725
Finished processing batch. Cost: 0.031735000000000006
Finished processing batch. Cost: 0.031810000000000005
Finished processing batch. Cost: 0.035125
Finished processing batch. Cost: 0.031465
Finished processing batch. Cost: 0.03241
Finished processing batch. Cost: 0.03572500000000001
Finished processing batch. Cost: 0.029710000000000004
Finished processing batch. Cost: 0.030010000000000002
Finished processing batch. Cost: 0.041095000000000007
Finished processing batch. Cost: 0.046150000000000004
Finished processing batch. Cost: 0.047170000000000004
Finished

In [40]:
import os

fg = tree.full_g.copy()

names_mapping = {
    "Compound 12": "12",
    "Compound 13": "13",
    "Compound 16": "16",
    "Compound 24": "24",
    "Compound S4": "S4",
    "Compound 19": "19",
    "Compound 20": "20",
    "Compound S5": "S5",
    "Crude S3": "S3",
    "Mixture of 15a, 15b, and 15c": "15a",
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ("18", "17"),
    ("19", "18"),
    ("19", "LiAlH4"),
    ("19", "NaOH"),
    ("20", "19"),
    ("20", "NaHCO3"),
    ("20", "DMP"),
    ("20", "S4"),
    ("S5", "20"),
    ("S6", "S5"),
    ("10", "S1"),
    ("S2", "10"),
    ("11", "S2"),
    ("S3", "11"),
    ("12", "S3"),
    ("15b", "12"),
    ("15b", "n-C4F9SO2F"),
    ("15b", "DBU"),
    ("15c", "12"),
    ("15c", "n-C4F9SO2F"),
    ("15c", "DBU"),
]

rm_edges = [
    ("S4", "S2"),
    ("S4", "S3"),
    ("S5", "S3"),
    ("S5", "S1"),
    ("S2", "S1"),
    ("12", "11"),
]

rm_nodes = [
    "2",
    "3",
    "4",
    "Natural Propindilactone G",
    "Synthetic Propindilactone G",
    "Compound B",
    "Compound A",
    "17a",
    "P1",
    "18a",
]

# gt head nodes
gthn = [
    "1",
    "24",
    "15a",
    "15b",
    "15c",
    "13",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     if len(g) > 1:
#         if not any(map(lambda x: x in g, gthn)):
#             plot_graph(g)
#             print(g.nodes)

import pickle

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

6


# jacs.0c02143

In [21]:
import os
import pickle

path = "../benchmark/papers/jacs.0c02143"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [39]:

import os

fg = tree.copy()
names_mapping = {
    'compound 7': '7',
    'compound 6': '6',
    'compound 8':'8',
    '(+)-Waihoensene (1)':'1',
    '(±)-7':'7', 
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ("S4", "S3"),
    ("S4", "S7"),
    ("S7", "Mg"),
    ("S7", "S6"),
    ("S7", "I2"),
    ("S2", "S1"),
    ("S3", "S2"),
    ("S3", "PTSA"),
    ("S3", "i-BuOH"),
    ("S2", "S6"),
    ("S2", "HMPA"),
    ("S2", "HCl"),
    ('7','S4'), 
    ('7','CuBr·Me2S'), 
    ('7','MeLi'), 
    ('7','TMSCl'), 
    ('7','TBAF'), 
    ('11', '2-ethyl-2-methyl-3-dioxolane'),
    ('11', 'NaHCO3'),
    ('17', 'PhMe'),
    ('15', 'PhSiD3'),
    ('15', 'Fe(acac)3'),
    ('16', 'Fe(acac)3'),
    ('16', 'CD3CDOD'),

]

rm_edges = [
    ("S4", "S2"),
    ('7','S2'), 
    ('16','15'), 
    ('16','13'), 
    # ('17', '13'),

]

rm_nodes = [
    '4-1',
    'Unknown Reactants',
    'S5',
    'C8-epi-20',
    '17a'

]

# gt head nodes
gthn = [
    "1",
    "13-iso",
    "15",
    "16"
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     # if len(g) > 1:
#         # if not any(map(lambda x: x in g, gthn)):
#             plot_graph(g)
#             print(g.nodes)

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

4


# jacs.0c02513

In [40]:

path = "../benchmark/papers/jacs.0c02513"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [49]:
import os

fg = tree.copy()

# Extracted label -> canonical label for this paper's graph.
names_mapping = {

    'Vinyl Silane 20': '20',
    'Enone 5': '5',
    'Cyclobutane 21': '21',
    'Cyclobutanol 4a': '4a',
    'Scabrolide A': '1',
    # Target label must not carry a leading space, otherwise the node ends up
    # named ' VinylMgBr' and cannot be matched against 'VinylMgBr'.
    'Vinylmagnesium bromide (VinylMgBr)': 'VinylMgBr',
    'Epoxide S-1': 'S-1',
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ("4a", "epi-25"),
    ("21", "hv"),
    ('24', 'Cp2TiCl2'),
    ('24', '1,4-cyclohexadiene'),
    ('epi-24', 'Cp2TiCl2'),
    ('epi-24', '1,4-cyclohexadiene'),
    ("S-1", "hv"),
    ("5", "MeCN"),
]

rm_edges = [
]

rm_nodes = [
    "17b",
]

# gt head nodes
gthn = [
    "1",
    "21",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     # if len(g) > 1:
#         # if not any(map(lambda x: x in g, gthn)):
#             plot_graph(g)
#             print(g.nodes)

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

2


# jacs.0c10122

In [50]:

path = "../benchmark/papers/jacs.0c10122"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [96]:


import os

fg = tree.copy()

names_mapping = {
    'Compound 20':'20',
    'Compound 22':'22',
    'Compound 9':'9',
    'Compound 23':'23',
    'Compound 24':'24',
    'Compound 10':'10',
    'compound 25':'25',
    'compound 24':'24',
    'compound 7':'7',
    'compound 26':'26',
    'compound 15':'15',
    'compound 27':'27',
    'compound 8':'8',
    'compound 28': '28',
    'Compound 8': '8',
    'Compound 27': '27',
    'Compound 28': '28',
    'Compound 7': '7',
    'Compound 29': '29',
    '(+)-Haperforin G': '1',
    'BuOK': 't-BuOK',
    'DPPP': 'DPPF',
    '[Ir(ppy)₂(dtbbpy)]PF₆': '[Ir(ppy)2(dtbbpy)]PF6'
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ('12','18'),
    ('12', '19'),
    ('12', 'DCC'),
    ('12', 'DMAP'),
    ('11', '12'),
    ('20', '11'),
    ('S8', '20'),
    ('21', 'S8'),
    ('S9', '21'),
    ('10', 'S9'),
    ('22', '10'),
    ('9', '22'),
    ('23', '9'),
    ('24', '23'),
    ('S10', '23'),
    ('25', '24'),
    ('7', '25'),
    ('26', '16'),
    ('26', '17'),
    ('15', '26'),
    ('S11', '29'),
    ('1', 'S11'),
    ('S11', 'SOCl2'),
    ('S11', 'pyridine'),
    ('28', 'Blue LED'),
    ('26', 'diisopropenylzinc'),
    ('25', 'KHMDS'),
    ('24', 'KHMDS'),
    ('24', '18-crown-6'),
    ('24', 'CH3I'),
    ('24', 'NH4Cl'),
    ('S10', 'KHMDS'),
    ('S10', '18-crown-6'),
    ('S10', 'CH3I'),
    ('S10', 'NH4Cl'),
    ('23', 'TPAP'),
    ('23', 'NMO'),
    ('9', 'NaOH'),
    ('9', 'MeOH'),
    ('9', 'H2O2'),
    ('22', 'Co2(CO)8'),
    ('22', 'CO'),
    ('22', 'TBAF'),
    ('10', 'PPh3+CH3Br-'),
    ('10', 'nBuLi'),
    ('S9', 'TPAP'),
    ('S9', 'NMO'),
    ('S9', '4Å molecular sieve'),
]

rm_edges = [
    ('S11', '9'),
    ('1', '29'),
    ('S8', 'S9'),
    ('S10', '22'),
]

rm_nodes = [
    'unnamed main reactant',
    'Reactants and Reagents',
    'Unknown',
    'NaOH, H2O2',
    'KHMDs, toluene, 18-crown-6',
    'S12',

]

# gt head nodes
gthn = [
    "S6",
    "S7",
    "(±)-18",
    "(-)-18",
    'S10',
    '1'
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     # if len(g) > 1:
#         # if not any(map(lambda x: x in g, gthn)):
#             plot_graph(g)
#             print(g.nodes)

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

6


# jacs.1c00293

In [97]:

path = "../benchmark/papers/jacs.1c00293"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [142]:


import os

fg = tree.copy()

names_mapping = {
    'pentacyclic alcohol S4': 'S4',
    'exo-methyl pentacycle 22': '22',
    'deoxygenated pentacycle 23': '23',
    'cephanolide B (2)': '2',
    'iso-cephanolide B (2’)': '2’',
    'cephanolide C (3)': '3',
    'Trifluoromethanesulfonic anhydride (Tf2O)': 'Tf2O', 
    'Carbon monoxide (CO)': 'CO',
    'Triethylamine (Et3N)': 'Et3N', 
    'Sodium thiosulfate (Na2S2O3)':'Na2S2O3',
    'Potassium carbonate (K2CO3)': 'K2CO3',
    'Selenium dioxide (SeO2)': 'SeO2',
    'Dess-Martin periodinane (DMP)': 'DMP',
    'Palladium on carbon (Pd/C)': 'Pd/C',
    'Hydrogen (H2)': 'H2',
    '1,8-Diazabicyclo[5.4.0]undec-7-ene (DBU)': 'DBU',
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ('S2', 'S1'),
    ('S2', 'KH2PO4'),
    ('S2', 'P2O5'),
    ('16', 'S2'),
    ('16', 'Tf2O'),
    ('16', '2,4,6-collidine'),
    ('S8', '30'),
    ('S8', 'NaH'),
    ('S8', 'CS2'),
    ('S8', 'MeI'),
    ('31', 'S8'),
    ('31', 'AIBN'),
    ('31', 'Bu3SnH'),
    ('31', 'PhMe'),
    ('18', '14'),
    ('11', '18'),
    ('11', '16'),
    ('18', 'Pd(OAc)2'),
    ('18', 'DavePhos'),
    ('18', 'KF'),
    ('4', 'Pd(dppf)Cl2•DCM'),
    ('4', 'S6'),
    ('4', 'TBAF'),
    ('S5', '4Å molecular sieve'),
    ('S5', 'DCE'),
    ('1', 'HFIP'),
    ('29', '28'),
    ('29', 'NaBH4'),
    ('29', 'MeOH'),
    ('30', '29'),
    ('30', 'hv'),
    ('30', 'I2'),
    ('30', 'PIDA'),
    ('30', 'TBAF'),
    ('27', '26'),
    ('27', 'Pd/C'),
    ('27', 'H2'),
    ('28', 'DBU'),
    ('22', 'Pd/C'),
    ('22', 'H2'),
    ('21', 'Ti(Oi-Pr)2Cl2'),
    ('21', 'CH2(ZnBr)2'),
    ('18', '15'),
    ('2', 'HFIP'),
    ('2’', 'HFIP'),
]

rm_edges = [
    ('3','1'),
    ('11', '14'),
    ('11', 'KF'),
    ('11', 'DavePhos'),
    ('31', '30'),
    ('30', '28'),
    ('28', '26'),
    ('28', 'Pd/C'),
    ('28', 'H2'),
    ('11', '15'),
]

rm_nodes = [
    'product',
    'Palladium(II) bis(dibenzylideneacetone)',
    'Unknown starting material',
    'XX',
    'YY',
]

# gt head nodes
gthn = [
    "1",
    "2",
    "2’",
    "3",
    "4"
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     if '18' in g:
#     # if len(g) > 1:
#     #     if not any(map(lambda x: x in g, gthn)):
#             plot_graph(g)
#             print(g.nodes)

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

5


# jacs.3c01991 (TBD) -- too hard!

In [144]:

path = "../benchmark/papers/jacs.3c01991"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [None]:


import os

fg = tree.copy()

names_mapping = {
    'product 22': '22',
    'KB343 (1)': '1',
    'epi-KB343 (2)': '2',
    'Catalytic Pd/C': 'Pd/C',
    'Hydrogen gas': 'H2',
    'Trimethylphosphine (Me3P)': 'Me3P',
    ' Bu3P': 'Bu3P',
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    # ("15c", "DBU"),
]

rm_edges = [
    # ("S4", "S2"),
]

rm_nodes = [
    '4,6,2-(tert-Butyldimethylsilyloxy)-6-(tert-butoxycarbonylamino)-2-(methoxymethyl)aniline',
    '4-(tert-Butyldimethylsilyloxy)-6-(tert-butoxycarbonylamino)-2-(methoxymethyl)phenyl N-(benzyloxycarbonyl)methoxyamide',
    'Benzyllic Oxidation Product',
    'Guanidine Synthesis Product',
    'Formally Diaminated Product',
    'BocN Product',
    'R Rhodium Catalyst',
    'S1',
    # 'catalyst A',
    'aza-Michael',
    # 'Ruthenium(IV) oxide monohydrate',
    # 'Sodium Azide',
    'Pd/C, H2',
    # '(S,S)-Co(III)-Salen-SbF6',
    # "Rawal's diene",
    # 'cul',
    # 'Starting compound',
    'product',
    'catalyst',
    'reagent',
    # 'reagent X',
    # 'catalyst Y',
    # "18a",
]

# gt head nodes
gthn = [
    # "1",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

for g in reach_sgs.values():
    if len(g) > 1:
        if not any(map(lambda x: x in g, gthn)):
            plot_graph(g)
            print(g.nodes)

# with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
#     pickle.dump(fg, f)

# jacs.3c07019 (TBD) -- too hard!

In [153]:

path = "../benchmark/papers/jacs.3c07019"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [None]:


import os

fg = tree.copy()

names_mapping = {
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    # ("15c", "DBU"),
]

rm_edges = [
    # ("S4", "S2"),
]

rm_nodes = [
    # "18a",
]

# gt head nodes
gthn = [
    # "1",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

for g in reach_sgs.values():
    if len(g) > 1:
        if not any(map(lambda x: x in g, gthn)):
            plot_graph(g)
            print(g.nodes)

# with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
#     pickle.dump(fg, f)

# jacs.8b00148

In [157]:

path = "../benchmark/papers/jacs.8b00148"

with open(os.path.join(path, "extracted_graph_gpt4t_text_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [251]:


import os

fg = tree.copy()

names_mapping = {
 '19 2': '19',
 'zinc powder': 'Zn',
 'hydrogen': 'H2',
 '1,3,4,6-Tetra-O-acetyl-2-deoxy-2-trichloracetamido-β-D-glucopyranose':'S1',
 '2,2,6,6-tetramethylpiperidine 1-oxyl (TEMPO)': 'TEMPO',
 '(diacetoxyliodo) benzene (BAIB)': 'BAIB',
 'benzylglucuronate': '14',
 '10% Pd/C': 'Pd/C',
 'Boc2O': '(Boc)2O',
 '8’': '8\'',
 '35\uf061': '35',

#  '100',
#  '480',
#  '1.14',
#  'allyl alcohol',
#  'Boron trifluoride diethyl ether complex',
#  'benzaldehyde dimethyl acetal',
#  'sodium hydrogencarbonate',
#  'N-phenyl trifluoroacetimidoyl chloride',
#  'Phenyl 2-azido-3,4-di-O-acetyl-2-deoxy-1-seleno-\uf061',
#  '8’',
#  '35\uf061',
#  '10% Pd/C'
}

# a <- b reaction.  a is product, b is reactant
# Edges to add, as (product, reactant) pairs. Labels must be the
# post-relabelling names, because update_graph relabels before adding edges.
new_edges = [
    ('D-Glucosamine hydrochloride', '2-p-methoxybenzylidenamino-D-glucopyranose'),
    ('D-Glucosamine hydrochloride', 'anisaldehyde'),
    ('D-Glucosamine hydrochloride', 'NaOH'),
    ('2-deoxy-2-p-methoxybenzyliden-amino-1,3,4,6-tetra-O-acetyl--D-glucopyranoside', 'D-Glucosamine hydrochloride'),
    ('2-deoxy-2-p-methoxybenzyliden-amino-1,3,4,6-tetra-O-acetyl--D-glucopyranoside', 'Ac2O'),
    ('2-deoxy-2-p-methoxybenzyliden-amino-1,3,4,6-tetra-O-acetyl--D-glucopyranoside', 'DMAP'),
    ('2-amino-1,3,4,6-tetra-O-acetyl-2-deoxy--D-glucopyranosyl hydrochloride', 'acetone'),
    ('2-amino-1,3,4,6-tetra-O-acetyl-2-deoxy--D-glucopyranosyl hydrochloride', '2-deoxy-2-p-methoxybenzyliden-amino-1,3,4,6-tetra-O-acetyl--D-glucopyranoside'),
    ('S1', '2-amino-1,3,4,6-tetra-O-acetyl-2-deoxy--D-glucopyranosyl hydrochloride'),
    # NOTE(review): 'trichloroacetly' looks like a typo of 'trichloroacetyl';
    # left unchanged because it is a graph label -- confirm before renaming.
    ('S1', 'trichloroacetly chloride'),
    ('S1', 'Et3N'),
    ('S1', 'NaHCO3'),
    ('10', 'S1'),
    ('49', '44'),
    ('47', '44'),
    ('48', '44'),
    ('49', '39'),
    ('47', '39'),
    ('48', '39'),
    ('49', '46'),
    ('47', '46'),
    ('48', '46'),
    ('46', '44'),
    ('45', '44'),
    ('44', '43'),
    ('43', '40'),
    ('42', '41'),
    ('41', '40'),
    ('40', '36'),
    ('39', 'S8'),
    ('S8', '15'),
    ('7', '36'),
    ('36', '9'),
    # names_mapping renames the typographic-apostrophe label to "8'", so the
    # edge must use the ASCII form; the old spelling would add a new node.
    ('36', "8'"),
    ('15', '14'),
    ('3', '5'),
    ('54', '(R)-3-O-Benzylbutyric acid'),
    ('52', '47'), ('52', 'Ac2O'), ('52', 'DMAP'),
    ('47', 'DDQ'), ('47', '51'), ('52', 'H2O'),
    ('51', 'AW-200 MS'),
    ('41', '1,4-dioxane'),
    ('35', '34'),
    ('35', 'AcSH'),
    ('34', '33'),
    ('34', 'N-Bn-N-Cbz-3-aminopropan-1-ol'),
]

rm_edges = [
    ('14', '14'),
    ('54', '1'),
    ('54', 'oxalyl chloride'),
    ('52', '51'), ('52', 'DDQ'), 
    ('S1', '100'), ('S1', '480'), ('S1', '57'), ('S1', '1.14'), ('S1', '33'), ('S1', '61'),
    # ("S4", "S2"),
]

rm_nodes = [
    # "18a",
]

# gt head nodes
gthn = [
    '24',
    '2',
    '38',
    '42',
    '45',
    '48',
    '49',
    '47',


    # "1",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for k, g in reach_sgs.items():
#     # if '44' in g:
#     # if len(g) > 1:
#         # if not any(map(lambda x: x in g, gthn)):
#             print(k)
#             plot_graph(g)
#             print(g.nodes)

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

7


# jacs.7b13260

In [338]:

path = "../benchmark/papers/jacs.7b13260"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [340]:


import os

fg = tree.copy()

names_mapping = {
    'Magnesium': 'Mg',
    'Palladium(II) acetate': 'Pd(OAc)2',
    'Copper(I) iodide': 'CuI',
    'Trimethylsilyl chloride': 'TMSCl',
    'Sodium iodide':'NaI',
    'Potassium dihydrogen phosphate (K2HPO4)': 'K2HPO4',
    'Methylmagnesium chloride':'MeMgCl',
    'Pyridinium chlorochromate (PCC)': 'PCC',
    '10 (major)': '10',
    'Ethylmagnesium bromide':'EtMgBr',
    'Hydrochloric acid':'HCl',
    'TCCA (Trichloroisocyanuric acid)':'TCCA',
    'Cerium chloride (CeCl3)': 'CeCl3',
    'Tricycle 17':'17',
    'Aldehyde 14':'14',
    'Silyl enol ether 18':'18',
    'Ketone 19':'19',
    'palladium catalyst':'Pd',
    'triethylamine':'Et3N',
    'tert-Butyl hydroperoxide (TBHP)':'TBHP',
    'Copper(I) mesitylenesulfonate (Cu(MeCN)4OTf)':'Cu(MeCN)4OTf',
    "4-Methoxy-2,2'-bipyridine (4-Omebpy)":'4-Omebpy',
    'N-Methylimidazole (NMI)':'NMI',
    'Samarium(II) iodide (SmI2)':'SmI2',
    'Tetramethylsilane chloride (TMSCl)':'TMSCl',
    'Diisopropylethylamine (iPr2NEt)': 'iPr2NEt',
    'Chloromethyl methyl ether (MOMCl)':'MOMCl',
    'Enal':'5',
    'Hydrindanone':'12',
    'Z-59':'Z-S9',

}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ('12-epi-S5', 'iPr2NEt'),
    ('12-epi-14', '12-epi-S6'),
    ('12-epi-17','12-epi-14'),
    ('12-epi-18','12-epi-17'),
    ('26','SmI2'),
    ('26','H2O'),
    ('10','9'),
    ('S3','9'),
    ('S3','CuI'),
    ('S3','Isopropenylmagnesium bromide'),
    ('S1','4A molecular sieve'),
    ('11','S1'),
    ('11','6N HCl'),
    ('S4','S1'),
    ('S4','6N HCl'),
    ('S14','S13'),
    ('S14','[Cu]/bpy'),
    ('S14','NMI'),
    ('S14','ABNO'),
    ('S13','LiAlD4'),
    ('S13','S12'),
    ('S13','Et2O'),
    ('S12','14'),
    ('S12','NaClO2'),
    ('S12','KH2PO4'),
    ('S12','2-methyl-2-butene'),
    ('S12','tBuOH'),
    ('E-S9', 'S11'),
    ('E-S9', 'CuI'), ('E-S9', 'MeMgBr'),
    ('E-S9', 'S11'),
    ("S11", "NaOMe"),
    ("S11", "PhSH"),
    ("S11", "MeOH"),
    ("S5", "MOMCl"),
    ("S5", "iPr2NEt"),
    ("S5", "CH2Cl2"),
    ("Z-6", "B2(OH)4"),
    ("Z-6", "Pd(MeCN)4(BF4)2"),
    ("Z-S10", "DIBAL-H"),
    ("12-epi-20", "Li"),
    ("12-epi-20", "NH3"),
    ("12-epi-19", "Mn(dpm)3"),
    ("12-epi-19", "PhSiH3"),
    ("12-epi-19", "TBHP"),
    ("12-epi-18", "LiHMDS"),
    ("12-epi-18", "TIPSOTf"),
    ("12-epi-17", "SmI2"),
    ("12-epi-17", "H2O"),

    ("12-epi-14", "[Cu]/bpy"),
    ("12-epi-14", "NMI"),
    ("12-epi-14", "ABNO"),
    ("12-epi-S6", "HCO2H"),
    ('S8', 'LDA'),
    ('S8', 'EtOCOCl'),
    ('S7', 'TrtCl'),
    ('S7', 'Et3N'),
    ('S7', 'DMAP'),
    ('S7', '3-butyn-1-ol'),
    ('9','7'),
    ('9','8'),
    ('9','TMSCl'),
    ('9','Pd(OAc)2'),
    ('9','O2'),
    ('9', 'CuCN•2LiCl'),

]

rm_edges = [
    ("10", "10"),
    ('1','5'),
    ('21-d','21'),
    ('S14', 'S12'),
    ('S14', '14'),
    ('S14', 'NaClO2'), ('S14', 'K2HPO4'), 
    ('S14', '2-methyl-2-butene'),
    ('S14', 'LiAlD4'),
    ('13b','13a'),
    ('E-S10', 'S11'),
    ('E-S10', 'CuI'), ('E-S10', 'MeMgBr'),
    ('S8', 'EtO'),
    ('S8', 'OTrt'), ('S8', 'O'), ('S8', 'OTIPS')
]


rm_nodes = [
    '22c',
    '5b',
    '17a',
    '17b',
    '18b',
    'Unknown Starting Material (inferred based on Compound 11)',
    'Unknown Starting Material (inferred based on Compound S2 (dr mix))',
    'cyclopropane-containing compound with chlorine, hydroxyl, and methyl groups',
    '6-Methylspiro[4.5]decanone', '7-Chloro-1,6-dioxaspiro[4.4]nonan-4-one',
    '1-Ethoxycyclohex-1-ene', '7-ethoxy-1,6-dioxaspiro[4.4]nonane',
    '2-(2-Cyanomethyl)-1,3-dioxane', '2-(2-Bromomethyl)-1,3-dioxane',
    '7-Ethyl-1,6-dioxaspiro[4.4]non-4-ene', 'Cyclohexenone',
    'Dichlorocyclohexanone', '7-Ethoxy-1,6-dioxaspiro[4.4]nonan-4-one',
    'Unknown Starting Material (inferred based on Compound S4)',
    'SSF-III-75', 'SSF-III-67', 'SSF-III-37',
    'P1', 'S4',
    'SSF-III-78-characterization2.fid',
    'Cyclohexanone reagent', 'SSF-III-77-characterization2.1.fid',
    'cyclopropane-containing compound with chlorine and methyl groups',
    'compound with a cyclopropane ring and ketone group'
]

# gt head nodes
gthn = [
    "1",
    '12-epi-1',
    '21-d',
    '21',
    '11-epi-26',
    '26',
    'S3'
]


fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for g in reach_sgs.values():
#     # if len(g) > 1:
#         # if not any(map(lambda x: x in g, gthn)):
#             plot_graph(g)
#             print(g.nodes)

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

7


# jacs.7b09929

In [6]:
import os
import pickle

path = "../benchmark/papers/jacs.7b09929"

with open(os.path.join(path, "extracted_graph_gpt4t_text_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [16]:
import os

fg = tree.copy()

names_mapping = {
    'sodium hydride':'NaH',
    '909':'pyridine',
    'sodium hydroxide':'NaOH',
    'Triethylborane':'triethylborane',
    '(–)-Quadrigemine C':'7',
    '(–)-Hodgkinsine B': '3',
    '(–)-Hodgkinsine': '4',
    '(–)-Calycosidine': '5',
    '(–)-Psycholeine': '8',
    'Hodgkinsine B': '3',
    'Quadrigemine C':'7',
    'Psycholeine': '8',
    'Hodgkinsine': '4',
    'Calycosidine': '5',
    '2,6-di-tert-butyl-4-methylpyridine (DTBMP)':'DTBMP',
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    ('8', 'CH3COOH'),
    ('S5', '4-nitrophenyl (2-(trimethylsilyl)ethyl) carbonate'),
    ('5', 'CH3COOH'),
    ('42', 'S7'),
    ('42', 'hv'),
    ('S7', 'hv'),
    ('25', '23'),
    ('25', 'hydrazine'),
    ('25', '(diacetoxyiodo)benzene'),
]

rm_edges = [
    ('8', 'S11'), ('8', '19'), ('8', '20'), ('8', '23'), 
    ('27', '8'), ('34', '8'), ('39', '8'),
    ('S5', '4'),
    ('4', 'S11'), 
    ('5', 'S11'),
    ('3', 'S11'),
    ('4', '20'),
    ('4', '19'),
    ('5', '20'),
    ('5', '19'),
    ('30', '9'),
    ('26', '6'),
    ('3', '20'),
    ('3', '19'),
    ('7', '19'),
    ('7', '20'),
    ('7', '23'),
    ('3', '23'),
    ('4', '23'),
    ('5', '23'),

]

rm_nodes = [
    'cyclotryptamines',
    'NTeocNCO2MeH2NH',
]


# gt head nodes
gthn = [
    "5",
    "8",
    "3",
    "25",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

# for k, g in reach_sgs.items():
#     if len(g) > 1:
#         if not any(map(lambda x: x in g, gthn)):
#             print(k)
#             plot_graph(g)
#             print(g.nodes)

with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
    pickle.dump(fg, f)

4


# jacs.7b00807

In [19]:

path = "../benchmark/papers/jacs.7b00807"

with open(os.path.join(path, "extracted_graph_gpt4t_text_.pickle"), "rb") as f:
    tree = pickle.load(f)

In [None]:


import os

fg = tree.copy()

names_mapping = {
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    # ("15c", "DBU"),
]

rm_edges = [
    # ("S4", "S2"),
]

rm_nodes = [
    # "18a",
]

# gt head nodes
gthn = [
    # "1",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

for g in reach_sgs.values():
    if len(g) > 1:
        if not any(map(lambda x: x in g, gthn)):
            plot_graph(g)
            print(g.nodes)

# with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
#     pickle.dump(fg, f)

# jacs.9b12546

In [None]:

path = "../benchmark/papers/jacs.9b12546"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    fg = pickle.load(f)

In [None]:


import os

# The loader cell for this paper unpickles the graph straight into `fg`.
# `tree` at this point is still the object loaded for an earlier paper, so
# reading `tree.full_g` would use the wrong graph (or fail outright).
# Re-run the loader cell before re-running this one: `fg` is reassigned below.
fg = fg.copy()

names_mapping = {
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    # ("15c", "DBU"),
]

rm_edges = [
    # ("S4", "S2"),
]

rm_nodes = [
    # "18a",
]

# gt head nodes
gthn = [
    # "1",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

for g in reach_sgs.values():
    if len(g) > 1:
        if not any(map(lambda x: x in g, gthn)):
            plot_graph(g)
            print(g.nodes)

# with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
#     pickle.dump(fg, f)

# jacs.9b09699

In [None]:

path = "../benchmark/papers/jacs.9b09699"

with open(os.path.join(path, "extracted_graph_gpt4t_vision_.pickle"), "rb") as f:
    fg = pickle.load(f)

In [None]:


import os

# The loader cell for this paper unpickles the graph straight into `fg`.
# `tree` at this point is still the object loaded for an earlier paper, so
# reading `tree.full_g` would use the wrong graph (or fail outright).
# Re-run the loader cell before re-running this one: `fg` is reassigned below.
fg = fg.copy()

names_mapping = {
}

# a <- b reaction.  a is product, b is reactant
new_edges = [
    # ("15c", "DBU"),
]

rm_edges = [
    # ("S4", "S2"),
]

rm_nodes = [
    # "18a",
]

# gt head nodes
gthn = [
    # "1",
]

fg = update_graph(fg, new_edges, rm_edges, rm_nodes, names_mapping)

reach_sgs = SynthTree.get_reach_subgraphs(fg)
print(len(reach_sgs))

for g in reach_sgs.values():
    if len(g) > 1:
        if not any(map(lambda x: x in g, gthn)):
            plot_graph(g)
            print(g.nodes)

# with open(os.path.join(path, "gt_graph.pickle"), "wb") as f:
#     pickle.dump(fg, f)

Goal: annotate every paper up to this point. The papers listed below are still to do.

---

# jacs.9b05013

# jacs.8b13029

# jacs.8b06755

# jacs.8b03015

# jacs.7b11299

# jacs.7b08749

# jacs.7b07724

# jacs.7b06055

# jacs.7b01454

# jacs.6b07846

# jacs.2c13889

# jacs.2c12529

# jacs.2c06934

# jacs.0c10122 (already annotated above)