In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hydra import initialize, compose
from rdkit import Chem
from ergochemics.mapping import rc_to_nest, get_reaction_center
from ergochemics.draw import draw_reaction, draw_molecule
from minedatabase.pickaxe import Pickaxe
from DORA_XGB import DORA_XGB
from tqdm import tqdm

# Compose the Hydra "filepaths" config found under ../configs/filepaths.
# `cfg` supplies the data directories read by later cells
# (cfg.processed_data and cfg.interim_data).
with initialize(version_base=None, config_path="../configs/filepaths"):
    cfg = compose(config_name="filepaths")

In [14]:
# Load the compound-metrics table for the 2-step expansions from the
# processed-data directory and preview the first rows.
cpd_df = pd.read_parquet(Path(cfg.processed_data) / "2_steps_ccm_aa_aplusb_True_compound_metrics.parquet")
cpd_df.head()

Unnamed: 0,smiles,fan_out,gen,id,expansion
0,O=C(O)CCC(=O)O,73,0,C4fe83a533b208e245e45b74e0967d7b8e9aad361,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...
1,O=C(O)C=CC(=O)O,23,0,Cd2c8451fbe5563b9a086b59c26af17772205613a,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...
2,O=C(O)CC(O)C(=O)O,35,0,Cc522b49cc542f782eb7779a6c90ac1a255334bfa,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...
3,O=C(O)CC(=O)C(=O)O,51,0,C086e98dbf2ac460c91ac1a991badef07eb497a72,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...
4,O=C(O)CCC(=O)C(=O)O,26,0,Ce264bf1bbd482e436995855afdf2170ce7229f1d,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...


In [16]:
# Mean and standard deviation of `fan_out` for each (expansion, gen) pair.
cpd_df.groupby(["expansion", "gen"]).agg({"fan_out": ["mean", "std"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,fan_out,fan_out
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
expansion,gen,Unnamed: 2_level_2,Unnamed: 3_level_2
2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w_coreactants_aplusb_True.pk,0,86.724138,117.87157
2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w_coreactants_aplusb_True.pk,1,21.681499,15.475309
2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w_coreactants_aplusb_True.pk,0,119.310345,154.962231
2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w_coreactants_aplusb_True.pk,1,26.77707,20.730161
2_steps_ccm_aa_rules_rc_plus_3_rules_w_coreactants_aplusb_True.pk,0,13.758621,6.711691
2_steps_ccm_aa_rules_rc_plus_3_rules_w_coreactants_aplusb_True.pk,1,7.352941,3.603326
2_steps_ccm_aa_rules_rc_plus_4_rules_w_coreactants_aplusb_True.pk,0,13.137931,7.740082
2_steps_ccm_aa_rules_rc_plus_4_rules_w_coreactants_aplusb_True.pk,1,6.560386,3.122884


In [3]:
# DORA-XGB feasibility classifier using the 'add_concat' cofactor positioning.
# In this notebook it is only referenced by the commented-out feasibility
# calculation in the expansion-loading loop.
dxgb_ac = DORA_XGB.feasibility_classifier(cofactor_positioning='add_concat')

In [None]:
# Pickled Pickaxe expansions to load from the interim-data directory.
expansions = [
    "2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w_coreactants_aplusb_True.pk",
    "2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w_coreactants_aplusb_True.pk",
    "2_steps_ccm_aa_rules_rc_plus_3_rules_w_coreactants_aplusb_True.pk",
    "2_steps_ccm_aa_rules_rc_plus_4_rules_w_coreactants_aplusb_True.pk"
]

# Accumulator for per-expansion feasibility fractions. It stays empty while
# the scoring lines inside the loop remain commented out.
rxn_data = {"expansion": [], "fraction_dxgb_feasible": []}

# tqdm reads the total from len(expansions), so no explicit total is needed.
for exp in tqdm(expansions):
    # NOTE(review): load_pickled_pickaxe presumably unpickles the file;
    # only point this at trusted, locally generated expansions.
    pk = Pickaxe()
    pk.load_pickled_pickaxe(Path(cfg.interim_data) / exp)
    # TODO: feasibility scoring is disabled; calculate_dxgb_feas_frac is not
    # defined anywhere in this notebook.
    # feas_frac = calculate_dxgb_feas_frac(rxns=pk.reactions, dxgb=dxgb_ac)
    # rxn_data["expansion"].append(exp)
    # rxn_data["fraction_dxgb_feasible"].append(feas_frac)

# After the loop `pk` holds the LAST expansion in `expansions`; later cells
# that inspect `pk` operate on that object.

  0%|          | 0/4 [00:00<?, ?it/s]

----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading /home/stef/cgr/data/interim/2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w_coreactants_aplusb_True.pk pickled data.


 25%|██▌       | 1/4 [00:01<00:03,  1.16s/it]

Loaded 6259 compounds
Loaded 9809 reactions
Loaded 7335 operators
Loaded 71 coreactants
Loaded 2 generation
Took 1.1585707664489746
----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading /home/stef/cgr/data/interim/2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w_coreactants_aplusb_True.pk pickled data.


 50%|█████     | 2/4 [00:02<00:02,  1.27s/it]

Loaded 8066 compounds
Loaded 13245 reactions
Loaded 6236 operators
Loaded 70 coreactants
Loaded 2 generation
Took 0.9358837604522705
----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading /home/stef/cgr/data/interim/2_steps_ccm_aa_rules_rc_plus_3_rules_w_coreactants_aplusb_True.pk pickled data.


 75%|███████▌  | 3/4 [00:03<00:01,  1.23s/it]

Loaded 1373 compounds
Loaded 1881 reactions
Loaded 6351 operators
Loaded 74 coreactants
Loaded 2 generation
Took 0.8810744285583496
----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading /home/stef/cgr/data/interim/2_steps_ccm_aa_rules_rc_plus_4_rules_w_coreactants_aplusb_True.pk pickled data.


100%|██████████| 4/4 [00:05<00:00,  1.32s/it]

Loaded 1176 compounds
Loaded 1597 reactions
Loaded 7204 operators
Loaded 74 coreactants
Loaded 2 generation
Took 1.1998097896575928





Unnamed: 0,expansion,generation,fan_out,fan_out_std
0,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...,0,86.724138,115.821473
1,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...,1,21.681499,15.457177
2,2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w...,0,119.310345,152.26703
3,2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w...,1,26.77707,20.708142
4,2_steps_ccm_aa_rules_rc_plus_3_rules_w_coreact...,0,13.758621,6.594957


In [12]:
# Display the full `cpd_df` frame.
# NOTE(review): the saved output below has columns (expansion, generation,
# fan_out, fan_out_std), which differ from the schema shown by the
# `cpd_df.head()` cell at the top of the notebook. It appears to come from an
# earlier kernel state; confirm by re-running from a fresh kernel.
cpd_df

Unnamed: 0,expansion,generation,fan_out,fan_out_std
0,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...,0,86.724138,115.821473
1,2_steps_ccm_aa_rules_mechinferred_dt_2_rules_w...,1,21.681499,15.457177
2,2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w...,0,119.310345,152.26703
3,2_steps_ccm_aa_rules_mechinferred_dt_3_rules_w...,1,26.77707,20.708142
4,2_steps_ccm_aa_rules_rc_plus_3_rules_w_coreact...,0,13.758621,6.594957
5,2_steps_ccm_aa_rules_rc_plus_3_rules_w_coreact...,1,7.352941,3.595164
6,2_steps_ccm_aa_rules_rc_plus_4_rules_w_coreact...,0,13.137931,7.605461
7,2_steps_ccm_aa_rules_rc_plus_4_rules_w_coreact...,1,6.560386,3.115332


In [5]:
# rxn_df = pd.DataFrame(rxn_data)
# rxn_df.head()

In [17]:
# Inspect one reaction record (the key at position 6 of pk.reactions) to see
# the fields a reaction carries.
pk.reactions[list(pk.reactions.keys())[6]]

{'_id': 'R953be8145dab070edb7e9b51374151c1a8a599882693243281f0081daf7f4b8a',
 'Reactants': [(1, 'Xc9b0df14e690ee411edec0aad2286b732e431b96'),
  (1, 'X281e38d790b809dc908f7c9749d5274a6d8280c4')],
 'Products': [(1, 'X85753d94c72a1ae851d34ac6ac8b0c9828cfc734'),
  (1, 'X151d45ce062c088336cff9e45bc138b6a63626fe')],
 'Operators': {'991_2'},
 'SMILES_rxn': '(1) NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)C=CC1 + (1) *C1=C(*)C(=O)C(*)=C(*)C1=O => (1) NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)c1 + (1) *c1c(*)c(O)c(*)c(*)c1O',
 'Operator_aligned_smarts': '*C1=C(*)C(=O)C(*)=C(*)C1=O.NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)C=CC1>>*c1c(*)c(O)c(*)c(*)c1O.NC(=O)c1ccc[n+](C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)c1'}

In [18]:
# Look up the first reactant id listed in the reaction record displayed above.
pk.compounds['Xc9b0df14e690ee411edec0aad2286b732e431b96']

{'ID': 'Ubiquinones_CoF',
 '_id': 'Xc9b0df14e690ee411edec0aad2286b732e431b96',
 'SMILES': '*C1=C(*)C(=O)C(*)=C(*)C1=O',
 'InChI_key': '*C1=C(*)C(=O)C(*)=C(*)C1=O',
 'Type': 'Coreactant',
 'Generation': 0,
 'atom_count': Counter({'C': 6, 'O': 2}),
 'Expand': False,
 'Formula': 'C6*4O2',
 'last_tani': 0}

In [19]:
# Look up the second reactant id listed in the reaction record displayed above.
pk.compounds['X281e38d790b809dc908f7c9749d5274a6d8280c4']

{'ID': 'NADH_CoF',
 '_id': 'X281e38d790b809dc908f7c9749d5274a6d8280c4',
 'SMILES': 'NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4cnc5c(N)ncnc54)C(OP(=O)(O)O)C3O)C(O)C2O)C=CC1',
 'InChI_key': 'ACFIXJIJDZMPPO',
 'Type': 'Coreactant',
 'Generation': 0,
 'atom_count': Counter({'H': 30, 'C': 21, 'O': 17, 'N': 7, 'P': 3}),
 'Expand': False,
 'Formula': 'C21H30N7O17P3',
 'last_tani': 0}

In [13]:
# Inspect the compound stored under the last key of pk.compounds.
pk.compounds[list(pk.compounds.keys())[-1]]

{'ID': None,
 '_id': 'C7713642c8502d10c15ed646e480d558027f0aa65',
 'SMILES': 'O=CCc1c[nH]c2ccccc12',
 'InChI_key': 'WHOOUMGHGSPMGR-UHFFFAOYSA-N',
 'Type': 'Predicted',
 'Generation': 2,
 'atom_count': Counter({'C': 10, 'H': 9, 'N': 1, 'O': 1}),
 'Reactant_in': [],
 'Product_of': ['Rb05092315db9db1930a80d339cafb6ad66581031809e49f05b8bf95602061346'],
 'Expand': True,
 'Formula': 'C10H9NO',
 'last_tani': 0}

In [23]:
# Map compound id -> name for every compound whose "Type" begins with "Start".
starters = {
    cpd["_id"]: cpd["ID"]
    for cpd in pk.compounds.values()
    if cpd["Type"].startswith("Start")
}
starters

{'C4fe83a533b208e245e45b74e0967d7b8e9aad361': 'succinate',
 'Cd2c8451fbe5563b9a086b59c26af17772205613a': 'fumarate',
 'Cc522b49cc542f782eb7779a6c90ac1a255334bfa': 'malate',
 'C086e98dbf2ac460c91ac1a991badef07eb497a72': 'oxaloacetate',
 'Ce264bf1bbd482e436995855afdf2170ce7229f1d': 'ketoglutarate',
 'C7846bd0a566d52f5c266777c7322ec392a887a5d': 'citrate',
 'Cf7e036e330ad26caa66bb458faad18fa684af921': 'pyruvate',
 'C69bd7ab90d3f972909c038af169917ec6eb194ea': 'acetate',
 'C24f799b65f77e5bef0585b5d5138bb22868a8c16': 'alanine',
 'Cbf56a9b3278f03e28b7f543d3a085d30c2ec9cb9': 'arginine',
 'C4d9cf891ab210715f541dea6745e33e3035be0e8': 'asparagine',
 'Ca1d054c7d7e3878e86b47d369fade7983641c7f9': 'aspartic_acid',
 'Ce05a63be50137c094d61a4b4a198659d78b4c589': 'cysteine',
 'C65d64c67d323a06f303bafc48646dd8cc2aca598': 'glutamine',
 'C8302328f8b370d8cde446ba182c879b9a9d28b76': 'glycine',
 'C0ea9546187e45fbb47e0466c75193007351c34db': 'histidine',
 'C0b1a113de475b725c076698562e4e7b571a039a2': 'isoleucine',

In [27]:
def construct_network(half_expansion: "Pickaxe", starters: dict[str, str], return_edges: bool = False):
    '''
    Builds the nodes (and optionally the edges) of a bipartite directed
    graph of compounds and reactions, oriented in the direction of
    real-world synthesis.

    Args
    ----
    half_expansion
        Expansion object exposing `.compounds` and `.reactions` mappings
        (a loaded Pickaxe object in this notebook). Compound records must
        provide "SMILES", "InChI_key" and "Type"; reaction records must
        provide "Operators", "Reactants" and "Products", the latter two
        being lists of (stoichiometry, compound_id) pairs.
    starters
        Mapping of starter compound id -> name. Only membership of the
        ids is used here.
    return_edges
        If True, the edge list is returned as a third element. The default
        (False) keeps the original two-element return value.

    Returns
    -------
    (cpd_nodes, rxn_nodes) or (cpd_nodes, rxn_nodes, edge_list)
        cpd_nodes : list of (compound_id, attrs) for every compound whose
            id does not start with 'X' (coreactants are skipped).
        rxn_nodes : list of (reaction_id, attrs) for every reaction.
        edge_list : list of (source_id, target_id) pairs; only returned
            when `return_edges` is True.
    '''
    # Compound nodes: everything except 'X'-prefixed coreactants
    cpd_nodes = [
        (
            cpd_id,
            {
                "SMILES": cpd["SMILES"],
                "InChIKey": cpd["InChI_key"],
                "Type": cpd["Type"],
            },
        )
        for cpd_id, cpd in half_expansion.compounds.items()
        if cpd_id[0] != "X"
    ]

    rxn_nodes = []
    edge_list = []

    for rxn_id, rxn in half_expansion.reactions.items():
        rxn_nodes.append(
            (rxn_id, {"Rule": rxn["Operators"], "Type": "Reaction"})
        )

        if not return_edges:
            continue

        reactant_ids = [c_id for _, c_id in rxn["Reactants"]]

        # Neither starter (mass source) nor X coreactant
        # (non-mass-contributing source)
        non_sources = [
            c_id for c_id in reactant_ids
            if c_id[0] == 'C' and c_id not in starters
        ]

        if len(non_sources) == 1:
            # All other requirements of the reaction are sources, so the
            # single non-source reactant is the in-link
            edge_list.append((non_sources[0], rxn_id))
        elif len(non_sources) == 0:
            # Only sources are required: link from every starter reactant
            edge_list.extend(
                (c_id, rxn_id) for c_id in reactant_ids if c_id in starters
            )
        # More than one non-source reactant ("extended branching" in
        # synthesis paths) is currently unsupported: no in-link is added

        # Don't outlink to non-mass-carrying ('X') coproducts, in order to
        # approximately conserve mass along synthesis paths
        edge_list.extend(
            (rxn_id, c_id) for _, c_id in rxn["Products"] if c_id[0] != 'X'
        )

    if return_edges:
        return cpd_nodes, rxn_nodes, edge_list
    return cpd_nodes, rxn_nodes

# Baseline: the reaction nodes (index 1 of the returned tuple), which
# construct_network emits for every reaction without filtering.
before = construct_network(pk, starters)[1]

In [None]:
def filter_reactions(pk: "Pickaxe", starters: dict[str, str]):
    '''
    Selects the reactions that can be linked on both sides of a
    synthesis-ordered compound/reaction network.

    A reaction is kept when both conditions hold:
      * in-linked: at most one reactant is a 'C'-prefixed compound that
        is not a starter (more than one would require "extended
        branching", which is unsupported);
      * out-linked: at least one product is not an 'X'-prefixed
        coreactant.

    Args
    ----
    pk
        Expansion object exposing a `.reactions` mapping of
        reaction id -> record (a loaded Pickaxe object in this notebook).
        Records must provide "Reactants" and "Products" as lists of
        (stoichiometry, compound_id) pairs, plus "Operators".
    starters
        Mapping of starter compound id -> name. Only membership of the
        ids is used here.

    Returns
    -------
    list of (reaction_id, {"Rule": operators, "Type": "Reaction"}) for
    the retained reactions, in the iteration order of `pk.reactions`.
    '''
    rxn_nodes = []

    for rxn_id, rxn in pk.reactions.items():
        # Count reactants that are neither starters (mass sources) nor
        # X coreactants (non-mass-contributing sources)
        n_non_sources = sum(
            1 for _, c_id in rxn["Reactants"]
            if c_id[0] == 'C' and c_id not in starters
        )
        in_linked = n_non_sources <= 1

        out_linked = any(c_id[0] != 'X' for _, c_id in rxn["Products"])

        if in_linked and out_linked:
            rxn_nodes.append(
                (rxn_id, {"Rule": rxn["Operators"], "Type": "Reaction"})
            )

    return rxn_nodes

# Reaction nodes that survive filtering. This must call filter_reactions:
# construct_network returns a (cpd_nodes, rxn_nodes) tuple, which the
# id-extraction cell below cannot turn into a set of reaction ids.
after = filter_reactions(pk, starters)

In [30]:
# Reduce the (id, attrs) node tuples to sets of ids.
# NOTE: this rebinds `before`/`after`, so the cell is not idempotent.
# Re-running it without re-running the cells that build the node lists
# would index into the id strings instead.
before = {node[0] for node in before}
after = {node[0] for node in after}

In [31]:
# Ids present in `before` but missing from `after`.
before - after

{'R0d8049cd52e69e381c707a3b72e899448bb124b3524abae8ff5225cdffcad645',
 'R953be8145dab070edb7e9b51374151c1a8a599882693243281f0081daf7f4b8a',
 'Rb3aec664b8debc1052c418472dd77c3baa2d97f6525cdd689b2dacc7b2031fe1',
 'Rda8a35db9d42b06a4b1df754568d73428d818058a2748843e923a0a0e3e94653',
 'Reeca2cd1dc09365a6647088d721d47dc31b3134d1a1a25c49959c13ae85fd0fc'}

In [32]:
# Inspect one of the reaction ids listed in the set difference above.
pk.reactions['Reeca2cd1dc09365a6647088d721d47dc31b3134d1a1a25c49959c13ae85fd0fc']

{'_id': 'Reeca2cd1dc09365a6647088d721d47dc31b3134d1a1a25c49959c13ae85fd0fc',
 'Reactants': [(1, 'C7d84cb0b4995d01cf776bed183b74ab096417297'),
  (1, 'Xa2a868d3f3866d6bfbbe78b615e42f81511ebff8')],
 'Products': [(1, 'Xf4a7855630639e93324efbcfa9adb1b6a80e8b62')],
 'Operators': {'2328_0'},
 'SMILES_rxn': '(1) C=CC(=C)C + (1) O=P(O)(O)OP(=O)(O)O => (1) CC(C)=CCOP(=O)(O)OP(=O)(O)O',
 'Operator_aligned_smarts': 'C=CC(=C)C.O=P(O)(O)OP(=O)(O)O>>CC(C)=CCOP(=O)(O)OP(=O)(O)O'}

In [None]:
# Load the "test" compound-metrics table and preview it.
# NOTE(review): this rebinds `cpd_df`, replacing the frame loaded from the
# 2_steps_..._compound_metrics.parquet file earlier in the notebook.
cpd_df = pd.read_parquet(Path(cfg.processed_data) / "test_compound_metrics.parquet")
cpd_df.head()

In [None]:
# Load the "test" reaction-metrics table from the processed-data directory
# and preview it.
rxn_df = pd.read_parquet(Path(cfg.processed_data) / "test_reaction_metrics.parquet")
rxn_df.head()