In [33]:
from pathlib import Path
from tqdm import tqdm
from hydra import compose, initialize
import polars as pl
from minedatabase.pickaxe import Pickaxe
import json

# Compose the Hydra "filepaths" config. Later cells read cfg.known,
# cfg.raw_expansions and cfg.artifacts from the resulting object.
with initialize(version_base=None, config_path="../conf/filepaths"):
    cfg = compose(config_name="filepaths")

In [27]:
# Table of known compounds; the expansion scan below matches on its
# "smiles" column and takes display names from its "names" column.
known_compounds_path = Path(cfg.known) / "known_compounds.parquet"
kcs = pl.read_parquet(known_compounds_path)
print(kcs.height)
kcs.head()

8603


id,smiles,names,n_atoms
str,str,list[str],i32
"""0""","""*""","[""A""]",1
"""1""","""**""","[""RX""]",2
"""2""","""*C""","[""an alkane""]",2
"""3""","""*C#N""","[""a nitrile""]",3
"""4""","""*C(*)(N)C(=O)O""","[""2,2-dialkylglycine""]",7


In [28]:
# Pickled Pickaxe expansions (file names under cfg.raw_expansions) to scan
# for compounds that also appear in the known-compounds table `kcs`.
exp_names = [
    "3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk",
    "4_steps_bottle_targets_24_to_None_rules_mechinferred_dt_2_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk",
    "4_steps_bottle_targets_24_to_None_rules_mechinformed_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk",
]

# Build the SMILES -> display-name lookup once, so that each expansion compound
# costs a single dict lookup instead of a scan (plus a filter) over `kcs`.
# The first row for a given SMILES wins, and the display name is the first
# entry of that row's "names" list (None when the list is empty or null).
known_name_by_smiles = {}
for known_row in kcs.iter_rows(named=True):
    known_smiles = known_row["smiles"]
    if known_smiles is None or known_smiles in known_name_by_smiles:
        continue
    known_names = known_row["names"]
    known_name_by_smiles[known_smiles] = known_names[0] if known_names else None

# SMILES -> record with parallel lists: one (expansion, expansion_id,
# generation) triple per occurrence of the compound across the expansions.
known_expansion_compounds = {}
for exp_name in exp_names:
    pk = Pickaxe()
    # NOTE(review): this loads pickled data; only point it at trusted files.
    pk.load_pickled_pickaxe(
        Path(cfg.raw_expansions) / exp_name,
    )
    for cpd in tqdm(pk.compounds.values(), total=len(pk.compounds)):
        # Skip compounds whose id starts with 'X'
        # (presumably coreactants in Pickaxe's id scheme -- TODO confirm).
        if cpd["_id"].startswith('X'):
            continue

        smiles = cpd["SMILES"]
        if smiles not in known_name_by_smiles:
            continue

        record = known_expansion_compounds.setdefault(
            smiles,
            {
                "smiles": smiles,
                "name": known_name_by_smiles[smiles],
                "expansion": [],
                "expansion_id": [],
                "generation": [],
            },
        )
        record["expansion"].append(exp_name)
        record["expansion_id"].append(cpd["_id"])
        record["generation"].append(cpd["Generation"])

----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading /home/stef/quest_data/bottle/results/raw_expansions/3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk pickled data.




Loaded 50528 compounds
Loaded 95325 reactions
Loaded 1878 operators
Loaded 48 coreactants
Loaded 3 generation
Took 0.5814247131347656


100%|██████████| 50528/50528 [00:21<00:00, 2304.65it/s]


----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading /home/stef/quest_data/bottle/results/raw_expansions/4_steps_bottle_targets_24_to_None_rules_mechinferred_dt_2_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk pickled data.




Loaded 68894 compounds
Loaded 136655 reactions
Loaded 6507 operators
Loaded 48 coreactants
Loaded 4 generation
Took 1.0403213500976562


100%|██████████| 68894/68894 [00:28<00:00, 2401.03it/s]


----------------------------------------
Intializing pickaxe object

Done intializing pickaxe object
----------------------------------------

Loading /home/stef/quest_data/bottle/results/raw_expansions/4_steps_bottle_targets_24_to_None_rules_mechinformed_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk pickled data.




Loaded 684308 compounds
Loaded 1621728 reactions
Loaded 640 operators
Loaded 46 coreactants
Loaded 4 generation
Took 12.33701229095459


100%|██████████| 684308/684308 [04:37<00:00, 2465.28it/s] 


In [30]:
# One row per known compound found in at least one expansion; the list-valued
# fields of each record become list columns.
expansion_records = list(known_expansion_compounds.values())
known_exp_cpds = pl.from_dicts(expansion_records)
known_exp_cpds.head()

smiles,name,expansion,expansion_id,generation
str,str,list[str],list[str],list[i64]
"""OCC1OC(O)C(O)C(O)C1O""","""D-glucose""","[""3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk"", ""4_steps_bottle_targets_24_to_None_rules_mechinferred_dt_2_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk"", ""4_steps_bottle_targets_24_to_None_rules_mechinformed_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk""]","[""C9ab1a08d72c90a8167d1f3a668d8f1138e534a07"", ""C9ab1a08d72c90a8167d1f3a668d8f1138e534a07"", ""C9ab1a08d72c90a8167d1f3a668d8f1138e534a07""]","[1, 1, 1]"
"""O=CO""","""formate""","[""3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk"", ""4_steps_bottle_targets_24_to_None_rules_mechinferred_dt_2_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk"", ""4_steps_bottle_targets_24_to_None_rules_mechinformed_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk""]","[""C0e2c8d649fd28524ed617a741b4da1c714899166"", ""C0e2c8d649fd28524ed617a741b4da1c714899166"", ""C0e2c8d649fd28524ed617a741b4da1c714899166""]","[1, 1, 3]"
"""C=CC(=C)C""","""isoprene""","[""3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk"", ""4_steps_bottle_targets_24_to_None_rules_mechinferred_dt_2_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk""]","[""C7d84cb0b4995d01cf776bed183b74ab096417297"", ""C7d84cb0b4995d01cf776bed183b74ab096417297""]","[1, 1]"
"""CC(C)C(=O)O""","""2-methylpropanoate""","[""3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk""]","[""C2d94449d2035580e3a2de661386c29c111c7bf71""]",[1]
"""CCC(C)CC(=O)O""","""3-methylpentanoate""","[""3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk""]","[""C84d80f64c8534e7ac682f6e3a7e8d5f2810782a3""]",[1]


In [31]:
# Number of distinct known SMILES that appeared in any of the expansions.
n_known_in_expansions = len(known_expansion_compounds)
print(f"{n_known_in_expansions} known compounds in expansion compounds")

177 known compounds in expansion compounds


In [32]:
# Persist the matched compounds as a parquet file named after this target set.
# NOTE(review): the destination is a hard-coded absolute local path; consider
# routing it through the filepaths config like the other paths in this notebook.
name = "bottle_targets_24_retro"
out_path = f"/home/stef/krxns/data/raw/{name}.parquet"
known_exp_cpds.write_parquet(out_path)

In [None]:
# For each expansion, map expansion compound id -> known compound name.
# `expansion` and `expansion_id` are parallel lists within each row.
overrides = {exp_name: {} for exp_name in exp_names}
for row in known_exp_cpds.iter_rows(named=True):
    for exp_name, exp_id in zip(row["expansion"], row["expansion_id"]):
        overrides[exp_name][exp_id] = row["name"]

# Create the output directory up front so that open(..., "w") does not fail
# when it has not been created yet.
overrides_dir = Path(cfg.artifacts) / "st_overrides"
overrides_dir.mkdir(parents=True, exist_ok=True)

# Write one JSON file per expansion, named after the pickle minus ".pk".
for exp_name, id_to_name in overrides.items():
    fn = exp_name.removesuffix(".pk")
    print(f"{exp_name}: {len(id_to_name)} compounds")

    with open(overrides_dir / f"{fn}.json", "w") as f:
        json.dump(id_to_name, f)

3_steps_bottle_targets_24_to_None_rules_mechinferred_dt_98_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk: 137 compounds
4_steps_bottle_targets_24_to_None_rules_mechinferred_dt_2_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk: 82 compounds
4_steps_bottle_targets_24_to_None_rules_mechinformed_rules_w_coreactants_co_metacyc_coreactants_sampled_False_pruned_False_aplusb_True.pk: 66 compounds


: 