In [1]:
import json
import pathlib
from datetime import date
from ssr import SMC
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# Read the base experiment settings and pin the model variant used for this run.
with open("./setting.json") as settings_file:
    variables = json.load(settings_file)
variables["model_name"] = "SMC-RECUR_GTM"
main_path = variables["main_path"]

# Settings layered on top of setting.json. Paths are built by plain string
# concatenation, so main_path is expected to already end with a separator.
variables_addition = dict(
    initial_pool_path=main_path + "data/pool.csv",
    gtm_path=main_path + "model/GTM/enamine_gtm",
    reactor_model_path=main_path + "model/molecular_transformer.pt",
    # Fixed bookkeeping columns; per-step and per-property columns are
    # appended further down.
    sample_column=[
        "reactant",
        "product",
        "reactant_index",
        "ll",
        "surrogate_ll",
        "c_label",
        "freq",
        "target",
        "surrogate_target",
        "is_new",
    ],
    surrogate_forward_model_path=dict(
        qed=main_path + "model/QSPR/reactant_qed",
        logp=main_path + "model/QSPR/reactant_logp",
    ),
    forward_model_path=dict(
        qed=main_path + "model/QSPR/product_qed",
        logp=main_path + "model/QSPR/product_logp",
    ),
    unique_col="reactant_index",
    # NOTE(review): this is the *string* "False", which is truthy in Python --
    # confirm how the consumer interprets it.
    black="False",
)
variables.update(variables_addition)

# Stamp the run with today's date and bump the configured n_r by one.
variables["experiment_date"] = date.today().isoformat()
variables["n_r"] += 1

# Per-step columns: r<k>, r<k>_id, r<k>_gen for every step k, plus p<k-1>
# for every step after the first.
sample_columns = variables["sample_column"]
for step in range(1, variables["n_r"] + 1):
    sample_columns += ["r" + str(step), "r" + str(step) + "_id", "r" + str(step) + "_gen"]
    if step > 1:
        sample_columns.append("p" + str(step - 1))

# Per-property columns: the property itself and its surrogate counterpart.
for property_name in variables["y_list"]:
    sample_columns.extend((property_name, "surrogate_" + property_name))

# Evaluated left to right: (-1 / generation_threshold) * 2.
# NOTE(review): confirm this precedence is the intended formula.
variables["generation_temperature"] = (-1 / variables["generation_threshold"]) * 2

# Settings overrides for every model name that is matched exactly.
# Key order mirrors the order in which the flags are written into `variables`.
_FIXED_MODEL_FLAGS = {
    "SMC-RECUR_GTM": {
        "SMC_reactor": True,
        "surrogate": False,
        "SMC_enrich": True,
        "reactor_enrich": False,
        "optimization": True,
    },
    "SMC-RECUR_GTM_SR_PL": {
        "SMC_reactor": False,
        "surrogate": True,
        "SMC_enrich": False,
        "reactor_enrich": True,
        "optimization": True,
    },
    "Random": {
        "SMC_reactor": True,
        "surrogate": False,
        "SMC_enrich": False,
        "reactor_enrich": False,
        "p_exploitation": 0,
        "optimization": False,
    },
    "Random_RECUR": {
        "SMC_reactor": True,
        "surrogate": False,
        "SMC_enrich": True,
        "reactor_enrich": False,
        "p_exploitation": 0,
        "optimization": False,
    },
}

# Prefix of the parametrised family "SMC_<n>", where <n> overrides n_r.
_SMC_PREFIX = "SMC_"


def _model_flags(model_name):
    """Return the settings overrides selected by ``model_name``.

    Args:
        model_name: One of the keys of ``_FIXED_MODEL_FLAGS``, or
            ``"SMC_<n>"`` where ``<n>`` is a non-negative integer written
            with one or more decimal digits.

    Returns:
        A new dict of overrides to merge into ``variables``. For ``"SMC_<n>"``
        it also carries ``"n_r": <n>``.

    Raises:
        ValueError: If ``model_name`` matches none of the supported names.
    """
    if model_name in _FIXED_MODEL_FLAGS:
        # Copy so the preset table is never shared with the live settings.
        return dict(_FIXED_MODEL_FLAGS[model_name])

    # Accept any number of digits after the prefix (the previous check only
    # matched a single trailing character) and reject non-numeric suffixes
    # here instead of letting int() fail with an unrelated message.
    suffix = model_name[len(_SMC_PREFIX):]
    if model_name.startswith(_SMC_PREFIX) and suffix.isdecimal():
        return {
            "SMC_reactor": True,
            "surrogate": False,
            "SMC_enrich": False,
            "reactor_enrich": False,
            # NOTE(review): "sample_column" was already expanded above using
            # the n_r from setting.json (+1); this override does not
            # regenerate those columns -- confirm that is intended.
            "n_r": int(suffix),
            "optimization": True,
        }

    raise ValueError("wrong model name: {}".format(model_name))


variables.update(_model_flags(variables["model_name"]))

# Results are grouped as <main_path>results/<experiment_date>/<model_name>.
model_result_path = "".join(
    (
        main_path,
        "results/",
        variables["experiment_date"],
        "/",
        variables["model_name"],
    )
)
variables["model_df_path"] = model_result_path + "/dfs/"

# The warehouse and pool folders (and their empty "key" files) are only
# prepared when reactor enrichment is switched on.
if variables["reactor_enrich"]:
    warehouse_dir = model_result_path + "/warehouse"
    variables["warehouse"] = warehouse_dir
    variables["warehouse_key"] = warehouse_dir + "/key"
    pathlib.Path(warehouse_dir).mkdir(parents=True, exist_ok=True)
    # Opening in append mode creates the file if missing and leaves any
    # existing content untouched.
    with open(variables["warehouse_key"], "a"):
        pass

    pool_dir = model_result_path + "/pool_folder"
    variables["pool_folder"] = pool_dir
    variables["pool_folder_key"] = pool_dir + "/key"
    variables["pool_updated_marker"] = pool_dir + "/updated_marker"
    variables["pool_updated_path"] = pool_dir + "/updated_pool.csv"
    pathlib.Path(pool_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(pool_dir + "/addition").mkdir(parents=True, exist_ok=True)
    with open(variables["pool_folder_key"], "a"):
        pass


In [2]:
# Load the starting reactant pool from the CSV configured in the settings
# and report how many rows it contains.
initial_reactant_pool = pd.read_csv(variables["initial_pool_path"])
initial_pool_size = len(initial_reactant_pool)
print("SMC: initial pool size: {}".format(initial_pool_size))

SMC: initial pool size: 150549


In [4]:
import random
class dummy_MolecularTransformer:
    """Stand-in reactor that needs no trained model.

    For each input string it returns one of the string's own
    "."-separated components, chosen at random.
    """

    def react(self, reactant_list):
        """Pick one "."-separated component from every entry.

        Args:
            reactant_list: Iterable of strings (presumably dot-separated
                SMILES of the reactants -- confirm against the caller).

        Returns:
            A list with one randomly chosen component per input entry,
            in input order.
        """
        product_list = []
        for reactant_string in reactant_list:
            components = reactant_string.split(".")
            product_list.append(random.choice(components))
        return product_list

In [5]:
# Use the random stand-in reactor defined in this notebook.
reactor = dummy_MolecularTransformer()

In [6]:
# Build the SMC object from the assembled settings, the initial reactant pool
# and the reactor.
SMC_instance = SMC(variables=variables, pool=initial_reactant_pool, reactor=reactor)


In [7]:
# Start the run by calling the SMC object. The recorded output below ends in a
# KeyboardInterrupt, i.e. this execution was stopped manually.
SMC_instance()

QSPR by RDKit


RDKit ERROR: [17:38:18] Explicit valence for atom # 17 O, 3, is greater than permitted
RDKit ERROR: [17:38:18] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 16 17 28 29 30
RDKit ERROR: 
RDKit ERROR: [17:38:18] SMILES Parse Error: extra open parentheses for input: 'C#CCCCCCN(Cc1nc(CC(C)C)no1)C(C(=O)NC[C@@H](CC(=O)O)CC(C)C'
RDKit ERROR: [17:38:18] SMILES Parse Error: extra close parentheses while parsing: C[C@H]1CNC(=O)C2(CC=CCN3CC4CC(C3)S4(=O)=O)CCCC2)C1
RDKit ERROR: [17:38:18] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: no atoms
RDKit ERROR: Violation occurred on line 179 in file /home/conda/feedstock_root/build_artifacts/rdkit_1588588730082/work/Code/GraphMol/ROMol.cpp
RDKit ERROR: Failed Expression: getNumAtoms() > 0
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [17:38:18] SMILES Parse Error: extra open parentheses for input: 'Cc1ccc(NC(=O)c2ccc(NC(=O)c3ccc(COC(=O)c4ccc5c(c4)C(CNC(=O)OCC4c5ccccc5-c5ccccc54)cc3)nc2)cc1'
RDKit ERROR: [17:38:18] C

QSPR by RDKit


RDKit ERROR: [17:38:38] Can't kekulize mol.  Unkekulized atoms: 1 2 9
RDKit ERROR: 
RDKit ERROR: [17:38:38] SMILES Parse Error: unclosed ring for input: 'CCc1ccc2ccnc(NCc3cccc(C(=O)NCC4(O)C5C6CC6C7CC6C7C6C6C5C65)c3)c2c1'
RDKit ERROR: [17:38:38] SMILES Parse Error: unclosed ring for input: 'COc1cc2c(Oc3ccc4c(c3)OC3(CCC(C)CC3)CC4(O)c3ccc(O)cc3O4)ccnc2cc1F'


SMC: pool size 150549 -> 150741
SMC: Step 0, target 50, new target 50, max rgen 1
QSPR by RDKit


RDKit ERROR: [17:38:59] SMILES Parse Error: unclosed ring for input: 'CC(C)(C)OC(=O)N1c2ccccc2C(NC(=S)N(C)C)C12CC3CC(CC(C3)C1)C2'
RDKit ERROR: [17:38:59] SMILES Parse Error: extra close parentheses while parsing: CC1(C)Cn2c(C3CCCN(S(=O)(=O)c4ccccc4)C3)nc3c(c2=O)CCC3)O1
RDKit ERROR: [17:38:59] 
RDKit ERROR: 
RDKit ERROR: ****
RDKit ERROR: Pre-condition Violation
RDKit ERROR: no atoms
RDKit ERROR: Violation occurred on line 179 in file /home/conda/feedstock_root/build_artifacts/rdkit_1588588730082/work/Code/GraphMol/ROMol.cpp
RDKit ERROR: Failed Expression: getNumAtoms() > 0
RDKit ERROR: ****
RDKit ERROR: 
RDKit ERROR: [17:38:59] SMILES Parse Error: unclosed ring for input: 'CN(CCCC12CCC(c3ccccc31)c1ccc(OC(F)(F)F)c(Cl)c1)C(=O)N1CC(C)(C2CC2)C1'
RDKit ERROR: [17:38:59] Can't kekulize mol.  Unkekulized atoms: 22 23 27 37 38
RDKit ERROR: 
RDKit ERROR: [17:38:59] SMILES Parse Error: unclosed ring for input: 'CNS(=O)(=O)c1cccc(N2C(=O)CN(C(C)CN3CCOCC3)C3CSC3)c1'


SMC: pool size 150741 -> 150923
SMC: Step 1, target 62, new target 60, max rgen 1


KeyboardInterrupt: 