In [15]:
%load_ext autoreload 
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
%load_ext watermark
%watermark -v -p numpy,pandas,scipy,scikit-learn,torch,rdkit,gpytorch,matplotlib,botorch,wandb


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.8.16
IPython version      : 8.8.0

numpy       : 1.23.5
pandas      : 1.5.3
scipy       : 1.10.1
scikit-learn: 1.2.2
torch       : 2.0.1
rdkit       : 2023.3.1
gpytorch    : 1.10
matplotlib  : 3.3.2
botorch     : 0.8.2.dev9+g7f3aa92f
wandb       : 0.15.3



In [17]:
from chaos.data.module import BaseDataModule, Featurizer
from chaos.bo.module import BoModule
from chaos.initialization.initializers import BOInitializer
from pytorch_lightning import seed_everything
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger

# CHAOS Tutorial

Welcome to the CHAOS tutorial! We will walk through the entire workflow of the CHAOS framework, from loading your dataset from a CSV file to running a Bayesian optimization loop that searches for the optimal experimental settings for your chemical reactions.

## Introduction

CHAOS, which stands for CHemical Additives Optimization Screening, is an open-source framework that leverages Bayesian optimization and machine learning to facilitate the optimization of chemical reactions. The objective of this tutorial is to provide a hands-on guide to using CHAOS for chemical reaction optimization.


In [18]:
## Data loading and featurization

In [19]:
# Reaction featurizer: "drfp" representation with 512 bits and bond radius 7.
featurizer = Featurizer(
    representation="drfp",
    task="reaction_optimization",
    nBits=512,
    bond_radius=7,
)

# Picks the initial set of reactions for the BO campaign.
# NOTE(review): no seed is set before this cell; if the "kmeans" selection is
# stochastic, the chosen reactions may differ between kernel runs -- confirm.
initializer = BOInitializer(
    method="kmeans",
    metric="jaccard",
    n_clusters=10,
    use_pca=10,
)

# Data module tying the CSV (input column "rxn", target column "objective")
# to the featurizer and initializer defined above.
dm = BaseDataModule(
    data_path="../data/additives/additive_rxn_screening_plate_1.csv",
    input_column="rxn",
    target_column="objective",
    featurizer=featurizer,
    initializer=initializer,
)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fb9615f8e50>
Traceback (most recent call last):
  File "/home/rankovic/miniconda3/envs/additive_bo/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/rankovic/miniconda3/envs/additive_bo/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/rankovic/miniconda3/envs/additive_bo/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/rankovic/miniconda3/envs/additive_bo/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback fun

using pca
[164, 305, 640, 110, 84, 705, 378, 121, 471, 606] selected reactions
Selected reactions: [164, 305, 640, 110, 84, 705, 378, 121, 471, 606]


In [None]:
## Setting up the surrogate model

In [20]:
# Surrogate-model specification. Every nested entry follows the same
# "class_path" (dotted import path) + "init_args" (constructor keywords) layout.
# NOTE(review): presumably BoModule instantiates these classes from the paths --
# confirm against chaos.bo.module.
model_config = dict(
    class_path="chaos.surrogate_models.gp.SimpleGP",
    init_args=dict(
        likelihood=dict(class_path="gpytorch.likelihoods.GaussianLikelihood"),
        # Matern kernel (nu = 0.5) wrapped in a ScaleKernel.
        covar_module=dict(
            class_path="gpytorch.kernels.ScaleKernel",
            init_args=dict(
                base_kernel=dict(
                    class_path="gpytorch.kernels.MaternKernel",
                    init_args=dict(eps=1e-6, nu=0.5),
                ),
                eps=1e-6,
            ),
        ),
        standardize=True,
        normalize=False,
        # Initial hyperparameter values and the noise constraint handed to SimpleGP.
        initial_noise_val=1e-4,
        noise_constraint=1e-5,
        initial_outputscale_val=2.0,
        initial_lengthscale_val=0.5,
    ),
)

In [None]:
## Setting up the BO loop

In [21]:
# Assemble the BO module from the data module and the surrogate-model spec;
# plotting and image logging are both switched on.
bo_module = BoModule(
    data=dm,
    model_config=model_config,
    beta=0.1,
    enable_plotting=True,
    enable_logging_images=True,
)

In [22]:
# Only attach a Weights & Biases logger when the BO module has plotting enabled.
if bo_module.enable_plotting:
    logger = WandbLogger(project="additives-rebuttal")
else:
    logger = None

# CPU-only Lightning trainer capped at 100 epochs (no step limit, no sanity
# validation pass), logging on every step.
trainer = Trainer(
    accelerator="cpu",
    devices=1,
    min_epochs=1,
    max_epochs=100,
    max_steps=-1,
    num_sanity_val_steps=0,
    log_every_n_steps=1,
    logger=logger,
)
trainer.fit(bo_module)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbojana-rankovic[0m ([33mliac[0m). Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
  rank_zero_warn(

  | Name | Type | Params
------------------------------
------------------------------
0         Trainable params
0         Non-trainable params
0         Total params
0.000     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


0,1
MAE_all,██▃▂▂▂▃▃▃▆▇▅▅▄▅▄▄▄▃▃▃▃▃▃▃▃▄▃▃▃▃▃▂▂▂▁▁▂▂▂
MAE_bottom_5,▁▂▄▅▅▅▅▅▅▇█▇▆▆▆▅▅▅▄▄▄▄▄▄▄▄▄▄▃▃▃▃▂▃▂▁▁▂▂▁
MAE_top_5,█▇▅▄▃▃▄▄▃▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▄▅▅▅▅▅▅
NLPD_all,█▃▁▁▃▄▂▃▄▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
NLPD_bottom_5,█▅▂▁▁▁▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁
NLPD_top_5,█▄▂▁▁▁▂▂▂▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃
R2_all,▇▇▇▇▆▆▇▆▆▂▁▃▄▄▄▅▅▄▆▆▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇█████
R2_bottom_5,█▇▆▆▅▄▄▄▄▂▁▂▄▃▃▄▄▄▆▆▆▅▅▅▅▅▅▅▅▅▅▅▅▅▅▆▅▅▄▇
R2_top_5,▆▆▇███████▇▇▇▇▇▆▆▆▆▆▄▄▄▃▃▃▃▃▃▂▃▂▂▂▂▁▁▁▁▁
average_similarity,▁▄▄▅▆▇▆▇▇███▇▇█▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆

0,1
MAE_all,19181.68047
MAE_bottom_5,31188.69941
MAE_top_5,29258.42342
NLPD_all,11.3559
NLPD_bottom_5,11.55767
NLPD_top_5,11.53732
R2_all,0.02613
R2_bottom_5,-39782.74121
R2_top_5,-36.64496
average_similarity,0.53831


In [None]:
## Running for multiple seeds

In [None]:
# Repeat the whole pipeline (featurization, initialization, BO run) once per
# seed, for seeds 1..20.
for seed in range(1, 21):
    # Seed with the loop variable. The previous constant `seed_everything(1)`
    # made every one of the 20 iterations run with the same seed.
    seed_everything(seed)
    featurizer = Featurizer(
        nBits=512, bond_radius=7, representation="drfp", task="reaction_optimization"
    )
    initializer = BOInitializer(
        method="kmeans", n_clusters=10, use_pca=10, metric="jaccard"
    )
    dm = BaseDataModule(
        data_path="../data/additives/additive_rxn_screening_plate_1.csv",
        input_column="rxn",
        target_column="objective",
        initializer=initializer,
        featurizer=featurizer,
    )
    bo_module = BoModule(
        data=dm,
        model_config=model_config,
        enable_plotting=True,
        enable_logging_images=True,
        beta=0.1,
    )
    # NOTE(review): a new WandbLogger is created per iteration; confirm each
    # seed ends up in its own W&B run rather than reusing a still-active one.
    logger = (
        WandbLogger(project="additives-rebuttal") if bo_module.enable_plotting else None
    )
    trainer = Trainer(
        max_epochs=100,
        logger=logger,
        log_every_n_steps=1,
        num_sanity_val_steps=0,
        min_epochs=1,
        max_steps=-1,
        accelerator="cpu",
        devices=1,
    )
    trainer.fit(bo_module)