In [9]:
# Enable autoreload so edits to imported project modules are picked up live.
# Re-running this cell in the same kernel prints an "already loaded" notice;
# `%reload_ext autoreload` is the variant that notice suggests.
%load_ext autoreload 
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Record the environment for reproducibility: the output lists the Python and
# IPython versions (-v) followed by the version of each package passed to -p.
%load_ext watermark
%watermark -v -p numpy,pandas,scipy,scikit-learn,torch,rdkit,gpytorch,matplotlib,botorch,wandb


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.8.16
IPython version      : 8.8.0

numpy       : 1.23.5
pandas      : 1.5.3
scipy       : 1.10.1
scikit-learn: 1.2.2
torch       : 2.0.1
rdkit       : 2023.3.1
gpytorch    : 1.10
matplotlib  : 3.3.2
botorch     : 0.8.2.dev9+g7f3aa92f
wandb       : 0.15.3



In [11]:
from chaos.data.module import BaseDataModule, Featurizer
from chaos.bo.module import BoModule
from chaos.initialization.initializers import BOInitializer
from pytorch_lightning import seed_everything
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger

To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
To use the Graphein submodule graphein.protein.visualisation, you need to install: pytorch3d 
To do so, use the following command: conda install -c pytorch3d pytorch3d


# CHAOS Tutorial

Welcome to the CHAOS tutorial! We will walk through the entire workflow of the CHAOS framework, from loading your dataset from a CSV file to running a Bayesian optimization loop that finds the optimal experimental settings for your chemical reactions.

## Introduction

CHAOS, which stands for CHemical Additives Optimization Screening, is an open-source framework that leverages Bayesian optimization and machine learning to facilitate the optimization of chemical reactions. The objective of this tutorial is to provide a hands-on guide to using CHAOS for chemical reaction optimization.


In [14]:
## Data loading and featurization

In [None]:
# Featurizer for the reaction inputs: "drfp" representation, 512 bits,
# bond radius 7, configured for the "reaction_optimization" task.
featurizer = Featurizer(
    task="reaction_optimization",
    representation="drfp",
    nBits=512,
    bond_radius=7,
)

# Initializer handed to the data module: k-means with 10 clusters, a PCA
# setting of 10 and the Jaccard metric.
# NOTE(review): presumably this selects the starting points of the BO run —
# confirm against BOInitializer.
initializer = BOInitializer(
    method="kmeans",
    n_clusters=10,
    use_pca=10,
    metric="jaccard",
)

# Data module reading the plate-1 additive screening CSV; the "rxn" column is
# the model input and "objective" is the optimization target.
dm = BaseDataModule(
    data_path="../data/additives/additive_rxn_screening_plate_1.csv",
    input_column="rxn",
    target_column="objective",
    featurizer=featurizer,
    initializer=initializer,
)

In [None]:
## Setting up the surrogate model

In [13]:
# The surrogate model is described declaratively: every component is a
# {"class_path": ..., "init_args": ...} mapping. The pieces are assembled
# bottom-up so each level of nesting has a name.

# Gaussian observation likelihood (no extra init arguments).
likelihood_config = {"class_path": "gpytorch.likelihoods.GaussianLikelihood"}

# Matern base kernel.
base_kernel_config = {
    "class_path": "gpytorch.kernels.MaternKernel",
    "init_args": {"eps": 1.0e-06},
}

# ScaleKernel wrapping the Matern base kernel.
covar_module_config = {
    "class_path": "gpytorch.kernels.ScaleKernel",
    "init_args": {
        "base_kernel": base_kernel_config,
        "eps": 1.0e-06,
    },
}

# Full surrogate configuration consumed by BoModule.
model_config = {
    "class_path": "chaos.surrogate_models.gp.SimpleGP",
    "init_args": {
        "likelihood": likelihood_config,
        "covar_module": covar_module_config,
        "standardize": True,
        "normalize": False,
        "initial_noise_val": 0.0001,
        "noise_constraint": 1.0e-05,
        "initial_outputscale_val": 2.0,
        "initial_lengthscale_val": 0.5,
    },
}

In [None]:
## Setting up the BO loop

In [None]:
# Build the BO module from the data module (`dm`) and surrogate configuration
# (`model_config`) defined in the cells above. It must be created here:
# otherwise `bo_module` only exists after the multi-seed loop further down has
# run, and this cell raises NameError on a fresh kernel (Restart & Run All).
bo_module = BoModule(
    data=dm,
    model_config=model_config,
    enable_plotting=True,
    enable_logging_images=True,
    beta=0.1,
)

# Log to Weights & Biases only when plotting is enabled; otherwise `logger`
# is None.
logger = (
    WandbLogger(project="additives-rebuttal") if bo_module.enable_plotting else None
)

# Single-device CPU run of at most 100 epochs, logging every step and
# skipping the validation sanity check.
trainer = Trainer(
    max_epochs=100,
    logger=logger,
    log_every_n_steps=1,
    num_sanity_val_steps=0,
    min_epochs=1,
    max_steps=-1,
    accelerator="cpu",
    devices=1,
)
trainer.fit(bo_module)

In [None]:
## Running for multiple seeds

In [None]:
# Repeat the complete BO experiment once per seed (1..20). Everything that may
# draw random numbers is rebuilt inside the loop so that each repetition
# starts from the freshly seeded state.
for seed in range(1, 21):
    # Seed with the loop variable so each repetition is a distinct,
    # reproducible run (a constant seed would make all 20 runs share one seed).
    seed_everything(seed)

    featurizer = Featurizer(
        nBits=512, bond_radius=7, representation="drfp", task="reaction_optimization"
    )
    initializer = BOInitializer(
        method="kmeans", n_clusters=10, use_pca=10, metric="jaccard"
    )
    dm = BaseDataModule(
        data_path="../data/additives/additive_rxn_screening_plate_1.csv",
        input_column="rxn",
        target_column="objective",
        initializer=initializer,
        featurizer=featurizer,
    )
    bo_module = BoModule(
        data=dm,
        model_config=model_config,
        enable_plotting=True,
        enable_logging_images=True,
        beta=0.1,
    )

    # Log to Weights & Biases only when plotting is enabled.
    logger = (
        WandbLogger(project="additives-rebuttal") if bo_module.enable_plotting else None
    )
    trainer = Trainer(
        max_epochs=100,
        logger=logger,
        log_every_n_steps=1,
        num_sanity_val_steps=0,
        min_epochs=1,
        max_steps=-1,
        accelerator="cpu",
        devices=1,
    )
    trainer.fit(bo_module)

    # Close this seed's W&B run so the next WandbLogger starts a new run
    # instead of reusing the one that is still active in this process.
    if logger is not None:
        logger.experiment.finish()