In [38]:
import os
import datamol as dm
import logging
import pandas as pd
import random
import torch
import numpy as np

from pathlib import Path
from sklearn.ensemble import GradientBoostingRegressor


path_train = Path("../data/train/raw/data.csv")
path_test = Path("../data/test/raw/data.csv")
train = pd.read_csv(path_train)[:4]
test = pd.read_csv(path_test)

In [39]:
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    logging.info(f"🐙 Seeding everything with: {seed}")

In [40]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="seyonec/ChemBERTa-zinc-base-v1")

Device set to use cuda:0


In [41]:
train["X"] = train["CXSMILES"]
test["X"] = test["CXSMILES"]
train.drop(columns=["CXSMILES"], inplace=True)
test.drop(columns=["CXSMILES"], inplace=True)
train.head()

Unnamed: 0,pIC50 (MERS-CoV Mpro),pIC50 (SARS-CoV-2 Mpro),X
0,4.1,5.58,O=C(CC1=CN=CC2=CC=CC=C12)N1C[C@H](F)C[C@H]1C1=...
1,4.5,,O=C(N[C@@H]1C[C@H](O)C2=CC=CC=C21)C1=CN=CC2=CC...
2,6.02,7.04,O=C(CN1CC2=CC=C(Cl)C=C2[C@H](C(=O)NC2=CN=CC3=C...
3,,4.82,O=C(CC1=CN=CC2=CC=CC=C12)N1[C@H]2CCC[C@H]1C1=C...


In [42]:
# Use ChemBERTa to featurize the SMILES strings
X_train_dl = np.array([pipe(x)[0][0] for x in train.X])
X_test_dl = np.array([pipe(x)[0][0] for x in test.X])

y_pred_dl_train = {}
y_pred_dl_test = {}

target_cols = {"pIC50 (SARS-CoV-2 Mpro)", "pIC50 (MERS-CoV Mpro)"}

# For each of the targets...
for tgt in target_cols:

    # We get the training targets
    # Note that we need to mask out NaNs since the multi-task matrix is sparse.
    y_true = train[tgt]
    mask = ~np.isnan(y_true)

    # We'll train a simple baseline model
    model_dl = GradientBoostingRegressor()
    model_dl.fit(X_train_dl[mask], y_true[mask])

    # And then use that to predict the targets for both train and test sets
    y_pred_dl_train[tgt] = model_dl.predict(X_train_dl)
    y_pred_dl_test[tgt] = model_dl.predict(X_test_dl)

In [43]:
# Prepare the input data. We'll use Datamol to compute the ECFP fingerprints for both the train and test columns.
X_train = np.array([dm.to_fp(dm.to_mol(smi)) for smi in train.X])
X_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test.X])

y_pred_train = {}
y_pred_test = {}
target_cols = {"pIC50 (SARS-CoV-2 Mpro)", "pIC50 (MERS-CoV Mpro)"}

# For each of the targets...
for tgt in target_cols:

    # We get the training targets
    # Note that we need to mask out NaNs since the multi-task matrix is sparse.
    y_true = train[tgt]
    mask = ~np.isnan(y_true)

    # We'll train a simple baseline model
    model = GradientBoostingRegressor()
    model.fit(X_train[mask], y_true[mask])

    # And then use that to predict the targets for both train and test set
    y_pred_train[tgt] = model.predict(X_train)
    y_pred_test[tgt] = model.predict(X_test)

In [44]:
import sys

sys.path.append("../")

from evaluation import eval_potency

targets_train = {
    "pIC50 (SARS-CoV-2 Mpro)": train["pIC50 (SARS-CoV-2 Mpro)"],
    "pIC50 (MERS-CoV Mpro)": train["pIC50 (MERS-CoV Mpro)"],
}

targets_test = {
    "pIC50 (SARS-CoV-2 Mpro)": test["pIC50 (SARS-CoV-2 Mpro)"],
    "pIC50 (MERS-CoV Mpro)": test["pIC50 (MERS-CoV Mpro)"],
}

eval_dl = eval_potency(y_pred_dl_train, targets_train)
eval_base = eval_potency(y_pred_train, targets_train)
eval_dl_test = eval_potency(y_pred_dl_test, targets_test)
eval_base_test = eval_potency(y_pred_test, targets_test)

In [45]:
from pprint import pprint

print("ChemBERTa-based-features model on train set:")
pprint(dict(eval_dl))

print("\nBaseline model on train set:")
pprint(dict(eval_base))

print("\nChemBERTa-based-features model on test set:")
pprint(dict(eval_dl_test))

print("\nBaseline model on test set:")
pprint(dict(eval_base_test))

ChemBERTa-based-features model on train set:
{'aggregated': {'macro_mean_absolute_error': np.float64(2.101301778682322e-05),
                'macro_r2': np.float64(0.9999999992944921)},
 'pIC50 (MERS-CoV Mpro)': {'kendall_tau': np.float64(1.0),
                           'mean_absolute_error': 2.0304713816384396e-05,
                           'r2': 0.9999999992944921},
 'pIC50 (SARS-CoV-2 Mpro)': {'kendall_tau': np.float64(1.0),
                             'mean_absolute_error': 2.1721321757262047e-05,
                             'r2': 0.9999999992944921}}

Baseline model on train set:
{'aggregated': {'macro_mean_absolute_error': np.float64(2.101301778682322e-05),
                'macro_r2': np.float64(0.9999999992944921)},
 'pIC50 (MERS-CoV Mpro)': {'kendall_tau': np.float64(1.0),
                           'mean_absolute_error': 2.0304713816384396e-05,
                           'r2': 0.9999999992944921},
 'pIC50 (SARS-CoV-2 Mpro)': {'kendall_tau': np.float64(1.0),
               