# 🗜️ CLAMP demo

This notebook shows how to run a pretrained CLAMP model or train a new one.

Before you start: change the runtime type to GPU.

# ⚙️ Setup
This only needs to be run once.

In [None]:
!pip install git+https://github.com/ml-jku/clamp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/ml-jku/clamp
  Cloning https://github.com/ml-jku/clamp to /tmp/pip-req-build-wnlfha3x
  Running command git clone --filter=blob:none --quiet https://github.com/ml-jku/clamp /tmp/pip-req-build-wnlfha3x
  Resolved https://github.com/ml-jku/clamp to commit f097406dd2161e1d083e749277d24b82219e1d93
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mhnreact@ git+https://github.com/ml-jku/mhn-react.git (from clamp==1.0)
  Cloning https://github.com/ml-jku/mhn-react.git to /tmp/pip-install-hxfmgj_w/mhnreact_c57710d2a2854155b8061b3ea149669b
  Running command git clone --filter=blob:none --quiet https://github.com/ml-jku/mhn-react.git /tmp/pip-install-hxfmgj_w/mhnreact_c57710d2a2854155b8061b3ea149669b
  Resolved https://github.com/ml-jku/mhn-react.git to commit 424ab0185db53462f5dd22b192a7bc6f4ad5b2d7
  Preparing metadata (setup.py) ... [?25l[?25hdo

# Use a pretrained CLAMP model

In [None]:
import torch
import clamp

# Instantiate the CLAMP model on the CPU.
# NOTE(review): the intro asks for a GPU runtime, yet device='cpu' is used here — confirm this is intended.
model = clamp.CLAMP(device='cpu')
# Put the model into evaluation mode (the printed structure contains Dropout layers).
model.eval()

# Last expression of the cell: displays the module structure.
model

PretrainedCLAMP(
  (compound_encoder): NetworkLayerNorm(
    (linear_input): Linear(in_features=8192, out_features=4096, bias=True)
    (linear_hidden_l): ModuleList(
      (0): Linear(in_features=4096, out_features=2048, bias=True)
    )
    (linear_output): Linear(in_features=2048, out_features=768, bias=True)
    (normalization_input): LayerNorm((4096,), eps=1e-05, elementwise_affine=False)
    (normalization_hidden_l): ModuleList(
      (0): LayerNorm((2048,), eps=1e-05, elementwise_affine=False)
    )
    (nonlinearity): ReLU()
    (dropout_input): Dropout(p=0.1, inplace=False)
    (dropout_hidden): Dropout(p=0.2, inplace=False)
  )
  (assay_encoder): NetworkLayerNorm(
    (linear_input): Linear(in_features=512, out_features=4096, bias=True)
    (linear_hidden_l): ModuleList(
      (0): Linear(in_features=4096, out_features=2048, bias=True)
    )
    (linear_output): Linear(in_features=2048, out_features=768, bias=True)
    (normalization_input): LayerNorm((4096,), eps=1e-05, elem

In [None]:
# Encode a list of SMILES strings into the association space (without l2-norm).
# A single molecule ('CCC') is passed here and only the shape of the result is displayed.
model.encode_smiles(['CCC']).shape

torch.Size([1, 768])

In [None]:
# Encode a list of free-text assay descriptions; only the shape of the result is displayed.
model.encode_text(['HIV: Experimentally measured abilities to inhibit HIV replication.']).shape

torch.Size([1, 768])

In [None]:
# Query molecules as SMILES strings; the trailing comments carry the author's labels.
molecules = [
    'CCOP(=O)(Nc1cccc(Cl)c1)OCC',  # inactive
    'O=C(O)c1ccccc1O',  # inactive
    'NNP(=S)(NN)c1ccccc1',  # active
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
]
# Free-text assay descriptions; a single assay is queried here.
assay_descriptions = ['HIV: Experimentally measured abilities to inhibit HIV replication.']

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model.forward_dense(molecules, assay_descriptions)
    # softmax along dim 0 (over the molecules), then convert to a NumPy array
    probs = torch.softmax(logits, dim=0).cpu().numpy()

# Column 0 belongs to the only assay; expected result: [0.258 0.235 0.269 0.236]
print("Mol probs for assay:", probs[:, 0])

Mol probs for assay: [0.25821456 0.23540354 0.26953387 0.236848  ]


# Linear-probing example
To download the preprocessed downstream datasets, run:

In [None]:
# Download the archive to a fixed file name. -N and -r are omitted on purpose:
# together with -O they only produce wget warnings, because all content is
# written to the single named file anyway.
!wget https://cloud.ml.jku.at/s/pyJMm4yQeWFM2gG/download -O downstream.zip
# -o overwrites already extracted files without asking, so re-running this cell
# does not stall on unzip's interactive "replace ...?" prompt.
!unzip -o downstream.zip; rm downstream.zip

will be placed in the single file you specified.

for details.

--2023-06-19 12:46:55--  https://cloud.ml.jku.at/s/pyJMm4yQeWFM2gG/download
Resolving cloud.ml.jku.at (cloud.ml.jku.at)... 140.78.90.41
Connecting to cloud.ml.jku.at (cloud.ml.jku.at)|140.78.90.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14101780 (13M) [application/zip]
Saving to: ‘downstream.zip’


2023-06-19 12:46:56 (41.3 MB/s) - ‘downstream.zip’ saved [14101780/14101780]

FINISHED --2023-06-19 12:46:56--
Total wall clock time: 0.6s
Downloaded: 1 files, 13M in 0.3s (41.3 MB/s)
Archive:  downstream.zip
replace data/downstream/toxcast/activity.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
# Folder of the preprocessed Tox21 downstream dataset.
dset = './data/downstream/tox21'

# SMILES table of the compounds (a one-to-one mapping is assumed).
smis_df = pd.read_parquet(f'{dset}/compound_smiles.parquet')

# Activity table, restricted to the first assay (assay_idx == 0) for this demo.
act_df = (
    pd.read_parquet(f'{dset}/activity.parquet')
    .loc[lambda frame: frame['assay_idx'] == 0]
)

In [None]:
# CLAMP encoding of every compound SMILES in one call; do this batch-wise for larger datasets.
clamp_encoding = model.encode_smiles(smis_df.CanonicalSMILES) # do batchwise for larger dsets
# Result of the model's SMILES preprocessing; used below as the 'FP' baseline features.
# presumably the raw fingerprint input of the compound encoder — TODO confirm
fp_encoding = model.prepro_smiles(smis_df.CanonicalSMILES)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

def davgp_score(y_true, y_pred, sample_weight=None):
    """Return the average precision minus the (weighted) mean of the labels.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores for the same samples.
        sample_weight: optional per-sample weights, applied both to the
            average precision and to the label mean.

    Returns:
        ``average_precision_score(y_true, y_pred)`` reduced by
        ``np.average(y_true)``.
    """
    precision = average_precision_score(y_true, y_pred, sample_weight=sample_weight)
    label_mean = np.average(y_true, weights=sample_weight)
    return precision - label_mean

# One class-balanced logistic-regression probe, re-fitted for every feature type.
clf = LogisticRegression(max_iter=1500, class_weight='balanced', C=1, random_state=70135)

# Feature matrices to compare, keyed by the name shown in the report line.
encodings = {'FP': fp_encoding, 'CLAMP': clamp_encoding}

# The scaffold split is independent of the encoding, so the rows are selected once.
train_rows = act_df[act_df.scaffold_split == 'train']
test_rows = act_df[act_df.scaffold_split == 'test']

for name, encoding in encodings.items():
    # compound_idx selects the encoding row that belongs to each measurement
    X_train, y_train = encoding[train_rows.compound_idx.values], train_rows.activity
    X_test, y_test = encoding[test_rows.compound_idx.values], test_rows.activity

    clf.fit(X_train, y_train)

    # column 1 of predict_proba is used as the prediction score
    y_pred = clf.predict_proba(X_test)[:, 1]

    print(f'{name}:\t dAP={davgp_score(y_test, y_pred):2.3f}, AUROC={roc_auc_score(y_test, y_pred):2.3f}')

FP:	 dAP=0.207, AUROC=0.705
CLAMP:	 dAP=0.308, AUROC=0.737


# 🔥Train your own model

In [None]:
# Set up FS-Mol: download the preprocessed archive and unpack it into ./data/fsmol.
# -N and -r are omitted on purpose: together with -O they only produce wget
# warnings, because all content is written to the single named file anyway.
!wget https://cloud.ml.jku.at/s/dCjrt9c4arbz6rF/download -O fsmol.zip
# -o overwrites already extracted files without asking, so re-running this cell
# does not stall on unzip's interactive "replace ...?" prompt.
!unzip -o fsmol.zip; rm fsmol.zip

will be placed in the single file you specified.

for details.

--2023-06-22 05:24:37--  https://cloud.ml.jku.at/s/dCjrt9c4arbz6rF/download
Resolving cloud.ml.jku.at (cloud.ml.jku.at)... 140.78.90.41
Connecting to cloud.ml.jku.at (cloud.ml.jku.at)|140.78.90.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6466693 (6.2M) [application/zip]
Saving to: ‘fsmol.zip’


2023-06-22 05:24:38 (6.38 MB/s) - ‘fsmol.zip’ saved [6466693/6466693]

FINISHED --2023-06-22 05:24:38--
Total wall clock time: 1.7s
Downloaded: 1 files, 6.2M in 1.0s (6.38 MB/s)
Archive:  fsmol.zip
  inflating: data/fsmol/activity.parquet  
  inflating: data/fsmol/assay_names.parquet  
  inflating: data/fsmol/compound_names.parquet  
  inflating: data/fsmol/compound_smiles.parquet  


In [None]:
# Compute the compound encodings as input for your model run (takes a few minutes).
# If you leave this step out, the encodings are computed on the fly --
# in this case we only have 1 CPU unfortunately.
# NOTE(review): --fp_size=8096 differs from the 8192 input features of the pretrained
# compound encoder printed earlier in this notebook — confirm the value is intended.
!python -m clamp.dataset.encode_compound \
--compounds=./data/fsmol/compound_names.parquet \
--compound2smiles=./data/fsmol/compound_smiles.parquet \
--fp_type=morganc+rdkc --fp_size=8096 --njobs=2
# Stops at 168814 on Colab for an unknown reason --> let's compute it on the fly instead.

In [None]:
# Compute the assay encodings as input for your model run.
# The listed columns of assay_names.parquet are encoded with CLIP; according to the
# logged output the features are saved to data/fsmol/assay_features_clip_all.npy.
# The logged run loaded CLIP on the CPU and needed about 15 minutes.
!python -m clamp.dataset.encode_assay --assay_path=./data/fsmol/assay_names.parquet --encoding=clip --gpu=0 --columns \
assay_type_description description assay_category assay_cell_type assay_chembl_id assay_classifications assay_organism assay_parameters assay_strain assay_subcellular_fraction assay_tax_id assay_test_type assay_tissue assay_type bao_format bao_label cell_chembl_id confidence_description confidence_score document_chembl_id relationship_description relationship_type src_assay_id src_id target_chembl_id tissue_chembl_id variant_sequence \
--suffix=all

[32m2023-06-22 05:24:50.051[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m224[0m - [1mexample assay description: Binding Binding affinity for human Glucagon Receptor   CHEMBL683962 []  []      B BAO_0000357 single protein format  Homologous single protein target assigned 8 CHEMBL1134597 Homologous protein target assigned H  1 CHEMBL1985  [0m
[32m2023-06-22 05:24:51.576[0m | [1mINFO    [0m | [36m__main__[0m:[36mclip_encode[0m:[36m51[0m - [1mLoad CLIP model on cpu.[0m
100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 103MiB/s]
[32m2023-06-22 05:25:06.096[0m | [1mINFO    [0m | [36m__main__[0m:[36mclip_encode[0m:[36m54[0m - [1mEncode assay descriptions using CLIP.[0m
Encode assay descriptions: 100% 3/3 [15:02<00:00, 300.70s/it]
[32m2023-06-22 05:40:08.251[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m257[0m - [1mSave assay features to data/fsmol/assay_features_clip_all.npy[0m


In [None]:
# Also compute the LSA encoding for the same assay columns.
# According to the logged output, a TfidfVectorizer is fitted on the training data and
# the fitted LSA model is saved to ./data/models/lsa.joblib (reloadable via --lsa_path).
!python -m clamp.dataset.encode_assay --assay_path=./data/fsmol/assay_names.parquet --encoding=lsa --gpu=0 --columns \
assay_type_description description assay_category assay_cell_type assay_chembl_id assay_classifications assay_organism assay_parameters assay_strain assay_subcellular_fraction assay_tax_id assay_test_type assay_tissue assay_type bao_format bao_label cell_chembl_id confidence_description confidence_score document_chembl_id relationship_description relationship_type src_assay_id src_id target_chembl_id tissue_chembl_id variant_sequence \
--suffix=all

[32m2023-06-22 05:40:14.692[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m224[0m - [1mexample assay description: Binding Binding affinity for human Glucagon Receptor   CHEMBL683962 []  []      B BAO_0000357 single protein format  Homologous single protein target assigned 8 CHEMBL1134597 Homologous protein target assigned H  1 CHEMBL1985  [0m
[32m2023-06-22 05:40:14.693[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m230[0m - [1mEncode assay descriptions using LSA.[0m
[32m2023-06-22 05:40:14.693[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m233[0m - [1mFit a sklearn TfidfVectorizer model on training data.[0m
[32m2023-06-22 05:40:14.693[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m235[0m - [1mSave the fitted LSA-model to ./data/models/lsa.joblib, load it later using the argument --lsa_path[0m
[32m2023-06-22 05:40:14.693[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m242[0m 

In [None]:
# Download the default hyperparameters (they can also be copied from the repo).
# -p: do not fail when the directory already exists, so the cell can be re-run.
!mkdir -p hparams
!wget https://github.com/ml-jku/clamp/raw/main/hparams/default.json
# Move the downloaded file to the location used for the hparams.
!mv default.json ./hparams/default.json

--2023-06-22 05:40:26--  https://github.com/ml-jku/clamp/raw/main/hparams/default.json
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ml-jku/clamp/main/hparams/default.json [following]
--2023-06-22 05:40:26--  https://raw.githubusercontent.com/ml-jku/clamp/main/hparams/default.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 427 [text/plain]
Saving to: ‘default.json’


2023-06-22 05:40:26 (20.2 MB/s) - ‘default.json’ saved [427/427]



In [None]:
# Finally, let's train our model (compare to Table 1 in the paper @FS-Mol default split).
# Takes around an hour per epoch (20 epochs).
# Pass exactly one assay mode. Never write `clip_all||lsa_all` unquoted: the shell
# treats `||` as its OR operator and cuts the command off right there, so
# --split=FSMOL_split would not reach clamp.train.
# Swap in --assay_mode=lsa_all to train on the LSA assay features instead.
!python -m clamp.train --dataset=./data/fsmol --assay_mode=clip_all --split=FSMOL_split

[32m2023-06-22 05:40:31.021[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_args_override[0m:[36m88[0m - [1mno compound_layer_sizes provided, setting to hidden_layers[0m
[32m2023-06-22 05:40:31.021[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_args_override[0m:[36m91[0m - [1mno assay_layer_sizes provided, setting to hidden_layers[0m
2023/06/22 05:40:31 INFO mlflow.tracking.fluent: Experiment with name 'debug' does not exist. Creating a new experiment.
[32m2023-06-22 05:40:31.281[0m | [1mINFO    [0m | [36mclamp.dataset.dataloader[0m:[36m_load_compound[0m:[36m174[0m - [1mloading compound_features_morganc+rdkc.npz failed, using .npy instead[0m
[32m2023-06-22 05:40:31.281[0m | [1mINFO    [0m | [36mclamp.dataset.dataloader[0m:[36m_load_compound[0m:[36m178[0m - [1mloading compound_features_morganc+rdkc.npy failed, trying to compute it on the fly[0m
{'f': None, 'dataset': './data/fsmol', 'assay_mode': 'clip_all', 'compound_mode': 'morganc+rdkc