# Encoding drug-molecules

Molecules in SMILES format are embedded through three views: molecular graph (MolGraph), image, and SMILES text.

See
 - https://github.com/BiomedSciAI/biomed-multi-view
 - https://huggingface.co/ibm/biomed.sm.mv-te-84m

In [73]:
import torch
from bmfm_sm.api.smmv_api import SmallMoleculeMultiViewModel
from bmfm_sm.core.data_modules.namespace import LateFusionStrategy

# model = torch.load('../data_root/bmfm_model_dir/pretrained/MULTIVIEW_MODEL/biomed-smmv-with-coeff-agg.pth')

# Local checkpoint holding the pretrained multi-view model weights.
checkpoint_path = '../data_root/bmfm_model_dir/pretrained/MULTIVIEW_MODEL/biomed-smmv-with-coeff-agg.pth'

# Build the multi-view small-molecule model from that checkpoint, selecting the
# ATTENTIONAL late-fusion strategy.
model_molecule = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.ATTENTIONAL,
    model_path=checkpoint_path,
)

2024-11-04 21:22:36,815 - rdkit - INFO - pop-os:130600426774272:0:0 - Enabling RDKit 2024.03.5 jupyter extensions
2024-11-04 21:22:37,865 - root - INFO - pop-os:130600426774272:0:0 - Using coeff_mlp architecture for aggregator
2024-11-04 21:22:37,865 - root - INFO - pop-os:130600426774272:0:0 - dim_list [512, 512, 768] for aggregator
2024-11-04 21:22:37,875 - root - INFO - pop-os:130600426774272:0:0 - Loading checkpoint from provided path ../data_root/bmfm_model_dir/pretrained/MULTIVIEW_MODEL/biomed-smmv-with-coeff-agg.pth
2024-11-04 21:22:38,151 - root - INFO - pop-os:130600426774272:0:0 - Loading pretrain checkpoint for SmallMoleculeMultiView Model - <All keys matched successfully>
2024-11-04 21:22:38,153 - root - INFO - pop-os:130600426774272:0:0 - in train False setting deterministic_eval = True


In [3]:
from bmfm_sm.predictive.data_modules.graph_finetune_dataset import Graph2dFinetuneDataPipeline
from bmfm_sm.predictive.data_modules.image_finetune_dataset import ImageFinetuneDataPipeline
from bmfm_sm.predictive.data_modules.text_finetune_dataset import TextFinetuneDataPipeline

# Example drug molecules, each given as a SMILES string.
some_smiles = [
    'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
    'Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12',
    'CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3cn4c(n3)sc3cc(OCCN5CCOCC5)ccc34)cc2)no1',
    'CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4cc(NC)ncn4)cc3)cc2C(F)(F)F)CC1',
    'O=C(NC1CCNCC1)c1[nH]ncc1NC(=O)c1c(Cl)cccc1Cl',
    'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1',
    'CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(O)C1',
    'CNC(=O)c1ccccc1Sc1ccc2c(C=Cc3ccccn3)n[nH]c2c1',
    'CCC1C(=O)N(C)c2cnc(Nc3ccc(C(=O)NC4CCN(C)CC4)cc3OC)nc2N1C1CCCC1',
    'Cc1ccc2nc(NCCN)c3ncc(C)n3c2c1.Cl',
]

# Pick a single molecule and convert it into the input format of each view.
smiles = some_smiles[2]
graph = Graph2dFinetuneDataPipeline.smiles_to_graph_format(smiles)
text = TextFinetuneDataPipeline.smiles_to_text_format(smiles)
image = ImageFinetuneDataPipeline.smiles_to_image_format(smiles)

# Merge the per-view inputs into one dict, in the order graph -> text -> image
# (on a duplicate key, the later view overwrites the earlier one).
joint_dict = {}
for view_inputs in (graph, text, image):
    joint_dict.update(view_inputs)

# Embed the molecule with the multi-view model loaded in the previous cell.
embedding = model_molecule.get_embeddings(
    joint_dict,
    get_separate_embeddings=True,
)

In [8]:
# Report the shape of the tensor stored under each key of the embedding result.
for view, view_embedding in embedding.items():
    print(f'{view} embeddings shape: {view_embedding.shape}')

Graph2dModel embeddings shape: torch.Size([1, 512])
ImageModel embeddings shape: torch.Size([1, 512])
TextModel embeddings shape: torch.Size([1, 768])
aggregator embeddings shape: torch.Size([1, 512])
model_coeffs embeddings shape: torch.Size([3])


---
---

# Encoding target-proteins

Proteins in amino-acid sequence format are embedded through 1 view: amino-acid sequence

See
 - https://github.com/mheinzinger/ProstT5
 - https://huggingface.co/Rostlab/ProstT5

In [43]:
import re
from transformers import T5Tokenizer, T5EncoderModel
import torch


# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Tokenizer, read from the local ProstT5 directory (case is preserved).
tokenizer = T5Tokenizer.from_pretrained(
    '../data_root/ProstT5_model_dir',
    do_lower_case=False,
)

# Encoder-only T5 model, read from the same directory and moved to the chosen device.
model_protein = T5EncoderModel.from_pretrained("../data_root/ProstT5_model_dir")
model_protein = model_protein.to(device)

# Half precision (float16) is only used on GPU; on CPU the model stays in
# full precision (float32), which works but is much slower.
if device.type == 'cpu':
    model_protein.float()
else:
    model_protein.half()

# Show the resulting parameter dtype as the cell output.
model_protein.dtype

torch.float32

In [71]:
# Protein inputs: a list of amino-acid sequences (one string per protein).
some_sequences = [
    "MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV",
    "PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLP",
    "MVLGTVLLPPNSYGRDQDTSLCCLCTEA",
    "MVDGVMILPVLIMIALP",
    "MGAAAKLAFA",
]

# Upper-case FIRST, then replace the rare/ambiguous amino acids (U, Z, O, B) by X,
# so that lower-case input residues are replaced as well; finally put a single
# white-space between all residues of each sequence.
sequences = [
    " ".join(re.sub(r"[UZOB]", "X", sequence.upper()))
    for sequence in some_sequences
]

# Indicate the direction of the translation by prepending "<AA2fold>": this is the
# prefix for going from AAs to 3Di, and also the one to use when only embedding AAs.
sequences = ["<AA2fold>" + " " + s for s in sequences]

# Tokenize the sequences and pad up to the longest sequence in the batch.
ids = tokenizer.batch_encode_plus(
    sequences,
    add_special_tokens=True,
    padding="longest",
    return_tensors='pt',
).to(device)

# Generate per-token embeddings (no gradients needed for inference).
with torch.no_grad():
    embeddings_rpr = model_protein(ids.input_ids, attention_mask=ids.attention_mask).last_hidden_state
print("Batch embedding: ", embeddings_rpr.shape, "\n")

# Mean-pool each protein over its own residues only: position 0 (the prepended
# prefix token) is skipped and exactly len(sequence) positions are kept, so any
# trailing positions (end-of-sequence / padding) do not enter the mean.
embeddings = []
for i, sequence in enumerate(some_sequences):
    seq_len = len(sequence)
    subseq = embeddings_rpr[i, 1:seq_len + 1]
    print("Subsequence embedding: ", subseq.shape)
    mean_subseq = subseq.mean(dim=0)
    print("Mean subsequence embedding: ", mean_subseq.shape, "\n")
    embeddings.append(mean_subseq)

Batch embedding:  torch.Size([5, 48, 1024]) 

Subsequence embedding:  torch.Size([46, 1024])
Mean subsequence embedding:  torch.Size([1024]) 

Subsequence embedding:  torch.Size([36, 1024])
Mean subsequence embedding:  torch.Size([1024]) 

Subsequence embedding:  torch.Size([28, 1024])
Mean subsequence embedding:  torch.Size([1024]) 

Subsequence embedding:  torch.Size([17, 1024])
Mean subsequence embedding:  torch.Size([1024]) 

Subsequence embedding:  torch.Size([10, 1024])
Mean subsequence embedding:  torch.Size([1024]) 



---
---