In [1]:
import numpy as np
import pickle
import torch
import rdkit
from rdkit import Chem
from sklearn.metrics.pairwise import pairwise_distances
from rdkit.Chem import rdFingerprintGenerator as rdGen

In [None]:
# Load the first 5000 SMILES, one per line. `comments=None` keeps '#' characters
# (used for triple bonds in SMILES) from being treated as comment markers.
# np.atleast_1d guards against a single-line file, which genfromtxt returns as a
# 0-d array that can be neither sliced nor measured with len().
smiles_list = np.atleast_1d(
    np.genfromtxt("smiles.txt", dtype=str, delimiter='\n', comments=None)
)[:5000]
print(f"Loaded compounds: {len(smiles_list)}")

# Models

## Mol2vec

The pre-trained model can be downloaded from [github.com/samoturk/mol2vec/blob/master/examples/models](https://github.com/samoturk/mol2vec/blob/master/examples/models/model_300dim.pkl)

In [None]:
from mol2vec.features import mol2alt_sentence, MolSentence
from gensim.models import word2vec

# Load the pre-trained Mol2vec (gensim Word2Vec) model from the local file.
# NOTE(review): the file is a pickle; only load model files from a trusted source.
mol2vec_model = word2vec.Word2Vec.load("mol2vec_model_300dim.pkl")

The **sentences2vec** function comes from [github.com/samoturk/mol2vec/issues/14](https://github.com/samoturk/mol2vec/issues/14), where it was suggested as a workaround for the deprecation problem.

In [60]:
def sentences2vec(sentences, model, unseen=None):
    """Sum the word vectors of every sentence into one vector per sentence.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Tokenised sentences (for Mol2vec: substructure identifiers).
    model : object with a gensim-style ``wv`` attribute
        ``wv`` must provide ``key_to_index``, ``get_vector`` and ``vector_size``.
    unseen : str, optional
        Vocabulary key whose vector stands in for out-of-vocabulary words.
        When falsy, out-of-vocabulary words are skipped instead.

    Returns
    -------
    numpy.ndarray
        One row per sentence. A sentence without any usable word yields a zero
        vector (previously the scalar 0, which made the result ragged).
    """
    wv = model.wv
    keys = set(wv.key_to_index)
    unseen_vec = wv.get_vector(unseen) if unseen else None
    vec = []
    for sentence in sentences:
        # Every word comes from `sentence`, so membership in `keys` is enough;
        # no need to rebuild `set(sentence) & keys` for each word.
        if unseen:
            word_vecs = [wv.get_vector(word) if word in keys else unseen_vec for word in sentence]
        else:
            word_vecs = [wv.get_vector(word) for word in sentence if word in keys]
        if word_vecs:
            vec.append(sum(word_vecs))
        else:
            # float32 matches the dtype gensim uses for its stored vectors.
            vec.append(np.zeros(wv.vector_size, dtype=np.float32))
    return np.array(vec)

def get_mol2vec(smiles_list, model=None):
    """Compute Mol2vec embeddings for an iterable of SMILES strings.

    Parameters
    ----------
    smiles_list : iterable of str
        Molecules as SMILES.
    model : gensim Word2Vec, optional
        Mol2vec model; defaults to the notebook-level ``mol2vec_model``.

    Returns
    -------
    numpy.ndarray
        One summed substructure vector per molecule.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    """
    if model is None:
        # Reading a module-level name needs no `global` declaration.
        model = mol2vec_model
    sentence_list = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        sentence_list.append(MolSentence(mol2alt_sentence(mol, radius=2)))
    return sentences2vec(sentence_list, model)

In [61]:
# Mol2vec embedding of the loaded SMILES. `embedding` is overwritten by each model section below.
embedding = get_mol2vec(smiles_list)

## Graph2vec

In [53]:
import networkx as nx
from karateclub import Graph2Vec

In [54]:
def mol2graph(mol):
    """Turn an RDKit molecule into an undirected networkx graph.

    Nodes are keyed by atom index and carry ``atomic_num`` and ``atom_symbol``;
    edges connect bonded atoms and carry ``bond_type``.
    """
    graph = nx.Graph()
    for atom in mol.GetAtoms():
        node_attrs = {
            "atomic_num": atom.GetAtomicNum(),
            "atom_symbol": atom.GetSymbol(),
        }
        graph.add_node(atom.GetIdx(), **node_attrs)
    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        graph.add_edge(begin_idx, end_idx, bond_type=bond.GetBondType())
    return graph

def get_graph2vec(smiles_list, model=None):
    """Fit Graph2Vec on the molecular graphs of `smiles_list` and return the embeddings.

    Parameters
    ----------
    smiles_list : iterable of str
        Molecules as SMILES.
    model : karateclub Graph2Vec, optional
        Unfitted estimator; a default ``Graph2Vec()`` is created when omitted.

    Returns
    -------
    The result of ``model.get_embedding()`` after fitting on all graphs.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    """
    # NOTE(review): a default Graph2Vec() presumably ignores the node attributes
    # set by mol2graph unless configured to use them; verify against karateclub.
    if model is None:
        model = Graph2Vec()
    graph_list = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        graph_list.append(mol2graph(mol))
    # The model is trained on exactly the graphs that are embedded.
    model.fit(graph_list)
    return model.get_embedding()

In [65]:
# Graph2vec embedding; this fits a fresh Graph2Vec model on the loaded SMILES.
embedding = get_graph2vec(smiles_list)

## ChemBERTa

In [48]:
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [None]:
# Load the pre-trained ChemBERTa checkpoint (with masked-LM head) and its tokenizer.
# NOTE(review): `tokenizer` is rebound by the MolFormer, GPT2 and BERT cells further
# down, so get_chemberta must run before those cells (or after re-running this one).
chemberta = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
# Put the model in evaluation mode.
chemberta.eval()

This wrapper function for the ChemBERTa model comes from [www.kaggle.com/code/alexandervc/chembert2a-smiles-embeddings-for-beginners](https://www.kaggle.com/code/alexandervc/chembert2a-smiles-embeddings-for-beginners/notebook).

In [50]:
def get_chemberta(smiles_list):
    """Embed each SMILES with ChemBERTa by averaging the model output over tokens.

    Uses the module-level ``chemberta`` model and ``tokenizer``. Molecules are
    processed one at a time, so no padding enters the mean.

    NOTE(review): ``tokenizer`` is rebound by later cells (MolFormer, GPT2, BERT);
    make sure the ChemBERTa tokenizer is the one currently bound.
    NOTE(review): ``chemberta`` is loaded with a masked-LM head, so ``model_output[0]``
    is presumably the LM logits rather than hidden states; confirm this is intended.

    Parameters
    ----------
    smiles_list : sequence of str
        Molecules as SMILES (must support ``len``).

    Returns
    -------
    numpy.ndarray
        Array of shape (len(smiles_list), output_width).
    """
    embeddings_mean = None

    with torch.no_grad():
        for i, smiles in enumerate(tqdm(smiles_list)):
            encoded_input = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
            model_output = chemberta(**encoded_input)
            pooled = torch.mean(model_output[0], 1)
            if embeddings_mean is None:
                # Size the buffer from the first output, not a hard-coded width.
                embeddings_mean = torch.zeros(len(smiles_list), pooled.shape[-1])
            embeddings_mean[i] = pooled

    if embeddings_mean is None:
        # Empty input: keep the (0, 600) shape the original implementation returned.
        embeddings_mean = torch.zeros(0, 600)
    return embeddings_mean.numpy()

In [None]:
# ChemBERTa embedding of the loaded SMILES (one forward pass per molecule).
embedding = get_chemberta(smiles_list)

## Continuous Data Driven Descriptors

The descriptors are generated using the CDDD REST - see [repository](https://github.com/vaxherra/cddd_rest)

In [44]:
import requests
import json
import pandas as pd

In [45]:
def prepare_json(smiles_list, batch_size=1):
    """Split `smiles_list` into consecutive chunks and wrap them in a request payload.

    Returns a dict ``{"batches": [...]}`` whose chunks hold at most `batch_size` items.
    """
    chunks = []
    for start in range(0, len(smiles_list), batch_size):
        chunks.append(smiles_list[start:start + batch_size])
    return {"batches": chunks}

def get_descriptors(json_data, url="http://192.168.1.100:80/predict", timeout=None):  # Replace with address of own container
    """POST the batched SMILES to the CDDD REST service and return the descriptors.

    Parameters
    ----------
    json_data : dict
        Payload as produced by ``prepare_json``.
    url : str
        Prediction endpoint of the CDDD container.
    timeout : float or tuple, optional
        Passed to ``requests.post``; ``None`` (default) waits indefinitely, as before.

    Returns
    -------
    numpy.ndarray
        Values of every column of the returned table except the first two.
        (Presumably those are identifier/SMILES columns; verify against the service.)

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    """
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps(json_data), headers=headers, timeout=timeout)
    # Fail with the HTTP status rather than a confusing JSON/KeyError below.
    response.raise_for_status()
    response_json = response.json()
    # "Prediction" holds a JSON-encoded table that has to be decoded a second time.
    response_dict = json.loads(response_json["Prediction"])
    df = pd.DataFrame.from_dict(response_dict)
    descriptor_list = df.iloc[:, 2:].values
    return descriptor_list

def get_cddd(smiles_list, batch_size=1):
    """Compute CDDD descriptors for `smiles_list` through the REST service.

    Parameters
    ----------
    smiles_list : iterable of str
        Molecules as SMILES; converted to a plain list so it can be JSON-serialised.
    batch_size : int
        Number of SMILES per request batch. It is now forwarded to
        ``prepare_json`` (it was silently ignored before).

    Returns
    -------
    numpy.ndarray
        Descriptor matrix as returned by ``get_descriptors``.
    """
    smiles_list = list(smiles_list)
    json_data = prepare_json(smiles_list, batch_size=batch_size)
    return get_descriptors(json_data)

In [69]:
# CDDD descriptors; requires the REST service at the URL configured in get_descriptors.
embedding = get_cddd(smiles_list)

## Molecular Transformer Embeddings

The embeddings can be calculated using the scripts available [here](https://github.com/mpcrlab/MolecularTransformerEmbeddings). The generated .npz file can then be converted to an embedding list using the function below.

In [4]:
def get_mte(file_path, smiles_list):
    """Read pre-computed Molecular Transformer embeddings and mean-pool them.

    Parameters
    ----------
    file_path : str or path-like
        ``.npz`` archive with one array per molecule, keyed by its SMILES string.
    smiles_list : iterable of str
        Molecules to look up, in output order.

    Returns
    -------
    numpy.ndarray
        One row per molecule: the mean of its stored array over axis 0.

    Raises
    ------
    KeyError
        If a SMILES string has no entry in the archive.
    """
    # The context manager closes the archive's file handle once we are done.
    with np.load(file_path) as embedding_npz:
        embedding_list = [np.mean(embedding_npz[smiles], axis=0) for smiles in smiles_list]
    return np.array(embedding_list)

In [5]:
# Mean-pooled Molecular Transformer embeddings read from the pre-computed .npz file.
embedding = get_mte("mte_embedding.npz", smiles_list)

## MACAW

In [104]:
from macaw import MACAW

In [105]:
def get_macaw(smiles_list, n_dimensions=20):
    """Fit a MACAW embedder on `smiles_list` and return its embedding of the same molecules.

    `n_dimensions` is passed to MACAW as ``n_components``.
    """
    embedder = MACAW(n_components=n_dimensions)
    embedder.fit(smiles_list)
    return embedder.transform(smiles_list)

In [None]:
# MACAW embedding with the default of 20 dimensions.
embedding = get_macaw(smiles_list)

## MolFormer

In [26]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the MoLFormer checkpoint and tokenizer. This rebinds the module-level
# `tokenizer` previously used for ChemBERTa.
# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository; only use it with a source you trust.
model = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)

In [31]:
def get_molformer(smiles_list):
    """Return the MoLFormer ``pooler_output`` for every SMILES as a numpy array.

    All molecules are tokenised (with padding) and run through the model in a
    single forward pass with gradients disabled.

    NOTE(review): relies on the module-level ``model`` and ``tokenizer``, which
    the GPT2 and BERT cells rebind; run this while the MoLFormer objects are bound.
    """
    texts = list(map(str, smiles_list))
    encoded = tokenizer(texts, padding=True, return_tensors="pt")
    with torch.no_grad():
        result = model(**encoded)
    return np.array(result.pooler_output)

In [32]:
# MoLFormer embedding; all loaded SMILES go through the model in one batch.
embedding = get_molformer(smiles_list)

## GPT2

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, DataCollatorWithPadding

# Load the GPT2 checkpoint, its tokenizer and a padding collator.
# This rebinds the module-level `tokenizer` and `model` used by the MoLFormer cells.
tokenizer = GPT2TokenizerFast.from_pretrained("entropy/gpt2_zinc_87m", max_len=256)
model = GPT2LMHeadModel.from_pretrained("entropy/gpt2_zinc_87m")
# The collator pads tokenised batches and returns PyTorch tensors.
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

Embeddings are generated in batches because this avoids the errors that occur when all molecules are processed at once.

In [37]:
def get_gpt2(smiles_list, batch_size=100):
    """Embed SMILES with GPT2 by attention-masked mean pooling of the last hidden layer.

    Uses the module-level ``tokenizer``, ``collator`` and ``model``.
    NOTE(review): the BERT cell rebinds ``tokenizer`` and ``model``; make sure the
    GPT2 objects are the ones currently bound.

    Parameters
    ----------
    smiles_list : iterable of str
        Molecules as SMILES.
    batch_size : int
        Number of molecules per forward pass (default 100, the former fixed value).

    Returns
    -------
    list of numpy.ndarray
        One pooled vector per molecule.
    """
    smiles_list = [str(smiles) for smiles in smiles_list]
    embedding_list = []
    # Inference only: without no_grad every batch would keep its autograd graph alive.
    with torch.no_grad():
        for i in range(0, len(smiles_list), batch_size):
            print(f"{i}:{i+batch_size}")
            inputs = collator(tokenizer(smiles_list[i:i+batch_size]))
            outputs = model(**inputs, output_hidden_states=True)
            # Last entry of the output is the tuple of hidden states; take its final layer.
            full_embeddings = outputs[-1][-1]
            mask = inputs['attention_mask']
            # Masked mean: padded positions contribute neither to the sum nor to the count.
            embeddings = ((full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1))
            embedding_list.extend(vec.detach().numpy() for vec in embeddings)
    return embedding_list

In [None]:
# GPT2 embedding; get_gpt2 returns a list with one vector per molecule.
embedding = get_gpt2(smiles_list)

## BERT for SMILES

In [39]:
from transformers import BertTokenizerFast, BertModel

# Load the SMILES BERT checkpoint and tokenizer.
# This rebinds the module-level `model` and `tokenizer` used by the GPT2 cells.
model = BertModel.from_pretrained("unikei/bert-base-smiles")
tokenizer = BertTokenizerFast.from_pretrained("unikei/bert-base-smiles")

In [42]:
def get_bert_smiles(smiles_list):
    """Embed SMILES with BERT by attention-masked mean pooling of the last hidden state.

    Uses the module-level ``model`` and ``tokenizer`` (they must be the BERT objects).
    All molecules are padded to a common length and run in one forward pass.
    Padded positions are excluded from the mean, as in ``get_gpt2``; a plain mean
    over the token axis made each vector depend on the longest SMILES in the call.

    Parameters
    ----------
    smiles_list : iterable of str
        Molecules as SMILES.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_molecules, hidden_size).
    """
    smiles_list = [str(smiles) for smiles in smiles_list]
    inputs = tokenizer(smiles_list, padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp avoids a division by zero should a row contain no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embedding = ((hidden * mask).sum(dim=1) / token_counts).detach().numpy()
    return embedding

In [43]:
# BERT-for-SMILES embedding; all loaded SMILES go through the model in one batch.
embedding = get_bert_smiles(smiles_list)

## MAT

To use this snippet of code, download the [huggingmolecules repository](https://github.com/gmum/huggingmolecules). Before usage, download the [pretrained weights](https://github.com/gmum/huggingmolecules/blob/main/src/huggingmolecules/models/models_mat.py) and [configuration](https://github.com/gmum/huggingmolecules/blob/main/src/huggingmolecules/configuration/configuration_mat.py).

In [None]:
from huggingmolecules.models.models_mat import MatModel
from huggingmolecules.configuration.configuration_mat import MatConfig
from huggingmolecules.featurization.featurization_mat import MatFeaturizer

In [None]:
def load_mat(smiles_list):
    """Build the MAT model and featurizer from the local weight and config files.

    Parameters
    ----------
    smiles_list : sequence of str
        Unused; kept so existing calls keep working. The earlier version featurized
        the first 500 entries here and then discarded the result.

    Returns
    -------
    tuple
        ``(model, featurizer)`` with the model in evaluation mode.
    """
    # Load the weights on CPU; the embedding code converts results to numpy.
    state_dict = torch.load("mat_masking_20M.pt", map_location="cpu")
    # The checkpoint lacks the generator head, so zero-filled tensors are added
    # under those keys for load_state_dict to accept it.
    missing_keys = ("generator.proj.weight", "generator.proj.bias")
    missing_sizes = ((1, 1024), (1,))
    for key, size in zip(missing_keys, missing_sizes):
        state_dict[key] = torch.zeros(size)
    config = MatConfig.from_pretrained('mat_masking_20M.json')

    # Load featurizer
    featurizer = MatFeaturizer(config)

    # Load model
    model = MatModel(config=config)
    model.load_state_dict(state_dict)
    model.eval()

    return model, featurizer

def get_mat(smiles_list):
    """Embed SMILES with the MAT encoder, averaging the encoding over the atom axis.

    Parameters
    ----------
    smiles_list : sequence of str
        Molecules as SMILES.

    Returns
    -------
    numpy.ndarray
        One vector per molecule.
    """
    model, featurizer = load_mat(smiles_list)
    # `batch` used to be referenced here without being defined (NameError);
    # featurize the requested molecules in this scope instead.
    # NOTE(review): the whole list is featurized at once. load_mat used to truncate
    # to 500 molecules, possibly for memory reasons; chunk the input if needed.
    batch = featurizer(smiles_list)
    with torch.no_grad():
        # Rows whose node features are all zero are treated as padding.
        batch_mask = torch.sum(torch.abs(batch.node_features), dim=-1) != 0
        embedded = model.src_embed(batch.node_features)
        encoding = model.encoder(embedded, batch_mask,
                                 adj_matrix=batch.adjacency_matrix,
                                 distance_matrix=batch.distance_matrix)
    # NOTE(review): this mean runs over every position on axis 1, padding included.
    embedding = np.mean(encoding.detach().numpy(), axis=1)
    return embedding

In [None]:
# MAT embedding; get_mat loads the model from the local files on every call.
embedding = get_mat(smiles_list)

## R-MAT

To use this snippet of code, download the [huggingmolecules repository](https://github.com/gmum/huggingmolecules). Before usage, download the [pretrained weights](https://github.com/gmum/huggingmolecules/blob/main/src/huggingmolecules/models/models_rmat.py) and [configuration](https://github.com/gmum/huggingmolecules/blob/main/src/huggingmolecules/configuration/configuration_rmat.py).

In [None]:
from huggingmolecules.models.models_rmat import RMatModel
from huggingmolecules.configuration.configuration_rmat import RMatConfig
from huggingmolecules.featurization.featurization_rmat import RMatFeaturizer

In [None]:
def load_rmat(smiles_list):
    """Build the R-MAT model and featurizer from the local weight and config files.

    Parameters
    ----------
    smiles_list : sequence of str
        Unused; kept so existing calls keep working. The earlier version featurized
        the whole list here and then discarded the result.

    Returns
    -------
    tuple
        ``(model, featurizer)`` with the model in evaluation mode.
    """
    # Load the weights on CPU; the embedding code converts results to numpy.
    state_dict = torch.load("rmat_4M.pt", map_location="cpu")
    # The checkpoint lacks the generator head, so zero-filled tensors are added
    # under those keys for load_state_dict to accept it.
    missing_keys = ("generator.att_net.0.weight", "generator.att_net.2.weight", "generator.proj.weight", "generator.proj.bias")
    missing_sizes = ((128, 768), (4, 128), (1, 3072), (1,))
    for key, size in zip(missing_keys, missing_sizes):
        state_dict[key] = torch.zeros(size)
    config = RMatConfig.from_pretrained('rmat_4M.json')

    # Load featurizer
    featurizer = RMatFeaturizer(config)

    # Load model
    model = RMatModel(config=config)
    model.load_state_dict(state_dict)
    model.eval()
    return model, featurizer

def get_rmat(smiles_list):
    """Embed SMILES with the R-MAT encoder, averaging the encoding over the atom axis.

    Parameters
    ----------
    smiles_list : sequence of str
        Molecules as SMILES.

    Returns
    -------
    numpy.ndarray
        One vector per molecule.
    """
    model, featurizer = load_rmat(smiles_list)
    # `batch` used to be referenced here without being defined (NameError);
    # featurize the requested molecules in this scope instead.
    batch = featurizer(smiles_list)
    with torch.no_grad():
        # Rows whose node features are all zero are treated as padding.
        batch_mask = torch.sum(torch.abs(batch.node_features), dim=-1) != 0
        embedded = model.src_embed(batch.node_features)
        distances_matrix = model.dist_rbf(batch.distance_matrix)
        # Edge attention input: bond, relative-position and distance features joined on dim 1.
        edges_att = torch.cat((batch.bond_features, batch.relative_matrix, distances_matrix), dim=1)
        encoding = model.encoder(embedded, batch_mask, edges_att=edges_att)
    # NOTE(review): this mean runs over every position on axis 1, padding included.
    embedding = np.mean(encoding.detach().numpy(), axis=1)
    return embedding

In [None]:
# R-MAT embedding; get_rmat loads the model from the local files on every call.
embedding = get_rmat(smiles_list)

# Distance and correlation calculations

Embeddings should be 2-dimensional arrays saved as .pkl

In [63]:
def get_fingerprints(smiles_list):
    """Compute 2048-bit Morgan (radius 2) fingerprints for an iterable of SMILES.

    Parameters
    ----------
    smiles_list : iterable of str
        Molecules as SMILES.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape (n_molecules, 2048). A boolean matrix is what the
        Jaccard metric operates on, and it avoids building an object array from
        RDKit bit-vector instances.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    """
    fpgen = rdGen.GetMorganGenerator(radius=2, fpSize=2048)
    fingerprint_list = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        fingerprint_list.append(fpgen.GetFingerprintAsNumPy(mol))
    return np.array(fingerprint_list, dtype=bool)

def get_pairwise_similarity(smiles_list):
    """Return the pairwise similarity matrix (1 - Jaccard distance) of the Morgan fingerprints."""
    fingerprints = get_fingerprints(smiles_list)
    jaccard_distances = pairwise_distances(X=fingerprints, metric='jaccard', n_jobs=-1)
    return 1 - jaccard_distances

def get_pairwise_distance(vec_list, metric):
    """Return the pairwise distance matrix of `vec_list` under `metric`, using all CPU cores."""
    return pairwise_distances(X=vec_list, metric=metric, n_jobs=-1)

In [None]:
# Reload the SMILES for the distance/correlation stage.
# NOTE(review): unlike the first load cell, this reads the whole file (no [:5000]
# slice); confirm that the full set is intended, since the matrices grow as n^2.
smiles_list = np.genfromtxt("smiles.txt", dtype=str, comments=None)
print(f"Loaded {len(smiles_list)} SMILES")

# Morgan/Tanimoto reference similarity, stored for the correlation step.
similarity_matrix = get_pairwise_similarity(smiles_list)
with open("similarity.pkl", "wb") as file:  # plain literal: the f-prefix had no placeholders
    pickle.dump(similarity_matrix, file)

### Calculating distances with different measures: euclidean, cosine, Canberra.
It is assumed that embeddings are saved in *<embedding_name>_embedding.pkl* files. The calculated distance matrices are saved in *distance_<embedding_name>_<measure_name>.pkl*.

In [None]:
# (name, function) pairs; every function takes the SMILES list and returns one
# embedding vector per molecule.
# NOTE(review): get_chemberta, get_molformer, get_gpt2 and get_bert_smiles read the
# module-level `model` / `tokenizer`, which each model cell rebinds. When the cells
# are run top to bottom, the BERT objects are the ones bound here; re-run the
# matching load cell before each of these embeddings.
embedding_list = (
    ("mol2vec", get_mol2vec),
    ("graph2vec", get_graph2vec),
    ("chemberta", get_chemberta),
    ("cddd", get_cddd),
    # get_mte takes the path of the pre-computed .npz file as its first argument.
    ("mte", lambda smiles: get_mte("mte_embedding.npz", smiles)),
    ("macaw", get_macaw),
    ("molformer", get_molformer),
    ("gpt2", get_gpt2),  # was mapped to get_mol2vec by mistake
    ("bert_smiles", get_bert_smiles)
)
measure_list = ("euclidean", "cosine", "canberra")

for emb_name, emb_func in embedding_list:
    embedding = emb_func(smiles_list)
    # Replace NaN/inf so the distance computations do not fail.
    embedding = np.nan_to_num(embedding)
    print(f"Calculated {emb_name} embedding: {embedding.shape}")
    with open(f"{emb_name}_embedding.pkl", "wb") as file:
        # Store the embedding itself (the similarity matrix was dumped here before).
        pickle.dump(embedding, file)

    for m_name in measure_list:
        distance_matrix = get_pairwise_distance(embedding, m_name)
        with open(f"distance_{emb_name}_{m_name}.pkl", "wb") as file:
            pickle.dump(distance_matrix, file)
        print(f"\tDistance {m_name} - done.")
    print(f"Distance {emb_name} - done.")

### Calculating correlation between embedding distance and Morgan Tanimoto similarity

In [None]:
embedding_list = ("mol2vec", "graph2vec", "chemberta", "cddd", "mte", "macaw", "molformer", "gpt2", "bert_smiles")
measures_list = ("euclidean", "cosine", "canberra")

# Reference similarity matrix written by the similarity cell.
with open("similarity.pkl", "rb") as file:
    similarity_matrix = pickle.load(file)
print(similarity_matrix.shape)

similarity_matrix_flat = similarity_matrix.flatten()
for embedding in embedding_list:
    print(embedding.capitalize())
    # Row 0 is the reference similarity; one further row per distance measure.
    X = [similarity_matrix_flat]
    for measure in measures_list:
        with open(f"distance_{embedding}_{measure}.pkl", "rb") as file:
            dist_matrix = pickle.load(file)
        dist_matrix = np.nan_to_num(dist_matrix).flatten()
        # Min-max scale the flattened distances.
        value_range = np.ptp(dist_matrix)
        dist_matrix_norm = (dist_matrix - np.min(dist_matrix)) / value_range
        X.append(dist_matrix_norm)
    corr_matrix = np.corrcoef(X)

    # First row of the correlation matrix: similarity versus each distance measure.
    for column, measure in enumerate(measures_list, start=1):
        print(f"{measure}: {corr_matrix[0, column]}")

In [None]:
!pip install macaw==0.2.dev0