In [2]:
import numpy as np
import pickle
import torch
import rdkit
from rdkit import Chem
from sklearn.metrics.pairwise import pairwise_distances
from rdkit.Chem import rdFingerprintGenerator as rdGen

In [3]:
# Read one SMILES string per line and keep only the first 5000 entries.
# comments=None switches off comment stripping, so '#' (the SMILES triple-bond
# symbol) is kept as part of the string.
smiles_list = np.genfromtxt("data/jak2_smiles.txt", dtype=str, delimiter='\n', comments=None)[:5000]
print(f"Loaded compounds: {len(smiles_list)}")

Loaded compounds: 5000


# Models

## Mol2vec

In [None]:
from mol2vec.features import mol2alt_sentence, MolSentence
from gensim.models import word2vec

# Load a saved gensim Word2Vec model from disk. get_mol2vec() falls back to this
# module-level object when it is called without an explicit model.
mol2vec_model = word2vec.Word2Vec.load("models/mol2vec_model_300dim.pkl")

In [72]:
'''https://github.com/samoturk/mol2vec/issues/14'''
def sentences2vec(sentences, model, unseen=None):
    """Generate one vector per sentence by summing the vectors of its words.

    Parameters
    ----------
    sentences : list, array
        List with sentences (each sentence is an iterable of words).
    model : word2vec.Word2Vec
        Gensim word2vec model (``model.wv`` must expose ``key_to_index``,
        ``get_vector`` and ``vector_size``).
    unseen : None, str
        Keyword for unseen words. If None, those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
        One row per sentence. A sentence that contributes no vectors (empty, or
        made only of unknown words while ``unseen`` is not set) becomes a zero row.
    """
    wv = model.wv
    # Build the vocabulary set once; membership tests are then O(1) per word.
    keys = set(wv.key_to_index)
    unseen_vec = wv.get_vector(unseen) if unseen else None

    vec = []
    for sentence in sentences:
        if unseen:
            word_vectors = [wv.get_vector(word) if word in keys else unseen_vec for word in sentence]
        else:
            word_vectors = [wv.get_vector(word) for word in sentence if word in keys]

        if word_vectors:
            vec.append(sum(word_vectors))
        else:
            # sum([]) would be the int 0, which makes the stacked result ragged;
            # emit an explicit zero vector of the model's width instead.
            vec.append(np.zeros(wv.vector_size, dtype=np.float32))
    return np.array(vec)

# Main function
def get_mol2vec(smiles_list, model=None):
    """Embed SMILES strings with Mol2vec.

    Each SMILES is parsed with RDKit, converted to a Mol2vec sentence with
    ``mol2alt_sentence(mol, radius=2)`` and reduced to one vector by
    ``sentences2vec``.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to embed.
    model : word2vec.Word2Vec, optional
        Gensim model to use. When None, the module-level ``mol2vec_model`` is used.

    Returns
    -------
    np.array
        One row per input SMILES.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    if model is None:
        # Reading a module-level name needs no `global` statement.
        model = mol2vec_model

    sentence_list = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail here
            # with the offending string instead of deep inside mol2vec.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        sentence_list.append(MolSentence(mol2alt_sentence(mol, radius=2)))
    return sentences2vec(sentence_list, model)

In [None]:
# Mol2vec embedding of the loaded SMILES. The result goes into the shared
# `embedding` variable, which each model section below overwrites.
embedding = get_mol2vec(smiles_list)

## Graph2vec

In [3]:
import networkx as nx
from karateclub import Graph2Vec

In [31]:
def mol2graph(mol):
    """Build an undirected networkx graph from a molecule.

    Parameters
    ----------
    mol : molecule object exposing ``GetAtoms()`` and ``GetBonds()``
        (callers in this notebook pass the result of ``Chem.MolFromSmiles``).

    Returns
    -------
    nx.Graph
        Nodes are keyed by atom index and carry ``atomic_num`` and
        ``atom_symbol``; edges join bonded atoms and carry ``bond_type``.
    """
    graph = nx.Graph()
    # (node, attribute-dict) pairs, one per atom.
    graph.add_nodes_from(
        (
            atom.GetIdx(),
            {"atomic_num": atom.GetAtomicNum(), "atom_symbol": atom.GetSymbol()},
        )
        for atom in mol.GetAtoms()
    )
    # (u, v, attribute-dict) triples, one per bond.
    graph.add_edges_from(
        (
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx(),
            {"bond_type": bond.GetBondType()},
        )
        for bond in mol.GetBonds()
    )
    return graph

def get_graph2vec(smiles_list, model=None):
    """Embed SMILES strings with Graph2Vec.

    Every SMILES is parsed with RDKit and converted to a networkx graph by
    ``mol2graph``; the model is then fitted on that set of graphs and its
    embedding is returned. Because the model is fitted on the supplied
    molecules, the vectors depend on the whole input set.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to embed.
    model : object with ``fit`` and ``get_embedding``, optional
        Defaults to a fresh ``Graph2Vec()`` with library defaults.

    Returns
    -------
    Whatever ``model.get_embedding()`` returns after fitting.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    if model is None:
        model = Graph2Vec()

    graph_list = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; report the string.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        graph_list.append(mol2graph(mol))

    model.fit(graph_list)
    return model.get_embedding()

In [None]:
# Graph2vec embedding of the loaded SMILES (overwrites the shared `embedding` variable).
embedding = get_graph2vec(smiles_list)

## ChemBERTa

In [54]:
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [None]:
# Load the ChemBERTa masked-LM checkpoint and its tokenizer, then put the model in eval mode.
# NOTE: `tokenizer` is a module-level name that the MolFormer, GPT2 and BERT cells
# further down rebind. get_chemberta() looks it up at call time, so this cell has to be
# re-run before calling get_chemberta() once any of those cells has executed.
chemberta = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
chemberta.eval()

In [56]:
'''https://www.kaggle.com/code/alexandervc/chembert2a-smiles-embeddings-for-beginners/notebook'''
def get_chemberta(smiles_list):
    """Embed SMILES strings with ChemBERTa, one molecule at a time.

    For each SMILES the first model output is averaged over the token axis,
    giving one vector per molecule.

    NOTE(review): ``chemberta`` is loaded with ``AutoModelForMaskedLM``, whose
    first output is the logits tensor (vocabulary-sized), not the encoder hidden
    state -- confirm that averaging logits is the intended embedding.

    Relies on the module-level ``chemberta`` and ``tokenizer``. ``tokenizer`` is
    rebound by other model sections of this notebook, so make sure it is the
    ChemBERTa tokenizer when this function runs.

    Parameters
    ----------
    smiles_list : sequence of str
        SMILES strings to embed.

    Returns
    -------
    np.ndarray
        Array of shape (len(smiles_list), width), where width is the last
        dimension of the model output.
    """
    rows = []
    with torch.no_grad():
        for smiles in tqdm(smiles_list):
            encoded_input = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
            model_output = chemberta(**encoded_input)
            # Mean over the token axis -> shape (1, width). The width is taken
            # from the model output instead of being hard-coded.
            rows.append(torch.mean(model_output[0], 1).float())

    if not rows:
        # Nothing to embed: keep a 2-D result. The masked-LM output width equals
        # the configured vocabulary size.
        return np.zeros((0, chemberta.config.vocab_size), dtype=np.float32)
    return torch.cat(rows, dim=0).numpy()

In [60]:
# ChemBERTa embedding of the loaded SMILES (overwrites the shared `embedding` variable).
embedding = get_chemberta(smiles_list)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:28<00:00, 176.75it/s]


## Continuous Data Driven Descriptors

The descriptors are generated with the CDDD REST service; see the [repository](https://github.com/vaxherra/cddd_rest) for how to run it. The service must be reachable at the URL passed to `get_descriptors`.

In [13]:
import requests
import json
import numpy as np
import pandas as pd
import pickle

In [34]:
def prepare_json(smiles_list, batch_size=1):
    """Split a sequence into consecutive batches and wrap them in a request payload.

    Parameters
    ----------
    smiles_list : sequence
        Items to split (sliced with ``[start:start + batch_size]``).
    batch_size : int
        Maximum number of items per batch; the last batch may be shorter.

    Returns
    -------
    dict
        ``{"batches": [batch_0, batch_1, ...]}``.
    """
    chunks = []
    for start in range(0, len(smiles_list), batch_size):
        stop = start + batch_size
        chunks.append(smiles_list[start:stop])
    return {"batches": chunks}

def get_descriptors(json_data, url="http://192.168.1.100:80/predict", timeout=None):
    """POST a batch payload to the CDDD REST endpoint and return the descriptors.

    Parameters
    ----------
    json_data : dict
        Payload as produced by ``prepare_json``.
    url : str
        Endpoint to call. The default is a fixed private-network address; pass
        the address of your own service instance.
    timeout : float or tuple, optional
        Forwarded to ``requests.post``. None (default) waits indefinitely,
        which is the previous behaviour.

    Returns
    -------
    np.ndarray
        All columns of the decoded prediction table except the first two.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    """
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps(json_data), headers=headers, timeout=timeout)
    # Surface HTTP errors directly instead of failing later on a missing key.
    response.raise_for_status()
    response_json = response.json()
    # "Prediction" holds a JSON document encoded as a string, hence the second decode.
    response_dict = json.loads(response_json["Prediction"])
    df = pd.DataFrame.from_dict(response_dict)
    # NOTE(review): the first two columns are dropped -- presumably non-descriptor
    # columns such as the input SMILES; confirm against the service output.
    return df.iloc[:, 2:].values

def get_cddd(smiles_list, batch_size=1):
    """Compute CDDD descriptors for SMILES strings through the REST service.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings; converted to a plain list so the payload is JSON-serialisable.
    batch_size : int
        Number of SMILES per batch in the request payload.

    Returns
    -------
    np.ndarray
        Descriptor matrix returned by ``get_descriptors``.
    """
    smiles_list = list(smiles_list)
    # Forward batch_size; it was previously accepted but never used.
    json_data = prepare_json(smiles_list, batch_size=batch_size)
    return get_descriptors(json_data)

In [None]:
# CDDD descriptors of the loaded SMILES (overwrites the shared `embedding` variable).
# Requires the REST service to be reachable at the default URL of get_descriptors().
embedding = get_cddd(smiles_list)

## Molecular Transformer Embeddings

The embeddings can be calculated using the scripts available [here](https://github.com/mpcrlab/MolecularTransformerEmbeddings). The generated .npz file can then be converted to an embedding array using the function below.

In [4]:
def get_mte(file_path, smiles_list):
    """Load precomputed Molecular Transformer embeddings from an .npz archive.

    The archive is indexed by SMILES string; each stored array is averaged
    over its first axis to give one vector per molecule.

    Parameters
    ----------
    file_path : str or path-like
        Path to the .npz file.
    smiles_list : iterable of str
        SMILES strings to look up, in the desired output order.

    Returns
    -------
    np.ndarray
        One row per requested SMILES.

    Raises
    ------
    KeyError
        If a SMILES string is not present in the archive.
    """
    # NpzFile keeps the underlying file open; the context manager closes it
    # once every requested entry has been reduced to its mean vector.
    with np.load(file_path) as embedding_npz:
        embedding_list = [np.mean(embedding_npz[smiles], axis=0) for smiles in smiles_list]
    return np.array(embedding_list)

In [5]:
# Molecular Transformer embeddings looked up in the precomputed .npz archive
# (overwrites the shared `embedding` variable). Unlike the other get_* helpers,
# get_mte takes the archive path as its first argument.
embedding = get_mte("data/embedding/jak2_smiles.npz", smiles_list)

## MACAW

In [None]:
from macaw import *

In [None]:
def get_macaw(smiles_list, n_dimensions=20):
    """Embed SMILES strings with MACAW.

    A MACAW embedder is fitted on the given SMILES and the same SMILES are
    then transformed, so the result depends on the whole input set.

    Parameters
    ----------
    smiles_list : sequence of str
        SMILES strings to embed.
    n_dimensions : int
        Passed to ``MACAW`` as ``n_components``.

    Returns
    -------
    The output of ``MACAW.transform`` for ``smiles_list``.
    """
    mcw = MACAW(n_components=n_dimensions)
    mcw.fit(smiles_list)
    embedding = mcw.transform(smiles_list)
    return embedding

In [None]:
# MACAW embedding of the loaded SMILES (overwrites the shared `embedding` variable).
embedding = get_macaw(smiles_list)

## MolFormer

In [49]:
import torch
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModelForMaskedLM, AutoTokenizer, GenerationConfig

# Load the MoLFormer checkpoint and its tokenizer.
# trust_remote_code=True executes Python code shipped in the Hub repository; only
# use it with repositories you trust.
# NOTE: `model` and `tokenizer` are shared module-level names. The GPT2 and BERT
# cells below rebind both, and the ChemBERTa cell above binds `tokenizer`;
# get_molformer() reads them at call time, so re-run this cell right before calling it.
model = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)

In [None]:
def get_molformer(smiles_list, batch_size=None):
    """Embed SMILES strings with MoLFormer (``pooler_output``).

    Relies on the module-level ``model`` and ``tokenizer``. Those names are
    rebound by other model sections of this notebook, so make sure the
    MoLFormer checkpoint is the one currently loaded.

    Parameters
    ----------
    smiles_list : str or iterable of str
        SMILES strings to embed. Numpy arrays are accepted; they are converted
        to a plain list of ``str`` because the tokenizer does not take arrays.
    batch_size : int, optional
        Number of SMILES per forward pass. None (default) sends everything in
        one pass, which can exhaust memory on large inputs.
        NOTE(review): batches are padded independently -- confirm that
        ``pooler_output`` is unaffected by padding length before comparing
        batched and unbatched results.

    Returns
    -------
    np.ndarray
        One row per input SMILES.

    Raises
    ------
    ValueError
        From ``np.concatenate`` if ``batch_size`` is set and the input is empty.
    """
    if isinstance(smiles_list, str):
        smiles = [smiles_list]
    else:
        smiles = [str(s) for s in smiles_list]

    if batch_size is None:
        batches = [smiles]
    else:
        batches = [smiles[i:i + batch_size] for i in range(0, len(smiles), batch_size)]

    chunks = []
    with torch.no_grad():
        for batch in batches:
            inputs = tokenizer(batch, padding=True, return_tensors="pt")
            outputs = model(**inputs)
            chunks.append(outputs.pooler_output.detach().cpu().numpy())
    return np.concatenate(chunks, axis=0)

In [None]:
# MoLFormer embedding of the loaded SMILES (overwrites the shared `embedding` variable).
embedding = get_molformer(smiles_list)

## GPT2

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, DataCollatorWithPadding

# Load the GPT2 (ZINC) tokenizer and model, plus a collator that pads a tokenized
# batch and returns PyTorch tensors.
# NOTE: this rebinds the shared module-level `model` and `tokenizer`, replacing the
# objects bound by the MolFormer cell above; the BERT cell below rebinds them again.
# get_gpt2() reads them at call time, so re-run this cell right before calling it.
tokenizer = GPT2TokenizerFast.from_pretrained("entropy/gpt2_zinc_87m", max_len=256)
model = GPT2LMHeadModel.from_pretrained('entropy/gpt2_zinc_87m')
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

In [None]:
def get_gpt2(smiles_list, batch_size=100):
    """Embed SMILES strings with GPT2 by mask-weighted mean pooling.

    For every batch the last hidden-state layer is averaged over the token
    axis, counting only positions where the attention mask is 1.

    Relies on the module-level ``model``, ``tokenizer`` and ``collator``.
    ``model`` and ``tokenizer`` are rebound by other model sections of this
    notebook, so make sure the GPT2 checkpoint is the one currently loaded.

    Parameters
    ----------
    smiles_list : str or iterable of str
        SMILES strings to embed. Numpy arrays are accepted; they are converted
        to a plain list of ``str`` because the tokenizer does not take arrays.
    batch_size : int
        Number of SMILES per forward pass (default 100).

    Returns
    -------
    list of np.ndarray
        One 1-D vector per input SMILES.
    """
    if isinstance(smiles_list, str):
        smiles = [smiles_list]
    else:
        smiles = [str(s) for s in smiles_list]

    embedding_list = []
    # Inference only: without no_grad every batch would keep its autograd graph alive.
    with torch.no_grad():
        for i in range(0, len(smiles), batch_size):
            print(f"{i}:{i+batch_size}")
            inputs = collator(tokenizer(smiles[i:i+batch_size]))
            outputs = model(**inputs, output_hidden_states=True)
            full_embeddings = outputs.hidden_states[-1]  # last layer
            mask = inputs['attention_mask']
            # Zero out padded positions, then divide by the number of real tokens.
            embeddings = (full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1)
            embedding_list.extend(row.detach().cpu().numpy() for row in embeddings)
    return embedding_list

In [None]:
# GPT2 embedding of the loaded SMILES (overwrites the shared `embedding` variable).
# get_gpt2 returns a list of 1-D arrays rather than a 2-D array.
embedding = get_gpt2(smiles_list)

## BERT for SMILES

In [64]:
from transformers import BertTokenizerFast, BertModel

# Load the BERT-for-SMILES model and tokenizer.
# NOTE: this rebinds the shared module-level `model` and `tokenizer` once more. After
# this cell runs, get_molformer() and get_gpt2() (which read `model`/`tokenizer`) and
# get_chemberta() (which reads `tokenizer`) would pick up these BERT objects unless
# their own loading cells are re-run first.
model = BertModel.from_pretrained("unikei/bert-base-smiles")
tokenizer = BertTokenizerFast.from_pretrained("unikei/bert-base-smiles")

In [68]:
def get_bert_smiles(smiles_list):
    """Embed SMILES strings with BERT by mask-weighted mean pooling.

    The last hidden state is averaged over the token axis, counting only
    positions where the attention mask is 1. A plain mean would also average
    the padding positions, making each vector depend on the longest SMILES in
    the batch.

    Relies on the module-level ``model`` and ``tokenizer``. Those names are
    rebound by other model sections of this notebook, so make sure the BERT
    checkpoint is the one currently loaded. The whole input is processed in a
    single forward pass.

    Parameters
    ----------
    smiles_list : str or iterable of str
        SMILES strings to embed. Numpy arrays are accepted; they are converted
        to a plain list of ``str`` because the tokenizer does not take arrays.

    Returns
    -------
    np.ndarray
        One row per input SMILES.
    """
    if isinstance(smiles_list, str):
        smiles = [smiles_list]
    else:
        smiles = [str(s) for s in smiles_list]

    inputs = tokenizer(smiles, padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the mask over the feature axis and match the hidden-state dtype.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embedding = (hidden * mask).sum(dim=1) / token_counts
    return embedding.detach().cpu().numpy()

In [None]:
# BERT-for-SMILES embedding of the loaded SMILES (overwrites the shared `embedding` variable).
embedding = get_bert_smiles(smiles_list)

# Distance and correlation calculations

Embeddings should be 2-dimensional arrays saved as .pkl

In [None]:
def get_fingerprints(smiles_list):
    """Compute Morgan bit fingerprints (radius 2, 2048 bits) for SMILES strings.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to fingerprint.

    Returns
    -------
    np.ndarray
        Boolean matrix with one row per SMILES and one column per fingerprint
        bit, ready for boolean distance metrics such as Jaccard.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    fpgen = rdGen.GetMorganGenerator(radius=2, fpSize=2048)
    rows = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; report the string.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        # Ask RDKit for a numpy vector directly instead of relying on numpy's
        # implicit coercion of ExplicitBitVect objects.
        rows.append(fpgen.GetFingerprintAsNumPy(mol))
    return np.array(rows, dtype=bool)

def get_pairwise_similarity(smiles_list):
    """Return the pairwise fingerprint similarity matrix for SMILES strings.

    Similarity is computed as ``1 - Jaccard distance`` between the
    fingerprints produced by ``get_fingerprints``; all CPU cores are used
    (``n_jobs=-1``).
    """
    fps = get_fingerprints(smiles_list)
    jaccard_distance = pairwise_distances(X=fps, metric='jaccard', n_jobs=-1)
    return 1 - jaccard_distance

def get_pairwise_distance(vec_list, metric):
    """Return the pairwise distance matrix between the rows of ``vec_list``.

    ``metric`` is forwarded unchanged to ``pairwise_distances``; all CPU cores
    are used (``n_jobs=-1``).
    """
    return pairwise_distances(X=vec_list, metric=metric, n_jobs=-1)

In [None]:
# Reload the SMILES file. Unlike the load at the top of the notebook there is no
# [:5000] slice here, so `smiles_list` now holds every entry in the file.
smiles_list = np.genfromtxt("data/jak2_smiles.txt", dtype=str, comments=None)
print(f"Loaded {len(smiles_list)} SMILES")

# Pairwise fingerprint similarity for all loaded compounds, cached on disk so the
# correlation cell can read it back without recomputing.
similarity_matrix = get_pairwise_similarity(smiles_list)
with open(f"data/distance/similarity_jak2.pkl", "wb") as file:
    pickle.dump(similarity_matrix, file)

Calculate distances with different measures

In [None]:
# Compute every embedding for `smiles_list`, cache it, then cache its pairwise
# distance matrix under each metric.
#
# FIXME: get_molformer, get_gpt2 and get_bert_smiles read the module-level `model`
# and `tokenizer`, and get_chemberta reads `tokenizer`. Each loading cell rebinds
# those names, so here they refer to whichever checkpoint was loaded last. Reload the
# matching checkpoint before each of these embeddings (or give every checkpoint its
# own variable names); otherwise those calls run with the wrong model.
embedding_list = (
    ("mol2vec", get_mol2vec),
    ("graph2vec", get_graph2vec),
    ("chemberta", get_chemberta),
    ("cddd", get_cddd),
    # get_mte takes the precomputed .npz path as its first argument, so it is
    # wrapped to match the one-argument call used in the loop below.
    ("mte", lambda smiles: get_mte("data/embedding/jak2_smiles.npz", smiles)),
    ("macaw", get_macaw),
    ("molformer", get_molformer),
    ("gpt2", get_gpt2),
    ("bert_smiles", get_bert_smiles)
)
measure_list = ("euclidean", "cosine", "canberra")

for emb_name, emb_func in embedding_list:
    embedding = emb_func(smiles_list)
    # Also turns a list of vectors (as returned by get_gpt2) into a 2-D array.
    embedding = np.nan_to_num(embedding)
    print(f"Calculated {emb_name} embedding: {embedding.shape}")
    with open(f"data/embedding/{emb_name}_embedding.pkl", "wb") as file:
        # Store the embedding itself, not the fingerprint similarity matrix.
        pickle.dump(embedding, file)

    for m_name in measure_list:
        distance_matrix = get_pairwise_distance(embedding, m_name)
        with open(f"data/distance/distance_{emb_name}_{m_name}.pkl", "wb") as file:
            pickle.dump(distance_matrix, file)
        print(f"\tDistance {m_name} - done.")
    print(f"Distance {emb_name} - done.")

Calculate correlation between embedding distance and Morgan Tanimoto similarity

In [None]:
# For every embedding, correlate each cached distance matrix with the cached
# fingerprint similarity matrix (Pearson correlation over flattened matrices).
embedding_list = ("mol2vec", "graph2vec", "chemberta", "cddd", "mte", "macaw", "molformer", "gpt2", "bert_smiles")
measures_list = ("euclidean", "cosine", "canberra")

# Every matrix is cut to the same leading n_compounds x n_compounds block so the
# flattened vectors line up element by element; np.corrcoef needs equal-length rows.
# NOTE(review): 1000 is the slice size previously applied to the similarity matrix;
# raise it if the comparison should cover more compounds.
n_compounds = 1000

# The pickles read here are the ones written by this notebook; do not point these
# paths at untrusted files, since unpickling can execute arbitrary code.
with open("data/distance/similarity_jak2.pkl", "rb") as file:
    similarity_matrix = pickle.load(file)[:n_compounds, :n_compounds]
print(similarity_matrix.shape)

similarity_matrix_flat = similarity_matrix.flatten()
for embedding_name in embedding_list:
    print(embedding_name)
    X = [similarity_matrix_flat]
    for measure in measures_list:
        with open(f"data/distance/distance_{embedding_name}_{measure}.pkl", "rb") as file:
            dist_matrix = pickle.load(file)
        dist_matrix = np.nan_to_num(dist_matrix[:n_compounds, :n_compounds])
        dist_matrix = dist_matrix.flatten()
        # Min-max scale to [0, 1]; a constant matrix has zero range, so skip the
        # division there instead of producing NaN/inf.
        value_range = np.ptp(dist_matrix)
        dist_matrix_norm = dist_matrix - np.min(dist_matrix)
        if value_range:
            dist_matrix_norm = dist_matrix_norm / value_range
        X.append(dist_matrix_norm)
    corr_matrix = np.corrcoef(X)

    print(corr_matrix)
    # Row 0 is the similarity vector; columns 1.. are the distance measures.
    for measure, corr in zip(measures_list, corr_matrix[0, 1:]):
        print(f"{measure}: {corr}")