In [2]:
import pickle
import time
import warnings

import faiss
import numpy as np
import pandas as pd
import rdkit
import seaborn as sns
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, Draw, PandasTools
from rdkit.Chem import rdFingerprintGenerator as rdGen
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.ML.Cluster import Butina
from sklearn.metrics.pairwise import pairwise_distances
warnings.filterwarnings("ignore")

In [32]:
# Configurations to evaluate: (embedding name, metric label, faiss index class).
# The metric label is only used in printed reports, so it must name the metric
# the index class actually implements: IndexFlatL2 ranks by (squared) Euclidean
# distance, IndexFlatIP ranks by inner product (largest first).
embedding_distance = [
    ("cddd", "euclidean", faiss.IndexFlatL2),
    ("cddd", "inner_product", faiss.IndexFlatIP),
    ("molformer", "euclidean", faiss.IndexFlatL2),
    ("molformer", "inner_product", faiss.IndexFlatIP),
    ("macaw", "inner_product", faiss.IndexFlatIP),
    # ("mol2vec", "inner_product", faiss.IndexFlatIP),
]

In [15]:
# Read the library SMILES as strings. comments=None switches off comment
# handling so that '#' characters inside a SMILES string do not truncate it.
# The trailing [1:] discards the first parsed entry
# (presumably a header row -- TODO confirm against the CSV).
smiles_list = np.genfromtxt(
    "data/smiles_random.csv",
    comments=None,
    dtype=str,
)[1:]
print(smiles_list.shape)

(10381,)


## Cluster by Tanimoto similarity on Morgan2 fingerprints

In [30]:
def butina_cluster(fingerprint_list, threshold=0.35):
    """Cluster fingerprints with Butina and return one cluster label per item.

    Distances are ``1 - Tanimoto similarity``. They are collected row by row
    (item ``i`` against items ``0 .. i-1``) into a single flat list, which is
    handed to ``Butina.ClusterData`` together with ``isDistData=True``.

    Args:
        fingerprint_list: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        threshold: Value forwarded as ``distThresh`` to ``Butina.ClusterData``.

    Returns:
        A float NumPy array of length ``len(fingerprint_list)``. Clusters are
        numbered from 1 in the order ``Butina.ClusterData`` returns them; an
        item that appears in no cluster keeps the initial value 0.
    """
    n_items = len(fingerprint_list)

    # Flattened lower triangle of the pairwise distance matrix.
    condensed = []
    for row in range(1, n_items):
        row_sims = DataStructs.BulkTanimotoSimilarity(
            fingerprint_list[row], fingerprint_list[:row]
        )
        condensed.extend(1 - sim for sim in row_sims)

    groups = Butina.ClusterData(
        condensed, nPts=n_items, distThresh=threshold, isDistData=True
    )

    labels = np.zeros((n_items,))
    for label, members in enumerate(groups, start=1):
        for member in members:
            labels[member] = label
    return labels

In [32]:
# Morgan fingerprints (radius 2, 2048 bits) for every library molecule, built
# with the same rdFingerprintGenerator API used for the similarity search so
# both parts of the notebook share one fingerprint definition.
morgan_generator = rdGen.GetMorganGenerator(radius=2, fpSize=2048)
fingerprint_list = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending input instead of an opaque error further down.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    fingerprint_list.append(morgan_generator.GetFingerprint(mol))

# perf_counter is a monotonic clock, so the timing is unaffected by system
# clock adjustments. Only the clustering step is timed.
start_time = time.perf_counter()
clusters_fp = butina_cluster(fingerprint_list)
print(f"Exec time: {round(time.perf_counter() - start_time, 3)}s")

28.666009664535522


## Cluster by distance on embedding

In [18]:
def kmeans_cluster(embedding_list, n_clusters=500, n_iter=20):
    """Cluster embedding vectors with faiss k-means.

    The vector dimension is taken from ``embedding_list`` itself, so the
    function does not depend on any notebook-level variable.

    Args:
        embedding_list: 2-D array-like with one embedding per row.
        n_clusters: Number of centroids (``k``) passed to ``faiss.Kmeans``.
        n_iter: Number of training iterations (``niter``).

    Returns:
        The index array returned by ``kmeans.index.search(embedding_list, 1)``,
        i.e. the nearest-centroid id for every input row.
    """
    # faiss works on float32 data; convert once so any numeric input is accepted.
    embedding_list = np.ascontiguousarray(embedding_list, dtype=np.float32)
    kmeans = faiss.Kmeans(
        d=embedding_list.shape[1], k=n_clusters, niter=n_iter, verbose=True
    )
    kmeans.train(embedding_list)
    _, indices = kmeans.index.search(embedding_list, 1)
    return indices

In [56]:
def rand_index(clusters_fp, clusters_emb):
    """Compute the Rand index between two clusterings of the same items.

    The Rand index is the fraction of item pairs on which the two clusterings
    agree (both put the pair in the same cluster, or both separate it).
    Pair counts are derived from cluster sizes and the sizes of the joint
    (fp label, emb label) groups, so no explicit loop over all pairs is needed.

    Args:
        clusters_fp: Cluster labels of the first clustering, one per item.
        clusters_emb: Cluster labels of the second clustering. Both inputs are
            flattened, so shapes ``(n,)`` and ``(n, 1)`` are accepted.

    Returns:
        The Rand index as a float in ``[0, 1]``. With fewer than two items
        there are no pairs to compare and 1.0 is returned by convention.

    Raises:
        ValueError: If the two label arrays do not contain the same number
            of items.
    """
    labels_fp = np.asarray(clusters_fp).reshape(-1)
    labels_emb = np.asarray(clusters_emb).reshape(-1)
    n = labels_fp.size
    if labels_emb.size != n:
        raise ValueError(
            f"Label arrays differ in length: {n} vs {labels_emb.size}"
        )

    # Re-encode arbitrary labels as dense integer codes 0..k-1.
    unique_fp, codes_fp = np.unique(labels_fp, return_inverse=True)
    unique_emb, codes_emb = np.unique(labels_emb, return_inverse=True)
    codes_fp = codes_fp.reshape(-1).astype(np.int64)
    codes_emb = codes_emb.reshape(-1).astype(np.int64)

    # One integer per (fp cluster, emb cluster) combination.
    joint_codes = codes_fp * len(unique_emb) + codes_emb
    _, joint_sizes = np.unique(joint_codes, return_counts=True)

    def _pairs_within(sizes):
        """Number of unordered pairs inside groups of the given sizes."""
        sizes = np.asarray(sizes, dtype=np.int64)
        return int(np.sum(sizes * (sizes - 1) // 2))

    same_both = _pairs_within(joint_sizes)
    same_fp = _pairs_within(np.bincount(codes_fp))
    same_emb = _pairs_within(np.bincount(codes_emb))
    total_pairs = n * (n - 1) // 2

    # counts[a][b]: a = pair shares an fp cluster, b = pair shares an emb cluster.
    counts = [
        [total_pairs - same_fp - same_emb + same_both, same_emb - same_both],
        [same_fp - same_both, same_both],
    ]
    print(counts[0][0], counts[0][1], counts[1][0], counts[1][1])

    if total_pairs == 0:
        return 1.0
    return (counts[0][0] + counts[1][1]) / total_pairs

In [None]:
# k-means is run directly on the embedding vectors and never uses a faiss
# search index, so the index class / metric label of a configuration has no
# effect here. Each embedding is therefore evaluated exactly once
# (dict.fromkeys de-duplicates while preserving order).
for emb_name in dict.fromkeys(name for name, _, _ in embedding_distance):
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f"embedding/{emb_name}_embedding.pkl", "rb") as file:
        embedding_list = pickle.load(file)
    embedding_list = np.nan_to_num(embedding_list)
    print(embedding_list.shape)

    # Set at notebook level; kmeans_cluster may look this name up as a global.
    vector_dimension = embedding_list.shape[1]

    clusters_emb = kmeans_cluster(embedding_list, n_clusters=4100, n_iter=7)
    rand_idx = rand_index(clusters_fp, clusters_emb)
    print(f"Rand index for {emb_name}: {rand_idx}")

## Compare similarity search results

In [39]:
def get_results_fp(smiles, smiles_list, library_fps=None):
    """Rank library molecules by Tanimoto distance to a query molecule.

    Fingerprints are Morgan fingerprints with radius 2 and 2048 bits.

    Args:
        smiles: SMILES string of the query molecule.
        smiles_list: SMILES strings of the library molecules. Ignored when
            ``library_fps`` is given.
        library_fps: Optional precomputed library fingerprints, in library
            order. Supplying them avoids re-parsing and re-fingerprinting the
            whole library on every call.

    Returns:
        NumPy array of library positions sorted by increasing distance
        (``1 - Tanimoto similarity``) to the query.

    Raises:
        ValueError: If the query or a library SMILES cannot be parsed.
    """
    fpgen = rdGen.GetMorganGenerator(radius=2, fpSize=2048)

    query_mol = Chem.MolFromSmiles(smiles)
    if query_mol is None:
        raise ValueError(f"Could not parse query SMILES: {smiles!r}")
    query_fp = fpgen.GetFingerprint(query_mol)

    if library_fps is None:
        library_fps = []
        for smi in smiles_list:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(f"Could not parse library SMILES: {smi!r}")
            library_fps.append(fpgen.GetFingerprint(mol))

    # One bulk call instead of a Python-level similarity call per molecule.
    similarities = DataStructs.BulkTanimotoSimilarity(query_fp, list(library_fps))
    distances = 1 - np.array(similarities)
    return np.argsort(distances)

def get_results_emb(embedding, index, k=None):
    """Return library positions ranked by the index for one query embedding.

    Args:
        embedding: 1-D query embedding vector.
        index: Search index exposing ``ntotal`` and ``search(queries, k=...)``.
        k: Number of neighbours to retrieve. Defaults to ``index.ntotal``
            (rank the whole library); larger values are clamped to
            ``index.ntotal``.

    Returns:
        The first row of the index array returned by ``index.search``, i.e.
        the ranked result ids for this single query.
    """
    if k is None:
        k = index.ntotal
    else:
        # Never ask for more neighbours than the index holds.
        k = min(k, index.ntotal)

    # A single query is passed as a (1, d) float32 matrix.
    query_emb = np.ascontiguousarray(embedding, dtype=np.float32).reshape(1, -1)
    _, indices = index.search(query_emb, k=k)
    return indices[0]

In [33]:
# One query SMILES per line. comments=None switches off comment handling so
# that '#' characters inside a SMILES string do not truncate the entry.
query_smiles_list = np.genfromtxt(
    "data/smiles_query.txt",
    comments=None,
    delimiter='\n',
    dtype=str,
)
print(len(query_smiles_list))

300


In [None]:
n = 10  # number of top-ranked neighbours compared per query
n_queries = len(query_smiles_list)

# The fingerprint ranking depends only on the query and the library, not on
# the embedding being evaluated, so compute it once and reuse it below.
print()
results_fp_by_query = []
for i, query_smiles in enumerate(query_smiles_list, 1):
    print(f"\r{i} / {n_queries}", end='')
    results_fp_by_query.append(get_results_fp(query_smiles, smiles_list)[:n])
print()

for emb_name, dist_name, index_cls in embedding_distance:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f"embedding/{emb_name}_query_embedding.pkl", "rb") as file:
        query_embedding_list = pickle.load(file)
    query_embedding_list = np.nan_to_num(query_embedding_list)
    print(query_embedding_list.shape)
    with open(f"embedding/{emb_name}_embedding.pkl", "rb") as file:
        embedding_list = pickle.load(file)
    embedding_list = np.nan_to_num(embedding_list)
    print(embedding_list.shape)

    vector_dimension = embedding_list.shape[1]
    index = index_cls(vector_dimension)
    index.add(embedding_list)
    print(f"Added to index: {index.ntotal}")

    # Overlap between the top-n embedding neighbours and the top-n
    # fingerprint neighbours, accumulated over all queries.
    total_common_count = 0
    total_recall = 0
    print()
    for i, (query_emb, results_fp) in enumerate(
        zip(query_embedding_list, results_fp_by_query), 1
    ):
        print(f"\r{i} / {n_queries}", end='')
        results_emb = get_results_emb(query_emb, index)[:n]
        count = len(set(results_emb) & set(results_fp))
        total_common_count += count
        total_recall += count / n
    print()
    print(total_common_count, total_recall)
    print(f"{emb_name} - {dist_name}: recall1={total_recall / n_queries}\trecall2={total_common_count / (n * n_queries)}")

(300, 512)
(10381, 512)
Added to index: 10381

300 / 300
1296 129.6
cddd - euclidean: recall1=0.432	recall2=0.432
(300, 512)
(10381, 512)
Added to index: 10381

80 / 300