Import libraries

In [57]:
import os
import math
import pickle
import numpy as np
from tqdm import tqdm
from rdkit.Chem import AllChem

from utils import load_pickle, get_mol, pickle_data

Defining helper functions

In [48]:
def get_morgan(mol, radius, FP_size):
    """Return the Morgan fingerprint of ``mol`` rendered as a bit string.

    Args:
        mol: Molecule object handed to the RDKit fingerprint generator.
        radius: Morgan radius forwarded to the generator.
        FP_size: Fingerprint length in bits (the generator's ``fpSize``).

    Returns:
        The value of ``ToBitString()`` on the generated fingerprint.
    """
    generator = AllChem.GetMorganGenerator(radius=radius, fpSize=FP_size)
    return generator.GetFingerprint(mol).ToBitString()

def union_FP(FP_list):
    """Bitwise union (logical OR) of several equal-length fingerprints.

    Args:
        FP_list: Non-empty sequence of fingerprints, each an equal-length
            sequence of 0/1 values (one row per fingerprint).

    Returns:
        1-D integer ``np.ndarray`` whose i-th entry is 1 when the i-th column
        of ``FP_list`` sums to more than zero, else 0.

    Raises:
        ValueError: If ``FP_list`` is empty or does not form a 2-D array
            (previously this surfaced as an opaque ``IndexError``).
    """
    bits = np.asarray(FP_list)

    if bits.ndim != 2:
        raise ValueError(
            "union_FP expects a non-empty list of equal-length fingerprints; "
            f"got an array with shape {bits.shape}"
        )

    # One vectorised column reduction instead of a Python-level loop that
    # called the builtin sum() on every column.
    return (bits.sum(axis=0) > 0).astype(int)

def jaccard(FP1, FP2):
    """Jaccard (Tanimoto) similarity between two binary fingerprints.

    Args:
        FP1: Array-like of 0/1 integers (or booleans).
        FP2: Array-like with the same shape as ``FP1``.

    Returns:
        ``sum(FP1 & FP2) / sum(FP1 | FP2)``. When neither fingerprint has any
        bit set the ratio would be 0/0; 1.0 is returned in that case (two
        empty bit sets are identical) instead of NaN with a NumPy
        RuntimeWarning.
    """
    # asarray lets plain Python lists through; `&` and `|` are undefined on lists.
    FP1 = np.asarray(FP1)
    FP2 = np.asarray(FP2)

    intersection = np.sum(FP1 & FP2)  # bits set in both fingerprints
    union = np.sum(FP1 | FP2)         # bits set in at least one fingerprint

    if union == 0:
        # Both fingerprints are all-zero: avoid the 0/0 division.
        return 1.0

    return intersection / union

def get_info(smiles):
    """Compute every Morgan fingerprint variant for one SMILES string.

    Args:
        smiles: SMILES string, converted to a molecule with ``get_mol``.

    Returns:
        Dict mapping a fingerprint name to the bit string returned by
        ``get_morgan``. The ``morgan4_*`` keys use radius 2 and the
        ``morgan6_*`` keys use radius 3; the suffix is the fingerprint size.
    """
    # (result key, Morgan radius, fingerprint size), listed in output order.
    fingerprint_specs = (
        ("morgan4_256", 2, 256),
        ("morgan4_1024", 2, 1024),
        ("morgan4_2048", 2, 2048),
        ("morgan6_256", 3, 256),
        ("morgan6_1024", 3, 1024),
        ("morgan6_2048", 3, 2048),
        ("morgan6_4096", 3, 4096),
    )

    mol = get_mol(smiles)

    return {key: get_morgan(mol, radius, size)
            for key, radius, size in fingerprint_specs}

def compare_parent_similarity(r, frag_FP_list):
    """Compare a parent's fingerprints with the union of its fragments'.

    Args:
        r: Mapping holding the parent's fingerprint bit strings under the
            same keys as the fragment dicts.
        frag_FP_list: Non-empty list of dicts (fingerprint name -> bit
            string), one per fragment. The keys of the first dict decide
            which fingerprints are compared.

    Returns:
        Dict keyed by fingerprint name. Each value holds the parent bit array
        (``"p"``), the union of the fragment bit arrays (``"frag"``), their
        Jaccard similarity (``"sim"``) and the number of set bits in each
        (``"p_sum_1"``, ``"frag_sum_1"``).
    """
    def _to_bits(bit_string):
        # "0101..." -> [0, 1, 0, 1, ...]
        return [int(ch) for ch in bit_string]

    results = {}

    for fp_name in list(frag_FP_list[0].keys()):

        parent_FP = np.array(_to_bits(r[fp_name]))
        frag_FP = union_FP([_to_bits(frag[fp_name]) for frag in frag_FP_list])

        similarity = jaccard(parent_FP, frag_FP)

        entry = {}
        entry["p"] = parent_FP
        entry["frag"] = frag_FP
        entry["sim"] = similarity
        entry["p_sum_1"] = sum(parent_FP)
        entry["frag_sum_1"] = sum(frag_FP)
        results[fp_name] = entry

    return results

Set the input and output paths

In [59]:
# Directory the results pickle is written to.
# Override with the MS_FRAG_OUTPUT_DIR environment variable.
output_folder = os.environ.get("MS_FRAG_OUTPUT_DIR", "./results")

# Directory containing one pickle per spectrum (with fragment annotations).
# The default is a machine-specific absolute path; set MS_FRAG_DATA_DIR to
# run the notebook elsewhere without editing this cell.
folder = os.environ.get(
    "MS_FRAG_DATA_DIR",
    "/data/rbg/users/klingmin/projects/MS_processing/data/nist2023/MH_plus_no_rings_w_frags",
)

Compute the parent–fragment fingerprint similarity for every file

In [55]:
# One similarity record per input file that has at least one non-empty
# fragment entry.
all_similarity_scores = []

# os.listdir returns entries in arbitrary, filesystem-dependent order; sorting
# makes the order of `all_similarity_scores` reproducible between runs.
for filename in tqdm(sorted(os.listdir(folder))):

    data = load_pickle(os.path.join(folder, filename))

    # Fingerprint the first candidate of every non-empty fragment entry.
    # NOTE(review): assumes element 2 of a candidate is its SMILES string
    # (it is passed straight to get_info) -- confirm against the data schema.
    all_frags_FP = [
        get_info(candidates[0][2])
        for candidates in data["frags"].values()
        if len(candidates) > 0
    ]

    # No fragments to compare against the parent for this file.
    if len(all_frags_FP) == 0:
        continue

    # `data` doubles as the parent record: compare_parent_similarity reads the
    # parent fingerprints from it under the same keys get_info produces.
    all_similarity_scores.append(compare_parent_similarity(data, all_frags_FP))

100%|██████████| 36282/36282 [35:16<00:00, 17.14it/s] 


In [60]:
# Create the output directory first so a missing folder cannot discard the
# results of the long-running loop when the file is written.
os.makedirs(output_folder, exist_ok=True)
pickle_data(all_similarity_scores, os.path.join(output_folder, "NIST2023_frag_similarity.pkl"))

Cluster the data (placeholder: k-means demo on synthetic blobs)

In [None]:

# Placeholder k-means demo: this cell clusters synthetic blobs, not the
# similarity scores computed earlier in this notebook.
# NOTE(review): make_blobs, KMeans and plt are not imported in the notebook's
# import cell; they presumably come from sklearn.datasets, sklearn.cluster and
# matplotlib.pyplot -- add those imports before running on a fresh kernel.

# 1. Generate synthetic 2D data (e.g., 300 points in 4 clusters)
X, _ = make_blobs(n_samples=300, centers=4, random_state=42)

# 2. Create and fit the k-means model (fixed random_state for repeatable runs)
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)

# 3. Retrieve the cluster labels and the cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

# 4. Plot the clustered data: points coloured by label, centers as red stars
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.6)
plt.scatter(centers[:, 0], centers[:, 1], 
            c='red', marker='*', s=200, label='Cluster Centers')
plt.title("K-Means Clustering")
plt.legend()
plt.show()