In [13]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.ML.Cluster import Butina
import os

def cluster_molecules(fps, cutoff=0.2):
    """Cluster fingerprints with the Butina algorithm on Tanimoto distances.

    Args:
        fps: Indexable, sliceable sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        cutoff: Distance threshold forwarded unchanged to
            ``Butina.ClusterData`` (distance = 1 - Tanimoto similarity).

    Returns:
        The value returned by ``Butina.ClusterData``; callers in this file
        iterate it as a sequence of clusters, each a sequence of indices
        into ``fps``.
    """
    count = len(fps)
    # Flattened lower-triangle distance list: for every fingerprint i >= 1,
    # the distances to fingerprints 0 .. i-1, in that order.
    distances = [
        1 - similarity
        for i in range(1, count)
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    ]
    return Butina.ClusterData(distances, count, cutoff, isDistData=True)

def get_unique_filename(filename):
    """Return a path that does not currently exist on disk.

    If ``filename`` is free it is returned unchanged. Otherwise ``_1``,
    ``_2``, ... is inserted between the base name and the extension until
    a path that does not exist is found.

    Args:
        filename: Desired output path.

    Returns:
        ``filename`` itself, or the first ``{base}_{n}{extension}`` variant
        (n starting at 1) for which ``os.path.exists`` is false.
    """
    stem, suffix = os.path.splitext(filename)
    candidate, attempt = filename, 0
    # Probe the plain name first, then numbered variants in ascending order.
    while os.path.exists(candidate):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"
    return candidate

# Read SMILES from CSV
input_file = 'pgk2.csv'
df = pd.read_csv(input_file)
smiles_list = df['SMILES'].tolist()

# Convert SMILES to RDKit molecule objects. Chem.MolFromSmiles returns None
# for strings it cannot parse, and empty CSV cells arrive as NaN (a float),
# so anything that is not a string is mapped to None instead of being parsed.
molecules = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in smiles_list
]

# Row positions that produced a usable molecule; only these are fingerprinted
# and clustered, so a single bad row no longer aborts the whole run.
valid_indices = [i for i, mol in enumerate(molecules) if mol is not None]
num_invalid = len(molecules) - len(valid_indices)
if num_invalid:
    print(
        f'Warning: {num_invalid} row(s) with missing or unparsable SMILES '
        f'were not clustered (Cluster = -1)'
    )

# Morgan fingerprints (radius 2) for the valid molecules only
fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(molecules[i], 2) for i in valid_indices
]

# Minimum Tanimoto similarity for two molecules to count as neighbours
similarity_threshold = 0.3

# Butina works on distances, so the similarity threshold is converted to a
# distance cutoff (1 - similarity). Skip clustering when nothing is valid.
if fingerprints:
    clusters = cluster_molecules(fingerprints, cutoff=1 - similarity_threshold)
else:
    clusters = ()

# Add cluster information to the dataframe. Cluster members are positions in
# `fingerprints`, so they are mapped back to the original row positions.
# Rows that could not be clustered keep the sentinel label -1.
cluster_labels = [-1] * len(molecules)
for idx, cluster in enumerate(clusters):
    for member in cluster:
        cluster_labels[valid_indices[member]] = idx

df['Cluster'] = cluster_labels

# Write output to CSV with a unique filename if it already exists
output_file = 'output_clusters.csv'
output_file = get_unique_filename(output_file)
df.to_csv(output_file, index=False)

print(f'Clustering completed. Results saved to {output_file}')


Clustering completed. Results saved to output_clusters_1.csv
