# In [None]:
# Advanced Chemical Clustering with Feature Engineering and Evaluation

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, MACCSkeys, RDKFingerprint
from rdkit.Avalon import pyAvalonTools
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
import plotly.express as px

# Function to compute fingerprints
def compute_fingerprints(molecules, fingerprint_type):
    """Build one fingerprint per molecule using the named RDKit scheme.

    Args:
        molecules: Iterable of RDKit molecule objects.
        fingerprint_type: Name of the scheme. Recognised names are
            'morgan', 'ecfp', 'maccs', 'rdkit', 'atom_pair', 'daylight',
            'chemical_hashed', 'fcfp', 'topological', 'pharmacophore'
            and 'substructure'.

    Returns:
        A list holding one fingerprint per molecule, in input order.

    Raises:
        ValueError: If ``fingerprint_type`` is not a recognised name.
    """
    def _morgan_bits(mol):
        # Radius-2 Morgan bit vector folded to 1024 bits.
        return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)

    # Every builder is a small callable, so no RDKit attribute is resolved
    # until a molecule is actually processed.
    builders = {
        # 'morgan' and 'ecfp' are two names for the same fingerprint here.
        'morgan': _morgan_bits,
        'ecfp': _morgan_bits,
        'maccs': lambda mol: MACCSkeys.GenMACCSKeys(mol),
        'rdkit': lambda mol: RDKFingerprint(mol),
        'atom_pair': lambda mol: AllChem.GetHashedAtomPairFingerprintAsBitVect(mol),
        # NOTE: 'daylight' is served by the Avalon fingerprint.
        'daylight': lambda mol: pyAvalonTools.GetAvalonFP(mol),
        'chemical_hashed': lambda mol: AllChem.RDKFingerprint(mol, fpSize=1024),
        'fcfp': lambda mol: AllChem.GetMorganFingerprintAsBitVect(
            mol, 2, nBits=1024, useFeatures=True
        ),
        'topological': lambda mol: Chem.RDKFingerprint(mol),
        # NOTE: 'pharmacophore' is served by the hashed topological-torsion
        # fingerprint.
        'pharmacophore': lambda mol: AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol),
        'substructure': lambda mol: AllChem.GetMACCSKeysFingerprint(mol),
    }

    try:
        build = builders[fingerprint_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which must also be reported
        # as an unknown type.
        raise ValueError(f"Unknown fingerprint type: {fingerprint_type}") from None

    return [build(mol) for mol in molecules]

# Function to perform clustering and save results
def cluster_and_save_results(fingerprint_type, molecules, df_smiles, distance_threshold=1.0):
    """Cluster molecules on Tanimoto distance, save the labelled table, and plot.

    The fingerprints are turned into a pairwise Jaccard (1 - Tanimoto)
    distance matrix, clustered with average-linkage agglomerative
    clustering, written to a timestamped CSV sorted by cluster size
    (largest first), scored with the silhouette coefficient when that is
    defined, and finally shown as an interactive t-SNE scatter plot.

    Args:
        fingerprint_type: Fingerprint name understood by
            ``compute_fingerprints``; also used in the output file name
            and the plot title.
        molecules: RDKit molecules, one per row of ``df_smiles`` and in
            the same order.
        df_smiles: DataFrame describing the molecules. A 'Cluster' column
            is added to this object in place; pass a copy to keep the
            original untouched.
        distance_threshold: Linkage distance at or above which clusters
            are not merged (scikit-learn semantics). Jaccard distances lie
            in [0, 1], so the default of 1.0 merges every pair of groups
            whose average distance is below 1.0; lower it for finer
            clusters.

    Raises:
        ValueError: If fewer than two molecules are supplied, or if
            ``fingerprint_type`` is unknown.
    """
    # Compute fingerprints
    fingerprints = compute_fingerprints(molecules, fingerprint_type)
    n_samples = len(fingerprints)
    if n_samples < 2:
        raise ValueError(
            f'Need at least 2 molecules to cluster, got {n_samples}'
        )

    # Convert fingerprints to numpy array
    fingerprint_array = np.array([np.array(fp) for fp in fingerprints])

    # Compute pairwise distances (1 - Tanimoto similarity). Jaccard is a
    # boolean metric, so cast explicitly instead of relying on the implicit
    # conversion (which emits a warning).
    distance_matrix = pairwise_distances(fingerprint_array.astype(bool), metric='jaccard')

    # Perform hierarchical clustering with a distance threshold. The matrix
    # already holds distances, so the estimator must be told it is
    # precomputed; otherwise each row is treated as a feature vector and
    # the clustering runs on Euclidean distance between rows.
    clusterer_kwargs = dict(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage='average',
    )
    try:
        clustering = AgglomerativeClustering(metric='precomputed', **clusterer_kwargs)
    except TypeError:
        # Older scikit-learn releases (before 1.2) call this parameter `affinity`.
        clustering = AgglomerativeClustering(affinity='precomputed', **clusterer_kwargs)
    cluster_labels = clustering.fit_predict(distance_matrix)

    # Add cluster labels to the DataFrame
    df_smiles['Cluster'] = cluster_labels

    # Sort DataFrame based on cluster size (largest cluster first)
    cluster_counts = Counter(cluster_labels)
    sorted_cluster_labels = [cluster for cluster, _ in cluster_counts.most_common()]

    df_smiles['Cluster'] = pd.Categorical(df_smiles['Cluster'], categories=sorted_cluster_labels, ordered=True)
    df_smiles = df_smiles.sort_values('Cluster')

    # Save the DataFrame to a CSV file
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_filename = f'clustered_chemical_data_{fingerprint_type}_{timestamp}.csv'
    df_smiles.to_csv(output_filename, index=False)
    print(f'Data saved to {output_filename}')

    # Calculate silhouette score. It is only defined when the number of
    # clusters is between 2 and n_samples - 1; outside that range
    # scikit-learn raises, which would abort the run before the plot.
    n_clusters = len(cluster_counts)
    if 2 <= n_clusters <= n_samples - 1:
        silhouette_avg = silhouette_score(distance_matrix, cluster_labels, metric='precomputed')
        print(f'Silhouette Score for {fingerprint_type} fingerprints: {silhouette_avg:.2f}')
    else:
        print(
            f'Silhouette Score for {fingerprint_type} fingerprints: undefined '
            f'({n_clusters} cluster(s) for {n_samples} molecules)'
        )

    # Optional - Interactive visualization with Plotly.
    # t-SNE requires perplexity < n_samples; cap the default of 30 so small
    # data sets still produce a plot.
    perplexity = min(30.0, float(n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(fingerprint_array)

    fig = px.scatter(tsne_results, x=0, y=1, color=cluster_labels, title=f'Chemical Clustering - {fingerprint_type.capitalize()} Fingerprints')
    fig.show()

# Step 1: Load the CSV file
df_smiles = pd.read_csv('path_to_your_file.csv')  # Update the path to your file

# Ensure that the column name matches the one in your CSV
smiles_data = df_smiles['SMILES'].tolist()

# Step 2: Convert SMILES to RDKit molecules.
# Blank cells are read as NaN (a float), and MolFromSmiles returns None for
# strings it cannot parse; both are recorded as None here and removed below.
molecules = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in smiles_data
]

# Drop unusable rows from the table and the molecule list together so the
# two stay aligned row-for-row; a None molecule would otherwise crash every
# fingerprint calculation.
valid_mask = [mol is not None for mol in molecules]
n_invalid = len(valid_mask) - sum(valid_mask)
if n_invalid:
    print(f'Skipping {n_invalid} row(s) with missing or unparseable SMILES')
    df_smiles = df_smiles.loc[valid_mask].reset_index(drop=True)
    molecules = [mol for mol in molecules if mol is not None]

# Step 3: List of fingerprint types
fingerprint_types = ['morgan', 'maccs', 'rdkit', 'atom_pair', 'daylight', 'ecfp', 'chemical_hashed', 'fcfp', 'topological', 'pharmacophore', 'substructure']

# Step 4: Cluster and save results for each fingerprint type.
# A copy is passed because the clustering function adds a column in place.
for fingerprint_type in fingerprint_types:
    cluster_and_save_results(fingerprint_type, molecules, df_smiles.copy())
