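This notebook analyzes a SwissDock PDB file that contains all of the docked poses.
For each pose it extracts the ΔG (deltaG), the original cluster ID and the rank, then groups the ligand poses into distance-based clusters and reports the lowest-ΔG pose in each group.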

In [3]:
import sys
import re
import numpy as np
from collections import defaultdict
from sklearn.cluster import KMeans

In [None]:
def parse_pdb(pdb_file):
    """Extract ligand poses with their ΔG, cluster ID, and rank from a PDB file.

    The file is read line by line:

    * ``REMARK`` lines containing ``deltaG:``, ``Cluster:`` or ``Rank:`` set the
      metadata of the pose being read; the value is the last whitespace-separated
      token on the line.
    * ``ATOM`` / ``HETATM`` lines are collected verbatim as the pose's atoms.
    * A ``TER`` line closes the pose. A pose that is still open when the file
      ends is closed as well, so a missing final ``TER`` does not lose data.

    Poses for which no ``deltaG:`` remark was seen are discarded.

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        list: One ``(atom_lines, delta_g, cluster, rank)`` tuple per pose, in
        file order. ``atom_lines`` is a list of raw PDB lines, ``delta_g`` a
        float, and ``cluster`` / ``rank`` are ints or ``None`` when the
        corresponding remark was absent.

    Raises:
        ValueError: If a matched REMARK value cannot be parsed as a number.
        OSError: If the file cannot be opened.
    """
    ligands = []  # Stored as (atom_lines, delta_g, cluster, rank)

    current_ligand = []
    delta_g = None
    cluster = None
    rank = None

    with open(pdb_file, 'r') as file:
        for line in file:
            if line.startswith('REMARK'):
                # Order matters: a line is tested for 'Cluster:' before 'Rank:'.
                if 'deltaG:' in line:
                    delta_g = float(line.split()[-1])
                elif 'Cluster:' in line:
                    cluster = int(line.split()[-1])
                elif 'Rank:' in line:
                    rank = int(line.split()[-1])
            elif line.startswith('ATOM') or line.startswith('HETATM'):
                current_ligand.append(line)
            elif line.startswith('TER') and current_ligand:
                if delta_g is not None:
                    ligands.append((current_ligand, delta_g, cluster, rank))
                # Start a fresh pose; metadata must not leak into the next one.
                current_ligand = []
                delta_g = None
                cluster = None
                rank = None

    # A last pose that is not terminated by TER would otherwise be dropped.
    if current_ligand and delta_g is not None:
        ligands.append((current_ligand, delta_g, cluster, rank))

    return ligands

def get_centroid(ligand):
    """Return the mean (x, y, z) position over all atom records of a ligand.

    Each record is a fixed-width text line; x, y and z are parsed from the
    character slices 30:38, 38:46 and 46:54 respectively.

    Args:
        ligand: Iterable of atom record lines.

    Returns:
        numpy.ndarray: The per-axis average of the parsed coordinates.
    """
    xyz_rows = [
        (float(record[30:38]), float(record[38:46]), float(record[46:54]))
        for record in ligand
    ]
    return np.mean(xyz_rows, axis=0)

def group_by_distance(ligands, num_clusters=5):
    """Cluster ligand poses into ``num_clusters`` groups by centroid position.

    The centroid of every pose is computed with ``get_centroid`` and the
    centroids are clustered with K-Means (fixed ``random_state=42``,
    ``n_init=10``).

    Args:
        ligands: Sequence of ``(ligand, delta_g, cluster, rank)`` tuples.
        num_clusters: Number of K-Means clusters to fit.

    Returns:
        tuple: ``(groups, centers)`` where ``groups`` maps each K-Means label
        to the list of pose tuples assigned to it, and ``centers`` is the
        fitted model's ``cluster_centers_`` array.
    """
    pose_centroids = np.array(
        [get_centroid(atom_lines) for atom_lines, _dg, _cl, _rk in ligands]
    )

    # Fit K-Means and obtain one label per pose, in input order
    model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    assignments = model.fit_predict(pose_centroids)

    # Bucket the poses under their assigned label
    buckets = defaultdict(list)
    for assigned_label, pose in zip(assignments, ligands):
        atom_lines, delta_g, cluster, rank = pose
        buckets[assigned_label].append((atom_lines, delta_g, cluster, rank))

    return buckets, model.cluster_centers_

# Dump every group and its member poses (deltaG, original cluster, rank, centroid) to a text file
def write_ligands_to_file(grouped_ligands, cluster_centers, output_file):
    """Write a per-cluster report of all ligand poses to ``output_file``.

    For each cluster label (in sorted order) the report lists the cluster
    centroid followed by one line per pose with its ΔG, original cluster,
    rank and pose centroid; clusters are separated by a blank line.

    Args:
        grouped_ligands: Mapping of cluster label to a list of
            ``(ligand, delta_g, orig_cluster, rank)`` tuples.
        cluster_centers: Object indexable by cluster label, giving the
            cluster centroid.
        output_file: Path of the file to (over)write.
    """
    with open(output_file, 'w') as report:
        for label, members in sorted(grouped_ligands.items()):
            print(f"Cluster {label}:", file=report)
            print(f"  Cluster Centroid: {cluster_centers[label]}", file=report)

            position = 0
            for atom_lines, delta_g, orig_cluster, rank in members:
                position += 1
                pose_centroid = get_centroid(atom_lines)
                print(
                    f"  Ligand {position}: deltaG = {delta_g:.6f}, Original Cluster = {orig_cluster}, Rank = {rank}, Centroid = {pose_centroid}",
                    file=report,
                )
            # Blank line between clusters
            print(file=report)

In [14]:
# Defaults reproduce the paths originally hard-coded in main(); override them
# by passing arguments instead of editing the function body.
DEFAULT_PDB_FILE = "/Users/stuytschaevers/Desktop/Thesis/atp_mg2+/swissDock/clusters.dock4.pdb"
DEFAULT_OUTPUT_DIR = "/Users/stuytschaevers/Desktop/Thesis/atp_mg2+/swissDock/final_files/"


def main(pdb_file=DEFAULT_PDB_FILE, output_dir=DEFAULT_OUTPUT_DIR, num_clusters=5):
    """Parse the docking poses, cluster them by position, and report the results.

    Prints, for every distance-based cluster, its centroid and the pose with
    the lowest ΔG, then writes two files into ``output_dir``:

    * ``amount_of_ligands_in_clusters.csv`` - number of poses per cluster.
    * ``ligands_grouped_by_distance.csv`` - full per-cluster listing produced
      by ``write_ligands_to_file``.

    The previous version required a command-line argument via ``sys.argv`` but
    never used it; inside a notebook that check only passed because of the
    kernel's own arguments. The inputs are now ordinary parameters.

    Args:
        pdb_file: Path of the PDB file containing the docked poses.
        output_dir: Directory for the output files; created if missing.
        num_clusters: Number of distance-based clusters to form.

    Raises:
        ValueError: If fewer poses than ``num_clusters`` were parsed.
    """
    import os  # local import: keeps this cell runnable on its own

    ligands = parse_pdb(pdb_file)
    if len(ligands) < num_clusters:
        raise ValueError(
            f"Found {len(ligands)} ligand pose(s) in {pdb_file!r}; "
            f"at least {num_clusters} are required to form {num_clusters} clusters."
        )

    os.makedirs(output_dir, exist_ok=True)

    # Group the poses into distance-based clusters
    grouped_ligands, cluster_centers = group_by_distance(ligands, num_clusters=num_clusters)

    # Print a summary of each cluster
    for cluster, ligands_in_cluster in sorted(grouped_ligands.items()):
        print(f"Cluster {cluster}:")
        cluster_centroid = cluster_centers[cluster]
        print(f"  Cluster Centroid: {cluster_centroid}")

        # Pose with the lowest ΔG in this cluster (element 1 of each tuple is ΔG)
        ligand, delta_g, orig_cluster, rank = min(ligands_in_cluster, key=lambda x: x[1])
        centroid = get_centroid(ligand)
        print(f"  Ligand with the lowest ΔG: ΔG = {delta_g:.6f}, Original Cluster = {orig_cluster}, Rank = {rank}, Centroid = {centroid}")

        print()

    # Save the number of poses in each cluster as CSV.
    # os.path.join works whether or not output_dir ends with a separator.
    amount_of_ligands_file = os.path.join(output_dir, "amount_of_ligands_in_clusters.csv")
    with open(amount_of_ligands_file, "w") as f:
        f.write("Cluster,Number of Ligands\n")  # CSV header
        for cluster, ligands_in_cluster in sorted(grouped_ligands.items()):
            f.write(f"{cluster},{len(ligands_in_cluster)}\n")

    # Write all poses and their details to a file
    output_file = os.path.join(output_dir, "ligands_grouped_by_distance.csv")
    write_ligands_to_file(grouped_ligands, cluster_centers, output_file)


if __name__ == "__main__":
    main()

Cluster 0:
  Cluster Centroid: [106.88643568 105.73441424 121.75448074]
  Ligand with the lowest ΔG: ΔG = -11.872108, Original Cluster = 29, Rank = 1, Centroid = [106.81913953 106.30509302 122.43097674]

Cluster 1:
  Cluster Centroid: [ 69.21029111  99.1743451  147.96309109]
  Ligand with the lowest ΔG: ΔG = -16.188068, Original Cluster = 5, Rank = 3, Centroid = [ 71.35706977 100.393      148.83476744]

Cluster 2:
  Cluster Centroid: [ 85.59030814 101.48394186 157.51561919]
  Ligand with the lowest ΔG: ΔG = -13.669934, Original Cluster = 2, Rank = 0, Centroid = [ 85.61418605 101.45216279 157.66155814]

Cluster 3:
  Cluster Centroid: [127.47181395 101.78480814 141.91376744]
  Ligand with the lowest ΔG: ΔG = -10.120633, Original Cluster = 28, Rank = 0, Centroid = [127.53923256 101.80067442 142.05893023]

Cluster 4:
  Cluster Centroid: [105.36911337 121.83590407 118.01249709]
  Ligand with the lowest ΔG: ΔG = -14.122814, Original Cluster = 19, Rank = 1, Centroid = [105.37246512 121.836255

In [11]:
import mdtraj as md
import numpy as np

def calculate_centroid(pdb_file, ligands):
    """Return the centroid of all atoms belonging to the named ligand residues.

    The structure is loaded with ``mdtraj`` and every atom of every residue
    whose name is in ``ligands`` contributes to the mean. Only the first frame
    of the loaded trajectory is used.

    NOTE(review): per the mdtraj documentation, ``Trajectory.xyz`` is expressed
    in nanometers, so the returned centroid is 10x smaller than the Ångström
    coordinates written in the PDB file. Multiply by 10 before comparing with
    centroids computed directly from PDB text.

    Args:
        pdb_file: Path of the PDB file to load.
        ligands: Container of residue names to treat as ligand (e.g. ``["LIG"]``).

    Returns:
        numpy.ndarray: Mean x, y, z over the selected atoms.

    Raises:
        ValueError: If no residue in the file matches ``ligands``; previously
            this silently produced a NaN centroid.
    """
    # Load the PDB file
    traj = md.load(pdb_file)

    # Atom indices of every residue whose name marks it as a ligand
    ligand_indices = [
        atom.index
        for res in traj.topology.residues
        if res.name in ligands
        for atom in res.atoms
    ]
    if not ligand_indices:
        raise ValueError(
            f"No residues named {list(ligands)!r} were found in {pdb_file!r}"
        )

    # Coordinates of the selected atoms, first frame only
    ligand_coords = traj.xyz[0, ligand_indices, :]

    # Centroid = mean over atoms
    return np.mean(ligand_coords, axis=0)

def main():
    """Compute and print the centroid of the ``LIG`` residues in the final PDB file."""
    # Replace with your actual PDB file path
    structure_path = "/Users/stuytschaevers/Desktop/Thesis/atp_mg2+/swissDock/clusters_final/all_clusters_FINAL.pdb"
    # Residue names that identify the ligands of interest
    residue_names = ["LIG"]

    ligand_centroid = calculate_centroid(structure_path, residue_names)
    print(f"The centroid of the ligands is: {ligand_centroid}")


if __name__ == "__main__":
    main()


The centroid of the ligands is: [ 9.934042 10.635742 13.779993]


In [7]:
# Print version of sklearn
# `from sklearn.cluster import KMeans` does not bind the name `sklearn`, so the
# package is imported here explicitly; without this the cell raises NameError
# on a fresh kernel.
import sklearn

print(sklearn.__version__)


1.5.1
