In [15]:
import numpy as np
from Bio import PDB

def get_protein_structure(pdb_file):
    """
    Load a PDB file and collect the atoms belonging to standard residues.

    The file is parsed with Bio.PDB's PDBParser (quiet, permissive mode) under
    the internal structure id 'protein_obj'. A residue is kept only when the
    first field of its id (the hetero flag) is a single blank; residues with
    any other flag are skipped.

    Note that every model in the structure is traversed, so a multi-model
    file contributes the atoms of each model to the returned list.

    Parameters
    ----------
    pdb_file : path or file handle accepted by PDBParser.get_structure

    Returns
    -------
    tuple
        (structure, protein_atoms): the parsed structure object and a list
        of the selected atom objects, in traversal order.
    """
    structure = PDB.PDBParser(QUIET=True, PERMISSIVE=True).get_structure(
        'protein_obj', pdb_file
    )

    # Flattened model -> chain -> residue -> atom walk; the hetero-flag test
    # sits at residue level so rejected residues are never iterated.
    protein_atoms = [
        atom
        for model in structure
        for chain in model
        for residue in chain
        if residue.id[0] == ' '
        for atom in residue
    ]

    print(f"Success: Found {len(protein_atoms)} protein atoms.")
    return structure, protein_atoms

# Test: load the example structure and print a few atoms as a sanity check.
# `file_path`, `struct` and `atoms` become notebook-level variables.
file_path = "1H8D.pdb"  # relative path, resolved against the working directory

try:
    struct, atoms = get_protein_structure(file_path)
    
    # Inspect the first 5 atoms to verify
    print("\nCoordination check (First 5 atoms):")
    for atom in atoms[:5]:
        # The parent of an atom is the residue it belongs to
        res = atom.get_parent()
        print(f"Residue: {res.get_resname()} | Atom: {atom.get_name()} | XYZ: {atom.get_coord()}")

# A missing file is reported instead of raised; in that case `struct` and
# `atoms` are never assigned by this cell.
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found.")

Success: Found 2333 protein atoms.

Coordination check (First 5 atoms):
Residue: ILE | Atom: N | XYZ: [ 5.169 -8.919 17.688]
Residue: ILE | Atom: CA | XYZ: [ 4.5   -8.595 18.976]
Residue: ILE | Atom: C | XYZ: [ 3.397 -9.622 19.265]
Residue: ILE | Atom: O | XYZ: [ 2.507 -9.808 18.423]
Residue: ILE | Atom: CB | XYZ: [ 3.896 -7.17  18.971]


We use Biopython.
The parser acts as a translator:
it reads the PDB file and keeps only the protein atoms, skipping the ligand and any water molecules. If it worked correctly, it prints the success message.

In [16]:
def create_grid(atoms, spacing=1.0, padding=5.0):
    """
    Creates a 3D grid of points around the protein structure.

    Parameters
    ----------
    atoms : sequence of objects exposing ``get_coord()``
        Each ``get_coord()`` result is used as one (x, y, z) coordinate row.
    spacing : float, default 1.0
        Step between neighbouring grid points along every axis. Must be > 0.
    padding : float, default 5.0
        Margin added on every side of the atoms' bounding box before the
        box is sampled (same units as the coordinates).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_points, 3), one row per grid point.

    Raises
    ------
    ValueError
        If ``spacing`` is not positive or ``atoms`` is empty.
    """
    # A zero step makes np.arange fail with ZeroDivisionError and a negative
    # one silently yields an empty grid, so reject both up front.
    if spacing <= 0:
        raise ValueError(f"spacing must be positive, got {spacing!r}")

    # Extract all coordinates as a numpy array for faster processing
    coords = np.array([atom.get_coord() for atom in atoms])
    if coords.size == 0:
        # Without this check the min/max reductions below fail with an
        # unhelpful "zero-size array" message.
        raise ValueError("cannot build a grid: no atoms were supplied")

    # Bounding box of the atoms, widened by `padding` on every side
    min_coords = coords.min(axis=0) - padding
    max_coords = coords.max(axis=0) + padding

    # np.arange excludes the stop value, so the last sample on each axis can
    # fall up to one `spacing` short of the padded maximum.
    x_range = np.arange(min_coords[0], max_coords[0], spacing)
    y_range = np.arange(min_coords[1], max_coords[1], spacing)
    z_range = np.arange(min_coords[2], max_coords[2], spacing)

    # Stack the three coordinate volumes and flatten them to (n_points, 3)
    grid = np.array(np.meshgrid(x_range, y_range, z_range)).T.reshape(-1, 3)

    print(f"Grid created with {len(grid)} points.")
    return grid

Next, we place a box filled with points over the protein. Neighbouring points are 1 Å apart.



In [17]:
def create_search_grid(protein_atoms, spacing=2.0, padding=5.0):
    """
    Creates a 3D grid around the protein.
    Spacing of 2.0A is good for testing (fast).
    Use 1.0A for the final high-quality prediction.

    Parameters
    ----------
    protein_atoms : sequence of objects exposing ``get_coord()``
        Each ``get_coord()`` result is used as one (x, y, z) coordinate row.
    spacing : float, default 2.0
        Step between neighbouring grid points along every axis. Must be > 0.
    padding : float, default 5.0
        Margin added on every side of the atoms' bounding box before the
        box is sampled (same units as the coordinates).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_points, 3), one row per grid point.

    Raises
    ------
    ValueError
        If ``spacing`` is not positive or ``protein_atoms`` is empty.
    """
    # A zero step makes np.arange fail with ZeroDivisionError and a negative
    # one silently yields an empty grid, so reject both up front.
    if spacing <= 0:
        raise ValueError(f"spacing must be positive, got {spacing!r}")

    # Convert all atom coordinates to a single numpy array
    coords = np.array([atom.get_coord() for atom in protein_atoms])
    if coords.size == 0:
        # Without this check the min/max reductions below fail with an
        # unhelpful "zero-size array" message.
        raise ValueError("cannot build a grid: no atoms were supplied")

    # Bounding box of the protein, widened by `padding` on every side
    min_coords = coords.min(axis=0) - padding
    max_coords = coords.max(axis=0) + padding

    # np.arange excludes the stop value, so the last sample on each axis can
    # fall up to one `spacing` short of the padded maximum.
    x = np.arange(min_coords[0], max_coords[0], spacing)
    y = np.arange(min_coords[1], max_coords[1], spacing)
    z = np.arange(min_coords[2], max_coords[2], spacing)

    # Stack the three coordinate volumes and flatten them to (n_points, 3)
    grid = np.array(np.meshgrid(x, y, z)).T.reshape(-1, 3)

    print(f"Grid generated: {grid.shape[0]} points.")
    return grid

# Test the grid generation
# `atoms` is only assigned when the loading cell parsed the file without a
# FileNotFoundError, so the call is skipped when that name is missing.
if 'atoms' in locals():
    my_grid = create_search_grid(atoms, spacing=2.0)

Grid generated: 26796 points.


In [18]:
from scipy.spatial import KDTree
import numpy as np
import time

def find_pocket_points(grid_points, protein_atoms, min_dist=2.5, max_dist=5.0):
    """
    Filters grid points to find potential binding pockets.

    A grid point is kept when the distance d to its nearest atom satisfies
    ``min_dist < d <= max_dist``:
    - Points too close to atoms (<= min_dist) are removed (collision).
    - Points too far from any atom (> max_dist) are removed (empty space).

    The filter is purely distance based: it selects a shell around the
    atoms and does not test how enclosed a point is.

    Parameters
    ----------
    grid_points : array-like of shape (n_points, 3)
    protein_atoms : sequence of objects exposing ``get_coord()``
    min_dist : float, default 2.5
    max_dist : float, default 5.0

    Returns
    -------
    numpy.ndarray
        The retained rows of ``grid_points``, shape (n_kept, 3). An empty
        grid, or a grid in which nothing survives, gives shape (0, 3).

    Raises
    ------
    ValueError
        If ``protein_atoms`` is empty.
    """
    grid_points = np.asarray(grid_points)

    # Extract coordinates from the atom objects
    atom_coords = np.array([atom.get_coord() for atom in protein_atoms])
    if atom_coords.size == 0:
        raise ValueError("cannot filter grid points: no protein atoms were supplied")

    if len(grid_points) == 0:
        # Nothing to query; return a well-shaped empty result.
        pocket_points = grid_points.reshape(0, 3)
    else:
        # One nearest-neighbour lookup answers both questions ("is any atom
        # within min_dist?" and "is any atom within max_dist?"). The
        # comparison always produces a boolean mask, so indexing stays valid
        # even when no point survives.
        tree = KDTree(atom_coords)
        nearest_dist, _ = tree.query(grid_points, k=1)
        keep_mask = (nearest_dist > min_dist) & (nearest_dist <= max_dist)
        pocket_points = grid_points[keep_mask]

    print(f"Filtered {len(grid_points)} grid points down to {len(pocket_points)} pocket candidates.")
    return pocket_points

# --- TEST STRATEGY ---

def test_pocket_filtering(atoms):
    """
    Smoke-test the grid + filter pipeline and print a short report.

    Builds a 2.0-spaced search grid around `atoms`, runs find_pocket_points
    on it while timing the call with time.time(), and then reports either
    success (some but not all grid points survived) or an error message.
    Nothing is returned.
    """
    print("Step 1: Creating Grid...")
    # Coarse spacing keeps the check quick
    coarse_grid = create_search_grid(atoms, spacing=2.0)

    print("\nStep 2: Filtering Pocket Points...")
    t_begin = time.time()
    kept = find_pocket_points(coarse_grid, atoms)
    elapsed = time.time() - t_begin

    # Check 1: performance
    print(f"Filtering took: {elapsed:.4f} seconds.")

    # Check 2: the filter must remove something but not everything
    n_kept, n_total = len(kept), len(coarse_grid)
    if not (0 < n_kept < n_total):
        print("ERROR: Something went wrong. Either 0 or all points remained.")
        return

    print("SUCCESS: Points were filtered correctly.")
    print(f"Remaining points: {n_kept} ({n_kept/n_total*100:.1f}%)")

    # Check 3: mean coordinate of the surviving points
    print(f"Pocket Center (Avg XYZ): {np.mean(kept, axis=0)}")

# Run the test
# Skipped when `atoms` was never assigned (the loading cell reports a
# missing PDB file instead of raising, leaving the name undefined).
if 'atoms' in locals():
    test_pocket_filtering(atoms)

Step 1: Creating Grid...
Grid generated: 26796 points.

Step 2: Filtering Pocket Points...
Filtered 26796 grid points down to 4087 pocket candidates.
Filtering took: 0.1135 seconds.
SUCCESS: Points were filtered correctly.
Remaining points: 4087 (15.3%)
Pocket Center (Avg XYZ): [12.98675381 -0.89637085 18.88555075]


Let's visualize the results in Chimera.

In [20]:
def save_points_to_pdb(points, output_file):
    """
    Saves a sequence of 3D coordinates as a PDB file.
    Each point is represented as a dummy 'HETATM' record, followed by a
    final 'END' line.

    Parameters
    ----------
    points : iterable of indexable (x, y, z) triples
        For example a numpy array of shape (n_points, 3).
    output_file : str or path-like
        Destination path; an existing file is overwritten.

    Notes
    -----
    PDB records are fixed-width and the serial number has only five columns.
    Serial numbers therefore wrap around after 99999 (…, 99999, 1, 2, …)
    so that files with more points keep every coordinate in its column.
    """
    max_serial = 99999  # largest value that fits the 5-column serial field

    with open(output_file, 'w') as f:
        for i, point in enumerate(points):
            # Wrap instead of widening the field: a 6-digit serial would shift
            # the coordinate columns one place to the right.
            serial = i % max_serial + 1
            # Format: HETATM, Index, Name, Residue, Chain, ResID, X, Y, Z, Occ, Temp, Element
            f.write(f"HETATM{serial:>5}  P   PTS A   1    {point[0]:>8.3f}{point[1]:>8.3f}{point[2]:>8.3f}  1.00  0.00           N\n")
        f.write("END\n")
    print(f"File saved: {output_file}")

from sklearn.cluster import DBSCAN

def cluster_pocket_points(pocket_points, eps=2.5, min_samples=5):
    """
    Split candidate points into separate pockets with DBSCAN.

    - eps: passed to DBSCAN as its neighbourhood radius.
    - min_samples: passed to DBSCAN as its density threshold.

    Returns a dict mapping each cluster label to the rows of
    `pocket_points` carrying that label. Points labelled -1 (noise) are
    left out. An empty input yields an empty dict without running DBSCAN.
    """
    if len(pocket_points) == 0:
        return {}

    # Fit once and keep only the per-point labels
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(pocket_points).labels_

    # One entry per real cluster; -1 marks noise and is skipped
    pockets = {
        cluster_id: pocket_points[labels == cluster_id]
        for cluster_id in set(labels)
        if cluster_id != -1
    }

    print(f"Success: Found {len(pockets)} distinct potential pockets.")
    return pockets
# --- FULL VISUALIZATION SEQUENCE ---

def export_all_steps(atoms):
    """
    Run the grid -> filter -> cluster sequence and write each stage to disk.

    Files written (relative paths):
    - step1_full_grid.pdb: every point of the 2.0-spaced search grid
    - step2_pocket_candidates.pdb: points kept by find_pocket_points
    - step3_pocket_<id>.pdb: one file per cluster from cluster_pocket_points

    If clustering returns nothing, a message is printed and no step-3 files
    are written. Nothing is returned.
    """
    # Step 1: the full box of grid points
    print("Generating Step 1: Full Grid...")
    box_points = create_search_grid(atoms, spacing=2.0)
    save_points_to_pdb(box_points, "step1_full_grid.pdb")

    # Step 2: points that pass the distance filter
    print("Generating Step 2: Filtered Points...")
    shell_points = find_pocket_points(box_points, atoms)
    save_points_to_pdb(shell_points, "step2_pocket_candidates.pdb")

    # Step 3: group the surviving points into clusters
    print("Generating Step 3: Clustered Pockets...")
    clusters = cluster_pocket_points(shell_points)

    if not clusters:
        print("No clusters found for Step 3.")
        return

    # One output file per cluster
    for cluster_id, cluster_points in clusters.items():
        save_points_to_pdb(cluster_points, f"step3_pocket_{cluster_id}.pdb")

# Execution
# Skipped when `atoms` was never assigned (the loading cell reports a
# missing PDB file instead of raising, leaving the name undefined).
if 'atoms' in locals():
    export_all_steps(atoms)

Generating Step 1: Full Grid...
Grid generated: 26796 points.
File saved: step1_full_grid.pdb
Generating Step 2: Filtered Points...
Filtered 26796 grid points down to 4087 pocket candidates.
File saved: step2_pocket_candidates.pdb
Generating Step 3: Clustered Pockets...
Success: Found 22 distinct potential pockets.
File saved: step3_pocket_0.pdb
File saved: step3_pocket_1.pdb
File saved: step3_pocket_2.pdb
File saved: step3_pocket_3.pdb
File saved: step3_pocket_4.pdb
File saved: step3_pocket_5.pdb
File saved: step3_pocket_6.pdb
File saved: step3_pocket_7.pdb
File saved: step3_pocket_8.pdb
File saved: step3_pocket_9.pdb
File saved: step3_pocket_10.pdb
File saved: step3_pocket_11.pdb
File saved: step3_pocket_12.pdb
File saved: step3_pocket_13.pdb
File saved: step3_pocket_14.pdb
File saved: step3_pocket_15.pdb
File saved: step3_pocket_16.pdb
File saved: step3_pocket_17.pdb
File saved: step3_pocket_18.pdb
File saved: step3_pocket_19.pdb
File saved: step3_pocket_20.pdb
File saved: step3_poc