In [1]:
import os
import requests
import json
from pathlib import Path
from Bio.PDB import PDBList, PDBParser

# Names of the analysis tasks; each one gets its own folder and its own
# download counter, in this order.
_TASK_NAMES = (
    "homologous_pairs",
    "mutants",
    "single_domain",
    "multi_domain",
    "disentanglement",
)

# Root folder under which every downloaded PDB file is stored
BASE_DIR = Path("pdb_files")
BASE_DIR.mkdir(exist_ok=True)

# One subfolder per analysis task, created up front
TASK_DIRS = {name: BASE_DIR / name for name in _TASK_NAMES}
for task_dir in TASK_DIRS.values():
    task_dir.mkdir(exist_ok=True)

# Proteins the script searches for
TARGET_PROTEINS = [
    "lysozyme",
    "GFP",
    "beta-lactamase",
    "HIV protease",
    "hemoglobin",
    "NRAS",
]

# Known PDB IDs per protein, used when a search returns nothing
# (homologous pairs and disentanglement tasks)
FALLBACK_PDBS = {
    "lysozyme": ["2LZM", "1L63"],
    "GFP": ["1GFL", "1Q4A"],
    "beta-lactamase": ["1XPB", "1ZG4"],
    "HIV protease": ["1HXB", "1HVR"],
    "hemoglobin": ["1A3N", "1HHO"],
    "NRAS": ["1AA9", "1K8R"],
}

# RCSB endpoints: search service and file download base URL
SEARCH_API = "https://search.rcsb.org/rcsbsearch/v2/query"
DOWNLOAD_URL = "https://files.rcsb.org/download"

# Per-task cap on files, and the running per-task tally checked against it
MAX_FILES_PER_TASK = 200
DOWNLOAD_COUNTS = dict.fromkeys(_TASK_NAMES, 0)

def search_pdb(query, return_type="entry", timeout=30):
    """Run a query against the RCSB Search API and return its hits.

    Args:
        query: Query node (a dict) placed under the ``"query"`` key of the
            request payload.
        return_type: Value sent as the payload's ``"return_type"``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        The list stored under ``"result_set"`` in the JSON response. An empty
        list is returned when nothing matched, when the server answered with
        an error status, or when the request itself failed; failures are
        reported on stdout rather than raised.
    """
    payload = {
        "query": query,
        "request_options": {
            "return_all_hits": True,
            "results_content_type": ["experimental"],
            "sort": [{"sort_by": "score", "direction": "desc"}]
        },
        "return_type": return_type
    }

    try:
        response = requests.post(SEARCH_API, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print(f"Exception during PDB search: {e}")
        return []

    # The search service answers 204 (no body) for a valid query with no
    # hits; that is an empty result, not an error.
    if response.status_code == 204:
        print("Query returned 0 results")
        return []

    if response.status_code != 200:
        print(f"Error searching PDB: {response.status_code}, {response.text}")
        return []

    try:
        result_set = response.json().get("result_set", [])
    except ValueError as e:
        # Body was not valid JSON.
        print(f"Exception during PDB search: {e}")
        return []

    print(f"Query returned {len(result_set)} results")
    return result_set

# Output paths that have already been counted toward a task limit during this
# run, so a PDB ID returned by several queries is only counted once.
_COUNTED_PATHS = set()


def download_pdb(pdb_id, output_path, task_name, timeout=60):
    """Fetch one PDB file unless it is already on disk or the task is full.

    Args:
        pdb_id: PDB identifier; the file requested is ``<pdb_id>.pdb`` under
            ``DOWNLOAD_URL``.
        output_path: ``pathlib.Path`` the file is written to.
        task_name: Key into ``DOWNLOAD_COUNTS`` naming the task whose limit
            applies.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the file is present at ``output_path`` afterwards (either
        freshly downloaded or already existing), False when the task limit
        was reached or the download failed. Failures are printed, not raised.
    """
    if DOWNLOAD_COUNTS[task_name] >= MAX_FILES_PER_TASK:
        print(f"Max files ({MAX_FILES_PER_TASK}) reached for {task_name}. Skipping download.")
        return False

    if output_path.exists():
        print(f"PDB file {output_path} already exists. Skipping download.")
        # Existing files count toward the limit, but only once per run:
        # the same ID can come back from more than one query.
        if output_path not in _COUNTED_PATHS:
            _COUNTED_PATHS.add(output_path)
            DOWNLOAD_COUNTS[task_name] += 1
        return True

    url = f"{DOWNLOAD_URL}/{pdb_id}.pdb"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error downloading {pdb_id}: {e}")
        return False

    if response.status_code != 200:
        print(f"Failed to download {pdb_id}: {response.status_code}")
        return False

    try:
        # Write the raw bytes: text mode would re-encode with the platform
        # default codec and rewrite line endings on Windows.
        output_path.write_bytes(response.content)
    except OSError as e:
        print(f"Error downloading {pdb_id}: {e}")
        return False

    print(f"Downloaded {pdb_id} to {output_path}")
    _COUNTED_PATHS.add(output_path)
    DOWNLOAD_COUNTS[task_name] += 1
    return True

def get_homologous_pairs(protein_name):
    """Download structures for the homologous-pairs task.

    Searches the structure keywords for ``protein_name`` and downloads every
    hit into the ``homologous_pairs`` folder until the per-task limit is
    reached. When the search yields nothing, the IDs listed for the protein
    in ``FALLBACK_PDBS`` (if any) are downloaded instead.
    """
    task = "homologous_pairs"
    if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
        print("Max homologous pairs reached. Skipping.")
        return

    # Try the API search first
    keyword_node = {
        "type": "terminal",
        "service": "text",
        "parameters": {
            "attribute": "struct_keywords.text",
            "operator": "contains_phrase",
            "value": protein_name
        }
    }
    hits = search_pdb({
        "type": "group",
        "logical_operator": "and",
        "nodes": [keyword_node]
    })

    if hits:
        candidate_ids = (hit["identifier"] for hit in hits)
    else:
        print(f"No homologous pairs found for {protein_name}. Using fallback PDB IDs.")
        candidate_ids = FALLBACK_PDBS.get(protein_name, ())

    target_dir = TASK_DIRS[task]
    for pdb_id in candidate_ids:
        if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
            break
        download_pdb(pdb_id, target_dir / f"{pdb_id}.pdb", task)

def get_mutant_structures(protein_name):
    """Download structures for the mutants task.

    A hit must mention ``protein_name`` in its structure keywords and the
    phrase "mutant" in its title. Hits are downloaded into the ``mutants``
    folder until the per-task limit is reached.
    """
    task = "mutants"
    if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
        print("Max mutants reached. Skipping.")
        return

    def phrase_node(attribute, phrase):
        # Terminal text-search node matching `phrase` in `attribute`.
        return {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": attribute,
                "operator": "contains_phrase",
                "value": phrase
            }
        }

    hits = search_pdb({
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            phrase_node("struct_keywords.text", protein_name),
            phrase_node("struct.title", "mutant"),
        ]
    })

    target_dir = TASK_DIRS[task]
    for hit in hits:
        if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
            break
        pdb_id = hit["identifier"]
        download_pdb(pdb_id, target_dir / f"{pdb_id}.pdb", task)

def get_domain_structures(protein_name, domain_type="single"):
    """Download structures for the single- or multi-domain task.

    ``domain_type`` selects the destination task: ``"single"`` maps to
    ``single_domain``, anything else to ``multi_domain``. Hits whose
    structure keywords mention ``protein_name`` are downloaded into that
    task's folder until the per-task limit is reached.

    NOTE(review): ``domain_type`` only chooses the folder and counter; the
    search query itself is the same for both values, so both tasks receive
    the same hits. Confirm whether a domain-count filter was intended.
    """
    task = "multi_domain"
    if domain_type == "single":
        task = "single_domain"

    if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
        print(f"Max {task} reached. Skipping.")
        return

    keyword_node = {
        "type": "terminal",
        "service": "text",
        "parameters": {
            "attribute": "struct_keywords.text",
            "operator": "contains_phrase",
            "value": protein_name
        }
    }
    hits = search_pdb({
        "type": "group",
        "logical_operator": "and",
        "nodes": [keyword_node]
    })

    target_dir = TASK_DIRS[task]
    for hit in hits:
        if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
            break
        pdb_id = hit["identifier"]
        download_pdb(pdb_id, target_dir / f"{pdb_id}.pdb", task)

def get_disentanglement_structures(protein_name):
    """Download structures for the disentanglement task.

    Searches the structure keywords for ``protein_name`` (no UniProt filter)
    and downloads every hit into the ``disentanglement`` folder until the
    per-task limit is reached. When the search yields nothing, the IDs
    listed for the protein in ``FALLBACK_PDBS`` (if any) are used instead.
    """
    task = "disentanglement"
    if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
        print("Max disentanglement structures reached. Skipping.")
        return

    # Simplified query without UniProt filter
    keyword_node = {
        "type": "terminal",
        "service": "text",
        "parameters": {
            "attribute": "struct_keywords.text",
            "operator": "contains_phrase",
            "value": protein_name
        }
    }
    hits = search_pdb({
        "type": "group",
        "logical_operator": "and",
        "nodes": [keyword_node]
    })

    if hits:
        candidate_ids = (hit["identifier"] for hit in hits)
    else:
        print(f"No disentanglement structures found for {protein_name}. Using fallback PDB IDs.")
        candidate_ids = FALLBACK_PDBS.get(protein_name, ())

    target_dir = TASK_DIRS[task]
    for pdb_id in candidate_ids:
        if DOWNLOAD_COUNTS[task] >= MAX_FILES_PER_TASK:
            break
        download_pdb(pdb_id, target_dir / f"{pdb_id}.pdb", task)

def main():
    """Run every download task for each target protein, then print totals."""
    # Tasks in execution order, each with the extra keyword arguments it needs:
    # homologous pairs, mutants, single-/multi-domain, disentanglement.
    steps = (
        (get_homologous_pairs, {}),
        (get_mutant_structures, {}),
        (get_domain_structures, {"domain_type": "single"}),
        (get_domain_structures, {"domain_type": "multi"}),
        (get_disentanglement_structures, {}),
    )

    for protein in TARGET_PROTEINS:
        print(f"Processing {protein}...")
        for step, extra in steps:
            step(protein, **extra)

    # Summary of how many files were counted for each task
    print("\nFinal download counts:")
    for task in DOWNLOAD_COUNTS:
        print(f"{task}: {DOWNLOAD_COUNTS[task]} files")


if __name__ == "__main__":
    main()

Processing lysozyme...
Query returned 810 results
Downloaded 4PJ2 to pdb_files\homologous_pairs\4PJ2.pdb
Downloaded 1BVX to pdb_files\homologous_pairs\1BVX.pdb
Downloaded 256L to pdb_files\homologous_pairs\256L.pdb
Downloaded 2GV0 to pdb_files\homologous_pairs\2GV0.pdb
Downloaded 2RSC to pdb_files\homologous_pairs\2RSC.pdb
Downloaded 3N9A to pdb_files\homologous_pairs\3N9A.pdb
Downloaded 3N9C to pdb_files\homologous_pairs\3N9C.pdb
Downloaded 3N9E to pdb_files\homologous_pairs\3N9E.pdb
Downloaded 3RU5 to pdb_files\homologous_pairs\3RU5.pdb
Downloaded 4DT3 to pdb_files\homologous_pairs\4DT3.pdb
Downloaded 4ET8 to pdb_files\homologous_pairs\4ET8.pdb
Downloaded 4ET9 to pdb_files\homologous_pairs\4ET9.pdb
Downloaded 4ETA to pdb_files\homologous_pairs\4ETA.pdb
Downloaded 4ETB to pdb_files\homologous_pairs\4ETB.pdb
Downloaded 4ETC to pdb_files\homologous_pairs\4ETC.pdb
Downloaded 4ETD to pdb_files\homologous_pairs\4ETD.pdb
Downloaded 4ETE to pdb_files\homologous_pairs\4ETE.pdb
Downloaded 4XN6