In [1]:
import os
import gzip
import shutil
import requests
import random
from pathlib import Path
from itertools import combinations
from concurrent.futures import ThreadPoolExecutor, as_completed

from Bio.PDB import PDBList, PDBParser, Superimposer
from Bio.PDB.Polypeptide import is_aa

# Configuration
CLUSTER_URL = "https://cdn.rcsb.org/resources/sequence/clusters/clusters-by-entity-95.txt"
OUTPUT_DIR = Path("pdb_mutant_pairs")
MIN_RMSD = 0.001          # Å; exclude trivially identical structures
MAX_RMSD = 2.0            # Å; allow small conformational shifts
MIN_MUTATIONS = 1         # require at least one substitution (identical sequences are rejected)
MAX_MUTATIONS = 20        # avoid overly divergent sequences
TARGET_PAIR_COUNT = 100   # desired number of biologically meaningful pairs

# Worker thread count: at least 8, more when the machine reports more cores.
# os.cpu_count() returns None when the core count cannot be determined, so
# fall back to 1 before taking the maximum instead of raising TypeError.
import os as _os
THREADS = max(8, _os.cpu_count() or 1)

# 1. Prepare output directory
print("[1/4] Preparing output directory...")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
cluster_file = OUTPUT_DIR / "clusters-by-entity-95.txt"

# 2. Download cluster file if missing
print("[2/4] Checking cluster file...")
if not cluster_file.exists():
    print("    Downloading cluster file...")
    # Stream into a temporary ".part" file and rename only after the download
    # finished: an interrupted run must not leave a truncated file behind that
    # the exists() check above would accept as complete on the next run.
    partial_file = cluster_file.with_name(cluster_file.name + ".part")
    # The timeout prevents a stalled connection from hanging the script; the
    # context manager releases the connection even if writing fails.
    with requests.get(CLUSTER_URL, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(partial_file, 'wb') as fout:
            for chunk in resp.iter_content(chunk_size=8192):
                fout.write(chunk)
    partial_file.replace(cluster_file)
    print("    Cluster file downloaded.")
else:
    print("    Cluster file exists; skipping download.")

# 3. Parse clusters and select diverse candidate pairs
# Each line of the cluster file is one cluster: whitespace-separated
# identifiers of the form "<entry>_<entity>". One random pair of distinct
# entries is taken from every cluster that has at least two of them.
print("[3/4] Parsing clusters and selecting candidate pairs...")
pairs = []
with open(cluster_file) as fin:
    for line in fin:
        codes = set()
        for entity in line.split():
            # Split on the LAST underscore so the entity number is removed
            # without truncating identifiers that contain further underscores.
            entry_id, sep, _entity_no = entity.rpartition('_')
            # Keep only classic four-character entry IDs; anything else cannot
            # be fetched by the FASTA/PDB download URLs used later on.
            # NOTE(review): presumably this also drops computed-model IDs
            # (prefixed identifiers) -- confirm against the cluster file.
            if sep and len(entry_id) == 4 and entry_id.isalnum():
                codes.add(entry_id)
        if len(codes) < 2:
            continue
        # sorted() gives random.sample a sequence (sets are not accepted);
        # the two picks are distinct and in random order.
        pairs.append(tuple(random.sample(sorted(codes), 2)))
print(f"    {len(pairs)} pairs loaded.")
random.shuffle(pairs)

# 4. Define processing function

_REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from blocking a worker forever


def _download_text(url):
    """Return the body of *url* as text; raise ``requests.RequestException`` on failure."""
    resp = requests.get(url, timeout=_REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp.text


def _first_fasta_record(text):
    """Return ``(header, sequence)`` of the first FASTA record in *text*.

    Parsing stops at the second ``>`` line so that headers of later records
    are never concatenated into the sequence. ``(None, '')`` is returned when
    *text* contains no ``>`` header line.
    """
    header = None
    seq_lines = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            if header is not None:
                break
            header = line
        elif header is not None:
            seq_lines.append(line)
    return header, ''.join(seq_lines)


def _is_protein_only(structure):
    """Return True when every non-hetero residue of *structure* is an amino acid.

    Residues whose hetero flag is set (waters, ligands, ions) are ignored;
    otherwise a single water molecule would disqualify the structure.
    """
    return all(
        is_aa(res)
        for res in structure.get_residues()
        if res.get_id()[0] == ' '
    )


def _ca_atoms(structure):
    """Return the CA atoms of all amino-acid residues of *structure*.

    Selecting through the residue (instead of matching the atom name alone)
    keeps calcium ions, whose atom name is also ``CA``, out of the list.
    """
    return [res['CA'] for res in structure.get_residues()
            if is_aa(res) and 'CA' in res]


def process_pair(pdb1, pdb2):
    """Evaluate one candidate pair of PDB entries.

    The first FASTA record of each entry is downloaded and compared; pairs
    whose sequences differ in length or whose substitution count lies outside
    ``[MIN_MUTATIONS, MAX_MUTATIONS]`` are rejected. For the remaining pairs
    both PDB files are downloaded into ``OUTPUT_DIR / pair_<pdb1>_<pdb2>``,
    the structures are checked to be protein-only, and the CA atoms are
    superimposed. The pair is accepted when the RMSD lies within
    ``[MIN_RMSD, MAX_RMSD]``.

    Only the first FASTA record of each entry is used.
    NOTE(review): the cluster's entity number is not passed in, so for
    multi-entity entries this may not be the clustered entity -- confirm.

    Args:
        pdb1: PDB entry ID of the first structure.
        pdb2: PDB entry ID of the second structure.

    Returns:
        A dict with keys ``pdb1``, ``pdb2``, ``mutations``, ``identity_pct``,
        ``rmsd``, ``seq1`` and ``seq2`` for an accepted pair, otherwise
        ``None``. The pair directory is removed whenever the pair is rejected
        or an error occurs after it was created.
    """
    # Fetch FASTA
    try:
        header1, seq1 = _first_fasta_record(
            _download_text(f"https://www.rcsb.org/fasta/entry/{pdb1}"))
        header2, seq2 = _first_fasta_record(
            _download_text(f"https://www.rcsb.org/fasta/entry/{pdb2}"))
    except requests.RequestException:
        return None
    if not seq1 or not seq2 or len(seq1) != len(seq2):
        return None

    # Mutation filter
    mutations = sum(a != b for a, b in zip(seq1, seq2))
    if not MIN_MUTATIONS <= mutations <= MAX_MUTATIONS:
        return None

    # Prepare dir
    pair_dir = OUTPUT_DIR / f"pair_{pdb1}_{pdb2}"
    pair_dir.mkdir(exist_ok=True)
    accepted = False
    try:
        # Save sequences
        with open(pair_dir / f"{pdb1}.fasta", 'w') as f:
            f.write(header1 + "\n" + seq1 + "\n")
        with open(pair_dir / f"{pdb2}.fasta", 'w') as f:
            f.write(header2 + "\n" + seq2 + "\n")

        # Download PDBs directly (uncompressed) to avoid .ent/.gz issues
        pdb_paths = []
        for pid in (pdb1, pdb2):
            try:
                text = _download_text(f"https://files.rcsb.org/download/{pid}.pdb")
            except requests.RequestException:
                return None
            pdb_out = pair_dir / f"{pid.lower()}.pdb"
            with open(pdb_out, 'w') as f:
                f.write(text)
            pdb_paths.append(pdb_out)
        p1, p2 = pdb_paths

        # Protein-only filter
        parser = PDBParser(QUIET=True)
        try:
            s1 = parser.get_structure(pdb1, p1)
            s2 = parser.get_structure(pdb2, p2)
        except Exception:
            # Any parse failure simply disqualifies the pair.
            return None
        if not (_is_protein_only(s1) and _is_protein_only(s2)):
            return None

        # RMSD
        ca1 = _ca_atoms(s1)
        ca2 = _ca_atoms(s2)
        if not ca1 or len(ca1) != len(ca2):
            return None
        sup = Superimposer()
        sup.set_atoms(ca1, ca2)
        rmsd_val = sup.rms
        if rmsd_val < MIN_RMSD or rmsd_val > MAX_RMSD:
            return None

        identity_pct = round(100 * (len(seq1) - mutations) / len(seq1), 1)
        accepted = True
        return {'pdb1': pdb1, 'pdb2': pdb2, 'mutations': mutations,
                'identity_pct': identity_pct, 'rmsd': round(rmsd_val, 3),
                'seq1': seq1, 'seq2': seq2}
    finally:
        # Single cleanup point: runs for every rejection above and for
        # unexpected exceptions, so no half-filled directory is left behind.
        if not accepted:
            shutil.rmtree(pair_dir, ignore_errors=True)

# 5. Parallel execution
# Every candidate pair is submitted to a thread pool; results are consumed in
# completion order until TARGET_PAIR_COUNT pairs have been accepted.
print(f"[4/4] Processing {len(pairs)} pairs with {THREADS} threads...")
selected = []
processed = 0
with ThreadPoolExecutor(max_workers=THREADS) as executor:
    futures = {executor.submit(process_pair, a, b): (a, b) for a, b in pairs}
    try:
        for future in as_completed(futures):
            processed += 1
            try:
                res = future.result()
            except Exception as exc:
                # One failing pair must not abort the whole run; report it
                # and carry on with the remaining futures.
                a, b = futures[future]
                print(f"    Skipping {a} vs {b}: {exc!r}")
                res = None
            if res:
                selected.append(res)
                print(f"    Selected {len(selected)}/{TARGET_PAIR_COUNT}: {res['pdb1']} vs {res['pdb2']}")
            if processed % 50 == 0:
                print(f"    Processed {processed}/{len(pairs)}; {len(selected)} selected")
            if len(selected) >= TARGET_PAIR_COUNT:
                break
    finally:
        # Leaving the `with` block waits for every future that is still
        # queued. Cancel the ones that have not started so that stopping
        # early (target reached, or an interrupt) really stops the work;
        # cancel() is a no-op for futures that are running or finished.
        for pending in futures:
            pending.cancel()
print(f"Finished: {len(selected)} pairs selected from {processed} processed.")

# Summary output
for e in selected:
    print(f"{e['pdb1']} vs {e['pdb2']}: Mutations={e['mutations']}, Identity={e['identity_pct']}%, RMSD={e['rmsd']} Å")


[1/4] Preparing output directory...
[2/4] Checking cluster file...
    Cluster file exists; skipping download.
[3/4] Parsing clusters and selecting candidate pairs...
    54609 pairs loaded.
[4/4] Processing 54609 pairs with 12 threads...
    Processed 50/54609; 0 selected
    Processed 100/54609; 0 selected
    Processed 150/54609; 0 selected
    Processed 200/54609; 0 selected
    Processed 250/54609; 0 selected
    Processed 300/54609; 0 selected
    Processed 350/54609; 0 selected
    Processed 400/54609; 0 selected
    Processed 450/54609; 0 selected
    Processed 500/54609; 0 selected
    Processed 550/54609; 0 selected
    Processed 600/54609; 0 selected
    Processed 650/54609; 0 selected
    Processed 700/54609; 0 selected
    Processed 750/54609; 0 selected
    Processed 800/54609; 0 selected
    Processed 850/54609; 0 selected
    Processed 900/54609; 0 selected
    Processed 950/54609; 0 selected
    Processed 1000/54609; 0 selected
    Processed 1050/54609; 0 selected
    