In [None]:
import requests
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import time

# Constants
UNIPROT_API = "https://rest.uniprot.org/uniprotkb/search"
# NOTE(review): the standalone Pfam website is believed to have been folded
# into InterPro -- confirm this endpoint still answers before relying on it.
PFAM_API = "https://pfam.xfam.org/protein/"
RCSB_API = "https://search.rcsb.org/rcsbsearch/v2/query"
OUTPUT_DIR = "protein_sequences"  # Directory that receives the FASTA files
BATCH_SIZE = 100  # Result count requested from the UniProt search
MAX_PROTEINS = 500  # Upper bound on the number of proteins processed
RETRY_LIMIT = 3  # Attempts per HTTP request before giving up
DELAY = 1  # Base pause in seconds between requests / retry attempts
USE_XRAY = True  # Set to False to skip X-ray filtering

# Ensure output directory exists. exist_ok=True replaces the separate
# os.path.exists() check, which could race with another process creating
# the directory between the check and the makedirs() call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def query_uniprot(query, size=100, timeout=30):
    """Query the UniProt search API and return the matching protein entries.

    Args:
        query: UniProt query string, sent as the ``query`` parameter.
        size: Number of results requested from the API.
        timeout: Seconds to wait for the server before an attempt counts as
            failed. Without it a stalled connection would block forever.

    Returns:
        The list stored under ``"results"`` in the JSON response, or an
        empty list when that key is absent or every attempt failed.
    """
    params = {
        "query": query,
        "format": "json",
        "size": size,
        "fields": "accession,id,sequence,ft_domain,cc_domain"
    }
    for attempt in range(RETRY_LIMIT):
        try:
            response = requests.get(UNIPROT_API, params=params, timeout=timeout)
            response.raise_for_status()
            return response.json().get("results", [])
        # ValueError covers a body that is not valid JSON, matching the
        # error handling of the other request helpers in this file.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"UniProt query failed (attempt {attempt + 1}): {e}")
            # Linear back-off, but no pointless pause once retries are used up.
            if attempt + 1 < RETRY_LIMIT:
                time.sleep(DELAY * (attempt + 1))
    return []

def query_rcsb_xray(timeout=30):
    """Query RCSB PDB for X-ray structures of human proteins.

    The search combines two terminal text nodes with a logical AND: the
    source-organism lineage name must exactly match "Homo sapiens" and the
    experimental method must exactly match "X-RAY DIFFRACTION". Entries are
    requested with ``return_all_hits`` enabled.

    Args:
        timeout: Seconds to wait for the server before an attempt counts as
            failed. Without it a stalled connection would block forever.

    Returns:
        The list stored under ``"result_set"`` in the JSON response, or an
        empty list when there are no hits or every attempt failed.
    """
    query = {
        "query": {
            "type": "group",
            "logical_operator": "and",
            "nodes": [
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": "rcsb_entity_source_organism.taxonomy_lineage.name",
                        "operator": "exact_match",
                        "value": "Homo sapiens"
                    }
                },
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "attribute": "exptl.method",
                        "operator": "exact_match",
                        "value": "X-RAY DIFFRACTION"
                    }
                }
            ]
        },
        "return_type": "entry",
        "request_options": {
            "return_all_hits": True
        }
    }
    for attempt in range(RETRY_LIMIT):
        try:
            response = requests.post(RCSB_API, json=query, timeout=timeout)
            response.raise_for_status()
            # The search service answers 204 No Content (empty body) when
            # nothing matches; that is a valid empty result, not a failure
            # worth retrying or feeding to the JSON decoder.
            if response.status_code == 204:
                return []
            return response.json().get("result_set", [])
        # ValueError covers a body that is not valid JSON.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"RCSB query failed (attempt {attempt + 1}): {e}")
            # Linear back-off, but no pointless pause once retries are used up.
            if attempt + 1 < RETRY_LIMIT:
                time.sleep(DELAY * (attempt + 1))
    return []

def get_pfam_domains(uniprot_id, timeout=30):
    """Fetch Pfam domain accessions for a UniProt ID.

    Args:
        uniprot_id: Accession appended to ``PFAM_API`` to build the URL.
        timeout: Seconds to wait for the server before an attempt counts as
            failed. Without it a stalled connection would block forever.

    Returns:
        The ``"accession"`` of every item under ``entry -> regions`` in the
        JSON response, in response order. An empty list is returned when the
        request keeps failing or the payload does not have that shape.
    """
    url = f"{PFAM_API}{uniprot_id}/entry"
    headers = {"Accept": "application/json"}
    for attempt in range(RETRY_LIMIT):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Pfam query failed for {uniprot_id} (attempt {attempt + 1}): {e}")
            # Linear back-off, but no pointless pause once retries are used up.
            if attempt + 1 < RETRY_LIMIT:
                time.sleep(DELAY * (attempt + 1))
            continue

        # A successfully decoded payload with the wrong shape will not get
        # better on retry, so report it and give up instead of crashing the
        # whole run on one protein.
        # NOTE(review): the entry/regions/accession layout is what this code
        # assumes -- confirm against the live service.
        try:
            return [entry["accession"] for entry in data.get("entry", {}).get("regions", [])]
        except (AttributeError, KeyError, TypeError) as e:
            print(f"Unexpected Pfam response for {uniprot_id}: {e!r}")
            return []
    return []

def map_pdb_to_uniprot(pdb_id, timeout=30):
    """Map a PDB entry ID to the UniProt accessions of its polymer entities.

    The entry record is fetched first to list its polymer entity IDs, then
    each polymer entity record is fetched and its ``uniprot_ids`` collected.
    Any failure restarts the whole lookup, up to ``RETRY_LIMIT`` attempts.

    Args:
        pdb_id: PDB entry identifier inserted into the RCSB data API URLs.
        timeout: Seconds to wait per request before the attempt counts as
            failed. Without it a stalled connection would block forever.

    Returns:
        A list of UniProt accessions in first-seen order with duplicates
        removed, or an empty list when every attempt failed.
    """
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    for attempt in range(RETRY_LIMIT):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            entity_ids = data.get("rcsb_entry_container_identifiers", {}).get("polymer_entity_ids", [])
            uniprot_ids = []
            for entity in entity_ids:
                entity_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity}"
                entity_response = requests.get(entity_url, timeout=timeout)
                entity_response.raise_for_status()
                entity_data = entity_response.json()
                uniprot_ids.extend(
                    entity_data.get("rcsb_polymer_entity_container_identifiers", {}).get("uniprot_ids", [])
                )
            # Several entities of one entry can point at the same accession;
            # dict.fromkeys drops the repeats while keeping first-seen order.
            return list(dict.fromkeys(uniprot_ids))
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"PDB to UniProt mapping failed for {pdb_id} (attempt {attempt + 1}): {e}")
            # Linear back-off, but no pointless pause once retries are used up.
            if attempt + 1 < RETRY_LIMIT:
                time.sleep(DELAY * (attempt + 1))
    return []

def classify_proteins(proteins, xray_uniprot_ids=None):
    """Classify proteins into single-domain, multi-domain, and shared-domain pairs.

    Args:
        proteins: Iterable of UniProt result dicts; each must provide
            ``"primaryAccession"`` and ``["sequence"]["value"]``.
        xray_uniprot_ids: Optional collection of accessions to keep. ``None``
            disables the filter. An empty collection keeps nothing.

    Returns:
        A ``(single_domain, multi_domain, shared_domain_proteins)`` tuple.
        The first two are lists of dicts with keys ``"uniprot_id"``,
        ``"sequence"`` and ``"domains"``, holding proteins with exactly one
        and with two or more domain hits respectively. The third is a list
        of ``(p1, p2)`` tuples of multi-domain proteins that share at least
        one domain but whose domain lists differ; each pair appears once.
        Proteins for which no domains were found are dropped.
    """
    protein_domains = []
    for protein in proteins:
        uniprot_id = protein["primaryAccession"]
        # Compare against None explicitly: with a plain truthiness test an
        # empty ID set (filter requested, nothing found) would silently turn
        # the filter off and let every protein through.
        if xray_uniprot_ids is not None and uniprot_id not in xray_uniprot_ids:
            continue
        domains = get_pfam_domains(uniprot_id)
        if domains:
            protein_domains.append({
                "uniprot_id": uniprot_id,
                "sequence": protein["sequence"]["value"],
                "domains": domains
            })
        time.sleep(DELAY)  # Pause between consecutive domain lookups

    # Classify proteins
    single_domain = [p for p in protein_domains if len(p["domains"]) == 1]
    multi_domain = [p for p in protein_domains if len(p["domains"]) >= 2]

    # Index multi-domain proteins by domain. dict.fromkeys visits each
    # distinct domain of a protein once, so a repeated domain does not list
    # the same protein twice under it.
    domain_to_proteins = defaultdict(list)
    for protein in multi_domain:
        for domain in dict.fromkeys(protein["domains"]):
            domain_to_proteins[domain].append(protein)

    # Pair up proteins that share a domain. Two proteins sharing several
    # domains meet once per shared domain, so remember emitted pairs and
    # report each only once.
    shared_domain_proteins = []
    seen_pairs = set()
    for members in domain_to_proteins.values():
        if len(members) < 2:
            continue
        for i, p1 in enumerate(members):
            for p2 in members[i + 1:]:
                if p1["domains"] == p2["domains"]:
                    continue  # Identical domain lists are not reported
                pair_key = (p1["uniprot_id"], p2["uniprot_id"])
                if pair_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)
                shared_domain_proteins.append((p1, p2))

    return single_domain, multi_domain, shared_domain_proteins

def find_mutant_pairs(shared_domain_proteins, max_differences=10):
    """Identify pairs whose sequences differ at only a few positions.

    Whole sequences are compared position by position; no alignment is done
    and the comparison is not restricted to the shared domains. Pairs whose
    sequences have different lengths are therefore never reported.

    Args:
        shared_domain_proteins: Iterable of ``(p1, p2)`` tuples of dicts
            that each provide a ``"sequence"`` string.
        max_differences: Exclusive upper bound on the number of differing
            positions for a pair to count as "mutant". The default of 10 is
            an arbitrary threshold; adjust as needed.

    Returns:
        The input pairs, in order, with between 1 and ``max_differences - 1``
        mismatching positions.
    """
    mutant_pairs = []
    for p1, p2 in shared_domain_proteins:
        seq1, seq2 = p1["sequence"], p2["sequence"]
        if len(seq1) != len(seq2):
            continue  # Position-wise comparison needs equal lengths
        differences = sum(a != b for a, b in zip(seq1, seq2))
        if 0 < differences < max_differences:
            mutant_pairs.append((p1, p2))
    return mutant_pairs

def save_fasta(proteins, filename, description_prefix):
    """Write one FASTA record per protein to ``OUTPUT_DIR/filename``.

    Args:
        proteins: Iterable of dicts with ``"uniprot_id"``, ``"sequence"``
            and ``"domains"`` keys.
        filename: Name of the file created inside ``OUTPUT_DIR``.
        description_prefix: Text placed before the domain list in each
            record description.
    """
    def build_record(entry):
        # Record ID is the accession; the description carries the prefix
        # and the comma-joined domain list.
        accession = entry["uniprot_id"]
        sequence = Seq(entry["sequence"])
        domain_text = ",".join(entry["domains"])
        return SeqRecord(
            sequence,
            id=accession,
            description=f"{description_prefix} | Domains: {domain_text}"
        )

    records = [build_record(entry) for entry in proteins]

    filepath = os.path.join(OUTPUT_DIR, filename)
    with open(filepath, "w") as handle:
        SeqIO.write(records, handle, "fasta")
    print(f"Saved {len(records)} sequences to {filepath}")

def save_shared_domain_fasta(pairs, filename, prefix="SharedDomain"):
    """Write both members of every protein pair to ``OUTPUT_DIR/filename``.

    Records are written pair by pair, first member then second. The number
    in each description (``Pair1`` / ``Pair2``) is the member's position
    within its pair, not a running pair counter. A protein that occurs in
    several pairs is written once per occurrence.

    Args:
        pairs: Iterable of ``(p1, p2)`` tuples of dicts with
            ``"uniprot_id"``, ``"sequence"`` and ``"domains"`` keys.
        filename: Name of the file created inside ``OUTPUT_DIR``.
        prefix: Text placed before ``_Pair<n>`` in each description.
    """
    records = []
    for first, second in pairs:
        for position, member in enumerate((first, second), start=1):
            accession = member["uniprot_id"]
            sequence = Seq(member["sequence"])
            domain_text = ",".join(member["domains"])
            records.append(
                SeqRecord(
                    sequence,
                    id=accession,
                    description=f"{prefix}_Pair{position} | Domains: {domain_text}"
                )
            )

    filepath = os.path.join(OUTPUT_DIR, filename)
    with open(filepath, "w") as handle:
        SeqIO.write(records, handle, "fasta")
    print(f"Saved {len(records)} sequences to {filepath}")

def main():
    """Run the pipeline: fetch proteins, classify them, write FASTA files.

    Steps: query UniProt for reviewed human proteins, optionally collect the
    UniProt IDs that have X-ray structures, classify the proteins by domain
    count, look for near-identical pairs, save four FASTA files and print a
    summary. Returns early when UniProt yields no proteins.
    """
    # Reviewed human entries (taxonomy ID 9606)
    proteins = query_uniprot("organism_id:9606 reviewed:true", size=BATCH_SIZE)
    if not proteins:
        print("No proteins retrieved from UniProt.")
        return

    # Cap the working set at MAX_PROTEINS.
    # NOTE(review): only a single request of BATCH_SIZE results is made, so
    # this cap presumably never trims anything while BATCH_SIZE is below
    # MAX_PROTEINS -- confirm whether paging was intended.
    proteins = proteins[:MAX_PROTEINS]
    print(f"Processing {len(proteins)} proteins...")

    # None means "no X-ray filtering"; a set is built only when enabled.
    xray_uniprot_ids = None
    if USE_XRAY:
        xray_hits = query_rcsb_xray()
        xray_uniprot_ids = set()
        # NOTE(review): every hit triggers its own mapping lookup (several
        # HTTP requests each); with all hits returned this may take very
        # long -- confirm this is acceptable.
        for hit in xray_hits:
            xray_uniprot_ids.update(map_pdb_to_uniprot(hit["identifier"]))
        print(f"Found {len(xray_uniprot_ids)} UniProt IDs with X-ray structures")

    # Domain-based classification, then the near-identical pair search
    single_domain, multi_domain, shared_domain_proteins = classify_proteins(
        proteins, xray_uniprot_ids
    )
    mutant_pairs = find_mutant_pairs(shared_domain_proteins)

    # One FASTA file per category
    save_fasta(single_domain, "single_domain.fasta", "SingleDomain")
    save_fasta(multi_domain, "multi_domain.fasta", "MultiDomain")
    save_shared_domain_fasta(shared_domain_proteins, "shared_domain_pairs.fasta")
    save_shared_domain_fasta(mutant_pairs, "mutant_pairs.fasta", prefix="Mutant")

    # Summary counts
    print(f"Single-domain proteins: {len(single_domain)}")
    print(f"Multi-domain proteins: {len(multi_domain)}")
    print(f"Shared domain protein pairs: {len(shared_domain_proteins)}")
    print(f"Mutant pairs: {len(mutant_pairs)}")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Processing 100 proteins...
