In [None]:
# First data compilation file: creates the 'initialSet' CSV file (columns PDB_ID, UniProt_ID).
import requests
import time
import pandas as pd
import json
import os

# Output file path (customize as needed).
# NOTE: this is an absolute, machine-specific path. process_pdb_ids() creates
# the file when it is missing, but the parent directory must already exist.
output_file = "/Users/carmenshero/Desktop/Datasets/initialSet.csv"

# Retry wrapper for API requests
def safe_request(url, max_retries=3, delay=5, timeout=60):
    """Fetch *url* with GET and return the decoded JSON body, or None.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts before giving up.
        delay: Seconds to wait between two consecutive attempts.
        timeout: Seconds to wait on the server before an attempt is counted
            as failed. Without it a stalled connection would block forever.

    Returns:
        The parsed JSON payload of the first 200 response. None when the
        server answers 404 or 503, when every attempt fails, or when
        max_retries is less than 1.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            # 404 and 503 are treated as final answers and are not retried.
            if response.status_code in (404, 503):
                return None
        except (requests.exceptions.RequestException, ValueError):
            # Network failure, timeout, or a body that is not valid JSON
            # (older requests versions raise a plain ValueError for that):
            # fall through and retry.
            pass
        # Only pause when another attempt will actually follow.
        if attempt < max_retries - 1:
            time.sleep(delay)
    return None

# Get all PDB IDs from RCSB
def get_all_pdb_ids():
    """Return the identifiers of all entries reported by the RCSB search API.

    The query asks for every entry whose
    ``rcsb_entry_container_identifiers.entry_id`` attribute exists, with
    ``return_all_hits`` enabled.

    Returns:
        A list with the "identifier" value of each item in the response's
        "result_set", or an empty list when the request fails or the
        response carries no "result_set".
    """
    # Function-scope import: urllib is not among the file's top-level imports.
    from urllib.parse import quote

    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "rcsb_entry_container_identifiers.entry_id",
                "operator": "exists",
            },
        },
        "request_options": {
            "return_all_hits": True,
        },
        "return_type": "entry",
    }
    # Percent-encode the JSON so braces, quotes and spaces cannot corrupt the
    # query string instead of relying on the HTTP client to repair the URL.
    encoded_query = quote(json.dumps(query), safe="")
    url = f"https://search.rcsb.org/rcsbsearch/v2/query?json={encoded_query}"
    response = safe_request(url)
    if response and "result_set" in response:
        return [entry["identifier"] for entry in response["result_set"]]
    return []

# Get UniProt IDs for a given PDB ID
def pdb_to_uniprot(pdb_id):
    """Return the UniProt IDs that the PDBe mappings API lists for *pdb_id*.

    The lookup uses the lower-cased PDB ID both in the request URL and as
    the key into the response. An empty list is returned when the request
    yields nothing or the response has no entry for that key.
    """
    key = pdb_id.lower()
    payload = safe_request(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{key}")
    if payload and key in payload:
        uniprot_section = payload[key].get("UniProt", {})
        return list(uniprot_section.keys())
    return []

# Main data retrieval and appending function
def process_pdb_ids(output_file):
    """Append PDB-to-UniProt mappings for all unprocessed PDB IDs to a CSV.

    The CSV has the columns ``PDB_ID`` and ``UniProt_ID``. PDB IDs already
    present in the file are skipped, so an interrupted run can be resumed.

    NOTE: a PDB ID for which no UniProt ID is found writes no row, so it is
    not remembered as processed and is queried again on the next run.

    Args:
        output_file: Path of the CSV file to create or extend.
    """
    # Write the header when the file is missing *or* empty; an empty file
    # would otherwise receive data rows without any header.
    if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("PDB_ID,UniProt_ID\n")
        print(f"📄 Created new file: {output_file}")

    # Load already processed PDB IDs. dtype=str keeps IDs such as "1E10"
    # from being parsed as floats, which would break the membership test.
    try:
        processed_df = pd.read_csv(output_file, usecols=["PDB_ID"], dtype=str)
        processed_ids = set(processed_df["PDB_ID"].dropna())
    except (OSError, KeyError, ValueError) as exc:
        # Best effort: an unreadable or malformed file means nothing is
        # skipped, but the reason is reported instead of silently dropped.
        print(f"⚠️ Could not load processed IDs from {output_file}: {exc}")
        processed_ids = set()

    all_pdb_ids = get_all_pdb_ids()
    print(f"📊 Retrieved {len(all_pdb_ids)} PDB IDs")

    for pdb_id in all_pdb_ids:
        if pdb_id in processed_ids:
            continue

        print(f"🔍 Processing PDB ID: {pdb_id}")
        uniprot_ids = pdb_to_uniprot(pdb_id)
        if not uniprot_ids:
            continue

        # Append every row of this entry in a single write, so an
        # interruption is less likely to leave a partially saved entry that
        # would be skipped on resume.
        rows = pd.DataFrame({"PDB_ID": pdb_id, "UniProt_ID": uniprot_ids})
        rows.to_csv(output_file, mode="a", header=False, index=False)
        for uniprot_id in uniprot_ids:
            print(f"✅ Saved {pdb_id}, {uniprot_id}")

# Run the process.
# NOTE: this call sits at module level, so it also executes (issuing network
# requests and writing to output_file) whenever this file is imported.
process_pdb_ids(output_file)
