In [46]:
import requests
import os

def search_protein_family(family_name):
    family_name = family_name.replace(" ", "+")
    base_url = f"https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Creviewed&format=tsv&query=%28%28taxonomy_id%3A9606%29+AND+%22{family_name}%22%29&size=500"

    response = requests.get(base_url)

    if response.status_code == 200:
        lines = response.text.strip().split("\n")
        uniprot_ids = lines[1:]  # Skip the header line
        return uniprot_ids
    else:
        print(f"Error: {response.status_code}")
        return []

def download_pdb_files(family_name, top_n=5, download_directory="."):

    number_of_files = len([name for name in os.listdir(download_directory) if os.path.isfile(os.path.join(download_directory, name))])
    if number_of_files >= top_n:
        print(f"Already downloaded {top_n} files for protein family {family_name}")
        return
    elif number_of_files>0:
        print(f"Already downloaded {number_of_files} files for protein family {family_name}")
        top_n = top_n - number_of_files
    success=0
    uniprot_data = search_protein_family(family_name)
    for line in uniprot_data:
        try:
            uniprot_id = line.split("\t")[0]
            alphafold_url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"
            response = requests.get(alphafold_url)
            pdb_url = response.json()[0]["pdbUrl"]
            pdb_file = requests.get(pdb_url)
            with open(f"{download_directory}/{uniprot_id}.pdb", "wb") as f:
                f.write(pdb_file.content)
            print(f"Downloaded {uniprot_id}.pdb in protein family {family_name}")
            success+=1
        except:
            print(f"Failed to download {uniprot_id}.pdb in protein family {family_name}")
            pass
        ongoing_num_files = len([name for name in os.listdir(download_directory) if os.path.isfile(os.path.join(download_directory, name))])
        if(ongoing_num_files==top_n):
            break
fail = {'Amyloid beta A4 precursor protein-binding family A member 1',
 'BCL-like',
 'Bcl-2-like protein 11',
 'CD74 (Macrophage migration inhibitory factor and HLA-DR antigens-associated invariant chain)',
 'Cyclin-dependent kinases regulatory subunit 1',
 'Endothelial PAS domain-containing protein 1',
 'Guanine nucleotide-binding protein G(i) subunit alpha-1',
 'MKEAP1',
 'MNRF2',
 'Mixed-lineage leukemia 1 (MLL1)',
 'Nuclear receptor coactivator 2',
 'Nuclear receptor coactivator 3',
 'PB1-5',
 'Perilipin-1',
 'Perilipin-5',
 'S100-A10',
 'TNFB',
 'TNFR1A',
 'VEGFR3',
 'Voltage-gated N-type calcium channel alpha-1B subunit',
 'XDM2',
 'ZIPA'}
# Example usage
top_n = 5
folder = "protein_files"
for filename in os.listdir(folder):
    filepath = os.path.join(folder, filename)
    if not os.path.isdir(filepath):
        continue
    for family_name in os.listdir(filepath):
        family_path = os.path.join(filepath, family_name)
        if not os.path.isdir(family_path):
            continue
        if family_name in fail:
            download_pdb_files(family_name, top_n, family_path)


Already downloaded 3 files for protein family TNFB
Downloaded P01374.pdb in protein family TNFB
Downloaded P19438.pdb in protein family TNFB
Downloaded Q7KYK3.pdb in protein family TNFB
Already downloaded 4 files for protein family TNFR1A
Downloaded P19438.pdb in protein family TNFR1A
Downloaded J9PH39.pdb in protein family TNFR1A
Downloaded F5H6V7.pdb in protein family TNFR1A
Downloaded F5H8A6.pdb in protein family TNFR1A
Already downloaded 2 files for protein family Perilipin-5
Downloaded Q00G26.pdb in protein family Perilipin-5
Downloaded K7EIX1.pdb in protein family Perilipin-5
Already downloaded 4 files for protein family Bcl-2-like protein 11
Downloaded O43521.pdb in protein family Bcl-2-like protein 11
Downloaded E9PAM9.pdb in protein family Bcl-2-like protein 11
Downloaded H7BZE5.pdb in protein family Bcl-2-like protein 11
Downloaded C9J417.pdb in protein family Bcl-2-like protein 11
Already downloaded 3 files for protein family Nuclear receptor coactivator 2
Downloaded Q9NPJ4.