In [1]:
import biotite.database.rcsb as rcsb
import redo
import pypdb
import pandas as pd

In [2]:
def get_pdb(uniprot_id, exp_method="X-RAY DIFFRACTION", max_res=3, min_ligand_w=100):
    """Search RCSB for entries that satisfy four AND-combined field filters.

    Args:
        uniprot_id: Value matched exactly against the reference-sequence
            ``database_accession`` field of the polymer entity identifiers.
        exp_method: Value matched exactly against ``exptl.method``.
        max_res: Inclusive upper bound applied to
            ``rcsb_entry_info.resolution_combined``.
        min_ligand_w: Exclusive lower bound applied to
            ``chem_comp.formula_weight`` (queried with
            ``molecular_definition=True``).

    Returns:
        The result of ``rcsb.search`` for the combined query. Its length and
        the literal line ``PDB IDs`` are printed before returning.
    """
    accession_filter = rcsb.FieldQuery(
        "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
        exact_match=uniprot_id,
    )
    resolution_filter = rcsb.FieldQuery(
        "rcsb_entry_info.resolution_combined",
        less_or_equal=max_res,
    )
    method_filter = rcsb.FieldQuery(
        "exptl.method",
        exact_match=exp_method,
    )
    ligand_weight_filter = rcsb.FieldQuery(
        "chem_comp.formula_weight",
        molecular_definition=True,
        greater=min_ligand_w,
    )

    # Every filter must hold for an entry to be reported.
    all_filters = [
        accession_filter,
        resolution_filter,
        method_filter,
        ligand_weight_filter,
    ]
    combined_query = rcsb.CompositeQuery(all_filters, "and")

    hits = rcsb.search(combined_query)
    print(len(hits))
    print("PDB IDs")
    return hits

In [3]:
@redo.retriable(attempts=10, sleeptime=2)
def describe_one_pdb_id(pdb_id):
    """Return the ``pypdb.describe_pdb`` result for a single PDB ID.

    The function is wrapped by ``redo.retriable(attempts=10, sleeptime=2)``.
    A ``None`` result is reported on stdout and turned into a ``ValueError``
    so that the wrapper sees a failure instead of a silent ``None``.

    Args:
        pdb_id: Identifier passed straight to ``pypdb.describe_pdb``.

    Returns:
        The non-``None`` object returned by ``pypdb.describe_pdb``.

    Raises:
        ValueError: If ``pypdb.describe_pdb`` returned ``None``.
    """
    entry_description = pypdb.describe_pdb(pdb_id)
    if entry_description is not None:
        return entry_description

    print(f"! Error while fetching {pdb_id}, retrying ...")
    raise ValueError(f"Could not fetch PDB id {pdb_id}")

In [4]:
# Search with the default filters for UniProt accession P00533, then fetch
# the description of every hit (one request per PDB ID).
pdb_ids = get_pdb("P00533")
pdb_descs = list(map(describe_one_pdb_id, pdb_ids))

276
PDB IDs


In [5]:
# Inspect the 'rcsb_entry_info' section of the second fetched description
# (index 1) to see which fields are available for extraction.
pdb_descs[1].get('rcsb_entry_info')

{'assembly_count': 1,
 'branched_entity_count': 0,
 'cis_peptide_count': 0,
 'deposited_atom_count': 2560,
 'deposited_deuterated_water_count': 0,
 'deposited_hydrogen_atom_count': 0,
 'deposited_model_count': 1,
 'deposited_modeled_polymer_monomer_count': 312,
 'deposited_nonpolymer_entity_instance_count': 1,
 'deposited_polymer_entity_instance_count': 1,
 'deposited_polymer_monomer_count': 333,
 'deposited_solvent_atom_count': 20,
 'deposited_unmodeled_polymer_monomer_count': 21,
 'diffrn_radiation_wavelength_maximum': 1.0,
 'diffrn_radiation_wavelength_minimum': 1.0,
 'disulfide_bond_count': 0,
 'entity_count': 3,
 'experimental_method': 'X-ray',
 'experimental_method_count': 1,
 'inter_mol_covalent_bond_count': 0,
 'inter_mol_metalic_bond_count': 0,
 'molecular_weight': 38.27,
 'na_polymer_entity_types': 'Other',
 'nonpolymer_entity_count': 1,
 'nonpolymer_molecular_weight_maximum': 0.39,
 'nonpolymer_molecular_weight_minimum': 0.39,
 'polymer_composition': 'homomeric protein',
 'p

In [6]:
def extract_pdb_info(pdb_metadata_list):
    """
    Summarise resolution, ligand and chain information for each PDB metadata record.

    Args:
        pdb_metadata_list (list): List of dictionaries from describe_pdb for each PDB ID.

    Returns:
        list: One dictionary per input record with the keys
            'pdb_id'       -- value of 'rcsb_id', or 'Unknown' when absent;
            'resolution'   -- first element of
                              rcsb_entry_info['resolution_combined'], else
                              rcsb_entry_info['diffrn_resolution_high']['value'],
                              else 'N/A';
            'ligand_count' -- rcsb_entry_info['nonpolymer_entity_count'], default 0;
            'ligand_names' -- sorted, comma-joined unique 'comp_id' values from
                              'rcsb_binding_affinity', or the string 'None';
            'num_chains'   -- rcsb_entry_info['deposited_polymer_entity_instance_count'],
                              default 0.
    """
    results = []

    for metadata in pdb_metadata_list:
        pdb_id = metadata.get('rcsb_id', 'Unknown')

        # `or {}` covers both a missing key and a key explicitly set to None,
        # so every lookup below can rely on a dictionary.
        entry_info = metadata.get('rcsb_entry_info') or {}

        # Resolution: 'resolution_combined' takes precedence when the key is
        # present (even if empty); 'diffrn_resolution_high' is only consulted
        # when 'resolution_combined' is absent altogether.
        resolution = 'N/A'
        if 'resolution_combined' in entry_info:
            combined = entry_info['resolution_combined']
            if combined:
                resolution = combined[0]
        elif 'diffrn_resolution_high' in entry_info:
            resolution = entry_info['diffrn_resolution_high'].get('value', 'N/A')

        ligand_count = entry_info.get('nonpolymer_entity_count', 0)

        # Always assigned per record: previously a record without
        # 'rcsb_entry_info' either raised UnboundLocalError or silently
        # inherited the chain count of the preceding record.
        num_chains = entry_info.get('deposited_polymer_entity_instance_count', 0)

        # Unique ligand IDs, sorted so the joined string is reproducible
        # across runs (plain set iteration order is not).
        affinity_entries = metadata.get('rcsb_binding_affinity') or []
        ligand_names = sorted(
            {entry['comp_id'] for entry in affinity_entries if entry.get('comp_id')}
        )

        results.append({
            'pdb_id': pdb_id,
            'resolution': resolution,
            'ligand_count': ligand_count,
            'ligand_names': ', '.join(ligand_names) if ligand_names else 'None',
            'num_chains': num_chains
        })

    return results

In [7]:
# Flatten the fetched descriptions into records and load them into a table
# with one row per PDB entry.
res = extract_pdb_info(pdb_descs)
df = pd.DataFrame(res)

In [8]:
# Preview the first rows of the extracted table.
df.head()

Unnamed: 0,pdb_id,resolution,ligand_count,ligand_names,num_chains
0,1M14,2.6,0,,1
1,1M17,2.6,1,AQ4,1
2,1MOX,2.5,4,,4
3,1NQL,2.8,1,,2
4,1XKK,2.4,2,FMM,1


In [18]:
# Print every PDB ID in the table, one per line.
for pdb_id in df['pdb_id']:
    print(pdb_id)

1M14
1M17
1MOX
1NQL
1XKK
1YY9
2EB2
2EB3
2GS2
2GS6
2GS7
2ITN
2ITP
2ITQ
2ITT
2ITU
2ITV
2ITW
2ITX
2ITZ
2J5F
2RFE
2RGP
3B2U
3BEL
3BUO
3G5V
3G5Y
3GOP
3GT8
3IKA
3LZB
3OB2
3OP0
3P0Y
3PFV
3POZ
3QWQ
3UG1
3UG2
3VJN
3VJO
3VRP
3VRR
3W2O
3W2P
3W2Q
3W2R
3W2S
3W32
3W33
4G5J
4HJO
4I1Z
4I22
4I23
4I24
4JQ7
4JQ8
4JR3
4JRV
4KRL
4KRM
4KRP
4LI5
4LQM
4R3P
4RIY
4RJ4
4RJ6
4RJ7
4RJ8
4UIP
4UV7
4WKQ
4WRG
4ZAU
4ZJV
4ZSE
5C8K
5C8M
5C8N
5CAL
5CAN
5CAO
5CAP
5CAQ
5CAS
5CAU
5CAV
5CNN
5CNO
5CZH
5CZI
5D41
5EDP
5EDQ
5EDR
5EM5
5EM6
5EM7
5EM8
5FED
5FEE
5GMP
5GNK
5GTZ
5HCX
5HCY
5HCZ
5HG5
5HG7
5HG8
5HG9
5HIB
5HIC
5J9Y
5J9Z
5SX4
5SX5
5U8L
5UG8
5UG9
5UGA
5UGB
5UGC
5WB7
5WB8
5X26
5X27
5X28
5X2A
5X2C
5X2F
5XDK
5XDL
5XGM
5XGN
5XWD
5YU9
5ZTO
5ZWJ
6B3S
6D8E
6DUK
6JRJ
6JRK
6JRX
6JWL
6JX0
6JX4
6JXT
6JZ0
6LUB
6LUD
6P1D
6P1L
6P8Q
6S89
6S8A
6S9C
6S9D
6TFU
6TFV
6TFW
6TFY
6TFZ
6TG0
6TG1
6V5N
6V5P
6V66
6V6K
6V6O
6VH4
6VHN
6WA2
6WAK
6WXN
6XL4
6Z4B
6Z4D
7A2A
7A6I
7A6J
7A6K
7AEI
7AEM
7B85
7ER2
7JXI
7JXL
7JXM
7JXP
7JXQ
7JXW
7K1H
7KXZ
7LEN
7LG8


In [25]:
# Distinct values of the ligand_names column; each value is a comma-joined
# list of ligand IDs, or the string 'None' when no ligand ID was extracted.
df.ligand_names.unique()

array(['None', 'AQ4', 'FMM', 'AEE', 'IRE', 'DJK', '0UN', '03P', 'W2R',
       'W32', 'W19', '0WM', '1C9', 'KJQ', 'KJ8', 'KJR', 'KJV', '3QW',
       '3R1', '3QS', 'YY3', '4YV', '4YW', '4YX', '4Z8', '4ZB', '4ZG',
       '4ZH', '4ZJ', '57N', '5N3', '5N4', '5Q2', '5Q3', '5Q4', 'F62',
       '60B', '60D', '60E', '6HL', '6HJ', '8BM', '7XO', '7XR', '8JC',
       '1E8', '9JO', '9LL', 'JBJ', 'CKO', 'NQ1', 'O57', 'L0Q', 'L0N',
       'QP7', 'QQJ', 'QQM', 'TOV', 'UEJ', 'R85', '6GY', 'JAU', 'VO7',
       '9LL, YY3', 'JBJ, YY3', 'VNS', 'VNS, YY3', 'XA4', 'VNS, 8RC',
       'YFA', '35Z', 'M0R', 'M19', 'I0A', '7VH', 'R25', 'V58', 'KY9',
       'QFO', 'X9H', 'D0D'], dtype=object)

In [30]:
# Rows that have at least one extracted ligand ID and a ligand_count
# (taken from 'nonpolymer_entity_count') greater than one.
df.query("ligand_names != 'None' and ligand_count > 1")

Unnamed: 0,pdb_id,resolution,ligand_count,ligand_names,num_chains
4,1XKK,2.400,2,FMM,1
19,2ITZ,2.800,2,IRE,1
39,3UG2,2.500,2,IRE,1
47,3W2R,2.050,2,W2R,1
48,3W2S,1.900,2,W2R,1
...,...,...,...,...,...
216,8A27,1.070,3,KY9,1
217,8A2A,1.430,2,KY9,1
221,8D76,2.400,2,QFO,2
223,8F1H,2.800,2,X9H,1
