In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os
import requests
from io import StringIO
from Bio.PDB import PDBParser, PDBIO, Select
from tape.datasets import LMDBDataset
from collections import Counter
from functools import partial
from Bio.PDB import PDBParser, DSSP
from pathlib import Path
from foldingdiff.datasets import extract_pdb_code_and_chain

class ChainSelect(Select):
    """
    Selection filter for PDBIO that lets exactly one chain through.

    The chain to keep is identified by ``chain_id``; every chain whose
    ``get_id()`` differs from it is rejected when the structure is written.
    """
    def __init__(self, chain_id):
        self.chain_id = chain_id

    def accept_chain(self, chain_obj):
        # 1 accepts the chain, 0 rejects it.
        return 1 if chain_obj.get_id() == self.chain_id else 0

def download_and_filter_pdb(pdb_code, chain, download_dir="pdb_files"):
    """
    Download a PDB entry from RCSB and save only one of its chains.

    The full entry is fetched from ``https://files.rcsb.org/download/<pdb_code>.pdb``,
    parsed with Biopython, and the requested chain is written to
    ``<download_dir>/<pdb_code>_<chain>.pdb``.

    A network error, a non-200 response, or a chain id that does not occur in
    the entry is reported with ``print`` and the function returns without
    writing a file. Errors raised while parsing or writing still propagate.

    Args:
        pdb_code: PDB identifier used to build the download URL and file name.
        chain: Id of the chain to keep.
        download_dir: Directory for the filtered file; created if missing.

    Returns:
        None.
    """
    os.makedirs(download_dir, exist_ok=True)
    filename = os.path.join(download_dir, f"{pdb_code}_{chain}.pdb")
    url = f"https://files.rcsb.org/download/{pdb_code}.pdb"

    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors instead of
        # letting one bad request abort a whole batch of downloads.
        print(f"Error: Could not download {pdb_code}; {exc}")
        return

    if response.status_code != 200:
        print(f"Error: Could not download {pdb_code}; status code {response.status_code}")
        return

    # Parse the PDB content using a StringIO stream
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_code, StringIO(response.text))

    # Writing with a chain id that is not present would leave a file with no atoms.
    if not any(c.get_id() == chain for c in structure.get_chains()):
        print(f"Error: Chain {chain} not found in {pdb_code}")
        return

    # Prepare the PDB writer with our custom ChainSelect
    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(filename, select=ChainSelect(chain))


In [2]:
def process_sample(sample, download_dir):
    """Fetch the chain-filtered PDB file for a single dataset sample and return the sample's id."""
    sample_id = sample['id']
    # The id is split into a PDB code and a chain id by the foldingdiff helper.
    code_and_chain = extract_pdb_code_and_chain(sample_id)
    pdb_code, chain = code_and_chain
    download_and_filter_pdb(pdb_code, chain, download_dir)
    return sample_id

# Size of the thread pool that process_dataset uses. Each worker performs its
# own HTTP download, so adjust this to what the network and the remote server
# can tolerate.
max_workers = 100

def process_dataset(dataset_ids, download_dir, workers=None):
    """
    Download chain-filtered PDB files for every sample using a thread pool.

    Args:
        dataset_ids: Iterable of dataset samples; each one is handed to
            ``process_sample``. It is materialised into a list so that the
            progress bar knows the total number of items.
        download_dir: Directory the PDB files are written to.
        workers: Thread-pool size. Defaults to the module-level ``max_workers``.

    Returns:
        List of the values returned by ``process_sample``, in input order.
    """
    samples = list(dataset_ids)
    pool_size = max_workers if workers is None else workers
    fetch = partial(process_sample, download_dir=download_dir)
    with ThreadPoolExecutor(max_workers=pool_size) as executor:
        # executor.map yields results in input order; tqdm wraps it to show progress.
        results = list(tqdm(executor.map(fetch, samples), total=len(samples)))

    print("Finished processing samples.")
    return results

In [3]:
# Root of the remote-homology data: the LMDB splits are read from here and the
# per-split PDB folders are created underneath it.
REMOTE_HOMOLOGY_DIR = '/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology'
# A fold is kept only if the training split has more than this many samples of it.
MIN_FOLD_COUNT = 50
SPLITS = ['train', 'valid', 'test_family_holdout', 'test_fold_holdout', 'test_superfamily_holdout']

# Count the fold labels in the training split and keep the frequent ones.
train = LMDBDataset(f'{REMOTE_HOMOLOGY_DIR}/remote_homology_train.lmdb')
counts = Counter(s['fold_label'] for s in train)
keep = {label for label, n in counts.items() if n > MIN_FOLD_COUNT}
print(len(keep))

# Download the PDB chain of every sample, in every split, whose fold was kept.
for suffix in SPLITS:
    split = LMDBDataset(f'{REMOTE_HOMOLOGY_DIR}/remote_homology_{suffix}.lmdb')
    dataset_ids = [s for s in split if s['fold_label'] in keep]
    process_dataset(dataset_ids, f'{REMOTE_HOMOLOGY_DIR}/{suffix}_pdbs')



45


100%|██████████| 6102/6102 [15:37<00:00,  6.51it/s]  


Finished processing samples.


100%|██████████| 242/242 [00:37<00:00,  6.45it/s]


Finished processing samples.


100%|██████████| 809/809 [02:11<00:00,  6.16it/s] 


Finished processing samples.


100%|██████████| 239/239 [00:42<00:00,  5.66it/s]


Finished processing samples.


100%|██████████| 460/460 [01:15<00:00,  6.06it/s]

Finished processing samples.





In [3]:
# Collect the path of every downloaded PDB file that cannot be parsed or that
# DSSP fails on.
bad = []
# Hoisted out of the loop: the parser is configured identically for every file.
parser = PDBParser(QUIET=True)
for suffix in ['train','valid','test_family_holdout','test_fold_holdout','test_superfamily_holdout']:
    folder = f'/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/{suffix}_pdbs'
    for f in tqdm(os.listdir(folder)):
        # Only PDB files are candidates; anything else in the folder must not
        # end up in `bad`.
        if not f.endswith('.pdb'):
            continue
        fname = os.path.join(folder, f)
        try:
            # Parsing is inside the try so that an unreadable or empty file is
            # recorded as bad instead of aborting the whole scan.
            structure = parser.get_structure(Path(fname).stem, fname)
            model = structure[0]  # assuming you want the first model
            dssp = DSSP(model, fname)
        except Exception:
            # `Exception`, not a bare except: KeyboardInterrupt/SystemExit must
            # propagate, otherwise interrupting the loop would flag a good file.
            print(fname)
            bad.append(fname)

empty protein, or no valid complete residues

  6%|▌         | 315/5392 [00:35<10:26,  8.11it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/2RCJ_A.pdb


empty protein, or no valid complete residues

  6%|▌         | 323/5392 [00:36<09:52,  8.55it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/1XBP_G.pdb


empty protein, or no valid complete residues

 19%|█▉        | 1046/5392 [02:13<06:44, 10.74it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/1HRB_A.pdb


empty protein, or no valid complete residues

 25%|██▌       | 1364/5392 [02:56<06:13, 10.78it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/2ZJR_1.pdb


empty protein, or no valid complete residues

 26%|██▌       | 1387/5392 [02:59<07:06,  9.39it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/2MYS_B.pdb


TER    3574      HOH A7502                                                       

empty protein, or no valid complete residues

 33%|███▎      | 1764/5392 [03:57<06:21,  9.51it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/1C53_A.pdb


empty protein, or no valid complete residues

 36%|███▋      | 1961/5392 [04:24<06:23,  8.95it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/1FFK_W.pdb


TER    1343      HOH A 523                                                       

empty protein, or no valid complete residues

 56%|█████▋    | 3043/5392 [06:55<03:53, 10.06it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/2ILA_A.pdb


TER    2662      HOH A1363                                                       

empty protein, or no valid complete residues

 61%|██████    | 3285/5392 [07:31<03:34,  9.81it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/1KVP_A.pdb


TER    1460      HOH A 322                                                       

empty protein, or no valid complete residues

 70%|██████▉   | 3751/5392 [08:36<02:57,  9.22it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/1A1Q_A.pdb


empty protein, or no valid complete residues

 75%|███████▌  | 4065/5392 [09:21<02:18,  9.58it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/1ZVO_A.pdb


TER    1814      HOH A 585                                                       

TER    4791      HOH A1240                                                       

TER    1301      HOH A 359                                                       

TER    3783      HOH B2423                                                       

empty protein, or no valid complete residues



/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/train_pdbs/2MYS_C.pdb


TER    3642      HOH A 810                                                       

100%|██████████| 5392/5392 [12:34<00:00,  7.15it/s]
empty protein, or no valid complete residues

 29%|██▉       | 69/240 [00:10<00:22,  7.59it/s]

/n/holylfs06/LABS/mzitnik_lab/Users/msun415/foldingdiff/data/remote_homology/valid_pdbs/1MLI_A.pdb


100%|██████████| 240/240 [00:38<00:00,  6.20it/s]
100%|██████████| 776/776 [01:57<00:00,  6.60it/s]
100%|██████████| 220/220 [00:35<00:00,  6.24it/s]
TER    1244      HOH A 706                                                       

100%|██████████| 448/448 [01:06<00:00,  6.72it/s]


In [6]:
# Delete every file collected in `bad`.
for f in bad:
    try:
        os.remove(f)
    except FileNotFoundError:
        # Already gone (for example when this cell is run a second time);
        # tolerating this keeps the cell re-runnable.
        pass