In [10]:
#
# Program reads a comma separated list of PDB IDs and download them
# to the PDB_DIR global. 
# Used to download the list of proteins containing at least one SS bond
# with the ID list generated from: http://www.rcsb.org/
# 
#
# Author: Eric G. Suchanek, PhD
# Last modification 11/26/22
#
# 

import numpy
import os
import shutil
from glob import glob

from tqdm import tqdm
from Bio.PDB import PDBList, PDBParser

from Disulfide import check_header_from_file

PDB_DIR = '/Users/egs/PDB'
BAD_DIR = PDB_DIR + '/bad'

pdblist = PDBList(pdb=PDB_DIR, verbose=False)
parser = PDBParser(PERMISSIVE=True)

os.chdir(PDB_DIR)
all_pdb_files = glob("*.ent")

print(f'Found: {len(all_pdb_files)} PDB files')

# list of IDs containing >1 SSBond record

ssfile = open(f'{PDB_DIR}/ss_ids.txt')
Line = ssfile.readlines()
dlines = Line[0]

for line in Line:
    entries = line.split(',')

print(f'Found: {len(entries)} entries')
bad = []


Found: 9 PDB files
Found: 38215 entries


In [11]:
# file to track already downloaded entries.
bad_filename = f'{PDB_DIR}/ss_bad.txt'
badfile_handle = open(bad_filename, 'w+')

# this is the file containing the raw pdb IDs from the RCSB.org web query
for dl in dlines[0]:
    # create a list of pdb id already downloaded
    SS_raw_ids = dl.split(',')

count = len(SS_raw_ids)
badcount = 0

# Loop over all entries, 
pbar = tqdm(all_pdb_files[5:], ncols=100)
for entry in pbar:
    pbar.set_postfix({'Entry': entry, 'Bad': badcount})
    # fname = f'pdb{entry}.ent'
    if not check_header_from_file(entry):
        #shutil.move(entry, BAD_DIR)
        badcount += 1
        shutil.move(entry, 'bad')
    else:
        shutil.move(entry, 'good')
    count += 1


print(f'Overall count processed: {count}')
print(f'Bad files found: {badcount}')


100%|███████████████████████████████████████| 4/4 [00:00<00:00, 21.07it/s, Entry=pdb7frg.ent, Bad=3]

Overall count processed: 5
Bad files found: 4





In [None]:
#check_header_from_file(f'{PDB_DIR}/pdb5rsa.ent', verbose=True, dbg=True)