In [1]:
import requests
import json
import pandas as pd
from itertools import batched
import os

In [3]:
# Read the list of PDB codes for initial consideration.
pdbCodeFile = './pdbListAll.txt'

# The file holds PDB ids separated by spaces, commas and/or newlines.
with open(pdbCodeFile) as f:
    fileRead = f.read()

# Treat commas as whitespace, then split on any run of whitespace. Unlike
# split(' '), this never yields empty strings or ids with a ',' or '\n'
# still attached, and an empty file gives an empty list instead of [''].
pdbCodes = fileRead.replace(',', ' ').split()

# Alternative input format: a delimited file with one column labeled 'pdbid':
#   df = pd.read_csv(pdbCodeFile)
#   pdbCodes = list(df['pdbid'])

# Only the first 1000 codes are kept for the rest of the notebook.
pdbCodes = pdbCodes[:1000]

print(len(pdbCodes),'codes\n')

1000 codes



In [None]:
# Download the PDBe entry summaries for every code in pdbCodes.
# PDBe only allows batches of up to 1000 ids per request, so the codes are
# posted in batches of batchSize.
batchSize = 100  # must not exceed 1000
requestTimeout = 60  # seconds; without a timeout a stalled connection hangs the cell forever

urlPrefix = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/'
pdbCodesBatched = batched( pdbCodes, batchSize ) # split the list of codes into batches of batchSize
reportDict = {}  # pdb id -> summary record(s) parsed from the JSON response
for batch in pdbCodesBatched:
    print('downloading batch...', end='')
    codeString = ','.join(batch)
    report = requests.post(urlPrefix, data=codeString, timeout=requestTimeout)
    if report.status_code == 404:
        # NOTE(review): PDBe appears to answer 404 when none of the posted ids
        # is known - treated here as an empty batch; confirm against the API docs.
        batchReport = {}
    else:
        # Fail with the real HTTP error instead of a confusing JSON decode error
        # caused by trying to parse an error page.
        report.raise_for_status()
        batchReport = report.json()
    reportDict.update( batchReport )
    print(len(reportDict),'total entries')
print(len(reportDict),'entries downloaded')

In [None]:
# create summary dataframe from downloaded dictionary

# keys in downloaded dictionary - not all are used below but listed here for completeness:
#   'title', 'processing_site', 'deposition_site', 'deposition_date', 'release_date',
#   'revision_date', 'experimental_method_class', 'experimental_method', 'split_entry',
#   'related_structures', 'entry_authors', 'number_of_entities', 'assemblies'

## lists of keys that will be used:

# 1. keys with numerical, string or list values
simpleKeys = ['title', 'deposition_date', 'experimental_method']

# 2. keys of the sub-dictionaries that are the values associated with keys 'number_of_entities' and 'assemblies'
entityKeys = ['water', 'polypeptide', 'dna', 'rna', 'dna/rna', 'sugar', 'ligand', 'carbohydrate_polymer', 'other']
assemblyKeys = ['assembly_id', 'name', 'form']

# columns (keys) of summary dataframe (dictionary)
dataKeys = ['pdbid'] + simpleKeys + entityKeys + ['assemblies'] + assemblyKeys

dataDict = { k:[] for k in dataKeys }
for pdbid,entry in reportDict.items():
    summary = entry[0]  # the code below only ever reads the first record of each entry
    dataDict['pdbid'].append(pdbid)
    for k in simpleKeys:
        dataDict[k].append(summary[k])
    for k in entityKeys:
        dataDict[k].append(summary['number_of_entities'][k])
    # the 'assemblies' column holds the number of assemblies, not the assemblies themselves
    dataDict['assemblies'].append(len(summary['assemblies']))

    # now extract data from the preferred assembly: the Protein Data Bank in Europe (PDBe)
    # defines the preferred assembly as the smallest assembly containing all polymeric entities.
    #
    # Exactly one value per assembly column must be appended for every entry,
    # otherwise the column lists end up with different lengths (pd.DataFrame
    # then raises) or rows silently shift against their pdbid. So take the
    # first assembly flagged as preferred, and fall back to None when there is none.
    preferred = next((d for d in summary['assemblies'] if d['preferred']), None)
    for ak in assemblyKeys:
        dataDict[ak].append(None if preferred is None else preferred[ak])

dataDf = pd.DataFrame(dataDict)

In [None]:
# Display the summary dataframe built from dataDict (one row per downloaded entry).
dataDf

In [None]:
# download preferred assembly files
assemblyDirectory = '../DATA/db/assemblies'
downloadTimeout = 60  # seconds; without a timeout a stalled connection hangs the cell forever

os.makedirs(assemblyDirectory,exist_ok=True)
# NOTE(review): only entries whose preferred assembly id is '4' are downloaded,
# which does not match the stated purpose of fetching every preferred assembly;
# this looks like a leftover test filter - confirm before relying on the output.
for i in dataDf[ dataDf['assembly_id'] == '4' ].index:
    code=dataDf.at[i,'pdbid']
    assembly=dataDf.at[i,'assembly_id']
    fileName = code + '-assembly' + assembly + '.cif'
    url = 'https://files.rcsb.org/download/' + fileName
    print(url)
    download = requests.get(url, timeout=downloadTimeout)
    if not download.ok:
        # Do not save an HTTP error page under a .cif name; report and move on.
        print('  download failed with HTTP status', download.status_code, '- skipped')
        continue
    # Write the raw bytes so the file is stored exactly as served, independent
    # of the guessed response encoding and of the local default text encoding.
    with open( os.path.join(assemblyDirectory,fileName), 'wb' ) as f:
        f.write( download.content )

# Download PDBe molecule records
The molecule records are used to determine which chains in each entry are protein and which are DNA.

Endpoint: https://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/:pdbid

Example: https://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/1ph7

In [5]:
# Download the PDBe molecule records for every code in pdbCodes.
# PDBe only allows batches of up to 1000 ids per request, so the codes are
# posted in batches of batchSize.
batchSize = 100  # must not exceed 1000
requestTimeout = 60  # seconds; without a timeout a stalled connection hangs the cell forever

urlPrefix = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/'
pdbCodesBatched = batched( pdbCodes, batchSize ) # split the list of codes into batches of batchSize
# NOTE: this rebinds reportDict, replacing any dictionary stored under that name earlier.
reportDict = {}  # pdb id -> list of molecule records parsed from the JSON response
for batch in pdbCodesBatched:
    print('downloading batch...', end='')
    codeString = ','.join(batch)
    report = requests.post(urlPrefix, data=codeString, timeout=requestTimeout)
    if report.status_code == 404:
        # NOTE(review): PDBe appears to answer 404 when none of the posted ids
        # is known - treated here as an empty batch; confirm against the API docs.
        batchReport = {}
    else:
        # Fail with the real HTTP error instead of a confusing JSON decode error
        # caused by trying to parse an error page.
        report.raise_for_status()
        batchReport = report.json()
    reportDict.update( batchReport )
    print(len(reportDict),'total entries')
print(len(reportDict),'entries downloaded')

downloading batch...100 total entries
downloading batch...200 total entries
downloading batch...300 total entries
downloading batch...400 total entries
downloading batch...500 total entries
downloading batch...600 total entries
downloading batch...700 total entries
downloading batch...800 total entries
downloading batch...900 total entries
downloading batch...1000 total entries
1000 entries downloaded


In [7]:
# Collect, for every downloaded entry, the chain ids of its protein molecules
# and of its DNA molecules; molecules of any other type are ignored.
chainsDict = {'pdbid': [], 'protein': [], 'dna': []}
for pdbid, molecules in reportDict.items():
    proteinIds, dnaIds = [], []
    for mol in molecules:
        kind = mol['molecule_type']
        if kind == 'polypeptide(L)':
            proteinIds.extend(mol['in_chains'])
        elif kind == 'polydeoxyribonucleotide':
            dnaIds.extend(mol['in_chains'])
    # one row per entry: the id plus its two chain lists
    for column, value in (('pdbid', pdbid), ('protein', proteinIds), ('dna', dnaIds)):
        chainsDict[column].append(value)
    

In [9]:
# Turn the collected lists into a dataframe with columns 'pdbid', 'protein' and 'dna'.
chainsDf = pd.DataFrame(chainsDict)

In [11]:
# Display the chain table (one row per entry, with its protein and DNA chain lists).
chainsDf

Unnamed: 0,pdbid,protein,dna
0,1bss,"[A, B]","[C, D]"
1,1c7u,"[A, B]","[C, D]"
2,1bnz,[A],"[B, C]"
3,1ca5,[A],"[B, C]"
4,1c8c,[A],"[B, C]"
...,...,...,...
995,2cdm,"[A, C]","[B, D]"
996,2e52,"[A, B, C, D]","[E, F, G, H, I, J]"
997,2e2h,"[A, B, C, E, F, H, I, J, K, L]","[T, N]"
998,2e2i,"[A, B, C, E, F, H, I, J, K, L]","[T, N]"


In [13]:
# Save the chain table to CSV in the current working directory.
# With the default to_csv settings the row index is written too, as a first
# column without a header.
# NOTE(review): the 'protein' and 'dna' cells are Python lists and are presumably
# written as their string representation - confirm before reading the file back.
chainsDf.to_csv('summaryChains.csv')