In [62]:
import requests
import json
import pandas as pd
from itertools import batched

In [104]:
# read in list of pdb codes for initial consideration
pdbCodeFile = './pdbListAll.txt'
maxCodes = 1500  # only the first maxCodes ids are kept

# file contains comma separated list of pdb ids
with open(pdbCodeFile) as f:
    fileRead = f.read()
# Strip whitespace/newlines around every id and drop empty fields (e.g. from a
# trailing comma or an empty file) so no blank or padded id ends up in the list.
pdbCodes = [code.strip() for code in fileRead.split(',') if code.strip()]

# alternative input format: dsv file with one column labeled 'pdbid'
# df = pd.read_csv(pdbCodeFile)
# pdbCodes = list(df['pdbid'])

pdbCodes = pdbCodes[:maxCodes]

print(len(pdbCodes),'codes\n')

1500 codes



In [106]:
# download multiple entries, taking care that PDBe only allows batches up to 1000 at a time
batchSize = 100  # must be less than 1000
requestTimeout = 60  # seconds to wait on the server before giving up on a request

urlPrefix = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/'
pdbCodesBatched = batched( pdbCodes, batchSize ) # lazily split the list of codes into batches of batchSize
reportDict = {}
for batch in pdbCodesBatched:
    print('downloading batch...', end='')
    # the ids of one batch are sent as a comma separated string in the POST body
    codeString = ','.join(batch)
    report = requests.post(urlPrefix, data=codeString, timeout=requestTimeout)
    # Stop on an HTTP error status instead of trying to parse an error page as
    # JSON; a failed batch would otherwise leave the summary silently incomplete.
    report.raise_for_status()
    # merge the decoded JSON object of this batch into the overall dictionary
    reportDict.update( report.json() )
    print(len(reportDict),'total entries')
print(len(reportDict),'entries downloaded')

downloading batch...100 total entries
downloading batch...200 total entries
downloading batch...300 total entries
downloading batch...400 total entries
downloading batch...500 total entries
downloading batch...600 total entries
downloading batch...700 total entries
downloading batch...800 total entries
downloading batch...900 total entries
downloading batch...1000 total entries
downloading batch...1100 total entries
downloading batch...1200 total entries
downloading batch...1300 total entries
downloading batch...1400 total entries
downloading batch...1500 total entries
1500 entries downloaded


In [108]:
# create summary dataframe (one row per downloaded entry) from downloaded dictionary

# keys in downloaded dictionary
entryKeys = [
    'title', 'processing_site', 'deposition_site', 'deposition_date', 'release_date',
    'revision_date', 'experimental_method_class', 'experimental_method', 'split_entry',
    'related_structures', 'entry_authors', 'number_of_entities', 'assemblies',
]

# keys with numerical, string or list values -- copied into the table unchanged
simpleKeys = ['title', 'deposition_date', 'experimental_method', 'related_structures']

# keys of the sub-dictionaries that are the values associated with keys 'number_of_entities' and 'assemblies'
entityKeys = ['water', 'polypeptide', 'dna', 'rna', 'dna/rna', 'sugar', 'ligand', 'carbohydrate_polymer', 'other']
assemblyKeys = ['assembly_id', 'name', 'form', 'preferred']

# columns (keys) of summary dataframe (dictionary)
dataKeys = ['pdbid', *simpleKeys, *entityKeys, 'assemblies']

# each value of reportDict is a list; its first element is the summary dict that is used
firstSummaries = {code: summaryList[0] for code, summaryList in reportDict.items()}

# assemble the table column by column, in the same order as dataKeys
dataDict = {'pdbid': list(firstSummaries)}
dataDict.update(
    {key: [summary[key] for summary in firstSummaries.values()] for key in simpleKeys}
)
dataDict.update(
    {key: [summary['number_of_entities'][key] for summary in firstSummaries.values()]
     for key in entityKeys}
)
# the 'assemblies' column holds the number of assemblies, not the assembly records
dataDict['assemblies'] = [len(summary['assemblies']) for summary in firstSummaries.values()]

dataDf = pd.DataFrame(dataDict)

In [110]:
# show the summary dataframe (bare last expression, so the notebook renders it as a table)
dataDf

Unnamed: 0,pdbid,title,deposition_date,experimental_method,related_structures,water,polypeptide,dna,rna,dna/rna,sugar,ligand,carbohydrate_polymer,other,assemblies
0,10mh,TERNARY STRUCTURE OF HHAI METHYLTRANSFERASE WI...,19980810,[X-ray diffraction],[],1,1,2,0,0,0,1,0,0,1
1,173d,MULTIPLE BINDING MODES OF ANTICANCER DRUG ACTI...,19940418,[X-ray diffraction],[],1,1,1,0,0,0,0,0,0,2
2,185d,SEQUENCE SPECIFICITY OF QUINOXALINE ANTIBIOTIC...,19940810,[Solution NMR],[],0,1,1,0,0,0,1,0,0,1
3,193d,SOLUTION STRUCTURE OF A QUINOMYCIN BISINTERCAL...,19940930,[Solution NMR],[],0,1,1,0,0,0,1,0,0,1
4,1a02,"STRUCTURE OF THE DNA BINDING DOMAINS OF NFAT, ...",19971208,[X-ray diffraction],[],1,3,2,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,3c2i,The Crystal Structure of Methyl-CpG Binding Do...,20080125,[X-ray diffraction],[],1,1,2,0,0,0,0,0,0,1
1496,3c2k,DNA POLYMERASE BETA with a gapped DNA substrat...,20080125,[X-ray diffraction],[],1,1,3,0,0,0,4,0,0,1
1497,3c2l,Ternary complex of DNA POLYMERASE BETA with a ...,20080125,[X-ray diffraction],[],1,1,3,0,0,0,3,0,0,1
1498,3c2m,Ternary complex of DNA POLYMERASE BETA with a ...,20080125,[X-ray diffraction],[],1,1,3,0,0,0,4,0,0,1


In [None]:
# TODO: now go through the entries and ... (this step has not been written yet)

In [None]:
# display the summary dataframe
dataDf

In [None]:
# Save the summary table. index=False: to_csv writes the row index by default, which
# would add an unnamed first column that shows up as 'Unnamed: 0' when the file is read back.
dataDf.to_csv('pdbListAll.csv', index=False)

In [32]:
# scratch check: write the integers 0 through 9 to stdout, one per line
print(*range(10), sep='\n')

0
1
2
3
4
5
6
7
8
9


In [54]:
for b in it.batched(a,3):
    print(b)

(0, 1, 2)
(3, 4, 5)
(6, 7, 8)
(9,)


In [56]:
# batched() is lazy: evaluated on its own it shows the iterator object, not the batches.
# `a` is the sample sequence used in the previous cell; `batched` is the name imported
# at the top of the notebook (the alias `it` is never imported).
batched(a, 3)

<itertools.batched at 0x776fdef73bb0>