In [252]:
import requests
import json
import pandas as pd
from itertools import batched
import os

In [104]:
# read in list of pdb codes for initial consideration
pdbCodeFile = './pdbListAll.txt'

# file contains comma separated list of pdb ids
with open(pdbCodeFile) as f:
    fileRead=f.read()
pdbCodes = fileRead.strip().split(',')

'''
# dsv file with one column labeled 'pdbid'
df = pd.read_csv(pdbCodeFile)
pdbCodes=list(df['pdbid'])
'''

pdbCodes = pdbCodes[:1500]

print(len(pdbCodes),'codes\n')

1500 codes



In [106]:
# download multiple entries, taking care that PDBe only allows batches up to 1000 at a time
batchSize = 100  # must be less than 1000

urlPrefix = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/'
pdbCodesBatched = batched( pdbCodes, batchSize ) # first separate list of codes into batches of 1000 
reportDict = {}
for batch in pdbCodesBatched:
    print('downloading batch...', end='')
    codeString = ','.join(batch)
    report=requests.post(urlPrefix,data=codeString)
    reportDict.update( json.loads(report.text) )
    print(len(reportDict),'total entries')
print(len(reportDict),'entries downloaded')

downloading batch...100 total entries
downloading batch...200 total entries
downloading batch...300 total entries
downloading batch...400 total entries
downloading batch...500 total entries
downloading batch...600 total entries
downloading batch...700 total entries
downloading batch...800 total entries
downloading batch...900 total entries
downloading batch...1000 total entries
downloading batch...1100 total entries
downloading batch...1200 total entries
downloading batch...1300 total entries
downloading batch...1400 total entries
downloading batch...1500 total entries
1500 entries downloaded


In [160]:
# create summary dataframe from downloaded dictionary

# keys in downloaded dictionary
entryKeys = ['title', 'processing_site', 'deposition_site', 'deposition_date', 'release_date', 'revision_date', \
             'experimental_method_class', 'experimental_method', 'split_entry', 'related_structures', 'entry_authors', \
             'number_of_entities', 'assemblies']

# keys with numerical, string or list values
simpleKeys = ['title', 'deposition_date', 'experimental_method', 'related_structures']

# keys of the sub-dictionaries that are the values associated with keys 'number_of_entities' and 'assemblies'
entityKeys = ['water', 'polypeptide', 'dna', 'rna', 'dna/rna', 'sugar', 'ligand', 'carbohydrate_polymer', 'other']
assemblyKeys = ['assembly_id', 'name', 'form']

# columns (keys) of summary dataframe (dictionary)
dataKeys = ['pdbid'] + simpleKeys + entityKeys + ['assemblies'] + assemblyKeys

dataDict = { k:[] for k in dataKeys } 
for pdbid,entry in reportDict.items():
    dataDict['pdbid'].append(pdbid)
    for k in simpleKeys:
        dataDict[k].append(entry[0][k])
    for k in entityKeys:
        dataDict[k].append(entry[0]['number_of_entities'][k])
    dataDict['assemblies'].append(len(entry[0]['assemblies']))

    # now go through the assemblies and extract data from preferred assembly: the Protein Data Bank in Europe (PDBe)
    # defines the preferred assembly as the smallest assembly containing all polymeric entities.

    for d in entry[0]['assemblies']:
        if d['preferred']:
            for ak in assemblyKeys:
                dataDict[ak].append(d[ak])

dataDf = pd.DataFrame(dataDict)  

In [162]:
dataDf

Unnamed: 0,pdbid,title,deposition_date,experimental_method,related_structures,water,polypeptide,dna,rna,dna/rna,sugar,ligand,carbohydrate_polymer,other,assemblies,assembly_id,name,form
0,10mh,TERNARY STRUCTURE OF HHAI METHYLTRANSFERASE WI...,19980810,[X-ray diffraction],[],1,1,2,0,0,0,1,0,0,1,1,trimer,hetero
1,173d,MULTIPLE BINDING MODES OF ANTICANCER DRUG ACTI...,19940418,[X-ray diffraction],[],1,1,1,0,0,0,0,0,0,2,1,tetramer,hetero
2,185d,SEQUENCE SPECIFICITY OF QUINOXALINE ANTIBIOTIC...,19940810,[Solution NMR],[],0,1,1,0,0,0,1,0,0,1,1,trimer,hetero
3,193d,SOLUTION STRUCTURE OF A QUINOMYCIN BISINTERCAL...,19940930,[Solution NMR],[],0,1,1,0,0,0,1,0,0,1,1,trimer,hetero
4,1a02,"STRUCTURE OF THE DNA BINDING DOMAINS OF NFAT, ...",19971208,[X-ray diffraction],[],1,3,2,0,0,0,0,0,0,1,1,pentamer,hetero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,3c2i,The Crystal Structure of Methyl-CpG Binding Do...,20080125,[X-ray diffraction],[],1,1,2,0,0,0,0,0,0,1,1,trimer,hetero
1496,3c2k,DNA POLYMERASE BETA with a gapped DNA substrat...,20080125,[X-ray diffraction],[],1,1,3,0,0,0,4,0,0,1,1,tetramer,hetero
1497,3c2l,Ternary complex of DNA POLYMERASE BETA with a ...,20080125,[X-ray diffraction],[],1,1,3,0,0,0,3,0,0,1,1,tetramer,hetero
1498,3c2m,Ternary complex of DNA POLYMERASE BETA with a ...,20080125,[X-ray diffraction],[],1,1,3,0,0,0,4,0,0,1,1,tetramer,hetero


In [178]:
dataDf['assembly_id']

0       1
1       1
2       1
3       1
4       1
       ..
1495    1
1496    1
1497    1
1498    1
1499    1
Name: assembly_id, Length: 1500, dtype: object

In [114]:
dataDf[ dataDf['assemblies'] >4 ]

Unnamed: 0,pdbid,title,deposition_date,experimental_method,related_structures,water,polypeptide,dna,rna,dna/rna,sugar,ligand,carbohydrate_polymer,other,assemblies
77,1c9b,CRYSTAL STRUCTURE OF A HUMAN TBP CORE DOMAIN-H...,19990801,[X-ray diffraction],[],1,2,2,0,0,0,0,0,0,5
177,1g3x,INTERCALATION OF AN 9ACRIDINE-PEPTIDE DRUG IN ...,20001025,[X-ray diffraction],[],1,1,1,0,0,0,1,0,0,6
509,1pp8,crystal structure of the T. vaginalis IBP39 In...,20030616,[X-ray diffraction],[],0,1,2,0,0,0,1,0,0,6
560,1qzh,Crystal structure of Pot1 (protection of telom...,20030916,[X-ray diffraction],[],1,1,1,0,0,0,0,0,0,6
785,1yfj,T4Dam in Complex with AdoHcy and 15-mer Oligon...,20050102,[X-ray diffraction],[],1,1,1,0,0,0,3,0,0,6
1288,2qkk,Human RNase H catalytic domain mutant D210N in...,20070711,[X-ray diffraction],[],1,1,1,1,0,0,4,0,0,6


In [172]:
dataDf.describe()

Unnamed: 0,water,polypeptide,dna,rna,dna/rna,sugar,ligand,carbohydrate_polymer,other,assemblies
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.8,1.388,1.762667,0.031333,0.004,0.0,1.088667,0.026,0.0,1.258667
std,0.400133,1.34858,0.626046,0.174275,0.072944,0.0,1.129154,0.159188,0.0,0.637727
min,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0
max,1.0,13.0,8.0,1.0,2.0,0.0,8.0,1.0,0.0,6.0


In [146]:
dic=reportDict['1c9b'][0]['assemblies']

In [154]:
for d in dic:
    print(d['preferred'])

True
False
False
False
False


In [156]:
for d in dic:
    if d['preferred']:
        print(d)

{'assembly_id': '1', 'name': 'tetramer', 'form': 'hetero', 'preferred': True}


In [54]:
for b in it.batched(a,3):
    print(b)

(0, 1, 2)
(3, 4, 5)
(6, 7, 8)
(9,)


In [190]:
dataDf[ dataDf['pdbid']=='2oyq'][


#dataDf['pdbid','assembly_id']]:

Unnamed: 0,pdbid,title,deposition_date,experimental_method,related_structures,water,polypeptide,dna,rna,dna/rna,sugar,ligand,carbohydrate_polymer,other,assemblies,assembly_id,name,form
1240,2oyq,Crystal structure of RB69 gp43 in complex with...,20070222,[X-ray diffraction],[],1,1,2,0,0,0,2,0,0,4,3,trimer,hetero


In [268]:
# download preferred assembly files
assemblyDirectory = '../DATA/db/assemblies'

os.makedirs(assemblyDirectory,exist_ok=True)
for i in dataDf[ dataDf['assembly_id'] == '4' ].index:
    code=dataDf.at[i,'pdbid']
    assembly=dataDf.at[i,'assembly_id']
    fileName = code + '-assembly' + assembly + '.cif'
    url = 'https://files.rcsb.org/download/' + fileName
    download = requests.get(url)
    print(url)
    with open( os.path.join(assemblyDirectory,fileName), 'w' ) as f:
        f.write( download.text )

https://files.rcsb.org/download/1rzt-assembly4.cif
https://files.rcsb.org/download/2dtu-assembly4.cif
https://files.rcsb.org/download/2pzs-assembly4.cif


In [266]:
dataDf[ dataDf['assembly_id'] == '4' ]

Unnamed: 0,pdbid,title,deposition_date,experimental_method,related_structures,water,polypeptide,dna,rna,dna/rna,sugar,ligand,carbohydrate_polymer,other,assemblies,assembly_id,name,form
615,1rzt,Crystal structure of DNA polymerase lambda com...,20031229,[X-ray diffraction],[],1,1,3,0,0,0,2,0,0,4,4,tetramer,hetero
948,2dtu,Crystal structure of the beta hairpin loop del...,20060715,[X-ray diffraction],[],1,1,2,0,0,0,0,0,0,4,4,trimer,hetero
1278,2pzs,Phi29 DNA polymerase complexed with primer-tem...,20070518,[X-ray diffraction],[],1,1,2,0,0,0,0,0,0,4,4,trimer,hetero


In [278]:
dataDf.describe(

Unnamed: 0,water,polypeptide,dna,rna,dna/rna,sugar,ligand,carbohydrate_polymer,other,assemblies
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.8,1.388,1.762667,0.031333,0.004,0.0,1.088667,0.026,0.0,1.258667
std,0.400133,1.34858,0.626046,0.174275,0.072944,0.0,1.129154,0.159188,0.0,0.637727
min,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0
max,1.0,13.0,8.0,1.0,2.0,0.0,8.0,1.0,0.0,6.0


In [282]:
dataDf.info

<bound method DataFrame.info of      pdbid                                              title deposition_date  \
0     10mh  TERNARY STRUCTURE OF HHAI METHYLTRANSFERASE WI...        19980810   
1     173d  MULTIPLE BINDING MODES OF ANTICANCER DRUG ACTI...        19940418   
2     185d  SEQUENCE SPECIFICITY OF QUINOXALINE ANTIBIOTIC...        19940810   
3     193d  SOLUTION STRUCTURE OF A QUINOMYCIN BISINTERCAL...        19940930   
4     1a02  STRUCTURE OF THE DNA BINDING DOMAINS OF NFAT, ...        19971208   
...    ...                                                ...             ...   
1495  3c2i  The Crystal Structure of Methyl-CpG Binding Do...        20080125   
1496  3c2k  DNA POLYMERASE BETA with a gapped DNA substrat...        20080125   
1497  3c2l  Ternary complex of DNA POLYMERASE BETA with a ...        20080125   
1498  3c2m  Ternary complex of DNA POLYMERASE BETA with a ...        20080125   
1499  3c2p  X-ray crystal structure of the N4 mini-vRNAP P...        20080125

In [288]:
print(dataDf)

     pdbid                                              title deposition_date  \
0     10mh  TERNARY STRUCTURE OF HHAI METHYLTRANSFERASE WI...        19980810   
1     173d  MULTIPLE BINDING MODES OF ANTICANCER DRUG ACTI...        19940418   
2     185d  SEQUENCE SPECIFICITY OF QUINOXALINE ANTIBIOTIC...        19940810   
3     193d  SOLUTION STRUCTURE OF A QUINOMYCIN BISINTERCAL...        19940930   
4     1a02  STRUCTURE OF THE DNA BINDING DOMAINS OF NFAT, ...        19971208   
...    ...                                                ...             ...   
1495  3c2i  The Crystal Structure of Methyl-CpG Binding Do...        20080125   
1496  3c2k  DNA POLYMERASE BETA with a gapped DNA substrat...        20080125   
1497  3c2l  Ternary complex of DNA POLYMERASE BETA with a ...        20080125   
1498  3c2m  Ternary complex of DNA POLYMERASE BETA with a ...        20080125   
1499  3c2p  X-ray crystal structure of the N4 mini-vRNAP P...        20080125   

      experimental_method r