In [None]:
import requests
import json
import pandas as pd
from itertools import batched
import os

In [None]:
# Read the list of PDB ids that the rest of the notebook works on.
pdbCodeFile = './pdbListAll.txt'

# Upper bound on the number of ids kept for this run.
maxCodes = 1500

# The file contains a comma separated list of pdb ids.
with open(pdbCodeFile) as f:
    fileRead = f.read()

# Strip every id individually and drop empty fields, so that spaces or line
# breaks after a comma, a trailing comma, or an empty file do not produce
# malformed or empty ids.
pdbCodes = [code.strip() for code in fileRead.split(',') if code.strip()]

# Alternative input format (kept for reference, not executed):
# a dsv file with one column labeled 'pdbid'
#   df = pd.read_csv(pdbCodeFile)
#   pdbCodes = list(df['pdbid'])

pdbCodes = pdbCodes[:maxCodes]

print(len(pdbCodes),'codes\n')

In [None]:
# Download the entry summaries for all ids in pdbCodes, several ids per POST
# request. PDBe only allows batches up to 1000 ids at a time.
batchSize = 100  # keep below the PDBe limit of 1000 ids per request
requestTimeout = 60  # seconds; without a timeout a stalled connection blocks forever

urlPrefix = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/'
pdbCodesBatched = batched( pdbCodes, batchSize ) # split the list of codes into batches of batchSize
reportDict = {}
for batch in pdbCodesBatched:
    print('downloading batch...', end='')
    codeString = ','.join(batch)
    report = requests.post(urlPrefix, data=codeString, timeout=requestTimeout)
    if not report.ok:
        # An error response does not carry the expected JSON mapping; report it
        # and go on with the next batch instead of failing while parsing it.
        print('failed with HTTP status', report.status_code, '- batch skipped')
        continue
    # the response body is a JSON object keyed by pdb id
    reportDict.update( report.json() )
    print(len(reportDict),'total entries')
print(len(reportDict),'entries downloaded')

In [None]:
# Create the summary dataframe from the downloaded dictionary: one row per
# entry in reportDict.

# keys in downloaded dictionary (kept as a reference of what is available)
entryKeys = ['title', 'processing_site', 'deposition_site', 'deposition_date', 'release_date', 'revision_date',
             'experimental_method_class', 'experimental_method', 'split_entry', 'related_structures', 'entry_authors',
             'number_of_entities', 'assemblies']

# keys with numerical, string or list values
simpleKeys = ['title', 'deposition_date', 'experimental_method', 'related_structures']

# keys of the sub-dictionaries that are the values associated with keys 'number_of_entities' and 'assemblies'
entityKeys = ['water', 'polypeptide', 'dna', 'rna', 'dna/rna', 'sugar', 'ligand', 'carbohydrate_polymer', 'other']
assemblyKeys = ['assembly_id', 'name', 'form']

# columns (keys) of summary dataframe (dictionary)
dataKeys = ['pdbid'] + simpleKeys + entityKeys + ['assemblies'] + assemblyKeys

dataDict = { k:[] for k in dataKeys }
for pdbid, entry in reportDict.items():
    summary = entry[0]  # the value stored per pdb id is a list; its first element is used
    dataDict['pdbid'].append(pdbid)
    for k in simpleKeys:
        dataDict[k].append(summary[k])
    for k in entityKeys:
        dataDict[k].append(summary['number_of_entities'][k])
    assemblies = summary['assemblies']
    dataDict['assemblies'].append(len(assemblies))

    # Extract data from the preferred assembly: the Protein Data Bank in Europe (PDBe)
    # defines the preferred assembly as the smallest assembly containing all polymeric entities.
    # Exactly one value per assembly column is appended for every entry, so all
    # columns keep the same length and stay aligned with 'pdbid': the first
    # assembly flagged as preferred is used, and None is stored when there is none.
    preferred = next((d for d in assemblies if d['preferred']), None)
    for ak in assemblyKeys:
        dataDict[ak].append(preferred[ak] if preferred is not None else None)

dataDf = pd.DataFrame(dataDict)

In [None]:
# show the summary dataframe built from the downloaded entries
dataDf

In [None]:
# show the 'assembly_id' column (id of the preferred assembly of each entry)
dataDf['assembly_id']

In [None]:
# rows whose entry lists more than four assemblies
dataDf.loc[dataDf['assemblies'].gt(4)]

In [None]:
# descriptive statistics of the summary dataframe
dataDf.describe()

In [None]:
# assemblies of entry '1c9b', used by the inspection cells that follow;
# raises KeyError if '1c9b' is not among the downloaded entries
dic=reportDict['1c9b'][0]['assemblies']

In [None]:
# print the 'preferred' flag of every assembly in dic
for asm in dic:
    print(asm['preferred'])

In [None]:
# print the complete record of each assembly in dic that is flagged as preferred
for asm in (candidate for candidate in dic if candidate['preferred']):
    print(asm)

In [None]:
for b in it.batched(a,3):
    print(b)

In [None]:
# show the pdb id and the preferred assembly id of entry '2oyq'
# (row selection and column selection done in a single .loc call)
dataDf.loc[dataDf['pdbid'] == '2oyq', ['pdbid', 'assembly_id']]

In [None]:
# Download preferred assembly files (mmCIF) into assemblyDirectory.
assemblyDirectory = '../DATA/db/assemblies'

os.makedirs(assemblyDirectory,exist_ok=True)
# NOTE(review): only entries whose preferred assembly id is '4' are downloaded;
# this looks like a leftover restriction from testing - confirm before a full run.
for i in dataDf[ dataDf['assembly_id'] == '4' ].index:
    code = dataDf.at[i,'pdbid']
    assembly = dataDf.at[i,'assembly_id']
    fileName = code + '-assembly' + assembly + '.cif'
    url = 'https://files.rcsb.org/download/' + fileName
    print(url)
    download = requests.get(url, timeout=60)
    if not download.ok:
        # do not store an HTTP error page under a .cif file name
        print('  download failed with HTTP status', download.status_code, '- skipped')
        continue
    # write the raw bytes: no dependence on guessed text encoding or on
    # platform specific newline translation
    with open( os.path.join(assemblyDirectory,fileName), 'wb' ) as f:
        f.write( download.content )

In [None]:
# rows whose preferred assembly has the id '4'
dataDf.loc[dataDf['assembly_id'].eq('4')]

In [None]:
# descriptive statistics of the summary dataframe
# (the call was missing its closing parenthesis)
dataDf.describe()

In [None]:
# column names, non-null counts and dtypes of the summary dataframe;
# info is a method and has to be called to print this overview
dataDf.info()

In [None]:
# plain-text rendering of the summary dataframe
print(dataDf)