In [1]:
# !./batch_download.sh -f pdblist -p -o compressed_pdbs

# Import modules

In [2]:
import os
import pandas as pd

# List all files including PDBs

In [3]:
allfiles = os.listdir('./')

In [4]:
realpdbs = []
for file in os.listdir("./"):
    if file.endswith(".pdb"):
        realpdbs.append(file)

In [5]:
print("All files:", len(allfiles), "\nPDB files:", len(realpdbs))

All files: 454 
PDB files: 446


# Import Biopython PDB parser

In [6]:
from Bio.PDB.PDBParser import PDBParser

In [7]:
parser = PDBParser(PERMISSIVE=1)

# Single file

In [8]:
structure_id = "1AQ1"
filename = "1AQ1.pdb"
structure = parser.get_structure(structure_id, filename)

In [9]:
resolution = structure.header["resolution"]

In [10]:
structure.header["name"]

'human cyclin dependent kinase 2 complexed with the inhibitor staurosporine'

# All pdb files for CDK2

## CDK2 name list

In [11]:
cdk2_list = []

for i in realpdbs:
    filename = i
    structure_id = i.strip(".pdb")
    structure = parser.get_structure(structure_id, filename)
    cdk2_list.append(structure.header["name"])































In [12]:
len(cdk2_list)

446

## CDK2 PDB files resolution

In [13]:
resolution = []

for i in realpdbs:
    filename = i
    structure_id = i.strip(".pdb")
    structure = parser.get_structure(structure_id, filename)
    resolution.append(structure.header["resolution"])































In [14]:
len(resolution)

446

# Extract Publish year

## single file

In [15]:
structure_id = "1AQ1"
filename = "1AQ1.pdb"

with open(filename) as f:
    first_line = f.readline()

In [16]:
first_line

'HEADER    PROTEIN KINASE                          05-AUG-97   1AQ1              \n'

In [17]:
publish_date_single = first_line.split()[-2]
print(publish_date_single[-2:])

97


## Publish Date for CDK2 PDb files

In [18]:
publish_date = []

for i in realpdbs:
    with open(i) as f:
        first_line = f.readline()
        release_date = first_line.split()[-2]
        publish_date.append(release_date[-2:])

In [19]:
publish_date

['97',
 '98',
 '98',
 '98',
 '98',
 '99',
 '99',
 '00',
 '00',
 '00',
 '00',
 '96',
 '00',
 '00',
 '00',
 '00',
 '01',
 '01',
 '01',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '96',
 '96',
 '96',
 '96',
 '01',
 '01',
 '01',
 '01',
 '01',
 '01',
 '01',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '99',
 '03',
 '03',
 '03',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '06',
 '06',
 '06',
 '06',
 '06',
 '06',
 '05',
 '06',
 '06',
 '06',
 '06',
 '06',
 '06',
 '06',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',

# Extract chains

In [20]:
from Bio import *

In [21]:
from __future__ import print_function

In [22]:
!pip install pdb-tools

Defaulting to user installation because normal site-packages is not writeable


In [23]:
chain_id = !sed -n '/^ATOM/p' 1AQ1.pdb | awk '{ print $5 }' | uniq

In [24]:
chain_ids = []
for i in realpdbs:
    !sed -n '/^ATOM/p' 1AQ1.pdb | awk '{ print $5 }' | uniq

A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A
A


In [None]:
# import gzip
import warnings
from pathlib import Path
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB import PDBParser

# To get rid of those annoying warnings like 'WARNING: Chain B is discontinuous at line 4059.'
warnings.simplefilter('ignore', PDBConstructionWarning)

parser = PDBParser()

if __name__ == "__main__":
    pdb_zips = Path("zipped_pdbs").glob('**/*.ent.gz')
    for pdb_filename in pdb_zips:
        with gzip.open(pdb_filename, "rt") as file_handle:
            structure = parser.get_structure("?", file_handle)
        # you could of course parse the pdb code from the file name as well. 
        # But I found this to be easier implemented.       
        pdb_code = structure.header.get("idcode")
        resolution = structure.header.get("resolution")

        for chain in structure.get_chains():
            print(f"{pdb_code}  {chain.id}  {resolution}")

In [43]:
# import gzip
import warnings
from pathlib import Path
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB import PDBParser

# To get rid of those annoying warnings like 'WARNING: Chain B is discontinuous at line 4059.'
warnings.simplefilter('ignore', PDBConstructionWarning)

parser = PDBParser()
chain_id = []
if __name__ == "__main__":
#     pdb_zips = Path("zipped_pdbs").glob('**/*.ent.gz')
    for pdb_filename in realpdbs:
        with open(pdb_filename, "rt") as file_handle:
            structure = parser.get_structure("?", file_handle)
        # you could of course parse the pdb code from the file name as well. 
        # But I found this to be easier implemented.       
        pdb_code = structure.header.get("idcode")
#         resolution = structure.header.get("resolution")

        for chain in structure.get_chains():
            chain_id.append(pdb_code+chain.id)

In [46]:
len(chain_id)

881

# Digest to make Dataframe

In [None]:
digest = {'PDB_ID': realpdbs,
        'Title': cdk2_list,
         'Resolution': resolution,
          'Publish_date': publish_date,
         }

In [None]:
# set pandas option to dsplay all rows
pd.set_option('display.max_rows', None)

## Create Dataframe

In [None]:
pdb_df = pd.DataFrame(digest)

In [45]:
pdb_df

NameError: name 'pdb_df' is not defined

In [None]:
# check Dataframe dimension
pdb_df.shape

## Sort Dataframe based on Resolution 

In [None]:
pdb_df_sorted_by_res = pdb_df.sort_values("Resolution")

In [None]:
pdb_df_sorted_by_res

## Sort Dataframe based on the year

In [None]:
pdb_df_sorted_by_year = pdb_df.sort_values("Publish_date")

In [None]:
pdb_df_sorted_by_year

# Plotting

In [None]:
# !pip install matplotlib

In [None]:
# from matplotlib import *

In [None]:
# pdb_df.plot(kind = 'hist')

In [None]:
# !pip install scipy

In [None]:
# # Libraries
# from matplotlib import pyplot as plt
# from scipy.cluster.hierarchy import dendrogram, linkage
# import numpy as np
 
# # Data set
# df = pdb_df.set_index('Resolution')
 
# # Calculate the distance between each sample
# Z = linkage(pdb_df, 'ward')
 
# # Control number of clusters in the plot + add horizontal line.
# dendrogram(Z, color_threshold=240)
# plt.axhline(y=240, c='grey', lw=1, linestyle='dashed')

# # Show the graph
# plt.show()