In [1]:
# !./batch_download.sh -f pdblist -p -o compressed_pdbs

# Import modules

In [1]:
import os
import pandas as pd

# List all files including PDBs

In [2]:
allfiles = os.listdir('./')

In [3]:
realpdbs = []
for file in os.listdir("./"):
    if file.endswith(".pdb"):
        realpdbs.append(file)

In [4]:
print("All files:", len(allfiles), "\nPDB files:", len(realpdbs))

All files: 459 
PDB files: 446


# Import Biopython PDB parser

In [5]:
from Bio.PDB.PDBParser import PDBParser

In [6]:
parser = PDBParser(PERMISSIVE=1)

# Single file

In [7]:
structure_id = "1AQ1"
filename = "1AQ1.pdb"
structure = parser.get_structure(structure_id, filename)

In [8]:
resolution = structure.header["resolution"]

In [9]:
structure.header["name"]

'human cyclin dependent kinase 2 complexed with the inhibitor staurosporine'

# All pdb files for CDK2

## CDK2 name list

In [10]:
cdk2_list = []

for i in realpdbs:
    filename = i
    structure_id = i.strip(".pdb")
    structure = parser.get_structure(structure_id, filename)
    cdk2_list.append(structure.header["name"])































In [11]:
len(cdk2_list)

446

## CDK2 PDB files resolution

In [12]:
resolution = []

for i in realpdbs:
    filename = i
    structure_id = i.strip(".pdb")
    structure = parser.get_structure(structure_id, filename)
    resolution.append(structure.header["resolution"])































In [13]:
len(resolution)

446

# Extract Publish year

## single file

In [14]:
structure_id = "1AQ1"
filename = "1AQ1.pdb"

with open(filename) as f:
    first_line = f.readline()

In [15]:
first_line

'HEADER    PROTEIN KINASE                          05-AUG-97   1AQ1              \n'

In [16]:
publish_date_single = first_line.split()[-2]
print(publish_date_single[-2:])

97


## Publish Date for CDK2 PDb files

In [17]:
publish_date = []

for i in realpdbs:
    with open(i) as f:
        first_line = f.readline()
        release_date = first_line.split()[-2]
        publish_date.append(release_date[-2:])

In [18]:
publish_date

['97',
 '98',
 '98',
 '98',
 '98',
 '99',
 '99',
 '00',
 '00',
 '00',
 '00',
 '96',
 '00',
 '00',
 '00',
 '00',
 '01',
 '01',
 '01',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '02',
 '96',
 '96',
 '96',
 '96',
 '01',
 '01',
 '01',
 '01',
 '01',
 '01',
 '01',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '03',
 '99',
 '03',
 '03',
 '03',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '04',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '05',
 '06',
 '06',
 '06',
 '06',
 '06',
 '06',
 '05',
 '06',
 '06',
 '06',
 '06',
 '06',
 '06',
 '06',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',
 '07',

# Extract chains

In [19]:
from Bio import *

In [20]:
from __future__ import print_function

In [21]:
# !pip install pdb-tools

In [22]:
chain_id = !sed -n '/^ATOM/p' 1AQ1.pdb | awk '{ print $5 }' | uniq

In [23]:
# chain_ids = []
# for i in realpdbs:
#     !sed -n '/^ATOM/p' 1AQ1.pdb | awk '{ print $5 }' | uniq

In [24]:
# # import gzip
# import warnings
# from pathlib import Path
# from Bio.PDB.PDBExceptions import PDBConstructionWarning
# from Bio.PDB import PDBParser

# # To get rid of those annoying warnings like 'WARNING: Chain B is discontinuous at line 4059.'
# warnings.simplefilter('ignore', PDBConstructionWarning)

# parser = PDBParser()

# if __name__ == "__main__":
#     pdb_zips = Path("zipped_pdbs").glob('**/*.ent.gz')
#     for pdb_filename in pdb_zips:
#         with gzip.open(pdb_filename, "rt") as file_handle:
#             structure = parser.get_structure("?", file_handle)
#         # you could of course parse the pdb code from the file name as well. 
#         # But I found this to be easier implemented.       
#         pdb_code = structure.header.get("idcode")
#         resolution = structure.header.get("resolution")

#         for chain in structure.get_chains():
#             print(f"{pdb_code}  {chain.id}  {resolution}")

In [25]:
# # import gzip
# import warnings
# from pathlib import Path
# from Bio.PDB.PDBExceptions import PDBConstructionWarning
# from Bio.PDB import PDBParser

# # To get rid of those annoying warnings like 'WARNING: Chain B is discontinuous at line 4059.'
# warnings.simplefilter('ignore', PDBConstructionWarning)

# parser = PDBParser()
# chain_id = []
# if __name__ == "__main__":
# #     pdb_zips = Path("zipped_pdbs").glob('**/*.ent.gz')
#     for pdb_filename in realpdbs:
#         with open(pdb_filename, "rt") as file_handle:
#             structure = parser.get_structure("?", file_handle)
#         # you could of course parse the pdb code from the file name as well. 
#         # But I found this to be easier implemented.       
#         pdb_code = structure.header.get("idcode")
# #         resolution = structure.header.get("resolution")

#         for chain in structure.get_chains():
#             chain_id.append(pdb_code+chain.id)

In [26]:
# pd.DataFrame({'chainid': chain_id})
# len(chain_id)

In [114]:
def add_values_in_dict(sample_dict, key, list_of_values):
    ''' Append multiple values to a key in 
        the given dictionary '''
    if key not in sample_dict:
        sample_dict[key] = list()
    sample_dict[key].extend(list_of_values)
    return sample_dict

In [115]:
# import warnings
# from pathlib import Path
# from Bio.PDB.PDBExceptions import PDBConstructionWarning
# from Bio.PDB import PDBParser

# # To get rid of those annoying warnings like 'WARNING: Chain B is discontinuous at line 4059.'
# warnings.simplefilter('ignore', PDBConstructionWarning)

# parser = PDBParser()

# chainDict = dict()

# if __name__ == "__main__":
#     for pdb_filename in realpdbs:
#         with open(pdb_filename, "rt") as file_handle:
#             structure = parser.get_structure("?", file_handle)
#         # you could of course parse the pdb code from the file name as well. 
#         # But I found this to be easier implemented.       
#         pdb_code = structure.header.get("idcode")
# #         resolution = structure.header.get("resolution")

#         for chain in structure.get_chains():
#             chainId = []
#             chainId.append(chain.id)
#             chainDict[pdb_code] : chainId

In [116]:
import warnings
from pathlib import Path
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB import PDBParser

# To get rid of those annoying warnings like 'WARNING: Chain B is discontinuous at line 4059.'
warnings.simplefilter('ignore', PDBConstructionWarning)

parser = PDBParser()

chainDict = dict()

if __name__ == "__main__":
    for pdb_filename in realpdbs:
        with open(pdb_filename, "rt") as file_handle:
            structure = parser.get_structure("?", file_handle)
        # you could of course parse the pdb code from the file name as well. 
        # But I found this to be easier implemented.       
        pdb_code = structure.header.get("idcode")
#         resolution = structure.header.get("resolution")

        for chain in structure.get_chains():
            chainId = []
            chainId.append(chain.id)
            chainDict = add_values_in_dict(chainDict, pdb_code, chainId)

In [72]:
len(chainDict)

446

In [73]:
chain_df = pd.DataFrame(pd.Series(chainDict).reset_index()).set_axis(["id", "chain_list"], axis='columns')

In [74]:
chain_df

Unnamed: 0,id,chain_list
0,1AQ1,[A]
1,1B38,[A]
2,1B39,[A]
3,1BUH,"[A, B]"
4,1CKP,[A]
...,...,...
441,7ZPC,[A]
442,8B54,"[A, B, C, D]"
443,8CUR,[A]
444,8H6P,"[A, B]"


In [79]:
chain_df.shape

(446, 2)

# Digest to make Dataframe

In [96]:
digest = {'File': realpdbs,
        'Title': cdk2_list,
         'Resolution': resolution,
          'Year': publish_date,
         }

In [97]:
# set pandas option to dsplay all rows
pd.set_option('display.max_rows', None)

## Create Dataframe

In [98]:
head_df = pd.DataFrame(digest)

In [99]:
head_df

Unnamed: 0,File,Title,Resolution,Year
0,1AQ1.pdb,human cyclin dependent kinase 2 complexed with...,2.0,97
1,1B38.pdb,human cyclin-dependent kinase 2,2.0,98
2,1B39.pdb,human cyclin-dependent kinase 2 phosphorylated...,2.1,98
3,1BUH.pdb,crystal structure of the human cdk2 kinase com...,2.6,98
4,1CKP.pdb,human cyclin dependent kinase 2 complexed with...,2.05,98
5,1DI8.pdb,the structure of cyclin-dependent kinase 2 (cd...,2.2,99
6,1DM2.pdb,human cyclin-dependent kinase 2 complexed with...,2.1,99
7,1E1V.pdb,human cyclin dependent kinase 2 complexed with...,1.95,0
8,1E1X.pdb,human cyclin dependent kinase 2 complexed with...,1.85,0
9,1E9H.pdb,thr 160 phosphorylated cdk2 - human cyclin a3 ...,2.5,0


In [100]:
# check Dataframe dimension
head_df.shape

(446, 4)

In [104]:
pdb_df = head_df.join(chain_df)

In [105]:
pdb_df.shape

(446, 6)

## Sort Dataframe based on Resolution 

In [106]:
pdb_df_sorted_by_res = pdb_df.sort_values("Resolution")

In [107]:
pdb_df_sorted_by_res

Unnamed: 0,File,Title,Resolution,Year,id,chain_list
407,6Q4G.pdb,cdk2 in complex with fraglite37,0.98,18,6Q4G,[A]
408,6Q4H.pdb,cdk2 in complex with fraglite36,1.0,18,6Q4H,[A]
400,6Q49.pdb,cdk2 in complex with fraglite6,1.0,18,6Q49,[A]
399,6Q48.pdb,cdk2 in complex with fraglite7,1.03,18,6Q48,[A]
410,6Q4J.pdb,cdk2 in complex with fraglite34,1.05,18,6Q4J,[A]
411,6Q4K.pdb,cdk2 in complex with fraglite38,1.06,18,6Q4K,[A]
405,6Q4E.pdb,cdk2 in complex with fraglite33,1.06,18,6Q4E,[A]
404,6Q4D.pdb,cdk2 in complex with fraglite31,1.07,18,6Q4D,[A]
409,6Q4I.pdb,cdk2 in complex with fraglite35,1.11,18,6Q4I,[A]
396,6Q3B.pdb,cdk2 in complex with fraglite2,1.11,18,6Q3B,[A]


## Sort Dataframe based on the year

In [109]:
pdb_df_sorted_by_year = pdb_df.sort_values("Year")

In [110]:
pdb_df_sorted_by_year

Unnamed: 0,File,Title,Resolution,Year,id,chain_list
13,1FVT.pdb,the structure of cyclin-dependent kinase 2 (cd...,2.2,0,1FVT,[A]
15,1G5S.pdb,crystal structure of human cyclin dependent ki...,2.61,0,1G5S,[A]
14,1FVV.pdb,the structure of cdk2/cyclin a in complex with...,2.8,0,1FVV,"[A, B, C, D]"
12,1FQ1.pdb,crystal structure of kinase associated phospha...,3.0,0,1FQ1,"[A, B]"
7,1E1V.pdb,human cyclin dependent kinase 2 complexed with...,1.95,0,1E1V,[A]
8,1E1X.pdb,human cyclin dependent kinase 2 complexed with...,1.85,0,1E1X,[A]
9,1E9H.pdb,thr 160 phosphorylated cdk2 - human cyclin a3 ...,2.5,0,1E9H,"[A, B, C, D]"
10,1F5Q.pdb,crystal structure of murine gamma herpesvirus ...,2.5,0,1F5Q,"[A, B, C, D]"
18,1GIJ.pdb,human cyclin dependent kinase 2 complexed with...,2.2,1,1GIJ,[A]
17,1GII.pdb,human cyclin dependent kinase 2 complexed with...,2.0,1,1GII,[A]


# Plotting

In [None]:
# !pip install matplotlib

In [None]:
# from matplotlib import *

In [None]:
# pdb_df.plot(kind = 'hist')

In [None]:
# !pip install scipy

In [None]:
# # Libraries
# from matplotlib import pyplot as plt
# from scipy.cluster.hierarchy import dendrogram, linkage
# import numpy as np
 
# # Data set
# df = pdb_df.set_index('Resolution')
 
# # Calculate the distance between each sample
# Z = linkage(pdb_df, 'ward')
 
# # Control number of clusters in the plot + add horizontal line.
# dendrogram(Z, color_threshold=240)
# plt.axhline(y=240, c='grey', lw=1, linestyle='dashed')

# # Show the graph
# plt.show()