# Tutorial PRDBv3.0 dataset

### Python libraries used

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import json
import lxml
import matplotlib.pyplot as plt

### Read the JSON file

In [2]:
# Location of the PRDBv3.0 PDB/chain list (machine-specific path; adjust to your local copy)
prdbv3_json_path = '/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/PRDBv3_pdbList_chains.json'
# Parse the JSON table into a pandas DataFrame
prdbv3_read_json = pd.read_json(prdbv3_json_path)

In [3]:
# Show the loaded table as the cell's output (a bare last expression renders the DataFrame)
prdbv3_read_json

Unnamed: 0,C_PDB,C_chain_PR,Structural_class,Flexible_class,Docking_case,Binding_affinity,C_pro_chain,C_pro_seq_length,C_RNA_chain,C_RNA_seq_length,U_pro_PDB,U_pro_chain,U_pro_seq_length,U_RNA_PDB,U_RNA_chain,U_RNA_seq_length
0,1ASY,A:R,A,R,UU,yes,A,490,R,75,1EOV,A,487,2TRA,A,75
1,1B23,P:R,A,F,UU,no,P,405,R,74,1TUI,A,405,1U0B*,A,74
2,1C0A,A:B,A,S,UU,no,A,585,B,77,1EQR,A,590,1EFW*,C,73
3,1QTQ,A:B,A,S,UU,yes,A,553,B,75,1NYL,A,539,3KNH*,Y,75
4,2BTE,A:B,A,R,UU,no,A,878,B,83,1OBC,A,878,2NQP*,F,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,2PJP,A:B,C,,BU,yes,A,121,B,23,,,,1MFK,A,23
193,3BT7,A:C,C,,BU,yes,A,369,C,19,,,,1EVV,A,76
194,6HTU,ABC:DF,C,,BU,yes,ABC,182+182+182,DF,19+19,,,,6HU6,AE,19+19
195,1LNG,A:B,D,,BU,no,A,87,B,97,,,,1Z43,A,101


In [4]:
# Number of distinct complex PDB IDs (C_PDB column) in PRDBv3.0
unique_pdb_count = prdbv3_read_json['C_PDB'].nunique()
# Row counts per category for each of the three classification columns
structural_class_counts = prdbv3_read_json['Structural_class'].value_counts()
flexible_class_counts = prdbv3_read_json['Flexible_class'].value_counts()
docking_case_counts = prdbv3_read_json['Docking_case'].value_counts()
# Report the four summaries in the same order as before
print(f"Number of unique domains in PRDBv3: {unique_pdb_count}\n")
print(f"Number of unique PDB ID in each structural class:\n{structural_class_counts}\n")
print(f"Number of unique PDB ID in each flexible class:\n{flexible_class_counts}\n")
print(f"Number of unique PDB ID in each docking case:\n{docking_case_counts}\n")

Number of unique domains in PRDBv3: 197

Number of unique PDB ID in each structural class:
Structural_class
D    86
C    62
A    40
B     9
Name: count, dtype: int64

Number of unique PDB ID in each flexible class:
Flexible_class
R    117
S     41
F     29
Name: count, dtype: int64

Number of unique PDB ID in each docking case:
Docking_case
UB    160
UU     27
BU     10
Name: count, dtype: int64



## Read PDB files from PRDBv3.0 dataset

In [5]:
# PRDBv3 dataset directory.
# Set the PRDBV3_DIR environment variable to point at a local copy of the dataset;
# when it is unset, the original hard-coded location is used as the default.
PRDBv3_dir = os.environ.get("PRDBV3_DIR", "/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0")

### PRDBv3.0: List only directories from different classes

In [6]:
# List only the dataset directories whose name is a complex PDB ID (C_PDB) in the table.
# Build the lookup set once so each directory name is checked without rescanning the column.
known_pdb_ids = set(prdbv3_read_json.C_PDB.values)
for root, subdirs, _files in os.walk(PRDBv3_dir):
    # `subdir` rather than `dir`, so the built-in dir() is not shadowed
    for subdir in subdirs:
        if subdir in known_pdb_ids:
            print(os.path.join(root, subdir))

/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/6HTU
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1JBS
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2PY9
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2GJW
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2AZ0
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/6L1W
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2GIC
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1SDS
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1J1U
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1K8W
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2GXB
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1M5O
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/5EX7
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/3MOJ
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/6HAU
/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2BTE
/home/shrikant/Projects/

### PRDBv3.0: List all files with directories

In [7]:
def get_PRDBv3_info(path: str, df_to_parse: pd.DataFrame) -> None:
    """Print the directory path and file listing for every entry of ``df_to_parse``.

    Args:
        path: Root directory of the PRDBv3 dataset; each entry is looked up as
            the sub-directory ``path/<C_PDB>``.
        df_to_parse: DataFrame with a ``C_PDB`` column naming the sub-directories.

    Returns:
        None. Everything is written to standard output.

    Notes:
        A sub-directory that cannot be listed (``OSError``: missing, not a
        directory, permission denied) is reported as "Directory not found" and
        does not advance the running counter. Any other exception propagates
        instead of being silently swallowed.
    """
    print(f'INFO PRDBv3:\nTotal {len(df_to_parse)} directories in dataframe: \n')
    # Running number shown in front of each successfully listed directory
    count_dir = 1
    for pdbdir in df_to_parse['C_PDB'].values:
        dir_path = os.path.join(path, pdbdir)
        print(f'({count_dir}) Directory path:{dir_path}')
        try:
            # List once and reuse, so the count and the printed names always agree
            dir_files = os.listdir(dir_path)
        except OSError:
            print(f'Directory not found: {dir_path}\n')
            continue
        print(f'Total {len(dir_files)} PDB files in {pdbdir} directory.')
        print(f'All files in {pdbdir} directory:\n{dir_files}\n')
        count_dir += 1
# Print the path and file listing of every directory named in the table
get_PRDBv3_info(path=PRDBv3_dir, df_to_parse=prdbv3_read_json)

INFO PRDBv3:
Total 197 directories in dataframe: 

(1) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1ASY
Total 8 PDB files in 1ASY directory.
All files in 1ASY directory:
['2TRA.pdb', '1EOV.pdb', '1ASY_1EOV.pdb', '2TRA_mod.pdb', '1ASY_mod.pdb', '1ASY_2TRA.pdb', '1ASY.pdb', '1EOV_mod.pdb']

(2) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1B23
Total 8 PDB files in 1B23 directory.
All files in 1B23 directory:
['1TUI_mod.pdb', '1B23.pdb', '1TUI.pdb', '1U0B.pdb', '1B23_mod.pdb', '1U0B_mod.pdb', '1B23_1TUI.pdb', '1B23_1U0B.pdb']

(3) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1C0A
Total 8 PDB files in 1C0A directory.
All files in 1C0A directory:
['1C0A_1EQR.pdb', '1C0A.pdb', '1EQR_mod.pdb', '1C0A_1EFW.pdb', '1EFW.pdb', '1EFW_mod.pdb', '1C0A_mod.pdb', '1EQR.pdb']

(4) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1QTQ
Total 8 PDB files in 1QTQ directory.
All files in 1QTQ directory:
['1QT

In [8]:
# Tally the rows per structural class, then report the tally
structural_class_counts = prdbv3_read_json['Structural_class'].value_counts()
print(f"Number of unique PDB ID in each structural class:\n{structural_class_counts}\n")

Number of unique PDB ID in each structural class:
Structural_class
D    86
C    62
A    40
B     9
Name: count, dtype: int64



In [9]:
# Read PRDBv3 data directory and its contents based on the different structural classes
def get_PRDBv3_info_by_class(path: str, df_to_parse: pd.DataFrame, class_name: str) -> None:
    """Print the directory path and file listing for every entry of one structural class.

    Args:
        path: Root directory of the PRDBv3 dataset; each entry is looked up as
            the sub-directory ``path/<C_PDB>``.
        df_to_parse: DataFrame with ``C_PDB`` and ``Structural_class`` columns.
        class_name: Value of ``Structural_class`` to select (e.g. ``'A'``).

    Returns:
        None. Everything is written to standard output.

    Notes:
        A sub-directory that cannot be listed (``OSError``: missing, not a
        directory, permission denied) is reported as "Directory not found" and
        does not advance the running counter. Any other exception propagates
        instead of being silently swallowed.
    """
    # Select the matching rows once and reuse them for the header and the loop
    class_pdb_ids = df_to_parse.loc[df_to_parse['Structural_class'] == class_name, 'C_PDB'].values
    print(f"INFO PRDBv3:\nTotal {len(class_pdb_ids)} directories in class {class_name}: \n")
    # Running number shown in front of each successfully listed directory
    count_dir = 1
    for pdbdir in class_pdb_ids:
        dir_path = os.path.join(path, pdbdir)
        print(f'({count_dir}) Directory path:{dir_path}')
        try:
            # List once and reuse, so the count and the printed names always agree
            dir_files = os.listdir(dir_path)
        except OSError:
            print(f'Directory not found: {dir_path}\n')
            continue
        print(f'Total {len(dir_files)} PDB files in {pdbdir} directory.')
        print(f'All files in {pdbdir} directory:\n{dir_files}\n')
        count_dir += 1
    
# Print the path and file listing of every directory belonging to structural class 'A'
get_PRDBv3_info_by_class(path=PRDBv3_dir, df_to_parse=prdbv3_read_json, class_name='A')

INFO PRDBv3:
Total 40 directories in clas A: 

(1) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1ASY
Total 8 PDB files in 1ASY directory.
All files in 1ASY directory:
['2TRA.pdb', '1EOV.pdb', '1ASY_1EOV.pdb', '2TRA_mod.pdb', '1ASY_mod.pdb', '1ASY_2TRA.pdb', '1ASY.pdb', '1EOV_mod.pdb']

(2) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1B23
Total 8 PDB files in 1B23 directory.
All files in 1B23 directory:
['1TUI_mod.pdb', '1B23.pdb', '1TUI.pdb', '1U0B.pdb', '1B23_mod.pdb', '1U0B_mod.pdb', '1B23_1TUI.pdb', '1B23_1U0B.pdb']

(3) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1C0A
Total 8 PDB files in 1C0A directory.
All files in 1C0A directory:
['1C0A_1EQR.pdb', '1C0A.pdb', '1EQR_mod.pdb', '1C0A_1EFW.pdb', '1EFW.pdb', '1EFW_mod.pdb', '1C0A_mod.pdb', '1EQR.pdb']

(4) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1QTQ
Total 8 PDB files in 1QTQ directory.
All files in 1QTQ directory:
['1QTQ_mo

In [10]:
# Tally the rows per flexible class, then report the tally
flexible_class_counts = prdbv3_read_json['Flexible_class'].value_counts()
print(f"Number of unique PDB ID in each flexible class:\n{flexible_class_counts}\n")

Number of unique PDB ID in each flexible class:
Flexible_class
R    117
S     41
F     29
Name: count, dtype: int64



In [11]:
# Read PRDBv3 data directory and its contents based on the different flexible class
def get_PRDBv3_info_by_flexible_class(path: str, df_to_parse: pd.DataFrame, flex_class: str) -> None:
    """Print the directory path and file listing for every entry of one flexible class.

    Args:
        path: Root directory of the PRDBv3 dataset; each entry is looked up as
            the sub-directory ``path/<C_PDB>``.
        df_to_parse: DataFrame with ``C_PDB`` and ``Flexible_class`` columns.
        flex_class: Value of ``Flexible_class`` to select (e.g. ``'R'``).

    Returns:
        None. Everything is written to standard output.

    Notes:
        A sub-directory that cannot be listed (``OSError``: missing, not a
        directory, permission denied) is reported as "Directory not found" and
        does not advance the running counter. Any other exception propagates
        instead of being silently swallowed.
    """
    # Select the matching rows once and reuse them for the header and the loop
    flex_pdb_ids = df_to_parse.loc[df_to_parse['Flexible_class'] == flex_class, 'C_PDB'].values
    print(f"INFO PRDBv3:\nTotal {len(flex_pdb_ids)} directories in flexible class {flex_class}: \n")
    # Running number shown in front of each successfully listed directory
    count_dir = 1
    for pdbdir in flex_pdb_ids:
        dir_path = os.path.join(path, pdbdir)
        print(f'({count_dir}) Directory path:{dir_path}')
        try:
            # List once and reuse, so the count and the printed names always agree
            dir_files = os.listdir(dir_path)
        except OSError:
            print(f'Directory not found: {dir_path}\n')
            continue
        print(f'Total {len(dir_files)} PDB files in {pdbdir} directory.')
        print(f'All files in {pdbdir} directory:\n{dir_files}\n')
        count_dir += 1
# Print the path and file listing of every directory belonging to flexible class 'R'
get_PRDBv3_info_by_flexible_class(path=PRDBv3_dir, df_to_parse=prdbv3_read_json, flex_class='R')

INFO PRDBv3:
Total 117 directories in flexible class R: 

(1) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1ASY
Total 8 PDB files in 1ASY directory.
All files in 1ASY directory:
['2TRA.pdb', '1EOV.pdb', '1ASY_1EOV.pdb', '2TRA_mod.pdb', '1ASY_mod.pdb', '1ASY_2TRA.pdb', '1ASY.pdb', '1EOV_mod.pdb']

(2) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2BTE
Total 8 PDB files in 2BTE directory.
All files in 2BTE directory:
['2NQP_mod.pdb', '2BTE_1OBC.pdb', '2NQP.pdb', '1OBC.pdb', '1OBC_mod.pdb', '2BTE.pdb', '2BTE_2NQP.pdb', '2BTE_mod.pdb']

(3) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/2FMT
Total 8 PDB files in 2FMT directory.
All files in 2FMT directory:
['1FMT.pdb', '3CW6.pdb', '2FMT_3CW6.pdb', '2FMT_1FMT.pdb', '2FMT_mod.pdb', '2FMT.pdb', '1FMT_mod.pdb', '3CW6_mod.pdb']

(4) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/3HL2
Total 8 PDB files in 3HL2 directory.
All files in 3HL2 directory

In [12]:
# Tally the rows per docking case, then report the tally
docking_case_counts = prdbv3_read_json['Docking_case'].value_counts()
print(f"Number of unique PDB ID in each docking case:\n{docking_case_counts}\n")

Number of unique PDB ID in each docking case:
Docking_case
UB    160
UU     27
BU     10
Name: count, dtype: int64



In [13]:
# Read PRDBv3 data directory and its contents based on the different docking cases
def get_PRDBv3_info_by_docking_case(path: str, df_to_parse: pd.DataFrame, docking_case: str) -> None:
    """Print the directory path and file listing for every entry of one docking case.

    Args:
        path: Root directory of the PRDBv3 dataset; each entry is looked up as
            the sub-directory ``path/<C_PDB>``.
        df_to_parse: DataFrame with ``C_PDB`` and ``Docking_case`` columns.
        docking_case: Value of ``Docking_case`` to select (e.g. ``'UU'``).

    Returns:
        None. Everything is written to standard output.

    Notes:
        A sub-directory that cannot be listed (``OSError``: missing, not a
        directory, permission denied) is reported as "Directory not found" and
        does not advance the running counter. Any other exception propagates
        instead of being silently swallowed.
    """
    # Select the matching rows once and reuse them for the header and the loop
    case_pdb_ids = df_to_parse.loc[df_to_parse['Docking_case'] == docking_case, 'C_PDB'].values
    print(f"INFO PRDBv3:\nTotal {len(case_pdb_ids)} directories in docking case {docking_case}: \n")
    # Running number shown in front of each successfully listed directory
    count_dir = 1
    for pdbdir in case_pdb_ids:
        dir_path = os.path.join(path, pdbdir)
        print(f'({count_dir}) Directory path:{dir_path}')
        try:
            # List once and reuse, so the count and the printed names always agree
            dir_files = os.listdir(dir_path)
        except OSError:
            print(f'Directory not found: {dir_path}\n')
            continue
        print(f'Total {len(dir_files)} PDB files in {pdbdir} directory.')
        print(f'All files in {pdbdir} directory:\n{dir_files}\n')
        count_dir += 1
# Print the path and file listing of every directory belonging to docking case 'UU'
get_PRDBv3_info_by_docking_case(path=PRDBv3_dir, df_to_parse=prdbv3_read_json, docking_case='UU')

INFO PRDBv3:
Total 27 directories in docking case UU: 

(1) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1ASY
Total 8 PDB files in 1ASY directory.
All files in 1ASY directory:
['2TRA.pdb', '1EOV.pdb', '1ASY_1EOV.pdb', '2TRA_mod.pdb', '1ASY_mod.pdb', '1ASY_2TRA.pdb', '1ASY.pdb', '1EOV_mod.pdb']

(2) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1B23
Total 8 PDB files in 1B23 directory.
All files in 1B23 directory:
['1TUI_mod.pdb', '1B23.pdb', '1TUI.pdb', '1U0B.pdb', '1B23_mod.pdb', '1U0B_mod.pdb', '1B23_1TUI.pdb', '1B23_1U0B.pdb']

(3) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1C0A
Total 8 PDB files in 1C0A directory.
All files in 1C0A directory:
['1C0A_1EQR.pdb', '1C0A.pdb', '1EQR_mod.pdb', '1C0A_1EFW.pdb', '1EFW.pdb', '1EFW_mod.pdb', '1C0A_mod.pdb', '1EQR.pdb']

(4) Directory path:/home/shrikant/Projects/git_dev/PRDBv3_dataset/PRDBv3.0/1QTQ
Total 8 PDB files in 1QTQ directory.
All files in 1QTQ directory:
