In [1]:
# DisulfideBond Class Analysis
# Author: Eric G. Suchanek, PhD.
# (c) 2023 Eric G. Suchanek, PhD., All Rights Reserved
# License: MIT
# Last Modification: 2/18/23
# Cα Cβ Sγ

import pandas as pd
import numpy

import pyvista as pv
from pyvista import set_plot_theme

from Bio.PDB import *

# proteusPy imports (assumes the package is importable, e.g. when working from a checkout of the repo)
import proteusPy
from proteusPy import *
from proteusPy.data import *
from proteusPy.Disulfide import *
from proteusPy.DisulfideList import DisulfideList, load_disulfides_from_id
from proteusPy.utility import Create_classes

# --- Configuration and data load --------------------------------------------
# These assignments rebind, in the notebook namespace, any same-named globals
# brought in by the star imports above.
# NOTE(review): they do not appear to influence Load_PDB_SS() below -- the
# recorded log shows it reading from the proteusPy package data directory.
# Confirm whether the loader should be pointed at these locations explicitly.
import os

# Root of the local PDB repository. Defaults to the original author's location
# but can be redirected without editing the notebook by setting the PDB_ROOT
# environment variable. os.path.join(..., '') guarantees the trailing separator
# that the path concatenations below rely on.
PDB_ROOT = os.path.join(os.environ.get('PDB_ROOT') or '/Users/egs/PDB/', '')

# location of cleaned PDB files - these are not stored in the repo
# (derived from PDB_ROOT, like MODELS, so the root is defined in one place)
PDB_GOOD = f'{PDB_ROOT}good/'

# location of the compressed Disulfide .pkl files
MODELS = f'{PDB_ROOT}data/'

# pyvista setup for notebooks
pv.set_jupyter_backend('trame')
#set_plot_theme('dark')

# Load the complete disulfide database (subset=False) and print its summary.
# The recorded run of this cell reported ~29 GB of RAM in use after loading,
# so expect this to be slow and memory hungry.
PDB_SS = Load_PDB_SS(verbose=True, subset=False)
PDB_SS.describe()

-> load_PDB_SS(): Reading /Users/egs/repos/proteusPy/proteusPy/data/PDB_SS_ALL_LOADER.pkl... done.
PDB IDs present:                    35818
Disulfides loaded:                  120697
Average structure resolution:       2.34 Å
Lowest Energy Disulfide:            2q7q_75D_140D
Highest Energy Disulfide:           1toz_456A_467A
Total RAM Used:                     29.26 GB.


In [2]:
# Extract the per-disulfide torsion table from the loaded database. Per the
# recorded output it holds one row per disulfide (120697 rows) with the
# proximal/distal residue numbers, chi1-chi5, energy, ca_distance, backbone
# phi/psi for both residues and torsion_length.
tors_df = PDB_SS.getTorsions()

# Summary statistics for every numeric column (displayed as the cell result).
tors_df.describe()


Unnamed: 0,proximal,distal,chi1,chi2,chi3,chi4,chi5,energy,ca_distance,phi_prox,psi_prox,phi_dist,psi_dist,torsion_length
count,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0,120697.0
mean,224.952352,273.166616,-46.576729,-15.271499,-2.472293,-27.620642,-38.68503,3.547099,5.55874,-97.818195,62.190902,-96.178772,63.919627,224.176713
std,272.58512,276.640523,97.018984,102.465838,93.910914,103.712065,94.162959,2.361169,1.514885,43.803698,99.029196,43.531611,96.442547,52.598704
min,1.0,1.0,-179.998217,-179.999554,-179.981786,-179.99951,-179.999305,0.491737,2.831762,-180.0,-180.0,-180.0,-180.0,77.51608
25%,47.0,96.0,-83.383209,-87.688664,-87.401377,-92.805867,-75.113684,1.910619,5.077294,-128.635741,-27.963951,-123.289748,-24.254751,181.152368
50%,136.0,193.0,-63.664056,-58.572699,-64.468336,-66.48871,-61.045404,3.02299,5.593961,-96.109404,112.91855,-97.429163,115.186341,225.205972
75%,305.0,361.0,-45.09902,81.012589,94.710376,73.630486,-14.139138,4.35081,6.074565,-68.9236,143.684511,-70.590516,143.207727,261.539869
max,4374.0,8774.0,179.999749,179.996783,179.99507,179.999222,179.999693,18.12453,101.701154,179.912372,179.999324,179.849134,179.993389,381.310941


In [3]:
# Group the torsion table into disulfide classes and persist the result.
# Create_classes is imported from proteusPy.utility; judging by the preview of
# `grouped` further down, it yields one row per class_id with the member
# disulfide ids (ss_id), count, incidence and percentage.
# DATA_DIR is not defined in this notebook -- presumably it is supplied by one
# of the star imports; verify.
grouped = Create_classes(tors_df)
grouped.to_csv(f'{DATA_DIR}PDB_ss_classes.csv')

# Summary variant without the per-class list of member ids.
# `columns=` already identifies the axis, so the redundant `axis=1` is omitted.
grouped_summary = grouped.drop(columns=['ss_id'])
grouped_summary.to_csv(f'{DATA_DIR}PDB_ss_classes_summary.csv')


In [6]:
# Preview up to the first 32 class rows (class_id, ss_id, count, incidence,
# percentage) as the cell's displayed result.
grouped.head(32)

Unnamed: 0,class_id,ss_id,count,incidence,percentage
0,0,"[2qhe_27A_126A, 2qhe_44A_105A, 2qhe_51A_98A, 2...",31513,0.261092,26.109182
1,2,"[2qhe_84A_96A, 4nz3_185A_200A, 4nzr_142H_208H,...",5805,0.048096,4.809564
2,20,"[3l4o_38C_86C, 3l75_144E_160E, 3ulv_95A_122A, ...",3413,0.028277,2.827742
3,22,"[2hew_70F_163F, 5hdz_216A_420A, 5fji_73A_90A, ...",1940,0.016073,1.607331
4,200,"[1j5h_37A_47A, 4lb7_2D_30D, 2hew_98F_183F, 6vp...",12735,0.105512,10.551215
5,202,"[1zed_467A_474A, 2zxt_396A_398A, 2zwl_17L_22L,...",993,0.008227,0.822721
6,220,"[1j5h_88A_93A, 1chv_14S_38S, 2qhe_29A_45A, 4nz...",5674,0.04701,4.701028
7,222,"[4yys_56A_98A, 3l4o_36C_121C, 7nd8_336A_361A, ...",5092,0.042188,4.218829
8,2000,"[1mfe_137L_196L, 6fuf_110A_187A, 4nz3_264A_281...",4749,0.039346,3.934646
9,2002,"[6snc_134A_194A, 6vsz_261A_321A, 6vsz_367A_425...",3774,0.031268,3.126838


In [9]:
# Column names of the class master table, in the order shown when the table
# is previewed. NOTE(review): this list is not used by any cell visible here;
# confirm whether a later cell needs it.
class_cols = [
    'Idx',
    'chi1_s', 'chi2_s', 'chi3_s', 'chi4_s', 'chi5_s',
    'class_id', 'SS_Classname', 'FXN',
    'count', 'incidence', 'percentage',
    'ca_distance_mean', 'ca_distance_std',
    'torsion_length_mean', 'torsion_length_std',
    'energy_mean', 'energy_std',
]

# Read the class master table, keeping class_id as text rather than letting
# pandas infer a numeric dtype.
# NOTE(review): PDB_ss_classes_master.csv is not written by the cells visible
# here -- presumably it is maintained separately; verify its provenance.
class_df = pd.read_csv(
    f'{DATA_DIR}PDB_ss_classes_master.csv',
    dtype={'class_id': 'string'},
)

In [10]:
# Preview up to the first 32 rows of the class master table as the cell's
# displayed result.
class_df.head(32)

Unnamed: 0,Idx,chi1_s,chi2_s,chi3_s,chi4_s,chi5_s,class_id,SS_Classname,FXN,count,incidence,percentage,ca_distance_mean,ca_distance_std,torsion_length_mean,torsion_length_std,energy_mean,energy_std
0,0,-1,-1,-1,-1,-1,0,-LHSpiral,UNK,31513,0.261,26.109,5.744,0.728,187.159,42.611,2.623,1.786
1,1,-1,-1,-1,-1,1,2,00002,UNK,5805,0.048,4.81,6.048,0.462,235.036,50.1,3.164,1.892
2,2,-1,-1,-1,1,-1,20,-LHHook,UNK,3413,0.028,2.828,5.374,1.276,235.619,36.678,4.669,2.95
3,3,-1,-1,-1,1,1,22,00022,UNK,1940,0.016,1.607,5.373,0.616,220.247,45.062,3.703,2.35
4,4,-1,-1,1,-1,-1,200,-RHStaple *allosteric,Allosteric,12735,0.106,10.551,4.229,1.164,196.214,22.011,4.412,1.447
5,5,-1,-1,1,-1,1,202,00202,UNK,993,0.008,0.823,5.166,1.901,240.867,31.444,4.528,2.658
6,6,-1,-1,1,1,-1,220,00220,UNK,5674,0.047,4.701,5.363,1.585,227.316,42.348,4.073,2.51
7,7,-1,-1,1,1,1,222,00222,UNK,5092,0.042,4.219,5.19,0.473,207.154,37.757,3.248,2.206
8,8,-1,1,-1,-1,-1,2000,02000,UNK,4749,0.039,3.935,5.819,2.283,268.467,50.546,4.083,2.938
9,9,-1,1,-1,-1,1,2002,02002,UNK,3774,0.031,3.127,6.248,1.677,307.771,46.913,3.487,2.703
