In [1]:
# DisulfideBond Playground
# Playing with the DisulfideBond class
# Author: Eric G. Suchanek, PhD.
# (c) 2022 Eric G. Suchanek, PhD., All Rights Reserved
# License: BSD
# Last Modification: 1/16/23
# Cα Cβ Sγ

# important preamble

import pandas as pd

import pyvista as pv
from pyvista import set_plot_theme

from Bio.PDB import *

# for using from the repo we 
from proteusPy import *
from proteusPy.data import *
from proteusPy.Disulfide import *


# override any default PDB globals
# location for PDB repository
PDB_ROOT = '/Users/egs/PDB/'

# location of cleaned PDB files - these are not stored in the repo
PDB_GOOD = '/Users/egs/PDB/good/'

# from within the repo 
PDB_REPO = '../pdb/'

# location of the compressed Disulfide .pkl files
MODELS = f'{PDB_ROOT}models/'

# pyvista setup for notebooks
pv.set_jupyter_backend('ipyvtklink')
set_plot_theme('document')


In [2]:
#
# subset of 1000 disulfides for quicker testing

SS_PICKLE_FILE = 'PDB_1000_ss.pkl'
SS_DICT_PICKLE_FILE = 'PDB_1000_ss_dict.pkl'
SS_TORSIONS_FILE = 'PDB_1000_SS_torsions.csv'


In [3]:
# Comment these out since they take so long.
# Download_Disulfides(pdb_home=PDB_ORIG, model_home=MODELS, reset=False)

#Extract_Disulfides(numb=1000, pdbdir=PDB_GOOD, datadir=MODELS, verbose=False, quiet=False)

PDB_SS = None
PDB_SS = DisulfideLoader(verbose=True, picklefile=SS_PICKLE_FILE, pickle_dict_file=SS_DICT_PICKLE_FILE,
                        torsion_file=SS_TORSIONS_FILE)

ss_list = DisulfideList([], 'tmp')
PDB_SS.TotalDisulfides


Reading disulfides from: /Users/egs/repos/proteusPy/proteusPy/data/PDB_1000_ss.pkl
Disulfides Read: 8210
Reading disulfide dict from: /Users/egs/repos/proteusPy/proteusPy/data/PDB_1000_ss_dict.pkl
Reading Torsion DF /Users/egs/repos/proteusPy/proteusPy/data/PDB_1000_SS_torsions.csv.
Read torsions DF.
PDB IDs parsed: 1000
Total Space Used: 1969317 bytes.


8210

In [4]:
ss0 = PDB_SS[0]
ss1 = PDB_SS[0]
ss2 = PDB_SS[1]
ss3 = PDB_SS[1]
sslist1 = DisulfideList([ss0, ss2], 'tmp')
sslist1.Torsion_RMS()

219.70970634935495

In [5]:
new = DisulfideList([], 'tmp')
sslist = PDB_SS.getlist()
sslist.min()


<Disulfide 1xr9_203A_259A SourceID: 1xr9 Proximal: 203 A Distal: 259 A>

In [6]:
sslist.max()

<Disulfide 1zjk_629A_660A SourceID: 1zjk Proximal: 629 A Distal: 660 A>

In [7]:

# one disulfide from the database
ss = Disulfide()
ss = PDB_SS[0]
#ss.pprint_all()

# get all disulfides for one structure. Make a 
# DisulfideList object to hold it
ss4yys = DisulfideList([], '4yys')
ss4yys = PDB_SS['4yys']

#ss4crn = DisulfideList([], '1crn')
#ss4crn = PDB_SS['1crn']

tot_ss = len(ss4yys) # number off ssbonds
print(f'tot {tot_ss}')


tot 6


In [8]:
print(f'{ss4yys.Torsion_RMS()}')

120.84848321514998


In [9]:
sslist = PDB_SS.SSList
tors_df = sslist.build_torsion_df()
tors_df.head()

100%|█████████████████████████████████████████████████████████| 8210/8210 [00:06<00:00, 1265.93it/s]


Unnamed: 0,source,ss_id,proximal,distal,chi1,chi2,chi3,chi4,chi5,energy,ca_distance,phi_prox,psi_prox,phi_dist,psi_dist
0,4yys,4yys_22A_65A,22,65,174.629233,82.51771,-83.322249,-62.523644,-73.827286,1.696237,4.502086,128.68679,128.68679,-105.731172,16.431694
1,4yys,4yys_56A_98A,56,98,-50.239063,-85.583916,97.275447,70.535692,179.046592,2.112566,4.967417,-30.489936,-30.489936,-59.378573,125.462589
2,4yys,4yys_156A_207A,156,207,62.598713,172.940042,-95.352637,-23.070934,-55.15848,2.331733,5.292317,150.741801,150.741801,60.457994,22.170381
3,4yys,4yys_22B_65B,22,65,173.666078,88.297996,-82.387276,-65.997032,-72.289506,1.958823,4.532387,130.848015,130.848015,-103.903213,16.298008
4,4yys,4yys_56B_98B,56,98,-56.410909,-81.401941,94.310784,67.035993,178.852441,1.444608,4.768629,-30.319989,-30.319989,-53.571262,120.853837


In [None]:
ss6fuf = PDB_SS['6fuf']


In [4]:
tors = pd.DataFrame()
tors = PDB_SS.getTorsions('')
tors

Unnamed: 0,source,ss_id,proximal,distal,chi1,chi2,chi3,chi4,chi5,energy,ca_distance,phi_prox,psi_prox,phi_dist,psi_dist
0,4yys,4yys_22A_65A,22,65,174.629233,82.517710,-83.322249,-62.523644,-73.827286,1.696237,4.502086,-149.776903,128.686790,-105.731172,16.431694
1,4yys,4yys_56A_98A,56,98,-50.239063,-85.583916,97.275447,70.535692,179.046592,2.112566,4.967417,-113.982916,-30.489936,-59.378573,125.462589
2,4yys,4yys_156A_207A,156,207,62.598713,172.940042,-95.352637,-23.070934,-55.158480,2.331733,5.292317,-156.930659,150.741801,60.457994,22.170381
3,4yys,4yys_22B_65B,22,65,173.666078,88.297996,-82.387276,-65.997032,-72.289506,1.958823,4.532387,-154.247515,130.848015,-103.903213,16.298008
4,4yys,4yys_56B_98B,56,98,-56.410909,-81.401941,94.310784,67.035993,178.852441,1.444608,4.768629,-113.329915,-30.319989,-53.571262,120.853837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291269,6hyg,6hyg_147A_205A,147,205,-164.604280,175.705493,-92.448401,-173.977073,177.722873,1.393562,6.353290,-125.541823,95.031918,-95.698377,117.610765
291270,6hyg,6hyg_306A_366A,306,366,-172.318665,179.769817,-64.037509,-164.413580,160.915343,2.747699,6.373312,-107.308375,127.524606,-98.326949,117.117737
291271,6hyg,6hyg_412A_470A,412,470,-157.757676,162.353491,-99.861949,-179.015120,-171.209923,2.918142,6.326811,-112.781665,103.829309,-90.111053,113.722038
291272,6fwt,6fwt_130A_172A,130,172,-59.385795,-65.896066,95.369762,85.707646,59.797795,1.653349,4.929605,-70.659535,149.321852,-158.811493,166.871203


In [8]:
ss1 = ss4yys.get_by_name('4yys_22A_65A')
ss1

<Disulfide 4yys_22A_65A SourceID: 4yys Proximal: 22 A Distal: 65 A>

In [9]:
Check_chains('4yys', PDB_GOOD)

ssbond dict: {1: ('22', '65', 'A', 'A'), 2: ('56', '98', 'A', 'A'), 3: ('156', '207', 'A', 'A'), 4: ('22', '65', 'B', 'B'), 5: ('56', '98', 'B', 'B'), 6: ('156', '207', 'B', 'B')}
multiple chains. [<Chain id=A>, <Chain id=B>]
Chain: A, length: 504
Chain: B, length: 454
chain lengths are unequal: [504, 454]


False

In [10]:
ss4yys_a = ss4yys.by_chain('A')
ss4yys_a.Torsion_RMS()


171.30285555734537

In [11]:
ss4yys_b = ss4yys.by_chain('B')
ss4yys_b.Torsion_RMS()

170.49799106566022

In [12]:
ss4yys_a1 = ss4yys_a[0]
# print(ss4yys_a1.repr_ss_coords())


In [13]:
ss4yys_b1 = ss4yys_b[0]
ss4yys_b1

<Disulfide 4yys_22B_65B SourceID: 4yys Proximal: 22 B Distal: 65 B>

In [14]:
ss4yys_a1.Distance_RMS(ss4yys_b1)

0.1145026461405659

In [15]:
ss4yys_a1.Torsion_RMS(ss4yys_b1)

7.045760800877229

In [16]:
ss4yys_a1 == ss4yys_b1

False

In [17]:
chns = ss4yys.get_chains()
ss4yys.has_chain('yyy')
chns

{'A', 'B'}

In [71]:
# load SS bonds by PDB ID
ss1 = PDB_SS['4yys']
print(ss1)
print(ss1[0].get_full_id())


[<Disulfide 4yys_22A_65A SourceID: 4yys Proximal: 22 A Distal: 65 A>, <Disulfide 4yys_56A_98A SourceID: 4yys Proximal: 56 A Distal: 98 A>, <Disulfide 4yys_156A_207A SourceID: 4yys Proximal: 156 A Distal: 207 A>, <Disulfide 4yys_22B_65B SourceID: 4yys Proximal: 22 B Distal: 65 B>, <Disulfide 4yys_56B_98B SourceID: 4yys Proximal: 56 B Distal: 98 B>, <Disulfide 4yys_156B_207B SourceID: 4yys Proximal: 156 B Distal: 207 B>]
(('4yys', 0, 'A', (' ', 22, ' ')), ('4yys', 0, 'A', (' ', 65, ' ')))


In [72]:
# you can loop over the IDList list and extract by ID
#
for id in PDB_SS.IDList[:5]:    # just show last 5
    # get the SS bonds for the given ID
    ssb = PDB_SS[id]
    numb_ss = len(ssb)
    print(f'ID: {id} has {numb_ss} Disulfides:')
    for bond in ssb:
        print(bond)
    print('\n')
    

ID: 4yys has 6 Disulfides:
<Disulfide 4yys_22A_65A SourceID: 4yys Proximal: 22 A Distal: 65 A>
<Disulfide 4yys_56A_98A SourceID: 4yys Proximal: 56 A Distal: 98 A>
<Disulfide 4yys_156A_207A SourceID: 4yys Proximal: 156 A Distal: 207 A>
<Disulfide 4yys_22B_65B SourceID: 4yys Proximal: 22 B Distal: 65 B>
<Disulfide 4yys_56B_98B SourceID: 4yys Proximal: 56 B Distal: 98 B>
<Disulfide 4yys_156B_207B SourceID: 4yys Proximal: 156 B Distal: 207 B>


ID: 1j5h has 2 Disulfides:
<Disulfide 1j5h_37A_47A SourceID: 1j5h Proximal: 37 A Distal: 47 A>
<Disulfide 1j5h_88A_93A SourceID: 1j5h Proximal: 88 A Distal: 93 A>


ID: 1mfe has 1 Disulfides:
<Disulfide 1mfe_137L_196L SourceID: 1mfe Proximal: 137 L Distal: 196 L>


ID: 1chv has 4 Disulfides:
<Disulfide 1chv_3S_21S SourceID: 1chv Proximal: 3 S Distal: 21 S>
<Disulfide 1chv_14S_38S SourceID: 1chv Proximal: 14 S Distal: 38 S>
<Disulfide 1chv_42S_53S SourceID: 1chv Proximal: 42 S Distal: 53 S>
<Disulfide 1chv_54S_59S SourceID: 1chv Proximal: 54 S Distal

In [73]:
ss_list = PDB_SS.getlist()
ss0 = ss_list[0]

print(ss0.proximal_residue_fullid)
print(ss0.chi3)
len(ss_list)


In [None]:
# routine creates 2 lists  for left-handed and right-handed disulfides 
ss_list = PDB_SS.getlist()
left_handed = DisulfideList([], 'left_handed')
right_handed = DisulfideList([], 'right_handed')

i = 0

for i in range(len(ss_list)):
    ss = ss_list[i]
    if ss.chi3 < 0:
        left_handed.append(ss)
    else:
        right_handed.append(ss)


print(f'Left Handed: {len(left_handed)}, Right Handed: {len(right_handed)}')



In [19]:
from proteusPy.Disulfide import Disulfide

# make some empty disulfides
ss1 = Disulfide('ss1')
ss2 = Disulfide('ss2')

# make a DisulfideList containing ss1, named 'tmp'
sslist = DisulfideList([ss1], 'tmp')
sslist.append(ss2)

# load the PDB Disulfide database
PDB_SS = None
PDB_SS = DisulfideLoader(verbose=True)

# extract a disulfide with typical index
ss1 = PDB_SS[0]
#print(f'{ss1.pprint_all()}')

# grab a subset via slicing
subset = DisulfideList(PDB_SS[0:10],'subset')

Reading disulfides from: /Users/egs/repos/proteusPy/proteusPy/data/PDB_all_ss.pkl
Disulfides Read: 291274
Reading disulfide dict from: /Users/egs/repos/proteusPy/proteusPy/data/PDB_all_ss_dict.pkl
Reading Torsion DF /Users/egs/repos/proteusPy/proteusPy/data/PDB_all_SS_torsions.csv.
Read torsions DF.
PDB IDs parsed: 35818
Total Space Used: 69858900 bytes.


In [20]:
rms = subset.Torsion_RMS()
rms

106.10335891066757

In [21]:
torsions = PDB_SS.getTorsions()
torsions.sort_values(by=['energy'], ascending=False, inplace=True)

torsions.head(10)

Unnamed: 0,source,ss_id,proximal,distal,chi1,chi2,chi3,chi4,chi5,energy,ca_distance,phi_prox,psi_prox,phi_dist,psi_dist
107709,6vxk,6vxk_801D_806D,801,806,-2.213905,-115.519179,4.513,142.486478,124.44711,19.428531,4.138057,-133.341225,157.790822,-164.261348,148.114156
107692,6vxk,6vxk_801B_806B,801,806,-2.215583,-115.529947,4.533066,142.493351,124.498429,19.426529,4.137166,-133.343733,157.811429,-164.219913,148.065024
221815,3gfb,3gfb_97B_111B,97,111,104.268375,-113.325134,3.845368,-114.476627,-101.93188,18.583219,5.301481,-40.343846,168.867053,-107.940461,138.94566
53512,7n1u,7n1u_15C_136C,15,136,-111.49399,-6.933328,-171.055865,4.03887,-126.874242,18.48469,4.870543,-103.157521,118.490872,-73.256412,144.808566
116895,7xfy,7xfy_6B_130B,6,130,116.872069,135.4788,-160.979158,-118.530478,-116.406268,18.15437,6.871436,-180.0,-180.0,-180.0,-180.0
109766,1toz,1toz_456A_467A,456,467,107.160553,111.293302,-13.243503,110.696801,100.941011,18.12453,5.200228,-85.062012,8.362787,-70.658073,134.346879
195660,3kvq,3kvq_740A_745A,740,745,-106.281591,-11.631489,-2.428372,27.593785,-134.476763,18.086734,4.346198,-180.0,-180.0,-180.0,-180.0
216304,7y9z,7y9z_538C_590C,538,590,-137.623374,108.35009,176.303306,126.589302,-118.193015,17.941133,6.323242,-57.245745,140.192749,-73.030225,155.213162
104153,4twt,4twt_69A_101A,69,101,-116.8812,-127.334031,162.620251,96.225097,-123.922915,17.785911,6.146849,-125.237325,91.341452,-107.766096,125.255489
231784,6xis,6xis_134C_166C,134,166,8.087406,111.778108,28.474407,124.296756,-114.226438,17.672948,6.079529,-48.122713,127.211933,-48.695907,123.235518


In [22]:
idx_max = int(torsions['energy'].idxmax())
print(f'IDMAX: {idx_max}')
ssmax = PDB_SS[idx_max]
ssmax

IDMAX: 107709


<Disulfide 6vxk_801D_806D SourceID: 6vxk Proximal: 801 D Distal: 806 D>

In [23]:
toget = 200

badlist = torsions['ss_id']

bad_SS_list = DisulfideList([],'20 top high energy')
ss = Disulfide()

for i in range(toget):
    ssid = torsions.iloc[i]['ss_id']
    ss = PDB_SS.get_by_name(ssid)
    #print(f'ID: {ss.name}: {ss.energy:.2f} kcal/mol CA: {ss.ca_distance:.2f}')
    bad_SS_list.append(ss)



In [24]:

tors_bad_rms = bad_SS_list.Torsion_RMS()
tors_bad_rms

24.037148943399217

In [25]:
goodlist = torsions['ss_id']

good_SS_list = DisulfideList([],'20 top low energy')
ss = Disulfide()

for i in range(toget):
    if i == 0:
        continue
    
    ssid = torsions.iloc[-i]['ss_id']
    ss = PDB_SS.get_by_name(ssid)
    #print(f'ID: {ss.name}: {ss.energy:.2f} kcal/mol CA: {ss.ca_distance:.2f}')
    good_SS_list.append(ss)


In [26]:

tors_good_rms = good_SS_list.Torsion_RMS()
tors_good_rms

25.158830711756238

In [27]:
from scipy.spatial import distance_matrix

good_array = good_SS_list.get_torsion_array()
bad_array = bad_SS_list.get_torsion_array()
good_array.shape

(199, 5)

In [29]:

dm1 = distance_matrix(good_array, good_array)
dm2 = distance_matrix(bad_array, bad_array)



In [30]:
import numpy as np

def EDM(A, B):
    p1 = np.sum(A**2, axis=1)[:, np.newaxis]
    p2 = np.sum(B**2, axis=1)
    p3 = -2 * np.dot(A, B.T)
    res = p1 + p2 + p3
    #res = numpy.sqrt(res)
    return res


In [31]:
dm3 = np.sqrt(dm2)

In [32]:
import plotly_express as px
fig = px.imshow(dm1)
fig.show()

In [33]:
import plotly_express as px
fig = px.imshow(dm2)
fig.show()

In [49]:
dm1.shape
dm1

array([[  0.        ,   3.98146902,   4.89625831, ..., 359.96200768,
        289.21232645, 386.2962092 ],
       [  3.98146902,   0.        ,   5.00861382, ..., 358.21888342,
        288.00187025, 384.34872479],
       [  4.89625831,   5.00861382,   0.        , ..., 355.91840721,
        284.38066648, 381.73024727],
       ...,
       [359.96200768, 358.21888342, 355.91840721, ...,   0.        ,
        199.58987586, 307.10505379],
       [289.21232645, 288.00187025, 284.38066648, ..., 199.58987586,
          0.        , 173.73791635],
       [386.2962092 , 384.34872479, 381.73024727, ..., 307.10505379,
        173.73791635,   0.        ]])

In [47]:
df = pd.DataFrame(dm1.copy())
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,...,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,370.797923,369.107921,367.253981,441.352273,198.891821,199.823286,200.429604,199.895127,199.473581,467.174659,...,202.855435,344.488041,344.496257,201.402288,289.321879,374.572644,202.470702,374.739919,270.525633,203.734504
std,151.730243,151.659649,151.452169,107.466313,165.924875,168.513269,168.492,168.444988,168.23142,143.830044,...,164.181972,147.78462,147.782875,165.18836,89.102764,132.070619,168.57472,144.992705,122.777717,168.364872
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,357.9145,357.174566,355.414619,423.632376,15.628424,13.373077,14.007832,13.758405,13.170157,374.60494,...,21.867962,229.546509,229.55619,19.839802,232.124044,264.433636,17.370074,309.319381,176.927947,19.367914
50%,376.318753,374.412439,371.760117,433.04579,261.258935,256.104441,258.987227,256.593358,256.242105,386.703382,...,250.554881,337.400287,337.389452,258.380586,243.151617,326.030742,254.12965,318.476436,199.71563,255.905498
75%,387.382421,385.363441,382.843061,502.990245,364.140128,368.941057,370.014884,369.017756,369.381733,609.410483,...,363.643017,493.807999,493.820525,366.377578,356.527918,468.485816,371.654723,497.081256,357.380415,374.409674
max,714.579465,712.145326,710.45167,625.979218,469.405995,476.217227,477.497441,475.468247,477.600731,714.532172,...,467.283251,704.741238,704.751848,468.752814,541.459055,625.979218,485.127903,706.406747,548.283437,487.718453


In [50]:
x = np.arange(0,20, 1)
y = np.arange(0,20, 1)
z = dm1
x, y = np.meshgrid(x, y)


In [51]:
# Create and plot structured grid
grid = pv.StructuredGrid(x, y, z)
grid.plot()

ValueError: Input point array shapes must match exactly