In [1]:
# DisulfideBond Playground
# Playing with the DisulfideBond class
# Author: Eric G. Suchanek, PhD.
# (c) 2023 Eric G. Suchanek, PhD., All Rights Reserved
# License: MIT
# Last Modification: 1/30/23
# Cα Cβ Sγ

import pandas as pd

import pyvista as pv
from pyvista import set_plot_theme

from Bio.PDB import *

# proteusPy imports, used here directly from the repo checkout
import proteusPy
from proteusPy import *
from proteusPy.data import *
from proteusPy.Disulfide import *
from proteusPy.DisulfideList import DisulfideList, load_disulfides_from_id

# override any default PDB globals
# location for PDB repository
PDB_ROOT = '/Users/egs/PDB/'

# location of cleaned PDB files - these are not stored in the repo
PDB_GOOD = '/Users/egs/PDB/good/'

# location of the compressed Disulfide .pkl files
MODELS = f'{PDB_ROOT}data/'

# pyvista setup for notebooks
pv.set_jupyter_backend('ipyvtklink')
#set_plot_theme('dark')


In [2]:
from proteusPy.Disulfide import Disulfide, check_header_from_file

PDB_DIR = '/Users/egs/PDB/good/'
OK = False
OK = check_header_from_file(f'{PDB_DIR}pdb6z9g.ent', verbose=True)
OK
    

-> check_header_from_file() - Parsing file: /Users/egs/PDB/good/pdb6z9g.ent:
 -> SSBond: 1: tmp: 78B - 492B
 -> SSBond: 2: tmp: 78D - 492D
 -> SSBond: 3: tmp: 78F - 492F
 -> SSBond: 4: tmp: 78H - 492H


True

In [6]:
from proteusPy.DisulfideList import DisulfideList
_SSlist = DisulfideList([],'ss','22')
newss = load_disulfides_from_id('6z9g', verbose=False)
newss.display()



ViewInteractiveWidget(height=1024, layout=Layout(height='auto', width='100%'), width=1024)

In [None]:
def extract_firstchain_ss(sslist: DisulfideList, verbose=False) -> tuple:
    '''
    Extract the disulfides belonging to the first chain found in ``sslist``.

    The "first chain" is the proximal chain ID of the first disulfide in the
    list. A disulfide is kept when its proximal chain matches that ID, so a
    cross-chain disulfide whose proximal residue sits on the first chain is
    kept as well.

    :param sslist: Starting SS list
    :param verbose: If True, print every cross-chain disulfide encountered
    :return: Tuple ``(res, xchain)``: the SS list for the first chain and the
             number of cross-chain disulfides counted over the whole input
    '''
    # NOTE(review): the new list is named after sslist.id -- confirm that
    # DisulfideList exposes `id` (as opposed to `pdb_id`).
    res = DisulfideList([], sslist.id)
    xchain = 0

    members = list(sslist)
    # nothing to select from: return the empty result instead of failing on
    # the lookup of the first element
    if not members:
        return res, xchain

    # the first disulfide's proximal chain defines which chain is kept
    chain = members[0].proximal_chain

    for ss in members:
        if ss.proximal_chain != ss.distal_chain:
            xchain += 1
            if verbose:
                print(f'Cross chain ss: {ss}')
        if ss.proximal_chain == chain:
            res.append(ss)

    return res, xchain

def prune_extra_ss(sslist: DisulfideList):
    '''
    Given a list of disulfides, keep only the disulfides from the first chain
    and return an independent (deep) copy of that reduced list.

    :param sslist: input DisulfideList, possibly spanning several chains
    :return: Tuple ``(pruned_list, xchain)``: deep copy of the first-chain
             disulfides and the cross-chain count reported by
             extract_firstchain_ss()
    '''
    # local import: the notebook never imports copy explicitly
    import copy

    pruned_list, xchain = extract_firstchain_ss(sslist)

    return copy.deepcopy(pruned_list), xchain


In [None]:
# SSlist is not defined anywhere in the visible notebook; newss (loaded for
# 6z9g in an earlier cell) is the populated list available at this point
chain1 = extract_firstchain_ss(newss)
chain1

In [None]:
# SSlist is not defined anywhere in the visible notebook; newss (loaded for
# 6z9g in an earlier cell) is the populated list available at this point
chain2 = prune_extra_ss(newss)
chain2

In [4]:
# Comment these out since they take so long.
# Download_Disulfides(pdb_home=PDB_ORIG, model_home=MODELS, reset=False)

#Extract_Disulfides(numb=1000, pdbdir=PDB_GOOD, datadir=MODELS, verbose=False, quiet=False)

PDB_SS = None
#PDB_SS = DisulfideLoader(verbose=True, picklefile=SS_PICKLE_FILE, pickle_dict_file=SS_DICT_PICKLE_FILE,
#                        torsion_file=SS_TORSIONS_FILE)

PDB_SS = DisulfideLoader(verbose=True, subset=False)

ss_list = DisulfideList([], 'tmp')
PDB_SS.TotalDisulfides
ss = PDB_SS[0]
#ss.display(style='bs')


Reading disulfides from: /Users/egs/repos/proteusPy/proteusPy/data/PDB_all_ss.pkl
Disulfides Read: 120697
Reading disulfide dict from: /Users/egs/repos/proteusPy/proteusPy/data/PDB_all_ss_dict.pkl
Reading Torsion DF /Users/egs/repos/proteusPy/proteusPy/data/PDB_all_SS_torsions.csv.
Read torsions DF.
PDB IDs parsed: 35819
Total Space Used: 30678733 bytes.


In [None]:
# NOTE(review): other cells fetch a structure's disulfides with PDB_SS['<id>'];
# confirm that getdict() accepts a PDB ID argument.
sslist_6z = PDB_SS.getdict('6z9g')
# prune_extra_ss() returns (pruned_list, cross_chain_count); unpack it so the
# list itself is printed rather than the tuple
pruned, xchain = prune_extra_ss(sslist_6z)
print(f'{pruned}')

In [5]:
ss.display()

ViewInteractiveWidget(height=1024, layout=Layout(height='auto', width='100%'), width=1024)

In [None]:
# Given the full dictionary, walk through all the keys (PDB IDs).
# For each PDB ID's SS list, extract the SS for the first chain and
# store the now shorter SS list in the 'pruned' dict.

_PBAR_COLS = 105
ssdict = PDB_SS.SSDict

tot = len(ssdict)

# start empty so that only real PDB IDs end up in the dict (and in its pickle);
# a placeholder entry would inflate len(pruned_dict) by one
pruned_dict = {}

xchain_tot = 0
removed_tot = 0

# walk the dict, prune the SS list. This takes > 8 minutes on my Macbook Pro
# for the full dataset.
for pdbid in tqdm(ssdict, ncols=_PBAR_COLS):
    sslist = ssdict[pdbid]
    pruned, xchain = prune_extra_ss(sslist)
    removed_tot += len(sslist) - len(pruned)
    xchain_tot += xchain
    pruned_dict[pdbid] = pruned

print(f'Pruned {removed_tot}, Xchain: {xchain_tot}')


In [None]:
len(pruned_dict)

In [None]:
# dump the pruned disulfide dict (keyed by PDB ID) to a .pkl file.
# NOTE(review): the previous comment quoted ~520 MB, apparently copied from the
# all_ss dump -- re-measure for this file.
datadir = '/Users/egs/PDB/data/'
picklefile = 'PDB_SS_pruned_dict.pkl'

fname = f'{datadir}{picklefile}'

with open(fname, 'wb+') as f:
    pickle.dump(pruned_dict, f)



In [None]:
_PBAR_COLS = 105

# flatten the per-ID pruned lists into one single DisulfideList
pruned_list = DisulfideList([], 'PDB_SS_SINGLE_CHAIN')

for pdbid in tqdm(pruned_dict, ncols=_PBAR_COLS):
    pruned_list.extend(pruned_dict[pdbid])

# len() instead of the bare .length attribute: other cells call length() as a
# method, in which case the bare attribute would print a bound method, not a count
print(f'Total SS: {len(pruned_list)}')

# dump the flattened, pruned list of disulfides to a .pkl file.
datadir = '/Users/egs/PDB/data/'
picklefile = 'PDB_SS_pruned_list.pkl'

fname = f'{datadir}{picklefile}'

with open(fname, 'wb+') as f:
    pickle.dump(pruned_list, f)



In [None]:
datadir = '/Users/egs/PDB/data/'
torsfile = 'PDB_SS_pruned_torsions.csv'
fname = f'{datadir}{torsfile}'

# build_torsion_df() supplies the frame itself, so no empty placeholder
# DataFrame is created first
tors_df = pruned_list.build_torsion_df()

tors_df.to_csv(fname)



In [None]:


sslist = PDB_SS.SSList
LHS_neighbors = sslist.nearest_neighbors(-60, -60, -90, -60, -60, 10.0)
LHS_neighbors.length()
# modelss = Disulfide('model', proximal=1, distal=2, pdb_id='EGS')
# modelss.build_model(-60,-60, -90, -60, -60)


In [None]:

# We use the entire database contained in PDB_SS.SSList for our search here.
# The entire database can be scanned in 13 seconds on the M1 Pro Macbook Pro.

ssmin_enrg, ssmax_enrg = PDB_SS.SSList.minmax_energy()
ssmin_enrg.pprint()
ssmax_enrg.pprint()
minmax = DisulfideList([ssmin_enrg, ssmax_enrg], 'minmax')
#minmax.display()


In [None]:
ssmin_enrg.Torsion_Distance(ssmax_enrg)

In [None]:
ssmin_enrg.Torsion_Distance(ssmin_enrg)

In [None]:
import proteusPy
from proteusPy.DisulfideLoader import DisulfideLoader
from proteusPy.DisulfideList import DisulfideList
from proteusPy.Disulfide import Disulfide

PDB_SS = None
PDB_SS = DisulfideLoader(verbose=False, subset=True)

ss_list = DisulfideList([], 'tmp')
sslist = PDB_SS.SSList
ssmin_enrg, ssmax_enrg = PDB_SS.SSList.minmax_energy()

low_energy_neighbors = DisulfideList([],'Neighbors')
low_energy_neighbors = ssmin_enrg.Torsion_neighbors(sslist, 10)

tot = low_energy_neighbors.length()
print(f'Neighbors: {tot}')

low_energy_neighbors.display_overlay()

In [None]:
drms = low_energy_neighbors.Avg_Distance()
trms = low_energy_neighbors.Avg_Torsion_Distance()
erms = low_energy_neighbors.Avg_Energy()

print(f'Low Energy Neighbors: {tot}, Distance: {drms:.2f} Torsions: {trms:.2f}, Energy: {erms:.2f}')



In [None]:
low_energy_neighbors.display_overlay()

In [None]:
high_energy_neighbors = DisulfideList([],'neighbors2')
high_energy_neighbors = ssmax_enrg.Torsion_neighbors(sslist, 20)

tot2 = high_energy_neighbors.length()
print(f'Neighbors: {tot2}')

In [None]:
drms = high_energy_neighbors.Avg_Distance()
trms = high_energy_neighbors.Avg_Torsion_Distance()
erms = high_energy_neighbors.Avg_Energy()

# tot2 is the high-energy neighbor count; tot holds the low-energy count
print(f'High Energy Neighbors: {tot2}, Distance: {drms:.2f} Torsions: {trms:.2f}, Energy: {erms:.2f}')



In [None]:
high_energy_neighbors.display_overlay()


In [None]:
ssmin = Disulfide()
ssmax = Disulfide()

ssmin, ssmax = sslist.minmax_distance()
ssmin.pprint()
ssmax.pprint()

minmax = DisulfideList([ssmin, ssmax], 'minmax')
minmax.display()

In [None]:

# one disulfide from the database
ss = Disulfide()
ss = PDB_SS[0]
#ss.pprint_all()

# get all disulfides for one structure. Make a 
# DisulfideList object to hold it
ss4yys = DisulfideList([], '4yys')
ss4yys = PDB_SS['4yys']

#ss4crn = DisulfideList([], '1crn')
#ss4crn = PDB_SS['1crn']

tot_ss = len(ss4yys) # number off ssbonds
print(f'tot {tot_ss}')


In [None]:
print(f'{ss4yys.Torsion_Distance()}')

In [None]:
sslist = PDB_SS.SSList
tors= sslist.torsion_df
tors.head()

In [None]:
ca_df = sslist.distance_df
ca_df.head(10)

In [None]:
ss6fuf = PDB_SS['6fuf']


In [None]:
ss1 = ss4yys.get_by_name('4yys_22A_65A')
ss1

In [None]:
Check_chains('4yys', PDB_GOOD)

In [None]:
ss4yys_a = ss4yys.by_chain('A')
ss4yys_a.Torsion_Distance()


In [None]:
ss4yys_b = ss4yys.by_chain('B')
ss4yys_b.Torsion_Distance()

In [None]:
ss4yys_a1 = ss4yys_a[0]
# print(ss4yys_a1.repr_ss_coords())


In [None]:
ss4yys_b1 = ss4yys_b[0]
ss4yys_b1

In [None]:
ss4yys_a1.Distance_RMS(ss4yys_b1)

In [None]:
ss4yys_a1.Torsion_Distance(ss4yys_b1)

In [None]:
ss4yys_a1 == ss4yys_b1

In [None]:
chns = ss4yys.get_chains()
ss4yys.has_chain('yyy')
chns

In [None]:
# load SS bonds by PDB ID
ss1 = PDB_SS['4yys']
print(ss1)
print(ss1[0].get_full_id())


In [None]:
# you can loop over the IDList list and extract by ID
#
# (loop variable is pdbid rather than id, so the builtin id() is not shadowed
# for the rest of the notebook session)
for pdbid in PDB_SS.IDList[:5]:    # just show the first 5
    # get the SS bonds for the given ID
    ssb = PDB_SS[pdbid]
    numb_ss = len(ssb)
    print(f'ID: {pdbid} has {numb_ss} Disulfides:')
    for bond in ssb:
        print(bond)
    print('\n')
    

In [None]:
ss_list = PDB_SS.getlist()
ss0 = ss_list[0]

print(ss0.proximal_residue_fullid)
print(ss0.chi3)
len(ss_list)


In [None]:
# Split the full disulfide list into two lists by handedness.
ss_list = PDB_SS.getlist()
left_handed = DisulfideList([], 'left_handed')
right_handed = DisulfideList([], 'right_handed')

# the sign of chi3 decides the bucket: negative -> left, otherwise right
for ss in ss_list:
    bucket = left_handed if ss.chi3 < 0 else right_handed
    bucket.append(ss)

print(f'Left Handed: {len(left_handed)}, Right Handed: {len(right_handed)}')



In [None]:
from proteusPy.Disulfide import Disulfide

# make some empty disulfides
ss1 = Disulfide('ss1')
ss2 = Disulfide('ss2')

# make a DisulfideList containing ss1, named 'tmp'
sslist = DisulfideList([ss1], 'tmp')
sslist.append(ss2)

# load the PDB Disulfide database
PDB_SS = None
PDB_SS = DisulfideLoader(verbose=True, subset=True)

# extract a disulfide with typical index
ss1 = PDB_SS[0]
#print(f'{ss1.pprint_all()}')

# grab a subset via slicing
subset = DisulfideList(PDB_SS[0:10],'subset')

In [None]:
rms = subset.Torsion_Distance()
rms

In [None]:
torsions = PDB_SS.getTorsions()
torsions.sort_values(by=['energy'], ascending=False, inplace=True)

torsions.head(10)

In [None]:
idx_max = int(torsions['energy'].idxmax())
print(f'IDMAX: {idx_max}')
ssmax = PDB_SS[idx_max]
ssmax

In [None]:
toget = 200

# torsions was sorted by descending energy in an earlier cell, so its first
# rows are the highest-energy disulfides. The list label follows toget so it
# cannot drift from the number actually collected.
bad_SS_list = DisulfideList([], f'{toget} top high energy')

for i in range(toget):
    ssid = torsions.iloc[i]['ss_id']
    ss = PDB_SS.get_by_name(ssid)
    #print(f'ID: {ss.name}: {ss.energy:.2f} kcal/mol CA: {ss.ca_distance:.2f}')
    bad_SS_list.append(ss)



In [None]:

tors_bad_rms = bad_SS_list.Torsion_Distance()
tors_bad_rms

In [None]:
# torsions is sorted by descending energy, so the lowest energies sit at the
# tail. The list label follows toget so it matches the number collected.
good_SS_list = DisulfideList([], f'{toget} top low energy')

# walk backwards from the last row: iloc[-1] .. iloc[-toget], i.e. exactly
# toget rows (starting the range at 0 and skipping it yielded only toget - 1)
for i in range(1, toget + 1):
    ssid = torsions.iloc[-i]['ss_id']
    ss = PDB_SS.get_by_name(ssid)
    #print(f'ID: {ss.name}: {ss.energy:.2f} kcal/mol CA: {ss.ca_distance:.2f}')
    good_SS_list.append(ss)


In [None]:

tors_good_rms = good_SS_list.Torsion_Distance()
tors_good_rms

In [None]:
from scipy.spatial import distance_matrix

good_array = good_SS_list.get_torsion_array()
bad_array = bad_SS_list.get_torsion_array()
good_array.shape

In [None]:

dm1 = distance_matrix(good_array, good_array)
dm2 = distance_matrix(bad_array, bad_array)



In [None]:
import numpy as np

def EDM(A, B):
    '''
    Return the matrix of squared Euclidean distances between rows of A and B.

    Uses the expansion |a - b|^2 = |a|^2 + |b|^2 - 2 a.b, so no explicit loop
    over row pairs is needed. The result is NOT square-rooted; apply np.sqrt()
    to it to obtain actual distances.

    :param A: 2-D array of shape (m, d)
    :param B: 2-D array of shape (n, d)
    :return: (m, n) array; entry [i, j] is the squared distance between
             A[i] and B[j]
    '''
    sq_a = np.sum(A**2, axis=1)[:, np.newaxis]
    sq_b = np.sum(B**2, axis=1)
    cross = -2 * np.dot(A, B.T)
    # floating-point cancellation can leave tiny negative values (typically on
    # the diagonal of EDM(A, A)); clamp them so a later np.sqrt() cannot give NaN
    return np.maximum(sq_a + sq_b + cross, 0)


In [None]:
# NOTE(review): dm2 is produced by scipy's distance_matrix() in an earlier cell,
# not by EDM() -- confirm that taking a square root of it is intended.
dm3 = np.sqrt(dm2)

In [None]:
import plotly_express as px
fig = px.imshow(dm1)
fig.show()

In [None]:
import plotly_express as px
fig = px.imshow(dm2)
fig.show()

In [None]:
dm1.shape
dm1

In [None]:
df = pd.DataFrame(dm1.copy())
df.describe()

In [None]:
# one coordinate per column/row of dm1, so that after meshgrid x, y and z all
# share dm1's shape (np.arange(0, 20, 20) produced a single point only);
# float dtype keeps the coordinates in the same precision family as z
x = np.arange(dm1.shape[1], dtype=float)
y = np.arange(dm1.shape[0], dtype=float)
z = dm1
x, y = np.meshgrid(x, y)


In [None]:
# Create and plot structured grid
grid = pv.StructuredGrid(x, y, z)
grid.plot()

In [None]:
def doit(tot_ss):
    '''
    Scale BOND_RADIUS down linearly with ``tot_ss``.

    :param tot_ss: numeric scale input (presumably a disulfide count -- confirm)
    :return: BOND_RADIUS * (1 - (tot_ss / 10) / 50)
    '''
    shrink = (tot_ss / 10) / 50
    return BOND_RADIUS * (1 - shrink)



In [None]:
doit(51)

In [None]:
import proteusPy
from proteusPy import DisulfideLoader
from proteusPy.Disulfide import Disulfide
from proteusPy.DisulfideList import DisulfideList

PDB_SS = DisulfideLoader(verbose=True, subset=True)



In [None]:
# Search the first 200 disulfides for a dihedral family around ss1, passing a
# cutoff of 30.0.
# NOTE(review): an earlier cell wraps a PDB_SS slice in DisulfideList(...)
# before calling list methods -- confirm PDB_SS[:200] already returns a
# DisulfideList that provides find_dihedral_family().
ss1 = PDB_SS[0]
sslist = PDB_SS[:200]
family = sslist.find_dihedral_family(ss1, 30.0)
family