# Map S-sulphenylated peptide fragments on 3D Structure

The goal of this study is to systematically map the positions of S-sulphenylation of proteins onto 3D protein structures in the Protein Data Bank.

Data source:

Gupta V, Yang J, Liebler DC, Carroll KS. Diverse Redoxome Reactivity Profiles 
of Carbon Nucleophiles. J Am Chem Soc. 2017 Apr 19;139(15):5588-5595. [doi:
10.1021/jacs.7b01791](https://doi.org/10.1021/jacs.7b01791)

Excerpt from abstract:
Analysis of sulfenic acid-reactive C-nucleophile fragments screened against a colon cancer cell proteome. Covalent ligands were identified for >1280 S-sulfenylated cysteines present in "druggable" proteins and orphan targets, revealing disparate reactivity profiles and target preferences. 

In [1]:
import requests
from io import BytesIO
import xlrd
import pandas as pd
import numpy as np
import py3Dmol
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, substring_index,regexp_extract
from mmtfPyspark.datasets import pdbToUniProt
from ipywidgets import interact, IntSlider, widgets

In [2]:
# Start (or reuse) the Spark session used for all Spark DataFrame operations below
spark = (
    SparkSession.builder
    .appName("S-Sulphenylation")
    .getOrCreate()
)

In [3]:
# dataset = {'BTD':  'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_002.xlsx', 
#            'DYn-2':'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_003.xlsx',
#            'PRD':  'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_004.xlsx',
#            'PYD':  'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_005.xlsx',
#            'TD':   'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_006.xlsx'}


In [4]:
# dataset name -> (URL of the supplementary spreadsheet,
#                  prefix that precedes each residue position in its "Modified Sites" column)
dataset = dict([
    ('BTD', (
        'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_002.xlsx',
        '418@C',
    )),
    ('DYn-2', (
        'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_003.xlsx',
        '333@C',
    )),
    ('PRD', (
        'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_004.xlsx',
        '333@C',
    )),
    ('PYD', (
        'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_005.xlsx',
        '333@C',
    )),
    ('TD', (
        'https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_006.xlsx',
        '333@C',
    )),
])

In [5]:
# Toggle buttons offering one option per entry of `dataset`
w = widgets.ToggleButtons(
    description='Dataset:',
    options=list(dataset),
    disabled=False,
)

## Select dataset

In [6]:
# render the dataset selector; the selected value is read in the following cell
display(w)

ToggleButtons(description='Dataset:', options=('BTD', 'DYn-2', 'PRD', 'PYD', 'TD'), value='BTD')

In [7]:
# look up the download URL and the modification prefix of the selected dataset
url, mod_string = dataset[w.value]
print(url, mod_string)

https://pubs.acs.org/doi/suppl/10.1021/jacs.7b01791/suppl_file/ja7b01791_si_002.xlsx 418@C


### Read dataset from supplementary data excel file

In [8]:
# download the supplementary spreadsheet (requests follows redirects by default);
# the timeout keeps the cell from hanging forever on a stalled connection
req = requests.get(url, timeout=60)
# fail here on an HTTP error instead of handing an error page to the Excel parser
req.raise_for_status()
df = pd.read_excel(BytesIO(req.content), sheet_name='Protein View')
print("Dataset:", w.value)
df.head()

Dataset: BTD


Unnamed: 0,Accession,Cluster,Count,Coverage,Protein Group,Gene Id,Gene Name,Chromosome,Gene Family,Distinct Peptides,Distinct Matches,Filtered Spectra,Description,Modified Sites
0,"NP_008835.5|,NP_001075109.1|",56.0,2.0,7.221987,68.0,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,26.0,41.0,73.0,|PRKDC|5591|P78527|DNA-dependent protein kinas...,"418@C25,418@C4061,418@C4030,418@C2469,418@C190..."
1,NP_008835.5|,,,7.194767,,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,,,,|PRKDC|5591|P78527|DNA-dependent protein kinas...,"418@C25,418@C223,418@C232,418@C373,418@C458,41..."
2,NP_001075109.1|,,,7.249207,,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,,,,|PRKDC|5591|P78527|DNA-dependent protein kinas...,"418@C25,418@C223,418@C232,418@C373,418@C458,41..."
3,"NP_001447.2|,NP_001104026.1|",249.0,2.0,12.561512,308.0,FLNA,"filamin A, alpha",Xq28,,16.0,22.0,41.0,|FLNA|2316|P21333;Q60FE5;Q6NXF2|filamin-A isof...,"418@C1018,418@C2152,418@C2160,418@C2535,418@C2..."
4,NP_001447.2|,,,12.580523,,FLNA,"filamin A, alpha",Xq28,,,,,|FLNA|2316|P21333;Q60FE5;Q6NXF2|filamin-A isof...,"418@C53,418@C478,418@C574,418@C1018,418@C1157,..."


### Standardize representation of protein modification
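The spreadsheet lists each modified residue as `<delta mass>@<amino acid><position>`, e.g. `418@C25` in the BTD dataset (the other datasets use the prefix `333@C`). Below, this dataset-specific prefix is stripped so that `modSites` holds only the residue positions, and the name of the selected dataset is stored in the `ptms` column.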

In [9]:
# tag every row with the name of the selected dataset (the scalar is broadcast to all rows)
df = df.assign(ptms=w.value)
# strip the modification prefix so only the comma-separated residue positions remain
df['modSites'] = df['Modified Sites'].astype(str).str.replace(mod_string, '', regex=False)
df.head()

Unnamed: 0,Accession,Cluster,Count,Coverage,Protein Group,Gene Id,Gene Name,Chromosome,Gene Family,Distinct Peptides,Distinct Matches,Filtered Spectra,Description,Modified Sites,ptms,modSites
0,"NP_008835.5|,NP_001075109.1|",56.0,2.0,7.221987,68.0,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,26.0,41.0,73.0,|PRKDC|5591|P78527|DNA-dependent protein kinas...,"418@C25,418@C4061,418@C4030,418@C2469,418@C190...",BTD,"25,4061,4030,2469,1904,3837,3806,1128,1127,234..."
1,NP_008835.5|,,,7.194767,,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,,,,|PRKDC|5591|P78527|DNA-dependent protein kinas...,"418@C25,418@C223,418@C232,418@C373,418@C458,41...",BTD,"25,223,232,373,458,457,478,491,1128,1127,1183,..."
2,NP_001075109.1|,,,7.249207,,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,,,,|PRKDC|5591|P78527|DNA-dependent protein kinas...,"418@C25,418@C223,418@C232,418@C373,418@C458,41...",BTD,"25,223,232,373,458,457,478,491,1128,1127,1183,..."
3,"NP_001447.2|,NP_001104026.1|",249.0,2.0,12.561512,308.0,FLNA,"filamin A, alpha",Xq28,,16.0,22.0,41.0,|FLNA|2316|P21333;Q60FE5;Q6NXF2|filamin-A isof...,"418@C1018,418@C2152,418@C2160,418@C2535,418@C2...",BTD,"1018,2152,2160,2535,2543,2191,2199,1157,1453,5..."
4,NP_001447.2|,,,12.580523,,FLNA,"filamin A, alpha",Xq28,,,,,|FLNA|2316|P21333;Q60FE5;Q6NXF2|filamin-A isof...,"418@C53,418@C478,418@C574,418@C1018,418@C1157,...",BTD,"53,478,574,1018,1157,1165,1402,1453,1715,1989,..."


In [10]:
# cast the free-text columns to str before handing the frame to Spark
# (presumably so columns containing NaN get a single string type -- TODO confirm)
text_columns = ['Gene Family', 'Gene Name', 'Chromosome', 'Description', 'Modified Sites']
df[text_columns] = df[text_columns].astype(str)
ds = spark.createDataFrame(df)

In [11]:
# "Description" has the form "|<gene name>|<gene id>|<UniProt id(s)>|<protein name>".
# split() takes a regular expression, so the pipe must be escaped; the raw string
# avoids Python's invalid-escape-sequence warning for "\|".
ds = ds.withColumn('Description', split(ds.Description, r"\|"))
ds = ds.withColumn("unpName", ds.Description.getItem(1))
# The UniProt field may list several accessions separated by ';'
# (e.g. "P21333;Q60FE5;Q6NXF2"). Emit one row per accession so each one can be
# matched by the equality join on unpId below; otherwise these proteins never match.
# Note: explode() drops rows that have no UniProt field at all (they could not be joined anyway).
ds = ds.withColumn("unpId", explode(split(ds.Description.getItem(3), ";")))
ds = ds.withColumn("name", ds.Description.getItem(4))
# one row per modified residue position
ds = ds.withColumn('modSites', explode(split(ds.modSites, ",")))
ds = ds.drop('Modified Sites')
ds.limit(5).toPandas()

Unnamed: 0,Accession,Cluster,Count,Coverage,Protein Group,Gene Id,Gene Name,Chromosome,Gene Family,Distinct Peptides,Distinct Matches,Filtered Spectra,Description,ptms,modSites,unpName,unpId,name
0,"NP_008835.5|,NP_001075109.1|",56.0,2.0,7.221987,68.0,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,26.0,41.0,73.0,"[, PRKDC, 5591, P78527, DNA-dependent protein ...",BTD,25,PRKDC,P78527,DNA-dependent protein kinase catalytic subunit...
1,"NP_008835.5|,NP_001075109.1|",56.0,2.0,7.221987,68.0,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,26.0,41.0,73.0,"[, PRKDC, 5591, P78527, DNA-dependent protein ...",BTD,4061,PRKDC,P78527,DNA-dependent protein kinase catalytic subunit...
2,"NP_008835.5|,NP_001075109.1|",56.0,2.0,7.221987,68.0,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,26.0,41.0,73.0,"[, PRKDC, 5591, P78527, DNA-dependent protein ...",BTD,4030,PRKDC,P78527,DNA-dependent protein kinase catalytic subunit...
3,"NP_008835.5|,NP_001075109.1|",56.0,2.0,7.221987,68.0,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,26.0,41.0,73.0,"[, PRKDC, 5591, P78527, DNA-dependent protein ...",BTD,2469,PRKDC,P78527,DNA-dependent protein kinase catalytic subunit...
4,"NP_008835.5|,NP_001075109.1|",56.0,2.0,7.221987,68.0,PRKDC,"protein kinase, DNA-activated, catalytic polyp...",8q11,,26.0,41.0,73.0,"[, PRKDC, 5591, P78527, DNA-dependent protein ...",BTD,1904,PRKDC,P78527,DNA-dependent protein kinase catalytic subunit...


## Get PDB to UniProt Residue Mappings

Download PDB to UniProt mappings and filter out residues that were not observed in the 3D structure.

In [12]:
# cached residue-level PDB <-> UniProt mappings, keeping only rows that have a PDB residue number
up = (
    pdbToUniProt
    .get_cached_residue_mappings()
    .filter("pdbResNum IS NOT NULL")
)

In [13]:
# keep the mapping rows whose UniProt accession and UniProt residue number match a modified site
st = up.join(
    ds,
    on=(up.uniprotId == ds.unpId) & (up.uniprotNum == ds.modSites),
)

In [14]:
def view_modifications(df, cutoff_distance, *args):
    """Interactively browse modified residues on their 3D structures.

    Builds a slider over the rows of ``df``. For the selected row the structure
    is loaded with py3Dmol, the modified residue(s) are highlighted, and the
    residues selected by expanding around them by ``cutoff_distance`` are shown
    as sticks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mapped site. Must provide the columns ``structureChainId``
        (formatted ``"<pdbId>.<chainId>"``), ``pdbResNum`` and ``ptms``.
        ``pdbResNum`` and ``ptms`` may each hold a single value or a list of values.
    cutoff_distance : number
        Value passed as the ``expand`` property of the 3Dmol selection used to
        pick the surroundings of the modified residue(s).
    *args : str
        Names of additional columns of ``df`` to print for the selected row.

    Returns
    -------
    The result of ``interact`` applied to the inner ``view3d`` function.

    Raises
    ------
    ValueError
        If ``df`` has no rows (there is nothing the slider could select).
    """
    if len(df) == 0:
        raise ValueError("no rows to display: the dataframe is empty")

    def as_list(value):
        # A string is iterable, but here it is ONE residue number / ONE label,
        # not a sequence of characters, so wrap scalars and strings in a list.
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            return [value]
        return list(value)

    def view3d(show_labels=True,show_bio_assembly=False, show_surface=False, i=0):
        row = df.iloc[i]
        pdb_id, chain_id = row['structureChainId'].split('.')
        res_num = row['pdbResNum']
        labels = row['ptms']

        # print header
        print("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe
        # (str() so numeric columns can be printed as well)
        for a in args:
            print(a + ": " + str(row[a]))

        mod_res = {'chain': chain_id, 'resi': res_num}

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': cutoff_distance}

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'color': 'spectrum', 'width': 0.6, 'opacity':0.8}})
        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # style for modifications
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})
        viewer.addStyle(mod_res, {'stick':{'colorscheme':'redCarbon', 'radius': 0.4}})
        viewer.addStyle(mod_res, {'sphere':{'colorscheme':'gray', 'opacity': 0.7}})

        # set residue labels
        if show_labels:
            residues = as_list(res_num)
            ptm_labels = as_list(labels)
            # a single label applies to every residue of the row
            if len(ptm_labels) == 1:
                ptm_labels = ptm_labels * len(residues)
            for residue, label in zip(residues, ptm_labels):
                viewer.addLabel(str(residue) + ": " + str(label),
                                {'fontColor':'black', 'fontSize': 8, 'backgroundColor': 'lightgray'},
                                {'chain': chain_id, 'resi': residue})

        viewer.zoomTo(surroundings)

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'})

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_labels=True, show_bio_assembly=False, show_surface=False, i=s_widget)

In [15]:
# collect the joined Spark DataFrame into pandas; the interactive viewer below indexes it by row
sp = st.toPandas()
sp.head()

Unnamed: 0,structureChainId,pdbResNum,pdbSeqNum,uniprotId,uniprotNum,Accession,Cluster,Count,Coverage,Protein Group,...,Gene Family,Distinct Peptides,Distinct Matches,Filtered Spectra,Description,ptms,modSites,unpName,unpId,name
0,1UCH.A,95,95,P15374,95,NP_005993.1|,752.0,1.0,8.695652,949.0,...,,1.0,1.0,2.0,"[, UCHL3, 7347, P15374, ubiquitin carboxyl-ter...",BTD,95,UCHL3,P15374,ubiquitin carboxyl-terminal hydrolase isozyme ...
1,1XD3.A,95,95,P15374,95,NP_005993.1|,752.0,1.0,8.695652,949.0,...,,1.0,1.0,2.0,"[, UCHL3, 7347, P15374, ubiquitin carboxyl-ter...",BTD,95,UCHL3,P15374,ubiquitin carboxyl-terminal hydrolase isozyme ...
2,1XD3.C,95,95,P15374,95,NP_005993.1|,752.0,1.0,8.695652,949.0,...,,1.0,1.0,2.0,"[, UCHL3, 7347, P15374, ubiquitin carboxyl-ter...",BTD,95,UCHL3,P15374,ubiquitin carboxyl-terminal hydrolase isozyme ...
3,5TTE.E,54,54,P68036,54,"NP_001243284.1|,NP_001243285.1|,NP_003338.1|",629.0,3.0,17.466488,784.0,...,UBE2,2.0,3.0,6.0,"[, UBE2L3, 7332, P68036, ubiquitin-conjugating...",BTD,54,UBE2L3,P68036,ubiquitin-conjugating enzyme E2 L3 isoform 1 [...
4,1FBV.C,1054,54,P68036,54,"NP_001243284.1|,NP_001243285.1|,NP_003338.1|",629.0,3.0,17.466488,784.0,...,UBE2,2.0,3.0,6.0,"[, UBE2L3, 7332, P68036, ubiquitin-conjugating...",BTD,54,UBE2L3,P68036,ubiquitin-conjugating enzyme E2 L3 isoform 1 [...


In [16]:
# browse the mapped sites with a cutoff distance of 6 (presumably Angstrom -- confirm against 3Dmol's `expand`);
# the listed columns are printed for the selected row. The trailing ';' suppresses the cell's return value.
view_modifications(sp, 6, 'uniprotId', 'unpName', 'name','Accession','modSites');

PDB Id: 1UCH chain Id: A
uniprotId: P15374
unpName: UCHL3
name: ubiquitin carboxyl-terminal hydrolase isozyme L3 [Homo sapiens]
Accession: NP_005993.1|
modSites: 95


In [17]:
# shut down the Spark session created at the top of the notebook
spark.stop()