## Mapping of Cysteine Oxidative PTMs to 3D structures.

Under oxidative stress, cysteines can undergo oxidative post-translational modifications (PTMs). The study by Akter et al. compares S-sulfinylation (R-SO2H) and S-sulfenylation (R-SOH) sites in the A549 and HeLa cell lines.

In this notebook we map the positions of these PTMs on available 3D structures in the Protein Data Bank.

We use the datasets provided in the supplementary materials of the following paper:

Chemical proteomics reveals new targets of cysteine sulfinic acid reductase.
Akter S, Fu L, Jung Y, Conte ML, Lawson JR, Lowther WT, Sun R, Liu K, Yang J, Carroll KS.
Nat Chem Biol. 2018 Sep 3. doi: [10.1038/s41589-018-0116-2](https://doi.org/10.1038/s41589-018-0116-2)

In [1]:
# imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, collect_set, collect_list, col, concat_ws, sort_array
from mmtfPyspark.datasets import pdbToUniProt
import pandas as pd
import numpy as np
from io import BytesIO
import xlrd
from ipywidgets import interact, IntSlider, widgets
import py3Dmol

In [2]:
# setup checkboxes for datasets
# One checkbox per (cell line, PTM) dataset, given as (description, initially ticked).
w1, w2, w3, w4 = [
    widgets.Checkbox(value=ticked, description=label, disabled=False)
    for label, ticked in (
        ('A549-RSO2H', True),
        ('HeLa-RSO2H', False),
        ('A549-RSOH', True),
        ('HeLa-RSOH', False),
    )
]

## Select one or more datasets (cell line-PTM)

In [3]:
# Render the four dataset checkboxes; read_datasets() later reads each box's .value.
display(w1, w2, w3, w4)

Checkbox(value=True, description='A549-RSO2H')

Checkbox(value=False, description='HeLa-RSO2H')

Checkbox(value=True, description='A549-RSOH')

Checkbox(value=False, description='HeLa-RSOH')

## Read and process datasets from supplementary materials

In [4]:
# Supplementary workbooks: one is read for the R-SO2H labels, the other for the R-SOH labels.
_RSO2H_WORKBOOK = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41589-018-0116-2/MediaObjects/41589_2018_116_MOESM32_ESM.xlsx'
_RSOH_WORKBOOK = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41589-018-0116-2/MediaObjects/41589_2018_116_MOESM33_ESM.xlsx'

# dataset label -> (workbook URL, sheet name, name of the column holding the modified site).
# The insertion order matches the checkboxes w1..w4 and defines the order of the result.
_DATASET_SPECS = {
    "A549-RSO2H": (_RSO2H_WORKBOOK, "A549", "Modified site"),
    "HeLa-RSO2H": (_RSO2H_WORKBOOK, "HeLa", "Modified site"),
    "A549-RSOH": (_RSOH_WORKBOOK, "A549", "Site #"),
    "HeLa-RSOH": (_RSOH_WORKBOOK, "HeLa", "Site #"),
}


def read_datasets(selected=None):
    """Load the selected cell line/PTM datasets as a list of DataFrames.

    Each sheet is read with every column as ``str``. A constant ``ptms``
    column holding the dataset label is appended, the site column is renamed
    to ``modifiedSite`` and ``"Uniprot Accession #"`` to ``uniprotAccession``,
    and the row index labels are converted to strings.

    Parameters
    ----------
    selected : str or iterable of str, optional
        Dataset labels to load (keys of ``_DATASET_SPECS``). When omitted,
        the selection is taken from the checkboxes ``w1``..``w4``, so calling
        ``read_datasets()`` behaves as before. Passing labels explicitly makes
        the result independent of widget state.

    Returns
    -------
    list of pandas.DataFrame
        One frame per selected dataset, always in the order of
        ``_DATASET_SPECS``; empty when nothing is selected.

    Raises
    ------
    ValueError
        If ``selected`` contains a label that is not a known dataset.
    """
    if selected is None:
        # Checkboxes correspond positionally to the spec table (w1 -> first entry, ...).
        selected = [label for label, box in zip(_DATASET_SPECS, (w1, w2, w3, w4)) if box.value]
    elif isinstance(selected, str):
        # A bare string would otherwise be iterated character by character.
        selected = [selected]
    else:
        selected = list(selected)

    unknown = [label for label in selected if label not in _DATASET_SPECS]
    if unknown:
        raise ValueError(
            "Unknown dataset(s) {}; expected a subset of {}".format(unknown, list(_DATASET_SPECS))
        )
    wanted = set(selected)

    dfs = []
    for label, (url, sheet, site_column) in _DATASET_SPECS.items():
        if label not in wanted:
            continue
        table = pd.read_excel(url, sheet_name=sheet, dtype=str)
        # A scalar is broadcast to every row; no need to build an (n, 1) array.
        table = table.assign(ptms=label)
        table = table.rename(
            index=str,
            columns={site_column: "modifiedSite", "Uniprot Accession #": "uniprotAccession"},
        )
        dfs.append(table)

    return dfs

In [5]:
# concatenate and process dataset
dfs = read_datasets()
if not dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise ValueError("No dataset selected: tick at least one checkbox above and re-run this cell.")

# Select the needed columns and cast the site to an integer in one chain. This
# produces a new frame rather than assigning into a column slice, which would
# trigger pandas' SettingWithCopyWarning.
df = (
    pd.concat(dfs, ignore_index=True)
    [['ptms', 'modifiedSite', 'uniprotAccession', 'Description']]
    .astype({'modifiedSite': np.int64})
)

pd.options.display.max_rows = None
display(df)

Unnamed: 0,ptms,modifiedSite,uniprotAccession,Description
0,A549-RSO2H,25,P63104,14-3-3 protein zeta/delta OS=Homo sapiens GN=Y...
1,A549-RSO2H,171,P52209,"6-phosphogluconate dehydrogenase, decarboxylat..."
2,A549-RSO2H,122,Q9H7C9,Mth938 domain-containing protein OS=Homo sapie...
3,A549-RSO2H,187,P00505,"Aspartate aminotransferase, mitochondrial OS=H..."
4,A549-RSO2H,477,P49748,Very long-chain specific acyl-CoA dehydrogenas...
5,A549-RSO2H,237,P49748,Very long-chain specific acyl-CoA dehydrogenas...
6,A549-RSO2H,733,Q9UKV3,Apoptotic chromatin condensation inducer in th...
7,A549-RSO2H,412,Q99424,Peroxisomal acyl-coenzyme A oxidase 2 OS=Homo ...
8,A549-RSO2H,286,Q562R1,Beta-actin-like protein 2 OS=Homo sapiens GN=A...
9,A549-RSO2H,45,O60218,Aldo-keto reductase family 1 member B10 OS=Hom...


## Map PTM locations to residues in PDB structures

In [6]:
# convert Pandas dataframe to a Spark dataframe
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("CysOxydationTo3DStructure")
    .getOrCreate()
)
# Sort the PTM table by protein accession, then by modified residue position.
ds = spark.createDataFrame(df).sort("uniprotAccession", "modifiedSite")

Download PDB to UniProt mappings and filter out residues that were not observed in the 3D structure.

In [7]:
# Residue-level PDB <-> UniProt mappings, keeping only rows that have a PDB residue number.
residue_mappings = pdbToUniProt.get_cached_residue_mappings()
up = residue_mappings.filter(col("pdbResNum").isNotNull())

Join the PTM data with the PDB data where both the UniProt ID and the UniProt residue number match.

In [8]:
# A mapping row is paired with a PTM row when both the UniProt accession and
# the UniProt residue number agree.
same_protein = up.uniprotId == ds.uniprotAccession
same_residue = up.uniprotNum == ds.modifiedSite
st = up.join(ds, same_protein & same_residue)

## Aggregate PTM data on a per residue and per chain basis

In [9]:
# Aggregate data
# Step 1: one row per mapped residue, gathering every dataset label reported for it.
residue_keys = ["structureChainId", "pdbResNum", "uniprotAccession", "uniprotNum", "Description"]
st = st.groupBy(*residue_keys).agg(collect_list("ptms").alias("ptms"))

# Step 2: flatten each residue's label list into one comma-separated string.
st = st.withColumn("ptms", concat_ws(",", col("ptms")))

# Step 3: one row per chain, with lists of labels, PDB residue numbers and
# UniProt residue numbers (built in a single aggregation; presumably index-aligned).
chain_keys = ["structureChainId", "uniprotAccession", "Description"]
st = st.groupBy(*chain_keys).agg(
    collect_list("ptms").alias("ptms"),
    collect_list("pdbResNum").alias("pdbResNum"),
    collect_list("uniprotNum").alias("uniprotNum"),
)

Keep only a single structural representative

In [10]:
# Rows sharing the same protein and the same list of UniProt positions collapse to one.
# NOTE(review): which chain survives is presumably arbitrary - confirm if a specific
# representative is required.
representative_keys = ["uniprotAccession", "uniprotNum"]
st = st.dropDuplicates(representative_keys)

## Show Table with PDB mappings

PDB residue numbers do not always match UniProt residue numbers. The table below shows the mapping for each protein chain.

In [11]:
# convert Spark dataframe back to a Pandas dataframe
sp = st.toPandas()
pd.set_option("display.max_rows", None)  # show all rows
display(sp)

Unnamed: 0,structureChainId,uniprotAccession,Description,ptms,pdbResNum,uniprotNum
0,1K4Q.A,P00390,"Glutathione reductase, mitochondrial OS=Homo s...","[A549-RSOH, A549-RSOH, A549-RSOH]","[63, 417, 423]","[107, 461, 467]"
1,1DGB.D,P04040,Catalase OS=Homo sapiens GN=CAT PE=1 SV=3,"[A549-RSO2H,A549-RSOH, A549-RSO2H,A549-RSOH]","[460, 232]","[460, 232]"
2,1QKI.D,P11413,Glucose-6-phosphate 1-dehydrogenase OS=Homo sa...,"[A549-RSOH, A549-RSOH, A549-RSOH, A549-RSOH]","[13, 385, 158, 294]","[13, 385, 158, 294]"
3,2O8E.B,P52701,DNA mismatch repair protein Msh6 OS=Homo sapie...,"[A549-RSO2H,A549-RSOH]",[765],[765]
4,2KTV.A,P62495,Eukaryotic peptide chain release factor subuni...,"[A549-RSOH, A549-RSOH]","[335, 302]","[335, 302]"
5,4M3P.D,Q93088,Betaine--homocysteine S-methyltransferase 1 OS...,[A549-RSO2H],[131],[131]
6,3B97.A,P06733,Alpha-enolase OS=Homo sapiens GN=ENO1 PE=1 SV=2,"[A549-RSOH, A549-RSOH, A549-RSOH, A549-RSOH]","[356, 398, 338, 336]","[357, 399, 339, 337]"
7,3J0S.W,P23528,Cofilin-1 OS=Homo sapiens GN=CFL1 PE=1 SV=3,"[A549-RSO2H,A549-RSOH, A549-RSOH, A549-RSOH, A...","[139, 80, 39, 147]","[139, 80, 39, 147]"
8,4TW8.B,Q02790,Peptidyl-prolyl cis-trans isomerase FKBP4 OS=H...,"[A549-RSOH, A549-RSOH]","[103, 107]","[103, 107]"
9,3LLM.A,Q08211,ATP-dependent RNA helicase A OS=Homo sapiens G...,"[A549-RSOH, A549-RSO2H,A549-RSOH]","[469, 438]","[469, 438]"


## Visualize Results
Residues with reported modifications are shown in an all-atom representation as red sticks with transparent spheres. Each modified residue position is labeled with the PDB residue number and the type of modification. Residues surrounding a modified residue (within 6 Å) are highlighted as orange sticks. Small molecules within the structure are rendered as sticks.

* Move slider to browse through the results
* To rotate the structure, hold down the left mouse button and move the mouse.

In [12]:
def view_modifications(df, cutoff_distance, *args):
    """Browse modified residues on their PDB structures with an interactive py3Dmol view.

    A slider selects the row of ``df`` to show. For that row the structure is
    fetched by PDB id, the modified residues are drawn as thick sticks plus
    spheres, residues around them as thin sticks, and optional labels of the
    form ``"<residue>: <ptms>"`` are attached to each modified residue.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per structure chain. Required columns:
        ``structureChainId`` (``"<pdbId>.<chainId>"``), ``pdbResNum``
        (sequence of residue numbers) and ``ptms`` (sequence of labels that
        is zipped with ``pdbResNum``).
    cutoff_distance : number
        Passed as the ``expand`` value of the selection used for the
        surrounding residues.
    *args : str
        Names of additional ``df`` columns printed above the viewer.

    Returns
    -------
    The result of ``ipywidgets.interact``, or ``None`` when ``df`` has no rows.
    """
    if len(df) == 0:
        # A slider with min=0 and max=-1 is rejected by ipywidgets; report and stop.
        print("No structures to display.")
        return None

    def view3d(show_labels=True, show_bio_assembly=False, show_surface=False, i=0):
        """Render the i-th row of ``df``."""
        row = df.iloc[i]
        # Split on the first '.' only, so the chain part is kept intact.
        pdb_id, chain_id = row['structureChainId'].split('.', 1)
        res_num = row['pdbResNum']
        labels = row['ptms']

        # Spark's toPandas() may yield NumPy arrays for list columns; convert to a
        # plain list so the selection can be serialised for the viewer.
        if isinstance(res_num, np.ndarray):
            res_num = res_num.tolist()

        # print header
        print("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe
        # (f-string so that non-string column values do not raise TypeError)
        for a in args:
            print(f"{a}: {row[a]}")

        mod_res = {'chain': chain_id, 'resi': res_num}

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': cutoff_distance}

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'color': 'spectrum', 'width': 0.6, 'opacity':0.8}})
        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # style for modifications
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})
        viewer.addStyle(mod_res, {'stick':{'colorscheme':'redCarbon', 'radius': 0.4}})
        viewer.addStyle(mod_res, {'sphere':{'colorscheme':'gray', 'opacity': 0.7}})

        # set residue labels
        if show_labels:
            label_style = {'fontColor':'black', 'fontSize': 9, 'backgroundColor': 'lightgray'}
            for residue, label in zip(res_num, labels):
                # f-string: residue numbers need not be strings
                viewer.addLabel(f"{residue}: {label}", label_style,
                                {'chain': chain_id, 'resi': residue})

        viewer.zoomTo(surroundings)

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'})

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_labels=True, show_bio_assembly=False, show_surface=False, i=s_widget)

In [13]:
# Browse the mapped structures with a cutoff distance of 6 for the surrounding
# residues, printing the UniProt accession and description of each entry.
# The trailing ';' suppresses the display of the call's return value.
view_modifications(sp, 6, 'uniprotAccession', 'Description');

interactive(children=(Checkbox(value=True, description='show_labels'), Checkbox(value=False, description='show…

In [14]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()