## Map Mutations to Protein-Protein and Protein-Nucleic Acid Interfaces
Here we find and visualize the mutations that occur around protein-protein and protein-nucleic acid interfaces.

In [1]:
# Temporary workaround for https://github.com/sbl-sdsc/mmtf-pyspark/issues/288:
# turn off Numba JIT compilation through its environment variable. This cell runs
# before the mmtfPyspark imports (presumably so the flag is set before Numba loads).
import os
os.environ.update({'NUMBA_DISABLE_JIT': "1"})

In [2]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter
from ipywidgets import interact, IntSlider, FloatSlider, SelectMultiple
import py3Dmol
import pandas as pd

In [3]:
# Initialize Spark: get the active session or create one named after this notebook
spark = (
    SparkSession.builder
    .appName("2-MapToPolymerInteractions")
    .getOrCreate()
)

In [4]:
# Enable Arrow-based columnar data transfers between Spark and Pandas dataframes.
# NOTE: the setting is active in this notebook. On Pangeo Binder it was reported to
# cause an out of memory error ("Exceeds spark.driver.maxResultSize (1024.0 MB)");
# comment out the line below when deploying there.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

#### Input parameters

In [5]:
# Distance cutoff for finding and visualizing interactions
distance_cutoff = 8

# Input: mutations mapped to 3D protein structures
input_file_name = 'mutations3d.csv'

# Output: mutations mapped to polymer interactions
output_file_name = 'mutations3d_polymer_inter.csv'

## Read mutation file created in the previous step

In [6]:
# Show all columns whenever a frame is displayed
pd.set_option('display.max_columns', None)

# Load the mutation table written by the previous step
df = pd.read_csv(input_file_name)

# PDB residue numbers must be strings to handle insertion codes; the string copy is
# stored as 'targetGroupNumber', the key used for the join with the interaction data
df = df.assign(targetGroupNumber=df['pdbPosition'].astype('str'))
df.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color,targetGroupNumber
0,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.A,6LXT,912,912,912,"S:p.912T>I(12), QHD43416.1:p.912T>A(4)","912T>I(12), 912T>A(4)",0.181698,#fdd0bc,912
1,0.089552,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.C,6LXT,912,912,912,"S:p.912T>I(12), QHD43416.1:p.912T>A(4)","912T>I(12), 912T>A(4)",0.181698,#fdd0bc,912
2,0.084053,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.D,6LXT,912,912,912,"S:p.912T>I(12), QHD43416.1:p.912T>A(4)","912T>I(12), 912T>A(4)",0.181698,#fdd0bc,912
3,0.758837,"Spike glycoprotein, 2-acetamido-2-deoxy-beta-D...",Spike glycoprotein,3.2,6VYB.A,6VYB,912,912,912,"S:p.912T>I(12), QHD43416.1:p.912T>A(4)","912T>I(12), 912T>A(4)",0.181698,#fdd0bc,912
4,0.745483,"Spike glycoprotein, 2-acetamido-2-deoxy-beta-D...",Spike glycoprotein,3.2,6VYB.B,6VYB,912,912,912,"S:p.912T>I(12), QHD43416.1:p.912T>A(4)","912T>I(12), 912T>A(4)",0.181698,#fdd0bc,912


In [7]:
# Convert the pandas mutation table into a Spark DataFrame for the join below
mutations = spark.createDataFrame(df)

## Create a list of unique PDB Ids

In [8]:
# Unique PDB Ids, kept in order of first appearance in the mutation table
pdb_ids = df['structureId'].drop_duplicates().tolist()
print("PDB Ids:", pdb_ids)

PDB Ids: ['6LXT', '6VYB', '6Z43', '6ZHD', '7CZP', '7CZQ', '7CZR', '7CZS', '7CZT', '7CZU', '7CZV', '7CZW', '7CZX', '7CZY', '7CZZ', '7D00', '7D03', '7E7B', '7E7D', '6VW1', '6XE1', '6W41', '6YZ5', '6YZ7', '6ZBP']


## Find all polymer-polymer interactions
Find groups (residues) that interact with other polymer chains

In [9]:
# Fetch the MMTF structure entries for the unique PDB Ids collected above
structures = mmtfReader.download_mmtf_files(pdb_ids)

In [10]:
# Only contacts within `distance_cutoff` are reported
interaction_filter = InteractionFilter(distanceCutoff=distance_cutoff)

# Extract polymer-polymer interactions at the group (residue) level
interactions = InteractionExtractor().get_polymer_interactions(
    structures,
    interaction_filter,
    level='group',
)

queryGroupId, queryChainId, and queryGroupNumber specify the residue that interacts with another chain (targetChainId)

In [11]:
# Preview a few interaction rows; limit(5) keeps the conversion to pandas small
interactions.limit(5).toPandas()

Unnamed: 0,structureChainId,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,7CZW.H,TYR,A,489,ARG,H,99,98,QVQLQESGPGLVKPSETLSLTCTVSGDSVSSGSYYWSWIRQPPGKG...
1,7CZW.J,ALA,N,45,GLN,J,120,119,QVQLQESGPGLVKPSETLSLTCTVSGDSVSSGSYYWSWIRQPPGKG...
2,7CZW.J,SER,N,170,PRO,J,182,181,QVQLQESGPGLVKPSETLSLTCTVSGDSVSSGSYYWSWIRQPPGKG...
3,7CZW.H,TYR,A,495,TYR,H,55,54,QVQLQESGPGLVKPSETLSLTCTVSGDSVSSGSYYWSWIRQPPGKG...
4,7CZW.C,SER,A,735,ASP,C,614,613,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...


## Filter mutations by joining with the interaction data

In [12]:
# Keep only interactions whose target residue carries a mutation: match on the
# chain identifier and the (string) PDB residue number
result = interactions.join(
    mutations,
    on=['structureChainId', 'targetGroupNumber'],
    how="inner",
)

In [13]:
# Collect the joined Spark result into a pandas DataFrame
mt = result.toPandas()

In [14]:
# Inspect the first rows of the joined mutation/interaction table
mt.head()

Unnamed: 0,structureChainId,targetGroupNumber,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,sequenceIndex,sequence,coverage,description,name,resolution,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color
0,6LXT.B,947,VAL,A,952,LYS,B,37,GVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQN...,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT,947,947,947,"S:p.947K>R(98), S:p.947K>I(2), S:p.947K>E(2)","947K>R(98), 947K>I(2), 947K>E(2)",0.303092,#fcaa8d
1,6LXT.B,947,VAL,A,1176,LYS,B,37,GVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQN...,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT,947,947,947,"S:p.947K>R(98), S:p.947K>I(2), S:p.947K>E(2)","947K>R(98), 947K>I(2), 947K>E(2)",0.303092,#fcaa8d
2,6LXT.B,947,ILE,A,1179,LYS,B,37,GVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQN...,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT,947,947,947,"S:p.947K>R(98), S:p.947K>I(2), S:p.947K>E(2)","947K>R(98), 947K>I(2), 947K>E(2)",0.303092,#fcaa8d
3,6LXT.B,947,ASN,A,1178,LYS,B,37,GVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQN...,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT,947,947,947,"S:p.947K>R(98), S:p.947K>I(2), S:p.947K>E(2)","947K>R(98), 947K>I(2), 947K>E(2)",0.303092,#fcaa8d
4,6LXT.B,947,GLN,A,1180,LYS,B,37,GVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQN...,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT,947,947,947,"S:p.947K>R(98), S:p.947K>I(2), S:p.947K>E(2)","947K>R(98), 947K>I(2), 947K>E(2)",0.303092,#fcaa8d


## Save Mappings

In [15]:
# Write the mapped mutations to CSV without the pandas index column
mt.to_csv(output_file_name, index=False)

In [16]:
# Group the mapped mutations by chain ("{pdbId}.{chainId}") for the viewer below
chains = mt.groupby(by='structureChainId')
print("Chains:", list(chains.groups))

Chains: ['6LXT.A', '6LXT.B', '6LXT.C', '6LXT.D', '6LXT.E', '6LXT.F', '6VW1.E', '6VW1.F', '6VYB.A', '6VYB.B', '6VYB.C', '6W41.C', '6XE1.E', '6YZ5.E', '6YZ7.AAA', '6YZ7.EEE', '6Z43.A', '6Z43.B', '6Z43.C', '6ZBP.EEE', '6ZHD.A', '6ZHD.B', '6ZHD.C', '7CZP.A', '7CZP.B', '7CZP.C', '7CZQ.A', '7CZQ.B', '7CZQ.C', '7CZR.A', '7CZR.B', '7CZR.C', '7CZS.A', '7CZS.B', '7CZS.C', '7CZT.A', '7CZT.B', '7CZT.C', '7CZU.A', '7CZU.B', '7CZU.C', '7CZV.A', '7CZV.B', '7CZV.C', '7CZW.A', '7CZW.B', '7CZW.C', '7CZX.A', '7CZX.B', '7CZX.C', '7CZY.A', '7CZY.B', '7CZY.C', '7CZZ.A', '7CZZ.B', '7CZZ.C', '7D00.A', '7D00.B', '7D00.C', '7D03.A', '7D03.B', '7D03.C', '7E7B.A', '7E7B.B', '7E7B.C', '7E7D.A', '7E7D.B', '7E7D.C']


## View mutations grouped by protein chain

Mutations are mapped onto protein chains for available 3D protein structures.

Display options:

|||
|:-|:-|
| *show_bio_assembly* | Toggle display of the biologically relevant quaternary structure |
| *show_surface* | Toggle surface for protein chain |
| *show_short_label* | Toggle display of mutation information<br>{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 501N>Y(350436)|
| *show_long_label* | Toggle display of mutation information<br>{PDBId}.{chainId}.{PDBResidue}: {geneName}.p{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 6XDG.E.501: S:p.501N>Y(350436) |
| *size* | Change size of visualization |
| *font* | Change font size of annotations |
| *logFreq* | Change minimum threshold to display mutations based on normalized log of mutation frequency [0.0 - 1.0]|
| *structure* | Move slider to browse through available structures |

#### Example: Move the structure slider to PDB ID:6XDG to see how mutations (e.g., 501N>Y) affect the binding of the Regeneron antibodies

In [17]:
# Setup viewer
def view_grouped_mutations(grouped_df, *args):
    """Create an interactive 3D viewer that shows all mutations of one chain at a time.

    Parameters
    ----------
    grouped_df : pandas GroupBy
        Rows grouped by 'structureChainId' ("{pdbId}.{chainId}"). Each group must
        provide the columns 'pdbPosition', 'scale', 'color', 'annotation',
        'variationId', 'resolution', 'coverage' and 'description'.
    *args
        Ignored; accepted only so existing calls keep working.

    Returns
    -------
    The object returned by ``interact`` for the nested ``view3d`` callback.

    Notes
    -----
    * The interaction radius is read from the notebook-level ``distance_cutoff``.
    * The most recently rendered viewer is stored in the global ``viewer1`` so that
      ``view_image1`` can export it.
    * When the groups come from the joined interaction table, the same mutation is
      repeated on several rows (one per interacting residue). Each mutation is
      styled and labelled only once; repeating the styling would stack translucent
      spheres and duplicate labels on the same residue.
    """
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, logFreq, i):
        group = grouped_df.get_group(chainIds[i])

        pdb_id, chain_id = chainIds[i].split('.')
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer1.setStyle({'resn': ['HOH','DOD']}, {})

        show_labels = show_short_label or show_long_label

        # (residue number, variation id) pairs that have already been drawn
        seen_mutations = set()
        for j in range(group.shape[0]):
            row = group.iloc[j]

            # only show mutations above the selected frequency threshold
            if not row['scale'] > logFreq:
                continue

            res_num = str(row['pdbPosition'])
            annotation = row['annotation']
            variationId = row['variationId']

            # skip rows that repeat a mutation that was already rendered
            mutation_key = (res_num, variationId)
            if mutation_key in seen_mutations:
                continue
            seen_mutations.add(mutation_key)

            mod_res = {'resi': res_num, 'chain': chain_id}

            # style for mutated residue
            col = row['color']
            c_col = col + 'Carbon'
            viewer1.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer1.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})

            # style for interacting residues (other chains within the distance cutoff)
            surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}
            viewer1.setStyle(surroundings,{'cartoon':{'color':'orange', 'width': 0.6}})
            viewer1.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

            # style for interacting waters
            waters = {'resn': ['HOH','DOD']}
            waters.update(surroundings)
            viewer1.addStyle(waters,{'sphere':{'color':'orange', 'radius': 0.5}})

            if show_labels:
                viewer1.addResLabels(surroundings, {'fontSize':font-2})

                # text label; the long label takes precedence when both are selected
                if show_long_label:
                    label = chain_id + "." + str(res_num) + ": " + variationId
                else:
                    label = annotation
                viewer1.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, mod_res)

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})

        if show_surface:
            viewer1.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        # print header (taken from the first row of the group)
        header = group.iloc[0]
        resolution = header['resolution']
        coverage = header['coverage']
        description = header['description']
        print(f"PDB Id:{pdb_id}, chain Id:{chain_id}, resolution:{resolution}, sequence coverage:{coverage:.2f}")
        print(description)

        return viewer1.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='structure', continuous_update=False)
    l_widget = FloatSlider(value=0.8, min=0, max=1, step=0.05, description='logFreq:', 
                           continuous_update=False, orientation='horizontal', readout=True, readout_format='.2f')

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, logFreq=l_widget, i=s_widget)

def view_image1():
    """Return ``viewer1.png()`` for the viewer last rendered by the grouped view.

    ``viewer1`` is the global set by the ``view3d`` callback of
    ``view_grouped_mutations``; calling this before that callback has run raises
    ``NameError``.
    """
    snapshot = viewer1.png()
    return snapshot

In [18]:
# Launch the per-chain viewer; the trailing semicolon suppresses the cell's return-value output
view_grouped_mutations(chains);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Interacting residues within the `distance_cutoff` are rendered as orange sticks.

In [19]:
# Setup viewer
def view_single_mutation(df, distance_cutoff, *args):
    """Create an interactive 3D viewer that shows one mutation (one row of `df`) at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation to display. Required columns: 'structureChainId'
        ("{pdbId}.{chainId}"), 'pdbPosition', 'variationId', 'annotation', 'color'.
    distance_cutoff : number
        Residues of other chains within this distance of the mutated residue are
        highlighted as interacting residues.
    *args : str
        Names of additional columns of `df` whose values are printed below the
        header. Values of any type are accepted.

    Returns
    -------
    The object returned by ``interact`` for the nested ``view3d`` callback.

    Notes
    -----
    The most recently rendered viewer is stored in the global ``viewer2`` so that
    ``view_image2`` can export it.
    """

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, i):
        row = df.iloc[i]
        pdb_id, chain_id = row['structureChainId'].split('.')

        global viewer2
        viewer2 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer2.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer2.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'opacity':0.7}})

        # non-polymer style
        viewer2.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # mutated residue style
        res_num = str(row['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id}
        col = row['color']
        c_col = col + 'Carbon'
        viewer2.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
        viewer2.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})

        # don't display water molecules (except below for interactions)
        viewer2.setStyle({'resn': ['HOH','DOD']}, {})

        # select residues in interacting chains by distance from mutation site (same chain is excluded)
        surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}

        # set style for interacting residues
        viewer2.setStyle(surroundings,{'cartoon':{'color':'orange', 'width': 0.6}})
        viewer2.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        # set style for interacting waters
        waters = {'resn': ['HOH','DOD']}
        waters.update(surroundings)
        viewer2.addStyle(waters,{'sphere':{'color':'orange', 'radius': 0.5}})

        annotation = row['annotation']
        variationId = row['variationId']

        if show_short_label or show_long_label:
            # the long label takes precedence when both options are selected
            if show_long_label:
                label = chain_id + "." + str(res_num) + ": " + variationId
            else:
                label = annotation
            viewer2.addResLabels(surroundings, {'fontSize':font-2})
            viewer2.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, mod_res)

        viewer2.zoomTo(surroundings)
        viewer2.center(mod_res)

        if show_surface:
            viewer2.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})

        # print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "mutation:", variationId)

        # print any specified additional columns from the dataframe; formatting via
        # f-string so that non-string values (e.g., numeric columns) do not raise
        for a in args:
            print(f"{a}: {row[a]}")

        return viewer2.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(df)-1, description='structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, i=s_widget)

def view_image2():
    """Return ``viewer2.png()`` for the viewer last rendered by the single-mutation view.

    ``viewer2`` is the global set by the ``view3d`` callback of
    ``view_single_mutation``; calling this before that callback has run raises
    ``NameError``.
    """
    snapshot = viewer2.png()
    return snapshot

In [20]:
# Keep one row per chain and variation so that each mutation is shown only once
mt_unique = mt.drop_duplicates(subset=["structureChainId", "variationId"])

In [21]:
# Launch the single-mutation viewer; the trailing semicolon suppresses the cell's return-value output
view_single_mutation(mt_unique, distance_cutoff);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [22]:
# Shutdown Spark: stop the session created at the top of this notebook
spark.stop()

## Now run the next step
Map mutations occurring at protein-ligand binding sites: [3-MapToLigandInteractions.ipynb](3-MapToLigandInteractions.ipynb)