## Map Mutations to Protein-Protein and Protein-Nucleic Acid Interfaces
Here we find and visualize the mutations that occur around protein-protein and protein-nucleic acid interfaces.

In [1]:
# Disable Numba: temporary workaround for https://github.com/sbl-sdsc/mmtf-pyspark/issues/288
import os
os.environ['NUMBA_DISABLE_JIT'] = "1"

In [2]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter
from ipywidgets import interact, IntSlider, FloatSlider, SelectMultiple
import py3Dmol
import pandas as pd

In [3]:
# Initialize Spark
spark = SparkSession.builder.appName("2-MapToPolymerInteractions").getOrCreate()

In [4]:
# Enable Arrow-based columnar data transfers between Spark and Pandas dataframes
# Commented out for deployment on Pangeo Binder since it causes an out of memory error
# Exceeds spark.driver.maxResultSize (1024.0 MB)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

#### Input parameters

In [5]:
distance_cutoff = 8 # distance cutoff for finding and visualizing interactions
input_file_name = 'mutations3d.csv' # mutations mapped to 3D protein structures
output_file_name = 'mutations3d_polymer_inter.csv' # mutations mapped to polymer interactions

## Read mutation file created in the previous step

In [6]:
pd.set_option('display.max_columns', None)  # show all columns
df = pd.read_csv(input_file_name)
df['targetGroupNumber'] = df['pdbPosition'].astype('str') # PDB residue numbers must be string to handle insertion codes
df.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color,targetGroupNumber
0,0.153181,"CR3022 Fab heavy chain, CR3022 Fab light chain...",Spike glycoprotein,3.084,6W41.C,6W41,333,333,333,"S:p.333T>I(4), S:p.333T>K(4)","333T>I(4), 333T>K(4)",0.136274,#fedbcc,333
1,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS.A,6WPS,333,333,333,"S:p.333T>I(4), S:p.333T>K(4)","333T>I(4), 333T>K(4)",0.136274,#fedbcc,333
2,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS.B,6WPS,333,333,333,"S:p.333T>I(4), S:p.333T>K(4)","333T>I(4), 333T>K(4)",0.136274,#fedbcc,333
3,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS.E,6WPS,333,333,333,"S:p.333T>I(4), S:p.333T>K(4)","333T>I(4), 333T>K(4)",0.136274,#fedbcc,333
4,0.742341,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.7,6WPT.A,6WPT,333,333,333,"S:p.333T>I(4), S:p.333T>K(4)","333T>I(4), 333T>K(4)",0.136274,#fedbcc,333


In [7]:
mutations=spark.createDataFrame(df) 

## Create a list of unique PDB Ids

In [8]:
pdb_ids = list(df.structureId.drop_duplicates())
print("PDB Ids:", pdb_ids)

PDB Ids: ['6W41', '6WPS', '6WPT', '6XCM', '6XCN', '6XDG', '6XE1', '6Z2M', '6ZCZ', '6ZDH', '6ZER', '7A5R', '7A5S', '7BEH', '7BEJ', '7BEP', '7BWJ', '7BYR', '7C01', '7C2L', '7C8W', '7CAC', '7CAI', '7CAK', '7CDI', '7CDJ', '7CHB', '7CHE', '7CHF', '7CHH', '7CM4', '7CWS', '7CZP', '7CZQ', '7CZR', '7CZS', '7CZT', '7CZU', '7CZV', '7CZW', '7CZX', '7CZY', '7CZZ', '7D00', '7D03', '7DK4', '7DPM', '7DX4', '7EAM', '7JV4', '7JV6', '7JVA', '7JVC', '7JX3', '7K43', '7K4N', '7K8S', '7K8T', '7K8U', '7K8V', '7K8W', '7K8X', '7K8Z', '7K90', '7K9Z', '7KKK', '7KKL', '7KLG', '7KLH', '7KML', '7L02', '7L06', '7L09', '7L0N', '7L2D', '7L2E', '7L2F', '7L3N', '7L56', '7LAA', '7LAB', '7LCN', '7LD1', '7LJR', '7LOP', '7LS9', '7LSS', '7LXY', '7LXZ', '7LY2', '7ND3', '7ND4', '7ND5', '7ND7', '7ND8', '7ND9', '7NDA', '7NEH', '7NTC', '6XC2', '6XC3', '6XC4', '6XC7', '6ZH9', '7BEI', '7BEK', '7BEL', '7BEO', '7C8V', '7CAH', '7CH4', '7CH5', '7CHC', '7CJF', '7CWL', '7CWM', '7CWN', '7CWO', '7CWU', '7DEO', '7DET', '7DEU', '7JMO', '7JMW'

## Find all polymer-polymer interactions
Find groups (residues) that interact with other polymer chains

In [9]:
structures = mmtfReader.download_mmtf_files(pdb_ids)

In [10]:
interaction_filter = InteractionFilter(distanceCutoff=distance_cutoff)

interactions = InteractionExtractor().get_polymer_interactions(structures, interaction_filter, level='group')

In [11]:
cols = ['queryGroupId', 'queryChainId', 'queryGroupNumber', 'sequence']
interactions = interactions.drop(*cols)

In [12]:
interactions.limit(5).toPandas()

Unnamed: 0,structureChainId,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex
0,7NTC.A,SER,A,704,734
1,7NTC.B,ALA,B,1080,1110
2,7NTC.B,LEU,B,894,924
3,7NTC.B,SER,B,968,998
4,7NTC.A,GLN,A,755,785


## Filter mutations by joining with the interaction data

In [13]:
result = interactions.join(mutations, ['structureChainId', 'targetGroupNumber'], "inner")

In [14]:
mt = result.toPandas()

In [15]:
mt.head()

Unnamed: 0,structureChainId,targetGroupNumber,targetGroupId,targetChainId,sequenceIndex,coverage,description,name,resolution,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color
0,6WPS.A,1122,VAL,A,1140,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS,1122,1122,1122,"S:p.1122V>L(1242), S:p.1122V>M(44), S:p.1122V>...","1122V>L(1242), 1122V>M(44), 1122V>A(4)",0.469379,#fb7252
1,6WPS.A,1122,VAL,A,1140,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS,1122,1122,1122,"S:p.1122V>L(1242), S:p.1122V>M(44), S:p.1122V>...","1122V>L(1242), 1122V>M(44), 1122V>A(4)",0.469379,#fb7252
2,6WPS.A,1122,VAL,A,1140,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS,1122,1122,1122,"S:p.1122V>L(1242), S:p.1122V>M(44), S:p.1122V>...","1122V>L(1242), 1122V>M(44), 1122V>A(4)",0.469379,#fb7252
3,6WPS.A,1122,VAL,A,1140,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS,1122,1122,1122,"S:p.1122V>L(1242), S:p.1122V>M(44), S:p.1122V>...","1122V>L(1242), 1122V>M(44), 1122V>A(4)",0.469379,#fb7252
4,6WPS.A,1122,VAL,A,1140,0.750196,"spike glycoprotein, S309 neutralizing antibody...",Spike glycoprotein,3.1,6WPS,1122,1122,1122,"S:p.1122V>L(1242), S:p.1122V>M(44), S:p.1122V>...","1122V>L(1242), 1122V>M(44), 1122V>A(4)",0.469379,#fb7252


## Save Mappings

In [16]:
mt.to_csv(output_file_name, index=False)

In [17]:
chains = mt.groupby('structureChainId')
print("Chains:", list(chains.groups.keys()))

Chains: ['6W41.C', '6WPS.A', '6WPS.B', '6WPS.E', '6WPT.A', '6WPT.B', '6WPT.C', '6XC2.A', '6XC2.Z', '6XC3.C', '6XC4.A', '6XC4.Z', '6XC7.A', '6XCM.A', '6XCM.B', '6XCM.C', '6XCN.A', '6XCN.C', '6XCN.E', '6XDG.E', '6XE1.E', '6Z2M.A', '6Z2M.E', '6ZCZ.E', '6ZDH.A', '6ZDH.B', '6ZDH.C', '6ZER.A', '6ZER.D', '6ZER.E', '6ZH9.EEE', '7A5R.A', '7A5R.B', '7A5S.A', '7A5S.B', '7B3O.E', '7BEH.E', '7BEI.E', '7BEJ.E', '7BEK.E', '7BEL.R', '7BEL.X', '7BEN.C', '7BEN.E', '7BEO.R', '7BEO.X', '7BEP.C', '7BEP.E', '7BWJ.E', '7BYR.A', '7BYR.B', '7BYR.C', '7C01.A', '7C01.B', '7C2L.A', '7C2L.B', '7C2L.C', '7C8V.B', '7C8W.B', '7CAC.A', '7CAC.B', '7CAC.C', '7CAH.A', '7CAI.A', '7CAI.B', '7CAI.C', '7CAK.A', '7CAK.B', '7CAK.C', '7CDI.E', '7CDJ.E', '7CH4.R', '7CH5.R', '7CHB.R', '7CHC.R', '7CHE.R', '7CHF.R', '7CHH.A', '7CHH.B', '7CHH.C', '7CJF.C', '7CM4.A', '7CWL.A', '7CWL.B', '7CWL.C', '7CWM.A', '7CWM.B', '7CWM.C', '7CWN.A', '7CWN.B', '7CWN.C', '7CWO.A', '7CWS.O', '7CWS.Q', '7CWS.R', '7CWU.A', '7CWU.B', '7CWU.C', '7CZP.A',

## View mutations grouped by protein chain

Mutations are mapped onto protein chains for available 3D protein structures.

Display options:

|||
|:-|:-|
| *show_bio_assembly* | Toggle display of the biologically relevant quaternary structure |
| *show_surface* | Toggle surface for protein chain |
| *show_short_label* | Toggle display of mutation information<br>{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 501N>Y(350436)|
| *show_long_label* | Toggle display of mutation information<br>{PDBId}.{chainId}.{PDBResidue}: {geneName}.p{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 6XDG.E.501: S:p.501N>Y(350436) |
| *size* | Change size of visualization |
| *font* | Change font size of annotations |
| *logFreq* | Change minimum threshold to display mutations based on normalized log of mutation frequency [0.0 - 1.0]|
| *structure* | Move slider to browse through available structures |

#### Example: Move the structure slider to PDB ID:6XDG to see how mutations (e.g., 501N>Y) effect the binding of the Regeneron antibodies

In [18]:
# Setup viewer
def view_grouped_mutations(grouped_df, *args):
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, logFreq, i):
        group = grouped_df.get_group(chainIds[i])
        
        pdb_id, chain_id = chainIds[i].split('.')
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})
 
        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})
        
        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})
    
        # don't display water molecules
        viewer1.setStyle({'resn': ['HOH','DOD']}, {})
        
        modified_residues = set()
        rows = group.shape[0]
        for j in range(0, rows):
            if group.iloc[j]['scale'] > logFreq:
                res_num = str(group.iloc[j]['pdbPosition'])
                mod_res = {'resi': res_num, 'chain': chain_id} 
                modified_residues.add(res_num)
            
                # style for mutated residue
                col = group.iloc[j]['color']
                c_col = col + 'Carbon'
                viewer1.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
                viewer1.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}}) 
           
                # style for interacting residues
                surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}
                viewer1.setStyle(surroundings,{'cartoon':{'color':'orange', 'width': 0.6}})
                viewer1.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})
                # style for interacting waters
                waters = {'resn': ['HOH','DOD']}
                waters.update(surroundings)
                viewer1.addStyle(waters,{'sphere':{'color':'orange', 'radius': 0.5}})
                
                if show_short_label or show_long_label:
                    viewer1.addResLabels(surroundings, {'fontSize':font-2})
            
                # text label
                annotation = group.iloc[j]['annotation']
                variationId = group.iloc[j]['variationId']
                
                if show_short_label:
                    label = annotation
                if show_long_label:
                    label = chain_id + "." + str(res_num) + ": " + variationId
                if show_short_label or show_long_label:
                    viewer1.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, mod_res)
              
        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})
        
        if show_surface:
             viewer1.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})
                
        # print header
        resolution = group.iloc[0]['resolution']
        coverage = group.iloc[0]['coverage']
        description = group.iloc[0]['description']
        print(f"PDB Id:{pdb_id}, chain Id:{chain_id}, resolution:{resolution}, sequence coverage:{coverage:.2f}")
        print(description)
                
        return viewer1.show()
       
    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='structure', continuous_update=False)
    l_widget = FloatSlider(value=0.8, min=0, max=1, step=0.05, description='logFreq:', 
                           continuous_update=False, orientation='horizontal', readout=True, readout_format='.2f')
    
    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, logFreq=l_widget, i=s_widget)

def view_image1():
    return viewer1.png()

In [19]:
view_grouped_mutations(chains);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Interacting residues within the `distance_cutoff` are rendered as orange sticks.

In [20]:
# Setup viewer
def view_single_mutation(df, distance_cutoff, *args):

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, i):        
        pdb_id, chain_id = df.iloc[i]['structureChainId'].split('.')
        
        global viewer2
        viewer2 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer2.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})
        
        # highlight chain of interest in blue
        viewer2.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'opacity':0.7}})
        
        # non-polymer style
        viewer2.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})
        
        # interacting residue style
        res_num = str(df.iloc[i]['pdbPosition'])
        label = df.iloc[i]['variationId']     
        mod_res = {'resi': res_num, 'chain': chain_id} 
        col = df.iloc[i]['color']
        c_col = col + 'Carbon'
        viewer2.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
        viewer2.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})   
        
        # don't display water molecules (except below for interactions)
        viewer2.setStyle({'resn': ['HOH','DOD']}, {})
            
        # select residues in interacting chains by distance from mutation site (same chain is excluded)
        surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}
        
        # set style for interacting residues
        viewer2.setStyle(surroundings,{'cartoon':{'color':'orange', 'width': 0.6}})
        viewer2.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})
    
        # set style for interacting waters
        waters = {'resn': ['HOH','DOD']}
        waters.update(surroundings)
        viewer2.addStyle(waters,{'sphere':{'color':'orange', 'radius': 0.5}})
        
        annotation = df.iloc[i]['annotation']
        variationId = df.iloc[i]['variationId']      
        
        if show_short_label:
            label = annotation
        if show_long_label:
            label = chain_id + "." + str(res_num) + ": " + variationId
        if show_short_label or show_long_label:
            viewer2.addResLabels(surroundings, {'fontSize':font-2})
            viewer2.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, mod_res)
            
        viewer2.zoomTo(surroundings)
        viewer2.center(mod_res)
        
        if show_surface:
             viewer2.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})
                
        #print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "mutation:", variationId)
        
        # print any specified additional columns from the dataframe
        for a in args:
            print(a + ": " + df.iloc[i][a])
                
        return viewer2.show()
       
    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(df)-1, description='structure', continuous_update=False)
    
    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, i=s_widget)

def view_image2():
    return viewer2.png()

In [21]:
mt_unique = mt.drop_duplicates(["structureChainId","variationId"])

In [22]:
view_single_mutation(mt_unique, distance_cutoff);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [23]:
# Shutdown Spark
spark.stop()

## Now run the next step
Map mutations occuring at protein-ligand binding sites: [3-MapToLigandInteractions.ipynb](3-MapToLigandInteractions.ipynb)