## Map Mutations to Protein-Protein and Protein-Nucleic Acid Interfaces
Here we find and visualize the mutations that occur around protein-protein and protein-nucleic acid interfaces.

In [1]:
# Temporary workaround for https://github.com/sbl-sdsc/mmtf-pyspark/issues/288:
# switch Numba off through its environment variable.
import os
os.environ.update(NUMBA_DISABLE_JIT="1")

In [2]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter
from ipywidgets import interact, IntSlider, FloatSlider, SelectMultiple
import py3Dmol
import pandas as pd



In [3]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("2-MapToPolymerInteractions")
    .getOrCreate()
)

In [4]:
# Enable Arrow-based columnar data transfers between Spark and Pandas dataframes
# Commented out for deployment on Pangeo Binder since it causes an out of memory error
# spark.conf.set("spark.sql.execution.arrow.enabled", "true")

#### Input parameters

In [5]:
# distance cutoff for finding and visualizing interactions
distance_cutoff = 8
# input: mutations mapped to 3D protein structures
input_file_name = 'mutations3d.csv'
# output: mutations mapped to polymer interactions
output_file_name = 'mutations3d_polymer_inter.csv'

## Read mutation file created in the previous step

In [6]:
# Show every column when a dataframe is displayed
pd.options.display.max_columns = None

# PDB residue numbers must be strings so that insertion codes can be handled
df = pd.read_csv(input_file_name)
df = df.astype({'pdbPosition': 'str'})
df.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color
0,0.026709,Spike protein S2,Spike glycoprotein,2.47,6LVN.A,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8
1,0.027494,Spike protein S2,Spike glycoprotein,2.47,6LVN.B,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8
2,0.026709,Spike protein S2,Spike glycoprotein,2.47,6LVN.D,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8
3,0.27337,Spike glycoprotein,Spike glycoprotein,3.0,6XRA.A,6XRA,1169,1169,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8
4,0.27337,Spike glycoprotein,Spike glycoprotein,3.0,6XRA.B,6XRA,1169,1169,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8


## Create a list of unique PDB Ids

In [7]:
# Unique structure identifiers, kept in order of first appearance
pdb_ids = df['structureId'].drop_duplicates().tolist()
print("PDB Ids:", pdb_ids)

PDB Ids: ['6LVN', '6XRA', '6LXT', '6VSB', '6VXX', '6VYB', '6WPS', '6WPT', '6X29', '6X2A', '6X2B', '6X2C', '6X6P', '6X79', '6XCM', '6XCN', '6XEY', '6XF5', '6XF6', '6XKL', '6XLU', '6XM0', '6XM3', '6XM4', '6XM5', '6XR8', '6XS6', '6Z43', '6Z97', '6ZB4', '6ZB5', '6ZDH', '6ZGE', '6ZGG', '6ZGH', '6ZGI', '6ZHD', '6ZOW', '6ZOX', '6ZOY', '6ZOZ', '6ZP0', '6ZP1', '6ZP2', '6ZP5', '6ZP7', '6ZWV', '6ZXN', '7A93', '7A94', '7A95', '7A96', '7A97', '7A98', '7BYR', '7C2L', '7CAI', '7CAK', '7CHH', '7CN9', '7JJI', '7JJJ', '7JZL', '7JZN', '6M1V', '6LZG', '6M0J', '6VW1', '6W41', '6XC2', '6XC3', '6XC4', '6XC7', '6XDG', '6XE1', '6YLA', '6YM0', '6YOR', '6YZ5', '6YZ7', '6Z2M', '6ZBP', '6ZCZ', '6ZDG', '6ZER', '6ZFO', '6ZH9', '7A5R', '7A5S', '7A91', '7A92', '7BWJ', '7BZ5', '7C01', '7C8D', '7C8V', '7C8W', '7CAH', '7CAN', '7CH4', '7CH5', '7CHB', '7CHC', '7CHE', '7CHF', '7JMO', '7JZM', '7JZU', '6M17', '7JMP']


## Find all polymer-polymer interactions
Find groups (residues) that interact with other polymer chains

In [8]:
# Fetch the structures for all PDB Ids collected above.
# NOTE(review): presumably a network download of MMTF files — confirm against the mmtfPyspark docs.
structures = mmtfReader.download_mmtf_files(pdb_ids)

queryGroupId, queryChainId, and queryGroupNumber identify the residue that interacts with a residue (targetGroupId, targetGroupNumber) in another chain (targetChainId).

In [9]:
# Only consider interactions within the configured distance cutoff
interaction_filter = InteractionFilter(distanceCutoff=distance_cutoff)

# Extract polymer interactions at the group (residue) level and bring them into pandas
interactions = (
    InteractionExtractor()
    .get_polymer_interactions(structures, interaction_filter, level='group')
    .toPandas()
)
interactions.head()

Unnamed: 0,structureChainId,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,6XM0.A,LEU,B,752,GLY,A,971,957,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
1,6XM0.A,ASP,C,571,VAL,A,42,28,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
2,6XM0.B,GLN,C,913,PRO,B,1090,1076,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
3,6XM0.C,SER,A,1037,VAL,C,1040,1026,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
4,6XM0.C,GLN,A,784,VAL,C,1040,1026,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
5,6XM0.C,GLU,A,1031,CYS,C,1043,1029,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
6,6XM0.B,GLN,C,895,SER,B,708,694,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
7,6XM0.C,PHE,A,1042,ARG,C,1039,1025,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
8,6XM0.B,ILE,A,712,ILE,B,896,882,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...
9,6XM0.C,GLY,B,548,ASP,C,745,731,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...


## Filter mutations by joining with the interaction data

In [10]:
# Keep only the mutations whose chain and residue number match a target residue
# of an interaction (inner join)
mt = pd.merge(
    df,
    interactions,
    how='inner',
    left_on=['structureChainId', 'pdbPosition'],
    right_on=['structureChainId', 'targetGroupNumber'],
)
mt.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,0.026709,Spike protein S2,Spike glycoprotein,2.47,6LVN.A,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8,LEU,C,33,ILE,A,2,1,DISGINASVVNIQKEIDRLNEVAKNLNESLIDLQEL
1,0.026709,Spike protein S2,Spike glycoprotein,2.47,6LVN.A,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8,GLN,C,34,ILE,A,2,1,DISGINASVVNIQKEIDRLNEVAKNLNESLIDLQEL
2,0.027494,Spike protein S2,Spike glycoprotein,2.47,6LVN.B,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8,LEU,D,33,ILE,B,2,1,DISGINASVVNIQKEIDRLNEVAKNLNESLIDLQEL
3,0.027494,Spike protein S2,Spike glycoprotein,2.47,6LVN.B,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8,GLN,D,34,ILE,B,2,1,DISGINASVVNIQKEIDRLNEVAKNLNESLIDLQEL
4,0.026709,Spike protein S2,Spike glycoprotein,2.47,6LVN.D,6LVN,1169,2,1169,"QHD43416.1:p.1169I>L(1), QHD43416.1:p.1169I>F(...","1169I>L(1), 1169I>F(1), 1169I>T(1)",0.097329,#fee5d8,GLN,B,34,ILE,D,2,1,DISGINASVVNIQKEIDRLNEVAKNLNESLIDLQEL


## Save Mappings

In [11]:
# Write the joined mutation/interaction table to CSV; index=False leaves the row index out of the file
mt.to_csv(output_file_name, index=False)

In [12]:
# Group the mapped mutations by chain ('<pdbId>.<chainId>')
chains = mt.groupby('structureChainId')
chain_keys = list(chains.groups)
print("Chains:", chain_keys)

Chains: ['6LVN.A', '6LVN.B', '6LVN.C', '6LVN.D', '6LXT.A', '6LXT.B', '6LXT.C', '6LXT.D', '6LXT.E', '6LXT.F', '6LZG.B', '6M0J.E', '6M17.E', '6M17.F', '6VSB.A', '6VSB.B', '6VSB.C', '6VW1.E', '6VW1.F', '6VXX.A', '6VXX.B', '6VXX.C', '6VYB.A', '6VYB.B', '6VYB.C', '6W41.C', '6WPS.A', '6WPS.B', '6WPS.E', '6WPT.A', '6WPT.B', '6WPT.C', '6X29.A', '6X29.B', '6X29.C', '6X2A.A', '6X2A.B', '6X2A.C', '6X2B.A', '6X2B.B', '6X2B.C', '6X2C.A', '6X2C.B', '6X2C.C', '6X6P.A', '6X6P.B', '6X6P.C', '6X79.A', '6X79.B', '6X79.C', '6XC2.A', '6XC2.Z', '6XC3.C', '6XC4.A', '6XC4.Z', '6XC7.A', '6XCM.A', '6XCM.B', '6XCM.C', '6XCN.A', '6XCN.C', '6XCN.E', '6XDG.E', '6XE1.E', '6XEY.A', '6XEY.B', '6XEY.C', '6XF5.A', '6XF5.B', '6XF5.C', '6XF6.A', '6XF6.B', '6XF6.C', '6XKL.A', '6XKL.B', '6XKL.C', '6XLU.A', '6XLU.B', '6XLU.C', '6XM0.A', '6XM0.B', '6XM0.C', '6XM3.A', '6XM3.B', '6XM3.C', '6XM4.A', '6XM4.B', '6XM4.C', '6XM5.A', '6XM5.B', '6XM5.C', '6XR8.A', '6XR8.B', '6XR8.C', '6XRA.A', '6XRA.B', '6XRA.C', '6XS6.A', '6XS6.B', '

## View mutations grouped by protein chain

In [13]:
# Setup viewer
def view_grouped_mutations(grouped_df, *args):
    """Show an interactive py3Dmol view of all mutations, one protein chain at a time.

    Parameters
    ----------
    grouped_df : pandas GroupBy
        Mutation table grouped by 'structureChainId'; the group keys must have
        the form '<pdbId>.<chainId>'. Each group is read for the columns
        'pdbPosition', 'scale', 'color', 'annotation', 'variationId',
        'resolution', 'coverage' and 'description'.
    *args
        Accepted for call compatibility; not used.

    Returns
    -------
    The value returned by ``ipywidgets.interact``, or None (after printing a
    message) when ``grouped_df`` contains no groups.

    Notes
    -----
    Reads the notebook-level ``distance_cutoff`` to select interacting residues
    and stores the most recently created viewer in the global ``viewer1``
    (used by ``view_image1``).
    """
    chainIds = list(grouped_df.groups.keys())

    # A slider over zero structures would have max < min, so bail out early.
    if not chainIds:
        print("No chains to display.")
        return None

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, logFreq, i):
        group = grouped_df.get_group(chainIds[i])

        pdb_id, chain_id = chainIds[i].split('.')
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity': 0.9}})

        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id}, {'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick': {'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer1.setStyle({'resn': ['HOH', 'DOD']}, {})

        show_label = show_short_label or show_long_label

        # The table holds one row per (mutation, interacting residue) pair, so the
        # same mutated residue usually occurs many times. Style and label each
        # mutation only once to avoid stacking identical styles and labels.
        seen = set()
        for _, row in group.iterrows():
            if not (row['scale'] > logFreq):
                continue

            res_num = str(row['pdbPosition'])
            variationId = row['variationId']
            key = (res_num, variationId)
            if key in seen:
                continue
            seen.add(key)

            mod_res = {'resi': res_num, 'chain': chain_id}

            # style for mutated residue
            col = row['color']
            c_col = col + 'Carbon'
            viewer1.addStyle(mod_res, {'stick': {'colorscheme': c_col, 'radius': 0.2}})
            viewer1.addStyle(mod_res, {'sphere': {'color': col, 'opacity': 0.6}})

            # style for interacting residues (whole residues of other chains within the cutoff)
            surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}
            viewer1.setStyle(surroundings, {'cartoon': {'color': 'orange', 'width': 0.6}})
            viewer1.addStyle(surroundings, {'stick': {'colorscheme': 'orangeCarbon', 'radius': 0.15}})

            # style for interacting waters
            waters = {'resn': ['HOH', 'DOD']}
            waters.update(surroundings)
            viewer1.addStyle(waters, {'sphere': {'color': 'orange', 'radius': 0.5}})

            if show_label:
                viewer1.addResLabels(surroundings, {'fontSize': font - 2})

                # text label: the long label takes precedence when both are selected
                if show_long_label:
                    label = chain_id + "." + str(res_num) + ": " + variationId
                else:
                    label = row['annotation']
                viewer1.addLabel(label, {'fontSize': font, 'fontColor': 'black', 'backgroundColor': 'ivory', 'opacity': 1.0}, mod_res)

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})

        if show_surface:
            viewer1.addSurface(py3Dmol.SES, {'opacity': 0.8, 'color': 'lightblue'}, {'chain': chain_id})

        # print header
        first = group.iloc[0]
        resolution = first['resolution']
        coverage = first['coverage']
        description = first['description']
        print(f"PDB Id:{pdb_id}, chain Id:{chain_id}, resolution:{resolution}, sequence coverage:{coverage:.2f}")
        print(description)

        return viewer1.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='structure', continuous_update=False)
    l_widget = FloatSlider(value=0.5, min=0, max=1, step=0.05, description='logFreq:', 
                           continuous_update=False, orientation='horizontal', readout=True, readout_format='.2f')

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, logFreq=l_widget, i=s_widget)

def view_image1():
    """Return the PNG output of the global ``viewer1``.

    ``viewer1`` is set by the grouped-mutation viewer, so that viewer must
    have been displayed before this is called.
    """
    image = viewer1.png()
    return image

In [14]:
# Launch the per-chain viewer; the trailing semicolon suppresses the display of the return value
view_grouped_mutations(chains);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Interacting residues within the `distance_cutoff` are rendered as orange sticks.

In [15]:
# Setup viewer
def view_single_mutation(df, distance_cutoff, *args):
    """Show an interactive py3Dmol view of one mutation (one dataframe row) at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation to show. The columns 'structureChainId'
        ('<pdbId>.<chainId>'), 'pdbPosition', 'color', 'annotation' and
        'variationId' are read.
    distance_cutoff : number
        Distance used to select the residues of other chains that surround the
        mutated residue.
    *args : str
        Names of additional columns of ``df`` whose values are printed below
        the header for the selected row.

    Returns
    -------
    The value returned by ``ipywidgets.interact``, or None (after printing a
    message) when ``df`` is empty.

    Notes
    -----
    Stores the most recently created viewer in the global ``viewer2``
    (used by ``view_image2``).
    """
    # A slider over zero rows would have max < min, so bail out early.
    if len(df) == 0:
        print("No mutations to display.")
        return None

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, i):
        row = df.iloc[i]
        pdb_id, chain_id = row['structureChainId'].split('.')

        global viewer2
        viewer2 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer2.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity': 0.9}})

        # highlight chain of interest in blue
        viewer2.setStyle({'chain': chain_id}, {'cartoon': {'color': 'blue', 'opacity': 0.7}})

        # non-polymer style
        viewer2.setStyle({'hetflag': True}, {'stick': {'radius': 0.3, 'singleBond': False}})

        # mutated residue style
        res_num = str(row['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id}
        col = row['color']
        c_col = col + 'Carbon'
        viewer2.addStyle(mod_res, {'stick': {'colorscheme': c_col, 'radius': 0.2}})
        viewer2.addStyle(mod_res, {'sphere': {'color': col, 'opacity': 0.6}})

        # don't display water molecules (except below for interactions)
        viewer2.setStyle({'resn': ['HOH', 'DOD']}, {})

        # select residues in interacting chains by distance from mutation site (same chain is excluded)
        surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}

        # set style for interacting residues
        viewer2.setStyle(surroundings, {'cartoon': {'color': 'orange', 'width': 0.6}})
        viewer2.addStyle(surroundings, {'stick': {'colorscheme': 'orangeCarbon', 'radius': 0.15}})

        # set style for interacting waters
        waters = {'resn': ['HOH', 'DOD']}
        waters.update(surroundings)
        viewer2.addStyle(waters, {'sphere': {'color': 'orange', 'radius': 0.5}})

        annotation = row['annotation']
        variationId = row['variationId']

        if show_short_label or show_long_label:
            # the long label takes precedence when both are selected
            if show_long_label:
                label = chain_id + "." + str(res_num) + ": " + variationId
            else:
                label = annotation
            viewer2.addResLabels(surroundings, {'fontSize': font - 2})
            viewer2.addLabel(label, {'fontSize': font, 'fontColor': 'black', 'backgroundColor': 'ivory', 'opacity': 1.0}, mod_res)

        viewer2.zoomTo(surroundings)
        viewer2.center(mod_res)

        if show_surface:
            viewer2.addSurface(py3Dmol.SES, {'opacity': 0.8, 'color': 'lightblue'}, {'chain': chain_id})

        # print header
        print("PDB Id:", pdb_id, "chain Id:", chain_id, "residue:", res_num, "mutation:", variationId)

        # print any specified additional columns from the dataframe;
        # str() keeps numeric columns (e.g. resolution) from raising a TypeError
        for a in args:
            print(a + ": " + str(row[a]))

        return viewer2.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(df)-1, description='structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, i=s_widget)

def view_image2():
    """Return the PNG output of the global ``viewer2``.

    ``viewer2`` is set by the single-mutation viewer, so that viewer must
    have been displayed before this is called.
    """
    image = viewer2.png()
    return image

In [16]:
# Keep one row per (chain, variation) pair; the first occurrence is retained
unique_keys = ["structureChainId", "variationId"]
mt_unique = mt.drop_duplicates(subset=unique_keys)

In [17]:
# Launch the single-mutation viewer; the trailing semicolon suppresses the display of the return value
view_single_mutation(mt_unique, distance_cutoff);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [18]:
# Shut down the Spark session now that the notebook no longer needs it
spark.stop()

## Now run the next step
Map mutations occurring at protein-ligand binding sites: [3-MapToLigandInteractions.ipynb](3-MapToLigandInteractions.ipynb)