## Map Mutations to Protein-Protein and Protein-Nucleic Acid Interfaces
Here we find and visualize the mutations that occur around protein-protein and protein-nucleic acid interfaces.

In [1]:
# Temporary workaround for https://github.com/sbl-sdsc/mmtf-pyspark/issues/288:
# switch Numba's JIT off through its environment flag.
# NOTE(review): presumably this has to run before mmtfPyspark is imported -- confirm.
import os
os.environ.update({'NUMBA_DISABLE_JIT': "1"})

In [2]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter
from ipywidgets import interact, IntSlider, FloatSlider, SelectMultiple
import py3Dmol
from py2neo import Graph
import pandas as pd



In [3]:
# Create the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("3-MapToPolymerInteractions")
    .getOrCreate()
)

In [4]:
# Enable Arrow-based columnar data transfers between Spark and Pandas dataframes.
# Spark 3.0 renamed this option to "spark.sql.execution.arrow.pyspark.enabled" and
# deprecated the old key, so pick the key that matches the running Spark version.
_spark_major_version = int(spark.version.split(".")[0])
_arrow_option = (
    "spark.sql.execution.arrow.pyspark.enabled"
    if _spark_major_version >= 3
    else "spark.sql.execution.arrow.enabled"
)
spark.conf.set(_arrow_option, "true")

#### Input parameters

In [5]:
# User-tunable settings for this notebook.
# distance_cutoff is passed to InteractionFilter(distanceCutoff=...) and to the
# py3Dmol 'within' selections in the viewers below.
# NOTE(review): units are not stated in this notebook; presumably Angstroms -- confirm.
distance_cutoff = 8 # distance cutoff for finding and visualizing interactions
# CSV read at the start of this notebook (output of the previous step)
input_file_name = 'mutations3d.csv' # mutations mapped to 3D protein structures
# CSV written by the "Save Mappings" cell
output_file_name = 'mutations3d_polymer_inter.csv' # mutations mapped to polymer interactions

#### Connect to COVID-19-Community Knowledge Graph
[COVID-19-Net Knowledge Graph](https://github.com/covid-19-net/covid-19-community)

In [6]:
# Connect to the knowledge graph. The defaults are the demo endpoint and
# credentials this notebook has always used; set COVID19_NET_URI,
# COVID19_NET_USER or COVID19_NET_PASSWORD in the environment to target a
# different deployment without editing the notebook.
# NOTE(review): `graph` is not referenced by any other cell visible here.
graph = Graph(
    os.environ.get("COVID19_NET_URI", "bolt://132.249.238.185:7687"),
    user=os.environ.get("COVID19_NET_USER", "reader"),
    password=os.environ.get("COVID19_NET_PASSWORD", "demo"),
)

## Read mutation file created in the previous step

In [7]:
pd.set_option('display.max_columns', None)  # show all columns

# PDB residue numbers must be strings to handle insertion codes. Read the column
# as text at parse time instead of converting afterwards: if pandas first infers
# a float column (e.g. because of a missing value), a later astype('str') would
# produce '336.0' and the string join with the interaction table would silently
# find no match.
df = pd.read_csv(input_file_name, dtype={'pdbPosition': str})
df.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color
0,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,336,336,336,QHD43416.1:p.336C>R(2),336C>R(2),0.061764,#ffebe2
1,0.154639,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,2.77,7C8W.B,7C8W,336,336,336,QHD43416.1:p.336C>R(2),336C>R(2),0.061764,#ffebe2
2,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,337,337,337,"QHD43416.1:p.337P>R(4), QHD43416.1:p.337P>S(2)","337P>R(4), 337P>S(2)",0.159657,#fdd5c4
3,0.154639,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,2.77,7C8W.B,7C8W,337,337,337,"QHD43416.1:p.337P>R(4), QHD43416.1:p.337P>S(2)","337P>R(4), 337P>S(2)",0.159657,#fdd5c4
4,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,338,338,338,QHD43416.1:p.338F>L(4),338F>L(4),0.123527,#fee1d3


## Create a list of unique PDB Ids

In [8]:
# Collect every PDB Id once, in order of first appearance in the mutation table
pdb_ids = df['structureId'].unique().tolist()
print("PDB Ids:", pdb_ids)

PDB Ids: ['6YZ5', '7C8W']


## Find all polymer-polymer interactions
Find groups (residues) that interact with other polymer chains

In [9]:
# Download the MMTF entries for the PDB Ids collected above; the result is the
# input to the polymer-interaction extraction in the next code cell.
# NOTE(review): presumably requires network access -- confirm.
structures = mmtfReader.download_mmtf_files(pdb_ids)

queryGroupId, queryChainId, and queryGroupNumber specify the residue that interacts with another chain (targetChainId)

In [10]:
# Only contacts within the configured distance cutoff are considered
interaction_filter = InteractionFilter(distanceCutoff=distance_cutoff)

# Extract polymer-polymer interactions at the 'group' level and
# convert the result to a pandas DataFrame
interactions = (
    InteractionExtractor()
    .get_polymer_interactions(structures, interaction_filter, level='group')
    .toPandas()
)
interactions.head()

Unnamed: 0,structureChainId,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,7C8W.B,ASP,A,101,PHE,B,456,129,AGSPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSA...
1,7C8W.A,SER,B,494,LYS,A,99,98,QVQLVESGGGLVQAGGSLRLSCAASGFPVEVWRMEWYRQAPGKERE...
2,7C8W.B,LEU,A,104,ASN,B,501,174,AGSPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSA...
3,7C8W.B,LYS,A,65,CYS,B,480,153,AGSPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSA...
4,7C8W.B,ALA,A,50,GLU,B,484,157,AGSPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSA...


## Filter mutations by joining with the interaction data

In [11]:
# Keep only the mutations whose chain and residue number appear as the target
# residue of a polymer interaction (inner join drops everything else)
mt = pd.merge(
    df,
    interactions,
    how='inner',
    left_on=['structureChainId', 'pdbPosition'],
    right_on=['structureChainId', 'targetGroupNumber'],
)
mt.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,346,346,346,QHD43416.1:p.346R>K(4),346R>K(4),0.123527,#fee1d3,ARG,F,27,ARG,E,346,16,PNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFS...
1,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,346,346,346,QHD43416.1:p.346R>K(4),346R>K(4),0.123527,#fee1d3,PHE,F,29,ARG,E,346,16,PNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFS...
2,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,346,346,346,QHD43416.1:p.346R>K(4),346R>K(4),0.123527,#fee1d3,SER,F,30,ARG,E,346,16,PNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFS...
3,0.154639,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,2.77,7C8W.B,7C8W,403,403,403,QHD43416.1:p.403R>S(2),403R>S(2),0.061764,#ffebe2,LEU,A,104,ARG,B,403,76,AGSPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSA...
4,0.154639,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,2.77,7C8W.B,7C8W,403,403,403,QHD43416.1:p.403R>S(2),403R>S(2),0.061764,#ffebe2,ASP,A,101,ARG,B,403,76,AGSPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSA...


## Save Mappings

In [12]:
# Write the mutation/interaction table to output_file_name as CSV,
# leaving out the DataFrame index column
mt.to_csv(output_file_name, index=False)

In [13]:
# One group per '<pdbId>.<chainId>' key; used by the grouped viewer below
chains = mt.groupby(by='structureChainId')
print("Chains:", list(chains.groups))

Chains: ['6YZ5.E', '7C8W.B']


## View mutations grouped by protein chain

In [14]:
# Setup viewer
def view_grouped_mutations(grouped_df, *args):
    """Create an interactive py3Dmol view that shows all mutations of one chain at a time.

    Parameters
    ----------
    grouped_df : pandas GroupBy
        Mutation rows grouped by chain. Group keys must have the form
        '<pdbId>.<chainId>'. Each group is read for the columns 'pdbPosition',
        'scale', 'color', 'annotation', 'variationId', 'resolution',
        'coverage' and 'description'.
    *args
        Accepted for call compatibility; not used.

    Returns
    -------
    Whatever ``ipywidgets.interact`` returns for the configured controls.

    Notes
    -----
    * The selection radius is read from the module-level ``distance_cutoff``.
    * The most recent viewer is stored in the module-level ``viewer1`` so that
      ``view_image1`` can export it.
    """
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, logFreq, i):
        group = grouped_df.get_group(chainIds[i])

        pdb_id, chain_id = chainIds[i].split('.')
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer1.setStyle({'resn': ['HOH','DOD']}, {})

        show_labels = show_short_label or show_long_label

        # A group may list the same mutated residue several times (the merged
        # table has one row per interacting residue), so remember which residues
        # were already drawn and style/label each of them only once.
        modified_residues = set()
        for j in range(group.shape[0]):
            row = group.iloc[j]
            if not (row['scale'] > logFreq):
                continue

            res_num = str(row['pdbPosition'])
            if res_num in modified_residues:
                continue
            modified_residues.add(res_num)

            mod_res = {'resi': res_num, 'chain': chain_id}

            # style for mutated residue
            col = row['color']
            c_col = col + 'Carbon'
            viewer1.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer1.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})

            # style for interacting residues: whole residues of other chains
            # within distance_cutoff of the mutated residue
            surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}
            viewer1.setStyle(surroundings,{'cartoon':{'color':'orange', 'width': 0.6}})
            viewer1.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

            # style for interacting waters
            waters = {'resn': ['HOH','DOD']}
            waters.update(surroundings)
            viewer1.addStyle(waters,{'sphere':{'color':'orange', 'radius': 0.5}})

            if show_labels:
                viewer1.addResLabels(surroundings, {'fontSize':font-2})

                # text label: the long form takes precedence when both are selected
                if show_long_label:
                    label = chain_id + "." + res_num + ": " + row['variationId']
                else:
                    label = row['annotation']
                viewer1.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, mod_res)

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})

        if show_surface:
            viewer1.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        # print header (taken from the first row of the group)
        header = group.iloc[0]
        resolution = header['resolution']
        coverage = header['coverage']
        description = header['description']
        print(f"PDB Id:{pdb_id}, chain Id:{chain_id}, resolution:{resolution}, sequence coverage:{coverage:.2f}")
        print(description)

        return viewer1.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='structure', continuous_update=False)
    l_widget = FloatSlider(value=0.5, min=0, max=1, step=0.05, description='logFreq:', 
                           continuous_update=False, orientation='horizontal', readout=True, readout_format='.2f')

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, logFreq=l_widget, i=s_widget)

def view_image1():
    """Return the result of ``png()`` on the module-level ``viewer1``.

    ``viewer1`` is assigned when the grouped-mutation view is rendered, so that
    view has to be displayed at least once before calling this function.
    """
    snapshot = viewer1.png()
    return snapshot

In [15]:
# Show the interactive per-chain viewer for the grouped mutations;
# the trailing ';' suppresses the cell's return-value output.
view_grouped_mutations(chains);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Interacting residues within the `distance_cutoff` are rendered as orange sticks.

In [16]:
# Setup viewer
def view_single_mutation(df, distance_cutoff, *args):
    """Create an interactive py3Dmol view that shows one mutation (one row of ``df``) at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation to display. Rows are addressed by position and
        read for the columns 'structureChainId' (of the form
        '<pdbId>.<chainId>'), 'pdbPosition', 'color', 'annotation' and
        'variationId'.
    distance_cutoff : number
        Radius of the py3Dmol 'within' selection around the mutated residue
        that picks the interacting residues of other chains.
    *args : str
        Names of additional ``df`` columns whose value is printed below the
        header for the selected row. Values of any type are accepted.

    Returns
    -------
    Whatever ``ipywidgets.interact`` returns for the configured controls.

    Notes
    -----
    The most recent viewer is stored in the module-level ``viewer2`` so that
    ``view_image2`` can export it.
    """

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, i):
        row = df.iloc[i]
        pdb_id, chain_id = row['structureChainId'].split('.')

        global viewer2
        viewer2 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer2.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer2.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'opacity':0.7}})

        # non-polymer style
        viewer2.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # mutated residue style
        res_num = str(row['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id}
        col = row['color']
        c_col = col + 'Carbon'
        viewer2.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
        viewer2.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})

        # don't display water molecules (except below for interactions)
        viewer2.setStyle({'resn': ['HOH','DOD']}, {})

        # select residues in interacting chains by distance from mutation site (same chain is excluded)
        surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': distance_cutoff, 'sel': mod_res}}

        # set style for interacting residues
        viewer2.setStyle(surroundings,{'cartoon':{'color':'orange', 'width': 0.6}})
        viewer2.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        # set style for interacting waters
        waters = {'resn': ['HOH','DOD']}
        waters.update(surroundings)
        viewer2.addStyle(waters,{'sphere':{'color':'orange', 'radius': 0.5}})

        variationId = row['variationId']

        if show_short_label or show_long_label:
            # the long form takes precedence when both label options are selected
            if show_long_label:
                label = chain_id + "." + res_num + ": " + variationId
            else:
                label = row['annotation']
            viewer2.addResLabels(surroundings, {'fontSize':font-2})
            viewer2.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, mod_res)

        viewer2.zoomTo(surroundings)
        viewer2.center(mod_res)

        if show_surface:
            viewer2.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})

        #print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "mutation:", variationId)

        # print any specified additional columns from the dataframe; format the
        # value instead of concatenating so numeric columns (e.g. 'resolution')
        # do not raise a TypeError
        for a in args:
            print(f"{a}: {row[a]}")

        return viewer2.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(df)-1, description='structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, 
                    size=z_widget, font=f_widget, i=s_widget)

def view_image2():
    """Return the result of ``png()`` on the module-level ``viewer2``.

    ``viewer2`` is assigned when the single-mutation view is rendered, so that
    view has to be displayed at least once before calling this function.
    """
    snapshot = viewer2.png()
    return snapshot

In [17]:
# Keep the first row for every (chain, variation) pair so that each mutation
# is shown once per chain in the single-mutation viewer
mt_unique = mt.drop_duplicates(subset=["structureChainId", "variationId"])

In [18]:
# Show the interactive single-mutation viewer for the de-duplicated table;
# the trailing ';' suppresses the cell's return-value output.
view_single_mutation(mt_unique, distance_cutoff);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [19]:
# Shut down the Spark session created at the top of this notebook
spark.stop()

## Now run the next step
Map mutations occurring at protein-ligand binding sites: [3-MapToLigandInteractions.ipynb](3-MapToLigandInteractions.ipynb)