## Map Mutations to Protein-Protein and Protein-Nucleic Acid Interfaces
Here we find and visualize the mutations that occur around protein-protein and protein-nucleic acid interfaces.

In [1]:
from pyspark.sql.functions import explode
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, InteractionFingerprinter
from ipywidgets import interact, IntSlider
import py3Dmol
import pandas as pd

#### Input parameters

In [2]:
# Distance threshold passed to the interaction filter and reused by the single-mutation
# viewer to select neighbouring residues (NOTE(review): unit is presumably Angstrom -- confirm).
cutoff_distance = 8 # cutoff distance for finding and visualizing interactions
# Path of the CSV file that is loaded into the mutation table below.
input_file_name = 'mutations3d.csv' # mutations mapped to 3D protein structures

## Read 'mutations3d.csv' file created in the previous step

In [3]:
# PDB residue numbers must be strings to handle insertion codes (e.g. '52A').
# Parse the column as text directly: casting after type inference would turn a column
# that pandas inferred as float (e.g. because of a missing value) into '83.0'-style
# strings, which never match the residue numbers used in the join further below.
# Note: missing values stay NaN instead of becoming the string 'nan'.
df = pd.read_csv(input_file_name, dtype={'pdbPosition': str})
df.head()

Unnamed: 0,structureId,chainId,pdbPosition,pdbAminoAcid,refGenome,variationId,pdbid,chain,tax_id,scientific_name,structureChainId
0,1GUH,A,83,G,hgvs-grch37,chr6:g.52619766C>T,1GUH,A,9606,Homo sapiens,1GUH.A
1,1XZ7,B,108,N,hgvs-grch37,chr11:g.5246945G>T,1XZ7,B,9606,Homo sapiens,1XZ7.B
2,2VCT,D,83,G,hgvs-grch37,chr6:g.52619766C>T,2VCT,D,9606,Homo sapiens,2VCT.D
3,3KMF,G,708,N,hgvs-grch37,chr11:g.5246945G>T,3KMF,G,9606,Homo sapiens,3KMF.G
4,1GSF,D,83,G,hgvs-grch37,chr6:g.52619766C>T,1GSF,D,9606,Homo sapiens,1GSF.D


## Create a list of unique PDB Ids

In [4]:
# Collect every structure Id once, keeping the order of first appearance
pdb_ids = df['structureId'].drop_duplicates().tolist()
print("PDB Ids:", pdb_ids)

PDB Ids: ['1GUH', '1XZ7', '2VCT', '3KMF', '1GSF', '1YDZ', '6ATQ', '1Y09', '6ATO', '2FO0', '1XYE', '4ACS', '2WJU', '4WA9', '2G1T', '1XY0', '4TWP', '1K3Y', '1YHE', '1OPL', '2V7A', '1YHR', '1YZI', '1QXD', '1XZV', '1VWT', '3UE4', '2R6K', '1PL1', '3QRI', '1DXT', '6FQF', '2HIW', '2GQG', '1XZ4', '6ATR', '2F4J', '1Y0A', '1Y0D', '1XZU', '2W6V', '1YH9', '2G2H', '5JCU', '4ZOG', '1XZ5', '1GSE', '1Y0C', '1Y8W', '2HZI', '1PKW', '3KTL', '3QRK', '1XXT', '1XZ2', '6ATP', '1K3L', '4XS0', '2G2F', '5MO4']


## Find all polymer-polymer interactions

In [5]:
# Fetch the MMTF entries for the PDB Ids listed in `pdb_ids`; the result is the input
# of the interaction search in the next cell.
# NOTE(review): presumably needs network access and a running Spark context -- confirm.
structures = mmtfReader.download_mmtf_files(pdb_ids)

In [6]:
# Interaction criteria: distance cutoff from the input parameters, at least one interaction
interactionFilter = InteractionFilter(distanceCutoff=cutoff_distance, minInteractions=1)

# The following step is slow. It will be replaced by a faster implementation.
# Pipeline: compute the polymer-polymer interactions, keep the three columns needed for
# the join, turn the list of residue numbers into one row per residue, and drop repeats.
ifp = (
    InteractionFingerprinter.get_polymer_interactions(structures, interactionFilter)
    .select("structureChainId", "queryChainId", "groupNumbers")
    .withColumn("groupNumbers", explode("groupNumbers"))
    .dropDuplicates()
)

# Bring the (small) result into pandas for the join with the mutation table
interactions = ifp.toPandas()
interactions.head()

Unnamed: 0,structureChainId,queryChainId,groupNumbers
0,1PL1.B,A,75
1,1Y09.A,D,139
2,1K3Y.A,B,91
3,1YHR.D,A,96
4,1Y0A.C,D,110


## Filter mutations by joining with the interaction data

In [7]:
# Inner join on (chain, residue number): only mutations located at an interacting
# residue survive; each match also carries the partner chain in 'queryChainId'.
mt = pd.merge(
    df,
    interactions,
    how='inner',
    left_on=['structureChainId', 'pdbPosition'],
    right_on=['structureChainId', 'groupNumbers'],
)
mt.head()

Unnamed: 0,structureId,chainId,pdbPosition,pdbAminoAcid,refGenome,variationId,pdbid,chain,tax_id,scientific_name,structureChainId,queryChainId,groupNumbers
0,3KMF,G,708,N,hgvs-grch37,chr11:g.5246945G>T,3KMF,G,9606,Homo sapiens,3KMF.G,E,708
1,1Y09,D,108,N,hgvs-grch37,chr11:g.5246945G>T,1Y09,D,9606,Homo sapiens,1Y09.D,C,108
2,1YHE,D,108,N,hgvs-grch37,chr11:g.5246945G>T,1YHE,D,9606,Homo sapiens,1YHE.D,C,108
3,1YZI,B,108,N,hgvs-grch37,chr11:g.5246945G>T,1YZI,B,9606,Homo sapiens,1YZI.B,A,108
4,1QXD,D,108,N,hgvs-grch37,chr11:g.5246945G>T,1QXD,D,9606,Homo sapiens,1QXD.D,C,108


## View mutations grouped by protein chain
Use the slider to view each protein chain. Turn labels off for an unobstructed view of the mutations.

In [8]:
# One group per structure chain; the group keys drive the slider of the viewer below
chains = mt.groupby(by='structureChainId')
print("Chains:", list(chains.groups))

Chains: ['1QXD.D', '1VWT.D', '1XXT.D', '1XZ2.D', '1XZU.D', '1Y09.D', '1YHE.D', '1YZI.B', '2G2F.A', '2W6V.D', '3KMF.G', '6FQF.D']


In [9]:
def view_grouped_mutations(grouped_df, cutoff_distance, *args):
    """Browse mutations chain by chain in an interactive py3Dmol viewer.

    A slider selects one group of ``grouped_df``; the corresponding PDB entry is
    loaded, the chain of interest is drawn in blue and every mutated residue of
    that chain is highlighted as red sticks and spheres.

    Parameters
    ----------
    grouped_df : pandas GroupBy object
        Mutations grouped by a key of the form ``'<pdbId>.<chainId>'``. Each group
        must provide the columns ``'pdbPosition'`` and ``'variationId'``.
    cutoff_distance : number
        Not used by this function; kept so existing callers keep working.
    *args
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    The value returned by ``ipywidgets.interact`` for the inner ``view3d`` function.
    """
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        """Render the structure of the i-th chain and highlight its mutations."""
        structure_chain_id = chainIds[i]
        group = grouped_df.get_group(structure_chain_id)

        pdb_id, chain_id = structure_chain_id.split('.')
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH','DOD']}, {})

        # The same mutation can appear on several rows (e.g. one row per interacting
        # partner chain after a join with an interaction table). Draw each
        # (residue, variation) pair only once so labels are not stacked on top of
        # each other and styles are not re-applied needlessly.
        sites = group[['pdbPosition', 'variationId']].drop_duplicates()

        col = 'red'
        c_col = col + 'Carbon'
        for position, label in zip(sites['pdbPosition'], sites['variationId']):
            # residue numbers are handled as strings (they may carry insertion codes)
            res_num = str(position)
            mod_res = {'resi': res_num, 'chain': chain_id}
            viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})
            if show_labels:
                viewer.addLabel(label, {'fontSize':7,'fontColor': 'black','backgroundColor':'ivory'}, mod_res)

        viewer.zoomTo({'chain': chain_id})

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        #print header
        print("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        return viewer.show()

    # one slider position per chain
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

Turn off scrolling in the viewer cell below

In [10]:
%%javascript 
// Replace the output area's scroll check with a function that always returns false,
// so cell output (the 3D viewer below) is never placed in a scrolling box.
// NOTE(review): relies on the global `IPython` object, presumably only present in the
// classic Notebook front end -- confirm before using in other front ends.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

In [11]:
# Launch the per-chain viewer; the trailing ';' suppresses the echo of the return value
view_grouped_mutations(chains, cutoff_distance);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Interacting residues within the `cutoff_distance` are rendered as orange sticks.

In [12]:
def view_single_mutation(df, cutoff_distance, *args):
    """Browse mutations one at a time in an interactive py3Dmol viewer.

    A slider selects one row of ``df``. The corresponding PDB entry is loaded, the
    mutated residue is highlighted in red, and residues of *other* chains within
    ``cutoff_distance`` of the mutation site are drawn in orange.

    Parameters
    ----------
    df : pandas.DataFrame
        One mutation per row. Required columns: ``'structureChainId'`` (formatted as
        ``'<pdbId>.<chainId>'``), ``'pdbPosition'`` and ``'variationId'``.
    cutoff_distance : number
        Distance around the mutated residue used to select interacting residues.
    *args : str
        Names of additional columns of ``df`` whose value for the selected row is
        printed below the header line. Values of any type are accepted.

    Returns
    -------
    The value returned by ``ipywidgets.interact`` for the inner ``view3d`` function.
    """

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        """Render the structure for the i-th mutation and its interacting residues."""
        row = df.iloc[i]
        pdb_id, chain_id = row['structureChainId'].split('.')

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'opacity':0.7}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # mutated residue style
        # (residue numbers are handled as strings; they may carry insertion codes)
        res_num = str(row['pdbPosition'])
        label = row['variationId']
        mod_res = {'resi': res_num, 'chain': chain_id}
        col = 'red'
        c_col = col + 'Carbon'
        viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
        viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.8}})

        # don't display water molecules (except below for interactions)
        viewer.setStyle({'resn': ['HOH','DOD']}, {})

        # select residues in interacting chains by distance from mutation site (same chain is excluded)
        surroundings = {'not': {'chain': chain_id}, 'byres': True, 'within': {'distance': cutoff_distance, 'sel': mod_res}}

        # set style for interacting residues
        viewer.setStyle(surroundings,{'cartoon':{'color':'orange', 'width': 0.6}})
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        if show_labels:
            viewer.addResLabels(surroundings, {'fontSize':10})
            viewer.addLabel(label, {'fontSize':12,'fontColor': 'black','backgroundColor':'ivory'}, mod_res)

        viewer.zoomTo(surroundings)

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})

        #print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "mutation:", label)

        # print any specified additional columns from the dataframe;
        # str() is required because a column may hold non-string values (e.g. an
        # integer taxonomy id), which cannot be concatenated to a string directly
        for a in args:
            print(a + ": " + str(row[a]))

        return viewer.show()

    # one slider position per mutation (row)
    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

In [13]:
# Launch the per-mutation viewer on the filtered mutation table; the trailing ';'
# suppresses the echo of the return value
view_single_mutation(mt, cutoff_distance);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## Now run the next step
Map mutations occurring at protein-ligand binding sites: [4-MapToLigandInteractions.ipynb](4-MapToLigandInteractions.ipynb)