## Map Mutations to Protein-Ligand Interactions
Here we find and visualize the mutations at protein-ligand binding sites.

In [1]:
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter
from ipywidgets import interact, IntSlider
import py3Dmol
import pandas as pd

#### Input parameters

In [2]:
# cutoff_distance is passed to InteractionFilter(distanceCutoff=...) and is also used as the
# 'expand' radius when selecting residues around a mutation in the single-mutation viewer
# (presumably in Angstroms -- TODO confirm against the mmtfPyspark / 3Dmol.js documentation)
cutoff_distance = 8 # cutoff distance for finding and visualizing interactions
input_file_name = 'mutations3d.csv' # mutations mapped to 3D protein structures
output_file_name = 'mutations3d_ligand.csv' # mutations mapped to protein-ligand interactions

### Read 'mutations3d.csv' file created in the previous step

In [3]:
# PDB residue numbers must be strings to handle insertion codes (e.g. '100A').
# Read the column as text directly: converting after pandas' type inference would turn
# residue numbers into '83.0' (and missing values into 'nan') whenever the column contains
# a missing value, and those strings would never match in the join with the interaction data.
df = pd.read_csv(input_file_name, dtype={'pdbPosition': str})
df

Unnamed: 0,structureId,chainId,pdbPosition,pdbAminoAcid,refGenome,variationId,pdbid,chain,tax_id,scientific_name,structureChainId
0,1GUH,A,83,G,hgvs-grch37,chr6:g.52619766C>T,1GUH,A,9606,Homo sapiens,1GUH.A
1,1XZ7,B,108,N,hgvs-grch37,chr11:g.5246945G>T,1XZ7,B,9606,Homo sapiens,1XZ7.B
2,2VCT,D,83,G,hgvs-grch37,chr6:g.52619766C>T,2VCT,D,9606,Homo sapiens,2VCT.D
3,3KMF,G,708,N,hgvs-grch37,chr11:g.5246945G>T,3KMF,G,9606,Homo sapiens,3KMF.G
4,1GSF,D,83,G,hgvs-grch37,chr6:g.52619766C>T,1GSF,D,9606,Homo sapiens,1GSF.D
5,1YDZ,B,108,N,hgvs-grch37,chr11:g.5246945G>T,1YDZ,B,9606,Homo sapiens,1YDZ.B
6,6ATQ,A,83,G,hgvs-grch37,chr6:g.52619766C>T,6ATQ,A,9606,Homo sapiens,6ATQ.A
7,3KMF,C,308,N,hgvs-grch37,chr11:g.5246945G>T,3KMF,C,9606,Homo sapiens,3KMF.C
8,1Y09,D,108,N,hgvs-grch37,chr11:g.5246945G>T,1Y09,D,9606,Homo sapiens,1Y09.D
9,6ATO,A,83,G,hgvs-grch37,chr6:g.52619766C>T,6ATO,A,9606,Homo sapiens,6ATO.A


## Create a list of unique PDB IDs

In [4]:
# unique structure IDs, in order of first appearance in the mutation table
pdb_ids = df['structureId'].drop_duplicates().tolist()
pdb_ids

['1GUH',
 '1XZ7',
 '2VCT',
 '3KMF',
 '1GSF',
 '1YDZ',
 '6ATQ',
 '1Y09',
 '6ATO',
 '2FO0',
 '1XYE',
 '4ACS',
 '2WJU',
 '4WA9',
 '2G1T',
 '1XY0',
 '4TWP',
 '1K3Y',
 '1YHE',
 '1OPL',
 '2V7A',
 '1YHR',
 '1YZI',
 '1QXD',
 '1XZV',
 '1VWT',
 '3UE4',
 '2R6K',
 '1PL1',
 '3QRI',
 '1DXT',
 '6FQF',
 '2HIW',
 '2GQG',
 '1XZ4',
 '6ATR',
 '2F4J',
 '1Y0A',
 '1Y0D',
 '1XZU',
 '2W6V',
 '1YH9',
 '2G2H',
 '5JCU',
 '4ZOG',
 '1XZ5',
 '1GSE',
 '1Y0C',
 '1Y8W',
 '2HZI',
 '1PKW',
 '3KTL',
 '3QRK',
 '1XXT',
 '1XZ2',
 '6ATP',
 '1K3L',
 '4XS0',
 '2G2F',
 '5MO4']

## Find all polymer-ligand interactions

In [5]:
# Download the MMTF entries for the PDB IDs collected above
# (presumably requires network access and a running Spark context -- TODO confirm)
structures = mmtfReader.download_mmtf_files(pdb_ids)

In [6]:
# Filter setup: interactions within the cutoff distance, with waters excluded as query groups
interaction_filter = InteractionFilter(distanceCutoff=cutoff_distance)
interaction_filter.set_query_groups(False, ["HOH", "DOD"])  # False = exclude the listed groups

# Extract the ligand-polymer interactions, then convert the result to a pandas DataFrame
extractor = InteractionExtractor()
ligand_polymer_interactions = extractor.get_ligand_polymer_interactions(structures, interaction_filter)
interactions = ligand_polymer_interactions.toPandas()
interactions

Unnamed: 0,structureChainId,queryLigandId,queryLigandChainId,queryLigandNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,4ZOG.A,MXE,A,604,ALA,A,412,183,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
1,4ZOG.B,MES,B,602,GLU,B,334,105,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
2,4ZOG.A,VX6,A,601,TYR,A,312,83,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
3,4ZOG.B,VX6,B,601,ASP,B,381,152,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
4,4ZOG.B,VX6,B,601,GLY,B,254,25,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
5,4ZOG.A,VX6,A,601,LYS,A,247,18,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
6,4ZOG.A,VX6,A,601,GLY,A,250,21,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
7,4ZOG.A,MES,B,602,GLU,A,281,52,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
8,4ZOG.A,MXE,A,602,TYR,A,253,24,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...
9,4ZOG.B,VX6,B,601,THR,B,267,38,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...


## Filter mutations by joining with the interaction data

In [7]:
# Keep only the mutations whose residue (same chain, same residue number) appears
# as an interaction target; the inner join drops mutations without a ligand contact.
mt = pd.merge(
    df,
    interactions,
    how='inner',
    left_on=['structureChainId', 'pdbPosition'],
    right_on=['structureChainId', 'targetGroupNumber'],
)
mt

Unnamed: 0,structureId,chainId,pdbPosition,pdbAminoAcid,refGenome,variationId,pdbid,chain,tax_id,scientific_name,structureChainId,queryLigandId,queryLigandChainId,queryLigandNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,3KMF,C,308,N,hgvs-grch37,chr11:g.5246945G>T,3KMF,C,9606,Homo sapiens,3KMF.C,HEM,C,347,ASN,C,308,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...
1,2FO0,A,272,Y,hgvs-grch37,chr9:g.133738358A>T,2FO0,A,9606,Homo sapiens,2FO0.A,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
2,4WA9,B,253,Y,hgvs-grch37,chr9:g.133738358A>T,4WA9,B,9606,Homo sapiens,4WA9.B,AXI,B,9000,TYR,B,253,26,GSSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKT...
3,2G1T,C,253,Y,hgvs-grch37,chr9:g.133738358A>T,2G1T,C,9606,Homo sapiens,2G1T.C,MG,C,1701,TYR,C,253,27,GHMSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVK...
4,2G1T,C,253,Y,hgvs-grch37,chr9:g.133738358A>T,2G1T,C,9606,Homo sapiens,2G1T.C,112,G,1301,TYR,C,253,27,GHMSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVK...
5,4TWP,B,253,Y,hgvs-grch37,chr9:g.133738358A>T,4TWP,B,9606,Homo sapiens,4TWP.B,AXI,B,601,TYR,B,253,20,DKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTM...
6,1OPL,A,272,Y,hgvs-grch37,chr9:g.133738358A>T,1OPL,A,9606,Homo sapiens,1OPL.A,P16,A,539,TYR,A,272,271,MGQQPGKVLGDQRRPSLPALHFIKGAGKRDSSRHGGPHCNVFVEHE...
7,2V7A,A,253,Y,hgvs-grch37,chr9:g.133738358A>T,2V7A,A,9606,Homo sapiens,2V7A.A,627,A,1504,TYR,A,253,26,GPSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKT...
8,1YZI,B,108,N,hgvs-grch37,chr11:g.5246945G>T,1YZI,B,9606,Homo sapiens,1YZI.B,HEM,B,148,ASN,B,108,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...
9,1QXD,D,108,N,hgvs-grch37,chr11:g.5246945G>T,1QXD,D,9606,Homo sapiens,1QXD.D,HEM,D,148,ASN,D,108,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...


## Save protein-ligand mapping

In [8]:
# index=False: do not write the DataFrame's row index as an extra CSV column
mt.to_csv(output_file_name, index=False)

## View mutations grouped by protein chain
Use the slider to view each protein chain. Turn labels off for an unobstructed view of the mutations. Interacting ligands are rendered as spheres with green carbon atoms.

In [9]:
# one group per "<pdbId>.<chainId>" key; the group keys drive the viewer slider below
chains = mt.groupby(by='structureChainId')
print("Chains:", list(chains.groups))

Chains: ['1OPL.A', '1QXD.D', '1YZI.B', '2F4J.A', '2FO0.A', '2G1T.C', '2G2F.A', '2G2H.A', '2GQG.B', '2HIW.A', '2HZI.A', '2V7A.A', '2W6V.D', '3KMF.C', '3QRI.A', '3QRK.A', '3UE4.A', '4TWP.B', '4WA9.B', '4XS0.B', '4ZOG.A', '5MO4.A', '6FQF.D']


In [10]:
def view_grouped_mutations(grouped_df, cutoff_distance, *args):
    """Create an interactive 3D view showing all mutations of one protein chain at a time.

    Parameters
    ----------
    grouped_df : pandas GroupBy
        Mutation/interaction rows grouped by a "<pdbId>.<chainId>" key. Each group must
        provide the columns 'pdbPosition', 'variationId', 'queryLigandId',
        'queryLigandChainId' and 'queryLigandNumber'.
    cutoff_distance : number
        Not used by this view; accepted so the call signature matches
        view_single_mutation.
    *args : str
        Names of additional columns whose values are printed below the header.

    Returns
    -------
    Whatever ipywidgets.interact returns for the nested view3d callback.
    """
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        group = grouped_df.get_group(chainIds[i])

        pdb_id, chain_id = chainIds[i].split('.')
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH','DOD']}, {})

        col = 'red'
        c_col = col + 'Carbon'

        # one pass over the rows as plain dicts instead of repeated positional lookups
        for row in group.to_dict('records'):
            # interacting residue info
            res_num = str(row['pdbPosition'])
            mod_res = {'resi': res_num, 'chain': chain_id}

            # interacting ligand info; str() keeps the label concatenation below working
            # when ids/numbers were parsed as integers (e.g. after a CSV round-trip)
            lig_id = str(row['queryLigandId'])
            lig_chain = row['queryLigandChainId']
            lig_num = str(row['queryLigandNumber'])
            lig_res = {'resi': lig_num, 'chain': lig_chain}

            viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})
            viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

            if show_labels:
                label = row['variationId']
                viewer.addLabel(label, {'fontSize':8,'fontColor': 'black','backgroundColor':'ivory'}, mod_res)
                viewer.addLabel(lig_id + lig_num, {'fontSize':8}, lig_res)

        viewer.zoomTo({'chain': chain_id})

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        # print header
        print("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe: the distinct values
        # found in this chain's rows (in row order), converted to text so that numeric
        # columns do not break the string concatenation
        for a in args:
            values = dict.fromkeys(str(v) for v in group[a])
            print(a + ": " + ", ".join(values))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

Turn off scrolling in the viewer cell below

In [11]:
%%javascript 
// Replace the output area's _should_scroll check with a function that always returns false
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

In [12]:
# trailing semicolon suppresses the repr of the returned value; only the widget is shown
view_grouped_mutations(chains, cutoff_distance);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Surrounding residues within the `cutoff_distance` are rendered as orange sticks. Interacting ligands are rendered as spheres with green carbon atoms.

In [13]:
def view_single_mutation(df, cutoff_distance, *args):
    """Create an interactive 3D view showing one mutation (one dataframe row) at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation/ligand interaction. Must provide the columns
        'structureChainId' (formatted "<pdbId>.<chainId>"), 'pdbPosition', 'variationId',
        'queryLigandId', 'queryLigandChainId' and 'queryLigandNumber'.
    cutoff_distance : number
        Radius passed as the 'expand' value of the selection around the mutated residue;
        residues in that selection are drawn as orange sticks and the view zooms to them.
    *args : str
        Names of additional columns whose values are printed below the header.

    Returns
    -------
    Whatever ipywidgets.interact returns for the nested view3d callback.
    """

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        row = df.iloc[i]  # look the row up once instead of once per column
        pdb_id, chain_id = row['structureChainId'].split('.')

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.7}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'width': 0.6, 'opacity':0.5}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH','DOD']}, {})

        # interacting residue info
        res_num = str(row['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id}

        # interacting ligand info; str() keeps the label concatenation below working
        # when ids/numbers were parsed as integers (e.g. after a CSV round-trip)
        label = row['variationId']
        lig_id = str(row['queryLigandId'])
        lig_chain = row['queryLigandChainId']
        lig_num = str(row['queryLigandNumber'])
        lig_res = {'resi': lig_num, 'chain': lig_chain}
        lig_label = lig_id + lig_num

        col = 'red'
        c_col = col + 'Carbon'
        viewer.addStyle(mod_res, {'stick':{'colorscheme': c_col, 'radius': 0.2}})
        viewer.addStyle(mod_res, {'sphere':{'color': col, 'opacity': 0.8}})

        if show_labels:
            viewer.addLabel(label, {'fontSize':12,'fontColor': 'black','backgroundColor':'ivory'}, mod_res)
            viewer.addLabel(lig_label, {'fontSize':10}, lig_res)

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': cutoff_distance}

        # residues surrounding mutation site
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        # interacting ligand style
        viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})

        viewer.zoomTo(surroundings)

        # print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "ligand:", lig_label, "mutation:", label)

        # print any specified additional columns from the dataframe; values are converted
        # to text so that numeric columns do not break the string concatenation
        for a in args:
            print(a + ": " + str(row[a]))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

In [14]:
# trailing semicolon suppresses the repr of the returned value; only the widget is shown
view_single_mutation(mt, cutoff_distance);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## Now run the next step
Map mutations occurring at protein-polymer interfaces: [5-MapToDrugInteractions.ipynb](5-MapToDrugInteractions.ipynb)