## Map Mutations to Protein-Ligand Interactions
Here we find and visualize the mutations at protein-ligand binding sites.

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter
from ipywidgets import interact, IntSlider
import py3Dmol
import pandas as pd

In [2]:
# Initialize Spark: create (or reuse) a session on the local master "local[4]"
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("4-MapLigandInteractions")
    .getOrCreate()
)

#### Input parameters

In [3]:
# File names: mutations mapped to 3D protein structures (input) and
# mutations mapped to protein-ligand binding sites (output)
input_file_name, output_file_name = 'mutations3d.csv', 'mutations3d_ligand.csv'

# Distance cutoff for finding and visualizing interactions
# (presumably in Angstroms -- confirm against InteractionFilter)
distance_cutoff = 8

### Read 'mutations3d.csv' file created in the previous step

In [4]:
pd.set_option('display.max_columns', None)  # show all columns

# PDB residue numbers must be strings to handle insertion codes. Parse the
# column as text while reading: letting pandas infer a numeric dtype first and
# casting afterwards would turn e.g. 83 into '83.0' whenever the column
# contains a missing value, and such a key can never match a residue number.
df = pd.read_csv(input_file_name, dtype={'pdbPosition': str})
df.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid,seqIdentity,tax_id,scientific_name,structureChainId,pdbResNum,pdbSeqNum,uniprotId,uniprotNum,ID,CHROM,POS,REF,ALT,annotation,color,var_id
0,32437406,432.18,C,7.6822e-158,215.0,215.0,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,1,4acs,4acs_C_1,1,218,hgvs-grch37,"[Row(pdbAminoAcid='G', pdbPosition=83, queryAm...",4,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,4,434735,221,2018-05-03,chr6:g.52619766C>T,4ACS,83,G,98.623853,9606,Homo sapiens,4ACS.C,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,chr6:g.52619766C>T
1,32437406,432.18,C,7.6822e-158,215.0,215.0,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,1,4acs,4acs_C_1,1,218,hgvs-grch37,"[Row(pdbAminoAcid='G', pdbPosition=83, queryAm...",4,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,4,434735,221,2018-05-03,chr6:g.52619766C>T,4ACS,83,G,98.623853,9606,Homo sapiens,4ACS.C,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,chr6:g.52619766C>T
2,32437407,432.18,A,7.6822e-158,215.0,215.0,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,1,4acs,4acs_A_1,1,218,hgvs-grch37,"[Row(pdbAminoAcid='G', pdbPosition=83, queryAm...",4,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,4,434735,221,2018-05-03,chr6:g.52619766C>T,4ACS,83,G,98.623853,9606,Homo sapiens,4ACS.A,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,chr6:g.52619766C>T
3,32437407,432.18,A,7.6822e-158,215.0,215.0,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,1,4acs,4acs_A_1,1,218,hgvs-grch37,"[Row(pdbAminoAcid='G', pdbPosition=83, queryAm...",4,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,4,434735,221,2018-05-03,chr6:g.52619766C>T,4ACS,83,G,98.623853,9606,Homo sapiens,4ACS.A,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,chr6:g.52619766C>T
4,32437408,429.483,D,6.700580000000001e-157,214.0,214.0,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,1,4acs,4acs_D_1,1,217,hgvs-grch37,"[Row(pdbAminoAcid='G', pdbPosition=83, queryAm...",4,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,4,434735,220,2018-05-03,chr6:g.52619766C>T,4ACS,83,G,98.617512,9606,Homo sapiens,4ACS.D,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,chr6:g.52619766C>T


## Create a list of unique PDB IDs

In [5]:
# Keep each structure ID once, in order of first appearance
pdb_ids = df['structureId'].drop_duplicates().tolist()
print("PDB Ids:", pdb_ids)

PDB Ids: ['4ACS', '2WJU', '2VCT', '2FO0', '1OPL', '5MO4', '2F4J', '2G1T', '2G2F', '2GQG', '2HIW', '3QRI', '3UE4', '2G2H', '4ZOG', '2HZI', '3QRK', '4WA9', '4TWP', '2V7A', '1DXT', '1YZI', '1YHR', '1YHE', '1YH9', '1YDZ', '1Y8W', '1Y0D', '1Y0C', '1Y0A', '1Y09', '1XZV', '1XZU', '1XZ7', '1XZ5', '1XZ4', '1XZ2', '1XYE', '1XY0', '1XXT', '4XS0', '2W6V', '1VWT', '1QXD', '3KMF', '6FQF']


## Find all polymer-ligand interactions

In [6]:
# Download the structures and keep only entries with exactly one model
# (currently, only structures with 1 model are supported)
structures = (
    mmtfReader.download_mmtf_files(pdb_ids)
    .filter(lambda entry: entry[1].num_models == 1)
)

In [7]:
# Interaction criteria: contacts within the distance cutoff, excluding water
# (HOH/DOD) as the interacting query group
interaction_filter = InteractionFilter(distanceCutoff=distance_cutoff)
interaction_filter.set_query_groups(False, ["HOH", "DOD"])

# Extract ligand-polymer interactions at the group level and convert the
# result to a pandas DataFrame
extractor = InteractionExtractor()
interactions = extractor.get_ligand_polymer_interactions(
    structures,
    interaction_filter,
    level='group',
).toPandas()
interactions.head()

Unnamed: 0,structureChainId,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,1XZ4.A,HEM,A,142,ARG,A,92,91,MLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTAFPHF...
1,1XZ4.A,HEM,A,142,HIS,A,45,44,MLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTAFPHF...
2,1XZ4.D,HEM,D,147,GLY,D,107,106,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...
3,1XZ4.D,HEM,D,147,LYS,D,65,64,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...
4,1XZ4.C,HEM,C,142,ALA,C,63,62,MLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTAFPHF...


## Filter mutations by joining with the interaction data

In [8]:
# Inner join: keep only mutations whose chain and residue number match a
# ligand-interacting residue
mt = pd.merge(
    df,
    interactions,
    how='inner',
    left_on=['structureChainId', 'pdbPosition'],
    right_on=['structureChainId', 'targetGroupNumber'],
)
mt.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid,seqIdentity,tax_id,scientific_name,structureChainId,pdbResNum,pdbSeqNum,uniprotId,uniprotNum,ID,CHROM,POS,REF,ALT,annotation,color,var_id,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,17063512,993.416,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,2fo0_A_1,1,466,hgvs-grch37,"[Row(pdbAminoAcid='Y', pdbPosition=272, queryA...",65,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,46,488093,511,2017-10-25,chr9:g.133738358A>T,2FO0,272,Y,99.785408,9606,Homo sapiens,2FO0.A,272,230,P00519,253,rs121913460,9,133738358,A,T,ABL1 missense mutation,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
1,18504755,994.186,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,2fo0_A_1,1,466,hgvs-grch37,"[Row(pdbAminoAcid='Y', pdbPosition=272, queryA...",65,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,65,529412,530,2017-10-25,chr9:g.133738358A>T,2FO0,272,Y,99.785408,9606,Homo sapiens,2FO0.A,272,230,P00519,253,rs121913460,9,133738358,A,T,ABL1 missense mutation,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
2,17063512,993.416,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,2fo0_A_1,1,466,hgvs-grch37,"[Row(pdbAminoAcid='Y', pdbPosition=272, queryA...",65,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,46,488093,511,2017-10-25,chr9:g.133738358A>T,2FO0,272,Y,99.785408,9606,Homo sapiens,2FO0.A,272,230,P00519,253,rs121913460,9,133738358,A,T,ABL1 missense mutation,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
3,18504755,994.186,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,2fo0_A_1,1,466,hgvs-grch37,"[Row(pdbAminoAcid='Y', pdbPosition=272, queryA...",65,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,65,529412,530,2017-10-25,chr9:g.133738358A>T,2FO0,272,Y,99.785408,9606,Homo sapiens,2FO0.A,272,230,P00519,253,rs121913460,9,133738358,A,T,ABL1 missense mutation,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
4,17063513,961.444,A,0.0,450.0,451.0,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,1,1opl,1opl_A_1,1,451,hgvs-grch37,"[Row(pdbAminoAcid='Y', pdbPosition=272, queryA...",81,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,62,488093,512,2017-10-25,chr9:g.133738358A>T,1OPL,272,Y,99.778271,9606,Homo sapiens,1OPL.A,272,272,P00519,253,rs121913460,9,133738358,A,T,ABL1 missense mutation,green,chr9:g.133738358A>T,P16,A,539,TYR,A,272,271,MGQQPGKVLGDQRRPSLPALHFIKGAGKRDSSRHGGPHCNVFVEHE...


## Save protein-ligand mapping

In [9]:
# Write the joined table without the DataFrame index
mt.to_csv(path_or_buf=output_file_name, index=False)

## View mutations grouped by protein chain
Use the slider to view each protein chain. Turn labels off for an unobstructed view of the mutations. Interacting ligands are rendered as spheres with green carbon atoms.

In [10]:
# One group per '<pdbId>.<chainId>' key
chains = mt.groupby(by='structureChainId')
print("Chains:", [chain_key for chain_key in chains.groups])

Chains: ['1OPL.A', '1QXD.B', '1QXD.D', '1Y8W.D', '1YZI.B', '2F4J.A', '2FO0.A', '2G1T.A', '2G1T.B', '2G1T.C', '2G1T.D', '2G2F.A', '2G2H.A', '2G2H.B', '2GQG.A', '2GQG.B', '2HIW.A', '2HIW.B', '2HZI.A', '2V7A.A', '2V7A.B', '2W6V.B', '2W6V.D', '3KMF.C', '3QRI.A', '3QRK.A', '3UE4.A', '4TWP.B', '4WA9.B', '4XS0.B', '4ZOG.A', '5MO4.A', '6FQF.D']


In [22]:
# View mutations per protein chain
def view_grouped_mutations(grouped_df, *args):
    """Interactively view all mutations of one protein chain at a time in 3D.

    Parameters
    ----------
    grouped_df : pandas GroupBy object
        Rows grouped by a key of the form ``'<pdbId>.<chainId>'``. Every row
        must provide the columns 'pdbPosition', 'queryGroupId',
        'queryChainId', 'queryGroupNumber', 'color', 'annotation' and
        'variationId'.
    *args : str
        Names of additional columns to print below the header. The value is
        taken from the last row of the displayed group.

    Returns
    -------
    Whatever ``interact`` returns for the nested ``view3d`` callback.

    Side effects
    ------------
    Each redraw rebinds the module-level name ``viewer`` to the newly created
    py3Dmol view so that ``view_image()`` can snapshot it.
    """
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly, show_surface, show_labels, show_annotations, size, font, i):
        """Render the i-th chain group; called by the widgets on every change."""
        group = grouped_df.get_group(chainIds[i])

        pdb_id, chain_id = chainIds[i].split('.')
        global viewer
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity': 0.9}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id}, {'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick': {'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH', 'DOD']}, {})

        # The table can contain repeated rows for the same residue/ligand pair;
        # remember what has been labelled so identical labels are not stacked.
        labelled = set()

        for j in range(group.shape[0]):
            row = group.iloc[j]  # fetch the row once instead of once per column

            # interacting residue info
            res_num = str(row['pdbPosition'])
            mod_res = {'resi': res_num, 'chain': chain_id}

            # interacting ligand info (numbers are coerced to text so that
            # numeric columns, e.g. from a re-read CSV, also work)
            lig_id = str(row['queryGroupId'])
            lig_chain = row['queryChainId']
            lig_num = str(row['queryGroupNumber'])
            lig_res = {'resi': lig_num, 'chain': lig_chain}

            col = row['color']
            c_col = col + 'Carbon'
            viewer.addStyle(mod_res, {'stick': {'colorscheme': c_col, 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere': {'color': col, 'opacity': 0.6}})
            viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

            label = ""
            if show_labels:
                label = label + str(row['variationId']) + " "
            if show_annotations:
                label = label + str(row['annotation'])
            if show_labels or show_annotations:
                res_key = ('residue', res_num, label)
                if res_key not in labelled:
                    labelled.add(res_key)
                    viewer.addLabel(label, {'fontSize': font, 'fontColor': 'black', 'backgroundColor': 'ivory'}, mod_res)
                lig_key = ('ligand', lig_id, lig_chain, lig_num)
                if lig_key not in labelled:
                    labelled.add(lig_key)
                    viewer.addLabel(lig_id + lig_num, {'fontSize': font}, lig_res)

        viewer.zoomTo({'chain': chain_id})
        viewer.center({'chain': chain_id})

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity': 0.8, 'color': 'lightblue'}, {'chain': chain_id})

        # The header and the extra columns describe the last row of the group
        # (made explicit here instead of relying on leaked loop variables).
        last_row = group.iloc[-1]
        annotation = last_row['annotation']

        # print header
        print("PDB Id: ", pdb_id, "chain Id:", chain_id, "annotation", annotation)

        # print any specified additional columns from the dataframe;
        # str() keeps numeric columns from raising a TypeError
        for a in args:
            print(a + ": " + str(last_row[a]))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, show_annotations=False, size=750, font=8, i=s_widget)

def view_image():
    """Return a PNG rendering of the module-level ``viewer``.

    ``viewer`` is the global that the interactive ``view3d`` callbacks rebind
    on every redraw, so an interactive view must have been drawn first.
    """
    snapshot = viewer.png()
    return snapshot

In [23]:
# Launch the per-chain viewer; the trailing semicolon suppresses the cell's return-value output
view_grouped_mutations(chains);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

### View static image
Shown below is a static image of the 3D visualization above. To save the image, right-click on the image and choose "Copy Image" or "Save Image As..."

In [24]:
# Static PNG of the global `viewer`, i.e. whatever the interactive view last drew
view_image()

## View one mutation at a time
Use the slider to view each mutation. Surrounding residues within the `distance_cutoff` are rendered as orange sticks. Interacting ligands are rendered as spheres with green carbon atoms.

In [25]:
# Setup viewer
def view_single_mutation(df, distance_cutoff, *args):
    """Interactively view one mutation (one row of ``df``) at a time in 3D.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation/ligand pair. Every row must provide the columns
        'structureChainId' (``'<pdbId>.<chainId>'``), 'pdbPosition',
        'queryGroupId', 'queryChainId', 'queryGroupNumber', 'color',
        'annotation' and 'variationId'.
    distance_cutoff : number
        Distance around the mutated residue used to select the surrounding
        residues and water molecules that are highlighted.
    *args : str
        Names of additional columns of ``df`` to print below the header for
        the displayed row.

    Returns
    -------
    Whatever ``interact`` returns for the nested ``view3d`` callback.

    Side effects
    ------------
    Each redraw rebinds the module-level name ``viewer`` to the newly created
    py3Dmol view so that ``view_image()`` can snapshot it.
    """

    def view3d(show_bio_assembly, show_surface, show_labels, show_annotations, size, font, i):
        """Render the i-th row of ``df``; called by the widgets on every change."""
        row = df.iloc[i]  # fetch the row once instead of once per column
        pdb_id, chain_id = row['structureChainId'].split('.')

        global viewer
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity': 0.7}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id}, {'cartoon': {'color': 'blue', 'width': 0.6, 'opacity': 0.5}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick': {'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH', 'DOD']}, {})

        # interacting residue info
        res_num = str(row['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id}

        # interacting ligand info (numbers are coerced to text so that numeric
        # columns, e.g. from a re-read CSV, also work)
        lig_id = str(row['queryGroupId'])
        lig_chain = str(row['queryChainId'])
        lig_num = str(row['queryGroupNumber'])
        lig_res = {'resi': lig_num, 'chain': lig_chain}
        lig_label = lig_id + "-" + lig_chain + lig_num

        col = row['color']
        c_col = col + 'Carbon'
        viewer.addStyle(mod_res, {'stick': {'colorscheme': c_col, 'radius': 0.2}})
        viewer.addStyle(mod_res, {'sphere': {'color': col, 'opacity': 0.8}})

        annotation = row['annotation']
        mutation = row['variationId']
        label = ""
        if show_labels:
            label = label + str(mutation) + " "
        if show_annotations:
            label = label + str(annotation)
        if show_labels or show_annotations:
            viewer.addLabel(label, {'fontSize': font, 'fontColor': 'black', 'backgroundColor': 'ivory'}, mod_res)
            viewer.addLabel(lig_label, {'fontSize': font-2}, lig_res)

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': distance_cutoff}

        # residues surrounding mutation site
        viewer.addStyle(surroundings, {'stick': {'colorscheme': 'orangeCarbon', 'radius': 0.15}})

        # interacting ligand style
        viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

        # set style for interacting waters.
        # 'chain'/'resi' must not be combined with 'resn' here: 3Dmol would
        # first intersect the water names with the mutated residue (matching
        # nothing) and only then expand. 'within' instead keeps the waters
        # that lie inside the cutoff around the mutated residue.
        waters = {'resn': ['HOH', 'DOD'], 'within': {'distance': distance_cutoff, 'sel': mod_res}}
        viewer.addStyle(waters, {'sphere': {'color': 'orange', 'radius': 0.5}})

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity': 0.8, 'color': 'lightblue'}, {'chain': chain_id})

        viewer.zoomTo(surroundings)
        viewer.center(mod_res)

        # print header
        print("PDB Id:", pdb_id, "chain Id:", chain_id, "residue:", res_num, "ligand:", lig_label, "mutation:", mutation, "annotation:", annotation)

        # print any specified additional columns from the dataframe;
        # str() keeps numeric columns from raising a TypeError
        for a in args:
            print(a + ": " + str(row[a]))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, show_annotations=False, size=750, font=12, i=s_widget)

def view_image():
    """Return a PNG rendering of the module-level ``viewer``.

    ``viewer`` is the global that the interactive ``view3d`` callbacks rebind
    on every redraw, so an interactive view must have been drawn first.
    """
    snapshot = viewer.png()
    return snapshot

In [26]:
# Keep one row per (chain, mutation, ligand) combination
unique_keys = ["structureChainId", "variationId", 'queryGroupId', 'queryGroupNumber']
mt_unique = mt.drop_duplicates(subset=unique_keys)

In [27]:
# Launch the per-mutation viewer; the trailing semicolon suppresses the cell's return-value output
view_single_mutation(mt_unique, distance_cutoff);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

### View static image
Shown below is a static image of the 3D visualization above. To save the image, right-click on the image and choose "Copy Image" or "Save Image As..."

In [28]:
# Static PNG of the global `viewer`, i.e. whatever the interactive view last drew
view_image()

In [29]:
# Shutdown Spark
# Stops the `spark` session created at the top of this notebook
spark.stop()

## Now run the next step
Map mutations occurring at protein-polymer interfaces: [5-MapToDrugInteractions.ipynb](5-MapToDrugInteractions.ipynb)