## Map Mutations to Protein-Drug Interactions
Here we find and visualize the mutations at protein-drug binding sites.

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.datasets import drugBankDataset, pdbjMineDataset
from ipywidgets import interact, IntSlider
import py3Dmol
import pandas as pd

In [2]:
# Start (or reuse) a Spark session running locally with 4 worker threads
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("5-MapToDrugInteractions")
    .getOrCreate()
)

#### Input parameters

In [3]:
# Tunable inputs for this notebook; the cells below read these names.
distance_cutoff = 8  # distance cutoff for visualizing interactions; passed to the viewer as the 'expand' radius (presumably Angstrom - TODO confirm)
mw_min = 250  # minimum molecular weight for drug molecules; compared against chem_comp.formula_weight in the ligand query
input_file_name = 'mutations3d_ligand.csv'  # mutations mapped to protein-ligand binding sites
output_file_name = 'mutations3d_drug.csv'  # mutations mapped to protein-drug binding sites

## Read 'mutations3d_ligand.csv' file created in the previous step

In [4]:
# Load the ligand-mapped mutations written by the previous step
df = pd.read_csv(input_file_name)

# PDB residue numbers must be strings to handle insertion codes, so both the
# protein residue number and the ligand group number are cast in one pass.
df = df.astype({'pdbPosition': 'str', 'queryGroupNumber': 'str'})
df.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,...,color,var_id,queryGroupId,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,17063512,993.416,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,...,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
1,18504755,994.186,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,...,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
2,17063512,993.416,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,...,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
3,18504755,994.186,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,...,green,chr9:g.133738358A>T,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
4,17063513,961.444,A,0.0,450.0,451.0,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,1,1opl,...,green,chr9:g.133738358A>T,P16,A,539,TYR,A,272,271,MGQQPGKVLGDQRRPSLPALHFIKGAGKRDSSRHGGPHCNVFVEHE...


## Get InChiKey for ligands in PDB with molecular weight >= 250

In [5]:
# Build the SQL query sent to PDBj Mine: ligand id, formula weight and InChIKey
# descriptor for every ligand whose formula_weight is at least mw_min.
# NOTE(review): chem_comp is read from the 'pdbj' schema while the descriptor table
# is read from the 'cc' schema - confirm that this schema mix is intended.
ccQuery = "SELECT c.id as ligand_id, c.formula_weight, d.descriptor as inchi_key FROM pdbj.chem_comp c \
           JOIN cc.pdbx_chem_comp_descriptor d ON d.comp_id = c.id \
           WHERE d.type = 'InChIKey' AND c.formula_weight >= " + str(mw_min)

# Run the query, drop duplicate rows, and collect the result as a pandas DataFrame
ligands = pdbjMineDataset.get_dataset(ccQuery).dropDuplicates().toPandas()
ligands.head()

Unnamed: 0,ligand_id,formula_weight,inchi_key
0,LF0,405.486,ZFERZAMPQIXCPM-QHCPKHFHSA-N
1,6RF,496.539,QPRMAEKTXODJGJ-INIZCTEOSA-N
2,CRT,596.925,VAZQBTJCYODOSV-RISZBRKMSA-N
3,PQQ,330.206,MMXZSJMASHPLLR-UHFFFAOYSA-N
4,2GB,393.328,DATQTWKKBHKLSV-INIZCTEOSA-N


## Join dataset on ligand id to add InchiKeys

In [6]:
# Attach formula weight and InChIKey to every row by matching on the ligand id.
# The inner join keeps only ligands returned by the molecular-weight query.
df = (
    df.merge(ligands, how='inner', left_on='queryGroupId', right_on='ligand_id')
      .drop_duplicates()
)
df.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,...,queryChainId,queryGroupNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence,ligand_id,formula_weight,inchi_key
0,17063512,993.416,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,...,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...,P16,427.283,ZIQFYVPVJZEOFS-UHFFFAOYSA-N
1,18504755,994.186,A,0.0,465.0,466.0,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,ARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVL...,1,2fo0,...,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...,P16,427.283,ZIQFYVPVJZEOFS-UHFFFAOYSA-N
4,17063513,961.444,A,0.0,450.0,451.0,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,1,1opl,...,A,539,TYR,A,272,271,MGQQPGKVLGDQRRPSLPALHFIKGAGKRDSSRHGGPHCNVFVEHE...,P16,427.283,ZIQFYVPVJZEOFS-UHFFFAOYSA-N
5,18504756,962.214,A,0.0,450.0,451.0,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,DPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNG...,1,1opl,...,A,539,TYR,A,272,271,MGQQPGKVLGDQRRPSLPALHFIKGAGKRDSSRHGGPHCNVFVEHE...,P16,427.283,ZIQFYVPVJZEOFS-UHFFFAOYSA-N
8,17063549,582.408,B,0.0,271.0,271.0,DKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTM...,DKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTM...,1,2g2h,...,B,532,TYR,B,272,27,GHMSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVK...,P16,427.283,ZIQFYVPVJZEOFS-UHFFFAOYSA-N


## Download open DrugBank dataset

For this demo we use the open DrugBank dataset. One disadvantage of the open DrugBank dataset is that it contains not only approved drugs, but also many other compounds in pharmaceutical use, such as ethanol, ATP, etc.

In [7]:
# Fetch the open DrugBank links table, keep only entries that have both an
# InChIKey and a CAS number, and collect the result as a pandas DataFrame.
drugs = (
    drugBankDataset.get_open_drug_links()
    .filter("StandardInChIKey IS NOT NULL")
    .filter("CAS IS NOT NULL")
    .toPandas()
)

The password-protected DrugBank datasets contain more information (e.g., approval status). To use these datasets, you need to create a free DrugBank account and supply your username and password.

For this demo, we continue with the open drug bank dataset.

[Create DrugBank account](https://www.drugbank.ca/public_users/sign_up)

In [8]:
# Optional: switch to the password-protected DrugBank dataset by uncommenting the lines below.
# Do not save real credentials in a shared notebook; read them at run time instead
# (for example with getpass.getpass()).
# username = "<your DrugBank account username>"
# password = "<your DrugBank account password>"
# drugs = drugBankDataset.get_drug_links("APPROVED", username,password) \
#                        .filter("StandardInChIKey IS NOT NULL") \
#                        .toPandas()

## Print some sample DrugBank data

In [9]:
# Preview the first rows of the DrugBank table loaded above
drugs.head()

Unnamed: 0,DrugBankID,AccessionNumbers,Commonname,CAS,UNII,Synonyms,StandardInChIKey
0,DB00006,BIOD00076 | BTD00076 | DB02351 | EXPT03302,Bivalirudin,128270-60-0,TN9BEX005G,Bivalirudin | Bivalirudina | Bivalirudinum | H...,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00014,BIOD00113 | BTD00113,Goserelin,65807-02-5,0F65R8P09N,Goserelin | Goserelina,BLCLNMBMMGCOAS-URPVMXJPSA-N
2,DB00027,BIOD00036 | BTD00036,Gramicidin D,1405-97-6,5IE62321P4,Bacillus brevis gramicidin D | Gramicidin | Gr...,NDAYQJDHGXTBJL-MWWSRJDJSA-N
3,DB00035,BIOD00061 | BIOD00112 | BTD00061 | BTD00112,Desmopressin,16679-58-6,ENR1LLB0FP,1-(3-mercaptopropionic acid)-8-D-arginine-vaso...,NFLWUMRGJYTJIN-NXBWRCJVSA-N
4,DB00050,APRD00686 | BIOD00115 | BTD00115,Cetrorelix,120287-85-6,OON1HFZ4BA,Cetrorelix | Cetrorelixum,SBNPWPIBESPSIF-MHWMIDJBSA-N


In [10]:
# Keep only rows whose ligand InChIKey matches a DrugBank entry (inner join)
df = df.merge(drugs, how='inner', left_on='inchi_key', right_on='StandardInChIKey')
df.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,...,ligand_id,formula_weight,inchi_key,DrugBankID,AccessionNumbers,Commonname,CAS,UNII,Synonyms,StandardInChIKey
0,17063515,671.003,A,0.0,315.0,316.0,NLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQG...,NLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQG...,1,5mo4,...,NIL,529.516,HHZIURLSWUIHRB-UHFFFAOYSA-N,DB04868,,Nilotinib,641571-10-0,F41401512X,Nilotinib | Nilotinibum,HHZIURLSWUIHRB-UHFFFAOYSA-N
1,18504758,671.389,A,0.0,315.0,316.0,NLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQG...,NLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQG...,1,5mo4,...,NIL,529.516,HHZIURLSWUIHRB-UHFFFAOYSA-N,DB04868,,Nilotinib,641571-10-0,F41401512X,Nilotinib | Nilotinibum,HHZIURLSWUIHRB-UHFFFAOYSA-N
2,17063535,587.03,A,0.0,272.0,272.0,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...,7,2gqg,...,1N1,488.006,ZBNZXTGUTAYRHI-UHFFFAOYSA-N,DB01254,,Dasatinib,302962-49-8,X78UG0A0RN,anh. dasatinib | Anhydrous dasatinib | BMS das...,ZBNZXTGUTAYRHI-UHFFFAOYSA-N
3,18504778,588.186,A,0.0,272.0,272.0,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...,7,2gqg,...,1N1,488.006,ZBNZXTGUTAYRHI-UHFFFAOYSA-N,DB01254,,Dasatinib,302962-49-8,X78UG0A0RN,anh. dasatinib | Anhydrous dasatinib | BMS das...,ZBNZXTGUTAYRHI-UHFFFAOYSA-N
4,17063537,585.874,B,0.0,271.0,271.0,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...,SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLK...,7,2gqg,...,1N1,488.006,ZBNZXTGUTAYRHI-UHFFFAOYSA-N,DB01254,,Dasatinib,302962-49-8,X78UG0A0RN,anh. dasatinib | Anhydrous dasatinib | BMS das...,ZBNZXTGUTAYRHI-UHFFFAOYSA-N


### Keep only unique ligands per structure
Here we drop rows with the same structureId and ligandId.

## Save protein-drug mapping

In [11]:
# Write the drug-mapped mutations to output_file_name, omitting the pandas row index
df.to_csv(output_file_name, index=False)

## View mutations grouped by protein chain
Use the slider to view each protein chain. Turn labels off for an unobstructed view of the mutations. Interacting ligands are rendered as spheres with green carbon atoms.

In [12]:
# Group rows by their "<pdbId>.<chainId>" key so each chain can be viewed separately
chains = df.groupby('structureChainId')
print("Chains:", list(chains.groups))

Chains: ['2GQG.A', '2GQG.B', '2V7A.A', '2V7A.B', '3QRI.A', '3UE4.A', '4TWP.B', '4WA9.B', '5MO4.A']


## View mutations grouped by protein chain

In [13]:
# Setup viewer
def view_grouped_mutations(grouped_df, *args):
    """Interactively display all mutations of one protein chain at a time.

    A slider selects the chain; checkboxes toggle the biological assembly,
    the molecular surface, mutation labels and annotation labels.

    Parameters
    ----------
    grouped_df : pandas GroupBy
        Rows grouped by a "<pdbId>.<chainId>" key. Every group must provide
        the columns 'pdbPosition', 'queryChainId', 'queryGroupNumber',
        'Commonname', 'color', 'annotation' and 'variationId'.
    *args : str
        Names of additional columns to print below the header. Values are
        taken from the last row of the selected group.

    Returns
    -------
    The result of ``ipywidgets.interact`` for the inner ``view3d`` callback.
    """
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly, show_surface, show_labels, show_annotations, size, font, i):
        # NOTE: `size` is exposed as a widget value but is not used by the viewer below.
        group = grouped_df.get_group(chainIds[i])
        pdb_id, chain_id = chainIds[i].split('.')

        # Kept in a module-level name so that view_image() can export the latest view.
        global viewer
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH','DOD']}, {})

        for _, row in group.iterrows():
            # interacting residue info
            res_num = str(row['pdbPosition'])
            mod_res = {'resi': res_num, 'chain': chain_id}

            # interacting ligand info; the drug name must come from the row being
            # drawn, not from the global frame indexed by the chain slider position
            lig_chain = row['queryChainId']
            lig_num = row['queryGroupNumber']
            lig_name = row['Commonname']
            lig_res = {'resi': lig_num, 'chain': lig_chain}

            col = row['color']
            c_col = col + 'Carbon'
            viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})
            viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

            annotation = row['annotation']
            mutation = row['variationId']
            label = ""
            if show_labels:
                label = label + mutation + " "
            if show_annotations:
                label = label + annotation
            if show_labels or show_annotations:
                viewer.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory'}, mod_res)
                viewer.addLabel(lig_name, {'fontSize':font}, lig_res)

        viewer.zoomTo({'chain': chain_id})
        viewer.center({'chain': chain_id})

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        # The header and the extra columns describe the last row of the group;
        # read it explicitly instead of relying on leaked loop variables.
        last_row = group.iloc[-1]

        #print header
        print("PDB Id:", pdb_id, "chain Id:", chain_id, "annotation:", last_row['annotation'])

        # print any specified additional columns from the dataframe
        # (str() so that numeric columns can be printed as well)
        for a in args:
            print(a + ": " + str(last_row[a]))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, show_annotations=False, size=750, font=8, i=s_widget)

def view_image():
    """Return a static PNG rendering of the most recently created viewer.

    Reads the module-level ``viewer`` set by the interactive view callbacks,
    so one of the interactive views must have been displayed first.
    """
    snapshot = viewer.png()
    return snapshot

In [20]:
# Launch the per-chain viewer; the trailing ';' suppresses the cell's return-value output
view_grouped_mutations(chains);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

### View static image
Shown below is a static image of the 3D visualization above. To save the image, right-click on the image and choose "Copy Image" or "Save Image As..."

In [21]:
# Static snapshot of the viewer created by the interactive cell above
view_image()

## View one mutation at a time
Use the slider to view each mutation. Surrounding residues within the `distance_cutoff` are rendered as orange sticks. Interacting ligands are rendered as spheres with green carbon atoms.

In [22]:
# Setup viewer
def view_single_mutation(df, distance_cutoff, *args):
    """Interactively display one mutation (one dataframe row) at a time.

    A slider selects the row; checkboxes toggle the biological assembly,
    the molecular surface, mutation labels and annotation labels.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation/ligand interaction. Must provide the columns
        'structureChainId' ("<pdbId>.<chainId>"), 'pdbPosition',
        'queryGroupId', 'queryChainId', 'queryGroupNumber', 'Commonname',
        'color', 'annotation' and 'variationId'.
    distance_cutoff : number
        Passed as the 'expand' value of the selection around the mutated
        residue; whole residues within that selection are drawn as sticks.
    *args : str
        Names of additional columns to print below the header.

    Returns
    -------
    The result of ``ipywidgets.interact`` for the inner ``view3d`` callback.
    """

    def view3d(show_bio_assembly, show_surface, show_labels, show_annotations, size, font, i):
        # NOTE: `size` is exposed as a widget value but is not used by the viewer below.
        row = df.iloc[i]  # look the selected row up once instead of per field
        pdb_id, chain_id = row['structureChainId'].split('.')

        # Kept in a module-level name so that view_image() can export the latest view.
        global viewer
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.7}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'width': 0.6, 'opacity':0.5}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH','DOD']}, {})

        # interacting residue info
        res_num = str(row['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id}

        # interacting ligand info
        lig_id = row['queryGroupId']
        lig_chain = row['queryChainId']
        lig_num = row['queryGroupNumber']
        lig_name = row['Commonname']
        lig_res = {'resi': lig_num, 'chain': lig_chain}
        lig_label = lig_id + "-" + lig_chain + lig_num + " " + lig_name

        col = row['color']
        viewer.addStyle(mod_res, {'sphere':{'color': col, 'opacity': 0.8}})  # TODO opacity has no effect

        annotation = row['annotation']
        mutation = row['variationId']
        label = ""
        if show_labels:
            label = label + mutation + " "
        if show_annotations:
            label = label + annotation
        if show_labels or show_annotations:
            viewer.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory'}, mod_res)
            viewer.addLabel(lig_label, {'fontSize':font-2}, lig_res)

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': distance_cutoff}

        # residues surrounding mutation site
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        # interacting ligand style
        viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})

        viewer.zoomTo(surroundings)

        # print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "ligand:", lig_id + "-" + lig_chain + lig_num, "mutation:", mutation, "annotation:", annotation)

        # print any specified additional columns from the dataframe
        # (str() so that numeric columns can be printed as well)
        for a in args:
            print(a + ": " + str(row[a]))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, show_annotations=False, size=750, font=12, i=s_widget)

# NOTE(review): this repeats the identical view_image definition from an earlier cell;
# the later definition silently replaces the earlier one.
def view_image():
    """Return a static PNG rendering of the most recently created viewer.

    Reads the module-level ``viewer`` set by the interactive view callbacks,
    so one of the interactive views must have been displayed first.
    """
    snapshot = viewer.png()
    return snapshot

In [23]:
# Launch the per-mutation viewer and also print the DrugBank id and drug name for each row;
# the trailing ';' suppresses the cell's return-value output
view_single_mutation(df, distance_cutoff, 'DrugBankID', 'Commonname');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

### View static image
Shown below is a static image of the 3D visualization above. To save the image, right-click on the image and choose "Copy Image" or "Save Image As..."

In [24]:
# Static snapshot of the viewer created by the interactive cell above
view_image()

In [25]:
# Shutdown Spark: stop the session created at the top of the notebook
spark.stop()