## Map Mutations to Protein-Drug Interactions
Here we find and visualize the mutations at protein-drug binding sites

In [1]:
from mmtfPyspark.datasets import drugBankDataset, customReportService, pdbjMineDataset
from ipywidgets import interact, IntSlider
import py3Dmol
import pandas as pd

#### Input parameters

In [2]:
# Tunable inputs for this notebook.
cutoff_distance = 8  # distance cutoff passed to the viewers below (presumably Angstroms -- TODO confirm units)
mw_min = 250  # minimum molecular weight for drug molecules; compared against formula_weight in the ligand query
input_file_name = 'mutations3d_ligand.csv'  # input CSV: mutations mapped to 3D protein structures
output_file_name = 'mutations3d_drug.csv'  # output CSV: mutations mapped to protein-ligand interactions

## Read the 'mutations3d_ligand.csv' file created in the previous step

In [3]:
# Load the mutation table and cast both residue-number columns to strings in
# one step. PDB residue numbers must be strings to handle insertion codes;
# the ligand number is converted to a string as well.
df = pd.read_csv(input_file_name).astype({'pdbPosition': 'str', 'queryLigandNumber': 'str'})
df.head()

Unnamed: 0,structureId,chainId,pdbPosition,pdbAminoAcid,refGenome,variationId,pdbid,chain,tax_id,scientific_name,structureChainId,queryLigandId,queryLigandChainId,queryLigandNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence
0,3KMF,C,308,N,hgvs-grch37,chr11:g.5246945G>T,3KMF,C,9606,Homo sapiens,3KMF.C,HEM,C,347,ASN,C,308,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...
1,2FO0,A,272,Y,hgvs-grch37,chr9:g.133738358A>T,2FO0,A,9606,Homo sapiens,2FO0.A,P16,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...
2,4WA9,B,253,Y,hgvs-grch37,chr9:g.133738358A>T,4WA9,B,9606,Homo sapiens,4WA9.B,AXI,B,9000,TYR,B,253,26,GSSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKT...
3,2G1T,C,253,Y,hgvs-grch37,chr9:g.133738358A>T,2G1T,C,9606,Homo sapiens,2G1T.C,MG,C,1701,TYR,C,253,27,GHMSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVK...
4,2G1T,C,253,Y,hgvs-grch37,chr9:g.133738358A>T,2G1T,C,9606,Homo sapiens,2G1T.C,112,G,1301,TYR,C,253,27,GHMSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVK...


## Get InChIKeys for ligands in PDB with molecular weight >= 250

In [4]:
# SQL passed to pdbjMineDataset: select ligand id, formula weight and InChIKey
# descriptor for every chemical component whose formula weight is >= mw_min.
ccQuery = "SELECT c.id as ligand_id, c.formula_weight, d.descriptor as inchi_key FROM pdbj.chem_comp c \
           JOIN cc.pdbx_chem_comp_descriptor d ON d.comp_id = c.id \
           WHERE d.type = 'InChIKey' AND c.formula_weight >= " + str(mw_min)

# Run the query, drop duplicate rows, and convert the result to a pandas DataFrame.
ligands = pdbjMineDataset.get_dataset(ccQuery).dropDuplicates().toPandas()
ligands.head()

Unnamed: 0,ligand_id,formula_weight,inchi_key
0,PQQ,330.206,MMXZSJMASHPLLR-UHFFFAOYSA-N
1,PY4,334.262,VRMPGTOTVVJQMU-SNVBAGLBSA-N
2,CRT,596.925,VAZQBTJCYODOSV-RISZBRKMSA-N
3,VS4,621.79,PPIYQXGSPPWVLJ-CONSDPRKSA-N
4,107,449.506,MBXKBJLIESPLIK-UHFFFAOYSA-N


## Join dataset on ligand id to add InChIKeys

In [5]:
# Attach formula weight and InChIKey to each row. The inner join keeps only
# rows whose ligand id appears in the ligands table; exact duplicate rows
# are removed afterwards.
df = pd.merge(
    df,
    ligands,
    how='inner',
    left_on='queryLigandId',
    right_on='ligand_id',
).drop_duplicates()
df.head()

Unnamed: 0,structureId,chainId,pdbPosition,pdbAminoAcid,refGenome,variationId,pdbid,chain,tax_id,scientific_name,...,queryLigandChainId,queryLigandNumber,targetGroupId,targetChainId,targetGroupNumber,sequenceIndex,sequence,ligand_id,formula_weight,inchi_key
0,3KMF,C,308,N,hgvs-grch37,chr11:g.5246945G>T,3KMF,C,9606,Homo sapiens,...,C,347,ASN,C,308,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,HEM,616.487,KABFMIBPWCXCRK-RGGAHWMASA-L
1,1YZI,B,108,N,hgvs-grch37,chr11:g.5246945G>T,1YZI,B,9606,Homo sapiens,...,B,148,ASN,B,108,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,HEM,616.487,KABFMIBPWCXCRK-RGGAHWMASA-L
2,1QXD,D,108,N,hgvs-grch37,chr11:g.5246945G>T,1QXD,D,9606,Homo sapiens,...,D,148,ASN,D,108,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,HEM,616.487,KABFMIBPWCXCRK-RGGAHWMASA-L
3,6FQF,D,108,N,hgvs-grch37,chr11:g.5246945G>T,6FQF,D,9606,Homo sapiens,...,D,201,ASN,D,108,107,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,HEM,616.487,KABFMIBPWCXCRK-RGGAHWMASA-L
4,2FO0,A,272,Y,hgvs-grch37,chr9:g.133738358A>T,2FO0,A,9606,Homo sapiens,...,A,2,TYR,A,272,229,MGQQPGKVLGDQRREPQGLSEAARWNSKENLLAGPSENDPNLFVAL...,P16,427.283,ZIQFYVPVJZEOFS-UHFFFAOYSA-N


## Download open DrugBank dataset

For this demo we use the open DrugBank dataset. One disadvantage of the open DrugBank dataset is that it contains not only approved drugs but also many other compounds in pharmaceutical use, such as ethanol, ATP, etc.

In [6]:
# Fetch the open DrugBank link table, keep only entries that have both an
# InChIKey and a CAS number, and convert the result to pandas.
drugs = (
    drugBankDataset.get_open_drug_links()
    .filter("StandardInChIKey IS NOT NULL")
    .filter("CAS IS NOT NULL")
    .toPandas()
)

The password-protected DrugBank datasets contain more information (e.g., approval status). To use these datasets, you need to create a free DrugBank account and supply your username and password.

For this demo, we continue with the open DrugBank dataset.

[Create DrugBank account](https://www.drugbank.ca/public_users/sign_up)

In [7]:
# username = "<your DrugBank account username>"
# password = "<your DrugBank account password>"
# drugs = drugBankDataset.get_drug_links("APPROVED", username,password) \
#                        .filter("StandardInChIKey IS NOT NULL") \
#                        .toPandas()

## Print some sample DrugBank data

In [8]:
# Preview the first rows of the DrugBank table.
drugs.head()

Unnamed: 0,DrugBankID,AccessionNumbers,Commonname,CAS,UNII,Synonyms,StandardInChIKey
0,DB00006,BIOD00076 | BTD00076 | DB02351 | EXPT03302,Bivalirudin,128270-60-0,TN9BEX005G,Bivalirudin | Bivalirudina | Bivalirudinum | H...,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00014,BIOD00113 | BTD00113,Goserelin,65807-02-5,0F65R8P09N,Goserelin | Goserelina,BLCLNMBMMGCOAS-URPVMXJPSA-N
2,DB00027,BIOD00036 | BTD00036,Gramicidin D,1405-97-6,5IE62321P4,Bacillus brevis gramicidin D | Gramicidin | Gr...,NDAYQJDHGXTBJL-MWWSRJDJSA-N
3,DB00035,BIOD00061 | BIOD00112 | BTD00061 | BTD00112,Desmopressin,16679-58-6,ENR1LLB0FP,1-(3-mercaptopropionic acid)-8-D-arginine-vaso...,NFLWUMRGJYTJIN-NXBWRCJVSA-N
4,DB00050,APRD00686 | BIOD00115 | BTD00115,Cetrorelix,120287-85-6,OON1HFZ4BA,Cetrorelix | Cetrorelixum,SBNPWPIBESPSIF-MHWMIDJBSA-N


In [9]:
# Keep only the ligands that are DrugBank entries by matching the ligand
# InChIKey against DrugBank's StandardInChIKey (inner join).
df = pd.merge(
    df,
    drugs,
    how='inner',
    left_on='inchi_key',
    right_on='StandardInChIKey',
)
df.head()

Unnamed: 0,structureId,chainId,pdbPosition,pdbAminoAcid,refGenome,variationId,pdbid,chain,tax_id,scientific_name,...,ligand_id,formula_weight,inchi_key,DrugBankID,AccessionNumbers,Commonname,CAS,UNII,Synonyms,StandardInChIKey
0,4WA9,B,253,Y,hgvs-grch37,chr9:g.133738358A>T,4WA9,B,9606,Homo sapiens,...,AXI,386.47,RITAVMQDGBJQJZ-FMIVXFBMSA-N,DB06626,,Axitinib,319460-85-0,C9LVQ0YUXG,Axitinib | Axitinibum,RITAVMQDGBJQJZ-FMIVXFBMSA-N
1,4TWP,B,253,Y,hgvs-grch37,chr9:g.133738358A>T,4TWP,B,9606,Homo sapiens,...,AXI,386.47,RITAVMQDGBJQJZ-FMIVXFBMSA-N,DB06626,,Axitinib,319460-85-0,C9LVQ0YUXG,Axitinib | Axitinibum,RITAVMQDGBJQJZ-FMIVXFBMSA-N
2,2V7A,A,253,Y,hgvs-grch37,chr9:g.133738358A>T,2V7A,A,9606,Homo sapiens,...,627,474.555,XKFTZKGMDDZMJI-HSZRJFAPSA-N,DB11778,,Danusertib,827318-97-8,M3X659D0FY,,XKFTZKGMDDZMJI-HSZRJFAPSA-N
3,3UE4,A,253,Y,hgvs-grch37,chr9:g.133738358A>T,3UE4,A,9606,Homo sapiens,...,DB8,530.446,UBPYILGKFZZVDX-UHFFFAOYSA-N,DB06616,,Bosutinib,380843-75-4,5018V4AEZ0,"4-((2,4-Dichloro-5-methoxyphenyl)amino)-6-meth...",UBPYILGKFZZVDX-UHFFFAOYSA-N
4,3QRI,A,253,Y,hgvs-grch37,chr9:g.133738358A>T,3QRI,A,9606,Homo sapiens,...,919,553.587,WVXNSAVVKYZVOE-UHFFFAOYSA-N,DB13005,,Rebastinib,1020172-07-9,75017Q6I97,,WVXNSAVVKYZVOE-UHFFFAOYSA-N


### Keep only unique ligands per structure
Here we drop rows with the same structureId and ligandId.

## Save protein-ligand mapping

In [10]:
# Write the joined table to output_file_name, omitting the pandas index column.
df.to_csv(output_file_name, index=False)

## View mutations grouped by protein chain
Use the slider to view each protein chain. Turn labels off for an unobstructed view of the mutations. Interacting ligands are rendered as spheres with green carbon atoms.

In [11]:
# Group the rows by structureChainId so each chain can be viewed on its own,
# then list the group keys.
chains = df.groupby(by='structureChainId')
print("Chains:", [chain_key for chain_key in chains.groups])

Chains: ['2GQG.B', '2V7A.A', '3QRI.A', '3UE4.A', '4TWP.B', '4WA9.B', '5MO4.A']


In [12]:
def view_grouped_mutations(grouped_df, cutoff_distance, *args):
    """Build an interactive py3Dmol viewer showing all mutations of one chain at a time.

    Parameters
    ----------
    grouped_df : pandas GroupBy object
        Rows grouped by a "<pdbId>.<chainId>" key. Each row must provide the
        columns 'pdbPosition', 'queryLigandChainId', 'queryLigandNumber',
        'Commonname' and 'variationId'.
    cutoff_distance : number
        Kept for signature parity with view_single_mutation; not used here.
    *args : str
        Names of additional columns; their distinct values within the
        displayed group are printed below the header.

    Returns
    -------
    The value returned by ``interact``, or None when ``grouped_df`` has no groups.
    """
    chainIds = list(grouped_df.groups.keys())

    # With no groups the slider would be created with max < min, so stop here.
    if not chainIds:
        print("No protein chains to display")
        return None

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        group = grouped_df.get_group(chainIds[i])

        pdb_id, chain_id = chainIds[i].split('.')
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity': 0.9}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id}, {'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick': {'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH', 'DOD']}, {})

        for _, row in group.iterrows():
            # mutated residue and the ligand it interacts with
            mod_res = {'resi': str(row['pdbPosition']), 'chain': chain_id}
            lig_res = {'resi': str(row['queryLigandNumber']), 'chain': row['queryLigandChainId']}

            viewer.addStyle(mod_res, {'stick': {'colorscheme': 'redCarbon', 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere': {'color': 'red', 'opacity': 0.6}})
            viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

            if show_labels:
                viewer.addLabel(row['variationId'],
                                {'fontSize': 9, 'fontColor': 'black', 'backgroundColor': 'ivory'},
                                mod_res)
                # The drug name must come from the current row of this group,
                # not from a module-level DataFrame indexed by the slider value.
                viewer.addLabel(str(row['Commonname']), {'fontSize': 9}, lig_res)

        viewer.zoomTo({'chain': chain_id})

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity': 0.8, 'color': 'lightblue'}, {'chain': chain_id})

        # print header
        print("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe; values are
        # de-duplicated in order of appearance and converted to text so that
        # numeric columns do not break the string concatenation
        for a in args:
            distinct_values = list(dict.fromkeys(str(value) for value in group[a]))
            print(a + ": " + ", ".join(distinct_values))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

Turn off scrolling in the viewer cell below

In [13]:
%%javascript 
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

In [14]:
# Launch the per-chain viewer; the trailing semicolon suppresses display of the call's return value.
view_grouped_mutations(chains, cutoff_distance);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Surrounding residues within the `cutoff_distance` are rendered as orange sticks. Interacting ligands are rendered as spheres with green carbon atoms.

In [15]:
def view_single_mutation(df, cutoff_distance, *args):
    """Build an interactive py3Dmol viewer showing one mutation (one row) at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation/ligand interaction. Each row must provide the
        columns 'structureChainId' (formatted "<pdbId>.<chainId>"),
        'pdbPosition', 'variationId', 'queryLigandId', 'queryLigandChainId',
        'queryLigandNumber' and 'Commonname'.
    cutoff_distance : number
        Distance used to expand the selection around the mutated residue;
        the selected residues are drawn as orange sticks and zoomed to.
    *args : str
        Names of additional columns whose value for the displayed row is
        printed below the header.

    Returns
    -------
    The value returned by ``interact``, or None when ``df`` has no rows.
    """
    # With no rows the slider would be created with max < min, so stop here.
    if len(df) == 0:
        print("No mutations to display")
        return None

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        row = df.iloc[i]
        pdb_id, chain_id = row['structureChainId'].split('.')

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity': 0.7}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id}, {'cartoon': {'color': 'blue', 'width': 0.6, 'opacity': 0.5}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick': {'radius': 0.3, 'singleBond': False}})

        # don't display water molecules
        viewer.setStyle({'resn': ['HOH', 'DOD']}, {})

        # interacting residue info
        res_num = str(row['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id}

        # interacting ligand info; identifiers are converted to text so that
        # numeric-looking values cannot break the string concatenations below
        label = row['variationId']
        lig_id = str(row['queryLigandId'])
        lig_chain = row['queryLigandChainId']
        lig_num = str(row['queryLigandNumber'])
        lig_name = str(row['Commonname'])
        lig_res = {'resi': lig_num, 'chain': lig_chain}
        lig_label = lig_id + "-" + str(lig_chain) + lig_num + " " + lig_name

        viewer.addStyle(mod_res, {'sphere': {'color': 'red', 'opacity': 0.8}})  # TODO opacity has no effect

        if show_labels:
            viewer.addLabel(label, {'fontSize': 12, 'fontColor': 'black', 'backgroundColor': 'ivory'}, mod_res)
            viewer.addLabel(lig_label, {'fontSize': 10}, lig_res)

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': cutoff_distance}

        # residues surrounding mutation site
        viewer.addStyle(surroundings, {'stick': {'colorscheme': 'orangeCarbon', 'radius': 0.15}})

        # interacting ligand style
        viewer.addStyle(lig_res, {'sphere': {'colorscheme': 'greenCarbon'}})

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity': 0.8, 'color': 'lightblue'}, {'chain': chain_id})

        viewer.zoomTo(surroundings)

        # print header
        print("PDB Id:", pdb_id, "chain Id:", chain_id, "residue:", res_num, "ligand:", lig_id + lig_num, "mutation:", label)

        # print any specified additional columns from the dataframe; str() keeps
        # numeric columns (e.g. a molecular weight) from raising a TypeError
        for a in args:
            print(a + ": " + str(row[a]))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

In [16]:
# Launch the per-mutation viewer and also print the DrugBankID and Commonname columns for each row;
# the trailing semicolon suppresses display of the call's return value.
view_single_mutation(df, cutoff_distance, 'DrugBankID', 'Commonname');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…