# Map Benign Mutations to 3D Structure

This notebook maps a dataset of 63,197 missense mutations with allele frequencies >=1% and <25% extracted from the ExAC database to 3D structures in the Protein Data Bank.
The dataset is described in:
    
[1] Niroula A, Vihinen M (2019) How good are pathogenicity predictors in detecting benign variants? 
PLoS Comput Biol 15(2): e1006481. doi: [10.1371/journal.pcbi.1006481](https://doi.org/10.1371/journal.pcbi.1006481)

In [1]:
# Disable Numba: temporary workaround for https://github.com/sbl-sdsc/mmtf-pyspark/issues/288
# NOTE(review): this cell runs before the mmtfPyspark import in the next cell;
# presumably the variable has to be set before Numba is first imported - confirm
# before moving or merging these cells.
import os
os.environ['NUMBA_DISABLE_JIT'] = "1"

In [2]:
from pyspark.sql import SparkSession
from mmtfPyspark.datasets import dbSnpDataset, pdbjMineDataset
from ipywidgets import interact, IntSlider
import pandas as pd
import py3Dmol

In [3]:
# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("BenignMutationsTo3DStructure")
    .getOrCreate()
)

In [4]:
# Enable Arrow-based columnar data transfers between Spark and Pandas dataframes
# NOTE(review): newer Spark releases reportedly rename this key to
# 'spark.sql.execution.arrow.pyspark.enabled' - confirm against the Spark version in use.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

## Read ExAC_AAS dataset [1]

In [5]:
# Read the VariBench ExAC_AAS spreadsheet; every column is loaded as a string.
df = pd.read_excel('http://structure.bmc.lu.se/VariBench/ExAC_AAS_20171214.xlsx', dtype=str, nrows=63198)

# Keep only rows whose RSID is a single rs id:
#   - it must start with 'rs'
#   - it must not contain ';'
# na=False treats a missing RSID as "no match" instead of producing a NaN mask,
# which would make the boolean indexing below fail; regex=False matches ';' literally.
has_rs_prefix = df.RSID.str.startswith('rs', na=False)
has_semicolon = df.RSID.str.contains(';', regex=False, na=False)

# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
df = df[has_rs_prefix & ~has_semicolon].copy()

# Integer rs id (RSID without the 'rs' prefix); used later as the join key.
df['rs_id'] = df.RSID.str[2:].astype('int')
df.head()

Unnamed: 0,Chromosome,Position,Reference_allele,Altered_allele,Reference_AA,Altered_AA,RSID,hg19_chr,hg19_pos.1.based.,AF_Adj,...,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,CADD_phred,rs_id
0,1,69428,T,G,F,C,rs140739101,1,69428,0.0246222622,...,T,-5.05,D,0.261,-0.9558,T,0.0007,T,23.7,140739101
1,1,69761,A,T,D,V,rs200505207,1,69761,0.1771387237,...,T,-2.22,N,0.111,-1.1242,T,0.0,T,0.684,200505207
2,1,930314,C,T,H,Y,rs9988179,1,865694,0.0381036473,...,.,-4.51,D,0.171,-1.084,T,0.0008,T,21.3,9988179
3,1,942951,C,T,P,L,rs148327885,1,878331,0.0183868339,...,.,-3.81,D,0.046,-1.042,T,0.0526,T,13.8,148327885
4,1,946538,G,A,S,L,rs35471880,1,881918,0.0470013621,...,T,-3.63,D,0.163,-1.1375,T,0.0123,T,24.1,35471880


Convert Pandas dataframe to Spark Dataframe

In [6]:
# Spark copy of the filtered ExAC table; it is joined with the dbSNP dataset further down
ds = spark.createDataFrame(df)

## Read file with dbSNP info
The following dataset was created from the NCBI dbSNP SNP3D_PDB_GRCH37 dataset by mapping non-synonymous SNPs to human proteins with >= 95% sequence identity in the PDB.

In [7]:
# dbSNP-to-PDB mapping as provided by mmtfPyspark's dbSnpDataset
# (its snp_id column is the join key used below)
dn = dbSnpDataset.get_cached_dataset()

## Find the intersection between the two dataframes

In [8]:
# Do not truncate wide tables when they are displayed
pd.set_option('display.max_columns', None)

# Inner join on the rs id, pull the result into pandas and order it by genomic location
dp = (
    dn.join(ds, dn.snp_id == ds.rs_id)
    .toPandas()
    .sort_values(['chr', 'pos'])
)
dp.head()

Unnamed: 0,chr,pos,snp_id,master_acc,master_gi,master_pos,master_res,master_var,pdb_gi,pdb_res,pdb_pos,blast_ident,clinsig,pdbChainId,tax_id,pdbResNum,uniprotId,uniprotNum,Chromosome,Position,Reference_allele,Altered_allele,Reference_AA,Altered_AA,RSID,hg19_chr,hg19_pos.1.based.,AF_Adj,AF_AFR,AF_AMR,AF_EAS,AF_FIN,AF_NFE,AF_OTH,AF_SAS,AF_MALE,AF_FEMALE,AF_CONS,PON-P2_score,PON-P2_pred,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,CADD_phred,rs_id
1780,1,949422,143888043,NP_005092,4826774,21,S,N,340780633,S,28,100.0,Benign,3SDL.C,9606,21,P05161,21.0,1,1014042,G,A,S,N,rs143888043,1,949422,0.0016525877,0.0168897603,0.0011257263,0.0,0.0,0.000137057,0.0011209506,6.18e-05,0.0010919627,0.0023602674,0.0,0.081,Neutral,0.732,T,0.0,B,0.0,B,0.541861,N,1.0,N,-0.59,N,-0.85,T,0.34,N,0.0085,-0.9246,T,0.1229,T,0.254,143888043
321,1,949608,1921,NP_005092,4826774,83,S,N,340780633,S,90,100.0,Benign,3SDL.C,9606,83,P05161,83.0,1,1014228,G,A,S,N,rs1921,1,949608,0.370246762,0.4111114339,0.2454099255,0.205113894,0.4757061975,0.4041889166,0.3777358071,0.3399131441,0.3734580725,0.3662007238,0.339171018,0.334,Unknown,0.311,T,0.01,B,0.005,B,0.000631,N,0.999999,P,1.6,L,1.59,T,-1.02,N,0.012,-0.9028,T,0.0,T,2.276,1921
322,1,949608,1921,NP_005092,4826774,83,S,T,340780633,S,90,100.0,Benign,3SDL.C,9606,83,P05161,83.0,1,1014228,G,A,S,N,rs1921,1,949608,0.370246762,0.4111114339,0.2454099255,0.205113894,0.4757061975,0.4041889166,0.3777358071,0.3399131441,0.3734580725,0.3662007238,0.339171018,0.334,Unknown,0.311,T,0.01,B,0.005,B,0.000631,N,0.999999,P,1.6,L,1.59,T,-1.02,N,0.012,-0.9028,T,0.0,T,2.276,1921
2415,1,1262635,142934629,NP_001025056,71274150,46,L,Q,524934171,L,47,100.0,,4K80.A,9606,46,Q5TA50,46.0,1,1327255,T,A,L,Q,rs142934629,1,1262635,0.0019542234,0.000102795,0.0001756219,0.0,0.0,0.0008136429,0.0,0.0106200524,0.0023970592,0.0013848768,0.014814129,0.129,Neutral,0.001,D,0.999,D,0.985,D,9e-06,U,1.0,D,3.37,M,.,.,-4.38,D,0.948,0.1935,D,0.5264,D,25.7,142934629
1009,1,1262979,61746802,NP_001025056,71274150,161,A,T,524934171,A,162,100.0,,4K80.A,9606,161,Q5TA50,161.0,1,1327599,G,A,A,T,rs61746802,1,1262979,0.0011473985,0.0137367435,0.0004735701,0.0,0.0,1.68e-05,0.0,6.32e-05,0.0008197235,0.0015836973,0.0,0.407,Unknown,0.006,D,1.0,D,0.999,D,0.0,U,1.0,D,3.475,M,.,.,-3.8,D,0.918,0.1628,D,0.4086,T,25.8,61746802


## View mutations grouped by protein chain
Use the slider to view each protein chain.

In [9]:
# One group per PDB chain; the group keys have the form '<pdbId>.<chainId>' (e.g. '3SDL.C')
chains = dp.groupby('pdbChainId')

In [10]:
def view_grouped_mutations(grouped_df, *args):
    """Browse mutations chain by chain in an interactive py3Dmol viewer.

    A slider selects one group; the corresponding PDB entry is loaded, the
    chain of interest is drawn in blue and every mutated residue of the group
    is highlighted in red (sticks plus translucent spheres).

    Parameters
    ----------
    grouped_df : pandas groupby object
        Rows grouped by a key of the form '<pdb_id>.<chain_id>'. Each group
        must provide the columns 'pdbResNum' and 'rs_id'.
    *args : str
        Names of additional columns. For each one, the value found in the
        first row of the selected group is printed below the header.

    Returns
    -------
    Whatever ``ipywidgets.interact`` returns for the inner viewer function.
    """
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        group = grouped_df.get_group(chainIds[i])

        pdb_id, chain_id = chainIds[i].split('.')
        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        col = 'red'
        c_col = col + 'Carbon'

        # Several rows can refer to the same residue and rs id (for example one
        # row per alternative amino acid); draw and label each pair only once.
        sites = group[['pdbResNum', 'rs_id']].drop_duplicates()
        for res, rs in zip(sites['pdbResNum'], sites['rs_id']):
            res_num = str(res)
            mod_res = {'resi': res_num, 'chain': chain_id}
            viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})
            if show_labels:
                label = 'rs' + str(rs)
                viewer.addLabel(label, {'fontSize':10,'fontColor': 'black','backgroundColor':'ivory'}, {'resi': res_num, 'chain': chain_id})

        # print header
        print("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe;
        # str() keeps this working for numeric or missing values
        for a in args:
            print(a + ": " + str(group.iloc[0][a]))

        viewer.zoomTo({'chain': chain_id})

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

In [11]:
# The extra column names are printed under each structure's header;
# the trailing ';' suppresses the repr of the returned value.
view_grouped_mutations(chains, 'uniprotId','Chromosome');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [12]:
def view_single_mutation(df, distance_cutoff, *args):
    """View one mutation at a time in an interactive py3Dmol viewer.

    A slider selects a row of ``df``. The PDB entry of that row is loaded, the
    chain of interest is drawn as a semi-transparent blue cartoon, the mutated
    residue is highlighted in red and the residues around it are drawn as
    orange sticks.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'pdbChainId' (formatted '<pdb_id>.<chain_id>'),
        'pdbResNum' and 'rs_id'. Rows are addressed by position (``iloc``).
    distance_cutoff : number
        Passed as the 'expand' value of the selection around the mutated
        residue (presumably in Angstrom - confirm against the 3Dmol docs).
    *args : str
        Names of additional columns whose values are printed for the selected row.

    Returns
    -------
    Whatever ``ipywidgets.interact`` returns for the inner viewer function.
    """

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        row = df.iloc[i]
        pdb_id, chain_id = row['pdbChainId'].split('.')

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # highlight chain of interest in blue
        viewer.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'opacity':0.5}})

        res_num = str(row['pdbResNum'])
        label = 'rs' + str(row['rs_id'])
        mod_res = {'resi': res_num, 'chain': chain_id}

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': distance_cutoff}

        # Residues surrounding the mutation position. This selection also
        # contains the mutated residue itself, so it is styled first and the
        # red highlight below is added afterwards; otherwise the orange stick
        # style would replace the red one on the mutated residue.
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        # mutated residue: red sticks plus a translucent red sphere
        col = 'red'
        c_col = col + 'Carbon'
        viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
        viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.8}})

        if show_labels:
            viewer.addLabel(label, {'fontSize':12,'fontColor': 'black','backgroundColor':'ivory'}, {'resi': res_num, 'chain': chain_id})

        viewer.zoomTo(surroundings)

        if show_surface:
            viewer.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})

        # print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "mutation:", label)

        # print any specified additional columns from the dataframe
        for a in args:
            print(a + ": " + str(row[a]))

        return viewer.show()

    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

## View one mutation at a time
Use the slider to view each mutation. Residues within the distance_cutoff of 8 Å of the mutated residue are rendered as orange sticks.

In [13]:
# Radius around the mutated residue within which neighboring residues are drawn
# (the notebook text above gives this as 8 A)
distance_cutoff = 8
# The listed column names are printed for the selected mutation;
# the trailing ';' suppresses the repr of the returned value.
view_single_mutation(dp, distance_cutoff, 'uniprotId','Chromosome','Position','Reference_allele','Altered_allele','Reference_AA','Altered_AA','clinsig', 'AF_Adj');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [14]:
# Shut down the Spark session created at the top of the notebook
spark.stop()