## Map SNPs from dbSNP to 3D structures from PDB.
This notebook is a prototype for visualizing the positions of missense mutations mapped from [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/) (GRCh37 build) to 3D protein structures in the Protein Data Bank.

In [1]:
import warnings
warnings.filterwarnings("ignore") # numpy version issue?
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import collect_set, collect_list, concat_ws
from mmtfPyspark.datasets import dbSnpDataset
import pandas as pd
from ipywidgets import interact, IntSlider, widgets
from IPython.display import display
import py3Dmol

### Setup widgets

In [2]:
# Query-field selector and free-text box used to optionally filter the dataset.
field = widgets.Dropdown(
    description='Select field:',
    options=('none', 'snp_id', 'pdbChainId', 'uniprotId', 'sqlQuery'),
)
selection = widgets.Textarea(description='Enter id(s):', value='')

# Multi-select of ClinVar significance levels; the initial selection is
# 'Benign' through 'drug-response'.
significance = widgets.SelectMultiple(
    description='Significance:',
    options=(
        'All', 'Benign', 'Likely benign', 'Likely pathogenic',
        'Pathogenic', 'drug-response', 'untested',
        'Uncertain significance', 'other', 'null',
    ),
    value=(
        'Benign', 'Likely benign', 'Likely pathogenic',
        'Pathogenic', 'drug-response',
    ),
)

## Select clinical significance
Select one or more significance levels from ClinVar (macOS: hold the Command key to select multiple criteria).

Default: Benign, Likely benign, Likely pathogenic, Pathogenic, drug-response.

In [3]:
# Render the significance multi-select so the selection can be changed before the query is built.
display(significance)

SelectMultiple(description='Significance:', index=(1, 2, 3, 4, 5), options=('All', 'Benign', 'Likely benign', …

## Optionally, filter dataset
Select a query field and enter a comma separated list of identifiers:

Example queries below are for missense mutations in the Cystic Fibrosis [CFTR2 gene](https://www.cftr2.org/mutations_history).

* none: no filtering (Default)
* snp_id:  397508256, 397508796 (also called the rsId, e.g. rs397508256)
* pdbChainId: 5UAK.A
* uniprotId: P13569
* sqlQuery: any valid sql query (e.g., chr = 7 AND pos = 117149089)

In [4]:
# Render the query-field dropdown and the text area for the identifiers / SQL expression.
display(field)
display(selection)

Dropdown(description='Select field:', options=('none', 'snp_id', 'pdbChainId', 'uniprotId', 'sqlQuery'), value…

Textarea(value='', description='Enter id(s):')

### Create query string

In [5]:
# Build the clinical-significance filter unless 'All' is selected (or nothing is selected).
# str(tuple) yields "('a', 'b')"; the replace() removes the trailing comma of a 1-tuple.
if significance.value and 'All' not in significance.value:
    sig_query = "clinsig IN " + str(tuple(significance.value)).replace(",)", ")")
    print("Query:", sig_query)

if field.value == 'sqlQuery':
    # The text area content is used verbatim as the filter expression.
    query = selection.value
    print("Query:", query)

elif field.value != 'none':
    # Strip whitespace (including newlines) around each identifier and skip empty
    # entries, so that input such as "397508256, 397508796" matches both ids.
    ids = tuple(s.strip() for s in selection.value.split(",") if s.strip())
    # With no identifiers entered, fall back to a single empty string: this keeps
    # the SQL valid ("IN ('')") and matches no rows, as before.
    ids = ids or ('',)
    query = field.value + " IN " + str(ids).replace(",)", ")")
    print("Query:", query)

Query: clinsig IN ('Benign', 'Likely benign', 'Likely pathogenic', 'Pathogenic', 'drug-response')


### Initialize Spark

In [6]:
# Start (or reuse) a local Spark session with 4 worker threads.
spark = SparkSession.builder.master("local[4]").appName("dbSNPTo3DChain").getOrCreate()

## Read file with dbSNP info
The following dataset was created from the SNP3D_PDB_GRCH37 dataset by mapping non-synonymous SNPs to human proteins with >= 95% sequence identity in the PDB.

In [7]:
# Load the dbSNP dataset via mmtfPyspark (presumably a locally cached copy — confirm
# against the dbSnpDataset documentation) and show its total row count.
ds = dbSnpDataset.get_cached_dataset()
ds.count()

1171630

### Run query

In [8]:
# Narrow the dataset by clinical significance unless 'All' was chosen
# (or nothing is selected at all).
if significance.value and 'All' not in significance.value:
    ds = ds.filter(sig_query)

# Apply the optional filter for any of the four query fields.
if field.value in ('snp_id', 'pdbChainId', 'uniprotId', 'sqlQuery'):
    ds = ds.filter(query)

print("Results: ", ds.count())

Results:  49713


### Show some sample results

In [9]:
# Preview only: take 20 rows in Spark first, rather than converting the whole
# dataset to pandas and then discarding all but the first 20 rows.
ds.limit(20).toPandas()

Unnamed: 0,chr,pos,snp_id,master_acc,master_gi,master_pos,master_res,master_var,pdb_gi,pdb_res,pdb_pos,blast_ident,clinsig,pdbChainId,tax_id,pdbResNum,uniprotId,uniprotNum
0,X,100630190,128620185,NP_001274274,565324227,28,R,H,5542071,R,27,100.0,Pathogenic,1B55.A,9606,28,Q06187,28.0
1,16,223496,63750010,NP_000508,4504345,109,T,N,3891440,T,109,100.0,Likely pathogenic,1BZ1.A,9606,109,P69905,109.0
2,22,51065646,74315462,NP_000478,313569791,138,P,L,12084623,P,118,99.796,Pathogenic,1E2S.P,9606,136,P15289,136.0
3,22,51065646,74315462,NP_001078894,313569793,138,P,L,12084623,P,118,99.796,Pathogenic,1E2S.P,9606,136,P15289,136.0
4,22,51065646,74315462,NP_001078895,313569795,138,P,L,12084623,P,118,99.796,Pathogenic,1E2S.P,9606,136,P15289,136.0
5,22,51065646,74315462,NP_001078896,313569797,138,P,L,12084623,P,118,99.796,Pathogenic,1E2S.P,9606,136,P15289,136.0
6,22,51065136,199476366,NP_000478,313569791,246,R,H,12084623,R,226,99.796,Pathogenic,1E2S.P,9606,244,P15289,244.0
7,22,51065136,199476366,NP_001078894,313569793,246,R,H,12084623,R,226,99.796,Pathogenic,1E2S.P,9606,244,P15289,244.0
8,22,51065137,74315470,NP_000478,313569791,246,R,C,12084623,R,226,99.796,Likely pathogenic,1E2S.P,9606,244,P15289,244.0
9,22,51065137,74315470,NP_001078894,313569793,246,R,C,12084623,R,226,99.796,Likely pathogenic,1E2S.P,9606,244,P15289,244.0


## Aggregate data on the residue and chain level

In [10]:
ds = (
    ds
    # Residue level: collect the distinct variant residues and significance
    # values observed at each position of each chain.
    .groupBy("pdbChainId", "pdbResNum", "master_res", "uniprotId")
    .agg(
        collect_set("master_var").alias("master_var"),
        collect_set("clinsig").alias("clinsig"),
    )
    # Flatten the collected sets into strings, e.g. variants "SA" and
    # significance "Benign,Pathogenic".
    .withColumn("master_var", concat_ws("", "master_var"))
    .withColumn("clinsig", concat_ws(",", "clinsig"))
    # Mutation label of the form "<reference residue>-><variant residues>".
    .withColumn("snps", concat_ws("->", "master_res", "master_var"))
    .drop("master_res")
    # Chain level: one row per chain with parallel lists, one entry per residue.
    .groupBy("pdbChainId", "uniprotId")
    .agg(
        collect_list("pdbResNum").alias("pdbResNums"),
        collect_list("snps").alias("snps"),
        collect_list("clinsig").alias("clinsig"),
    )
)

In [11]:
# Collect the aggregated (one row per chain) result into pandas for the viewers below,
# and show the first rows.
df = ds.toPandas()
df.head(20)

Unnamed: 0,pdbChainId,uniprotId,pdbResNums,snps,clinsig
0,2AMY.A,O15305,"[63, 132, 69, 171, 226, 187, 32, 157, 141, 188...","[K->N, I->T, P->SA, T->N, T->S, W->*, L->R, F-...","[Likely pathogenic, Likely pathogenic, Likely ..."
1,3CJW.A,P24468,"[245, 343, 286, 341]","[L->P, C->*, V->G, S->Y]","[Likely pathogenic, Likely pathogenic, Pathoge..."
2,5IZQ.A,P15328,"[182, 74, 96]","[R->*, C->Y, Q->*]","[Likely pathogenic, Likely pathogenic, Pathoge..."
3,5VBU.A,Q16874,"[63, 319, 54, 173, 31, 238, 357, 184, 126, 237...","[H->L, Q->*, Q->E, I->N, P->L, V->E, R->W, D->...","[Pathogenic, Pathogenic, Likely pathogenic, Pa..."
4,5FDY.A,O60939,[119],[P->L],[Likely benign]
5,2M3T.A,P22914,[18],[G->V],[Pathogenic]
6,4LY9.A,Q14397,"[103, 529, 51]","[V->ML, Q->*, R->*]","[Likely pathogenic, Likely pathogenic, Likely ..."
7,1KO9.A,O15527,[46],[R->Q],[Pathogenic]
8,4AOH.A,P03950,"[112, 12, 31, 39, 40, 46, 113, 28, 17, 60]","[P->L, Q->L, R->K, C->W, K->I, I->V, V->I, S->...","[Pathogenic, Pathogenic, Pathogenic, Pathogeni..."
9,3W1W.A,P22830,"[129, 334, 267, 194, 362, 172, 417, 408, 185, ...","[G->R, P->L, M->I, Y->L, V->G, E->K, F->S, N->...","[Likely benign, Likely pathogenic, Likely beni..."


### Setup visualization

In [12]:
def view_modifications(df, cutoff_distance, *args):
    """Interactively display the mutated residues of each structure in ``df``.

    The structure is drawn as a cartoon; mutated residues are added as sticks
    and transparent spheres colored by ClinVar significance, and the residues
    selected around them with ``cutoff_distance`` as thin orange-carbon sticks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per structure. Reads the columns 'pdbChainId' (split on '.'
        into PDB id and chain id), 'pdbResNums', 'snps' and 'clinsig'; the
        last three are iterated in parallel, one entry per mutated residue.
    cutoff_distance : number
        Value of the 'expand' key in the selection of surrounding residues.
    *args : str
        Names of additional ``df`` columns whose value is printed for the
        displayed row.

    Returns
    -------
    The object returned by ``ipywidgets.interact``, or ``None`` (after
    printing a notice) when ``df`` has no rows.
    """
    # With no rows the slider below would get max=-1 < min=0.
    if len(df) == 0:
        print("No structures to display.")
        return None

    # Color per ClinVar significance; built once instead of on every redraw.
    sig_colors = {'Benign':'green', 'Likely benign':'turquoise', 'Likely pathogenic':'palevioletred',
                  'Pathogenic':'red', 'drug-response':'plum', 'untested':'white',
                  'Uncertain significance': 'lightgray', 'other':'white', 'null':'white'}
    # Used for significance values without an entry above (including an empty
    # string), which previously raised KeyError.
    fallback_color = 'white'

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        row = df.iloc[i]
        pdb_id, chain_id = row['pdbChainId'].split('.')
        res_num = row['pdbResNums']
        labels = row['snps']
        sigs = row['clinsig']

        # print header
        print ("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe
        # (str() so that non-string columns do not break the concatenation)
        for a in args:
            print(a + ": " + str(row[a]))

        all_residues = {'resi': res_num, 'chain': chain_id}

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': cutoff_distance}

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # residues surrounding mutation positions
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        # mutation positions
        for label, res, sig in zip(labels, res_num, sigs):
            sig1 = sig.split(',')[0] # if multiple values, use the first one
            col = sig_colors.get(sig1, fallback_color)
            mod_res = {'resi': res, 'chain': chain_id}
            c_col = col + "Carbon"
            viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})

            if show_labels:
                viewer.addLabel(label + " " + sig, {'fontSize':10,'fontColor':col,'backgroundColor':'ivory'}, mod_res)

        viewer.zoomTo(all_residues)

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'})

        return viewer.show()

    # One slider position per row of df.
    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

## Visualize locations of missense mutations
Residues affected by mutations are rendered as sticks and transparent spheres, and colored by ClinVar significance. Each mutated residue position is labeled with its amino-acid change (e.g., K->N) and ClinVar significance. Residues surrounding mutation sites (within 6 Å) are rendered as thin orange sticks. Small molecules within the structure are rendered as gray sticks.

Move the slider to browse through the structures. For an unobstructed view, turn off the labels.

In [13]:
# 6 is the cutoff distance for the surrounding residues; 'uniprotId' is printed for each
# structure. The trailing ';' suppresses the repr of the returned value.
view_modifications(df, 6, 'uniprotId');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## Alternative visualization

In [14]:
def view_surface(df, cutoff_distance, *args):
    """Interactively display each structure in ``df`` as spheres with mutations highlighted.

    All atoms are drawn as semi-transparent spheres colored by chain; the
    mutated residues are re-styled as spheres colored by ClinVar significance.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per structure. Reads the columns 'pdbChainId' (split on '.'
        into PDB id and chain id), 'pdbResNums', 'snps' and 'clinsig'; the
        last three are iterated in parallel, one entry per mutated residue.
    cutoff_distance : number
        Currently unused by this function; kept so existing calls keep working.
    *args : str
        Names of additional ``df`` columns whose value is printed for the
        displayed row.

    Returns
    -------
    The object returned by ``ipywidgets.interact``, or ``None`` (after
    printing a notice) when ``df`` has no rows.
    """
    # With no rows the slider below would get max=-1 < min=0.
    if len(df) == 0:
        print("No structures to display.")
        return None

    # Color per ClinVar significance; built once instead of on every redraw.
    sig_colors = {'Benign':'green', 'Likely benign':'turquoise', 'Likely pathogenic':'palevioletred',
                  'Pathogenic':'red', 'drug-response':'plum', 'untested':'white',
                  'Uncertain significance': 'lightgray', 'other':'white', 'null':'white'}
    # Used for significance values without an entry above (including an empty
    # string), which previously raised KeyError.
    fallback_color = 'white'

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        row = df.iloc[i]
        pdb_id, chain_id = row['pdbChainId'].split('.')
        res_num = row['pdbResNums']
        labels = row['snps']
        sigs = row['clinsig']

        # print header
        print ("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe
        # (str() so that non-string columns do not break the concatenation)
        for a in args:
            print(a + ": " + str(row[a]))

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        all_residues = {'resi': res_num, 'chain': chain_id}

        # polymer style
        viewer.setStyle({'sphere': {'colorscheme': 'chain', 'opacity':0.6}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # mutation style
        for label, res, sig in zip(labels, res_num, sigs):
            sig1 = sig.split(',')[0] # if multiple values, use the first one
            col = sig_colors.get(sig1, fallback_color)
            mod_res = {'resi': res, 'chain': chain_id}
            viewer.setStyle(mod_res, {'sphere':{'color':col}})

            if show_labels:
                viewer.addLabel(label + " " + sig, {'fontSize':10,'fontColor':col,'backgroundColor':'ivory'}, mod_res)

        viewer.zoomTo(all_residues)

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'})

        return viewer.show()

    # One slider position per row of df.
    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

In [15]:
# 'uniprotId' is printed for each structure; the cutoff argument (6) is accepted but not
# used by view_surface. The trailing ';' suppresses the repr of the returned value.
view_surface(df, 6, 'uniprotId');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [16]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()