## Find Missense Mutations in PDB
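This notebook is a prototype for visualizing the positions of missense mutations from [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/) (GRCh37 build) for cases where a protein structure contains the mutated amino acid. Here the variants of interest are taken from the CFTR2 variant list (cystic fibrosis) and matched to dbSNP by their rsID.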

In [1]:
import warnings
warnings.filterwarnings("ignore") # numpy version issue?
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import collect_set, collect_list, concat_ws
from mmtfPyspark.datasets import dbSnpDataset
import pandas as pd
from ipywidgets import interact, IntSlider, widgets
from IPython.display import display
import py3Dmol

In [2]:
# Load the CFTR2 variant list straight from the CFTR2 website.
# skiprows/skipfooter drop the rows above and below the table in the sheet
# (presumably title and footnote rows -- confirm against the spreadsheet);
# 'not found', 'nan' and 'NaN' cells are read as missing values.
# Note: the original chained assignment (`cf = pd.df = ...`) also attached the
# frame to the pandas module as `pd.df`; only `cf` is used in this notebook.
cf = pd.read_excel(
    'https://www.cftr2.org/sites/default/files/CFTR2_8December2017_2.xlsx',
    skiprows=10,
    skipfooter=5,
    na_values=['not found', 'nan', 'NaN'],
)

In [3]:
# Cast the rsID column to plain strings so it can be processed with str methods.
cf['rsID'] = cf['rsID'].astype(str)

# Derive the SNP id used for the dbSNP join by removing 'rs' from the rsID.
cf['snpId'] = [rs_id.replace('rs', '') for rs_id in cf['rsID']]

# The previous-version determination column is cast to str as well
# (presumably so the column has a single type for Spark schema inference -- confirm).
prev_determination_col = 'Variant final determination\n17 March 2017\n(previous version)'
cf[prev_determination_col] = cf[prev_determination_col].astype(str)

# Show the resulting column types.
cf.dtypes

Variant cDNA name\n(ordered 5' to 3')                                               object
Variant protein name                                                                object
Variant legacy name                                                                 object
rsID                                                                                object
# alleles in CFTR2                                                                   int64
Allele frequency in CFTR2\n(of 142,036 identified variants)*                       float64
% pancreatic insufficient (patients with variant in trans with ACMG-PI variant)    float64
Variant final determination\n17 March 2017\n(previous version)                      object
Variant final determination\n6 December 2017\n(current version)                     object
Change from previous version?                                                       object
snpId                                                                               object

In [4]:
# Preview only the first rows; dumping hundreds of rows bloats the saved notebook.
cf.head(10)

Unnamed: 0,Variant cDNA name (ordered 5' to 3'),Variant protein name,Variant legacy name,rsID,# alleles in CFTR2,"Allele frequency in CFTR2 (of 142,036 identified variants)*",% pancreatic insufficient (patients with variant in trans with ACMG-PI variant),Variant final determination 17 March 2017 (previous version),Variant final determination 6 December 2017 (current version),Change from previous version?,snpId
0,c.-9_14del23,No protein name,124del23bp,rs397508136,6,0.000042,1.000000,CF-causing,CF-causing,No,397508136
1,c.(?_1)_(53+1_54-1)del,p.Glu2GlyfsX17,CFTRdele1,,6,0.000042,1.000000,CF-causing,CF-causing,No,
2,c.1A>G,p.Met1Val,M1V,rs397508328,26,0.000183,0.842105,CF-causing,CF-causing,No,397508328
3,c.4C>T,p.Gln2X,Q2X,rs397508740,5,0.000035,1.000000,CF-causing,CF-causing,No,397508740
4,c.11C>A,p.Ser4X,S4X,rs397508173,14,0.000099,1.000000,CF-causing,CF-causing,No,397508173
5,c.14C>T,p.Pro5Leu,P5L,rs193922501,60,0.000422,0.095238,,Varying clinical consequence,Yes,193922501
6,c.50delT,p.Phe17SerfsX8,182delT,rs397508742,9,0.000063,0.857143,CF-causing,CF-causing,No,397508742
7,c.(53+1_54-1)_(164+1_165-1)del,No protein name,CFTRdele2,,46,0.000324,1.000000,CF-causing,CF-causing,No,
8,c.(53+1_54-1)_(489+1_490-1)del,No protein name,CFTRdele2-4,,4,0.000028,1.000000,CF-causing,CF-causing,No,
9,c.53+1G>T,No protein name,185+1G->T,rs397508746,8,0.000056,1.000000,CF-causing,CF-causing,No,397508746


### Initialize Spark

In [5]:
# Create (or reuse) a Spark session that runs locally with 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("CTFR2_SNPsTo3D")
    .getOrCreate()
)

In [6]:
# Convert the pandas CFTR2 table into a Spark DataFrame and print the schema
# Spark derived for it (column names are taken over unchanged from pandas).
ctfr2 = spark.createDataFrame(cf)
ctfr2.printSchema()

root
 |-- Variant cDNA name
(ordered 5' to 3'): string (nullable = true)
 |-- Variant protein name: string (nullable = true)
 |-- Variant legacy name: string (nullable = true)
 |-- rsID: string (nullable = true)
 |-- # alleles in CFTR2: long (nullable = true)
 |-- Allele frequency in CFTR2
(of 142,036 identified variants)*: double (nullable = true)
 |-- % pancreatic insufficient (patients with variant in trans with ACMG-PI variant): double (nullable = true)
 |-- Variant final determination
17 March 2017
(previous version): string (nullable = true)
 |-- Variant final determination
6 December 2017
(current version): string (nullable = true)
 |-- Change from previous version?: string (nullable = true)
 |-- snpId: string (nullable = true)



## Read file with dbSNP info
The following dataset was created from the SNP3D_PDB_GRCH37 dataset by mapping non-synonymous SNPs to human proteins with >= 95% sequence identity in the PDB.

In [7]:
# Load the dbSNP dataset through mmtfPyspark's dbSnpDataset helper
# and report how many rows it contains.
dbsnp = dbSnpDataset.get_cached_dataset()
dbsnp.count()

1171630

In [8]:
# Keep only CFTR2 variants whose SNP id also occurs in the dbSNP dataset
# (join() without a `how` argument performs an inner join).
snp_id_matches = ctfr2.snpId == dbsnp.snp_id
ds = ctfr2.join(dbsnp, on=snp_id_matches)
ds.show()

+------------------------------------+--------------------+-------------------+-----------+------------------+-----------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------------+-------------------------------------------------------------+-----------------------------+---------+---+---------+---------+----------+---------+----------+----------+----------+----------+-------+-------+-----------+--------------------+----------+------+---------+---------+----------+
|Variant cDNA name
(ordered 5' to 3')|Variant protein name|Variant legacy name|       rsID|# alleles in CFTR2|Allele frequency in CFTR2
(of 142,036 identified variants)*|% pancreatic insufficient (patients with variant in trans with ACMG-PI variant)|Variant final determination
17 March 2017
(previous version)|Variant final determination
6 December 2017
(current version)|Change from previous versi

## Filter by clinical significance

In [9]:
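# NOTE: this filter is disabled; `significance` and `sig_query` are not defined
# in the cells shown in this notebook, so the block is kept for reference only.
# if significance.value and not 'All' in significance.value:
#     ds = ds.filter(sig_query)
#     print("Results: ", ds.count())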

In [10]:
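# NOTE: this filter is disabled; `field` and `query` are not defined
# in the cells shown in this notebook, so the block is kept for reference only.
# if field.value in ['pdbChainId', 'uniprotId']:
#     print("Filtered by query: ", query)
#     ds = ds.filter(query)
#     ds.show(5)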

### Show some sample results

In [11]:
# Limit on the Spark side so only the 20 preview rows are transferred to the
# driver, instead of converting the whole joined dataset to pandas first.
ds.limit(20).toPandas()

Unnamed: 0,Variant cDNA name (ordered 5' to 3'),Variant protein name,Variant legacy name,rsID,# alleles in CFTR2,"Allele frequency in CFTR2 (of 142,036 identified variants)*",% pancreatic insufficient (patients with variant in trans with ACMG-PI variant),Variant final determination 17 March 2017 (previous version),Variant final determination 6 December 2017 (current version),Change from previous version?,...,pdb_gi,pdb_res,pdb_pos,blast_ident,clinsig,pdbChainId,tax_id,pdbResNum,uniprotId,uniprotNum
0,c.1721C>A,p.Pro574His,P574H,rs121908758,25,0.000176,0.35,,CF-causing,Yes,...,1132605083,P,574,100.0,Likely pathogenic,5UAK.A,9606,574,P13569,574
1,c.720_741delAGGGAGAATGATGATGAAGTAC,p.Gly241GlufsX13,852del22,rs121908804,13,9.2e-05,1.0,CF-causing,CF-causing,No,...,1132605083,G,241,100.0,Pathogenic,5UAK.A,9606,241,P13569,241
2,c.1021T>C,p.Ser341Pro,S341P,rs397508144,23,0.000162,0.375,CF-causing,CF-causing,No,...,1132605083,S,341,100.0,Pathogenic,5UAK.A,9606,341,P13569,341
3,c.4036_4042delCTAAGCC,p.Leu1346MetfsX6,4168delCTAAGCC,rs397508662,3,2.1e-05,1.0,,CF-causing,Yes,...,1132605083,L,1346,100.0,,5UAK.A,9606,1346,P13569,1346
4,c.494T>C,p.Leu165Ser,L165S,rs397508736,21,0.000148,0.368421,,CF-causing,Yes,...,1132605083,L,165,100.0,untested,5UAK.A,9606,165,P13569,165
5,c.1692delA,p.Asp565MetfsX7,1824delA,rs193922505,5,3.5e-05,1.0,CF-causing,CF-causing,No,...,1132605083,D,565,100.0,Pathogenic,5UAK.A,9606,565,P13569,565
6,c.1650delA,p.Gly551ValfsX8,1782delA,rs397508251,4,2.8e-05,1.0,CF-causing,CF-causing,No,...,1132605083,G,551,100.0,Pathogenic,5UAK.A,9606,551,P13569,551
7,c.310delA,p.Arg104GlufsX3,442delA,rs397508499,7,4.9e-05,0.857143,CF-causing,CF-causing,No,...,1132605083,R,104,100.0,Pathogenic,5UAK.A,9606,104,P13569,104
8,c.543_546delTAGT,p.Leu183PhefsX5,675del4,rs397508750,4,2.8e-05,1.0,CF-causing,CF-causing,No,...,1132605083,L,183,100.0,Pathogenic,5UAK.A,9606,183,P13569,183
9,c.859_863delAACTT,p.Asn287LysfsX19,991del5,rs397508805,15,0.000106,1.0,CF-causing,CF-causing,No,...,1132605083,N,287,100.0,Pathogenic,5UAK.A,9606,287,P13569,287


## Aggregate data on the residue and chain level

In [12]:
# Residue level: one row per (chain, residue number, reference residue, UniProt id).
# The distinct variant residues are concatenated without a separator, the
# distinct clinical significances are joined with ',', and the label column
# 'snps' is built as '<master_res>-><master_var>'.
per_residue = (
    ds.groupBy("pdbChainId", "pdbResNum", "master_res", "uniprotId")
    .agg(
        collect_set("master_var").alias("master_var"),
        collect_set("clinsig").alias("clinsig"),
    )
    .withColumn("master_var", concat_ws("", "master_var"))
    .withColumn("clinsig", concat_ws(",", "clinsig"))
    .withColumn("snps", concat_ws("->", "master_res", "master_var"))
    .drop("master_res")
)

# Chain level: one row per (chain, UniProt id) holding lists of residue
# numbers, labels and clinical significances.
ds = per_residue.groupBy("pdbChainId", "uniprotId").agg(
    collect_list("pdbResNum").alias("pdbResNums"),
    collect_list("snps").alias("snps"),
    collect_list("clinsig").alias("clinsig"),
)

In [13]:
# Bring the chain-level aggregate into pandas; `df` is what the viewer
# functions below iterate over. Show the first rows as a sanity check.
df = ds.toPandas()
df.head(20)

Unnamed: 0,pdbChainId,uniprotId,pdbResNums,snps,clinsig
0,5UAK.A,P13569,"[98, 349, 970, 525, 201, 334, 945, 560, 1371, ...","[Q->*PR, A->V, G->RS, Q->*, V->M, R->QW, S->L,...","[Pathogenic,untested, Pathogenic, Pathogenic, ..."


In [14]:
def view_modifications(df, cutoff_distance, *args):
    """Create an interactive 3D view of mutated residues, one structure per row of ``df``.

    Mutated residues are drawn as sticks plus semi-transparent spheres and are
    colored by the first clinical-significance value listed for the residue.
    Residues selected by expanding the mutated residues by ``cutoff_distance``
    are drawn as thin orange-carbon sticks.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per PDB chain. The columns read here are 'pdbChainId'
        (formatted '<pdbId>.<chainId>'), 'pdbResNums', 'snps' and 'clinsig';
        the last three are iterated in parallel, one entry per residue.
    cutoff_distance : number
        Passed as the 'expand' value of the selection for surrounding residues.
    *args : str
        Names of additional ``df`` columns whose values are printed above the viewer.

    Returns
    -------
    The object returned by ``ipywidgets.interact``.
    """
    # Color per clinical significance; '' covers residues without any value.
    sig_dir = {'Benign':'green', 'Likely benign':'turquoise', 'Likely pathogenic':'palevioletred', \
               'Pathogenic':'red', 'drug-response':'plum', 'untested':'white', \
               'Uncertain significance': 'lightgray', 'other':'white', 'null':'white', '':'white'}
    # Significance values missing from the table are drawn like 'other'
    # instead of aborting the whole view with a KeyError.
    fallback_color = 'white'

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        row = df.iloc[i]
        pdb_id, chain_id = row['pdbChainId'].split('.')
        res_num = row['pdbResNums']
        labels = row['snps']
        sigs = row['clinsig']

        # print header
        print ("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe
        # (str() so that non-string columns can be printed as well)
        for a in args:
            print(a + ": " + str(row[a]))

        all_residues = {'resi': res_num, 'chain': chain_id}

        # select neighboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': cutoff_distance}

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        # polymer style
        viewer.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # residues surrounding mutation positions
        viewer.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})

        # mutation positions
        for label, res, sig in zip(labels, res_num, sigs):
            sig1 = sig.split(',')[0] # if multiple values, use the first one
            col = sig_dir.get(sig1, fallback_color)
            mod_res = {'resi': res, 'chain': chain_id}
            c_col = col + "Carbon"  # color scheme name, e.g. 'redCarbon'
            viewer.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}})

            if show_labels:
                viewer.addLabel(label + " " + sig, {'fontSize':10,'fontColor':col,'backgroundColor':'ivory'}, {'resi': res, 'chain': chain_id})

        viewer.zoomTo(all_residues)

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'})

        return viewer.show()

    # one slider position per row (structure) of the dataframe
    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

In [15]:
%%javascript 
// Replace the output area's _should_scroll check with one that always returns
// false (presumably so tall outputs such as the 3D viewer are not put into a
// scroll box -- confirm for the notebook front end in use).
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

## Visualize locations of missense mutations
Mutated residues are rendered as sticks and transparent spheres, and colored by ClinVar significance. Each mutation is labeled with its amino acid change (reference residue -> variant residues) and its ClinVar significance. Residues surrounding mutation sites (within 6 Å) are rendered as thin orange sticks. Small molecules within the structure are rendered as gray sticks.

In [16]:
# Show the viewer with a cutoff distance of 6 for surrounding residues and print
# the 'uniprotId' column for each structure; the trailing ';' suppresses the
# repr of the returned value.
view_modifications(df, 6, 'uniprotId');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [17]:
def view_surface(df, cutoff_distance, *args):
    """Create an interactive space-filling 3D view of mutated residues, one structure per row of ``df``.

    All atoms are drawn as semi-transparent spheres colored by chain; mutated
    residues are redrawn as opaque spheres colored by the first
    clinical-significance value listed for the residue.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per PDB chain. The columns read here are 'pdbChainId'
        (formatted '<pdbId>.<chainId>'), 'pdbResNums', 'snps' and 'clinsig';
        the last three are iterated in parallel, one entry per residue.
    cutoff_distance : number
        Not used by this view; kept in the signature so existing calls keep working.
    *args : str
        Names of additional ``df`` columns whose values are printed above the viewer.

    Returns
    -------
    The object returned by ``ipywidgets.interact``.
    """
    # Color per clinical significance. The '' entry covers residues without any
    # value, which previously raised a KeyError in this view.
    sig_dir = {'Benign':'green', 'Likely benign':'turquoise', 'Likely pathogenic':'palevioletred', \
               'Pathogenic':'red', 'drug-response':'plum', 'untested':'white', \
               'Uncertain significance': 'lightgray', 'other':'white', 'null':'white', '':'white'}
    # Significance values missing from the table are drawn like 'other'
    # instead of aborting the whole view with a KeyError.
    fallback_color = 'white'

    def view3d(show_bio_assembly=False, show_surface=False, show_labels=True, i=0):
        row = df.iloc[i]
        pdb_id, chain_id = row['pdbChainId'].split('.')
        res_num = row['pdbResNums']
        labels = row['snps']
        sigs = row['clinsig']

        # print header
        print ("PDB Id: " + pdb_id + " chain Id: " + chain_id)

        # print any specified additional columns from the dataframe
        # (str() so that non-string columns can be printed as well)
        for a in args:
            print(a + ": " + str(row[a]))

        viewer = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly})

        all_residues = {'resi': res_num, 'chain': chain_id}

        # polymer style
        viewer.setStyle({'sphere': {'colorscheme': 'chain', 'opacity':0.6}})

        # non-polymer style
        viewer.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # mutation style
        for label, res, sig in zip(labels, res_num, sigs):
            sig1 = sig.split(',')[0] # if multiple values, use the first one
            col = sig_dir.get(sig1, fallback_color)
            mod_res = {'resi': res, 'chain': chain_id}
            viewer.setStyle(mod_res, {'sphere':{'color':col}})

            if show_labels:
                viewer.addLabel(label + " " + sig, {'fontSize':10,'fontColor':col,'backgroundColor':'ivory'}, {'resi': res, 'chain': chain_id})

        viewer.zoomTo(all_residues)

        if show_surface:
            viewer.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'})

        return viewer.show()

    # one slider position per row (structure) of the dataframe
    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, i=s_widget)

In [18]:
# Show the space-filling view and print the 'uniprotId' column for each
# structure. The second argument (6) is accepted by view_surface but its body
# never reads it; the trailing ';' suppresses the repr of the returned value.
view_surface(df, 6, 'uniprotId');

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [19]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()