## Map SNPs from 23andme report to 3D structures from PDB.
This notebook is a prototype for visualizing the positions of missense mutations mapped from [dbSNP](https://www.ncbi.nlm.nih.gov/projects/SNP/) (GRCh37 build) to 3D protein structures in the Protein Data Bank.

In [1]:
import warnings
warnings.filterwarnings("ignore") # numpy version issue?
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import collect_set, collect_list, concat_ws
from mmtfPyspark.datasets import dbSnpDataset
import pandas as pd
from ipywidgets import interact, IntSlider, widgets
from IPython.display import display
import py3Dmol

In [2]:
filename = "/Users/peter/work/genome/genome.txt"

In [3]:
df = pd.read_csv(filename, comment='#', sep='\t', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])

#### Filter out deletions

In [4]:
df = df[df.genotype != '--']

#### Filer out rows without an rsid

In [5]:
df = df[df.rsid.str.startswith('rs')]
df

Unnamed: 0,rsid,chromosome,position,genotype
0,rs548049170,1,69869,TT
2,rs9283150,1,565508,AA
4,rs116587930,1,727841,GG
5,rs3131972,1,752721,GG
6,rs12184325,1,754105,CC
7,rs12567639,1,756268,AA
8,rs114525117,1,759036,GG
10,rs12127425,1,794332,GG
11,rs79373928,1,801536,TT
13,rs7538305,1,824398,AA


In [6]:
#### Split genotypes into alleles

In [7]:
df[['allele1', 'allele2']] = df['genotype'].str.split('', expand=True).drop([0,3], axis=1)

## Read file with dbSNP info
The following dataset was created from the SNP3D_PDB_GRCH37 dataset by mapping non-synonymous SNPs to human proteins with >= 95% sequence identity in the PDB.

In [8]:
ds = dbSnpDataset.get_cached_dataset().toPandas()

#### Run query

In [9]:
# https://www.ncbi.nlm.nih.gov/snp/rs762582956
## add columns ref, alt, codon, so_term

In [10]:
ds

Unnamed: 0,chr,pos,snp_id,master_acc,master_gi,master_pos,master_res,master_var,pdb_gi,pdb_res,pdb_pos,blast_ident,clinsig,pdbChainId,tax_id,pdbResNum,uniprotId,uniprotNum
0,11,67354025,762582956,NP_000843,4504183,204,I,V,4389047,I,204,100.000,,12GS.A,9606,203,P09211,204.0
1,4,79525461,764726341,NP_005130,4826643,274,R,*,157829892,R,274,100.000,,1AII.A,9606,275,P12429,274.0
2,4,79525462,771966889,NP_005130,4826643,274,R,P,157829892,R,274,100.000,,1AII.A,9606,275,P12429,274.0
3,4,79525462,771966889,NP_005130,4826643,274,R,Q,157829892,R,274,100.000,,1AII.A,9606,275,P12429,274.0
4,4,79525488,953760923,NP_005130,4826643,283,D,N,157829892,D,283,100.000,,1AII.A,9606,284,P12429,283.0
5,9,21077604,918229587,NP_002167,4504603,89,A,P,3318960,A,68,100.000,,1AU1.A,9606,68,P01574,89.0
6,2,128186235,140582220,NP_000303,4506115,367,V,M,392312036,V,156,100.000,,1AUT.C,9606,162,P04070,367.0
7,2,128186236,767730328,NP_000303,4506115,367,V,A,392312036,V,156,100.000,,1AUT.C,9606,162,P04070,367.0
8,4,122590843,199713216,NP_001145,4502107,273,M,L,493847,M,273,100.000,,1AVH.A,9606,273,P08758,273.0
9,7,129917687,145958174,NP_001860,217416390,240,R,W,157830140,R,222,99.751,,1AYE.A,9606,130,P48052,240.0


In [14]:
pd.options.display.max_columns = None  # show all columns
pd.options.display.max_rows = None  # show all rows
df = df.merge(ds, left_on=['chromosome', 'position'], right_on=['chr', 'pos'])
df = df.drop_duplicates(subset=['rsid'])
df = df[df.allele1 != df.allele2]
df

Unnamed: 0,rsid,chromosome,position,genotype,allele1,allele2,chr_x,pos_x,snp_id_x,master_acc_x,master_gi_x,master_pos_x,master_res_x,master_var_x,pdb_gi_x,pdb_res_x,pdb_pos_x,blast_ident_x,clinsig_x,pdbChainId_x,tax_id_x,pdbResNum_x,uniprotId_x,uniprotNum_x,chr_y,pos_y,snp_id_y,master_acc_y,master_gi_y,master_pos_y,master_res_y,master_var_y,pdb_gi_y,pdb_res_y,pdb_pos_y,blast_ident_y,clinsig_y,pdbChainId_y,tax_id_y,pdbResNum_y,uniprotId_y,uniprotNum_y,chr_x.1,pos_x.1,snp_id_x.1,master_acc_x.1,master_gi_x.1,master_pos_x.1,master_res_x.1,master_var_x.1,pdb_gi_x.1,pdb_res_x.1,pdb_pos_x.1,blast_ident_x.1,clinsig_x.1,pdbChainId_x.1,tax_id_x.1,pdbResNum_x.1,uniprotId_x.1,uniprotNum_x.1,chr_y.1,pos_y.1,snp_id_y.1,master_acc_y.1,master_gi_y.1,master_pos_y.1,master_res_y.1,master_var_y.1,pdb_gi_y.1,pdb_res_y.1,pdb_pos_y.1,blast_ident_y.1,clinsig_y.1,pdbChainId_y.1,tax_id_y.1,pdbResNum_y.1,uniprotId_y.1,uniprotNum_y.1
19,rs145552478,16,88871972,AG,A,G,16,88871972,145552478,NP_112190,188497689,205,G,S,262118476,G,205,99.817,Uncertain significance,2WVR.C,9606,205,Q9H211,205.0,16,88871972,145552478,NP_112190,188497689,205,G,S,262118476,G,205,99.817,Uncertain significance,2WVR.C,9606,205,Q9H211,205.0,16,88871972,145552478,NP_112190,188497689,205,G,S,262118476,G,205,99.817,Uncertain significance,2WVR.C,9606,205,Q9H211,205.0,16,88871972,145552478,NP_112190,188497689,205,G,S,262118476,G,205,99.817,Uncertain significance,2WVR.C,9606,205,Q9H211,205.0
51,rs2292954,16,89613123,AG,A,G,16,89613123,2292954,NP_003110,4507173,503,T,A,158430509,T,200,99.617,other,2QZ4.A,9606,503,Q9UQ90,503.0,16,89613123,2292954,NP_003110,4507173,503,T,A,158430509,T,200,99.617,other,2QZ4.A,9606,503,Q9UQ90,503.0,16,89613123,2292954,NP_003110,4507173,503,T,A,158430509,T,200,99.617,other,2QZ4.A,9606,503,Q9UQ90,503.0,16,89613123,2292954,NP_003110,4507173,503,T,A,158430509,T,200,99.617,other,2QZ4.A,9606,503,Q9UQ90,503.0
110,rs12948217,17,3397702,CT,C,T,17,3397702,12948217,NP_001121557,189339202,231,Y,*,160285727,Y,231,100.0,Pathogenic,2O4H.A,9606,231,P45381,231.0,17,3397702,12948217,NP_001121557,189339202,231,Y,*,160285727,Y,231,100.0,Pathogenic,2O4H.A,9606,231,P45381,231.0,17,3397702,12948217,NP_001121557,189339202,231,Y,*,160285727,Y,231,100.0,Pathogenic,2O4H.A,9606,231,P45381,231.0,17,3397702,12948217,NP_001121557,189339202,231,Y,*,160285727,Y,231,100.0,Pathogenic,2O4H.A,9606,231,P45381,231.0
130,rs238238,17,4856376,AG,A,G,17,4856376,238238,NP_001967,301897469,71,N,S,311771970,S,72,99.539,untested,2XSX.A,9606,71,P13929,71.0,17,4856376,238238,NP_001967,301897469,71,N,S,311771970,S,72,99.539,untested,2XSX.A,9606,71,P13929,71.0,17,4856376,238238,NP_001967,301897469,71,N,S,311771970,S,72,99.539,untested,2XSX.A,9606,71,P13929,71.0,17,4856376,238238,NP_001967,301897469,71,N,S,311771970,S,72,99.539,untested,2XSX.A,9606,71,P13929,71.0
909,rs1979277,17,18232096,AG,A,G,17,18232096,1979277,NP_001268715,528881075,336,L,F,5821827,L,464,99.415,Uncertain significance,1BJ4.A,9606,474,P34896,474.0,17,18232096,1979277,NP_001268715,528881075,336,L,F,5821827,L,464,99.415,Uncertain significance,1BJ4.A,9606,474,P34896,474.0,17,18232096,1979277,NP_001268715,528881075,336,L,F,5821827,L,464,99.415,Uncertain significance,1BJ4.A,9606,474,P34896,474.0,17,18232096,1979277,NP_001268715,528881075,336,L,F,5821827,L,464,99.415,Uncertain significance,1BJ4.A,9606,474,P34896,474.0
964,rs1050565,17,28576076,CT,C,T,17,28576076,1050565,NP_000377,4557367,443,I,V,7245509,I,442,99.779,Benign,1CB5.A,9606,443,Q13867,443.0,17,28576076,1050565,NP_000377,4557367,443,I,V,7245509,I,442,99.779,Benign,1CB5.A,9606,443,Q13867,443.0,17,28576076,1050565,NP_000377,4557367,443,I,V,7245509,I,442,99.779,Benign,1CB5.A,9606,443,Q13867,443.0,17,28576076,1050565,NP_000377,4557367,443,I,V,7245509,I,442,99.779,Benign,1CB5.A,9606,443,Q13867,443.0
1017,rs41283425,17,39925713,CT,C,T,17,39925713,41283425,NP_002221,4504811,142,R,H,258588652,R,19,100.0,Likely benign,3IFQ.A,9606,142,P14923,142.0,17,39925713,41283425,NP_002221,4504811,142,R,H,258588652,R,19,100.0,Likely benign,3IFQ.A,9606,142,P14923,142.0,17,39925713,41283425,NP_002221,4504811,142,R,H,258588652,R,19,100.0,Likely benign,3IFQ.A,9606,142,P14923,142.0,17,39925713,41283425,NP_002221,4504811,142,R,H,258588652,R,19,100.0,Likely benign,3IFQ.A,9606,142,P14923,142.0
1031,rs2304497,17,40065774,GT,G,T,17,40065774,2304497,NP_001290204,740086852,229,E,D,350610483,E,175,98.776,,3PFF.A,9606,175,P53396,175.0,17,40065774,2304497,NP_001290204,740086852,229,E,D,350610483,E,175,98.776,,3PFF.A,9606,175,P53396,175.0,17,40065774,2304497,NP_001290204,740086852,229,E,D,350610483,E,175,98.776,,3PFF.A,9606,175,P53396,175.0,17,40065774,2304497,NP_001290204,740086852,229,E,D,350610483,E,175,98.776,,3PFF.A,9606,175,P53396,175.0
1427,rs17679445,17,46022065,AG,A,G,17,46022065,17679445,NP_060599,8922498,116,R,Q,28948989,R,116,100.0,Likely benign,1NRG.A,9606,116,Q9NVS9,116.0,17,46022065,17679445,NP_060599,8922498,116,R,Q,28948989,R,116,100.0,Likely benign,1NRG.A,9606,116,Q9NVS9,116.0,17,46022065,17679445,NP_060599,8922498,116,R,Q,28948989,R,116,100.0,Likely benign,1NRG.A,9606,116,Q9NVS9,116.0,17,46022065,17679445,NP_060599,8922498,116,R,Q,28948989,R,116,100.0,Likely benign,1NRG.A,9606,116,Q9NVS9,116.0
1496,rs4362,17,61573761,CT,C,T,17,61573761,770429365,NP_690043,23238214,556,V,Q,149242580,V,489,100.0,,2OC2.A,9606,525,P12821,1130.0,17,61573761,770429365,NP_690043,23238214,556,V,Q,149242580,V,489,100.0,,2OC2.A,9606,525,P12821,1130.0,17,61573761,770429365,NP_690043,23238214,556,V,Q,149242580,V,489,100.0,,2OC2.A,9606,525,P12821,1130.0,17,61573761,770429365,NP_690043,23238214,556,V,Q,149242580,V,489,100.0,,2OC2.A,9606,525,P12821,1130.0
