## Map Mutations to 3D Structures in the Protein Data Bank

In [6]:
from pyspark.sql import SparkSession
from mmtfPyspark.datasets import g2sDataset, pdbjMineDataset, pdbToUniProt
from ipywidgets import interact, IntSlider
import py3Dmol
import pandas as pd

In [7]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [8]:
# Initialize Spark
spark = SparkSession.builder.appName("2-MapTo3DStructures").getOrCreate()

#### Input parameters

In [9]:
distance_cutoff = 8 # distance cutoff for visualizing interactions
input_file_name = 'mutations.csv' # mutations in standard format (e.g., 5:g.149440497C>T)
mapping_file_name = 'mutations_g2s.csv' # results from G2S mapping
output_file_name = 'mutations3d.csv' # mutations mapped to 3D protein structures

## Read 'mutations.csv' file created in the previous step

In [10]:
df = pd.read_csv(input_file_name)

## Create a list of the variants

In [11]:
var_ids = df['var_id'].tolist()
var_ids

['6:g.52619766C>T', '9:g.133738358A>T', '11:g.5246945G>T']

## Map the mutations from genome locations to 3D PDB positions¶
Here we use [g2sDataset](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/mmtfPyspark/datasets/g2sDataset.py) to retrieve genome to PDB mapping information using the [G2S](https://g2s.genomenexus.org/) (Genome to Structure) web services.

Note, this step may take several minutes to complete.

In [12]:
pdb_map = g2sDataset.get_full_dataset(var_ids, ref_genome='hgvs-grch37').toPandas()

In [13]:
pdb_map.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid
0,15184682,431.409,B,1.96091e-153,213.0,216.0,AEKPKLHY N RGRMES RWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYFNARGRMESTRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,1ags,1ags_B_1,1,221,hgvs-grch37,"[(R, 82, G, 83)]",1,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2017-10-25,6:g.52619766C>T,1AGS,82,R
1,15184683,431.409,A,1.96091e-153,213.0,216.0,AEKPKLHY N RGRMES RWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYFNARGRMESTRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,1ags,1ags_A_1,1,221,hgvs-grch37,"[(R, 82, G, 83)]",1,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2017-10-25,6:g.52619766C>T,1AGS,82,R
2,15184686,425.631,B,3.81593e-151,210.0,214.0,AEKPKLHY N RGRMES RWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYFNARGRMESTRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,1pl1,1pl1_B_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2017-10-25,6:g.52619766C>T,1PL1,83,G
3,15184687,425.631,A,3.81593e-151,210.0,214.0,AEKPKLHY N RGRMES RWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYFNARGRMESTRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,1pl1,1pl1_A_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2017-10-25,6:g.52619766C>T,1PL1,83,G
4,15184688,425.631,B,3.81593e-151,210.0,214.0,AEKPKLHY N RGRMES RWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYFNARGRMESTRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,1pkw,1pkw_B_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2017-10-25,6:g.52619766C>T,1PKW,83,G


In [14]:
pdb_map.to_csv(mapping_file_name, index=False)

### Filter by sequence identity to PDB sequence
Keep only records where the PDB sequence is at least 98% identical to the reference sequence

In [15]:
pdb_map['seqIdentity'] = pdb_map.identity/(pdb_map.seqTo - pdb_map.seqFrom + 1) * 100
pdb_map = pdb_map[pdb_map.seqIdentity >= 98]
pdb_map

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid,seqIdentity
30,48820803,446.432,H,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_H_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
31,48820804,446.432,G,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_G_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
32,48820805,446.432,F,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_F_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
33,48820806,446.432,E,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_E_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
34,48820807,446.432,D,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_D_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
35,48820808,446.432,C,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_C_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
36,48820809,446.432,B,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_B_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
37,48820810,446.432,A,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_A_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511
38,52640951,432.18,C,1.1291800000000001e-157,215.0,215.0,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,1,4acs,4acs_C_1,1,218,hgvs-grch37,"[(G, 83, G, 83)]",4,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,4,434735,221,2019-10-01,6:g.52619766C>T,4ACS,83,G,98.623853
39,52640952,432.18,A,1.1291800000000001e-157,215.0,215.0,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,1,4acs,4acs_A_1,1,218,hgvs-grch37,"[(G, 83, G, 83)]",4,KPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRNDGY...,4,434735,221,2019-10-01,6:g.52619766C>T,4ACS,83,G,98.623853


## Filter PDB Chains

Here we use the SIFTS annotation provided by EBI to filter by taxonomy. To learn more about how to [retrieve SIFTS annotation](
https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/SiftsDataDemo.ipynb).

#### To filter by organism, we first retrieve the SIFTS taxonomy annotations on a PDB chain level
Here we are only interested in human proteins.

In [16]:
taxonomyQuery = "SELECT * FROM sifts.pdb_chain_taxonomy WHERE sifts.pdb_chain_taxonomy.scientific_name = 'Homo sapiens'"
taxonomy = pdbjMineDataset.get_dataset(taxonomyQuery).toPandas()
taxonomy.head()

Unnamed: 0,pdbid,chain,tax_id,scientific_name,structureChainId
0,10GS,A,9606,Homo sapiens,10GS.A
1,10GS,B,9606,Homo sapiens,10GS.B
2,11GS,A,9606,Homo sapiens,11GS.A
3,11GS,B,9606,Homo sapiens,11GS.B
4,121P,A,9606,Homo sapiens,121P.A


## Filter the PDB chains by joining with the taxonomy data

In [17]:
pdb_filtered = pdb_map.merge(taxonomy, left_on=['structureId','chainId'], right_on=['pdbid','chain'], how='inner')
pdb_filtered = pdb_filtered.drop(['pdbid','chain'], axis=1)  # remove redundant columns
pdb_filtered['pdbPosition'] = pdb_filtered['pdbPosition'].astype('str') # must be string
pdb_filtered.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid,seqIdentity,tax_id,scientific_name,structureChainId
0,48820803,446.432,H,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_H_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.H
1,48820803,446.432,H,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_H_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.H
2,48820804,446.432,G,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_G_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.G
3,48820804,446.432,G,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_G_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.G
4,48820805,446.432,F,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_F_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.F


In [18]:
chains = set(pdb_filtered.structureChainId)
print(chains)

{'2FO0.A', '3KMF.C', '2WJU.G', '1Y8W.D', '2WJU.C', '2V7A.A', '1YHE.D', '1YHR.B', '1YH9.D', '2W6V.D', '2G1T.D', '4TWP.B', '1XZ7.D', '2GQG.B', '1Y8W.B', '2VCT.F', '2WJU.F', '2GQG.A', '1XZV.B', '2VCT.D', '1Y0D.D', '2HZI.A', '1XZ4.B', '1Y0A.D', '1XXT.B', '2G1T.B', '2VCT.C', '2VCT.H', '1XZ2.D', '1VWT.B', '1Y0A.B', '1YZI.B', '1XZ5.D', '1XZU.D', '2WJU.B', '4ACS.B', '4ACS.D', '1XY0.B', '1Y09.D', '1QXD.D', '1YHE.B', '1XZV.D', '2HIW.B', '2VCT.G', '1XY0.D', '1XYE.B', '3QRI.A', '3QRK.A', '4ACS.A', '4ACS.C', '1YDZ.D', '1Y09.B', '2V7A.B', '1XXT.D', '2WJU.D', '2WJU.H', '1XZ4.D', '1XYE.D', '1VWT.D', '6FQF.C', '1Y0C.D', '2W6V.B', '1Y0C.B', '1YHR.D', '1OPL.A', '1XZU.B', '1DXT.B', '2G2F.A', '2G1T.A', '1YDZ.B', '2G2H.B', '2F4J.A', '2VCT.A', '5MO4.A', '1XZ2.B', '2WJU.E', '1DXT.D', '4ZOG.A', '2VCT.E', '2VCT.B', '1Y0D.B', '2G2H.A', '2G1T.C', '4WA9.B', '1XZ7.B', '4XS0.B', '1YH9.B', '6FQF.D', '3KMF.G', '1XZ5.B', '2HIW.A', '2WJU.A', '3UE4.A', '1QXD.B'}


## Get PDB to UniProt Residue Mappings

Download PDB to UniProt mappings and filter out residues that were not observed in the 3D structure.

In [19]:
up = pdbToUniProt.get_cached_residue_mappings().filter("pdbResNum IS NOT NULL").filter("uniprotNum IS NOT NULL")
up_map = up.filter(up.structureChainId.isin(chains)).toPandas()
up_map['uniprotNum'] = up_map.uniprotNum.astype('int') 
                       
up_map.head()

Unnamed: 0,structureChainId,pdbResNum,pdbSeqNum,uniprotId,uniprotNum
0,1QXD.B,1,1,P68871,2
1,1QXD.B,2,2,P68871,3
2,1QXD.B,3,3,P68871,4
3,1QXD.B,4,4,P68871,5
4,1QXD.B,5,5,P68871,6


In [20]:
#pdb_filtered = pdb_filtered.merge(up_map, left_on=['structureChainId','pdbPosition'], right_on=['structureChainId','pdbResNum'], how='inner')

# changed from inner join to left join since not all pdb chains have a UniProt mapping and the chached UniProt mappings are not up to date!

pdb_filtered = pdb_filtered.merge(up_map, left_on=['structureChainId','pdbPosition'], right_on=['structureChainId','pdbResNum'], how='left')

In [21]:
pdb_filtered.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid,seqIdentity,tax_id,scientific_name,structureChainId,pdbResNum,pdbSeqNum,uniprotId,uniprotNum
0,48820803,446.432,H,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_H_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.H,83,83,P09210,83
1,48820803,446.432,H,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_H_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.H,83,83,P09210,83
2,48820804,446.432,G,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_G_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.G,83,83,P09210,83
3,48820804,446.432,G,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_G_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.G,83,83,P09210,83
4,48820805,446.432,F,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_F_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.F,83,83,P09210,83


## Merge results with input dataframe

In [22]:
pdb_filtered = pdb_filtered.merge(df, left_on='variationId', right_on='var_id', how='inner')
pdb_filtered.head()

Unnamed: 0,alignmentId,bitscore,chainId,evalue,identity,identityPositive,midlineAlign,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid,seqIdentity,tax_id,scientific_name,structureChainId,pdbResNum,pdbSeqNum,uniprotId,uniprotNum,ID,CHROM,POS,REF,ALT,annotation,color,var_id
0,48820803,446.432,H,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_H_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.H,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,6:g.52619766C>T
1,48820803,446.432,H,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_H_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.H,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,6:g.52619766C>T
2,48820804,446.432,G,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_G_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.G,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,6:g.52619766C>T
3,48820804,446.432,G,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_G_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.G,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,6:g.52619766C>T
4,48820805,446.432,F,4.25707e-163,220.0,221.0,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,1,2wju,2wju_F_1,1,221,hgvs-grch37,"[(G, 83, G, 83)]",2,AEKPKLHYSNIRGRMESIRWLLAAAGVEFEEKFIKSAEDLDKLRND...,2,434735,222,2019-05-10,6:g.52619766C>T,2WJU,83,G,99.547511,9606,Homo sapiens,2WJU.F,83,83,P09210,83,rs147776857,6,52619766,C,T,GSTA2 missense mutation,blue,6:g.52619766C>T


In [23]:
pdb_filtered.fillna('', inplace=True)

## Save mappings

In [24]:
pdb_filtered.to_csv(output_file_name, index=False)

## View mutations grouped by protein chain
Use the slider to view each protein chain.

In [25]:
chains = pdb_filtered.groupby('structureChainId')
print("Chains:", list(chains.groups.keys()))

Chains: ['1DXT.B', '1DXT.D', '1OPL.A', '1QXD.B', '1QXD.D', '1VWT.B', '1VWT.D', '1XXT.B', '1XXT.D', '1XY0.B', '1XY0.D', '1XYE.B', '1XYE.D', '1XZ2.B', '1XZ2.D', '1XZ4.B', '1XZ4.D', '1XZ5.B', '1XZ5.D', '1XZ7.B', '1XZ7.D', '1XZU.B', '1XZU.D', '1XZV.B', '1XZV.D', '1Y09.B', '1Y09.D', '1Y0A.B', '1Y0A.D', '1Y0C.B', '1Y0C.D', '1Y0D.B', '1Y0D.D', '1Y8W.B', '1Y8W.D', '1YDZ.B', '1YDZ.D', '1YH9.B', '1YH9.D', '1YHE.B', '1YHE.D', '1YHR.B', '1YHR.D', '1YZI.B', '2F4J.A', '2FO0.A', '2G1T.A', '2G1T.B', '2G1T.C', '2G1T.D', '2G2F.A', '2G2H.A', '2G2H.B', '2GQG.A', '2GQG.B', '2HIW.A', '2HIW.B', '2HZI.A', '2V7A.A', '2V7A.B', '2VCT.A', '2VCT.B', '2VCT.C', '2VCT.D', '2VCT.E', '2VCT.F', '2VCT.G', '2VCT.H', '2W6V.B', '2W6V.D', '2WJU.A', '2WJU.B', '2WJU.C', '2WJU.D', '2WJU.E', '2WJU.F', '2WJU.G', '2WJU.H', '3KMF.C', '3KMF.G', '3QRI.A', '3QRK.A', '3UE4.A', '4ACS.A', '4ACS.B', '4ACS.C', '4ACS.D', '4TWP.B', '4WA9.B', '4XS0.B', '4ZOG.A', '5MO4.A', '6FQF.C', '6FQF.D']


In [26]:
# Setup viewer
def view_grouped_mutations(grouped_df, *args):
    chainIds = list(grouped_df.groups.keys())

    def view3d(show_bio_assembly, show_surface, show_labels, show_annotations, size, font, i):
        group = grouped_df.get_group(chainIds[i])
        
        pdb_id, chain_id = chainIds[i].split('.')
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})
       
        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})
        
        rows = group.shape[0]
        for j in range(0, rows):
            res_num = str(group.iloc[j]['pdbPosition'])
            mod_res = {'resi': res_num, 'chain': chain_id}
            col = group.iloc[j]['color']
            c_col = col + 'Carbon'
            viewer1.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
            viewer1.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.6}}) 
            
            annotation = group.iloc[j]['annotation']
            mutation = group.iloc[j]['variationId']
            label = ""
            if show_labels:
                label = label + mutation + " "
            if show_annotations:
                label = label + annotation

            if show_annotations or show_labels:
                viewer1.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory'}, {'resi': res_num, 'chain': chain_id})
               
        # print header
        print("PDB Id:", pdb_id, "chain Id:", chain_id, "annotation:", annotation)
        
        # print any specified additional columns from the dataframe
        for a in args:
            print(a + ": " + df.iloc[i][a])

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})
        
        if show_surface:
             viewer1.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        return viewer1.show()
       
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)
    
    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, show_annotations=False, size=750, font=9, i=s_widget)

def view_image1():
    return viewer1.png()

In [27]:
view_grouped_mutations(chains);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

## View one mutation at a time
Use the slider to view each mutation. Surrounding residues within the `distance_cutoff` are rendered as orange sticks.

In [28]:
# Setup viewer
def view_single_mutation(df, distance_cutoff, *args):

    def view3d(show_bio_assembly, show_surface, show_labels, show_annotations, size, font, i):        
        pdb_id, chain_id = df.iloc[i]['structureChainId'].split('.')
        
        global viewer2
        viewer2 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer2.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.9}})

        # non-polymer style
        viewer2.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})
       
        # highlight chain of interest in green
        viewer2.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue', 'opacity':0.5}})
        
        # style for mutated residue
        res_num = str(df.iloc[i]['pdbPosition'])
        mod_res = {'resi': res_num, 'chain': chain_id} 
        col = df.iloc[i]['color']
        c_col = col + 'Carbon'
        viewer2.addStyle(mod_res, {'stick':{'colorscheme':c_col, 'radius': 0.2}})
        viewer2.addStyle(mod_res, {'sphere':{'color':col, 'opacity': 0.8}})          
        
        # text label
        mutation = df.iloc[i]['variationId']
        annotation = df.iloc[i]['annotation']
        label = ""
        if show_labels:
            label = label + mutation + " "
        if show_annotations:
            label = label + annotation
            
        if show_labels or show_annotations:
            viewer2.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory'}, {'resi': res_num, 'chain': chain_id})

        # select neigboring residues by distance
        surroundings = {'chain': chain_id, 'resi': res_num, 'byres': True, 'expand': distance_cutoff}
        
        # residues surrounding mutation positions
        viewer2.addStyle(surroundings,{'stick':{'colorscheme':'orangeCarbon', 'radius': 0.15}})
        
        viewer2.zoomTo(surroundings)
        viewer2.center(surroundings)
        
        if show_surface:
             viewer2.addSurface(py3Dmol.SES, {'opacity':0.8,'color':'lightblue'}, {'chain': chain_id})
                
        #print header
        print("PDB Id:", pdb_id, "chain Id:" , chain_id, "residue:", res_num, "mutation:", mutation, "annotation", annotation)
        
        # print any specified additional columns from the dataframe
        for a in args:
            print(a + ": " + df.iloc[i][a])
          
        return viewer2.show()
       
    s_widget = IntSlider(min=0, max=len(df)-1, description='Structure', continuous_update=False)
    
    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, show_annotations=False, size=750, font=12, i=s_widget)

def view_image2():
    return viewer2.png()

In [29]:
view_single_mutation(pdb_filtered, distance_cutoff);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [30]:
# Shutdown Spark
spark.stop()

## Now run the next step
Map mutations occuring at protein-polymer interfaces: [3-MapToPolymerInteractions.ipynb](3-MapToPolymerInteractions.ipynb)