# Maps SARS-CoV-2 Mutations to 3D Protein Structures
[Work in progress]

This notebook maps the mutation frequency of SARS-CoV-2 strains onto 3D protein structures with bound antibody fragments in the [Protein Data Bank](https://www.wwpdb.org/). SARS-CoV-2 strains and their mutations have been aggregated in the [COVID-19-Net Knowledge Graph](https://github.com/covid-19-net/covid-19-community).

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import matplotlib.cm as cm
import ipywidgets as widgets
from ipywidgets import interact, IntSlider, FloatSlider, SelectMultiple
from py2neo import Graph
import py3Dmol

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

In [3]:
output_file_name = 'mutations3d.csv' # mutations mapped to 3D protein structures

#### Connect to COVID-19-Community Knowledge Graph
[COVID-19-Net Knowledge Graph](https://github.com/covid-19-net/covid-19-community)

In [4]:
graph = Graph("bolt://132.249.238.185:7687", user="reader", password="demo")

### Taxonomy ids for pathogen (SARS-CoV-2) and host (human)

In [5]:
pathogen_taxonomy_id = 'taxonomy:2697049'
host_taxonomy_id = 'taxonomy:9606'
lineage = 'B.1.1.7'

### Get list of SARS-CoV-2 proteins

In [6]:
query = """
MATCH (p:Protein{taxonomyId: $pathogen_taxonomy_id})-[t:HAS_TERTIARY_STRUCTURE]->(:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
WHERE t.coverage > 0.2 // eliminate polyprotein
RETURN p.name AS protein, p.accession as accession, p.proId as proId, p.sequence as sequence, s.description AS description
ORDER BY protein
"""

In [7]:
structures = graph.run(query, pathogen_taxonomy_id=pathogen_taxonomy_id).to_data_frame()

In [8]:
proteins = structures[['protein', 'accession', 'proId', 'sequence']].drop_duplicates()

In [9]:
proteins.head(50)

Unnamed: 0,protein,accession,proId,sequence
0,3C-like proteinase,uniprot:P0DTC1,uniprot.chain:PRO_0000449639,SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTS...
29,Helicase,uniprot:P0DTD1,uniprot.chain:PRO_0000449630,AVGACVLCNSQTSLRCGACIRRPFLCCKCCYDHVISTSHKLVLSVN...
31,Non-structural protein 10,uniprot:P0DTC1,uniprot.chain:PRO_0000449644,AGNATEVPANSTVLSFCAFAVDAAKAYKDYLASGGQPITNCVKMLC...
32,Non-structural protein 7,uniprot:P0DTC1,uniprot.chain:PRO_0000449641,SKMSDVKCTSVVLLSVLQQLRVESSSKLWAQCVQLHNDILLAKDTT...
34,Non-structural protein 8,uniprot:P0DTC1,uniprot.chain:PRO_0000449642,AIASEFSSLPSYAAFATAQEAYEQAVANGDSEVVLKKLKKSLNVAK...
36,Non-structural protein 9,uniprot:P0DTC1,uniprot.chain:PRO_0000449643,NNELSPVALRQMSCAAGTTQTACTDDNALAYYNTTKGGRFVLALLS...
38,Nucleoprotein,uniprot:P0DTC9,uniprot.chain:PRO_0000449656,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...
42,ORF9b protein,uniprot:P0DTD2,uniprot.chain:PRO_0000449657,MDPKISEMHPALRLVDPQIQLAVTRMENAVGRDQNNVGPKVYPIIL...
44,Spike glycoprotein,uniprot:P0DTC2,,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
98,Spike glycoprotein,uniprot:P0DTC2,uniprot.chain:PRO_0000449646,SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...


In [10]:
protein_list = structures['protein'].unique()

In [11]:
protein_widget = widgets.Dropdown(options=protein_list, description='Select protein:', value='Spike glycoprotein',
                                  style={'description_width': 'initial'}, )

### Select SARS-CoV-2 Protein

In [12]:
display(protein_widget)

Dropdown(description='Select protein:', index=8, options=('3C-like proteinase', 'Helicase', 'Non-structural pr…

In [13]:
protein_name = protein_widget.value
print('Protein name:', protein_name)

Protein name: Spike glycoprotein


In [14]:
subset = structures.query(f'protein == "{protein_name}"').copy()
subset.fillna('', inplace=True)

In [15]:
subset.head()

Unnamed: 0,accession,description,proId,protein,sequence
44,uniprot:P0DTC2,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",,Spike glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
45,uniprot:P0DTC2,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",,Spike glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
46,uniprot:P0DTC2,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",,Spike glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
47,uniprot:P0DTC2,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",,Spike glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
48,uniprot:P0DTC2,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",,Spike glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...


In [16]:
# descriptions = list(subset['description'].unique())
# descriptions.sort()
# descriptions.insert(0, 'All')

In [17]:
# description_widget = SelectMultiple(options=descriptions, value=['All'], description='Select subset', 
#                                     layout={'width': 'max-content'}, style={'description_width': 'initial'}, 
#                                     rows=len(descriptions), disabled=False)

### Select a subset of structures to be analyzed based on PDB structure description
The default is All structures. Multiple values can be selected with shift and/or ctrl (or command) pressed and mouse clicks or arrow keys.

In [18]:
# display(description_widget)

In [19]:
# print("Selected subset:")
# for value in description_widget.value:
#     print(value)

### Get total number of strains

In [20]:
query = """
MATCH (s:Strain)
WHERE s.taxonomyId = $pathogen_taxonomy_id AND s.hostTaxonomyId = $host_taxonomy_id AND s.lineage = $lineage
RETURN count(s)
"""

In [21]:
strains = graph.evaluate(query, pathogen_taxonomy_id=pathogen_taxonomy_id, 
                         host_taxonomy_id=host_taxonomy_id, lineage=lineage)

In [22]:
print('Total number of human strains:', strains)

Total number of human strains: 230753


### Get variants for selected protein

In [23]:
query = """
MATCH (p:Protein{reviewed: True})-[:HAS_VARIANT]->(v:Variant{variantConsequence:'missense_variant'})<-[:HAS_VARIANT]-(s:Strain)
WHERE p.name = $protein_name AND p.taxonomyId = $pathogen_taxonomy_id AND s.hostTaxonomyId = $host_taxonomy_id AND s.lineage = $lineage
WITH v.proteinPosition AS residue, count(v.proteinVariant) AS count, 
     v.proteinVariant + '(' + count(v.proteinVariant) + ')' AS variationId ,
     split(v.proteinVariant, ':')[1] + '(' + count(v.proteinVariant) + ')' AS annotation
     ORDER by count DESC
WITH residue, count, variationId, annotation
RETURN residue, collect(variationId) AS variationId, collect(annotation) AS annotation, sum(count) AS count ORDER BY residue
"""

### Add mutation annotation to each residue

In [24]:
variants = graph.run(query, protein_name=protein_name, pathogen_taxonomy_id=pathogen_taxonomy_id, 
                     host_taxonomy_id=host_taxonomy_id, lineage=lineage).to_data_frame()

In [25]:
variants.shape

(946, 4)

In [26]:
variants.head()

Unnamed: 0,annotation,count,residue,variationId
0,"[p.2F>S(4), p.2F>L(4)]",8,2,"[S:p.2F>S(4), S:p.2F>L(4)]"
1,"[p.3V>G(30), p.3V>I(17), p.3V>F(2)]",49,3,"[S:p.3V>G(30), S:p.3V>I(17), S:p.3V>F(2)]"
2,"[p.4F>S(5), p.4F>I(1)]",6,4,"[S:p.4F>S(5), S:p.4F>I(1)]"
3,[p.5L>F(3945)],3945,5,[S:p.5L>F(3945)]
4,"[p.6V>A(363), p.6V>F(20), p.6V>I(8), p.6V>L(1)]",392,6,"[S:p.6V>A(363), S:p.6V>F(20), S:p.6V>I(8), S:p..."


In [27]:
variants.sort_values('count', ascending=False, inplace=True)

In [28]:
variants['variationId'] = variants['variationId'].apply(lambda x: ', '.join(x))
variants['annotation'] = variants['annotation'].apply(lambda x: ', '.join(x))

In [29]:
# Remove the literal "p." prefix from each annotation. regex=False makes the match literal
# regardless of the pandas version's default (under regex mode "." would match any character).
variants['annotation'] = variants['annotation'].str.replace('p.', '', regex=False)

Create a color scale based on the log mutation frequency

In [30]:
total = variants['count'].sum()
variants['scale'] = variants['count'].apply(np.log10) / math.log10(total)

In [31]:
n_colors = 100
# Sample the Reds colormap at n_colors evenly spaced points and keep each sample as a hex string.
colors = cm.Reds(np.linspace(0.0, 1.0, n_colors))
col = np.array([matplotlib.colors.rgb2hex(rgba) for rgba in colors], dtype=object)

In [32]:
# Clamp the palette index: a scale that rounds up to n_colors (e.g. scale == 1.0) would
# otherwise index one past the end of `col`.
variants['color'] = variants['scale'].apply(lambda x: col[min(round(x * n_colors), n_colors - 1)])

In [33]:
variants.head()

Unnamed: 0,annotation,count,residue,variationId,scale,color
479,"614D>G(339712), 614D>N(18), 614D>A(6)",339736,614,"S:p.614D>G(339712), S:p.614D>N(18), S:p.614D>A(6)",0.865227,#a30f15
556,"716T>I(339522), 716T>A(56), 716T>S(2)",339580,716,"S:p.716T>I(339522), S:p.716T>A(56), S:p.716T>S(2)",0.865196,#a30f15
447,"570A>D(339534), 570A>S(12), 570A>V(4), 570A>G(2)",339552,570,"S:p.570A>D(339534), S:p.570A>S(12), S:p.570A>V...",0.86519,#a30f15
530,"681P>H(339282), 681P>R(258), 681P>S(4)",339544,681,"S:p.681P>H(339282), S:p.681P>R(258), S:p.681P>...",0.865189,#a30f15
734,982S>A(339500),339500,982,S:p.982S>A(339500),0.86518,#a30f15


### Get 3D structure for selected protein

In [34]:
# some Electron microscopy structures are of low resolution. Keep only high and medium resolution structures (0 - 4 Å).
resolution_threshold = 4.0

In [35]:
query = """
MATCH (g:Gene)-[:ENCODES]->(p:Protein{name: $protein_name, taxonomyId: $pathogen_taxonomy_id})-[h:HAS_TERTIARY_STRUCTURE]->(c:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
WHERE s.resolution <= $resolution_threshold
RETURN p.name AS name, p.start, p.end, c.name AS structureChainId, c.uniprotStart, c.uniprotEnd, c.pdbStart, c.pdbEnd, s.resolution AS resolution, s.description AS description, h.coverage AS coverage
ORDER BY resolution, coverage DESC
"""

In [36]:
chains = graph.run(query, protein_name=protein_name, pathogen_taxonomy_id=pathogen_taxonomy_id, 
                   resolution_threshold=resolution_threshold).to_data_frame()

In [37]:
chains['structureChainId'] = chains['structureChainId'].str[4:]

In [38]:
chains['structureId'] = chains['structureChainId'].str[:4]

In [39]:
chains.head()

Unnamed: 0,c.pdbEnd,c.pdbStart,c.uniprotEnd,c.uniprotStart,coverage,description,name,p.end,p.start,resolution,structureChainId,structureId
0,[528],[334],[528],[334],0.153181,"Spike glycoprotein, Nanobody H11-D4, 2-acetami...",Spike glycoprotein,1273,1,1.8,6YZ5.E,6YZ5
1,[528],[334],[528],[334],0.153181,"Spike glycoprotein, H11-H4, 2-acetamido-2-deox...",Spike glycoprotein,1273,1,1.85,6ZBP.EEE,6ZBP
2,"[70, 617, 676, 1147]","[14, 76, 633, 689]","[70, 617, 676, 1147]","[14, 76, 633, 689]",0.865672,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",Spike glycoprotein,1273,1,2.6,7E7B.B,7E7B
3,"[70, 617, 676, 1147]","[14, 76, 633, 689]","[70, 617, 676, 1147]","[14, 76, 633, 689]",0.865672,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",Spike glycoprotein,1273,1,2.6,7E7B.A,7E7B
4,"[70, 617, 676, 1147]","[14, 76, 633, 689]","[70, 617, 676, 1147]","[14, 76, 633, 689]",0.865672,"Spike glycoprotein,Collagen alpha-1(I) chain, ...",Spike glycoprotein,1273,1,2.6,7E7B.C,7E7B


In [40]:
chains.drop_duplicates(subset=['structureChainId'], inplace=True)
chains.sort_values(by=['structureChainId'], inplace=True)

#### Map uniprot residue numbers to PDB residue numbers

In [41]:
def uniprot_to_pdb_mapping(row):
    """Build a lookup from UniProt residue numbers to PDB residue numbers for one chain.

    The row supplies four parallel sequences ('c.uniprotStart', 'c.uniprotEnd',
    'c.pdbStart', 'c.pdbEnd'), one entry per aligned segment. PDB boundaries are
    converted with int(). A segment whose UniProt span and PDB span differ in length
    is reported on stdout and left out of the result.

    :param row: mapping-like object with the four segment columns and 'structureChainId'
    :return: dict of UniProt residue number -> PDB residue number
    """
    residue_map = {}
    segments = zip(row['c.uniprotStart'], row['c.uniprotEnd'], row['c.pdbStart'], row['c.pdbEnd'])

    for uni_first, uni_last, pdb_first, pdb_last in segments:
        pdb_first, pdb_last = int(pdb_first), int(pdb_last)
        uni_span = uni_last - uni_first
        pdb_span = pdb_last - pdb_first

        if uni_span == pdb_span:
            # constant shift between the two numbering schemes within this segment
            shift = pdb_first - uni_first
            residue_map.update({pos: pos + shift for pos in range(uni_first, uni_last + 1)})
            continue

        print('length mismatch:', row['structureChainId'], uni_span, pdb_span)

    return residue_map

In [42]:
chains['mapping'] = chains.apply(lambda row: uniprot_to_pdb_mapping(row), axis=1)

length mismatch: 6LXT.A 35 38
length mismatch: 6LXT.B 37 40
length mismatch: 6LXT.C 36 39
length mismatch: 6LXT.D 31 34
length mismatch: 6LXT.E 36 39
length mismatch: 6LXT.F 33 36


### Visualize mutation sites

Mutations are mapped onto protein chains for available 3D protein structures.

Display options:

|||
|:-|:-|
| *show_bio_assembly* | Toggle display of the biologically relevant quaternary structure |
| *show_surface* | Toggle surface for protein chain |
| *show_short_label* | Toggle display of mutation information<br>{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 501N>Y(350436)|
| *show_long_label* | Toggle display of mutation information<br>{PDBId}.{chainId}.{PDBResidue}: {geneName}:p.{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 6XDG.E.501: S:p.501N>Y(350436) |
| *size* | Change size of visualization |
| *font* | Change font size of annotations |
| *logFreq* | Change minimum threshold to display mutations based on normalized log of mutation frequency [0.0 - 1.0]|
| *structure* | Move slider to browse through available structures |

#### Example: Move the structure slider to PDB ID: 6XDG to see how mutations (e.g., 501N>Y) affect the binding of the Regeneron antibodies

In [43]:
# Setup viewer
def view_mutations(df, variants, *args):
    """Create an interactive py3Dmol view that marks mutated residues on a protein chain.

    :param df: one row per chain. Columns read: 'structureChainId' (split on '.' into
        PDB id and chain id), 'mapping' (dict of UniProt residue number -> PDB residue
        number), 'description', 'resolution', 'coverage' and 'name'
    :param variants: one row per mutated residue. Columns read: 'residue', 'color',
        'scale', 'annotation' and 'variationId'
    :param args: names of additional columns of df to print below the structure summary
    :return: the result of ipywidgets.interact wrapping the viewer callback
    """
    chainIds = list(df['structureChainId'])

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, logFreq, i):
        """Render the i-th chain and draw every sufficiently frequent mutation as a sphere."""
        pdb_id, chain_id = chainIds[i].split('.')
        # kept at module level so that view_image1() can export the most recent view
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.8}})

        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False, 'colorscheme': 'greenCarbon'}})

        mapping = df['mapping'].iloc[i]

        for row in variants.itertuples():
            # get PDB residue mapping from a UniProt residue number;
            # 0 is the "not mapped" sentinel, so PDB residue numbers <= 0 are skipped as well
            res_num = mapping.get(row.residue, 0)
            if res_num > 0 and row.scale > logFreq:
                mut_res = {'resi': res_num, 'chain': chain_id}
                viewer1.addStyle(mut_res, {'sphere':{'color': row.color, 'opacity': 1.0}})

                # the long label takes precedence when both label options are enabled
                if show_long_label:
                    label = chainIds[i] + "." + str(res_num) + ": " + row.variationId
                elif show_short_label:
                    label = row.annotation
                else:
                    label = None

                if label is not None:
                    viewer1.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, {'resi': res_num, 'chain': chain_id})

        description = df['description'].iloc[i]
        resolution = df['resolution'].iloc[i]
        coverage = df['coverage'].iloc[i]
        name = df['name'].iloc[i]

        print(name)
        print()
        print(f'PDB Id: {pdb_id}, chain Id: {chain_id}, resolution: {resolution}, sequence coverage: {coverage:.2f}')
        print(f'description: {description}')

        # print any specified additional columns from the dataframe;
        # f-string formatting also handles non-string values (numbers, lists, ...)
        for a in args:
            print(f'{a}: {df.iloc[i][a]}')

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})

        if show_surface:
            viewer1.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        return viewer1.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='structure', continuous_update=False)
    l_widget = FloatSlider(value=0.8, min=0, max=1, step=0.05, description='logFreq:',
                           continuous_update=False, orientation='horizontal', readout=True, readout_format='.2f')

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, size=z_widget, font=f_widget, logFreq=l_widget, i=s_widget)

def view_image1():
    """Return the result of png() on the module-level viewer1 object."""
    snapshot = viewer1.png()
    return snapshot

In [44]:
view_mutations(chains, variants);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [45]:
# https://stackoverflow.com/questions/32468402/how-to-explode-a-list-inside-a-dataframe-cell-into-separate-rows
import copy

def pandas_explode(df, column_to_explode):
    """
    Similar to Hive's EXPLODE function: flatten a column holding collections so that
    every element becomes its own row, with the values of all other columns repeated.

    Cells that provide an ``items()`` method (e.g. dicts) are exploded into
    (key, value) tuples; any other iterable (list, tuple, set, ...) is exploded into
    its items. The exploded column is placed last in the result, and rows whose
    collection is empty produce no output rows.

    :param df: A dataframe to explode
    :type df: pandas.DataFrame
    :param column_to_explode: Name of the column whose cells hold the collections
    :type column_to_explode: str
    :return: An exploded data frame
    :rtype: pandas.DataFrame
    """
    new_observations = []

    for row in df.to_dict(orient='records'):
        # take the collection out so that only the remaining columns are copied per element
        explode_values = row.pop(column_to_explode)

        # mappings contribute (key, value) pairs; plain iterables contribute their items
        if hasattr(explode_values, 'items'):
            elements = explode_values.items()
        else:
            elements = explode_values

        for explode_value in elements:
            # deep copy so mutable values in other columns are not shared between rows
            new_observation = copy.deepcopy(row)
            new_observation[column_to_explode] = explode_value
            new_observations.append(new_observation)

    return pd.DataFrame(new_observations)

### Expand chains into residues

In [46]:
residues = pandas_explode(chains, 'mapping')
residues['uniprotPosition'] = residues['mapping'].apply(lambda x: x[0])
residues['pdbPosition'] = residues['mapping'].apply(lambda x: x[1])
residues.drop(columns='mapping', inplace=True)

In [47]:
residues = residues.drop(columns=['c.pdbEnd', 'c.pdbStart', 'c.uniprotEnd', 'c.uniprotStart', 'p.end', 'p.start'])

In [48]:
residues.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition
0,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.A,6LXT,912,912
1,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.A,6LXT,913,913
2,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.A,6LXT,914,914
3,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.A,6LXT,915,915
4,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.A,6LXT,916,916


In [49]:
variants = variants[['residue', 'variationId', 'annotation', 'scale', 'color']]

In [50]:
residues_variants = residues.merge(variants, left_on='uniprotPosition', right_on='residue')

In [51]:
residues_variants.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color
0,0.088767,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.A,6LXT,912,912,912,S:p.912T>I(2),912T>I(2),0.04709,#ffede5
1,0.089552,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.C,6LXT,912,912,912,S:p.912T>I(2),912T>I(2),0.04709,#ffede5
2,0.084053,"Spike protein S2, Spike protein S2, TETRAETHYL...",Spike glycoprotein,2.9,6LXT.D,6LXT,912,912,912,S:p.912T>I(2),912T>I(2),0.04709,#ffede5
3,0.758837,"Spike glycoprotein, 2-acetamido-2-deoxy-beta-D...",Spike glycoprotein,3.2,6VYB.A,6VYB,912,912,912,S:p.912T>I(2),912T>I(2),0.04709,#ffede5
4,0.745483,"Spike glycoprotein, 2-acetamido-2-deoxy-beta-D...",Spike glycoprotein,3.2,6VYB.B,6VYB,912,912,912,S:p.912T>I(2),912T>I(2),0.04709,#ffede5


In [52]:
residues_variants.to_csv(output_file_name, index=False)

## Now run the next step
Map mutations occurring at protein-protein interaction sites: [2-MapToPolymerInteractions.ipynb](2-MapToPolymerInteractions.ipynb)