# Maps SARS-CoV-2 Mutations to 3D Protein Structures
[Work in progress]

This notebook maps the mutation frequency of SARS-CoV-2 strains onto 3D protein structures in the [Protein Data Bank](https://www.wwpdb.org/). SARS-CoV-2 strains and their mutations have been aggregated in the [COVID-19-Net Knowledge Graph](https://github.com/covid-19-net/covid-19-community).

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import matplotlib.cm as cm
import ipywidgets as widgets
from ipywidgets import interact, IntSlider, FloatSlider, SelectMultiple
from py2neo import Graph
import py3Dmol

In [2]:
# Show dataframes in full: no truncation of rows or columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# CSV file that receives the mutations mapped to 3D protein structures
output_file_name = "mutations3d.csv"

#### Connect to COVID-19-Community Knowledge Graph
[COVID-19-Net Knowledge Graph](https://github.com/covid-19-net/covid-19-community)

In [4]:
# Open the Bolt connection to the Knowledge Graph server.
# NOTE(review): "reader"/"demo" look like shared read-only demo credentials -- confirm before reuse.
graph = Graph(
    "bolt://132.249.238.185:7687",
    user="reader",
    password="demo",
)

### Get list of SARS-CoV-2 proteins

In [5]:
# Genbank reference sequence
reference_genome = "ncbiprotein:NC_045512"

In [6]:
# Proteins encoded by genes of the reference strain that have a 3D structure,
# paired with the description of each structure.
query = """
MATCH (r:Strain{id: $reference_genome})-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
      -[:HAS_TERTIARY_STRUCTURE]->(:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
WHERE (g.end - g.start) < 10000 // exclude polyproteins
RETURN p.name AS protein, s.description AS description
ORDER BY protein
"""
structures = (
    graph
    .run(query, reference_genome=reference_genome)
    .to_data_frame()
)

In [7]:
# Distinct protein names, in order of first appearance
proteins = pd.unique(structures['protein'])

In [8]:
# Drop-down for choosing the protein to analyze; preselects the Spike glycoprotein
protein_widget = widgets.Dropdown(
    options=proteins,
    value='Spike glycoprotein',
    description='Select protein:',
    style={'description_width': 'initial'},
)

#### Select SARS-CoV-2 Protein

In [10]:
# Render the protein drop-down; its current value is read via protein_widget.value
display(protein_widget)

Dropdown(description='Select protein:', index=11, options=("2'-O-methyltransferase", '3C-like proteinase', 'Ho…

In [11]:
# Capture the current drop-down selection and echo it
protein_name = protein_widget.value
print(f'Protein name: {protein_name}')

Protein name: Spike glycoprotein


In [12]:
# Structures of the selected protein. A boolean mask is used instead of a string-built
# DataFrame.query expression, which would break for protein names containing a double quote.
subset = structures[structures['protein'] == protein_name]

In [13]:
# Sorted distinct structure descriptions, preceded by the catch-all entry 'All'
descriptions = ['All'] + sorted(subset['description'].unique())

In [14]:
# Multi-select list of structure descriptions; every option is visible and 'All' is preselected
description_widget = SelectMultiple(
    options=descriptions,
    value=['All'],
    description='Select subset',
    rows=len(descriptions),
    disabled=False,
    layout={'width': 'max-content'},
    style={'description_width': 'initial'},
)

#### Select a subset of structures to be analyzed
The default is All structures. Multiple values can be selected with shift and/or ctrl (or command) pressed and mouse clicks or arrow keys.

In [16]:
# Render the subset selector; the selection is read via description_widget.value
display(description_widget)

SelectMultiple(description='Select subset', index=(17, 22), layout=Layout(width='max-content'), options=('All'…

In [17]:
# Echo the selected descriptions, one per line
print("Selected subset:", *description_widget.value, sep="\n")

Selected subset:
Spike glycoprotein, Nanobody H11-D4
Synthetic nanobody MR17, Spike glycoprotein


### Get total number of strains

In [18]:
# Cypher: count the Strain nodes whose host taxonomy id is 'taxonomy:9606'
# (reported as "human strains" when the result is printed).
query = """
MATCH (s:Strain)
WHERE s.hostTaxonomyId = 'taxonomy:9606'
RETURN count(s)
"""

In [19]:
# Run the count query. Despite its name, `strains` holds the number of strains
# (it is printed as a total and used as the log base for the colour scale).
strains = graph.evaluate(query)

In [20]:
print(f'Total number of human strains: {strains}')

Total number of human strains: 74801


### Get variants for selected protein

In [21]:
# Cypher: for the selected protein, gather the missense variants found in strains with
# host 'taxonomy:9606'. Each distinct variant is counted and ordered by count (descending),
# then the result is grouped per residue position:
#   variationId - list of '<proteinVariant>(<count>)'
#   annotation  - list of '<part of proteinVariant after ":">(<count>)'
#   count       - sum of the per-variant counts at that residue
query = """
MATCH (p:Protein{name: $protein_name})-[:HAS_VARIANT]->(v:Variant{variantConsequence:'missense_variant'})<-[:HAS_VARIANT]-(s:Strain)
WHERE s.hostTaxonomyId = 'taxonomy:9606'
WITH v.proteinPosition AS residue, count(v.proteinVariant) AS count, 
     v.proteinVariant + '(' + count(v.proteinVariant) + ')' AS variationId ,
     split(v.proteinVariant, ':')[1] + '(' + count(v.proteinVariant) + ')' AS annotation
     ORDER by count DESC
WITH residue, count, variationId, annotation
RETURN residue, collect(variationId) AS variationId, collect(annotation) AS annotation, sum(count) AS count ORDER BY residue
"""

#### Add mutation annotation to each residue

In [22]:
# Fetch the per-residue variants of the selected protein as a dataframe
variants = (
    graph
    .run(query, protein_name=protein_name)
    .to_data_frame()
)

In [23]:
# Collapse the per-residue lists into comma-separated strings.
# The isinstance guard keeps the cell idempotent: without it, re-running the cell would
# join the already joined string character by character ("a, b" -> "a, ,,  , b").
for column in ('variationId', 'annotation'):
    variants[column] = variants[column].apply(
        lambda x: ', '.join(x) if isinstance(x, (list, tuple)) else x
    )

In [24]:
# Strip the literal 'p.' prefix from the annotations.
# regex=False is explicit because the default changed between pandas versions: with
# regex=True the pattern 'p.' means "p followed by ANY character", not a literal dot.
variants['annotation'] = variants['annotation'].str.replace('p.', '', regex=False)

In [25]:
# Preview the first rows of the per-residue variant table
variants.head()

Unnamed: 0,annotation,count,residue,variationId
0,"3V>G(2), 3V>F(1)",3,3,"QHD43416.1:p.3V>G(2), QHD43416.1:p.3V>F(1)"
1,4F>S(1),1,4,QHD43416.1:p.4F>S(1)
2,"5L>F(326), 5L>I(2)",328,5,"QHD43416.1:p.5L>F(326), QHD43416.1:p.5L>I(2)"
3,"6V>F(48), 6V>I(1)",49,6,"QHD43416.1:p.6V>F(48), QHD43416.1:p.6V>I(1)"
4,7L>V(2),2,7,QHD43416.1:p.7L>V(2)


#### Create a color scale based on the log mutation frequency

In [26]:
# Normalized log frequency: log(count) / log(number of strains); 0.0 for a single observation
variants['scale'] = np.log(variants['count']) / math.log(strains)

In [27]:
# Lookup table of n_colors hex colour strings sampled evenly from the 'Reds' colormap
n_colors = 100
colors = cm.Reds(np.linspace(0.0, 1.0, n_colors))
col = np.array([matplotlib.colors.rgb2hex(rgba) for rgba in colors], dtype=object)

In [28]:
# Map the normalized log frequency [0, 1] onto the colour table.
# The index is clamped to the valid range: round(x * n_colors) reaches n_colors itself
# for scale >= 0.995, which would otherwise raise an IndexError.
variants['color'] = variants['scale'].apply(
    lambda x: col[min(max(int(round(x * n_colors)), 0), n_colors - 1)]
)

In [29]:
# Preview the variant table with the added 'scale' and 'color' columns
variants.head()

Unnamed: 0,annotation,count,residue,variationId,scale,color
0,"3V>G(2), 3V>F(1)",3,3,"QHD43416.1:p.3V>G(2), QHD43416.1:p.3V>F(1)",0.097893,#fee5d8
1,4F>S(1),1,4,QHD43416.1:p.4F>S(1),0.0,#fff5f0
2,"5L>F(326), 5L>I(2)",328,5,"QHD43416.1:p.5L>F(326), QHD43416.1:p.5L>I(2)",0.516192,#f96044
3,"6V>F(48), 6V>I(1)",49,6,"QHD43416.1:p.6V>F(48), QHD43416.1:p.6V>I(1)",0.346785,#fc997a
4,7L>V(2),2,7,QHD43416.1:p.7L>V(2),0.061764,#ffebe2


### Get PDB structures for selected protein

In [30]:
# Cypher: chains (and their parent structures) linked to the selected protein through
# HAS_TERTIARY_STRUCTURE, returning the residue ranges of the chain (c.uniprotStart/End,
# c.pdbStart/End), the structure resolution and description, and the coverage stored on
# the relationship. Rows are ordered by resolution, then by coverage (descending).
query = """
MATCH (p:Protein{name: $protein_name})-[h:HAS_TERTIARY_STRUCTURE]->(c:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
RETURN p.name AS name, p.start, p.end, c.name AS structureChainId, c.uniprotStart, c.uniprotEnd, c.pdbStart, c.pdbEnd, s.resolution AS resolution, s.description AS description, h.coverage AS coverage
ORDER BY resolution, coverage DESC
"""

In [31]:
# Fetch the chains of the selected protein as a dataframe
chains = (
    graph
    .run(query, protein_name=protein_name)
    .to_data_frame()
)

In [32]:
# Restrict the chains to the selected structure descriptions.
# - An empty selection, or any selection that includes 'All', keeps every chain
#   (indexing value[0] on an empty selection would raise IndexError).
# - isin() replaces a query string built from the tuple's repr.
# - .copy() makes the subset an independent frame so that the column assignments in the
#   following cells do not trigger SettingWithCopyWarning.
selected_descriptions = description_widget.value
if selected_descriptions and 'All' not in selected_descriptions:
    print("Selecting subset:", selected_descriptions)
    chains = chains[chains['description'].isin(selected_descriptions)].copy()
else:
    print('all')

Spike glycoprotein, Nanobody H11-D4
Selecting subset: ('Spike glycoprotein, Nanobody H11-D4', 'Synthetic nanobody MR17, Spike glycoprotein')


In [33]:
# Drop the first four characters of the chain name (presumably a fixed-width prefix such
# as 'pdb:' -- TODO confirm). NOTE: re-running this cell strips four more characters.
chains['structureChainId'] = chains['structureChainId'].str.slice(start=4)

In [34]:
# The structure id is the first four characters of the '{structureId}.{chainId}' identifier
chains['structureId'] = chains['structureChainId'].str.slice(stop=4)

In [35]:
# Preview the chains that will be analyzed
chains.head()

Unnamed: 0,c.pdbEnd,c.pdbStart,c.uniprotEnd,c.uniprotStart,coverage,description,name,p.end,p.start,resolution,structureChainId,structureId
0,[528],[334],[528],[334],0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1273,13,1.8,6YZ5.E,6YZ5
1,[528],[334],[528],[334],0.153181,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1273,1,1.8,6YZ5.E,6YZ5
26,[527],[333],[527],[333],0.154639,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,1273,13,2.77,7C8W.B,7C8W
27,[527],[333],[527],[333],0.153181,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,1273,1,2.77,7C8W.B,7C8W


In [36]:
# One row per chain (the first occurrence is kept), ordered by chain identifier
chains = (
    chains
    .drop_duplicates(subset=['structureChainId'])
    .sort_values(by=['structureChainId'])
)

#### Map uniprot residue numbers to PDB residue numbers

In [37]:
def uniprot_to_pdb_mapping(row):
    """
    Build a lookup from UniProt residue numbers to PDB residue numbers for one chain.

    The row holds parallel lists of segment boundaries. Each segment contributes a constant
    offset mapping; a segment whose UniProt and PDB lengths differ is reported and skipped.

    :param row: mapping-like row with the keys 'c.uniprotStart', 'c.uniprotEnd', 'c.pdbStart',
                'c.pdbEnd' (parallel sequences of int-convertible values) and 'structureChainId'
    :return: dict {UniProt residue number: PDB residue number}
    :rtype: dict
    """
    mapping = dict()
    segments = zip(row['c.uniprotStart'], row['c.uniprotEnd'], row['c.pdbStart'], row['c.pdbEnd'])
    for us, ue, ps, pe in segments:
        # Coerce all four bounds, not only the PDB ones, so that string-valued
        # UniProt bounds are handled the same way instead of failing in the arithmetic.
        us, ue, ps, pe = int(us), int(ue), int(ps), int(pe)
        if ue - us != pe - ps:
            print('length mismatch:', row['structureChainId'], ue-us, pe-ps)
            continue
        offset = ps - us
        mapping.update((v, v + offset) for v in range(us, ue + 1))

    return mapping

In [38]:
# Attach the UniProt -> PDB residue lookup to every chain row
chains['mapping'] = chains.apply(uniprot_to_pdb_mapping, axis=1)

### Visualize mutation sites

Mutations are mapped onto protein chains for available 3D protein structures.

Display options:

|||
|:-|:-|
| *show_bio_assembly* | Toggle display of the biologically relevant quaternary structure |
| *show_surface* | Toggle surface for protein chain |
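| *show_short_label* | Toggle short mutation labels<br>{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 614D>G(58984), 614D>N(6) |
| *show_long_label* | Toggle long mutation labels (used instead of the short label when both are checked)<br>{PDBId}.{chainId}.{PDBResidue}: {proteinAccession}:p.{UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 6Z43.A.614: QHD43416.1:p.614D>G(58984), QHD43416.1:p.614D>N(6) |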
| *size* | Change size of visualization |
| *font* | Change font size of annotations |
| *logFreq* | Change minimum threshold to display mutations based on normalized log of mutation frequency [0.0 - 1.0]|
| *structure* | Move slider to browse through available structures |

In [39]:
# Setup viewer
def view_mutations(df, variants, *args):
    """
    Create an interactive py3Dmol view of mutation sites mapped onto protein chains.

    :param df: one row per chain; the columns 'structureChainId' ('{PDBId}.{chainId}'),
               'mapping' (dict: UniProt residue number -> PDB residue number),
               'description', 'resolution', 'coverage' and 'name' are read
    :type df: pandas.DataFrame
    :param variants: one row per residue; the columns 'residue', 'scale', 'color',
                     'annotation' and 'variationId' are read
    :type variants: pandas.DataFrame
    :param args: names of additional columns of df to print for the displayed chain
    :return: the value returned by ipywidgets.interact for the viewer callback
    """
    chainIds = list(df['structureChainId'])

    def view3d(show_bio_assembly, show_surface, show_short_label, show_long_label, size, font, logFreq, i):
        pdb_id, chain_id = chainIds[i].split('.')
        # kept as a module-level global so that view_image1() can reach the latest viewer
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.8}})

        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False, 'colorscheme': 'greenCarbon'}})

        mapping = df['mapping'].iloc[i]

        for row in variants.itertuples():
            # Get the PDB residue number for a UniProt residue number. None (rather than 0)
            # marks "not part of this chain": PDB residue numbers may be zero or negative,
            # so a numeric sentinel would silently hide such residues.
            res_num = mapping.get(row.residue)
            if res_num is None or not row.scale > logFreq:
                continue

            mut_res = {'resi': res_num, 'chain': chain_id}
            viewer1.addStyle(mut_res, {'sphere':{'color': row.color, 'opacity': 1.0}})

            # the long label takes precedence when both label options are enabled
            if show_long_label:
                label = chainIds[i] + "." + str(res_num) + ": " + row.variationId
            elif show_short_label:
                label = row.annotation
            else:
                label = None

            if label is not None:
                viewer1.addLabel(label, {'fontSize':font,'fontColor': 'black','backgroundColor':'ivory', 'opacity': 1.0}, {'resi': res_num, 'chain': chain_id})

        description = df['description'].iloc[i]
        resolution = df['resolution'].iloc[i]
        coverage = df['coverage'].iloc[i]
        name = df['name'].iloc[i]

        print(name)
        print()
        print(f'PDB Id: {pdb_id}, chain Id: {chain_id}, resolution: {resolution}, sequence coverage: {coverage:.2f}')
        print(f'description: {description}')

        # print any specified additional columns from the dataframe
        # (formatted, so that non-string values such as numbers do not raise a TypeError)
        for a in args:
            print(f'{a}: {df.iloc[i][a]}')

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})

        if show_surface:
            viewer1.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        return viewer1.show()

    f_widget = IntSlider(value=9, min=5, max=20, description='font size', continuous_update=False)
    z_widget = IntSlider(value=750, min=500, max=1200, description='size', continuous_update=False)
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='structure', continuous_update=False)
    l_widget = FloatSlider(value=0.5, min=0, max=1, step=0.05, description='logFreq:',
                           continuous_update=False, orientation='horizontal', readout=True, readout_format='.2f')

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_short_label=True, show_long_label=False, size=z_widget, font=f_widget, logFreq=l_widget, i=s_widget)

def view_image1():
    """
    Return ``viewer1.png()`` for the viewer most recently created by ``view_mutations``.

    Relies on the module-level ``viewer1`` that the interactive viewer callback assigns;
    calling this before a structure has been displayed raises NameError.
    """
    return viewer1.png()

In [40]:
# Launch the interactive viewer; the trailing semicolon suppresses the echo of the return value
view_mutations(chains, variants);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…

In [41]:
# https://stackoverflow.com/questions/32468402/how-to-explode-a-list-inside-a-dataframe-cell-into-separate-rows
import copy

def pandas_explode(df, column_to_explode):
    """
    Similar to Hive's EXPLODE function: flatten a column of containers into one row per element.

    Dict-like cells (anything exposing ``.items()``) contribute one ``(key, value)`` tuple per
    entry; any other iterable cell contributes one row per element. The values of all other
    columns are repeated for every generated row.

    :param df: A dataframe to explode
    :type df: pandas.DataFrame
    :param column_to_explode: Name of the column that holds the containers
    :type column_to_explode: str
    :return: An exploded data frame
    :rtype: pandas.DataFrame
    """
    new_observations = list()

    for row in df.to_dict(orient='records'):
        # Take the container out of the record; the remainder is the per-row template
        explode_values = row.pop(column_to_explode)

        # Dict-like containers are exploded into (key, value) pairs
        if hasattr(explode_values, 'items'):
            explode_values = explode_values.items()

        for explode_value in explode_values:
            # Deep copy so that generated rows never share mutable cell values
            new_observation = copy.deepcopy(row)
            new_observation[column_to_explode] = explode_value
            new_observations.append(new_observation)

    return pd.DataFrame(new_observations)

### Expand chains into residues

In [42]:
# One row per mapped residue: split each (UniProt, PDB) pair into two columns
residues = (
    pandas_explode(chains, 'mapping')
    .assign(
        uniprotPosition=lambda frame: frame['mapping'].map(lambda pair: pair[0]),
        pdbPosition=lambda frame: frame['mapping'].map(lambda pair: pair[1]),
    )
    .drop(columns='mapping')
)

In [43]:
# The segment and protein range columns are no longer needed at residue level
residues = residues.drop(
    ['c.pdbEnd', 'c.pdbStart', 'c.uniprotEnd', 'c.uniprotStart', 'p.end', 'p.start'],
    axis='columns',
)

In [44]:
# Preview the first ten residue rows
residues.head(10)

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition
0,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,334,334
1,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,335,335
2,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,336,336
3,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,337,337
4,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,338,338
5,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,339,339
6,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,340,340
7,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,341,341
8,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,342,342
9,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,343,343


In [45]:
# Keep only the columns that are written to the output file (drops 'count')
variants = variants.loc[:, ['residue', 'variationId', 'annotation', 'scale', 'color']]

In [46]:
# Join each structure residue with the variants observed at its UniProt position
residues_variants = pd.merge(
    residues,
    variants,
    left_on='uniprotPosition',
    right_on='residue',
)

In [47]:
# Preview the residues joined with their variants
residues_variants.head()

Unnamed: 0,coverage,description,name,resolution,structureChainId,structureId,uniprotPosition,pdbPosition,residue,variationId,annotation,scale,color
0,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,336,336,336,QHD43416.1:p.336C>R(2),336C>R(2),0.061764,#ffebe2
1,0.154639,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,2.77,7C8W.B,7C8W,336,336,336,QHD43416.1:p.336C>R(2),336C>R(2),0.061764,#ffebe2
2,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,337,337,337,"QHD43416.1:p.337P>R(4), QHD43416.1:p.337P>S(2)","337P>R(4), 337P>S(2)",0.159657,#fdd5c4
3,0.154639,"Synthetic nanobody MR17, Spike glycoprotein",Spike glycoprotein,2.77,7C8W.B,7C8W,337,337,337,"QHD43416.1:p.337P>R(4), QHD43416.1:p.337P>S(2)","337P>R(4), 337P>S(2)",0.159657,#fdd5c4
4,0.154639,"Spike glycoprotein, Nanobody H11-D4",Spike glycoprotein,1.8,6YZ5.E,6YZ5,338,338,338,QHD43416.1:p.338F>L(4),338F>L(4),0.123527,#fee1d3


In [48]:
# Write the mapped mutations to the output CSV file, without the index column
residues_variants.to_csv(path_or_buf=output_file_name, index=False)

## Now run the next step
Map mutations occurring at protein-protein interaction sites: [2-MapToPolymerInteractions.ipynb](2-MapToPolymerInteractions.ipynb)