# Sequences of genomes that are similar to 2019-nCoV / SARS-CoV-2

* Using code adapted from https://dmnfarrell.github.io/bioinformatics/bokeh-sequence-aligner
* and https://github.com/dmnfarrell/teaching/blob/master/pyviz/bokeh_sequence_align.ipynb

### According to Wu 2020, Chen 2020, and Zhou 2020, SARS-CoV-2 should: 

* be represented in GenBank MN908947, MN988668 and MN988669
* have a nucleotide identity of 89.1% with a bat SARS-like coronavirus (CoV) isolate — bat SL-CoVZC45 (GenBank accession number MG772933)
* share 79.5% sequence identity to SARS-CoV BJ01 (GenBank accession number AY278488)

### Let's explore! 

*Step 1: Import Python modules and define helper functions*

In [None]:
!pip install panel

import numpy as np
import pandas as pd
pd.options.display.width=200
import io
import random
import string
import panel as pn
import panel.widgets as pnw
pn.extension()
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.models.glyphs import Text, Rect
from bokeh.layouts import gridplot
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment 
from Bio import SeqIO

def get_colors(seqs):
    """Make colors for bases in sequence.

    Args:
        seqs: iterable of sequences (plain strings or objects that yield
            one character per iteration step).

    Returns:
        A flat list with one color name per character, ordered sequence by
        sequence (all characters of the first sequence, then the second, ...).

    Bases are matched case-insensitively. Any character other than
    A, T, G, C or '-' (for example N or another ambiguity code) is colored
    'gray' instead of raising KeyError.
    """
    clrs = {'A': 'red', 'T': 'green', 'G': 'orange', 'C': 'blue', '-': 'white'}
    fallback = 'gray'
    return [clrs.get(str(base).upper(), fallback) for s in seqs for base in s]

def view_alignment(aln, fontsize="9pt", plot_width=800):
    """Build a two-panel Bokeh view of a sequence alignment.

    The upper panel is a 50-pixel-high overview strip of colored rectangles
    spanning the whole alignment (x pan / x wheel zoom). The lower panel draws
    each character over the same colored rectangles and initially shows at
    most the first 100 columns (x pan only).

    Args:
        aln: iterable of records exposing ``.seq`` and ``.id``.
        fontsize: font size used for the characters in the lower panel.
        plot_width: width given to both figures.

    Returns:
        The gridplot layout with the overview stacked above the text view.
    """
    # Pull sequences and row labels out of the alignment records.
    sequences = [record.seq for record in aln]
    labels = [record.id for record in aln]
    letters = [ch for seq in sequences for ch in seq]
    fill_colors = get_colors(sequences)

    n_cols = len(sequences[0])
    n_rows = len(sequences)

    # One (x, y) coordinate per character: columns are 1-based, rows 0-based.
    col_grid, row_grid = np.meshgrid(np.arange(1, n_cols + 1),
                                     np.arange(0, n_rows, 1))
    gx = col_grid.ravel()
    gy = row_grid.flatten()
    # Rectangles are centered half a unit above the text baseline coordinate.
    rect_y = gy + .5

    source = ColumnDataSource(dict(x=gx, y=gy, recty=rect_y,
                                   text=letters, colors=fill_colors))

    # 15 px per sequence plus 50 px of padding for the text view.
    detail_height = len(sequences) * 15 + 50
    full_range = Range1d(0, n_cols + 1, bounds='auto')
    # The close-up view starts on at most the first 100 columns.
    detail_range = (0, min(n_cols, 100))
    overview_tools = "xpan, xwheel_zoom, reset, save"

    # Overview of the entire alignment: colored blocks only, zoomable.
    overview = figure(title=None, plot_width=plot_width, plot_height=50,
                      x_range=full_range, y_range=(0, n_rows),
                      tools=overview_tools, min_border=0,
                      toolbar_location='below')
    overview.add_glyph(
        source,
        Rect(x="x", y="recty", width=1, height=1, fill_color="colors",
             line_color=None, fill_alpha=0.6),
    )
    overview.yaxis.visible = False
    overview.grid.visible = False

    # Close-up view: sequence text with lighter colored blocks, x-scrollable.
    detail = figure(title=None, plot_width=plot_width,
                    plot_height=detail_height, x_range=detail_range,
                    y_range=labels, tools="xpan,reset", min_border=0,
                    toolbar_location='below')
    char_glyph = Text(x="x", y="y", text="text", text_align='center',
                      text_color="black", text_font="monospace",
                      text_font_size=fontsize)
    block_glyph = Rect(x="x", y="recty", width=1, height=1,
                       fill_color="colors", line_color=None, fill_alpha=0.4)
    detail.add_glyph(source, char_glyph)
    detail.add_glyph(source, block_glyph)

    detail.grid.visible = False
    detail.xaxis.major_label_text_font_style = "bold"
    detail.yaxis.minor_tick_line_width = 0
    detail.yaxis.major_tick_line_width = 0

    return gridplot([[overview], [detail]], toolbar_location='below')

*Step 2: Preview the data*

In [None]:
# Read the sequence metadata table from the Kaggle input directory.
csv = '/kaggle/input/repository-of-coronavirus-genomes/coronavirus_genomes/ncbi_collections/Coronavirus/coronavirus_sequences_table.csv'
df = pd.read_csv(csv)
# Convert the release dates to pandas datetimes.
df['Release_Date'] = pd.to_datetime(df['Release_Date'])
# Show a handful of descriptive columns for the last 20 rows of the table.
df.tail(20)[['Collection_Date','Host','Genus','Family','Species','Accession']]

In [None]:
# Print the ten most frequent values of the Host and Species columns.
print('Count of Host Occurences: \n', df['Host'].value_counts().head(10))
print('\n\nCount of Virus Occurences: \n', df['Species'].value_counts().head(10))

In [None]:
# Peek at the raw text file for MN908947: read the first 500 characters and
# print everything from character offset 95 onward.
# NOTE(review): the 95-character skip presumably drops a header at the top of
# the file -- confirm against the file contents.
sequence = '../input/coronavirus-genome-sequence/MN908947.txt'
with open(sequence) as text: 
    print('MN908947:')
    print(text.read(500)[95:])

In [None]:
# Display MN908947: parse the FASTA file with AlignIO.read, build the Bokeh
# view with view_alignment at a plot width of 900, and wrap it in a Panel
# Bokeh pane as the cell's final expression.
print('MN908947: ')
fna = '/kaggle/input/repository-of-coronavirus-genomes/coronavirus_genomes/kaggle_collection/MN908947.fna'
alignment = AlignIO.read(fna,'fasta')
p = view_alignment(alignment, plot_width=900)
pn.pane.Bokeh(p)

In [None]:
# Display MN988668: parse the FASTA file with AlignIO.read, build the Bokeh
# view with view_alignment at a plot width of 900, and wrap it in a Panel
# Bokeh pane as the cell's final expression.
print('MN988668: ')
fna = '/kaggle/input/repository-of-coronavirus-genomes/coronavirus_genomes/kaggle_collection/MN988668.fna'
alignment = AlignIO.read(fna,'fasta')
p = view_alignment(alignment, plot_width=900)
pn.pane.Bokeh(p)

In [None]:
# Display MG772933: parse the FASTA file with AlignIO.read, build the Bokeh
# view with view_alignment at a plot width of 900, and wrap it in a Panel
# Bokeh pane as the cell's final expression.
print('MG772933: ')
fna = '/kaggle/input/repository-of-coronavirus-genomes/coronavirus_genomes/kaggle_collection/MG772933.fna'
alignment = AlignIO.read(fna,'fasta')
p = view_alignment(alignment, plot_width=900)
pn.pane.Bokeh(p)