In [1]:
# In this activity we will download data from different sources.
# Here is a code using urllib that will attempt to directly download from the url specifed below

 # NCBI url:
url = 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&' + \
'save=file&log$=seqview&db=nuccore&report=fasta&id=1798174254&' + \
'extrafeat=null&conwithfeat=on&hide-cdd=on'
# your local downloaded file:
f = '../../data_external/SARS-CoV-2-Wuhan-NC_045512.2.fasta'


In [2]:
import os
import urllib
import shutil
if not os.path.isdir('../../data_external/'):
       os.mkdir('../../data_external/')

r = urllib.request.urlopen(url)
fo = open(f, 'wb')
shutil.copyfileobj(r, fo)
fo.close()


In [4]:
# The fle has been opened in read-only mode. The variable lines contains a list of all the
# lines of the fle. Here are the frst fve lines:

lines = open(f, 'r').readlines()
lines[0:5]



['>NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome\n',
 'ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA\n',
 'CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC\n',
 'TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG\n',
 'TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC\n']

In [5]:
# The frst line is a description of the data. The long genetic code is broken up into the
# following lines. We need to strip end-of-line characters from each such line to re-assemble
# the RNA string. Here is a way to strip off the end-of-line character:

lines[1].strip()



'ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA'

In [7]:
#The following  code uses the string operation join to put together the lines into one long string. This is the RNA of the virus.

rna = ''.join([line.strip() for line in lines[1:]])

# The frst thousand characters and the last thousand characters of the RNA of the coronavirus are printed below:

rna[:1000]
rna[-1000:]





'GCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCTTTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAATTTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

In [8]:
# Here is the total length of the RNA:
len(rna)

29903

In [9]:
# Finding a protein.
#The article then gives the ORF7a sequence, which I have copied and pasted into the next
#cell, adding some string breaks. Note how the article has used lower case characters and
#the character u instead of T.
orf7a = 'augaaaauuauucuuuucuuggcacugauaacacucgcuacuugugagcuuuaucacuaccaagaguguguuagagguacaacaguacuuuuaaaagaaccuugcucuucuggaacauacgagggcaauucaccauuucauccucuagcugauaacaaauuugcacugacuugcuuuagcacucaauuugcuuuugcuuguccugacggcguaaaacacgucuaucaguuacgugccagaucaguuucaccuaaacuguucaucagacaagaggaaguucaagaacuuuacucuccaauuuuucuuauuguugcggcaauaguguuuauaacacuuugcuucacacucaaaagaaagacagaaugauugaacuuucauuaauugacuucuauuugugcuuuuuagccuuucugcuauuccuuguuuuaauuaugcuuauuaucuuuugguucucacuugaacugcaagaucauaaugaaacuugucacgccuaaacgaac'
# The next task in this class activity is to fnd if this sequence occurs in the RNA we just
#downloaded, and if it does, where it occurs. To this end, we frst make the replacements
#required to read the string in terms of A, T, G, and C.
s=orf7a.replace('u', 'T').replace('a', 'A').replace('g', 'G').replace('c','C')
s

'ATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAAC'

In [10]:
# The next step is now a triviality in view of python’s exceptional string handling mechanisms:
s in rna



True

In [11]:
# We may also easily fnd the location of the ORF7a sequence and read off the entire string
# beginning with the sequence.
rna.find(s)


27393

In [12]:
rna[27393:]

'ATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACG

In [13]:
#Nucleotide frequencies
# The next task in this activity is to make a python dictionary, called freq, whose keys are
#the nucleotide characters and whose values are the number of times it occurs in the virus
#RNA. Once you have made it, freq['A'], for example, should output the frequency of
#nucleotide A.
freq = {b: rna.count(b)/len(rna) for b in 'ATGC'}
freq

{'A': 0.29943483931378123,
 'T': 0.32083737417650404,
 'G': 0.19606728421897468,
 'C': 0.18366050229074005}

In [15]:
#A Washington sample
#A more recent dataset at NCBI, apparently just submitted for peer-review on April 3,
#claims to contain the genome of a virus sample from our neighboring state of Washington. You can fnd it labeled there as the data set MT293201. Let us take a look.
url2 = 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?' + \
'tool=portal&save=file&log$=seqview&db=nuccore&report=fasta&' + \
'id=1828694245&extrafeat=null&conwithfeat=on&hide-cdd=on'
f2 = '../../data_external/SARS-CoV-2-Washington_MT293201.1.fasta'

r2 = urllib.request.urlopen(url2)
fo2 = open(f2, 'wb')
shutil.copyfileobj(r2, fo2)




In [16]:
#Is this the same genetic code as from the Wuhan sample? Let’s repeat the previous procedure on this new fle to make a string object that contains the RNA from the Washington
#sample. We shall call it rna2 below.
lines = open(f2, 'r').readlines()
rna2 = ''.join([line.strip() for line in lines[1:]])

#  what are the distinct characters in the new rna2? There can be
# very simply done in python if you use the set data structure, which removes duplicates.
set(rna2)


{'A', 'C', 'G', 'T'}

In [17]:
# The next natural question might be this. Are the lengths of rna and rna2 the same?
rna2[:30], rna2[-30:]


('AACCTTTAAACTTTCGATCTCTTGTAGATC', 'TTTAATAGCTTCTTAGGAGAATGACAAAAA')

In [18]:
rna[:30], rna[-30:]

('ATTAAAGGTTTATACCTTCCCAGGTAACAA', 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA')

In [None]:
# Clearly, rna and rna2 are different strings.

In [20]:
#Compare their nucleotide frequencies
freq2 = {b: rna2.count(b)/len(rna2) for b in 'ATGC'}
freq2

{'A': 0.29866648797158746,
 'T': 0.3214166052402332,
 'G': 0.1963077129263553,
 'C': 0.18360919386182403}

In [21]:
#Although the Washington genome is not identical to the Wuhan one, their nucleotide frequencies are very close to the Wuhan one, reproduced her
freq

{'A': 0.29943483931378123,
 'T': 0.32083737417650404,
 'G': 0.19606728421897468,
 'C': 0.18366050229074005}

In [22]:
#Does it contain ORF7a?

s in rna2

True

In [23]:
rna2.find(s)

27364

In [24]:
#This activity provided you with just a glimpse into the large feld of bioinformatics, which
#studies, among other things, patterns of nucleotide arrangements. If you are interested in
#this feld, you should take a look at Biopython, a bioinformatics python package.