In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/coronavirus-genome-sequence/MN908947.fna
/kaggle/input/coronavirus-genome-sequence/A_new_coronavirus_associated_with_human_respirator.pdf
/kaggle/input/coronavirus-genome-sequence/MN908947.txt


In [2]:
import Bio
from Bio.SeqIO import parse,read , to_dict


### Parsing the Genome Sequence to get a generator Object

In [3]:
# Getting a generator object.
cor_seq = [cor_seq for cor_seq in parse("/kaggle/input/coronavirus-genome-sequence/MN908947.fna","fasta")]
# Converting this sequence into a dictionary.
cor_seq_dict = to_dict(cor_seq)

In [4]:
print(cor_seq_dict)

{'MN908947.3': SeqRecord(seq=Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA', SingleLetterAlphabet()), id='MN908947.3', name='MN908947.3', description='MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome', dbxrefs=[])}


### Reading the Genome Sequence

In [5]:
cor = read("/kaggle/input/coronavirus-genome-sequence/MN908947.fna","fasta")
print(cor.seq)

ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCT

In [6]:
# As we can see the T's in the Sequence we can predict that it is either a DNA or a Protein. By the information of
# data we can say that it cannot be a protein sequence.                                                    

At last of this sequence we can see there are multiple A's , this is due to the Polymerase chain Reaction.

### Taking complement of the sequence

In [7]:
# Taking the compliment.
cor_com = cor_seq[0].seq.complement()
repr(cor_com)

"Seq('TAATTTCCAAATATGGAAGGGTCCATTGTTTGGTTGGTTGAAAGCTAGAGAACA...TTT', SingleLetterAlphabet())"

### Transcribing the Sequence to get a RNA Sequence

In [8]:
# Transcribing the sequence
cor_rna = cor_seq[0].seq.transcribe()
print(cor_rna)

AUUAAAGGUUUAUACCUUCCCAGGUAACAAACCAACCAACUUUCGAUCUCUUGUAGAUCUGUUCUCUAAACGAACUUUAAAAUCUGUGUGGCUGUCACUCGGCUGCAUGCUUAGUGCACUCACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGAGUAACUCGUCUAUCUUCUGCAGGCUGCUUACGGUUUCGUCCGUGUUGCAGCCGAUCAUCAGCACAUCUAGGUUUCGUCCGGGUGUGACCGAAAGGUAAGAUGGAGAGCCUUGUCCCUGGUUUCAACGAGAAAACACACGUCCAACUCAGUUUGCCUGUUUUACAGGUUCGCGACGUGCUCGUACGUGGCUUUGGAGACUCCGUGGAGGAGGUCUUAUCAGAGGCACGUCAACAUCUUAAAGAUGGCACUUGUGGCUUAGUAGAAGUUGAAAAAGGCGUUUUGCCUCAACUUGAACAGCCCUAUGUGUUCAUCAAACGUUCGGAUGCUCGAACUGCACCUCAUGGUCAUGUUAUGGUUGAGCUGGUAGCAGAACUCGAAGGCAUUCAGUACGGUCGUAGUGGUGAGACACUUGGUGUCCUUGUCCCUCAUGUGGGCGAAAUACCAGUGGCUUACCGCAAGGUUCUUCUUCGUAAGAACGGUAAUAAAGGAGCUGGUGGCCAUAGUUACGGCGCCGAUCUAAAGUCAUUUGACUUAGGCGACGAGCUUGGCACUGAUCCUUAUGAAGAUUUUCAAGAAAACUGGAACACUAAACAUAGCAGUGGUGUUACCCGUGAACUCAUGCGUGAGCUUAACGGAGGGGCAUACACUCGCUAUGUCGAUAACAACUUCUGUGGCCCUGAUGGCUACCCUCUUGAGUGCAUUAAAGACCUUCUAGCACGUGCUGGUAAAGCUUCAUGCACUUUGUCCGAACAACUGGACUUUAUUGACACUAAGAGGGGUGUAUACUGCUGCCGUGAACAUGAGCAUGAAAUUGCUUGGUACACGGAACGUUCU

### Translating the Sequence to Protein

Translating the 
1. DNA Sequence 
2. RNA Sequence<br>

Both to Protein Sequence

In [9]:
# Translation
# 1.
protein = cor_seq[0].seq.translate()
print(protein)
# 2.
pr = cor_rna.translate()
pr

IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER*DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS*RWHLWLSRS*KRRFAST*TALCVHQTFGCSNCTSWSCYG*AGSRTRRHSVRS*W*DTWCPCPSCGRNTSGLPQGSSS*ER**RSWWP*LRRRSKVI*LRRRAWH*SL*RFSRKLEH*T*QWCYP*THA*A*RRGIHSLCR*QLLWP*WLPS*VH*RPSSTCW*SFMHFVRTTGLY*H*EGCILLP*T*A*NCLVHGTF*KEL*IADTF*N*IGKEI*HLQWGMSKFCISLKFHNQDYSTKG*KEKA*WLYG*NSICLSSCVTK*MQPNVPFNSHEV*SLW*NFMADGRFC*SHLRILWH*EFD*RRCHYLWLLTPKCCC*NLLSSMSQFRSRT*A*SCRIP**IWLENHSS*GWSHYCLWRLCVLLCWLP*QVCLLGSTC*R*HRL*PYRCCWRRFRRS**QPS*NTPKRESQHQYCW*L*T**RDRHYFGIFFCFHKCFCGNCERFGL*SIQTNC*ILW*F*SYKRKS*KRCLEYW*TEINTESSLCICIRGCSCCTINFLPHS*NCSKFCACFTEGRYNNTRWNFTVFTETH*CYDVHI*FGY*QSSCNGLHYRWCCSVDFAVAN*HLWHCL*KTQTRP*LA*REV*GRCRVS*RRLGNC*IYLNLCL*NCRWTNCHLCKGN*GECSDIL*ACK*IFGFVC*LYHYWWS*T*SLEFR*NICHALKGIVQKVC*IQRRNWPTHASKSPKRNYLLRGRNTSHRSVNRGSCLENW*FTTIRTTY**SC*SSIGWYTSLY*RAYVARNQRHRKVLCPCT*YDGNKQYLHTQRRCTNKGYFW**HCDRSARLQECEYHF*T**KD**ST**EVLCLYS*TRYRSK*VRLCCGRCCHKNFATSI*ITYTTGH*FR*VEYGYILLI**VW



Seq('IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QD...KKK', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [10]:
# Getting the positions of Stop codons
# Stop in the first occurences of stop codons
cor_seq[0].seq.translate(to_stop=True)

Seq('IKGLYLPR', ExtendedIUPACProtein())

### Getting the Transition Table

In [11]:
# The translation table that we used is standard transition table and it is by-default in used in BioPython
import Bio.Data.CodonTable as CodonTable
print(CodonTable.unambiguous_dna_by_name["Standard"])

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

### Splitting the Protein

In [12]:
# Spltting the protein according to the stop codons
protn = protein.split("*")
for each in protn:
    if each==" ":
        continue
    print(each)

IKGLYLPR
QTNQLSISCRSVL
TNFKICVAVTRLHA
CTHAV
LITNYCR
QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER
DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS
RWHLWLSRS
KRRFAST
TALCVHQTFGCSNCTSWSCYG
AGSRTRRHSVRS
W
DTWCPCPSCGRNTSGLPQGSSS
ER

RSWWP
LRRRSKVI
LRRRAWH
SL
RFSRKLEH
T
QWCYP
THA
A
RRGIHSLCR
QLLWP
WLPS
VH
RPSSTCW
SFMHFVRTTGLY
H
EGCILLP
T
A
NCLVHGTF
KEL
IADTF
N
IGKEI
HLQWGMSKFCISLKFHNQDYSTKG
KEKA
WLYG
NSICLSSCVTK
MQPNVPFNSHEV
SLW
NFMADGRFC
SHLRILWH
EFD
RRCHYLWLLTPKCCC
NLLSSMSQFRSRT
A
SCRIP

IWLENHSS
GWSHYCLWRLCVLLCWLP
QVCLLGSTC
R
HRL
PYRCCWRRFRRS

QPS
NTPKRESQHQYCW
L
T

RDRHYFGIFFCFHKCFCGNCERFGL
SIQTNC
ILW
F
SYKRKS
KRCLEYW
TEINTESSLCICIRGCSCCTINFLPHS
NCSKFCACFTEGRYNNTRWNFTVFTETH
CYDVHI
FGY
QSSCNGLHYRWCCSVDFAVAN
HLWHCL
KTQTRP
LA
REV
GRCRVS
RRLGNC
IYLNLCL
NCRWTNCHLCKGN
GECSDIL
ACK
IFGFVC
LYHYWWS
T
SLEFR
NICHALKGIVQKVC
IQRRNWPTHASKSPKRNYLLRGRNTSHRSVNRGSCLENW
FTTIRTTY

SC
SSIGWYTSLY
RAYVARNQRHRKVLCPCT
YDGNKQYLHTQRRCTNKGYFW

HCDRSARLQECEYHF
T

KD

ST

EVLCLYS
TRYRSK
VRLCCGRCCHKNFATSI
ITYTTGH
FR
VEYGYILLI

VW

### GC percent of Sequence

In [13]:
# Now calculating the GC% in the given sequence
from Bio.SeqUtils import GC
print(f"GC% :{GC(cor_seq[0].seq)}")

GC% :37.97277865097148


#### In the Next part I'll be discussing some other information regarding the genome sequence.
If u liked my work please hit upvote !