In [1]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from Bio import Entrez, SeqIO
from Bio.Data import IUPACData  # For the amino acid names
from Bio.SeqUtils import molecular_weight
# The `GC` function has been deprecated in recent versions of Biopython.
# Instead, `gc_fraction` from `Bio.SeqUtils` needs to be used.
from Bio.SeqUtils import gc_fraction

## (1) Fetch the SARS-CoV-2 (COVID-19) genome (**MN908947**) from NCBI

In [2]:
# Contact address sent along with the Entrez requests made below.
# Prefer the ENTREZ_EMAIL environment variable so the address does not have to be
# hardcoded in a shared notebook; fall back to the original address when it is
# unset or empty.
Entrez.email = os.environ.get("ENTREZ_EMAIL") or "sayakokodera@gmail.com"

In [3]:
# Search the nucleotide database for the accession to obtain its record ID(s)
handle = Entrez.esearch(
    db="nucleotide", 
    term="MN908947[Accession]", 
    retmax="40"
)
# Close the handle even when parsing the response raises, so the
# underlying connection is never leaked.
try:
    rec_list = Entrez.read(handle)
finally:
    handle.close()

In [4]:
print(rec_list["IdList"])  # List of the record IDs found by the search (the IDs themselves, not a count)

['1798172431']


In [5]:
# Fetch the GenBank records for the IDs returned by the search.
# retmode="text" is passed explicitly so this request matches the direct
# fetch by accession used elsewhere in this notebook.
handle = Entrez.efetch(db="nucleotide", id=rec_list['IdList'], rettype="gb", retmode="text")
# Close the handle even when parsing raises, so the connection is never leaked.
try:
    recs = list(SeqIO.parse(handle, 'gb'))  # Now it is a list of SeqIO objects -> easy to deal with
finally:
    handle.close()

In [6]:
# Alternatively, fetch the genome directly by its accession number.
# NOTE: this rebinds `recs`, replacing the result of any earlier fetch.
handle = Entrez.efetch(db="nucleotide", id="MN908947", rettype="gb", retmode="text")
# Close the handle even when parsing raises, so the connection is never leaked.
try:
    recs = list(SeqIO.parse(handle, 'gb'))
finally:
    handle.close()

## (2) Gene analysis
What to look for...
- Sequence length: how many base pairs? 
- GC content? 
    - Because higher GC content = more stable molecule (G-C pairs form three hydrogen bonds, A-T pairs only two)
- Translate the gene (i.e. DNA -> mRNA -> amino acids)
- Most common amino acids? 
- Number of functional proteins?
    - 20+ amino acids => functional protein
- Largest protein: how many amino acids long is it?

In [7]:
# Take the sequence of the first fetched record
covid_dna = recs[0].seq
# NOTE(review): this prints the whole sequence, not a preview
print(covid_dna)

ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCT

#### Basic info

In [44]:
# Count how often each nucleotide letter occurs in the sequence
nucleotide_counts = Counter(covid_dna)

# Summary of the analysis; later cells add amino acid and protein statistics.
# Building it as a fresh literal means re-running this cell resets the summary.
info_dict = {
    "Length": len(covid_dna),
    "Molecular Weight": molecular_weight(covid_dna),
    "GC Content (%)": gc_fraction(covid_dna) * 100,  # scale the fraction to a percentage
    # Key was "A Content " (with a trailing space); renamed to match the
    # "C Count" / "G Count" / "T Count" keys below.
    "A Count": nucleotide_counts['A'],
    "C Count": nucleotide_counts['C'],
    "G Count": nucleotide_counts['G'],
    "T Count": nucleotide_counts['T'],
}

### Transcription

In [9]:
# Transcribe the DNA sequence into its RNA form
covid_mrna = covid_dna.transcribe()
# NOTE(review): this prints the whole transcribed sequence, not a preview
print(covid_mrna)

AUUAAAGGUUUAUACCUUCCCAGGUAACAAACCAACCAACUUUCGAUCUCUUGUAGAUCUGUUCUCUAAACGAACUUUAAAAUCUGUGUGGCUGUCACUCGGCUGCAUGCUUAGUGCACUCACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGAGUAACUCGUCUAUCUUCUGCAGGCUGCUUACGGUUUCGUCCGUGUUGCAGCCGAUCAUCAGCACAUCUAGGUUUCGUCCGGGUGUGACCGAAAGGUAAGAUGGAGAGCCUUGUCCCUGGUUUCAACGAGAAAACACACGUCCAACUCAGUUUGCCUGUUUUACAGGUUCGCGACGUGCUCGUACGUGGCUUUGGAGACUCCGUGGAGGAGGUCUUAUCAGAGGCACGUCAACAUCUUAAAGAUGGCACUUGUGGCUUAGUAGAAGUUGAAAAAGGCGUUUUGCCUCAACUUGAACAGCCCUAUGUGUUCAUCAAACGUUCGGAUGCUCGAACUGCACCUCAUGGUCAUGUUAUGGUUGAGCUGGUAGCAGAACUCGAAGGCAUUCAGUACGGUCGUAGUGGUGAGACACUUGGUGUCCUUGUCCCUCAUGUGGGCGAAAUACCAGUGGCUUACCGCAAGGUUCUUCUUCGUAAGAACGGUAAUAAAGGAGCUGGUGGCCAUAGUUACGGCGCCGAUCUAAAGUCAUUUGACUUAGGCGACGAGCUUGGCACUGAUCCUUAUGAAGAUUUUCAAGAAAACUGGAACACUAAACAUAGCAGUGGUGUUACCCGUGAACUCAUGCGUGAGCUUAACGGAGGGGCAUACACUCGCUAUGUCGAUAACAACUUCUGUGGCCCUGAUGGCUACCCUCUUGAGUGCAUUAAAGACCUUCUAGCACGUGCUGGUAAAGCUUCAUGCACUUUGUCCGAACAACUGGACUUUAUUGACACUAAGAGGGGUGUAUACUGCUGCCGUGAACAUGAGCAUGAAAUUGCUUGGUACACGGAACGUUCU

### Translation (i.e. from mRNA -> amino acids) + amino acid analysis

In [11]:
# Translation consumes whole codons (3 bases each), so cut the sequence at
# the largest multiple of 3 and ignore any leftover 1-2 trailing bases.
end = 3 * (len(covid_mrna) // 3)
covid_aa = covid_mrna[:end].translate()
# Preview only the first 100 translated symbols
print(covid_aa[:100])

IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER*DGEPCPWFQREN


In [None]:
# `IUPACData` (amino acid names) is imported in the notebook's import cell.

# Check which amino acids are most common
common_amino = Counter(covid_aa)
# Take only the top 10 most common amino acids (sorted in descending order)
common_amino_top10 = dict(common_amino.most_common(10))

# Map each one-letter code to its three-letter name by inverting
# Biopython's three-letter -> one-letter table.
amino_acid_names = {
    one_letter: three_letter
    for three_letter, one_letter in IUPACData.protein_letters_3to1.items()
}
# '*' is not an amino acid; label it explicitly as the stop symbol
amino_acid_names['*'] = '* (stop)'

In [45]:
# Record the overall number of translated symbols; the '*' stop symbols
# are part of this total.
info_dict['Total amino acids'] = sum(common_amino.values())

# Add the 10 most common symbols under their readable names, skipping any
# letter that has no entry in the name table.
info_dict.update(
    (amino_acid_names[letter], occurrences)
    for letter, occurrences in common_amino_top10.items()
    if letter in amino_acid_names
)


### Protein analysis

In [32]:
proteins = covid_aa.split('*') # because '*' (the stop symbol) separates the chains of amino acids; the result is a list of Seq objects

In [48]:
# Minimum chain length (in amino acids) for a chain to count as functional
MIN_PROTEIN_LENGTH = 20

# Take only the functional proteins (i.e. chains of 20+ amino acids).
# The comparison is inclusive: the previous `len > 20` test silently dropped
# chains of exactly 20 amino acids, contradicting the stated "20+" criterion.
functional_proteins = [
    this_protein
    for this_protein in proteins
    if len(this_protein) >= MIN_PROTEIN_LENGTH
]

# Top 5 proteins by length, longest first
top_5_proteins = sorted(functional_proteins, key=len, reverse=True)[:5]


In [None]:
# Update the dictionary
info_dict.update({
    'Total functional proteins': len(functional_proteins),
    'Length largest protein': len(top_5_proteins[0]),
})

In [49]:
# Display the collected summary (bare last expression -> rendered as the cell output)
info_dict

{'Length': 29903,
 'Molecular Weight': 9241219.214399999,
 'GC Content (%)': 37.97277865097147,
 'A Content ': 8954,
 'C Count': 5492,
 'G Count': 5863,
 'T Count': 9594,
 'Total amino acids': 9967,
 'Leu': 886,
 'Ser': 810,
 '* (stop)': 774,
 'Thr': 679,
 'Cys': 635,
 'Phe': 593,
 'Arg': 558,
 'Val': 548,
 'Tyr': 505,
 'Asn': 472,
 'Total functional proteins': 73,
 'Length largest protein': 2701}

In [52]:
# Save the largest protein for further analysis.
# The record ends with a newline so that the file is a well-formed text file
# and can be safely concatenated with other FASTA records.
with open("covid_protein_seq.fasta", "w") as file:
    file.write(f">covid protein\n{top_5_proteins[0]}\n")