In [3]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 3.8 MB/s eta 0:00:01
[?25hCollecting pytz>=2020.1
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[K     |████████████████████████████████| 505 kB 14.2 MB/s eta 0:00:01
[?25hCollecting tzdata>=2022.7
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[K     |████████████████████████████████| 345 kB 52.9 MB/s eta 0:00:01
[?25hCollecting numpy>=1.22.4
  Downloading numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl (14.0 MB)
[K     |████████████████████████████████| 14.0 MB 2.4 MB/s eta 0:00:011
Installing collected packages: tzdata, pytz, numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.2.2 pytz-2024.1 tzdata-2024.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
No

In [1]:
import pandas as pd

In [2]:
try:
    import google.colab
    # Running on Google Colab, so install Biopython first
    !pip install biopython
except ImportError:
    pass

In [3]:
pip install biopython

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import sys

from urllib.request import urlretrieve

import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable

# Record the interpreter and Biopython versions this notebook was run with.
for label, version in (
    ("Python version:", sys.version_info),
    ("Biopython version:", Bio.__version__),
):
    print(label, version)

Python version: sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)
Biopython version: 1.83


In [5]:
# Name of the local FASTA copy; it is also the last path component of the URL.
input_file = "unknown-sequence.fa"

# Remote location of the sequence file on raw.githubusercontent.com.
fasta_loc = (
    "https://raw.githubusercontent.com/chris-rands/"
    "biopython-coronavirus/master/"
) + input_file

# Only hit the network when no local copy exists yet.
if not os.path.exists(input_file):
    urlretrieve(url=fasta_loc, filename=input_file)

In [6]:
# Walk every record in the FASTA file and show its identifier.
fasta_records = SeqIO.parse(input_file, format="fasta")
for record in fasta_records:
    print(record.id)

Unknown_sequence


In [7]:
# Load the FASTA file's record into `record` for the analysis below.
record = SeqIO.read(input_file, format="fasta")

In [8]:
# Bare last expression: the notebook displays the repr of the record's Seq.
record.seq

Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')

In [9]:
# Report the length of the record.
n_bases = len(record)
print("Sequence length (bp)", n_bases)

Sequence length (bp) 29903


In [10]:
# gc_fraction returns a fraction; scale it to a percentage for display.
gc_percent = gc_fraction(record.seq) * 100
print("GC content (%)", gc_percent)

GC content (%) 37.97277865097147


Compare to other genome sequences

Let's use BLAST to align the unknown sequence to other annotated sequences in the NCBI nt database, which contains sequences from many different species from across the tree of life.

This may take ~10 minutes since we are doing an online search against many sequences (for larger queries, it would be sensible to run BLAST locally instead; see Bio.Blast.Applications).

In [None]:
%%time
result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)

Let's process the results with one of Biopython's generic parsers.

In [None]:
# Parse the BLAST XML output held in result_handle and print its summary.
blast_qresult = SearchIO.read(result_handle, format="blast-xml")
print(blast_qresult)

Program: blastn (2.15.0+)
  Query: No (29903)
         definition line
 Target: nt
   Hits: 0
