# Biopython

In this session, we’ll use the Biopython library to handle DNA and protein sequences just like a bioinformatician — reading FASTA files, calculating GC content, translating DNA to protein, and even fetching data from NCBI.

In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


# Single Sequence Manipulation in Biopython

In [12]:
from Bio.Seq import Seq

from Bio.SeqUtils import gc_fraction
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils import MeltingTemp as mt

from Bio import SeqIO

In [13]:
# Biological setup: the DNA sequence used by the exercises in this section.
DNA = "AGCGCGTATATAATAGCTCAA"

# Challenge (all with Biopython):
#   a. obtain the complement of the sequence
#   b. obtain the reverse complement of the sequence
#   c. transcribe the sequence, then translate it

In [14]:
# Wrap the raw string in a Seq object. Only the sequence data is passed: Seq's
# second parameter is `length` (for sequences of undefined content), so a file
# format name such as 'fasta' has no meaning there.
dna1_seq = Seq(DNA)

# Each call returns a new Seq; dna1_seq itself is left unchanged.
print(f"Complement: {dna1_seq.complement()}\n")
print(f"Reverse Complement: {dna1_seq.reverse_complement()}\n")
print(f"Transcription: {dna1_seq.transcribe()}\n")
print(f"Translation: {dna1_seq.translate()}")

Complement: TCGCGCATATATTATCGAGTT

Reverse Complement: TTGAGCTATTATATACGCGCT

Transcription: AGCGCGUAUAUAAUAGCUCAA

Translation: SAYIIAQ


# Reading and analysing a FASTA file

In [19]:
# Biological Setup
# SeqIO.parse returns a one-shot iterator; wrapping it in list() lets every later
# cell loop over the records from the start, however many times it is run.
fasta_content = list(SeqIO.parse('genes.fa', 'fasta'))

# Challenge
# For a given index, retrieve the sequence and its relevant information
# Do the same for all the sequences and summarise them as tabular data

In [7]:
# Walk through the records until the one at position `index` is reached,
# print its identifier and sequence, then stop.
index = 3
for position, entry in enumerate(fasta_content):
    if position != index:
        continue
    print(entry.id)
    print(entry.seq)
    break

4R79.1a
atgcttgaccacgttttgcttttaacctactgcttagtttcaactgttgtcagatctcaaccatcggcagatgtgtttcgaagctttgccggatacattcctgaagatcacagggtgacacatcatgaatggcaaaattcagggaagtttcaaggggatattgatggagtcgatcctaaccttctcaagctcccagaaggtccagtgcttttcaatgcattaaaaaacaagcagttaacgtgggagggcggtgtgatcccatacgaaatggatacggcattctcaccgaatgaaataaaaattttggaaaaagcgtttgacagttaccgacgaacaacatgcattagatttgaaaaacgagaaggtcaaacagactacttaaacattgtaaaaggatatgggtgctactctcaagttggacgaactggaggaaaacaggaaatttctttgggacgtggctgcttttttcatgaaataattgtacacgaactgatgcattccgtcggattttggcacgaacactcgagagctgatcgcgatgatcacattaagatcaactgggataatattctgcctggaatgaagtctcaattcgataaaatttcggcagtgttacaagatcttcaaggagaaaactacgattacaaatcaataatgcactatgacagcactgcgttttcaagaaacggacgaaacacaatagaaactgtagaaaatggattcacacaggttattgggaccgctatggacttgtcacctctggatattgtgaaaatcaacaaactgtattcgtgtaaaactaagaagaaagagaaagtaaagcctgcaactaccgaggaacctcatcaactgattccgcaagttgtggataaaaattcggtcgattctggagaaaaatgtgtcgatcattttgcggattgcccgcactttgcacaatactgtacccgtgcttcatttttctttgttatgaaatcgtactgtccatttacgtg

In [20]:
# Summarise every record in `fasta_content` as one tab-separated row.
# NOTE(review): if `fasta_content` is a one-shot iterator that an earlier cell has
# already advanced, the records consumed there will not appear in this table.
# utf-8 is requested explicitly because the header contains the non-ASCII "°" sign.
with open('Seq_information.txt', 'w', encoding='utf-8') as f:
    f.write('Seq ID\tLength\tGC Content\tGC Skew\tMelting Point (°C)\tMol Weight\n')
    for content in fasta_content:
        seq = content.seq.upper()
        length = len(seq)
        if length == 0:
            # An empty record has no composition to report and would divide by zero.
            continue

        # Count each base once and reuse the totals below.
        g_count = seq.count('G')
        c_count = seq.count('C')
        gc_total = g_count + c_count

        # GC content as a percentage of the full sequence length.
        gc_content = round(gc_total / length * 100, 2)
        # GC skew = (G - C) / (G + C); reported as 0.0 when the sequence has no G or C.
        gc_skew = round((g_count - c_count) / gc_total, 2) if gc_total else 0.0

        # Nearest-neighbour melting temperature.
        # Na and Mg are in mM, dnac1 (DNA concentration) is in nM.
        melting_point = round(
            mt.Tm_NN(seq, nn_table=mt.DNA_NN4, Na=50, Mg=1.5, dnac1=50),
            2,
        )
        mol_wt = round(molecular_weight(seq, seq_type="DNA"), 2)

        f.write(f'{content.id}\t{length}\t{gc_content}\t{gc_skew}\t{melting_point}\t{mol_wt}\n')

# Exploring the NCBI database using Biopython

In [34]:
# Biological setup: search parameters for the Entrez queries below.
database = "pubmed"   # Entrez database to search
Author = "Lat PK"     # author name to look up
retmax = 10           # maximum number of ids to request

# Challenge
# 1. Find the PMIDs of the publications by the author above using Biopython.
# 2. Fetch the abstract of each publication found and save them all to a file.

In [35]:
from Bio import Entrez, Medline

# Contact address sent along with every Entrez request.
Entrez.email = "prince@redwoodai.com"

# Search `database` for entries whose author field matches `Author`,
# asking for at most `retmax` ids.
handle = Entrez.esearch(db=database, term=f"{Author}[Author]", retmax=retmax)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even if the response cannot be parsed.
    handle.close()

id_list = record["IdList"]
print(id_list)

['40832932', '40461945', '39232602', '38767921', '38716764', '33580565', '32520335', '32329781', '30275490', '29912891']


In [36]:
# Fetch the article records in Medline format.
# An empty search result is handled up front: there is nothing to request, and the
# output file is then written with no articles in it.
records = []
if id_list:
    pmid_query = ",".join(id_list)
    handle = Entrez.efetch(db=database, id=pmid_query, rettype="medline", retmode="text")
    try:
        records = list(Medline.parse(handle))
    finally:
        # Release the connection even if parsing fails part-way through.
        handle.close()

# Write nicely formatted abstracts to a file
with open(f"Abstracts_by_{Author.replace(' ', '_')}.txt", "w", encoding="utf-8") as f:
    for i, rec in enumerate(records, start=1):
        # Medline field tags: TI = title, JT = journal title,
        # DP = date of publication, AB = abstract.
        title = rec.get("TI", "No Title")
        journal = rec.get("JT", "No Journal")
        year = rec.get("DP", "No Date")
        abstract = rec.get("AB", "No Abstract")

        f.write(f"Article {i}:\n")
        f.write(f"Title: {title}\n")
        f.write(f"Journal: {journal} ({year})\n")
        f.write(f"Abstract:\n{abstract}\n")
        f.write("-" * 80 + "\n\n")  # separator between articles

# Report the number of records actually written, not the number of ids searched for.
print(f"Saved {len(records)} abstracts to file.")

Saved 10 abstracts to file.


In [37]:
# Biological setup: identifiers of the nucleotide records to download.
ids = [
    "NM_001301717",
    "NM_001101",
    "NM_000546",
]

# Challenge
# Fetch the sequence for each id and write them all to a single FASTA file.

In [39]:
# Request all ids in one call; the response is FASTA text that is copied
# unchanged into the output file.
handle = Entrez.efetch(db="nucleotide", id=",".join(ids), rettype="fasta", retmode="text")
try:
    with open("sequences_ids.fasta", "w") as f:
        f.write(handle.read())
finally:
    # Release the connection even if reading or writing fails.
    handle.close()