In [None]:
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import numpy as np

from Bio import Entrez, SeqIO

## General info
* this notebook is based on a [Land Dominikovic's repository](https://github.com/lanadominkovic/12-days-of-biopython?tab=readme-ov-file)
* [her tutorial video](https://www.youtube.com/watch?v=ocA2IMe7dpA) is excellent too

In [4]:
# Get to know what Seq can do
sequence = Seq("ATGCGTACGTAGCTAGCTAGCTAGCTAGCTAGC")
print("Sequence:", sequence)
print(sequence.count("CAA")) # = Glutamine
# Codon = three DNA/RNA nucleotides corresponding to a specific amino acid
# (e.g.)  "start translation" (ATG/AUG = start), "stop translation" (e.g., TAA = stop)

Sequence: ATGCGTACGTAGCTAGCTAGCTAGCTAGCTAGC
0


## Transcription

In [19]:
# Conversion: DNA -> RNA (T -> U)
coding_dna = Seq("ATGGUUATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 
template_dna = coding_dna.reverse_complement() # reverse complement
print("Coding DNA:", coding_dna)
print("Template DNA:", template_dna)
messenger_rna = coding_dna.transcribe() # DNA -> RNA (back-transcription with .back_transcribe())
print("mRNA:", messenger_rna)

# Translate (possible with both mRNA and DNA)
print(coding_dna.translate()) # DNA -> protein; to_stop=True, then it stops at UGA/UAG
print(messenger_rna.translate(to_stop=True)) # RNA -> protein; 

# AUG	Methionine	M
# GGC	Glycine	G
# CGC	Arginine	R
# UGA	Stop	*
# AAG	Lysine	K
# GGU	Glycine	G
# GCC	Alanine	A
# CGA	Arginine R
# UAG   Stop *

Coding DNA: ATGGUUATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
Template DNA: CTATCGGGCACCCTTTCAGCGGCCCATTACAATAACCAT
mRNA: AUGGUUAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
MVIVMGR*KGAR*
MVIVMGR


## Access to NCBI database

In [None]:
# Get an access to NCBI via Entrez
Entrez.email = "sayakokodera@gmail.com" 
handle = Entrez.einfo()
rec = Entrez.read(handle)
handle.close()
print(rec.keys())
print(rec['DbList']) # all available data base

dict_keys(['DbList'])
['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']


### Task: find the **chloroquine resistance transporter (CRT)** gene (KM288867) 
* Info
    * Goal = CRT gene present in Plasmodium falciparum (the parasite that causes malaria) 
    * database = nucleotide
* Steps
    * **close "handle" as soon as each step is complete!!**
    * (1) Search the database with keywords (**Entrez.esearch**)
    * (2) Download full records that are relevant for your task (**Entrez.efetch**)
        * Be mindful about fetching: data volume may get easily intractable
    * (3) Analyze the records; looking for the CRT gene (ID = KM288867)

In [None]:
# (1) Search the database
handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]', retmax="40")
rec_list = Entrez.read(handle)
handle.close()

In [None]:
print(rec_list['Count'])
print(len(rec_list['IdList']))
print(rec_list['IdList']) # IDs of the records (i.e. database entries)

3441
40
['2946626456', '2928401860', '2928401858', '2928401856', '2928401854', '2928401852', '2928401850', '2928401848', '2928401846', '2928401844', '2928401842', '2928401840', '2928401838', '2928401836', '2928401834', '2928401832', '2928401830', '2928401828', '2928401826', '2928401824', '2928401822', '2928401820', '2928401818', '2928401816', '2928401814', '2928401812', '2928401810', '2928401808', '2928401806', '2928401804', '2928401802', '2928401800', '2928401798', '2928401796', '2928401794', '2928401792', '2928401790', '2928401788', '2928401786', '2928401784']


In [None]:
# (2) Fetch the database
handle = Entrez.efetch(db="nucleotide", id=rec_list['IdList'], rettype="gb")
recs = list(SeqIO.parse(handle, 'gb')) # Now it is a list of SeqIO objects -> easy to deal with
handle.close()

In [29]:
recs[:5]

[SeqRecord(seq=Seq('GGTTCTTGTCTTGGTAAATGTGCTCATGTGTTTAAACTTATTTTTAAAGAGATT...TCC'), id='PV353992.1', name='PV353992', description='Plasmodium falciparum isolate MBC148 chloroquine resistance transporter (crt) gene, partial cds', dbxrefs=[]),
 SeqRecord(seq=Seq('TGTGCTCATGTGTTTAAACTTATTTTTAAAGAGATTAAGGATAATATTTTTATT...TTG'), id='PV172738.1', name='PV172738', description='Plasmodium falciparum isolate RO_75 chloroquine resistance transporter (crt) gene, partial cds', dbxrefs=[]),
 SeqRecord(seq=Seq('TGTGCTCATGTGTTTAAACTTATTTTTAAAGAGATTAAGGATAATATTTTTATT...TTG'), id='PV172737.1', name='PV172737', description='Plasmodium falciparum isolate RO_74 chloroquine resistance transporter (crt) gene, partial cds', dbxrefs=[]),
 SeqRecord(seq=Seq('TGTGCTCATGTGTTTAAACTTATTTTTAAAGAGATTAAGGATAATATTTTTATT...TTG'), id='PV172736.1', name='PV172736', description='Plasmodium falciparum isolate PA_73 chloroquine resistance transporter (crt) gene, partial cds', dbxrefs=[]),
 SeqRecord(seq=Seq('TGTGCTCATGTGTTT

In [None]:
# (3) Analyze: find the CRT gene by iterating over the records
for rec in recs:
    if rec.name == 'KM288867': # try finding CRT gene in 40 records we fetched
        break
print(rec.name)
print(rec.description)


PV172700
Plasmodium falciparum isolate AC_37 chloroquine resistance transporter (crt) gene, partial cds


In [38]:
print(str(rec.seq))
print(len(rec.seq))
print(len(rec.seq)//3) # number of codons


TGTGCTCATGTGTTTAAACTTATTTTTAAAGAGATTAAGGATAATATTTTTATTTATATTTTAAGTATTATTTATTTAAGTGTATCTGTAATGAATACAATTTTTGCTAAAAGAACTTTAAACAAAATTGGTAACTATAGTTTTG
145
48


In [None]:
# Ensure there is no partial codon
idx_trim = len(rec.seq) - (len(rec.seq) % 3)
print(rec.seq[:idx_trim].reverse_complement().translate())

KTIVTNFV*SSFSKNCIHYRYT*INNT*NINKNIILNLFKNKFKHMST
