# *Parsing of FASTA Sequences*

In [18]:
from Bio import Entrez, SeqIO

def download_fasta_sequence(email, accession_number, output_file):
    """Fetch one record from the Entrez ``nucleotide`` database and save it as FASTA text.

    Parameters
    ----------
    email : str
        Assigned to ``Entrez.email`` before the request is made.
    accession_number : str
        Passed to ``Entrez.efetch`` as the ``id`` of the record to fetch.
    output_file : str
        Path opened in write mode; existing content is overwritten.

    Returns
    -------
    None
        The fetched text is written to ``output_file``.
    """
    Entrez.email = email
    handle = Entrez.efetch(
        db="nucleotide", id=accession_number, rettype="fasta", retmode="text"
    )
    # Close the handle even when reading fails so the underlying connection
    # is not left open.
    try:
        sequence = handle.read()
    finally:
        handle.close()

    with open(output_file, "w") as f:
        f.write(sequence)

# Inputs for the download: edit these three values for your own record.
email = "pallawik@iiitd.ac.in"  # handed to download_fasta_sequence as `email`
accession_number = "NC_001474.2"  # accession of the record to fetch
output_file = "output2.fasta"  # path the FASTA text is written to

download_fasta_sequence(
    email=email,
    accession_number=accession_number,
    output_file=output_file,
)

In [19]:
# Preview at most the first five records of the downloaded FASTA file.
#https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000871845.1/
from itertools import islice

from Bio import SeqIO

# Read the path stored in `output_file` by the download cell rather than a
# hard-coded copy of its name, so changing the name there keeps this cell in
# sync. islice stops parsing after five records instead of building a list of
# every record in the file first.
for seq_record in islice(SeqIO.parse(output_file, "fasta"), 5):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

NC_001474.2
Seq('AGTTGTTAGTCTACGTGGACCGACAAAGACAGATTCTTTGAGGGAGCTAAGCTC...TCT')
10723


In [20]:
from Bio.Seq import Seq

# Build a Seq object from a short literal DNA string and print it.
# The cells that follow reuse `dna_sequence` for their examples.
dna_sequence = Seq("ATCGGCTA")
print(dna_sequence)

ATCGGCTA


In [21]:
# Walk the sequence by position, printing "<index> <letter>" per line,
# then report the total length.
length = len(dna_sequence)
for index in range(length):
    letter = dna_sequence[index]
    print("%i %s" % (index, letter))
print("len =", length)

0 A
1 T
2 C
3 G
4 G
5 C
6 T
7 A
len = 8


In [22]:
# Print the first, second and third letters, one index lookup per line.
for offset in (0, 1, 2):
    print(dna_sequence[offset])

A
T
C


In [23]:
# Tally individual bases with Seq.count, then report the results.
count_A = dna_sequence.count("A")
gcount, ccount = dna_sequence.count("G"), dna_sequence.count("C")

print("Count of A:", count_A)
print(gcount, ccount, sep="\n")

Count of A: 2
2
2


In [24]:
# Slicing: take positions 1 through 4 (the stop index 5 is excluded).
subsequence = dna_sequence[1:5]

print(dna_sequence)
print("Subsequence:", subsequence)

ATCGGCTA
Subsequence: TCGG


In [25]:
# Concatenation: append the plain string "AT" to the sequence.
concatenated_sequence = dna_sequence + "AT"
print(dna_sequence)
print("Concatenated sequence:", concatenated_sequence)

# Indexing: position 0 is the first base.
first_base = dna_sequence[0]
print("First base:", first_base)

ATCGGCTA
Concatenated sequence: ATCGGCTAAT
First base: A


In [26]:
# Reverse complement: compute it first, then show it beneath the input.
reverse_complement = dna_sequence.reverse_complement()

print("original sequence: ", dna_sequence)
print("Reverse complement:", reverse_complement)

original sequence:  ATCGGCTA
Reverse complement: TAGCCGAT


In [27]:
# Remove ambiguous bases.
# Strip each "n"/"N" individually: replacing only the pair "nn" leaves a stray
# base behind when a run of ambiguous calls has odd length
# (e.g. "ACGTnnnTT" -> "ACGTnTT") and misses a single or upper-case N.
my_dna = Seq("ACGTnnTT")
print(my_dna.replace("n", "").replace("N", ""))

ACGTTT


In [28]:
#https://www.cup.uni-muenchen.de/ch/compchem/tink/as.html
from Bio.Seq import Seq

# DNA -> RNA -> protein: transcribe the DNA sequence, then translate the RNA.
dna_sequence = Seq("ATCAACTTTTAA")
rna_sequence = dna_sequence.transcribe()
protein_sequence = rna_sequence.translate()

# Emit the three sequences one per line.
print(
    f"DNA Sequence: {dna_sequence}",
    f"RNA Sequence: {rna_sequence}",
    f"Protein Sequence: {protein_sequence}",
    sep="\n",
)

DNA Sequence: ATCAACTTTTAA
RNA Sequence: AUCAACUUUUAA
Protein Sequence: INF*
