In [1]:
from Bio.Seq import Seq

In [2]:
my_seq = Seq("AGTACACTGGT")

In [4]:
my_seq

Seq('AGTACACTGGT', Alphabet())

In [5]:
my_seq.complement()

Seq('TCATGTGACCA', Alphabet())

In [6]:
my_seq.reverse_complement()

Seq('ACCAGTGTACT', Alphabet())

In [7]:
from Bio import SeqIO

In [8]:
# Walk every record in the orchid FASTA file and show, one item per line,
# its identifier, the repr of its sequence, and its length.
for record in SeqIO.parse("data/ls_orchid.fasta", "fasta"):
    print(record.id, repr(record.seq), len(record), sep="\n")

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', SingleLetterAlphabet())
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', SingleLetterAlphabet())
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', SingleLetterAlphabet())
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', SingleLetterAlphabet())
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', SingleLetterAlphabet())
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', SingleLetterAlphabet())
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', SingleLetterAlphabet())
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GC

## Simple GenBank parsing example

In [11]:
from Bio import SeqIO
# Same summary as for the FASTA file, but read from the GenBank file:
# identifier, repr of the sequence, and record length, each on its own line.
for gb_record in SeqIO.parse("data/ls_orchid.gbk", "genbank"):
    summary = (gb_record.id, repr(gb_record.seq), len(gb_record))
    print(*summary, sep="\n")

Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())
740
Z78532.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', IUPACAmbiguousDNA())
753
Z78531.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', IUPACAmbiguousDNA())
748
Z78530.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', IUPACAmbiguousDNA())
744
Z78529.1
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', IUPACAmbiguousDNA())
733
Z78527.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', IUPACAmbiguousDNA())
718
Z78526.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', IUPACAmbiguousDNA())
730
Z78525.1
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA', IUPACAmbiguousDNA())
704
Z78524.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC', IUPACAmbiguousDNA())
740
Z78523.1
Seq('CGTAACCAGGTTTCCGTAGGTGAACCTGCGGCAGGATCATTGTTGAGACAGCAG...AAG', IUPAC

In [12]:
from Bio.Seq import Seq

In [13]:
my_seq = Seq("AGTACACTGGT")

In [14]:
my_seq

Seq('AGTACACTGGT', Alphabet())

In [15]:
my_seq.alphabet

Alphabet()

In [16]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)

In [17]:
my_seq

Seq('AGTACACTGGT', IUPACUnambiguousDNA())

In [18]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_prot = Seq("AGTACACTGGT", IUPAC.protein)
my_prot

Seq('AGTACACTGGT', IUPACProtein())

In [19]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCG", IUPAC.unambiguous_dna)
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))

0 G
1 A
2 T
3 C
4 G


In [20]:
print(len(my_seq))

5


In [21]:
print(my_seq[0]) # first letter
print(my_seq[2]) # third letter
print(my_seq[-1]) # last letter

G
T
G


In [22]:
from Bio.Seq import Seq
"AAAA".count("AA")

2

In [24]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
# GC content as a percentage: 100 * (number of G + number of C) / length.
# A notebook cell only displays its final expression, so intermediate
# values are held in a name instead of being left as bare expressions
# whose results would be silently discarded.
gc_count = my_seq.count("G") + my_seq.count("C")
100 * float(gc_count) / len(my_seq)

46.875

In [26]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GATCGATGGGCCAATATATAGGGATACGAAATCGT", IUPAC.unambiguous_dna)
GC(my_seq)

42.857142857142854

In [27]:
my_seq[0::3]

Seq('GCTGAATGTGAG', IUPACUnambiguousDNA())

In [28]:
my_seq[1::3]

Seq('AGGCATAGAATT', IUPACUnambiguousDNA())

In [29]:
my_seq[2::3]

Seq('TAGCTAGACAC', IUPACUnambiguousDNA())

In [30]:
my_seq[::1]

Seq('GATCGATGGGCCAATATATAGGGATACGAAATCGT', IUPACUnambiguousDNA())

In [31]:
str(my_seq)

'GATCGATGGGCCAATATATAGGGATACGAAATCGT'

In [32]:
print(my_seq)

GATCGATGGGCCAATATATAGGGATACGAAATCGT


In [33]:
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)

>Name
GATCGATGGGCCAATATATAGGGATACGAAATCGT



In [34]:
str(my_seq)

'GATCGATGGGCCAATATATAGGGATACGAAATCGT'

## Concatenating or adding sequences

In [35]:
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
protein_seq = Seq("EVRNAK", IUPAC.protein)
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)

In [36]:
from Bio.Alphabet import generic_alphabet
protein_seq.alphabet = generic_alphabet
dna_seq.alphabet = generic_alphabet

In [37]:
protein_seq + dna_seq

Seq('EVRNAKACGT', Alphabet())

In [38]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_nucleotide
from Bio.Alphabet import IUPAC
nuc_seq = Seq("GATCGATGC", generic_nucleotide)
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)

In [39]:
nuc_seq

Seq('GATCGATGC', NucleotideAlphabet())

In [40]:
dna_seq

Seq('ACGT', IUPACUnambiguousDNA())

In [41]:
nuc_seq + dna_seq

Seq('GATCGATGCACGT', NucleotideAlphabet())

In [42]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

In [43]:
list_of_seqs = [Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna)]
concatenated = Seq("", generic_dna)

In [44]:
for s in list_of_seqs:
    concatenated += s


In [45]:
concatenated

Seq('ACGTAACCGGTT', DNAAlphabet())

In [46]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

In [47]:
list_of_seqs = [Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna)]
sum(list_of_seqs, Seq("", generic_dna))

Seq('ACGTAACCGGTT', DNAAlphabet())

In [48]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
dna_seq = Seq("acgtACGT", generic_dna)
dna_seq

Seq('acgtACGT', DNAAlphabet())

In [49]:
dna_seq.upper()

Seq('ACGTACGT', DNAAlphabet())

In [50]:
"GCTA" in dna_seq

False

In [51]:
"GTAC" in dna_seq.upper()

True

In [52]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

In [53]:
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)
dna_seq

Seq('ACGT', IUPACUnambiguousDNA())

In [54]:
dna_seq.lower()

Seq('acgt', DNAAlphabet())

In [55]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
my_seq

Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPACUnambiguousDNA())

In [56]:
my_seq.complement()

Seq('CTAGCTACCCGGATATATCCTAGCTTTTAGCG', IUPACUnambiguousDNA())

In [57]:
my_seq.reverse_complement()

Seq('GCGATTTTCGATCCTATATAGGCCCATCGATC', IUPACUnambiguousDNA())

In [58]:
my_seq[::-1]

Seq('CGCTAAAAGCTAGGATATATCCGGGTAGCTAG', IUPACUnambiguousDNA())

In [59]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
protein_seq = Seq("EVRNAK", IUPAC.protein)


In [60]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGTAG", IUPAC.unambiguous_dna)
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGTAG', IUPACUnambiguousDNA())

In [61]:
template_dna = coding_dna.reverse_complement()

In [62]:
template_dna

Seq('CTACGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT', IUPACUnambiguousDNA())

In [63]:
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGTAG', IUPACUnambiguousDNA())

In [64]:
messenger_rna = coding_dna.transcribe()
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGUAG', IUPACUnambiguousRNA())

In [65]:
template_dna.reverse_complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGUAG', IUPACUnambiguousRNA())

In [66]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGUGAAAGGGGUGUCCCGAUAG", IUPAC.unambiguous_rna)
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGUGAAAGGGGUGUCCCGAUAG', IUPACUnambiguousRNA())

In [67]:
template_dna.reverse_complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGUAG', IUPACUnambiguousRNA())

In [68]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

In [69]:
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna)
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())

In [71]:
messenger_rna.back_transcribe()

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

In [72]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

## Translation

In [73]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAGGGUGCCCAUAG", IUPAC.unambiguous_rna)
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAGGGUGCCCAUAG', IUPACUnambiguousRNA())

In [74]:
messenger_rna.translate()



Seq('MAIVMGR*RVPI', HasStopCodon(IUPACProtein(), '*'))

In [75]:
coding_dna.translate(to_stop = True)



Seq('MAIVMGR', IUPACProtein())

In [76]:
coding_dna.translate(table = 2, to_stop = True)



Seq('MAIVMGRWKGAR', IUPACProtein())

In [77]:
coding_dna.translate(table = 2, stop_symbol = "@")



Seq('MAIVMGRWKGAR', IUPACProtein())

In [78]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
# Example gene sequence. The fragments are adjacent string literals, which
# Python joins into a single string, so no explicit "+" or backslash
# line continuation is needed inside the parentheses.
gene = Seq(
    "GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGTCCCATCCCA"
    "GCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACGACAGGCGATCGTAGT"
    "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAACAT"
    "TATGAATGGCGAGGCAATCGCTGGCACCTACAGGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT"
    "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA",
    generic_dna,
)

In [79]:
gene.translate(table = "Bacterial")

Seq('VKKMQSIVLALSLVLVVPSQHRLRKLR*SRQ*NYDRRS**SWLLLGWRSLARPR...HR*', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [80]:
gene.translate(table = "Bacterial", to_stop = True)

Seq('VKKMQSIVLALSLVLVVPSQHRLRKLR', ExtendedIUPACProtein())

In [82]:
from Bio.Data import CodonTable

In [83]:
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]

In [85]:
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table = CodonTable.unambiguous_dna_by_id[2]

In [86]:
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [87]:
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   

In [88]:
mito_table.stop_codons

['TAA', 'TAG', 'AGA', 'AGG']

In [90]:
mito_table.start_codons

['ATT', 'ATC', 'ATA', 'ATG', 'GTG']

In [91]:
mito_table.forward_table["ACG"]

'T'

In [92]:
from Bio.Seq import Seq

In [93]:
from Bio.Alphabet import IUPAC

In [94]:
seq1 = Seq("ACGT", IUPAC.unambiguous_dna)

In [95]:
seq2 = Seq("ACGT", IUPAC.ambiguous_dna)

In [96]:
str(seq1) == str(seq2)

True

In [97]:
str(seq1) == str(seq1)

True

In [98]:
seq1 == "ACGT"

True

In [99]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein
dna_seq = Seq("ACGT", generic_dna)
prot_seq = Seq('ACGT', generic_protein)
dna_seq == prot_seq



True

In [101]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GCCATTCTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

In [102]:
mutable_seq = my_seq.tomutable()
mutable_seq

MutableSeq('GCCATTCTAATGGGCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [104]:
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

In [105]:
mutable_seq

MutableSeq('GCCATTCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [106]:
mutable_seq[5] = "C"

In [107]:
mutable_seq

MutableSeq('GCCATCCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [108]:
mutable_seq.remove("T")

In [109]:
mutable_seq

MutableSeq('GCCACCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [110]:
mutable_seq.reverse()

In [111]:
mutable_seq

MutableSeq('AGCCCGTGGGAAAGTCGCCCGGGTCCTTGACCACCG', IUPACUnambiguousDNA())

In [112]:
new_seq = mutable_seq.toseq()

In [113]:
new_seq

Seq('AGCCCGTGGGAAAGTCGCCCGGGTCCTTGACCACCG', IUPACUnambiguousDNA())

In [115]:
from Bio.Seq import UnknownSeq

In [116]:
from Bio.Alphabet import IUPAC

In [117]:
unk_dna = UnknownSeq(20, alphabet = IUPAC.ambiguous_dna)

In [118]:
unk_dna

UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N')

In [119]:
print(unk_dna)

NNNNNNNNNNNNNNNNNNNN


In [120]:
unk_dna

UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N')

In [121]:
unk_dna.complement()

UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N')

In [123]:
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate

In [124]:
my_string = 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'

In [125]:
reverse_complement(my_string)

'CTAACCAGCAGCACGACCACCCTTCCAACGACCCATAACAGC'

In [126]:
transcribe(my_string)

'GCUGUUAUGGGUCGUUGGAAGGGUGGUCGUGCUGCUGGUUAG'

In [127]:
back_transcribe(my_string)

'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'

In [128]:
translate(my_string)

'AVMGRWKGGRAAG*'

In [129]:
from Bio.Seq import Seq
simple_seq = Seq("GATC")
from Bio.SeqRecord import SeqRecord
simple_seq_r = SeqRecord(simple_seq)

In [130]:
simple_seq_r.id

'<unknown id>'

In [131]:
simple_seq_r.id = "AC12345"

In [132]:
simple_seq_r.description = "Made up sequence I wish I could write a paper about"

In [133]:
print(simple_seq_r.seq)

GATC


In [134]:
simple_seq_r.seq

Seq('GATC', Alphabet())

In [135]:
from Bio.Seq import Seq

In [136]:
simple_seq = Seq("GATC")

In [137]:
from Bio.SeqRecord import SeqRecord

In [138]:
simple_seq_r = SeqRecord(simple_seq, id = "AC12345")

In [139]:
simple_seq_r.description = "Made up sequence I wish I could write a paper about"

In [140]:
print(simple_seq_r.description)

Made up sequence I wish I could write a paper about


In [141]:
simple_seq_r.seq

Seq('GATC', Alphabet())

In [142]:
from Bio.Seq import Seq

In [143]:
simple_seq = Seq("GATC")

In [144]:
from Bio.SeqRecord import SeqRecord

In [146]:
simple_seq_r = SeqRecord(simple_seq, id = "AC12345")

In [147]:
simple_seq_r.annotations["evidence"] = "None. I just made it up."

In [None]:
# Print the annotated record built in the preceding cells. The name
# `simple` is never defined in this notebook, so printing it would raise
# NameError on a fresh kernel; `simple_seq_r` is the SeqRecord that was
# just given an id and an "evidence" annotation.
print(simple_seq_r)