In [1]:
from Bio.Seq import Seq

In [2]:
my_seq = Seq("AGTACACTGGT")

In [4]:
my_seq

Seq('AGTACACTGGT', Alphabet())

In [5]:
my_seq.complement()

Seq('TCATGTGACCA', Alphabet())

In [6]:
my_seq.reverse_complement()

Seq('ACCAGTGTACT', Alphabet())

In [7]:
from Bio import SeqIO

In [8]:
for seq_record in SeqIO.parse("data/ls_orchid.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', SingleLetterAlphabet())
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', SingleLetterAlphabet())
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', SingleLetterAlphabet())
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', SingleLetterAlphabet())
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', SingleLetterAlphabet())
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', SingleLetterAlphabet())
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', SingleLetterAlphabet())
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GC

## Simple GenBank parsing example

In [11]:
from Bio import SeqIO
for seq_record in SeqIO.parse("data/ls_orchid.gbk", "genbank"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())
740
Z78532.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', IUPACAmbiguousDNA())
753
Z78531.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', IUPACAmbiguousDNA())
748
Z78530.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', IUPACAmbiguousDNA())
744
Z78529.1
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', IUPACAmbiguousDNA())
733
Z78527.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', IUPACAmbiguousDNA())
718
Z78526.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', IUPACAmbiguousDNA())
730
Z78525.1
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA', IUPACAmbiguousDNA())
704
Z78524.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC', IUPACAmbiguousDNA())
740
Z78523.1
Seq('CGTAACCAGGTTTCCGTAGGTGAACCTGCGGCAGGATCATTGTTGAGACAGCAG...AAG', IUPAC

In [12]:
from Bio.Seq import Seq

In [13]:
my_seq = Seq("AGTACACTGGT")

In [14]:
my_seq

Seq('AGTACACTGGT', Alphabet())

In [15]:
my_seq.alphabet

Alphabet()

In [16]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)

In [17]:
my_seq

Seq('AGTACACTGGT', IUPACUnambiguousDNA())

In [18]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_prot = Seq("AGTACACTGGT", IUPAC.protein)
my_prot

Seq('AGTACACTGGT', IUPACProtein())

In [19]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCG", IUPAC.unambiguous_dna)
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))

0 G
1 A
2 T
3 C
4 G


In [20]:
print(len(my_seq))

5


In [21]:
print(my_seq[0]) # first letter
print(my_seq[2]) # third letter
print(my_seq[-1]) # last letter

G
T
G


In [22]:
from Bio.Seq import Seq
"AAAA".count("AA")

2

In [24]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
len(my_seq)
my_seq.count("G")
100 * float(my_seq.count("G") + my_seq.count("C")) / len(my_seq)

46.875

In [26]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GATCGATGGGCCAATATATAGGGATACGAAATCGT", IUPAC.unambiguous_dna)
GC(my_seq)

42.857142857142854

In [27]:
my_seq[0::3]

Seq('GCTGAATGTGAG', IUPACUnambiguousDNA())

In [28]:
my_seq[1::3]

Seq('AGGCATAGAATT', IUPACUnambiguousDNA())

In [29]:
my_seq[2::3]

Seq('TAGCTAGACAC', IUPACUnambiguousDNA())

In [30]:
my_seq[::1]

Seq('GATCGATGGGCCAATATATAGGGATACGAAATCGT', IUPACUnambiguousDNA())

In [31]:
str(my_seq)

'GATCGATGGGCCAATATATAGGGATACGAAATCGT'

In [32]:
print(my_seq)

GATCGATGGGCCAATATATAGGGATACGAAATCGT


In [33]:
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)

>Name
GATCGATGGGCCAATATATAGGGATACGAAATCGT



In [34]:
str(my_seq)

'GATCGATGGGCCAATATATAGGGATACGAAATCGT'

## Concatenating or adding sequences

In [35]:
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
protein_seq = Seq("EVRNAK", IUPAC.protein)
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)

In [36]:
from Bio.Alphabet import generic_alphabet
protein_seq.alphabet = generic_alphabet
dna_seq.alphabet = generic_alphabet

In [37]:
protein_seq + dna_seq

Seq('EVRNAKACGT', Alphabet())

In [38]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_nucleotide
from Bio.Alphabet import IUPAC
nuc_seq = Seq("GATCGATGC", generic_nucleotide)
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)

In [39]:
nuc_seq

Seq('GATCGATGC', NucleotideAlphabet())

In [40]:
dna_seq

Seq('ACGT', IUPACUnambiguousDNA())

In [41]:
nuc_seq + dna_seq

Seq('GATCGATGCACGT', NucleotideAlphabet())

In [42]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

In [43]:
list_of_seqs = [Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna)]
concatenated = Seq("", generic_dna)

In [44]:
for s in list_of_seqs:
    concatenated += s


In [45]:
concatenated

Seq('ACGTAACCGGTT', DNAAlphabet())

In [46]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

In [47]:
list_of_seqs = [Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna)]
sum(list_of_seqs, Seq("", generic_dna))

Seq('ACGTAACCGGTT', DNAAlphabet())

In [48]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
dna_seq = Seq("acgtACGT", generic_dna)
dna_seq

Seq('acgtACGT', DNAAlphabet())

In [49]:
dna_seq.upper()

Seq('ACGTACGT', DNAAlphabet())

In [50]:
"GCTA" in dna_seq

False

In [51]:
"GTAC" in dna_seq.upper()

True

In [52]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

In [53]:
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)
dna_seq

Seq('ACGT', IUPACUnambiguousDNA())

In [54]:
dna_seq.lower()

Seq('acgt', DNAAlphabet())

In [55]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
my_seq

Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPACUnambiguousDNA())

In [56]:
my_seq.complement()

Seq('CTAGCTACCCGGATATATCCTAGCTTTTAGCG', IUPACUnambiguousDNA())

In [57]:
my_seq.reverse_complement()

Seq('GCGATTTTCGATCCTATATAGGCCCATCGATC', IUPACUnambiguousDNA())

In [58]:
my_seq[::-1]

Seq('CGCTAAAAGCTAGGATATATCCGGGTAGCTAG', IUPACUnambiguousDNA())

In [59]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
protein_seq = Seq("EVRNAK", IUPAC.protein)


In [60]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGTAG", IUPAC.unambiguous_dna)
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGTAG', IUPACUnambiguousDNA())

In [61]:
template_dna = coding_dna.reverse_complement()

In [62]:
template_dna

Seq('CTACGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT', IUPACUnambiguousDNA())

In [63]:
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGTAG', IUPACUnambiguousDNA())

In [64]:
messenger_rna = coding_dna.transcribe()
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGUAG', IUPACUnambiguousRNA())

In [65]:
template_dna.reverse_complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGUAG', IUPACUnambiguousRNA())

In [66]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGUGAAAGGGGUGUCCCGAUAG", IUPAC.unambiguous_rna)
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGUGAAAGGGGUGUCCCGAUAG', IUPACUnambiguousRNA())

In [67]:
template_dna.reverse_complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGUAG', IUPACUnambiguousRNA())

In [68]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

In [69]:
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna)
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())

In [71]:
messenger_rna.back_transcribe()

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

In [72]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

## Translation

In [73]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAGGGUGCCCAUAG", IUPAC.unambiguous_rna)
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAGGGUGCCCAUAG', IUPACUnambiguousRNA())

In [74]:
messenger_rna.translate()



Seq('MAIVMGR*RVPI', HasStopCodon(IUPACProtein(), '*'))

In [75]:
coding_dna.translate(to_stop = True)



Seq('MAIVMGR', IUPACProtein())

In [76]:
coding_dna.translate(table = 2, to_stop = True)



Seq('MAIVMGRWKGAR', IUPACProtein())

In [77]:
coding_dna.translate(table = 2, stop_symbol = "@")



Seq('MAIVMGRWKGAR', IUPACProtein())

In [78]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
gene = Seq("GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGTCCCATCCCA" + \
          "GCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACGACAGGCGATCGTAGT" + \
          "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAACAT" + \
          "TATGAATGGCGAGGCAATCGCTGGCACCTACAGGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT" + \
          "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA",
          generic_dna)

In [79]:
gene.translate(table = "Bacterial")

Seq('VKKMQSIVLALSLVLVVPSQHRLRKLR*SRQ*NYDRRS**SWLLLGWRSLARPR...HR*', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [80]:
gene.translate(table = "Bacterial", to_stop = True)

Seq('VKKMQSIVLALSLVLVVPSQHRLRKLR', ExtendedIUPACProtein())

In [82]:
from Bio.Data import CodonTable

In [83]:
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]

In [85]:
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table = CodonTable.unambiguous_dna_by_id[2]

In [86]:
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [87]:
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   

In [88]:
mito_table.stop_codons

['TAA', 'TAG', 'AGA', 'AGG']

In [90]:
mito_table.start_codons

['ATT', 'ATC', 'ATA', 'ATG', 'GTG']

In [91]:
mito_table.forward_table["ACG"]

'T'

In [92]:
from Bio.Seq import Seq

In [93]:
from Bio.Alphabet import IUPAC

In [94]:
seq1 = Seq("ACGT", IUPAC.unambiguous_dna)

In [95]:
seq2 = Seq("ACGT", IUPAC.ambiguous_dna)

In [96]:
str(seq1) == str(seq2)

True

In [97]:
str(seq1) == str(seq1)

True

In [98]:
seq1 == "ACGT"

True

In [99]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein
dna_seq = Seq("ACGT", generic_dna)
prot_seq = Seq('ACGT', generic_protein)
dna_seq == prot_seq



True

In [101]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GCCATTCTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

In [102]:
mutable_seq = my_seq.tomutable()
mutable_seq

MutableSeq('GCCATTCTAATGGGCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [104]:
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

In [105]:
mutable_seq

MutableSeq('GCCATTCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [106]:
mutable_seq[5] = "C"

In [107]:
mutable_seq

MutableSeq('GCCATCCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [108]:
mutable_seq.remove("T")

In [109]:
mutable_seq

MutableSeq('GCCACCAGTTCCTGGGCCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [110]:
mutable_seq.reverse()

In [111]:
mutable_seq

MutableSeq('AGCCCGTGGGAAAGTCGCCCGGGTCCTTGACCACCG', IUPACUnambiguousDNA())

In [112]:
new_seq = mutable_seq.toseq()

In [113]:
new_seq

Seq('AGCCCGTGGGAAAGTCGCCCGGGTCCTTGACCACCG', IUPACUnambiguousDNA())

In [115]:
from Bio.Seq import UnknownSeq

In [116]:
from Bio.Alphabet import IUPAC

In [117]:
unk_dna = UnknownSeq(20, alphabet = IUPAC.ambiguous_dna)

In [118]:
unk_dna

UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N')

In [119]:
print(unk_dna)

NNNNNNNNNNNNNNNNNNNN


In [120]:
unk_dna

UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N')

In [121]:
unk_dna.complement()

UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N')

In [123]:
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate

In [124]:
my_string = 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'

In [125]:
reverse_complement(my_string)

'CTAACCAGCAGCACGACCACCCTTCCAACGACCCATAACAGC'

In [126]:
transcribe(my_string)

'GCUGUUAUGGGUCGUUGGAAGGGUGGUCGUGCUGCUGGUUAG'

In [127]:
back_transcribe(my_string)

'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'

In [128]:
translate(my_string)

'AVMGRWKGGRAAG*'

In [129]:
from Bio.Seq import Seq
simple_seq = Seq("GATC")
from Bio.SeqRecord import SeqRecord
simple_seq_r = SeqRecord(simple_seq)

In [130]:
simple_seq_r.id

'<unknown id>'

In [131]:
simple_seq_r.id = "AC12345"

In [132]:
simple_seq_r.description = "Made up sequence I wish I could write a paper about"

In [133]:
print(simple_seq_r.seq)

GATC


In [134]:
simple_seq_r.seq

Seq('GATC', Alphabet())

In [135]:
from Bio.Seq import Seq

In [136]:
simple_seq = Seq("GATC")

In [137]:
from Bio.SeqRecord import SeqRecord

In [138]:
simple_seq_r = SeqRecord(simple_seq, id = "AC12345")

In [139]:
simple_seq_r.description = "Made up sequence I wish I could write a paper about"

In [140]:
print(simple_seq_r.description)

Made up sequence I wish I could write a paper about


In [141]:
simple_seq_r.seq

Seq('GATC', Alphabet())

In [142]:
from Bio.Seq import Seq

In [143]:
simple_seq = Seq("GATC")

In [144]:
from Bio.SeqRecord import SeqRecord

In [146]:
simple_seq_r = SeqRecord(simple_seq, id = "AC12345")

In [147]:
simple_seq_r.annotations["evidence"] = "None. I just made it up."

In [148]:
print(simple_seq_r)

ID: AC12345
Name: <unknown name>
Description: <unknown description>
Number of features: 0
/evidence=None. I just made it up.
Seq('GATC', Alphabet())


In [149]:
from Bio import SeqIO
record = SeqIO.read("data/NC_005816.fna", "fasta")

In [150]:
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG', SingleLetterAlphabet()), id='gi|45478711|ref|NC_005816.1|', name='gi|45478711|ref|NC_005816.1|', description='gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=[])

In [151]:
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG', SingleLetterAlphabet())

In [152]:
record.id

'gi|45478711|ref|NC_005816.1|'

In [153]:
record.name

'gi|45478711|ref|NC_005816.1|'

In [154]:
record.description

'gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence'

In [155]:
from Bio import SeqIO
record = SeqIO.read("data/NC_005816.gb", "genbank")

In [156]:
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG', IUPACAmbiguousDNA()), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.', dbxrefs=['Project:58037'])

In [157]:
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG', IUPACAmbiguousDNA())

In [158]:
record.id

'NC_005816.1'

In [159]:
record.name

'NC_005816'

In [160]:
record.description

'Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.'

In [161]:
record.letter_annotations

{}

In [162]:
len(record.annotations)

11

In [163]:
record.annotations["source"]

'Yersinia pestis biovar Microtus str. 91001'

In [164]:
record.dbxrefs

['Project:58037']

In [165]:
len(record.features)

41

In [166]:
from Bio import SeqFeature
start_pos = SeqFeature.AfterPosition(5)

In [167]:
end_pos = SeqFeature.BetweenPosition(9, left = 8, right = 9)
my_location = SeqFeature.FeatureLocation(start_pos, end_pos)

In [168]:
print(my_location)

[>5:(8^9)]


In [169]:
my_location.start

AfterPosition(5)

In [170]:
print(my_location.start)

>5


In [171]:
my_location.end

BetweenPosition(9, left=8, right=9)

In [172]:
print(my_location.end)

(8^9)


In [173]:
int(my_location.start)

5

In [174]:
int(my_location.end)

9

In [175]:
my_location.nofuzzy_start

5

In [176]:
my_location.nofuzzy_end

9

In [177]:
exact_location = SeqFeature.FeatureLocation(5, 9)
print(exact_location)

[5:9]


In [178]:
exact_location.start

ExactPosition(5)

In [179]:
int(exact_location.start)

5

In [180]:
exact_location.nofuzzy_start

5

In [181]:
from Bio import SeqIO
my_snp = 4350

In [182]:
record = SeqIO.read("data/NC_005816.gb", "genbank")
for feature in record.features:
    if my_snp in feature:
        print("%s %s" % (feature.type, feature.qualifiers.get('db_xref')))

source ['taxon:229193']
gene ['GeneID:2767712']
CDS ['GI:45478716', 'GeneID:2767712']


In [183]:
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
example_parent = Seq("ACCGAGAGGGGCCGGCCGGGGCGAGGAGACTTCCTTCTTGCCGTGCTAGGGAATGGGAGCCTAGC")
example_feature = SeqFeature(FeatureLocation(5, 18), type = "gene", strand = -1)

In [184]:
feature_seq = example_parent[example_feature.location.start:example_feature.location.end].reverse_complement()

In [186]:
print(feature_seq)

CGGCCGGCCCCTC


In [187]:
print(example_feature.extract(example_parent))

CGGCCGGCCCCTC


In [189]:
print(len(example_feature.extract(example_parent)))

13


In [190]:
print(len(example_feature))

13


In [191]:
print(len(example_feature.location))

13


In [192]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [193]:
record1 = SeqRecord(Seq("ACGT"), id = 'test')
record2 = SeqRecord(Seq('ACGT'), id = "test")

In [194]:
record1 == record2

False

In [195]:
record1.id == record2.id

True

In [196]:
record1.seq == record2.seq

True

In [197]:
from Bio import SeqIO

In [198]:
record = SeqIO.read("data/NC_005816.gb", "genbank")

In [199]:
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG', IUPACAmbiguousDNA()), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.', dbxrefs=['Project:58037'])

In [200]:
from Bio import SeqIO
record = SeqIO.read("data/NC_005816.gb", "genbank")

In [201]:
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG', IUPACAmbiguousDNA()), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.', dbxrefs=['Project:58037'])

In [202]:
len(record)

9609

In [203]:
len(record.features)

41

In [204]:
print(record.features[20])

type: gene
location: [4342:4780](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']



In [205]:
print(record.features[21])

type: CDS
location: [4342:4780](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GI:45478716', 'GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']
    Key: note, Value: ['similar to many previously sequenced pesticin immunity protein entries of Yersinia pestis plasmid pPCP, e.g. gi| 16082683|,ref|NP_395230.1| (NC_003132) , gi|1200166|emb|CAA90861.1| (Z54145 ) , gi|1488655| emb|CAA63439.1| (X92856) , gi|2996219|gb|AAC62543.1| (AF053945) , and gi|5763814|emb|CAB531 67.1| (AL109969)']
    Key: product, Value: ['pesticin immunity protein']
    Key: protein_id, Value: ['NP_995571.1']
    Key: transl_table, Value: ['11']
    Key: translation, Value: ['MGGGMISKLFCLALIFLSSSGLAEKNTYTAKDILQNLELNTFGNSLSHGIYGKQTTFKQTEFTNIKSNTKKHIALINKDNSWMISLKILGIKRDEYTVCFEDFSLIRPPTYVAIHPLLIKKVKSGNFIVVKEIKKSIPGCTVYYH']



In [206]:
sub_record = record[4300:4800]

In [207]:
sub_record

SeqRecord(seq=Seq('ATAAATAGATTATTCCAAATAATTTATTTATGTAAGAACAGGATGGGAGGGGGA...TTA', IUPACAmbiguousDNA()), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.', dbxrefs=[])

In [208]:
len(sub_record)

500

In [209]:
len(sub_record.features)

2

In [210]:
print(sub_record.features)

[SeqFeature(FeatureLocation(ExactPosition(42), ExactPosition(480), strand=1), type='gene'), SeqFeature(FeatureLocation(ExactPosition(42), ExactPosition(480), strand=1), type='CDS')]


In [211]:
print(sub_record.features[0])

type: gene
location: [42:480](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']



In [214]:
print(sub_record.features[1])

type: CDS
location: [42:480](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GI:45478716', 'GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']
    Key: note, Value: ['similar to many previously sequenced pesticin immunity protein entries of Yersinia pestis plasmid pPCP, e.g. gi| 16082683|,ref|NP_395230.1| (NC_003132) , gi|1200166|emb|CAA90861.1| (Z54145 ) , gi|1488655| emb|CAA63439.1| (X92856) , gi|2996219|gb|AAC62543.1| (AF053945) , and gi|5763814|emb|CAB531 67.1| (AL109969)']
    Key: product, Value: ['pesticin immunity protein']
    Key: protein_id, Value: ['NP_995571.1']
    Key: transl_table, Value: ['11']
    Key: translation, Value: ['MGGGMISKLFCLALIFLSSSGLAEKNTYTAKDILQNLELNTFGNSLSHGIYGKQTTFKQTEFTNIKSNTKKHIALINKDNSWMISLKILGIKRDEYTVCFEDFSLIRPPTYVAIHPLLIKKVKSGNFIVVKEIKKSIPGCTVYYH']



In [215]:
sub_record.id

'NC_005816.1'

In [216]:
sub_record.name

'NC_005816'

In [217]:
sub_record.description

'Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.'

In [220]:
from Bio import SeqIO
record = next(SeqIO.parse("data/example.fastq", "fastq"))

In [221]:
len(record)

25

In [222]:
print(record.seq)

CCCTTCTTGTCTTCAGCGTTTCTCC


In [224]:
print(record.letter_annotations["phred_quality"])

[26, 26, 18, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 26, 26, 26, 26, 26, 26, 26, 23, 23]


In [225]:
left = record[:20]

In [226]:
print(left.seq)

CCCTTCTTGTCTTCAGCGTT


In [227]:
print(left.letter_annotations["phred_quality"])

[26, 26, 18, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 26, 26, 26, 26]


In [228]:
right = record[21:]

In [229]:
print(right.seq)

CTCC


In [230]:
print(right.letter_annotations["phred_quality"])

[26, 26, 23, 23]


In [231]:
edited = left + right

In [232]:
len(edited)

24

In [233]:
print(edited.seq)

CCCTTCTTGTCTTCAGCGTTCTCC


In [234]:
print(edited.letter_annotations["phred_quality"])

[26, 26, 18, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 26, 26, 26, 26, 26, 26, 23, 23]


In [235]:
edited = record[:20] + record[21:]

In [237]:
from Bio import SeqIO
record = SeqIO.read("data/NC_005816.gb", "genbank")
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG', IUPACAmbiguousDNA()), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.', dbxrefs=['Project:58037'])

In [238]:
len(record)

9609

In [239]:
len(record.features)

41

In [240]:
record.dbxrefs

['Project:58037']

In [241]:
record.annotations.keys()

dict_keys(['comment', 'source', 'organism', 'keywords', 'references', 'accessions', 'data_file_division', 'taxonomy', 'gi', 'sequence_version', 'date'])

In [242]:
shifted = record[2000:] + record[:2000]

In [243]:
shifted

SeqRecord(seq=Seq('GATACGCAGTCATATTTTTTACACAATTCTCTAATCCCGACAAGGTCGTAGGTC...GGA', IUPACAmbiguousDNA()), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.', dbxrefs=[])

In [244]:
len(shifted)

9609

In [245]:
len(shifted.features)

40

In [246]:
shifted.dbxrefs

[]

In [247]:
shifted.annotations.keys()

dict_keys([])

In [248]:
shifted_dbxrefs = record.dbxrefs[:]

In [249]:
shifted.annotations = record.annotations.copy()

In [250]:
shifted.dbxrefs

[]

## Reverse-complementing SeqRecord objects

In [251]:
from Bio import SeqIO
record = SeqIO.read("data/NC_005816.gb", "genbank")
print("%s %i %i %i %i" % (record.id, len(record), len(record.features), len(record.dbxrefs), len(record.annotations)))

NC_005816.1 9609 41 1 11


In [252]:
rc = record.reverse_complement(id = "TESTING")

In [254]:
from Bio import SeqIO
for seq_record in SeqIO.parse("data/ls_orchid.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', SingleLetterAlphabet())
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', SingleLetterAlphabet())
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', SingleLetterAlphabet())
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', SingleLetterAlphabet())
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', SingleLetterAlphabet())
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', SingleLetterAlphabet())
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', SingleLetterAlphabet())
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GC

In [255]:
from Bio import SeqIO
for seq_record in SeqIO.parse("data/ls_orchid.gbk", "genbank"):
    print(seq_record.id)
    print(seq_record.seq)
    print(len(seq_record))

Z78533.1
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCCTTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGGGGCACCCGCTGAGTTTACGC
740
Z78532.1
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAGAATATATGATCGAGTGAATCTGGAGGACCTGTGGTAACTCAGCTCGTCGTGGCACTGCTTTTGTCGTGACCCTGCTTTGTTGTTGGGCCTCCTCAAGAGCTTTCATGGCAGGTTTGAACTTTAGTACGGTGCAGTTTGCGCCAAGTCATATAAAGCATCACTGATGAATGACATTATTGTCAG

In [258]:
from Bio import SeqIO
identifiers = [seq_record.id for seq_record in SeqIO.parse("data/ls_orchid.gbk", "genbank")]
identifiers

['Z78533.1',
 'Z78532.1',
 'Z78531.1',
 'Z78530.1',
 'Z78529.1',
 'Z78527.1',
 'Z78526.1',
 'Z78525.1',
 'Z78524.1',
 'Z78523.1',
 'Z78522.1',
 'Z78521.1',
 'Z78520.1',
 'Z78519.1',
 'Z78518.1',
 'Z78517.1',
 'Z78516.1',
 'Z78515.1',
 'Z78514.1',
 'Z78513.1',
 'Z78512.1',
 'Z78511.1',
 'Z78510.1',
 'Z78509.1',
 'Z78508.1',
 'Z78507.1',
 'Z78506.1',
 'Z78505.1',
 'Z78504.1',
 'Z78503.1',
 'Z78502.1',
 'Z78501.1',
 'Z78500.1',
 'Z78499.1',
 'Z78498.1',
 'Z78497.1',
 'Z78496.1',
 'Z78495.1',
 'Z78494.1',
 'Z78493.1',
 'Z78492.1',
 'Z78491.1',
 'Z78490.1',
 'Z78489.1',
 'Z78488.1',
 'Z78487.1',
 'Z78486.1',
 'Z78485.1',
 'Z78484.1',
 'Z78483.1',
 'Z78482.1',
 'Z78481.1',
 'Z78480.1',
 'Z78479.1',
 'Z78478.1',
 'Z78477.1',
 'Z78476.1',
 'Z78475.1',
 'Z78474.1',
 'Z78473.1',
 'Z78472.1',
 'Z78471.1',
 'Z78470.1',
 'Z78469.1',
 'Z78468.1',
 'Z78467.1',
 'Z78466.1',
 'Z78465.1',
 'Z78464.1',
 'Z78463.1',
 'Z78462.1',
 'Z78461.1',
 'Z78460.1',
 'Z78459.1',
 'Z78458.1',
 'Z78457.1',
 'Z78456.1',

In [259]:
from Bio import SeqIO
record_iterator = SeqIO.parse("data/ls_orchid.fasta", "fasta")
first_record = next(record_iterator)
print(first_record.id)
print(first_record.description)

gi|2765658|emb|Z78533.1|CIZ78533
gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA


In [260]:
second_record = next(record_iterator)
print(second_record.id)
print(second_record.description)

gi|2765657|emb|Z78532.1|CCZ78532
gi|2765657|emb|Z78532.1|CCZ78532 C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA


In [261]:
from Bio import SeqIO
records = list(SeqIO.parse("data/ls_orchid.gbk", "genbank"))
print("Found %i records" % len(records))
print("The last record")
last_record = records[-1] # using python 's list tricks
print(last_record.id)
print(repr(last_record.seq))
print(len(last_record))

Found 94 records
The last record
Z78439.1
Seq('CATTGTTGAGATCACATAATAATTGATCGAGTTAATCTGGAGGATCTGTTTACT...GCC', IUPACAmbiguousDNA())
592


In [262]:
print("Thie first record")
first_record = records[0] # remember, Python counts from zero
print(first_record.id)
print(repr(first_record.seq))
print(len(first_record))

Thie first record
Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())
740


In [264]:
from Bio import SeqIO
record_iterator = SeqIO.parse("data/ls_orchid.gbk", "genbank")
first_record = next(record_iterator)
print(first_record)

ID: Z78533.1
Name: Z78533
Description: C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA.
Number of features: 5
/source=Cypripedium irapeanum
/organism=Cypripedium irapeanum
/keywords=['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2']
/references=[Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)]
/accessions=['Z78533']
/data_file_division=PLN
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium']
/gi=2765658
/sequence_version=1
/date=30-NOV-2006
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())


In [265]:
print(first_record.annotations)

{'source': 'Cypripedium irapeanum', 'organism': 'Cypripedium irapeanum', 'keywords': ['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2'], 'references': [Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)], 'accessions': ['Z78533'], 'data_file_division': 'PLN', 'taxonomy': ['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium'], 'gi': '2765658', 'sequence_version': 1, 'date': '30-NOV-2006'}


In [266]:
print(first_record.annotations)
print(first_record.annotations.values())

{'source': 'Cypripedium irapeanum', 'organism': 'Cypripedium irapeanum', 'keywords': ['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2'], 'references': [Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)], 'accessions': ['Z78533'], 'data_file_division': 'PLN', 'taxonomy': ['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium'], 'gi': '2765658', 'sequence_version': 1, 'date': '30-NOV-2006'}
dict_values(['Cypripedium irapeanum', 'Cypripedium irapeanum', ['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2'], [Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)], ['Z7853

In [267]:
from Bio import SeqIO
all_species = [seq_record.annotations["organism"] for seq_record in \
              SeqIO.parse("data/ls_orchid.gbk", "genbank")]
print(all_species)

['Cypripedium irapeanum', 'Cypripedium californicum', 'Cypripedium fasciculatum', 'Cypripedium margaritaceum', 'Cypripedium lichiangense', 'Cypripedium yatabeanum', 'Cypripedium guttatum', 'Cypripedium acaule', 'Cypripedium formosanum', 'Cypripedium himalaicum', 'Cypripedium macranthon', 'Cypripedium calceolus', 'Cypripedium segawai', 'Cypripedium parviflorum var. pubescens', 'Cypripedium reginae', 'Cypripedium flavum', 'Cypripedium passerinum', 'Mexipedium xerophyticum', 'Phragmipedium schlimii', 'Phragmipedium besseae', 'Phragmipedium wallisii', 'Phragmipedium exstaminodium', 'Phragmipedium caricinum', 'Phragmipedium pearcei', 'Phragmipedium longifolium', 'Phragmipedium lindenii', 'Phragmipedium lindleyanum', 'Phragmipedium sargentianum', 'Phragmipedium kaiteurum', 'Phragmipedium czerwiakowianum', 'Phragmipedium boissierianum', 'Phragmipedium caudatum', 'Phragmipedium warszewiczianum', 'Paphiopedilum micranthum', 'Paphiopedilum malipoense', 'Paphiopedilum delenatii', 'Paphiopedilum a

In [269]:
from Bio import SeqIO
all_species = []
for seq_record in SeqIO.parse("data/ls_orchid.fasta", "fasta"):
    all_species.append(seq_record.description.split()[1])
print(all_species)

['C.irapeanum', 'C.californicum', 'C.fasciculatum', 'C.margaritaceum', 'C.lichiangense', 'C.yatabeanum', 'C.guttatum', 'C.acaule', 'C.formosanum', 'C.himalaicum', 'C.macranthum', 'C.calceolus', 'C.segawai', 'C.pubescens', 'C.reginae', 'C.flavum', 'C.passerinum', 'M.xerophyticum', 'P.schlimii', 'P.besseae', 'P.wallisii', 'P.exstaminodium', 'P.caricinum', 'P.pearcei', 'P.longifolium', 'P.lindenii', 'P.lindleyanum', 'P.sargentianum', 'P.kaiteurum', 'P.czerwiakowianum', 'P.boissierianum', 'P.caudatum', 'P.warszewiczianum', 'P.micranthum', 'P.malipoense', 'P.delenatii', 'P.armeniacum', 'P.emersonii', 'P.niveum', 'P.godefroyae', 'P.bellatulum', 'P.concolor', 'P.fairrieanum', 'P.druryi', 'P.tigrinum', 'P.hirsutissimum', 'P.barbigerum', 'P.henryanum', 'P.charlesworthii', 'P.villosum', 'P.exul', 'P.insigne', 'P.gratrixianum', 'P.primulinum', 'P.victoria', 'P.victoria', 'P.glaucophyllum', 'P.supardii', 'P.kolopakingii', 'P.sanderianum', 'P.lowii', 'P.dianthum', 'P.parishii', 'P.haynaldianum', 'P

In [270]:
from Bio import SeqIO
print(sum(len(r) for r in SeqIO.parse("data/ls_orchid.gbk", "gb")))

67518


In [273]:
from Bio import SeqIO
with open("data/ls_orchid.gbk") as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, "gb")))

67518


In [274]:
handle.close()

In [276]:
import bz2

In [278]:
from Bio import ExPASy
from Bio import SeqIO
handle = ExPASy.get_sprot_raw("O23729")
seq_record = SeqIO.read(handle, "swiss")
handle.close()
print(seq_record.id)
print(seq_record.name)
print(seq_record.description)
print(repr(seq_record.seq))
print("Length %i" % len(seq_record))
print(seq_record.annotations["keywords"])

O23729
CHS3_BROFI
RecName: Full=Chalcone synthase 3; EC=2.3.1.74; AltName: Full=Naringenin-chalcone synthase 3;
Seq('MAPAMEEIRQAQRAEGPAAVLAIGTSTPPNALYQADYPDYYFRITKSEHLTELK...GAE', ProteinAlphabet())
Length 394
['Acyltransferase', 'Flavonoid biosynthesis', 'Transferase']


In [279]:
from Bio import SeqIO
orchid_dict = SeqIO.to_dict(SeqIO.parse("data/ls_orchid.gbk", "genbank"))
len(orchid_dict)

94

In [280]:
list(orchid_dict.keys())

['Z78478.1',
 'Z78439.1',
 'Z78440.1',
 'Z78454.1',
 'Z78475.1',
 'Z78514.1',
 'Z78461.1',
 'Z78455.1',
 'Z78453.1',
 'Z78469.1',
 'Z78516.1',
 'Z78519.1',
 'Z78441.1',
 'Z78473.1',
 'Z78481.1',
 'Z78456.1',
 'Z78464.1',
 'Z78521.1',
 'Z78498.1',
 'Z78487.1',
 'Z78445.1',
 'Z78517.1',
 'Z78442.1',
 'Z78491.1',
 'Z78471.1',
 'Z78466.1',
 'Z78486.1',
 'Z78490.1',
 'Z78458.1',
 'Z78488.1',
 'Z78532.1',
 'Z78476.1',
 'Z78526.1',
 'Z78509.1',
 'Z78444.1',
 'Z78449.1',
 'Z78497.1',
 'Z78447.1',
 'Z78494.1',
 'Z78507.1',
 'Z78518.1',
 'Z78489.1',
 'Z78523.1',
 'Z78480.1',
 'Z78505.1',
 'Z78531.1',
 'Z78512.1',
 'Z78467.1',
 'Z78474.1',
 'Z78479.1',
 'Z78525.1',
 'Z78483.1',
 'Z78511.1',
 'Z78510.1',
 'Z78502.1',
 'Z78520.1',
 'Z78496.1',
 'Z78515.1',
 'Z78522.1',
 'Z78504.1',
 'Z78530.1',
 'Z78452.1',
 'Z78446.1',
 'Z78443.1',
 'Z78477.1',
 'Z78472.1',
 'Z78468.1',
 'Z78460.1',
 'Z78508.1',
 'Z78459.1',
 'Z78501.1',
 'Z78493.1',
 'Z78463.1',
 'Z78485.1',
 'Z78470.1',
 'Z78495.1',
 'Z78450.1',

In [281]:
seq_record = orchid_dict["Z78484.1"]
print(seq_record.description)
print(repr(seq_record.seq))

P.charlesworthii 5.8S rRNA gene and ITS1 and ITS2 DNA.
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGGGGAAGGATCATTGTTGAGATCACAT...TTT', IUPACAmbiguousDNA())


In [282]:
from Bio import SeqIO
orchid_dict = SeqIO.to_dict(SeqIO.parse("data/ls_orchid.fasta", "fasta"))
print(orchid_dict.keys())

dict_keys(['gi|2765571|emb|Z78446.1|PAZ78446', 'gi|2765583|emb|Z78458.1|PHZ78458', 'gi|2765604|emb|Z78479.1|PPZ78479', 'gi|2765599|emb|Z78474.1|PKZ78474', 'gi|2765644|emb|Z78519.1|CPZ78519', 'gi|2765606|emb|Z78481.1|PIZ78481', 'gi|2765611|emb|Z78486.1|PBZ78486', 'gi|2765620|emb|Z78495.1|PEZ78495', 'gi|2765580|emb|Z78455.1|PJZ78455', 'gi|2765600|emb|Z78475.1|PSZ78475', 'gi|2765589|emb|Z78464.1|PGZ78464', 'gi|2765633|emb|Z78508.1|PLZ78508', 'gi|2765631|emb|Z78506.1|PLZ78506', 'gi|2765591|emb|Z78466.1|PPZ78466', 'gi|2765615|emb|Z78490.1|PFZ78490', 'gi|2765648|emb|Z78523.1|CHZ78523', 'gi|2765656|emb|Z78531.1|CFZ78531', 'gi|2765574|emb|Z78449.1|PMZ78449', 'gi|2765605|emb|Z78480.1|PGZ78480', 'gi|2765567|emb|Z78442.1|PBZ78442', 'gi|2765630|emb|Z78505.1|PSZ78505', 'gi|2765576|emb|Z78451.1|PHZ78451', 'gi|2765608|emb|Z78483.1|PVZ78483', 'gi|2765652|emb|Z78527.1|CYZ78527', 'gi|2765594|emb|Z78469.1|PHZ78469', 'gi|2765638|emb|Z78513.1|PBZ78513', 'gi|2765641|emb|Z78516.1|CPZ78516', 'gi|2765646|emb|Z

In [283]:
def get_accession(record):
    """"Given a SeqRecord, return the accession number as a string.

    e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
    """
    parts = record.id.split("|")
    assert len(parts) == 5 and parts[0] == "gi" and parts[2] == "emb"
    return parts[3]


In [284]:
from Bio import SeqIO
orchid_dict = SeqIO.to_dict(SeqIO.parse("data/ls_orchid.fasta", "fasta"), key_function=get_accession)
print(orchid_dict.keys())


dict_keys(['Z78478.1', 'Z78439.1', 'Z78440.1', 'Z78454.1', 'Z78475.1', 'Z78514.1', 'Z78461.1', 'Z78455.1', 'Z78453.1', 'Z78469.1', 'Z78516.1', 'Z78519.1', 'Z78441.1', 'Z78473.1', 'Z78481.1', 'Z78456.1', 'Z78464.1', 'Z78521.1', 'Z78498.1', 'Z78487.1', 'Z78445.1', 'Z78517.1', 'Z78442.1', 'Z78491.1', 'Z78471.1', 'Z78466.1', 'Z78486.1', 'Z78490.1', 'Z78458.1', 'Z78488.1', 'Z78532.1', 'Z78476.1', 'Z78526.1', 'Z78509.1', 'Z78444.1', 'Z78449.1', 'Z78497.1', 'Z78447.1', 'Z78494.1', 'Z78507.1', 'Z78518.1', 'Z78489.1', 'Z78523.1', 'Z78480.1', 'Z78505.1', 'Z78531.1', 'Z78512.1', 'Z78467.1', 'Z78474.1', 'Z78479.1', 'Z78525.1', 'Z78483.1', 'Z78511.1', 'Z78510.1', 'Z78502.1', 'Z78520.1', 'Z78496.1', 'Z78515.1', 'Z78522.1', 'Z78504.1', 'Z78530.1', 'Z78452.1', 'Z78446.1', 'Z78443.1', 'Z78477.1', 'Z78472.1', 'Z78468.1', 'Z78460.1', 'Z78508.1', 'Z78459.1', 'Z78501.1', 'Z78493.1', 'Z78463.1', 'Z78485.1', 'Z78470.1', 'Z78495.1', 'Z78450.1', 'Z78492.1', 'Z78506.1', 'Z78462.1', 'Z78500.1', 'Z78524.1', 'Z785

In [285]:
print(orchid_dict.keys())

dict_keys(['Z78478.1', 'Z78439.1', 'Z78440.1', 'Z78454.1', 'Z78475.1', 'Z78514.1', 'Z78461.1', 'Z78455.1', 'Z78453.1', 'Z78469.1', 'Z78516.1', 'Z78519.1', 'Z78441.1', 'Z78473.1', 'Z78481.1', 'Z78456.1', 'Z78464.1', 'Z78521.1', 'Z78498.1', 'Z78487.1', 'Z78445.1', 'Z78517.1', 'Z78442.1', 'Z78491.1', 'Z78471.1', 'Z78466.1', 'Z78486.1', 'Z78490.1', 'Z78458.1', 'Z78488.1', 'Z78532.1', 'Z78476.1', 'Z78526.1', 'Z78509.1', 'Z78444.1', 'Z78449.1', 'Z78497.1', 'Z78447.1', 'Z78494.1', 'Z78507.1', 'Z78518.1', 'Z78489.1', 'Z78523.1', 'Z78480.1', 'Z78505.1', 'Z78531.1', 'Z78512.1', 'Z78467.1', 'Z78474.1', 'Z78479.1', 'Z78525.1', 'Z78483.1', 'Z78511.1', 'Z78510.1', 'Z78502.1', 'Z78520.1', 'Z78496.1', 'Z78515.1', 'Z78522.1', 'Z78504.1', 'Z78530.1', 'Z78452.1', 'Z78446.1', 'Z78443.1', 'Z78477.1', 'Z78472.1', 'Z78468.1', 'Z78460.1', 'Z78508.1', 'Z78459.1', 'Z78501.1', 'Z78493.1', 'Z78463.1', 'Z78485.1', 'Z78470.1', 'Z78495.1', 'Z78450.1', 'Z78492.1', 'Z78506.1', 'Z78462.1', 'Z78500.1', 'Z78524.1', 'Z785

In [286]:
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
for record in SeqIO.parse("data/ls_orchid.gbk", "genbank"):
    print(record.id, seguid(record.seq))


Z78533.1 JUEoWn6DPhgZ9nAyowsgtoD9TTo
Z78532.1 MN/s0q9zDoCVEEc+k/IFwCNF2pY
Z78531.1 xN45pACrTnmBH8a8Y9cWSgoLrwE
Z78530.1 yMhI5UUQfFOPcoJXb9B19XUyYlY
Z78529.1 s1Pnjq9zoSHoI/CG9jQr4GyeMZY
Z78527.1 MRf6S1OYhtbdPVS845oCmLTqMgo
Z78526.1 QCDzCtL6AKuc+h4UQDD6wFjz3Vs
Z78525.1 Zh/FImuuDRmmM/5fXaCYAHS7wo0
Z78524.1 hKw0C1fPNpi2KUM6iV0/8IadTX0
Z78523.1 71sZ82r6eAOBpwZs2solBr/biUs
Z78522.1 /8BBi+Dm0kpdHcYREmyKKbQSzKQ
Z78521.1 VrTEM/VCul51xFo2OBDsZKgiFTI
Z78520.1 //QmLx77H51zTbd8LR+71pXY9ew
Z78519.1 rb1CrF1Gbr6UDMLeRLPOWZaKIBc
Z78518.1 38aHi4XYVQUZkydCrxSVn2SYmCM
Z78517.1 swq+/4vgyWJssxu11Lcx+2IxNC4
Z78516.1 +CFmEqzVnCluPe39ytjiLRL7dvU
Z78515.1 ZG15PBF2qqDhj75CGTDiRAuT8Is
Z78514.1 Rvx7kJnPQJKgBZzNZ5dx+CnksgY
Z78513.1 IjG6hja6MS3M/CzKI9AuxFnjFxc
Z78512.1 EqkQVKx2qD2+qLp/W6hwQ1GjKo8
Z78511.1 cnm0fCmTFbHMlTN6t7i/ee4ydDc
Z78510.1 GVd6GFv7uF0uq66u4gZB95scKFY
Z78509.1 tS2JwZWXudMwdlyJBXKxjm/G1Ik
Z78508.1 e85/JrwCy5T7J+zptVbhBw8govc
Z78507.1 +pQOTxhX/lW2fFWnZ7hmBDbLz9w
Z78506.1 AUA6LlNQUBe4Vtui414zw6evNQY
Z

In [289]:
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
seguid_dict = SeqIO.to_dict(SeqIO.parse("data/ls_orchid.gbk", "genbank"),
                           lambda rec : seguid(rec.seq))
record = seguid_dict["kFkgs9JK3psvuvwaD0rp9iCRcdY"]
print(record.id)
print(record.description)

Z78440.1
P.purpuratum 5.8S rRNA gene and ITS1 and ITS2 DNA.


In [290]:
from Bio import SeqIO
orchid_dict = SeqIO.index("data/ls_orchid.gbk", "genbank")
len(orchid_dict)

94

In [291]:
orchid_dict.keys()

<dict_keyiterator at 0x104c04908>

In [292]:
seq_record = orchid_dict["Z78475.1"]

In [293]:
print(seq_record.description)

P.supardii 5.8S rRNA gene and ITS1 and ITS2 DNA.


In [294]:
seq_record.seq

Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACAT...GGT', IUPACAmbiguousDNA())

In [295]:
from Bio import SeqIO
orchid_dict = SeqIO.index("data/ls_orchid.gbk", "genbank")
len(orchid_dict)

94

In [296]:
orchid_dict.keys()

<dict_keyiterator at 0x104c2ec78>

In [297]:
def get_acc(identifier):
    """"Given a SeqRecord identifier string, return the accession number as a string.

    e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
    """
    parts = identifier.split("|")
    assert len(parts) == 5 and parts[0] == "gi" and parts[2] == "emb"
    return parts[3]


In [298]:
from Bio import SeqIO
orchid_dict = SeqIO.index("data/ls_orchid.fasta", "fasta", key_function = get_acc)
print(orchid_dict.keys())

<dict_keyiterator object at 0x104c049f8>


In [299]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein

rec1 = SeqRecord(Seq("MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD" \
                    +"GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK" \
                    +"NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM" \
                    +"SSAC", generic_protein),
                 id="gi|14150838|gb|AAK54648.1|AF376133_1",
                 description="chalcone synthase [Cucumis sativus]")

rec2 = SeqRecord(Seq("YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ" \
                    +"DMVVVEIPKLGKEAAVKAIKEWGQ", generic_protein),
                 id="gi|13919613|gb|AAK33142.1|",
                 description="chalcone synthase [Fragaria vesca subsp. bracteata]")

rec3 = SeqRecord(Seq("MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC" \
                    +"EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP" \
                    +"KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN" \
                    +"NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV" \
                    +"SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW" \
                    +"IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT" \
                    +"TGEGLEWGVLFGFGPGLTVETVVLHSVAT", generic_protein),
                 id="gi|13925890|gb|AAK49457.1|",
                 description="chalcone synthase [Nicotiana tabacum]")

my_records = [rec1, rec2, rec3]


In [300]:
from Bio import SeqIO
SeqIO.write(my_records, "my_example.fasta", "fasta")

3

In [301]:
from Bio import SeqIO
records = SeqIO.parse("data/ls_orchid.gbk", "genbank")
count = SeqIO.write(records, "my_example.fasta", "fasta")
print("Converted %i records" % count)

Converted 94 records


In [302]:
from Bio import SeqIO
count = SeqIO.convert("data/ls_orchid.gbk", "genbank", "my_example.fasta", "fasta")
print("Converted %i records" % count)

Converted 94 records


In [303]:
from Bio import SeqIO
for record in SeqIO.parse("data/ls_orchid.gbk", "genbank"):
    print(record.id)
    print(record.seq.reverse_complement())

Z78533.1
GCGTAAACTCAGCGGGTGCCCCCGCCTGACCTGGGGTCACATCCGAATGGCGGTCAACCGCCCTCCATTGGGGTTCGGAAGGGTTCTCCTGCCTGTCCGACAAGCACGACAACATGGGGGATTCGCACGGCAGCTGCTGCCAGCATCCGTCCACCTCTTGCCGGGTTCCGGGCCATCAAAACACCAGCTCTTGGACCCGCCGCACCTAGGCACAAGGGGCCAATCTTTCACATCCGCACCACGCCGGCCTGGCTGTATGCCGGGCAAGCATTGGCAGGAGAGAGACGAAGCGCGACGCCCAAGCAGGCGTGCCCTTAGCCTGATGGCCTCGGGCGCAACTTGCGTTCAAAAGACTCGATGGTTCACGGGATCTTGCAATTCACACCACTTATCGCATTTCGCTGCGTCCTTCCATCCGATGCAAAGAGCCAAGATTCCCGTTTGCGAGAGTCATCAAAATTCATTGGGCACGCGACAGCACGCCGCCGCTCCGGGTTTTGGGGAAGACAATGCCATTCGCCGGTGATGCTTTCATATGGCTTGGCGCCCAAACTGCGCCGGGCTAGAGGTTCAAACCCGCCATGGACGCTCCCGAGGCGGCCCAACAACAAATCAGGGTCACCACGGGAGCAATGCCCCCGGTGAGCTGAGTACACCGGTCCTCCGGATTCACTCGATCGTTTATTCCACGGTCTCATCAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACG
Z78532.1
GCCTCAACTCAGCGGGTGGCCCCGCCTGACCTGGGGTCGCATCTGAATGGAAATCAACTGCCCAATGGTTATTTTAGCTCCATTGGGGTTCAATTAGGTTCTTGTGTAGGTTCGAAAAAATACAACAACATGGGGGATTCAAATAGCAGCCTTATGACTGTTAGCATTCTCCACCTCGTGCCACATTCCTACCCATCAAAGCAACAATCCTTAGACCCACCGCACCTAGGCACAAGGGGCC

In [305]:
from Bio import SeqIO
records = [rec.reverse_complement(id = "rc_"+rec.id, description = "reverse complement") \
          for rec in SeqIO.parse("data/ls_orchid.fasta", "fasta")]
len(records)

94

In [306]:
records = (rec.reverse_complement(id = "rc_" + rec.id, description = "reverse complement") \
          for rec in SeqIO.parse("data/ls_orchid.fasta", "fasta") if len(rec)<700)

In [307]:
SeqIO.write(records, "rev_comp.fasta", "fasta")

18

In [311]:
from Bio import SeqIO
from io import StringIO
records = SeqIO.parse("data/ls_orchid.gbk", "genbank")
out_handle = StringIO()
SeqIO.write(records, out_handle, "fasta")
fasta_data = out_handle.getvalue()
print(fasta_data)

>Z78533.1 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA.
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA
CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT
GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC
CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC
CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA
CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG
AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA
GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCG
GCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCG
GCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTG
GCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCC
TTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGG
GGCACCCGCTGAGTTTACGC
>Z78532.1 C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA.
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAGAATATA
TGATCGAGTGAATCTGGAGGACCTGTGGTAACTCAGCTCGTCGTGGCACTGCTTTTGTCG
TG

In [313]:
from Bio import SeqIO
out_handle = open("ls_orchid_long.tab", "w")
for record in SeqIO.parse("data/ls_orchid.gbk", "genbank"):
    if len(record) > 100:
        out_handle.write(record.format("tab"))
out_handle.close()


In [315]:
from Bio import SeqIO
records = (rec for rec in SeqIO.parse("data/ls_orchid.gbk", "genbank") if len(rec) > 100)
SeqIO.write(records, "ls_orchid.tab", "tab")


94

In [316]:
records

<generator object <genexpr> at 0x104c47830>

## Chapter 6 Multiple Sequence Alignment Objects