In [1]:
import Bio
print(Bio.__version__)

1.78


### Working with sequences

In [2]:
from Bio.Seq import Seq

my_seq = Seq('AGTACACTGGT')
my_seq

Seq('AGTACACTGGT')

In [3]:
print(f'my sequence:    {my_seq}')
print(f'complement:     {my_seq.complement()}')
print(f'rev complement: {my_seq.reverse_complement()}')

my sequence:    AGTACACTGGT
complement:     TCATGTGACCA
rev complement: ACCAGTGTACT


In [6]:
from Bio import SeqIO

for seq_record in SeqIO.parse('ls_orchid.fasta', 'fasta'):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC')
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA')
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT')
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA')
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC')
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT')
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA')
704
gi|2765649|emb|Z78524.1|CFZ78524
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC')
740
gi|2765648|emb|Z78523.1|CHZ78523
Seq('CGTAACCAGGTTTCCGT

In [8]:
from Bio import SeqIO

for seq_record in SeqIO.parse('ls_orchid.gbk', 'genbank'):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740
Z78532.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC')
753
Z78531.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA')
748
Z78530.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT')
744
Z78529.1
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA')
733
Z78527.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC')
718
Z78526.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT')
730
Z78525.1
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA')
704
Z78524.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC')
740
Z78523.1
Seq('CGTAACCAGGTTTCCGTAGGTGAACCTGCGGCAGGATCATTGTTGAGACAGCAG...AAG')
709
Z78522.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...GAG')
700
Z78521.1
Seq('GTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAGAATATATGATCGAGT...ACC')
726
Z78520.1
Seq('CGTAACAAGGTTTC

### Sequence objects

In [10]:
from Bio.Seq import Seq

my_seq = Seq('GATCG')
for index, letter in enumerate(my_seq):
    print('%i %s' % (index, letter))

0 G
1 A
2 T
3 C
4 G


In [11]:
print(my_seq[0])
print(my_seq[2])
print(my_seq[-1])

G
T
G


In [13]:
print('AAAA'.count('A'))
Seq('AAAA').count('AA')

4


2

In [37]:
from Bio.Seq import Seq

my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC')
print(f'sequence length: {len(my_seq)}')
print(f'G count: {my_seq.count("G")}')  # otherwise Python returns syntax error
print(100 * float(my_seq.count('G') + my_seq.count('C')) / len(my_seq))

sequence length: 32
G count: 9
46.875


In [27]:
# a better way of getting percentage of GC
from Bio.Seq import Seq
from Bio.SeqUtils import GC

my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC')
GC(my_seq)

46.875

In [29]:
from Bio.Seq import Seq

my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC')
print(f'my seq 4-11: {my_seq[4:12]}')
my_seq[4:12]

my seq 4-11: GATGGGCC


Seq('GATGGGCC')

In [34]:
my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC')
print(f'my seq starting from index of 0,stepping by 3:  {my_seq[::3]}')
print(f'my seq starting from index of 1, stepping by 3: {my_seq[1::3]}')
print(f'my seq starting from index of 2, stepping by 3: {my_seq[2::3]}')
print(f'my seq backwards: {my_seq[::-1]}')

my seq starting from index of 0,stepping by 3:  GCTGTAGTAAG
my seq starting from index of 1, stepping by 3: AGGCATGCATC
my seq starting from index of 2, stepping by 3: TAGCTAAGAC
my seq backwards: CGCTAAAAGCTAGGATATATCCGGGTAGCTAG


In [38]:
str(my_seq)

'GATCGATGGGCCTATATAGGATCGAAAATCGC'

In [39]:
print(my_seq)

GATCGATGGGCCTATATAGGATCGAAAATCGC


In [40]:
fasta_format_string = '>Name\n%s\n' % my_seq
print(fasta_format_string)

>Name
GATCGATGGGCCTATATAGGATCGAAAATCGC



In [41]:
from Bio.Seq import Seq

protein_seq = Seq('EVRNAK')
dna_seq = Seq('ACGT')
protein_seq + dna_seq

Seq('EVRNAKACGT')

In [42]:
list_of_seqs = [Seq('ACGT'), Seq('AACC'), Seq('GGTT')]
concatenated = ''
for s in list_of_seqs:
    concatenated += s
concatenated

Seq('ACGTAACCGGTT')

In [43]:
contigs = [Seq('ATG'), Seq('ATCCCG'), Seq('TTGCA')]
spacer = Seq('N' * 10)
spacer.join(contigs)

Seq('ATGNNNNNNNNNNATCCCGNNNNNNNNNNTTGCA')

In [44]:
# uppercase and lowercase
dna_seq = Seq('acgtACGT')
print(f'dna_seq orig case: {dna_seq}')
print(f'dan_seq uppercase: {dna_seq.upper()}')
print(f'dna_seq lowercase: {dna_seq.lower()}')

dna_seq orig case: acgtACGT
dan_seq uppercase: ACGTACGT
dna_seq lowercase: acgtacgt


In [46]:
# case insensitive matching
print('GTAC' in dna_seq)
'GTAC' in dna_seq.upper()

False


True

In [48]:
#complement and reverse complement
dna_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC')
print(f'original sequence:      {dna_seq}')
print(f'complement sequence:    {dna_seq.complement()}')
print(f'reverse complement seq: {dna_seq.reverse_complement()}')
print(f'reversed sequence:      {dna_seq[::-1]}')

original sequence:      GATCGATGGGCCTATATAGGATCGAAAATCGC
complement sequence:    CTAGCTACCCGGATATATCCTAGCTTTTAGCG
reverse complement seq: GCGATTTTCGATCCTATATAGGCCCATCGATC
reversed sequence:      CGCTAAAAGCTAGGATATATCCGGGTAGCTAG


In [49]:
protein_seq = Seq('EVRNAK')
protein_seq.complement()

Seq('EBYNTM')

In [54]:
# transcription
from Bio.Seq import Seq

coding_dna = Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
print(f'coding DNA:    {coding_dna}')
template_dna = coding_dna.reverse_complement()  # why not use complement?
print(f'template DNA:  {template_dna}')

print()
print(f'coding DNA:    {coding_dna}')
messenger_rna = coding_dna.transcribe()
print(f'messenger RNA: {messenger_rna}')

coding DNA:    ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
template DNA:  CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT

coding DNA:    ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
messenger RNA: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


In [57]:
# true biological transcription process

coding_dna = Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
print(f'coding DNA:     {coding_dna}')
template_dna = coding_dna.reverse_complement()  # why not use complement?
print(f'template DNA:   {template_dna}')
# m_rna = template_dna.reverse_complement().trasncribe()
# print(f'mRNA:           {m_rna}')
print(f'mRNA other way: {template_dna.transcribe()}')  # doesn't give true mRNA for this DNA sequence

coding DNA:     ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
template DNA:   CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT
mRNA other way: CUAUCGGGCACCCUUUCAGCGGCCCAUUACAAUGGCCAU


In [59]:
# back to DNA from mRNA

messenger_rna = Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
print(f'messenger_rna: {messenger_rna}')
messenger_rna.back_transcribe()

messenger_rna: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')

In [61]:
# translation - * represents stop codons

from Bio.Seq import Seq
messenger_rna = Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG')
print(f'messenger_rna: {messenger_rna}')
messenger_rna.translate()

messenger_rna: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


Seq('MAIVMGR*KGAR*')

In [62]:
coding_dna.translate(table='Vertebrate Mitochondrial')

Seq('MAIVMGRWKGAR*')

In [63]:
coding_dna.translate(table=2)

Seq('MAIVMGRWKGAR*')

In [65]:
translated_sequence = coding_dna.translate()
print(f'translated sequence:   {translated_sequence}')
translation_nature = coding_dna.translate(to_stop=True)
print(f'translation in nature: {translation_nature}')
aa_sequence = coding_dna.translate(table=2)
print(f'amino acid sequence:   {aa_sequence}')
coding_dna.translate(table=2, to_stop=True)

translated sequence:   MAIVMGR*KGAR*
translation in nature: MAIVMGR
amino acid sequence:   MAIVMGRWKGAR*


Seq('MAIVMGRWKGAR')

In [66]:
# another way of specifying stop symbol
coding_dna.translate(table=2, stop_symbol='@')

Seq('MAIVMGRWKGAR@')

#### Now, suppose you have a complete coding sequence CDS, which is to say a nucleotide sequence (e.g. mRNA – after any splicing) which is a whole number of codons (i.e. the length is a multiple of three), commences with a start codon, ends with a stop codon, and has no internal in-frame stop codons. In general, given a complete CDS, the default translate method will do what you want (perhaps with the to_stop option). However, what if your sequence uses a non-standard start codon? This happens a lot in bacteria – for example the gene yaaX in E. coli K12:

In [69]:
from Bio.Seq import Seq

gene = Seq('GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCA'
           'GCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACAGATAGGCGATCGTGAT'
           'AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT'
           'TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT'
           'AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA')
gene.translate(table='Bacterial')

Seq('VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HR*')

In [70]:
gene.translate(table='Bacterial', to_stop=True)

Seq('VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HHR')

In [71]:
# if used as a start codon GTG should be translated as methionine (M)

gene.translate(table='Bacterial', cds=True)

Seq('MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HHR')

In [72]:
from Bio.Data import CodonTable

standard_table = CodonTable.unambiguous_dna_by_name['Standard']
mito_table = CodonTable.unambiguous_dna_by_name['Vertebrate Mitochondrial']

# same thing as:
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table = CodonTable.unambiguous_dna_by_id[2]
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [73]:
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   

In [77]:
# useful properties - if you're doing your own gene finding
print(f'mitochondrial table stop codons: {mito_table.stop_codons}')
print(f'mitochondrial table start codons: {mito_table.start_codons}')
mito_table.forward_table['ACG']

mitochondrial table stop codons: ['TAA', 'TAG', 'AGA', 'AGG']
mitochondrial table start codons: ['ATT', 'ATC', 'ATA', 'ATG', 'GTG']


'T'

In [79]:
# does this work for Standard table as well?
print(f'standard table stop codons: {standard_table.stop_codons}')
print(f'standard table start codons: {standard_table.start_codons}')
standard_table.forward_table['ACG']

standard table stop codons: ['TAA', 'TAG', 'TGA']
standard table start codons: ['TTG', 'CTG', 'ATG']


'T'

In [80]:
# comparing Seq objects

from Bio.Seq import Seq


seq1 = 'ACGT'
print('ACGT' == seq1)
print(seq1 == 'ACGT')

True
True


In [1]:
# unknown contents
from Bio.Seq import Seq
unknown_seq = Seq(None, 10)
unknown_seq

TypeError: __init__() takes 2 positional arguments but 3 were given

In [6]:
from Bio.Seq import Seq
my_seq = Seq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')
my_seq[5] = 'G'

TypeError: 'Seq' object does not support item assignment

In [7]:
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')
mutable_seq

MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')

In [11]:
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')
mutable_seq[5] = 'C'
print(f'mutable_seq with C at index 5: {mutable_seq}')
mutable_seq.remove('T')
print(f'mutable_seq with T removed: {mutable_seq}')
mutable_seq.reverse()
print(f'mutable_seq right to left: {mutable_seq}')

mutable_seq with C at index 5: GCCATCGTAATGGGCCGCTGAAAGGGTGCCCGA
mutable_seq with T removed: GCCACGTAATGGGCCGCTGAAAGGGTGCCCGA
mutable_seq right to left: AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG


#### Unlike the Seq object, the MutableSeq object’s methods like reverse_complement() and reverse() act in-situ!

In [15]:
# getting back to read-only Seq
from Bio.Seq import Seq
new_seq = Seq(str(mutable_seq))  # sequence data given to a Seq object should be a string (not another Seq object etc)
new_seq

Seq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG')

In [20]:
# working with strings directly - functional programming
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate
my_string = 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'
print(f'my_string:                 {my_string}')
print(f'reverse complement:        {reverse_complement(my_string)}')
print(f'my_string transcribed:     {transcribe(my_string)}')
print(f'my_string backtranscribed: {back_transcribe(my_string)}')
print(f'my_string translated:      {translate(my_string)}')

my_string:                 GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG
reverse complement:        CTAACCAGCAGCACGACCACCCTTCCAACGACCCATAACAGC
my_string transcribed:     GCUGUUAUGGGUCGUUGGAAGGGUGGUCGUGCUGCUGGUUAG
my_string backtranscribed: GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG
my_string translated:      AVMGRWKGGRAAG*


### Sequence Annotation Objects

#### Using a SeqRecord object is not very complicated, since all of the information is presented as attributes of the class. Usually you won’t create a SeqRecord “by hand”, but instead use Bio.SeqIO to read in a sequence file for you.

In [22]:
# SeqRecord from scratch
from Bio.Seq import Seq
simple_seq = Seq('GATC')
from Bio.SeqRecord import SeqRecord
simple_seq_r = SeqRecord(simple_seq)
simple_seq_r.id

'<unknown id>'

In [24]:
simple_seq_r.id = 'AC12345'
simple_seq_r.description = 'Made up sequence I wish I could write a paper about'
print(f'description: {simple_seq_r.description}')
print(f'sequence from sequence record: {simple_seq_r.seq}')

description: Made up sequence I wish I could write a paper about
sequence from sequence record: GATC


In [25]:
# including an identifier when creating the object
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

simple_seq = Seq('GATC')
simple_seq_r = SeqRecord(simple_seq, id='AC12345')
simple_seq_r

SeqRecord(seq=Seq('GATC'), id='AC12345', name='<unknown name>', description='<unknown description>', dbxrefs=[])

#### SeqRecord has an dictionary attribute annotations. This is used for any miscellaneous annotations that doesn’t fit under one of the other more specific attributes.

In [34]:
# adding annotations
simple_seq_r.annotations['evidence'] = 'None. I just made it up.'
print(f'simple_seq_r.annotations: {simple_seq_r.annotations}')
print(f'simple_seq_r.annotations["evidence"]: {simple_seq_r.annotations["evidence"]}')

simple_seq_r.annotations: {'evidence': 'None. I just made it up.', 'phred_quality': [40, 40, 38, 30]}
simple_seq_r.annotations["evidence"]: None. I just made it up.


In [36]:
# working with letter annotations
simple_seq_r.letter_annotations['phred_quality'] = [40, 40, 38, 30]
print(f'simple_seq_r.letter_annotations: {simple_seq_r.letter_annotations}')
print(f"simple_seq_r.letter_annotations['phred_quality']: {simple_seq_r.letter_annotations['phred_quality']}")

simple_seq_r.letter_annotations: {'phred_quality': [40, 40, 38, 30]}
simple_seq_r.letter_annotations['phred_quality']: [40, 40, 38, 30]


In [39]:
# SeqRecord objects from FASTA files
breaklines=true,breaksymbolleft=]{pycon}
from Bio import SeqIO
record = SeqIO.read('NC_005816.fna', 'fasta')
record

SyntaxError: unmatched ']' (<ipython-input-39-d054b029d4d7>, line 2)

In [41]:
# SeqRecord objects from FASTA files
# breaklines=true,breaksymbolleft=]{pycon}
from Bio import SeqIO
record = SeqIO.read('NC_005816.fna', 'fasta')
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')

In [44]:
print(f'record.id: {record.id}')
print(f'record.name: {record.name}')
print(f'record.description: {record.description}')

record.id: gi|45478711|ref|NC_005816.1|
record.name: gi|45478711|ref|NC_005816.1|
record.description: gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence


In [45]:
print(f'record.dbxrefs: {record.dbxrefs}')
print(f'record.dbxrefs: {record.annotations}')
print(f'record.letter_annotations: {record.letter_annotations}')
print(f'record.features: {record.features}')

record.dbxrefs: []
record.dbxrefs: {}
record.letter_annotations: {}
record.features: []


In [52]:
# SeqRecord objects from GenBank
record = SeqIO.read('NC_005816.gb', 'genbank')
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=['Project:58037'])

In [50]:
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')

In [54]:
print(f'record.id: {record.id}')
print(f'record.name: {record.name}')
print(f'record.description: {record.description}')
print(f'record.letter_annotations: {record.letter_annotations}')  # GenBank files don't have per-letter annotations

record.id: NC_005816.1
record.name: NC_005816
record.description: Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence
record.letter_annotations: {}


In [57]:
# most of the annotations information gets recorded in the annotations dictionary
print(f'length of record annotations: {len(record.annotations)}')
print(f'record.annotations["source"]: {record.annotations["source"]}')
# dbxrefs list gets populated from any PROJECT or DBLINK lines
print(f'record.dbxrefs: {record.dbxrefs}')
# entries in the features table (e.g. the genes or CDS features) get recorded as SeqFeature objects in the features list
print(f'length of features: {len(record.features)}')

length of record annotations: 13
record.annotations["source"]: Yersinia pestis biovar Microtus str. 91001
record.dbxrefs: ['Project:58037']
length of features: 41


In [61]:
# creating a position with fuzzy end points

from Bio import SeqFeature
start_pos = SeqFeature.AfterPosition(5)
end_pos = SeqFeature.BetweenPosition(9, left=8, right=9)
my_location = SeqFeature.FeatureLocation(start_pos, end_pos)
print(my_location)
my_location.start

[>5:(8^9)]


AfterPosition(5)

In [64]:
my_location.end

BetweenPosition(9, left=8, right=9)

In [65]:
print(my_location.end)

(8^9)


In [68]:
# don’t want to deal with fuzzy positions and just want numbers, they are actually subclasses of integers
print(f'start: {int(my_location.start)}')
print(f'end: {int(my_location.end)}')

start: 5
end: 9


no_fuzzy_start and no_fuzzy_end give attribute locations and are compatible with older versions of Biopython

In [74]:
# create a position without worrying about fuzzy positions

exact_location = SeqFeature.FeatureLocation(5, 9)
print(f'exact_location: {exact_location}')
print(f'start: {exact_location.start}')
print(f'end: {exact_location.end}')
# exact_location.no_fuzzy_end  # gives exact_location = SeqFeature.FeatureLocation(5, 9)
# exact_location.no_fuzzy_start  # gives exact_location = SeqFeature.FeatureLocation(5, 9)

exact_location: [5:9]
start: 5
end: 9


#### Location testing

In [75]:
from Bio import SeqIO

my_snp = 4350
record = SeqIO.read("NC_005816.gb", "genbank")
for feature in record.features:
    if my_snp in feature:
         print('%s %s' % (feature.type, feature.qualifiers.get('db_xref')))

source ['taxon:229193']
gene ['GeneID:2767712']
CDS ['GI:45478716', 'GeneID:2767712']


gene and CDS features from GenBank or EMBL files defined with joins are the union of the exons – they do not cover any introns

In [78]:
# sequence described by feature or location

from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
seq = Seq('ACCGAGACGGCAAAGGCTAGCATAGGTATGAGACTTCCTTCCTGCCAGTGCTGAGGAACTGGGAGCCTAC')
feature = SeqFeature(FeatureLocation(5, 18), type='gene', strand=1)
feature_seq = seq[feature.location.start:feature.location.end].reverse_complement()
print(feature_seq)

AGCCTTTGCCGTC


In [80]:
# using extract method to get the sequence described by feature or location
feature_seq = feature.extract(seq)
print(f'feature_seq: {feature_seq}')
print(f'length of feature_seq: {len(feature_seq)}')
print(f'length of feature: {len(feature)}')
print(f'length of feature.location: {len(feature.location)}')

feature_seq: GACGGCAAAGGCT
length of feature_seq: 13
length of feature: 13
length of feature.location: 13


#### Comparison

In [81]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
record1 = SeqRecord(Seq('ACGT'), id='test')
record2 = SeqRecord(Seq('ACGT'), id='test')
record1 == record2

NotImplementedError: SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest.

In [83]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
record1 = SeqRecord(Seq('ACGT'), id='test')
record2 = SeqRecord(Seq('ACGT'), id='test')
print(record1.id == record2.id)
print(record1.seq == record2.seq)

True
True


In [86]:
# format
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
record = SeqRecord(
     Seq(
         "MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
         "GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
         "NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
         "SSAC"
     ),
     id="gi|14150838|gb|AAK54648.1|AF376133_1",
     description="chalcone synthase [Cucumis sativus]",
 )
print(record.format("fasta"))

>gi|14150838|gb|AAK54648.1|AF376133_1 chalcone synthase [Cucumis sativus]
MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD
GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK
NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM
SSAC



In [89]:
# slicing record
from Bio import SeqIO
record = SeqIO.read('NC_005816.gb', 'genbank')
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=['Project:58037'])

In [93]:
print(f'length of record: {len(record)}')
print(f'number of features: {len(record.features)}')

length of record: 9609
number of features: 41


#### Adding SeqRecord objects

In [96]:
from Bio import SeqIO
record = SeqIO.read('NC_005816.gb', 'genbank')
print(f'length of record: {len(record)}')
print(f'number of features: {len(record.features)}')
print(f'record.dbxrefs: {record.dbxrefs}')
print(f'annotations: {record.annotations.keys()}')

length of record: 9609
number of features: 41
record.dbxrefs: ['Project:58037']
annotations: dict_keys(['molecule_type', 'topology', 'data_file_division', 'date', 'accessions', 'sequence_version', 'gi', 'keywords', 'source', 'organism', 'taxonomy', 'references', 'comment'])


In [99]:
# shift the origin
shifted = record[2000:] + record[:2000]
print(f'len(shifted): {len(shifted)}')
shifted

len(shifted): 9609


SeqRecord(seq=Seq('GATACGCAGTCATATTTTTTACACAATTCTCTAATCCCGACAAGGTCGTAGGTC...GGA'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=[])

this isn’t perfect in that some annotation like the database cross references, all the annotations except molecule type, and one of the features (the source feature) have been lost

In [100]:
print(f'number of features in shifted: {len(shifted.features)}')
print(f'shifted.dbxrefs: {shifted.dbxrefs}')
print(f'annotations: {shifted.annotations.keys()}')

number of features in shifted: 40
shifted.dbxrefs: []
annotations: dict_keys([])


In [103]:
# SeqRecord slicing step is cautious in what annotation it preserves
# (erroneously propagating annotation can cause major problems)
# to keep the database cross references or the annotations dictionary, this must be done explicitly
shifted.dbxrefs = record.dbxrefs[:]
shifted.annotations = record.annotations.copy()
print(f'shifted.dbxrefs explicitly: {shifted.dbxrefs}')
print(f'annotations explicitly: {shifted.annotations.keys()}')

shifted.dbxrefs explicitly: ['Project:58037']
annotations explicitly: dict_keys(['molecule_type', 'topology', 'data_file_division', 'date', 'accessions', 'sequence_version', 'gi', 'keywords', 'source', 'organism', 'taxonomy', 'references', 'comment'])


#### Reverse complementing SeqRecord objects

In [104]:
from Bio import SeqIO
record = SeqIO.read('NC_005816.gb', 'genbank')
print('%s %i %i %i %i' % (record.id, len(record), len(record.features), len(record.dbxrefs), len(record.annotations)))

NC_005816.1 9609 41 1 13


In [105]:
rc = record.reverse_complement(id='TESTING')
print('%s %i %i %i %i' % (rc.id, len(rc), len(rc.features), len(rc.dbxrefs), len(rc.annotations)))

TESTING 9609 41 0 0


### Parsing and reading sequences

In [1]:
from Bio import SeqIO

for seq_record in SeqIO.parse('ls_orchid.fasta', 'fasta'):  # to load a GenBank file, just change the file and format
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC')
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA')
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT')
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA')
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC')
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT')
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA')
704
gi|2765649|emb|Z78524.1|CFZ78524
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC')
740
gi|2765648|emb|Z78523.1|CHZ78523
Seq('CGTAACCAGGTTTCCGT

In [3]:
# another common way within a list comprehension

from Bio import SeqIO
identifiers = [seq_record.id for seq_record in SeqIO.parse('ls_orchid.fasta', 'fasta')]
identifiers

['gi|2765658|emb|Z78533.1|CIZ78533',
 'gi|2765657|emb|Z78532.1|CCZ78532',
 'gi|2765656|emb|Z78531.1|CFZ78531',
 'gi|2765655|emb|Z78530.1|CMZ78530',
 'gi|2765654|emb|Z78529.1|CLZ78529',
 'gi|2765652|emb|Z78527.1|CYZ78527',
 'gi|2765651|emb|Z78526.1|CGZ78526',
 'gi|2765650|emb|Z78525.1|CAZ78525',
 'gi|2765649|emb|Z78524.1|CFZ78524',
 'gi|2765648|emb|Z78523.1|CHZ78523',
 'gi|2765647|emb|Z78522.1|CMZ78522',
 'gi|2765646|emb|Z78521.1|CCZ78521',
 'gi|2765645|emb|Z78520.1|CSZ78520',
 'gi|2765644|emb|Z78519.1|CPZ78519',
 'gi|2765643|emb|Z78518.1|CRZ78518',
 'gi|2765642|emb|Z78517.1|CFZ78517',
 'gi|2765641|emb|Z78516.1|CPZ78516',
 'gi|2765640|emb|Z78515.1|MXZ78515',
 'gi|2765639|emb|Z78514.1|PSZ78514',
 'gi|2765638|emb|Z78513.1|PBZ78513',
 'gi|2765637|emb|Z78512.1|PWZ78512',
 'gi|2765636|emb|Z78511.1|PEZ78511',
 'gi|2765635|emb|Z78510.1|PCZ78510',
 'gi|2765634|emb|Z78509.1|PPZ78509',
 'gi|2765633|emb|Z78508.1|PLZ78508',
 'gi|2765632|emb|Z78507.1|PLZ78507',
 'gi|2765631|emb|Z78506.1|PLZ78506',
 

In [6]:
# using next() function on an iterator to step through the entries
from Bio import SeqIO
record_iterator = SeqIO.parse('ls_orchid.fasta', 'fasta')
first_record = next(record_iterator)
print(f'first record: {first_record}')
print()
print(first_record.id)
print(first_record.description)
print()
second_record = next(record_iterator)
print(second_record.id)
print(second_record.description)

first record: ID: gi|2765658|emb|Z78533.1|CIZ78533
Name: gi|2765658|emb|Z78533.1|CIZ78533
Description: gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
Number of features: 0
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')

gi|2765658|emb|Z78533.1|CIZ78533
gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA

gi|2765657|emb|Z78532.1|CCZ78532
gi|2765657|emb|Z78532.1|CCZ78532 C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA


#### One special case to consider is when your sequence files have multiple records, but you only want the first one. In this situation the following code is very concise

In [8]:
from Bio import SeqIO
first_record = next(SeqIO.parse('ls_orchid.gbk', 'genbank'))
print(first_record)

ID: Z78533.1
Name: Z78533
Description: C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
Number of features: 5
/molecule_type=DNA
/topology=linear
/data_file_division=PLN
/date=30-NOV-2006
/accessions=['Z78533']
/sequence_version=1
/gi=2765658
/keywords=['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2']
/source=Cypripedium irapeanum
/organism=Cypripedium irapeanum
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium']
/references=[Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)]
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')


#### A word of warning here – using the next() function like this will silently ignore any additional records in the file. If your files have one and only one record, like some of the online examples later in this chapter, or a GenBank file for a single chromosome, then use the new Bio.SeqIO.read() function instead. This will check there are no extra unexpected records present.

#### Getting list of records in a sequence file

In [11]:
from Bio import SeqIO

records = list(SeqIO.parse('ls_orchid.gbk', 'genbank'))
print('Found %i records' % (len(records)))  # similar to f string

print('The last record')
last_record = records[-1]
print(last_record.id)
print(repr(last_record.seq))
print(len(last_record))
first_record = records[0]
print(first_record.id)
print(repr(first_record.seq))
print(len(first_record))

Found 94 records
The last record
Z78439.1
Seq('CATTGTTGAGATCACATAATAATTGATCGAGTTAATCTGGAGGATCTGTTTACT...GCC')
592
Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740


#### Extracting data

In [14]:
record_iterator = SeqIO.parse('ls_orchid.gbk', 'genbank')
first_record = next(record_iterator)
print(first_record)
print()
print(first_record.annotations)
print(f'keys: {first_record.annotations.keys()}')
print(f'values: {first_record.annotations.values()}')
print()
print(first_record.annotations['source'])

ID: Z78533.1
Name: Z78533
Description: C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
Number of features: 5
/molecule_type=DNA
/topology=linear
/data_file_division=PLN
/date=30-NOV-2006
/accessions=['Z78533']
/sequence_version=1
/gi=2765658
/keywords=['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2']
/source=Cypripedium irapeanum
/organism=Cypripedium irapeanum
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium']
/references=[Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)]
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')

{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PLN', 'date': '30-NOV-2006', 'accessions': ['Z78533'], 'sequence_version': 1, 'gi

In [15]:
# go through all the records, building up a list of the species each orchid sequence is from
from Bio import SeqIO

all_species = []
for seq_record in SeqIO.parse('ls_orchid.gbk', 'genbank'):
    all_species.append(seq_record.annotations['organism'])
print(all_species)

['Cypripedium irapeanum', 'Cypripedium californicum', 'Cypripedium fasciculatum', 'Cypripedium margaritaceum', 'Cypripedium lichiangense', 'Cypripedium yatabeanum', 'Cypripedium guttatum', 'Cypripedium acaule', 'Cypripedium formosanum', 'Cypripedium himalaicum', 'Cypripedium macranthon', 'Cypripedium calceolus', 'Cypripedium segawai', 'Cypripedium parviflorum var. pubescens', 'Cypripedium reginae', 'Cypripedium flavum', 'Cypripedium passerinum', 'Mexipedium xerophyticum', 'Phragmipedium schlimii', 'Phragmipedium besseae', 'Phragmipedium wallisii', 'Phragmipedium exstaminodium', 'Phragmipedium caricinum', 'Phragmipedium pearcei', 'Phragmipedium longifolium', 'Phragmipedium lindenii', 'Phragmipedium lindleyanum', 'Phragmipedium sargentianum', 'Phragmipedium kaiteurum', 'Phragmipedium czerwiakowianum', 'Phragmipedium boissierianum', 'Phragmipedium caudatum', 'Phragmipedium warszewiczianum', 'Paphiopedilum micranthum', 'Paphiopedilum malipoense', 'Paphiopedilum delenatii', 'Paphiopedilum a

In [16]:
# another way using list comprehension
all_species = [seq_record.annotations['organism'] for seq_record in SeqIO.parse('ls_orchid.gbk', 'genbank')]
print(f'using list comprehension: {all_species}')

using list comprehension: ['Cypripedium irapeanum', 'Cypripedium californicum', 'Cypripedium fasciculatum', 'Cypripedium margaritaceum', 'Cypripedium lichiangense', 'Cypripedium yatabeanum', 'Cypripedium guttatum', 'Cypripedium acaule', 'Cypripedium formosanum', 'Cypripedium himalaicum', 'Cypripedium macranthon', 'Cypripedium calceolus', 'Cypripedium segawai', 'Cypripedium parviflorum var. pubescens', 'Cypripedium reginae', 'Cypripedium flavum', 'Cypripedium passerinum', 'Mexipedium xerophyticum', 'Phragmipedium schlimii', 'Phragmipedium besseae', 'Phragmipedium wallisii', 'Phragmipedium exstaminodium', 'Phragmipedium caricinum', 'Phragmipedium pearcei', 'Phragmipedium longifolium', 'Phragmipedium lindenii', 'Phragmipedium lindleyanum', 'Phragmipedium sargentianum', 'Phragmipedium kaiteurum', 'Phragmipedium czerwiakowianum', 'Phragmipedium boissierianum', 'Phragmipedium caudatum', 'Phragmipedium warszewiczianum', 'Paphiopedilum micranthum', 'Paphiopedilum malipoense', 'Paphiopedilum de

In [17]:
# extracting data from a FASTA file
all_species = []
for seq_record in SeqIO.parse('ls_orchid.fasta', 'fasta'):
    all_species.append(seq_record.description.split()[1])
print(all_species)

['C.irapeanum', 'C.californicum', 'C.fasciculatum', 'C.margaritaceum', 'C.lichiangense', 'C.yatabeanum', 'C.guttatum', 'C.acaule', 'C.formosanum', 'C.himalaicum', 'C.macranthum', 'C.calceolus', 'C.segawai', 'C.pubescens', 'C.reginae', 'C.flavum', 'C.passerinum', 'M.xerophyticum', 'P.schlimii', 'P.besseae', 'P.wallisii', 'P.exstaminodium', 'P.caricinum', 'P.pearcei', 'P.longifolium', 'P.lindenii', 'P.lindleyanum', 'P.sargentianum', 'P.kaiteurum', 'P.czerwiakowianum', 'P.boissierianum', 'P.caudatum', 'P.warszewiczianum', 'P.micranthum', 'P.malipoense', 'P.delenatii', 'P.armeniacum', 'P.emersonii', 'P.niveum', 'P.godefroyae', 'P.bellatulum', 'P.concolor', 'P.fairrieanum', 'P.druryi', 'P.tigrinum', 'P.hirsutissimum', 'P.barbigerum', 'P.henryanum', 'P.charlesworthii', 'P.villosum', 'P.exul', 'P.insigne', 'P.gratrixianum', 'P.primulinum', 'P.victoria', 'P.victoria', 'P.glaucophyllum', 'P.supardii', 'P.kolopakingii', 'P.sanderianum', 'P.lowii', 'P.dianthum', 'P.parishii', 'P.haynaldianum', 'P

In [18]:
# extracting data from a FASTA file using list comprehension
all_species = [seq_record.description.split()[1] for seq_record in SeqIO.parse('ls_orchid.fasta', 'fasta')]
print(f'from FASTA file using list comprehension: {all_species}')

from FASTA file using list comprehension: ['C.irapeanum', 'C.californicum', 'C.fasciculatum', 'C.margaritaceum', 'C.lichiangense', 'C.yatabeanum', 'C.guttatum', 'C.acaule', 'C.formosanum', 'C.himalaicum', 'C.macranthum', 'C.calceolus', 'C.segawai', 'C.pubescens', 'C.reginae', 'C.flavum', 'C.passerinum', 'M.xerophyticum', 'P.schlimii', 'P.besseae', 'P.wallisii', 'P.exstaminodium', 'P.caricinum', 'P.pearcei', 'P.longifolium', 'P.lindenii', 'P.lindleyanum', 'P.sargentianum', 'P.kaiteurum', 'P.czerwiakowianum', 'P.boissierianum', 'P.caudatum', 'P.warszewiczianum', 'P.micranthum', 'P.malipoense', 'P.delenatii', 'P.armeniacum', 'P.emersonii', 'P.niveum', 'P.godefroyae', 'P.bellatulum', 'P.concolor', 'P.fairrieanum', 'P.druryi', 'P.tigrinum', 'P.hirsutissimum', 'P.barbigerum', 'P.henryanum', 'P.charlesworthii', 'P.villosum', 'P.exul', 'P.insigne', 'P.gratrixianum', 'P.primulinum', 'P.victoria', 'P.victoria', 'P.glaucophyllum', 'P.supardii', 'P.kolopakingii', 'P.sanderianum', 'P.lowii', 'P.dia

#### Annotation information is much easier to deal with if we can get our sequences from GenBank or EMBL

#### Modifying data

In [22]:
from Bio import SeqIO
record_iterator = SeqIO.parse('ls_orchid.fasta', 'fasta')
first_record = next(record_iterator)
print(f'first_record.id before: {first_record.id}')
first_record.id = 'new_id'
print(f'first_record.id after: {first_record.id}')

first_record.id before: gi|2765658|emb|Z78533.1|CIZ78533
first_record.id after: new_id


#### Parsing sequences from compresed files

In [23]:
# calculate the total length of the sequences in a multiple record GenBank file using a generator expression
from Bio import SeqIO
print(sum(len(r) for r in SeqIO.parse('ls_orchid.gbk', 'gb')))

67518


In [24]:
# calculate the total length of the sequences in a multiple record GenBank file using with statement and file handle
from Bio import SeqIO
with open('ls_orchid.gbk') as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, 'gb')))

67518


In [25]:
# calculate the total length of the sequences in a multiple record GenBank file old fastioned way (closing the file)
from Bio import SeqIO
handle = open('ls_orchid.gbk')
print(sum(len(r) for r in SeqIO.parse(handle, 'gb')))
handle.close()

67518


In [27]:
# gzip compressed file
import gzip
with gzip.open('ls_orchid.gbk.gz', 'rt') as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, 'gb')))

FileNotFoundError: [Errno 2] No such file or directory: 'ls_orchid.gbk.gz'

#### Parsing sequences from the net

#### Note that just because you can download sequence data and parse it into a SeqRecord object in one go doesn’t mean this is a good idea. In general, you should probably download sequences once and save them to a file for reuse.

In [28]:
from Bio import Entrez
from Bio import SeqIO

Entrez.email = 'A.N.Other@example.com'
with Entrez.efetch(db='nucleotide', rettype='fasta', retmode='text', id='6273291') as handle:
    seq_record = SeqIO.read(handle, 'fasta')
print('%s with %i features' % (seq_record.id, len(seq_record.features)))

AF191665.1 with 0 features


In [30]:
from Bio import Entrez
from Bio import SeqIO

Entrez.email = 'A.N.Other@example.com'
with Entrez.efetch(db='nucleotide', rettype='gb', retmode='text', id='6273291') as handle:
    seq_record = SeqIO.read(handle, 'gb')
print('%s with %i features' % (seq_record.id, len(seq_record.features)))

AF191665.1 with 3 features


In [34]:
# fetching several records
from Bio import Entrez
from Bio import SeqIO

Entrez.email = 'A.N.Other@example.com'
with Entrez.efetch(db='nucleotide', rettype='gb', retmode='text', id='6273291, 6273290, 6273289') as handle:
    for seq_record in SeqIO.parse(handle, 'gb'):
        print('%s %s...' % (seq_record.id, seq_record.description[:50]))
        print('Sequence length %i, %i features, from: %s'
                % (
                    len(seq_record),
                    len(seq_record.features),
                    seq_record.annotations['source'],
                )
            )

AF191665.1 Opuntia marenae rpl16 gene; chloroplast gene for c...
Sequence length 902, 3 features, from: chloroplast Grusonia marenae
AF191664.1 Opuntia clavata rpl16 gene; chloroplast gene for c...
Sequence length 899, 3 features, from: chloroplast Grusonia clavata
AF191663.1 Opuntia bradtiana rpl16 gene; chloroplast gene for...
Sequence length 899, 3 features, from: chloroplast Grusonia bradtiana


#### Sequence files as dictionaries

In [None]:
from Bio import SeqIO
handle = open('sequence.bigendian.2bit', 'rb')
records = SeqIO.parse(handle, 'twobit')
print(f'records keys: {records.keys()}')
print(f"seq222: {records['seq222']}")
print(f"seq222.seq: {records['seq222'].seq}")
handle.close()
print(f"seq222.seq: {records['seq222'].seq}")

In [36]:
from Bio import SeqIO
orchid_dict = SeqIO.to_dict(SeqIO.parse('ls_orchid.gbk', 'genbank'))
print(f'dictionary length: {len(orchid_dict)}')
print(f'orchid keys: {orchid_dict.keys()}')

dictionary length: 94
orchid keys: dict_keys(['Z78533.1', 'Z78532.1', 'Z78531.1', 'Z78530.1', 'Z78529.1', 'Z78527.1', 'Z78526.1', 'Z78525.1', 'Z78524.1', 'Z78523.1', 'Z78522.1', 'Z78521.1', 'Z78520.1', 'Z78519.1', 'Z78518.1', 'Z78517.1', 'Z78516.1', 'Z78515.1', 'Z78514.1', 'Z78513.1', 'Z78512.1', 'Z78511.1', 'Z78510.1', 'Z78509.1', 'Z78508.1', 'Z78507.1', 'Z78506.1', 'Z78505.1', 'Z78504.1', 'Z78503.1', 'Z78502.1', 'Z78501.1', 'Z78500.1', 'Z78499.1', 'Z78498.1', 'Z78497.1', 'Z78496.1', 'Z78495.1', 'Z78494.1', 'Z78493.1', 'Z78492.1', 'Z78491.1', 'Z78490.1', 'Z78489.1', 'Z78488.1', 'Z78487.1', 'Z78486.1', 'Z78485.1', 'Z78484.1', 'Z78483.1', 'Z78482.1', 'Z78481.1', 'Z78480.1', 'Z78479.1', 'Z78478.1', 'Z78477.1', 'Z78476.1', 'Z78475.1', 'Z78474.1', 'Z78473.1', 'Z78472.1', 'Z78471.1', 'Z78470.1', 'Z78469.1', 'Z78468.1', 'Z78467.1', 'Z78466.1', 'Z78465.1', 'Z78464.1', 'Z78463.1', 'Z78462.1', 'Z78461.1', 'Z78460.1', 'Z78459.1', 'Z78458.1', 'Z78457.1', 'Z78456.1', 'Z78455.1', 'Z78454.1', 'Z7845

In [37]:
seq_record = orchid_dict['Z78475.1']
print(f'description: {seq_record.description}')
print(f'sequence: {seq_record.seq}')

description: P.supardii 5.8S rRNA gene and ITS1 and ITS2 DNA
sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGAGTTAATCTGGAGGATCAGTTTACTTTGGTCACCCATGGGCATCTGCTCTTGCAGTGACCTGGATTTGCCATCGAGCCTCCTTGGGAGCTTTCTTGCTGGCGATCTAAACCCGTCCCGGCGCAGTTTTGCGCCAAGTCATATGACACATAATTGGAAGGGGGTGGCATGCTGCCTTGACCCTCCCCAAATTATTTTTTTGACAACTCTCAGCAACGGATATCTCGGCTCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATCAGGCCAAGGGCACGCCTGCCTGGGCATTGCGAGTCATATCTCTCCCTTAATGAGGCTGTCCATACATACTGTTCAGCCAATGCGGATGTGAGTTTGGCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGCTGCATGGGCTTTTGATGGTCCAAAATACGGCAAGAGGTGGACGAACTATGCTACAACAAAATTGTTGTGCGAATGCCCCGGGTTGTCGTATTAGATGGGCCAGCATAATCTAAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGACGGCCATTTGGTTGCGACCCAGGTCAGGT


#### Specifiying dictionary keys

In [38]:
# using FASTA file instead of GenBank file
orchid_dict = SeqIO.to_dict(SeqIO.parse('ls_orchid.fasta', 'fasta'))
print(f'orchid keys: {orchid_dict.keys()}')

orchid keys: dict_keys(['gi|2765658|emb|Z78533.1|CIZ78533', 'gi|2765657|emb|Z78532.1|CCZ78532', 'gi|2765656|emb|Z78531.1|CFZ78531', 'gi|2765655|emb|Z78530.1|CMZ78530', 'gi|2765654|emb|Z78529.1|CLZ78529', 'gi|2765652|emb|Z78527.1|CYZ78527', 'gi|2765651|emb|Z78526.1|CGZ78526', 'gi|2765650|emb|Z78525.1|CAZ78525', 'gi|2765649|emb|Z78524.1|CFZ78524', 'gi|2765648|emb|Z78523.1|CHZ78523', 'gi|2765647|emb|Z78522.1|CMZ78522', 'gi|2765646|emb|Z78521.1|CCZ78521', 'gi|2765645|emb|Z78520.1|CSZ78520', 'gi|2765644|emb|Z78519.1|CPZ78519', 'gi|2765643|emb|Z78518.1|CRZ78518', 'gi|2765642|emb|Z78517.1|CFZ78517', 'gi|2765641|emb|Z78516.1|CPZ78516', 'gi|2765640|emb|Z78515.1|MXZ78515', 'gi|2765639|emb|Z78514.1|PSZ78514', 'gi|2765638|emb|Z78513.1|PBZ78513', 'gi|2765637|emb|Z78512.1|PWZ78512', 'gi|2765636|emb|Z78511.1|PEZ78511', 'gi|2765635|emb|Z78510.1|PCZ78510', 'gi|2765634|emb|Z78509.1|PPZ78509', 'gi|2765633|emb|Z78508.1|PLZ78508', 'gi|2765632|emb|Z78507.1|PLZ78507', 'gi|2765631|emb|Z78506.1|PLZ78506', 'gi|

In [2]:
from Bio import SeqIO

def get_accession(record):
    """
    Given a SeqRecord, return the accession number as a string.

    e.g. 'gi|2765613|emb|Z78488.1|PTZ78488' -> 'Z78488.1'
    """
    parts = record.id.split('|')  # | is the pipe symbol
    assert len(parts) == 5 and parts[0] == 'gi' and parts[2] == 'emb'
    return parts[3]

orchid_dict = SeqIO.to_dict(SeqIO.parse('ls_orchid.fasta', 'fasta'), key_function=get_accession)
print(orchid_dict.keys())

dict_keys(['Z78533.1', 'Z78532.1', 'Z78531.1', 'Z78530.1', 'Z78529.1', 'Z78527.1', 'Z78526.1', 'Z78525.1', 'Z78524.1', 'Z78523.1', 'Z78522.1', 'Z78521.1', 'Z78520.1', 'Z78519.1', 'Z78518.1', 'Z78517.1', 'Z78516.1', 'Z78515.1', 'Z78514.1', 'Z78513.1', 'Z78512.1', 'Z78511.1', 'Z78510.1', 'Z78509.1', 'Z78508.1', 'Z78507.1', 'Z78506.1', 'Z78505.1', 'Z78504.1', 'Z78503.1', 'Z78502.1', 'Z78501.1', 'Z78500.1', 'Z78499.1', 'Z78498.1', 'Z78497.1', 'Z78496.1', 'Z78495.1', 'Z78494.1', 'Z78493.1', 'Z78492.1', 'Z78491.1', 'Z78490.1', 'Z78489.1', 'Z78488.1', 'Z78487.1', 'Z78486.1', 'Z78485.1', 'Z78484.1', 'Z78483.1', 'Z78482.1', 'Z78481.1', 'Z78480.1', 'Z78479.1', 'Z78478.1', 'Z78477.1', 'Z78476.1', 'Z78475.1', 'Z78474.1', 'Z78473.1', 'Z78472.1', 'Z78471.1', 'Z78470.1', 'Z78469.1', 'Z78468.1', 'Z78467.1', 'Z78466.1', 'Z78465.1', 'Z78464.1', 'Z78463.1', 'Z78462.1', 'Z78461.1', 'Z78460.1', 'Z78459.1', 'Z78458.1', 'Z78457.1', 'Z78456.1', 'Z78455.1', 'Z78454.1', 'Z78453.1', 'Z78452.1', 'Z78451.1', 'Z784

In [3]:
orchid_dict.keys()

dict_keys(['Z78533.1', 'Z78532.1', 'Z78531.1', 'Z78530.1', 'Z78529.1', 'Z78527.1', 'Z78526.1', 'Z78525.1', 'Z78524.1', 'Z78523.1', 'Z78522.1', 'Z78521.1', 'Z78520.1', 'Z78519.1', 'Z78518.1', 'Z78517.1', 'Z78516.1', 'Z78515.1', 'Z78514.1', 'Z78513.1', 'Z78512.1', 'Z78511.1', 'Z78510.1', 'Z78509.1', 'Z78508.1', 'Z78507.1', 'Z78506.1', 'Z78505.1', 'Z78504.1', 'Z78503.1', 'Z78502.1', 'Z78501.1', 'Z78500.1', 'Z78499.1', 'Z78498.1', 'Z78497.1', 'Z78496.1', 'Z78495.1', 'Z78494.1', 'Z78493.1', 'Z78492.1', 'Z78491.1', 'Z78490.1', 'Z78489.1', 'Z78488.1', 'Z78487.1', 'Z78486.1', 'Z78485.1', 'Z78484.1', 'Z78483.1', 'Z78482.1', 'Z78481.1', 'Z78480.1', 'Z78479.1', 'Z78478.1', 'Z78477.1', 'Z78476.1', 'Z78475.1', 'Z78474.1', 'Z78473.1', 'Z78472.1', 'Z78471.1', 'Z78470.1', 'Z78469.1', 'Z78468.1', 'Z78467.1', 'Z78466.1', 'Z78465.1', 'Z78464.1', 'Z78463.1', 'Z78462.1', 'Z78461.1', 'Z78460.1', 'Z78459.1', 'Z78458.1', 'Z78457.1', 'Z78456.1', 'Z78455.1', 'Z78454.1', 'Z78453.1', 'Z78452.1', 'Z78451.1', 'Z784

#### Sequence files as dictionaries - indexed files

In [4]:
# for larger files (previous example is limited by computer RAM)
from Bio import SeqIO
orchid_dict = SeqIO.index('ls_orchid.gbk', 'genbank')
print(f'dictionary length: {len(orchid_dict)}')
print(f'dictionary keys: {orchid_dict.keys()}')
orchid_dict.keys()

dictionary length: 94
dictionary keys: KeysView(SeqIO.index('ls_orchid.gbk', 'genbank', alphabet=None, key_function=None))


KeysView(SeqIO.index('ls_orchid.gbk', 'genbank', alphabet=None, key_function=None))

In [7]:
seq_record = orchid_dict['Z78475.1']
print(f'seq record description: {seq_record.description}')
print(f'seq record sequence: {seq_record.seq}')
orchid_dict.close()

seq record description: P.supardii 5.8S rRNA gene and ITS1 and ITS2 DNA
seq record sequence: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGAGTTAATCTGGAGGATCAGTTTACTTTGGTCACCCATGGGCATCTGCTCTTGCAGTGACCTGGATTTGCCATCGAGCCTCCTTGGGAGCTTTCTTGCTGGCGATCTAAACCCGTCCCGGCGCAGTTTTGCGCCAAGTCATATGACACATAATTGGAAGGGGGTGGCATGCTGCCTTGACCCTCCCCAAATTATTTTTTTGACAACTCTCAGCAACGGATATCTCGGCTCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATCAGGCCAAGGGCACGCCTGCCTGGGCATTGCGAGTCATATCTCTCCCTTAATGAGGCTGTCCATACATACTGTTCAGCCAATGCGGATGTGAGTTTGGCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGCTGCATGGGCTTTTGATGGTCCAAAATACGGCAAGAGGTGGACGAACTATGCTACAACAAAATTGTTGTGCGAATGCCCCGGGTTGTCGTATTAGATGGGCCAGCATAATCTAAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGACGGCCATTTGGTTGCGACCCAGGTCAGGT


In [9]:
# want to use the same keys as before
from Bio import SeqIO

def get_acc(identifier):
    """
    Given a SeqRecord, return the accession number as a string.
    e.g. 'gi|2765613|emb|Z78488.1|PTZ78488' -> 'Z78488.1'
    """
    parts = identifier.split('|')
    assert len(parts) == 5 and parts[0] == 'gi' and parts[2] == 'emb'
    return parts[3]

orchid_dict = SeqIO.index('ls_orchid.fasta', 'fasta', key_function=get_acc)
orchid_dict.keys()

KeysView(SeqIO.index('ls_orchid.fasta', 'fasta', alphabet=None, key_function=<function get_acc at 0x00000249A6C15550>))

#### Biopython 1.57 introduced an alternative, Bio.SeqIO.index_db(), which can work on even extremely large files since it stores the record information as a file on disk (using an SQLite3 database) rather than in memory. Also, you can index multiple files together (providing all the record identifiers are unique).

#### Writing sequence files

In [12]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

rec1 = SeqRecord(
    Seq(
        'MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD'
        'GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK'
        'NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM'
        'SSAC',
    ),
    id="gi|14150838|gb|AAK54648.1|AF376133_1",
    description="chalcone synthase [Cucumis sativus]",
)  # why does Python give Syntax error: invalid syntax when I use single quotes in lines 12-13?

rec2 = SeqRecord(
    Seq(
        'YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ'
        'DMVVVEIPKLGKEAAVKAIKEWGQ',
    ),
    id="gi|13919613|gb|AAK33142.1|",
    description="chalcone synthase [Fragaria vesca subsp. bracteata]",
)

rec3 = SeqRecord(
    Seq(
        'MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC'
        'EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP'
        'KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN'
        'NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV'
        'SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW'
        'IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT'
        'TGEGLEWGVLFGFGPGLTVETVVLHSVAT',
    ),
    id="gi|13925890|gb|AAK49457.1|",
    description="chalcone synthase [Nicotiana tabacum]",
)

my_records = [rec1, rec2, rec3]
SeqIO.write(my_records, 'my_example.faa', 'fasta')  # function returns number of SeqRecord objects written to the file

3

#### Converting between sequence file formats

In [13]:
from Bio import SeqIO
records = SeqIO.parse('ls_orchid.gbk', 'genbank')
count = SeqIO.write(records, 'my_example.fasta', 'fasta')
print('Converted %i records' % count)

Converted 94 records


In [14]:
from Bio import SeqIO
count = SeqIO.convert('ls_orchid.gbk', 'genbank', 'my_example.fasta', 'fasta')  # helper function
print('Converted %i records' % count)

Converted 94 records


Bio.SeqIO.convert() function will take handles or filenames. Watch out though – if the output file already exists, it will overwrite it! To find out more, see the built in help

#### Converting file sequences to their reverse complements

In [15]:
from Bio import SeqIO
for record in SeqIO.parse('ls_orchid.gbk', 'genbank'):
    print(record.id)
    print(record.seq.reverse_complement())

Z78533.1
GCGTAAACTCAGCGGGTGCCCCCGCCTGACCTGGGGTCACATCCGAATGGCGGTCAACCGCCCTCCATTGGGGTTCGGAAGGGTTCTCCTGCCTGTCCGACAAGCACGACAACATGGGGGATTCGCACGGCAGCTGCTGCCAGCATCCGTCCACCTCTTGCCGGGTTCCGGGCCATCAAAACACCAGCTCTTGGACCCGCCGCACCTAGGCACAAGGGGCCAATCTTTCACATCCGCACCACGCCGGCCTGGCTGTATGCCGGGCAAGCATTGGCAGGAGAGAGACGAAGCGCGACGCCCAAGCAGGCGTGCCCTTAGCCTGATGGCCTCGGGCGCAACTTGCGTTCAAAAGACTCGATGGTTCACGGGATCTTGCAATTCACACCACTTATCGCATTTCGCTGCGTCCTTCCATCCGATGCAAAGAGCCAAGATTCCCGTTTGCGAGAGTCATCAAAATTCATTGGGCACGCGACAGCACGCCGCCGCTCCGGGTTTTGGGGAAGACAATGCCATTCGCCGGTGATGCTTTCATATGGCTTGGCGCCCAAACTGCGCCGGGCTAGAGGTTCAAACCCGCCATGGACGCTCCCGAGGCGGCCCAACAACAAATCAGGGTCACCACGGGAGCAATGCCCCCGGTGAGCTGAGTACACCGGTCCTCCGGATTCACTCGATCGTTTATTCCACGGTCTCATCAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACG
Z78532.1
GCCTCAACTCAGCGGGTGGCCCCGCCTGACCTGGGGTCGCATCTGAATGGAAATCAACTGCCCAATGGTTATTTTAGCTCCATTGGGGTTCAATTAGGTTCTTGTGTAGGTTCGAAAAAATACAACAACATGGGGGATTCAAATAGCAGCCTTATGACTGTTAGCATTCTCCACCTCGTGCCACATTCCTACCCATCAAAGCAACAATCCTTAGACCCACCGCACCTAGGCACAAGGGGCC

In [17]:
# to save these reverse complements to a file, we’ll need to make SeqRecord objects
from Bio import SeqIO
records = [rec.reverse_complement(id='rc_'+rec.id, description='reverse complement') \
          for rec in SeqIO.parse('ls_orchid.fasta', 'fasta')]
len(records)

94

In [18]:
from Bio import SeqIO
records = [rec.reverse_complement(id='rc_'+rec.id, description='reverse complement') \
          for rec in SeqIO.parse('ls_orchid.fasta', 'fasta') if len(rec)<700]
len(records)

18

In [19]:
# complete example
from Bio import SeqIO
records = [rec.reverse_complement(id='rc_'+rec.id, description='reverse complement') \
          for rec in SeqIO.parse('ls_orchid.fasta', 'fasta') if len(rec)<700]
SeqIO.write(records, 'rev_comp.fasta', 'fasta')

18

#### Getting SeqRecord objects as formatted strings

In [20]:
from Bio import SeqIO
from io import StringIO

records = SeqIO.parse('ls_orchid.gbk', 'genbank')
out_handle = StringIO()
SeqIO.write(records, out_handle, 'fasta')
fasta_data = out_handle.getvalue()
print(fasta_data)

>Z78533.1 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAA
CGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGT
GACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCC
CGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCC
CAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAA
CGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTG
AATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA
GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCG
GCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCG
GCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTG
GCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCC
TTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGG
GGCACCCGCTGAGTTTACGC
>Z78532.1 C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAGAATATA
TGATCGAGTGAATCTGGAGGACCTGTGGTAACTCAGCTCGTCGTGGCACTGCTTTTGTCG
TGAC

### Multiple Sequence Alignments

Parsing or Reading MSAs (multiple sequence alignments)

In [1]:
from Bio import AlignIO
alignment = AlignIO.read('PF05371_seed.sth', 'stockholm')
print(alignment)

Alignment with 69 rows and 64 columns
LFMVTDAQAAALVAN------ITVTDIVDQLK--AGAPAIVSIA...GAL I3CK69_9GAMM/17-72
LLFLSSWSVSATPFY------LVVDDVVDQIA--EGAFPVLAIG...GAM I3CE12_9GAMM/11-66
SLLLFSGVASAAPIT------MDVSEVVDQIK--AAAVPILSVG...GAM I3CE11_9GAMM/21-76
VTAALTLPAFAAAAQ------PDVTEVVAYIL--AGIATIALVG...GVL A0A328ZKQ2_9BURK/30-85
LVLASPMALAQTGPT------IDVSSVTGFID-GQLIPGLGTIG...GAA A0A318DYR2_9GAMM/20-76
LLLATSAMASSAFAE------IDVSAATTAIS-TDGSTAITAVG...GAI A0A081KB79_9GAMM/9-65
LLVASALMSSSVFAA------IDISAATTALT-TDGSAAITAVG...GAI A0A081K835_9GAMM/9-65
LAMGTVLSANAAATT------IDTTDVLLSIA--AAVVAIVAVG...AAM A0A1H0GMS3_9BURK/27-82
LSTAAAFVGAQAQAA------IDVTAVGTEIE--AAGNAASSTG...GLI A0A1H4H7Y8_9GAMM/12-67
LSGVALTVGAGLAHA------IDTTKVGASIT--AAETDALTTG...GIV A0A1I1EUT2_9GAMM/16-71
AGFAANAPVFAADGQ------LDTTSVQAGID--AAKATGLSVG...GLV A0A261GG07_9GAMM/20-75
TAVATATLVSATANA------EDLSSITGKINLKSASTGIVAVG...GFL Q7VLI9_HAEDU/13-70
LVTAPALAFAEASTATS----FDVSAITGQISFASVAAGVIAIA...GMI A0A239SNN7_9BURK/21-80
TGMAMSAVAFADDKI---

In [3]:
from Bio import AlignIO
alignment = AlignIO.read('PF05371_seed.sth', 'stockholm')
print('Alignment length %i' % alignment.get_alignment_length())

for record in alignment:
    print('%s-%s' % (record.seq, record.id))

Alignment length 64
LFMVTDAQAAALVAN------ITVTDIVDQLK--AGAPAIVSIATAAISLLAVIAVFKYVRGAL-I3CK69_9GAMM/17-72
LLFLSSWSVSATPFY------LVVDDVVDQIA--EGAFPVLAIGLASLTILAIVAMIRWARGAM-I3CE12_9GAMM/11-66
SLLLFSGVASAAPIT------MDVSEVVDQIK--AAAVPILSVGVAALTVLAIVAMVRWARGAM-I3CE11_9GAMM/21-76
VTAALTLPAFAAAAQ------PDVTEVVAYIL--AGIATIALVGNAGLMVRGATAVFAWIRGVL-A0A328ZKQ2_9BURK/30-85
LVLASPMALAQTGPT------IDVSSVTGFID-GQLIPGLGTIGTAFLLVAVLFAAYRWIRGAA-A0A318DYR2_9GAMM/20-76
LLLATSAMASSAFAE------IDVSAATTAIS-TDGSTAITAVGTALIGVAALAVVFKWVKGAI-A0A081KB79_9GAMM/9-65
LLVASALMSSSVFAA------IDISAATTALT-TDGSAAITAVGTALVGLAGIAVVFKWVKGAI-A0A081K835_9GAMM/9-65
LAMGTVLSANAAATT------IDTTDVLLSIA--AAVVAIVAVGGAVLGLQVAVKAFKWVRAAM-A0A1H0GMS3_9BURK/27-82
LSTAAAFVGAQAQAA------IDVTAVGTEIE--AAGNAASSTGTYVIAAVAAVCGVGLIIGLI-A0A1H4H7Y8_9GAMM/12-67
LSGVALTVGAGLAHA------IDTTKVGASIT--AAETDALTTGEFVIGSVASLVVIGLIIGIV-A0A1I1EUT2_9GAMM/16-71
AGFAANAPVFAADGQ------LDTTSVQAGID--AAKATGLSVGAMVVAAVASMVVVGIVIGLV-A0A261GG07_9GAMM/20-75
TAVATATLVSATANA------EDLSS

In [6]:
for record in alignment:
    if record.dbxrefs:
        print('%s %s' % (record.id, record.dbxrefs))

In [7]:
for record in alignment:
    print(record)

ID: I3CK69_9GAMM/17-72
Name: I3CK69_9GAMM
Description: I3CK69_9GAMM/17-72
Number of features: 0
/accession=I3CK69.1
/start=17
/end=72
Seq('LFMVTDAQAAALVAN------ITVTDIVDQLK--AGAPAIVSIATAAISLLAVI...GAL')
ID: I3CE12_9GAMM/11-66
Name: I3CE12_9GAMM
Description: I3CE12_9GAMM/11-66
Number of features: 0
/accession=I3CE12.1
/start=11
/end=66
Seq('LLFLSSWSVSATPFY------LVVDDVVDQIA--EGAFPVLAIGLASLTILAIV...GAM')
ID: I3CE11_9GAMM/21-76
Name: I3CE11_9GAMM
Description: I3CE11_9GAMM/21-76
Number of features: 0
/accession=I3CE11.1
/start=21
/end=76
Seq('SLLLFSGVASAAPIT------MDVSEVVDQIK--AAAVPILSVGVAALTVLAIV...GAM')
ID: A0A328ZKQ2_9BURK/30-85
Name: A0A328ZKQ2_9BURK
Description: A0A328ZKQ2_9BURK/30-85
Number of features: 0
/accession=A0A328ZKQ2.1
/start=30
/end=85
Seq('VTAALTLPAFAAAAQ------PDVTEVVAYIL--AGIATIALVGNAGLMVRGAT...GVL')
ID: A0A318DYR2_9GAMM/20-76
Name: A0A318DYR2_9GAMM
Description: A0A318DYR2_9GAMM/20-76
Number of features: 0
/accession=A0A318DYR2.1
/start=20
/end=76
Seq('LVLASPMALAQTGPT------

In [9]:
from Bio import AlignIO
alignment = AlignIO.read('PF05356_seed.faa', 'fasta')
print(alignment)

Alignment with 69 rows and 64 columns
LFMVTDAQAAALVAN......ITVTDIVDQLK..AGAPAIVSIA...GAL I3CK69_9GAMM/17-72
LLFLSSWSVSATPFY......LVVDDVVDQIA..EGAFPVLAIG...GAM I3CE12_9GAMM/11-66
SLLLFSGVASAAPIT......MDVSEVVDQIK..AAAVPILSVG...GAM I3CE11_9GAMM/21-76
VTAALTLPAFAAAAQ......PDVTEVVAYIL..AGIATIALVG...GVL A0A328ZKQ2_9BURK/30-85
LVLASPMALAQTGPT......IDVSSVTGFID.GQLIPGLGTIG...GAA A0A318DYR2_9GAMM/20-76
LLLATSAMASSAFAE......IDVSAATTAIS.TDGSTAITAVG...GAI A0A081KB79_9GAMM/9-65
LLVASALMSSSVFAA......IDISAATTALT.TDGSAAITAVG...GAI A0A081K835_9GAMM/9-65
LAMGTVLSANAAATT......IDTTDVLLSIA..AAVVAIVAVG...AAM A0A1H0GMS3_9BURK/27-82
LSTAAAFVGAQAQAA......IDVTAVGTEIE..AAGNAASSTG...GLI A0A1H4H7Y8_9GAMM/12-67
LSGVALTVGAGLAHA......IDTTKVGASIT..AAETDALTTG...GIV A0A1I1EUT2_9GAMM/16-71
AGFAANAPVFAADGQ......LDTTSVQAGID..AAKATGLSVG...GLV A0A261GG07_9GAMM/20-75
TAVATATLVSATANA......EDLSSITGKINLKSASTGIVAVG...GFL Q7VLI9_HAEDU/13-70
LVTAPALAFAEASTATS....FDVSAITGQISFASVAAGVIAIA...GMI A0A239SNN7_9BURK/21-80
TGMAMSAVAFADDKI...

#### Getting alignment objects as formatted strings

In [11]:
from Bio import AlignIO
alignment = AlignIO.read('PF05371_seed.sth', 'stockholm')
print(format(alignment, 'clustal'))

CLUSTAL X (1.81) multiple sequence alignment


I3CK69_9GAMM/17-72                  LFMVTDAQAAALVAN------ITVTDIVDQLK--AGAPAIVSIATAAISL
I3CE12_9GAMM/11-66                  LLFLSSWSVSATPFY------LVVDDVVDQIA--EGAFPVLAIGLASLTI
I3CE11_9GAMM/21-76                  SLLLFSGVASAAPIT------MDVSEVVDQIK--AAAVPILSVGVAALTV
A0A328ZKQ2_9BURK/30-85              VTAALTLPAFAAAAQ------PDVTEVVAYIL--AGIATIALVGNAGLMV
A0A318DYR2_9GAMM/20-76              LVLASPMALAQTGPT------IDVSSVTGFID-GQLIPGLGTIGTAFLLV
A0A081KB79_9GAMM/9-65               LLLATSAMASSAFAE------IDVSAATTAIS-TDGSTAITAVGTALIGV
A0A081K835_9GAMM/9-65               LLVASALMSSSVFAA------IDISAATTALT-TDGSAAITAVGTALVGL
A0A1H0GMS3_9BURK/27-82              LAMGTVLSANAAATT------IDTTDVLLSIA--AAVVAIVAVGGAVLGL
A0A1H4H7Y8_9GAMM/12-67              LSTAAAFVGAQAQAA------IDVTAVGTEIE--AAGNAASSTGTYVIAA
A0A1I1EUT2_9GAMM/16-71              LSGVALTVGAGLAHA------IDTTKVGASIT--AAETDALTTGEFVIGS
A0A261GG07_9GAMM/20-75              AGFAANAPVFAADGQ------LDTTSVQAGID--AAKATGLSVGAMV

In [14]:
from io import StringIO
alignments = AlignIO.parse('PF05371_seed.sth', 'stockholm')
out_handle = StringIO()
AlignIO.write(alignments, out_handle, 'clustal')
clustal_data = out_handle.getvalue()
print(clustal_data)

CLUSTAL X (1.81) multiple sequence alignment


I3CK69_9GAMM/17-72                  LFMVTDAQAAALVAN------ITVTDIVDQLK--AGAPAIVSIATAAISL
I3CE12_9GAMM/11-66                  LLFLSSWSVSATPFY------LVVDDVVDQIA--EGAFPVLAIGLASLTI
I3CE11_9GAMM/21-76                  SLLLFSGVASAAPIT------MDVSEVVDQIK--AAAVPILSVGVAALTV
A0A328ZKQ2_9BURK/30-85              VTAALTLPAFAAAAQ------PDVTEVVAYIL--AGIATIALVGNAGLMV
A0A318DYR2_9GAMM/20-76              LVLASPMALAQTGPT------IDVSSVTGFID-GQLIPGLGTIGTAFLLV
A0A081KB79_9GAMM/9-65               LLLATSAMASSAFAE------IDVSAATTAIS-TDGSTAITAVGTALIGV
A0A081K835_9GAMM/9-65               LLVASALMSSSVFAA------IDISAATTALT-TDGSAAITAVGTALVGL
A0A1H0GMS3_9BURK/27-82              LAMGTVLSANAAATT------IDTTDVLLSIA--AAVVAIVAVGGAVLGL
A0A1H4H7Y8_9GAMM/12-67              LSTAAAFVGAQAQAA------IDVTAVGTEIE--AAGNAASSTGTYVIAA
A0A1I1EUT2_9GAMM/16-71              LSGVALTVGAGLAHA------IDTTKVGASIT--AAETDALTTGEFVIGS
A0A261GG07_9GAMM/20-75              AGFAANAPVFAADGQ------LDTTSVQAGID--AAKATGLSVGAMV

#### Manipulating alignments

In [17]:
# slicing alignments
from Bio import AlignIO
alignment = AlignIO.read('PF05371_seed.sth', 'stockholm')
print('Number of rows: %i' % (len(alignment)))
for record in alignment:
    print('%s - %s' % (record.seq, record.id))

print(alignment[3:7])

Number of rows: 69
LFMVTDAQAAALVAN------ITVTDIVDQLK--AGAPAIVSIATAAISLLAVIAVFKYVRGAL - I3CK69_9GAMM/17-72
LLFLSSWSVSATPFY------LVVDDVVDQIA--EGAFPVLAIGLASLTILAIVAMIRWARGAM - I3CE12_9GAMM/11-66
SLLLFSGVASAAPIT------MDVSEVVDQIK--AAAVPILSVGVAALTVLAIVAMVRWARGAM - I3CE11_9GAMM/21-76
VTAALTLPAFAAAAQ------PDVTEVVAYIL--AGIATIALVGNAGLMVRGATAVFAWIRGVL - A0A328ZKQ2_9BURK/30-85
LVLASPMALAQTGPT------IDVSSVTGFID-GQLIPGLGTIGTAFLLVAVLFAAYRWIRGAA - A0A318DYR2_9GAMM/20-76
LLLATSAMASSAFAE------IDVSAATTAIS-TDGSTAITAVGTALIGVAALAVVFKWVKGAI - A0A081KB79_9GAMM/9-65
LLVASALMSSSVFAA------IDISAATTALT-TDGSAAITAVGTALVGLAGIAVVFKWVKGAI - A0A081K835_9GAMM/9-65
LAMGTVLSANAAATT------IDTTDVLLSIA--AAVVAIVAVGGAVLGLQVAVKAFKWVRAAM - A0A1H0GMS3_9BURK/27-82
LSTAAAFVGAQAQAA------IDVTAVGTEIE--AAGNAASSTGTYVIAAVAAVCGVGLIIGLI - A0A1H4H7Y8_9GAMM/12-67
LSGVALTVGAGLAHA------IDTTKVGASIT--AAETDALTTGEFVIGSVASLVVIGLIIGIV - A0A1I1EUT2_9GAMM/16-71
AGFAANAPVFAADGQ------LDTTSVQAGID--AAKATGLSVGAMVVAAVASMVVVGIVIGLV - A0A261GG07_9GAMM/20-75
TAVAT

In [20]:
# selecting by column
print(f'alignment[2, 6]: {alignment[2, 6]}')
# using two integer indices pulls out a single letter, short hand
print(f'alignment[2].seq[6]: {alignment[2].seq[6]}')
# pulling out a single column
print(f'alignment[:, 6]: {alignment[:, 6]}')
# selecting a range of columns
print(f'alignment[3:6, :6]: {alignment[3:6, :6]}')

alignment[2, 6]: G
alignment[2].seq[6]: G
alignment[:, 6]: AWGLMALLFTATLASDFGVTLLVLGAATVLLMFMLALALLSLAAVLLMLMMLMTVVNFLMAAVALAVII
alignment[3:6, :6]: Alignment with 3 rows and 6 columns
VTAALT A0A328ZKQ2_9BURK/30-85
LVLASP A0A318DYR2_9GAMM/20-76
LLLATS A0A081KB79_9GAMM/9-65


In [23]:
# alignments as arrays
import numpy as np
from Bio import AlignIO
alignment = AlignIO.read('PF05371_seed.sth', 'stockholm')
align_array = np.array([list(rec) for rec in alignment], np.character)
print('Array shape %i by %i' % align_array.shape)

Array shape 69 by 64


  align_array = np.array([list(rec) for rec in alignment], np.character)


#### Getting information from the alignment

In [28]:
# substitutions
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
alignment = MultipleSeqAlignment(
    [
        SeqRecord(Seq("ACTCCTA"), id='seq1'),
        SeqRecord(Seq("AAT-CTA"), id='seq2'),
        SeqRecord(Seq("CCTACT-"), id='seq3'),
        SeqRecord(Seq("TCTCCTC"), id='seq4'),
    ]
)
print(alignment)
substitutions = alignment.substitutions
print(substitutions)

# adding entries for missing letters
m = substitutions.select('ATCG')
print()
print(m)

Alignment with 4 rows and 7 columns
ACTCCTA seq1
AAT-CTA seq2
CCTACT- seq3
TCTCCTC seq4
    A    C    T
A 2.0  4.5  1.0
C 4.5 10.0  0.5
T 1.0  0.5 12.0


    A    T    C   G
A 2.0  1.0  4.5 0.0
T 1.0 12.0  0.5 0.0
C 4.5  0.5 10.0 0.0
G 0.0  0.0  0.0 0.0



#### Alignment tools

##### ClustalW

In [31]:
from Bio.Align.Applications import ClustalwCommandline
# help(ClustalwCommandline)
cline = ClustalwCommandline('clustalw2', infile='opuntia.fasta')
print(cline)

clustalw2 -infile=opuntia.fasta


In [None]:
# do not run: results in error
from Bio import AlignIO
align = AlignIO.read('opuntia.aln', 'clustal')
print(align)

##### MUSCLE

In [33]:
from Bio.Align.Applications import MuscleCommandline
# help(ClustalwCommandline)
cline = MuscleCommandline(input='opuntia.fasta', out='opuntia.txt')
print(cline)

muscle -in opuntia.fasta -out opuntia.txt


#### MUSCLE uses “-in” and “-out” but in Biopython we have to use “input” and “out” as the keyword arguments or property names

In [1]:
# default output for MUSCLE is fasta format
# asking for ClustalW-like format
from Bio.Align.Applications import MuscleCommandline
# help(ClustalwCommandline)
cline = MuscleCommandline(input='opuntia.fasta', out='opuntia.aln', clw=True)
print(cline)

muscle -in opuntia.fasta -out opuntia.aln -clw


In [3]:
# strict ClustalW output
from Bio.Align.Applications import MuscleCommandline
# help(ClustalwCommandline)
cline = MuscleCommandline(input='opuntia.fasta', out='opuntia.aln', clwstrict=True)
print(cline)  # Bio.AlignIO module should be able to read these alignments using format="clustal"

muscle -in opuntia.fasta -out opuntia.aln -clwstrict


#### Pairwise alignments

In [2]:
from Bio import Align
# aligner = Align.PairwiseAligner()
aligner = Align.PairwiseAligner(match_score=1.0)
seq1 = 'GAACT'
seq2 = 'GAT'
score = aligner.score(seq1, seq2)
score

3.0

In [3]:
# to see actual alignment
alignments = aligner.align(seq1, seq2)
# print(len(alignments))
for alignment in alignments:
    print(alignment)

GAACT
||--|
GA--T

GAACT
|-|-|
G-A-T



In [6]:
# local alignment (default is global alignment)
aligner.mode = 'local'
seq1 = 'AGAACTC'
seq2 = 'GAACT'
score = aligner.score(seq1, seq2)
print(f'score: {score}')

alignments = aligner.align(seq1, seq2)
for alignment in alignments:
    print(alignment)
print(aligner)

score: 5.0
AGAACTC
 ||||| 
 GAACT 

Pairwise sequence aligner with parameters
  match_score: 1.000000
  mismatch_score: 0.000000
  target_internal_open_gap_score: 0.000000
  target_internal_extend_gap_score: 0.000000
  target_left_open_gap_score: 0.000000
  target_left_extend_gap_score: 0.000000
  target_right_open_gap_score: 0.000000
  target_right_extend_gap_score: 0.000000
  query_internal_open_gap_score: 0.000000
  query_internal_extend_gap_score: 0.000000
  query_left_open_gap_score: 0.000000
  query_left_extend_gap_score: 0.000000
  query_right_open_gap_score: 0.000000
  query_right_extend_gap_score: 0.000000
  mode: local



In [7]:
print(aligner.algorithm)
print(aligner.epsilon)

Smith-Waterman
1e-06


#### Iterating over alignments

In [28]:
from Bio import Align
aligner = Align.PairwiseAligner()
alignments = aligner.align('AAA', 'AA')
print(alignments[2])
print(alignments[0])
len(alignments)

AAA
-||
-AA

AAA
||-
AA-



3

In [14]:
for alignment in alignments:
    print(alignment)

# alignments can be reused, i.e. you can iterate over alignments multiple times

# converting alignments into list or tuples
alignments = list(alignments)
print(f'list of alignments: {alignments}')

AAA
||-
AA-

AAA
|-|
A-A

AAA
-||
-AA

list of alignments: [<Bio.Align.PairwiseAlignment object at 0x00000132301DC2E0>, <Bio.Align.PairwiseAlignment object at 0x00000132301DC610>, <Bio.Align.PairwiseAlignment object at 0x00000132301DCB50>]


#### It is wise to check the number of alignments by calling len(alignments) before attempting to call list(alignments) to save all alignments as a list.

In [17]:
alignments.score

2.0

In [1]:
# alignment objects
from Bio import Align
aligner = Align.PairwiseAligner()
seq1 = 'GAACT'
seq2 = 'GAT'
alignments = aligner.align(seq1, seq2)
alignment = alignments[0]
alignment  # doctest: +SKIP

<Bio.Align.PairwiseAlignment at 0x2bca507e7f0>

In [35]:
print(alignment.score)
print(alignment.target)
print(alignment.query)
print(alignment)  # show alignment explictly
# alignment.shape  # returns a tuple: length of the alignnment and the number of columns in the alignment

3.0
GAACT
GAT
GAACT
||--|
GA--T



In [4]:
# local alignments: sections that are not aligned are not included in the number of columns
aligner.mode = 'local'
local_alignments = aligner.align('TGAACT', 'GAC')
local_alignment = local_alignments[0]
print(local_alignment)
# local_alignment.shape  # yields AttributeError: 'PairwiseAlignment' object has no attribute 'shape'

TGAACT
 ||-| 
 GA-C 



#### Use the aligned property to find the start and end indices of subsequences in the target and query sequence that were aligned to each other. Generally, if the alignment between target (t) and query (q) consists of N chunks, you get two tuples of length N:
(((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)), ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN)),)

In [6]:
alignment.aligned

(((0, 2), (4, 5)), ((0, 2), (2, 3)))

In [8]:
# alternative alignment
alignment = alignments[1]
print(alignment)
alignment.aligned

GAACT
|-|-|
G-A-T



(((0, 1), (2, 3), (4, 5)), ((0, 1), (1, 2), (2, 3)))

In [17]:
# aligning to reverse strand - use strand='-'
from Bio import Align
from Bio.Seq import reverse_complement
seq1 = 'AAAACCC'
seq2 = 'AACC'
aligner = Align.PairwiseAligner()
aligner.mismatch_score = -1
aligner.internal_gap_score = -1
print(aligner.score(seq1, seq2))  # strand is '+' by default
# print(aligner.score(seq1, reverse_complement(seq2), strand='-'))
# aligner.score(seq1, seq2, strand="-")
print(aligner.score(seq1, reverse_complement(seq2)))

4.0
0.0


#### Generalized pairwise alignments using match/mismatch scores and an alphabet
#### In most cases, PairwiseAligner is used to perform alignments of sequences (strings or Seq objects) consisting of single-letter nucleotides or amino acids. More generally, PairwiseAligner can also be applied to lists or tuples of arbitrary objects.

In [29]:
from Bio import Align
aligner = Align.PairwiseAligner()
s1 = ('Asn', 'Leu', 'Leu', 'Phe')
s2 = ('Asn', 'Leu', 'Phe')
aligner.alphabet = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys',
                    'Gln', 'Glu', 'Gly', 'His', 'Ile',
                    'Leu', 'Lys', 'Met', 'Phe', 'Pro',
                    'Ser', 'Thr', 'Trp', 'Tyr', 'Val']

# use +6/-1 match and mismatch scores as an approximation of the BLOSUM62 matrix and align these sequences to each other
aligner.match = +6
aligner.mismatch = -1
alignments = aligner.align(s1, s2)
print(f'number of alignments: {len(alignments)}')
print(f'alignment 1:\n{alignments[0]}')  # prints a blankline after the alignment
print(f'alignment 2:\n{alignments[1]}')
print(f'alignment score: {alignments.score}')

number of alignments: 2
alignment 1:
Asn Leu Leu Phe
||| ||| --- |||
Asn Leu --- Phe

alignment 2:
Asn Leu Leu Phe
||| --- ||| |||
Asn --- Leu Phe

alignment score: 18.0
