# Python Project Part 2: Data Parsing with Biopython

## Based on the Biopython tutorial, Chapters 2 and 5: Parsing sequence files
Focus on SARS-CoV-2 data

In [3]:
from Bio import SeqIO

Try these examples from Chapter 2, Sections 2.4.1 - 2.4.2
Then complete the exercises in Chapter 5 up to and including section 5.5.3
Note: You will need to make some compressed versions of the files;
      for the downloaded files in the exercise, use the examples in the tutorial.

In [4]:
# Walk through every record of the FASTA file and show, one per line,
# its identifier, the abbreviated repr of its sequence, and its length.
for seq_record in SeqIO.parse("sarscov2.fasta", "fasta"):
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep="\n")

MT081068.1
Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGT...TAA', SingleLetterAlphabet())
1260
MT072667.1
Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', SingleLetterAlphabet())
670
MT066159.1
Seq('TAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTAT...TTG', SingleLetterAlphabet())
290
MT050416.1
Seq('TGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCG...CCT', SingleLetterAlphabet())
562
MT161607.1
Seq('TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTC...AAA', SingleLetterAlphabet())
253


In [5]:
# Same walk as for the FASTA file, but reading the GenBank version:
# identifier, abbreviated sequence repr, and length, one per line.
for seq_record in SeqIO.parse("sarscov2.gbk", "genbank"):
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep="\n")

MT081068.1
Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGT...TAA', IUPACAmbiguousDNA())
1260
MT072667.1
Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', IUPACAmbiguousDNA())
670
MT066159.1
Seq('TAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTAT...TTG', IUPACAmbiguousDNA())
290
MT050416.1
Seq('TGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCG...CCT', IUPACAmbiguousDNA())
562
MT161607.1
Seq('TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTC...AAA', IUPACAmbiguousDNA())
253


# CHAPTER 5 Tutorial Below

Sections 5.1 to 5.5.3
Make sure to put in a few notes with the markdown language. Briefly explain what each part is doing.

In [6]:
# Collect just the record IDs from the GenBank file into a list.
identifiers = [
    record.id
    for record in SeqIO.parse("sarscov2.gbk", "genbank")
]
identifiers

['MT081068.1', 'MT072667.1', 'MT066159.1', 'MT050416.1', 'MT161607.1']

### 5.1.2  Iterating over the records in a sequence file

The object returned by `SeqIO.parse` is an iterator, so we can call `next()` on it to step through the records one at a time.

In [7]:
# SeqIO.parse hands back an iterator; next() pulls only the first record from it.
record_iterator = SeqIO.parse("sarscov2.gbk", "genbank")
first_record = next(record_iterator)
# Printing the record gives a human-readable summary (ID, name, description, annotations).
print(first_record)

ID: MT081068.1
Name: MT081068
Description: Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds
Number of features: 3
/molecule_type=RNA
/topology=linear
/data_file_division=VRL
/date=20-FEB-2020
/accessions=['MT081068']
/sequence_version=1
/keywords=['']
/source=Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
/organism=Severe acute respiratory syndrome coronavirus 2
/taxonomy=['Viruses', 'Riboviria', 'Nidovirales', 'Cornidovirineae', 'Coronaviridae', 'Orthocoronavirinae', 'Betacoronavirus', 'Sarbecovirus']
/references=[Reference(title='Optimizing diagnostic strategy for novel coronavirus pneumonia, a multi-center study in Eastern China', ...), Reference(title='Direct Submission', ...)]
/structured_comment=OrderedDict([('Assembly-Data', OrderedDict([('Assembly Method', 'SPAdes v. v3.13'), ('Sequencing Technology', 'Illumina')]))])
Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTG

In [8]:
# Step through the FASTA iterator by hand: each next() call advances by one record.
record_iterator = SeqIO.parse("sarscov2.fasta", "fasta")

first_record = next(record_iterator)
print(first_record.id, first_record.description, sep="\n")

second_record = next(record_iterator)
print(second_record.id, second_record.description, sep="\n")


MT081068.1
MT081068.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds
MT072667.1
MT072667.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds


In [9]:
# Fetch only the first GenBank record without keeping a reference to the iterator.
first_record = next(SeqIO.parse("sarscov2.gbk", "genbank"))
# Bare expression so the notebook displays the record's repr.
first_record

SeqRecord(seq=Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGT...TAA', IUPACAmbiguousDNA()), id='MT081068.1', name='MT081068', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds', dbxrefs=[])

### 5.1.3  Getting a list of the records in a sequence file

Converting the iterator to a list lets us access the records in any order, by index.

In [10]:
# Load every GenBank record into a list so that records can be picked by index.
records = list(SeqIO.parse("sarscov2.gbk", "genbank"))
print("Found {:d} records".format(len(records)))

# Index -1 addresses the final element of the list.
print("The last record")
last_record = records[-1]
print(last_record.id, repr(last_record.seq), len(last_record), sep="\n")

# Index 0 addresses the first element of the list.
print("The first record")
first_record = records[0]
print(first_record.id, repr(first_record.seq), len(first_record), sep="\n")

Found 5 records
The last record
MT161607.1
Seq('TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTC...AAA', IUPACAmbiguousDNA())
253
The first record
MT081068.1
Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGT...TAA', IUPACAmbiguousDNA())
1260


### 5.1.4  Extracting data

Printing the Annotation

In [13]:
# Show the annotations dictionary attached to the first GenBank record.
print(first_record.annotations)

{'molecule_type': 'RNA', 'topology': 'linear', 'data_file_division': 'VRL', 'date': '20-FEB-2020', 'accessions': ['MT081068'], 'sequence_version': 1, 'keywords': [''], 'source': 'Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)', 'organism': 'Severe acute respiratory syndrome coronavirus 2', 'taxonomy': ['Viruses', 'Riboviria', 'Nidovirales', 'Cornidovirineae', 'Coronaviridae', 'Orthocoronavirinae', 'Betacoronavirus', 'Sarbecovirus'], 'references': [Reference(title='Optimizing diagnostic strategy for novel coronavirus pneumonia, a multi-center study in Eastern China', ...), Reference(title='Direct Submission', ...)], 'structured_comment': OrderedDict([('Assembly-Data', OrderedDict([('Assembly Method', 'SPAdes v. v3.13'), ('Sequencing Technology', 'Illumina')]))])}


In [15]:
# Pull two individual entries out of the annotations dictionary, one per line.
print(
    first_record.annotations["source"],
    first_record.annotations["organism"],
    sep="\n",
)

Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
Severe acute respiratory syndrome coronavirus 2


Splitting each description on whitespace and keeping the token at index 1 (the first word after the record ID)

In [30]:
# Explicit-loop version: build the list one record at a time.
all_descrip = []
for seq_record in SeqIO.parse("sarscov2.fasta", "fasta"):
    # split() breaks the description on whitespace; [1] keeps the second token.
    all_descrip.append(seq_record.description.split()[1])
print(all_descrip)

[['MT081068.1', 'Severe', 'acute', 'respiratory', 'syndrome', 'coronavirus', '2', 'isolate', 'SARS-CoV-2/HS_194/human/2020/CHN', 'nucleocapsid', 'phosphoprotein', '(N)', 'gene,', 'complete', 'cds'], ['MT072667.1', 'Severe', 'acute', 'respiratory', 'syndrome', 'coronavirus', '2', 'isolate', 'SARS-CoV-2/GHB-03021/human/2020/BEL', 'orf1ab', 'polyprotein', 'gene,', 'partial', 'cds'], ['MT066159.1', 'Severe', 'acute', 'respiratory', 'syndrome', 'coronavirus', '2', 'isolate', 'SARS-CoV-2/CL5049/human/2020/MYS', 'orf1ab', 'polyprotein,', 'RdRp', 'region,', '(orf1ab)', 'gene,', 'partial', 'cds'], ['MT050416.1', 'Severe', 'acute', 'respiratory', 'syndrome', 'coronavirus', '2', 'isolate', 'SARS-CoV-2/QLD03/human/2020_FSS917/AUS', 'ORF1ab', 'polyprotein,', 'RdRp', 'region,', 'gene,', 'partial', 'cds'], ['MT161607.1', 'Severe', 'acute', 'respiratory', 'syndrome', 'coronavirus', '2', 'isolate', 'SARS-CoV-2/NIAA05032020/human/2020/ITA', "5'", 'UTR']]


Using List Comprehension

In [20]:
# Comprehension version: second whitespace-separated token of every FASTA description.
all_descrip = [
    rec.description.split()[1]
    for rec in SeqIO.parse("sarscov2.fasta", "fasta")
]
print(all_descrip)

['Severe', 'Severe', 'Severe', 'Severe', 'Severe']


### 5.1.5  Modifying data

In [24]:
# Take the first FASTA record and overwrite its identifier and description in memory.
record_iterator = SeqIO.parse("sarscov2.fasta", "fasta")
first_record = next(record_iterator)
first_record.id = "new_id"
# The description is rebuilt as "<id> <text>", joined by a single space.
first_record.description = " ".join([first_record.id, "desired new description"])
# Show only the first 200 characters of the FASTA-formatted record.
print(first_record.format("fasta")[:200])

>new_id desired new description
ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCC
TCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGT
CGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTC


## 5.2  Parsing sequences from compressed files

Calculating the total length of the sequences in a multi-record GenBank file

In [25]:
# Sum the lengths of all records; the generator avoids building a list first.
print(sum(len(r) for r in SeqIO.parse("sarscov2.gbk", "gb")))

3035


Using File Handle. It will close the handle automatically

In [26]:
# Same total, but passing an already-open file handle to SeqIO.parse instead of a filename.
# The with-block closes the handle when the body finishes.
with open("sarscov2.gbk") as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, "gb")))

3035


Opening a gzip and bz2 file

In [28]:
import gzip
# Open the gzip-compressed file in text mode ("rt") and parse it through the handle.
with gzip.open("sarscov2.gbk.gz", "rt") as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, "gb")))

3035


In [29]:
import bz2
# Open the bzip2-compressed file in text mode ("rt") directly in the with-statement,
# mirroring the gzip cell, so the handle is always closed once parsing is done.
with bz2.open("sarscov2.gbk.bz2", "rt") as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, "gb")))

## 5.3  Parsing sequences from the net

In [11]:
from Bio import Entrez


# The contact address only needs to be set once before any Entrez request.
Entrez.email = "A.N.Other@example.com"

# Fetch three (format, identifier) combinations with the same code path instead of
# three copy-pasted blocks. The Entrez rettype value is also used as the SeqIO format.
for rettype, accession in (
    ("fasta", "1824640867"),
    ("gb", "1824640867"),
    ("fasta", "NC_045512.2"),
):
    with Entrez.efetch(
        db="nucleotide", rettype=rettype, retmode="text", id=accession
    ) as handle:
        # SeqIO.read is used because exactly one record is expected per request.
        seq_record = SeqIO.read(handle, rettype)
    print("%s with %i features" % (seq_record.id, len(seq_record.features)))


MT246490.1 with 0 features
MT246490.1 with 21 features
NC_045512.2 with 0 features


Getting Multiple Records

In [14]:
# Request several GenBank records in one call by passing comma-separated identifiers,
# then iterate over them with SeqIO.parse.
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(
    db="nucleotide",
    rettype="gb",
    retmode="text",
    id="1824640867,1824640856,1824640834",
) as handle:
    for seq_record in SeqIO.parse(handle, "gb"):
        # Only the first 100 characters of the description are shown.
        print("{} {}...".format(seq_record.id, seq_record.description[:100]))
        print(
            "Sequence length {:d}, {:d} features, from: {}".format(
                len(seq_record),
                len(seq_record.features),
                seq_record.annotations["source"],
            )
        )

MT246490.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/WA-UW233/human/2020/USA, complete...
Sequence length 29850, 21 features, from: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
MT246489.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/WA-UW232/human/2020/USA, complete...
Sequence length 29864, 21 features, from: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
MT246487.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/WA-UW230/human/2020/USA, complete...
Sequence length 29883, 21 features, from: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)


In [24]:

# Fetch a single GenBank record and report its feature count plus two annotations.
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(
    db="nucleotide",
    rettype="gb",
    retmode="text",
    id="1824640867",
) as handle:
    seq_record = SeqIO.read(handle, "gb")

print("{} with {:d} features".format(seq_record.id, len(seq_record.features)))
print(
    seq_record.annotations["molecule_type"],
    seq_record.annotations["taxonomy"],
    sep="\n",
)

MT246490.1 with 21 features
RNA
['Viruses', 'Riboviria', 'Nidovirales', 'Cornidovirineae', 'Coronaviridae', 'Orthocoronavirinae', 'Betacoronavirus', 'Sarbecovirus']


### 5.3.2  Parsing SwissProt sequences from the net

from Bio import ExPASy
# Download one SwissProt entry through ExPASy and parse it with the "swiss" format.
with ExPASy.get_sprot_raw("A0A1W6S797") as handle:
    seq_record = SeqIO.read(handle, "swiss")

# Identifier, name and description, one per line.
print(seq_record.id, seq_record.name, seq_record.description, sep="\n")
print(repr(seq_record.seq))
print("Length {:d}".format(len(seq_record)))
print(seq_record.annotations["keywords"])


## 5.4  Sequence files as Dictionaries


Accessing the records in the form of a dictionary

In [68]:
# Turn the parsed GenBank records into a dictionary (keys shown below are the record IDs).
covid_dict = SeqIO.to_dict(SeqIO.parse("sarscov2.gbk", "genbank"))
# Number of entries in the dictionary.
len(covid_dict)

5

In [69]:
# Display the whole dictionary (record ID -> SeqRecord).
covid_dict

{'MT081068.1': SeqRecord(seq=Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGT...TAA', IUPACAmbiguousDNA()), id='MT081068.1', name='MT081068', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds', dbxrefs=[]),
 'MT072667.1': SeqRecord(seq=Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', IUPACAmbiguousDNA()), id='MT072667.1', name='MT072667', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds', dbxrefs=[]),
 'MT066159.1': SeqRecord(seq=Seq('TAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTAT...TTG', IUPACAmbiguousDNA()), id='MT066159.1', name='MT066159', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/CL5049/human/2020/MYS orf1ab polyprotein, RdRp region, (orf1ab) gene, partial cds', dbxrefs=[]),
 'MT050416.1': SeqRecord(seq=Seq('TGATA

Print all the keys in the dictionary

In [70]:
# Iterating a dict yields its keys, so list() over it gives the record IDs.
list(covid_dict)

['MT081068.1', 'MT072667.1', 'MT066159.1', 'MT050416.1', 'MT161607.1']

Print all the values in the dictionary

In [71]:
# List the SeqRecord objects stored as the dictionary's values.
list(covid_dict.values())

[SeqRecord(seq=Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGT...TAA', IUPACAmbiguousDNA()), id='MT081068.1', name='MT081068', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds', dbxrefs=[]),
 SeqRecord(seq=Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', IUPACAmbiguousDNA()), id='MT072667.1', name='MT072667', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds', dbxrefs=[]),
 SeqRecord(seq=Seq('TAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTAT...TTG', IUPACAmbiguousDNA()), id='MT066159.1', name='MT066159', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/CL5049/human/2020/MYS orf1ab polyprotein, RdRp region, (orf1ab) gene, partial cds', dbxrefs=[]),
 SeqRecord(seq=Seq('TGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCG...CCT'

 Accessing a single SeqRecord object via the keys

In [74]:
# Look up one record by its ID key.
# NOTE(review): the variable name SeqRecord is the same as the Biopython class name
# imported later in this notebook (from Bio.SeqRecord import SeqRecord); consider renaming.
SeqRecord = covid_dict['MT072667.1']
SeqRecord

SeqRecord(seq=Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', IUPACAmbiguousDNA()), id='MT072667.1', name='MT072667', description='Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds', dbxrefs=[])

Print SeqRecord description and Sequence

In [75]:
# Here SeqRecord is the record fetched from covid_dict in the previous cell, not the class.
print(SeqRecord.description)
print(repr(SeqRecord.seq))

Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds
Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', IUPACAmbiguousDNA())


#### 5.4.1.1  Specifying the dictionary keys

Dictionary for the FASTA files

In [76]:
# Build the same kind of dictionary from the FASTA version of the data.
fasta_dict = SeqIO.to_dict(SeqIO.parse("sarscov2.fasta", "fasta"))
len(fasta_dict)

5

In [77]:
# Print the dictionary's keys view (the FASTA record IDs).
print(fasta_dict.keys())

dict_keys(['MT081068.1', 'MT072667.1', 'MT066159.1', 'MT050416.1', 'MT161607.1'])


In [78]:
# Print the dictionary's values view (the SeqRecord objects parsed from FASTA).
print(fasta_dict.values())

dict_values([SeqRecord(seq=Seq('ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGT...TAA', SingleLetterAlphabet()), id='MT081068.1', name='MT081068.1', description='MT081068.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds', dbxrefs=[]), SeqRecord(seq=Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', SingleLetterAlphabet()), id='MT072667.1', name='MT072667.1', description='MT072667.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds', dbxrefs=[]), SeqRecord(seq=Seq('TAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTAT...TTG', SingleLetterAlphabet()), id='MT066159.1', name='MT066159.1', description='MT066159.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/CL5049/human/2020/MYS orf1ab polyprotein, RdRp region, (orf1ab) gene, partial cds', dbxrefs=[]), SeqRecord(seq=Seq('TGAT

#### 5.4.1.2  Indexing a dictionary using the SEGUID checksum

SEGUID checksum gives different chksum values for different sequences

In [79]:
from Bio.SeqUtils.CheckSum import seguid

# Print each record's ID next to the SEGUID checksum computed from its sequence.
for record in SeqIO.parse("sarscov2.gbk", "genbank"):
    print(record.id, seguid(record.seq))


MT081068.1 5BviYOuizbQkd1/g74Hzwgox008
MT072667.1 teNUy6ULqWZKMlXSo+Xj0NPzB2U
MT066159.1 TI9rGoDULpeR1MJFktlQha+y8GM
MT050416.1 5RsIiXNqTAgk3nuC9MheQCRHfH4
MT161607.1 pNKP9yzw9ctZzPH+NPv91Dz39G4


In [85]:
# Key the dictionary by each record's SEGUID checksum instead of its ID
# by supplying a custom key function.
seguid_dict = SeqIO.to_dict(
    SeqIO.parse("sarscov2.gbk", "genbank"),
    key_function=lambda rec: seguid(rec.seq),
)
print(seguid_dict.keys())

dict_keys(['5BviYOuizbQkd1/g74Hzwgox008', 'teNUy6ULqWZKMlXSo+Xj0NPzB2U', 'TI9rGoDULpeR1MJFktlQha+y8GM', '5RsIiXNqTAgk3nuC9MheQCRHfH4', 'pNKP9yzw9ctZzPH+NPv91Dz39G4'])


In [86]:

# Retrieve a record through its checksum key and show which entry it is.
record = seguid_dict["5BviYOuizbQkd1/g74Hzwgox008"]
print(record.id, record.description, sep="\n")

MT081068.1
Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds


### 5.4.2  Sequence files as Dictionaries – Indexed files

Dictionaries are good for small to medium-sized files, because they hold everything in memory.
For bigger files we use `Bio.SeqIO.index()`, which still returns a dictionary-like object but does not keep everything in memory.

In [95]:
# Index the GenBank file; the result is used like a dictionary in the cells that follow.
# Note that this rebinds covid_dict, replacing the in-memory dictionary built earlier.
covid_dict = SeqIO.index("sarscov2.gbk", "genbank")
len(covid_dict)


5

In [89]:
# Look up one record in the indexed file by its ID and show its description and sequence.
# The result is stored under a descriptive name so that it does not shadow
# Biopython's SeqRecord class, which this notebook imports later on.
selected_record = covid_dict['MT072667.1']
print(selected_record.description)
print(repr(selected_record.seq))

Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds
Seq('GTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATT...ACT', IUPACAmbiguousDNA())


In [90]:
# Close the indexed file object once it is no longer needed.
covid_dict.close()

In [91]:
# Index the FASTA file in the same way as the GenBank file.
# NOTE(review): no matching covid_fasta_dict.close() appears in this notebook.
covid_fasta_dict = SeqIO.index("sarscov2.fasta", "fasta")
len(covid_fasta_dict)

5

In [93]:
# List the record IDs available through the FASTA index.
list(covid_fasta_dict.keys())

['MT081068.1', 'MT072667.1', 'MT066159.1', 'MT050416.1', 'MT161607.1']

#### 5.4.2.2  Getting the raw data for a record

In [None]:
# Copy the raw text of selected SwissProt entries into a new file.
uniprot = SeqIO.index("uniprot_sprot.dat", "swiss")
try:
    # get_raw() returns bytes, so the output file is opened in binary mode ("wb").
    with open("selected.dat", "wb") as out_handle:
        for acc in ["P33487", "P19801", "P13689", "Q8JZQ5", "Q9TRC7"]:
            out_handle.write(uniprot.get_raw(acc))
finally:
    # Release the index's file handle even if a lookup or write fails,
    # matching the explicit close() calls used for the other indexes.
    uniprot.close()

### 5.4.3  Sequence files as Dictionaries – Database indexed files


Getting the virus data from ftp://ftp.ncbi.nih.gov/genbank/. Here we downloaded four of the virus files
and unzipped them:
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl2.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl3.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl4.seq.gz
$ gunzip gbvrl*.seq.gz

In [100]:
import glob
# Find the unzipped GenBank virus files in the working directory.
files = glob.glob("gbvrl*.seq")
print("{:d} files to index".format(len(files)))

4 files to index


Indexing these Virus files

In [101]:
# Build (or reuse) the index file gbvrl.idx covering all matched GenBank files.
gb_vrl = SeqIO.index_db("gbvrl.idx", files, "genbank")
print("{:d} sequences indexed".format(len(gb_vrl)))

453473 sequences indexed


In [102]:
# Look up a single record in the multi-file index by its ID.
print(gb_vrl["AB811634.1"].description)

Equine encephalosis virus NS3 gene, complete cds, isolate: Kimron1


#### 5.4.3.1  Getting the raw data for a record


Here we get the raw bytes of each record

In [103]:
# get_raw() returns the record's original text as a bytes object (note the b'...' output).
print(gb_vrl.get_raw("AB811634.1"))

b'LOCUS       AB811634                 723 bp    RNA     linear   VRL 17-JUN-2015\nDEFINITION  Equine encephalosis virus NS3 gene, complete cds, isolate: Kimron1.\nACCESSION   AB811634\nVERSION     AB811634.1\nKEYWORDS    .\nSOURCE      Equine encephalosis virus\n  ORGANISM  Equine encephalosis virus\n            Viruses; dsRNA viruses; Reoviridae; Sedoreovirinae; Orbivirus.\nREFERENCE   1\n  AUTHORS   Westcott,D., Mildenberg,Z., Bellaiche,M., McGowan,S.L.,\n            Grierson,S.S., Choudhury,B. and Steinbach,F.\n  TITLE     Evidence for the circulation of equine encephalosis virus in Israel\n            since 2001\n  JOURNAL   PLoS ONE 8 (8), E70532 (2013)\n   PUBMED   23950952\n  REMARK    DOI:10.1371/journal.pone.0070532\n            Erratum:[PLoS One. 2013;8(9).\n            doi:10.1371/annotation/4875ab92-466a-4f5f-b9c7-bc0e168a8f9b.\n            Wescott, David G [corrected to Westcott, David G]]\n            Publication Status: Online-Only\nREFERENCE   2  (bases 1 to 723)\n  AU

### 5.4.4  Indexing compressed files


In [106]:
# Index the compressed .bgz copy of the GenBank file directly.
# NOTE(review): no output was captured for this cell — confirm that sarscov2.gbk.bgz exists.
covid_dict = SeqIO.index("sarscov2.gbk.bgz", "genbank")
len(covid_dict)


In [None]:
# Close the index opened on the compressed file.
covid_dict.close()

## 5.5  Writing Sequence Files
Writing sequence records to file

In [108]:
from Bio.Alphabet import generic_protein
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Build three protein SeqRecord objects by hand and write them to a FASTA file.
# Each record is given a sequence, an id and a free-text description.
# NOTE(review): generic_protein comes from Bio.Alphabet, which was removed in
# Biopython 1.78; on newer versions drop that argument and the corresponding import.
rec1 = SeqRecord(
    Seq(
        "MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
        "GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
        "NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
        "SSAC",
        generic_protein,
    ),
    id="MT081068.1",
    description="Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds",
)

rec2 = SeqRecord(
    Seq(
        "YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ"
        "DMVVVEIPKLGKEAAVKAIKEWGQ",
        generic_protein,
    ),
    id="MT072667.1",
    description="Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/GHB-03021/human/2020/BEL orf1ab polyprotein gene, partial cds",
)

# The description below previously ended with a stray "]" that would have been
# written verbatim into the FASTA header; it has been removed.
rec3 = SeqRecord(
    Seq(
        "MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC"
        "EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP"
        "KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN"
        "NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV"
        "SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW"
        "IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT"
        "TGEGLEWGVLFGFGPGLTVETVVLHSVAT",
        generic_protein,
    ),
    id="MT066159.1",
    description="Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/CL5049/human/2020/MYS orf1ab polyprotein, RdRp region, (orf1ab) gene, partial cds",
)

my_records = [rec1, rec2, rec3]

# SeqIO.write returns the number of records written, which the notebook displays.
SeqIO.write(my_records, "my_example.faa", "fasta")


3

### 5.5.2  Converting between sequence file formats

One way is to combine SeqIO.parse and SeqIO.write, but it is a bit more verbose

In [109]:
# Two-step conversion: parse the GenBank records, then write them back out as FASTA.
records = SeqIO.parse("sarscov2.gbk", "genbank")
count = SeqIO.write(records, "my_example.fasta", "fasta")
print("Converted {:d} records".format(count))

Converted 5 records


Another approach is to use SeqIO.convert
This allows converting from genbank file format to fasta file format

In [110]:
# One-step conversion: SeqIO.convert takes the input file/format and output file/format.
count = SeqIO.convert(
    "sarscov2.gbk",
    "genbank",
    "my_example.fasta",
    "fasta",
)
print("Converted {:d} records".format(count))

Converted 5 records


### 5.5.3  Converting a file of sequences to their reverse complements

Converting file of nucleotide sequences  into a file containing their reverse complement sequences.
We use the built in .reverse_complement() method

In [113]:
# For each GenBank record, print its ID followed by the reverse complement of its sequence.
for record in SeqIO.parse("sarscov2.gbk", "genbank"):
    print(record.id, record.seq.reverse_complement(), sep="\n")


MT081068.1
TTAGGCCTGAGTTGAGTCAGCACTGCTCATGGATTGTTGCAATTGTTTGGAGAAATCATCCAAATCTGCAGCAGGAAGAAGAGTCACAGTTTGCTGTTTCTTCTGTCTCTGCGGTAAGGCTTGAGTTTCATCAGCCTTCTTCTTTTTGTCCTTTTTAGGCTCTGTTGGTGGGAATGTTTTGTATGCGTCAATATGCTTATTCAGCAAAATGACTTGATCTTTGAAATTTGGATCTTTGTCATCCAATTTGATGGCACCTGTGTAGGTCAACCACGTTCCCGAAGGTGTGACTTCCATGCCAATGCGCGACATTCCGAAGAACGCTGAAGCGCTGGGGGCAAATTGTGCAATTTGCGGCCAATGTTTGTAATCAGTTCCTTGTCTGATTAGTTCCTGGTCCCCAAAATTTCCTTGGGTTTGTTCTGGACCACGTCTGCCGAAAGCTTGTGTTACATTGTATGCTTTAGTGGCAGTACGTTTTTGCCGAGGCTTCTTAGAAGCCTCAGCAGCAGATTTCTTAGTGACAGTTTGGCCTTGTTGTTGTTGGCCTTTACCAGACATTTTGCTCTCAAGCTGGTTCAATCTGTCAAGCAGCAGCAAAGCAAGAGCAGCATCACCGCCATTGCCAGCCATTCTAGCAGGAGAAGTTCCCCTACTGCTGCCTGGAGTTGAATTTCTTGAACTGTTGCGACTACGTGATGAGGAACGAGAAGAGGCTTGACTGCCGCCTCTGCTCCCTTCTGCGTAGAAGCCTTTTGGCAATGTTGTTCCTTGAGGAAGTTGTAGCACGATTGCAGCATTGTTAGCAGGATTGCGGGTGCCAATGTGATCTTTTGGTGTATTCAAGGCTCCCTCAGTTGCAACCCATATGATGCCGTCTTTGTTAGCACCATAGGGAAGTCCAGCTTCTGGCCCAGTTCCTAGGTAGTAGAAATACCATCTTGGACTGAGATCTTTCATTTTACCGTCACCACCACGAATTCGTCTGG

We can save the Reverse complement to the file using SeqRecord objects.
Using List comprehension

In [116]:
from Bio import SeqIO
# Build a new SeqRecord for every FASTA record holding its reverse complement.
# Each new record gets an "rc_"-prefixed ID and a fixed description.
# (No backslash continuation is needed inside brackets, and the bare len(records)
# expression was dropped: mid-cell, its value was neither displayed nor used.)
records = [
    rec.reverse_complement(id="rc_" + rec.id, description="reverse complement")
    for rec in SeqIO.parse("sarscov2.fasta", "fasta")
]
print(records)

[SeqRecord(seq=Seq('TTAGGCCTGAGTTGAGTCAGCACTGCTCATGGATTGTTGCAATTGTTTGGAGAA...CAT', SingleLetterAlphabet()), id='rc_MT081068.1', name='<unknown name>', description='reverse complement', dbxrefs=[]), SeqRecord(seq=Seq('AGTTACTGAAGTCATTGAGAGCCTTTGCGAGATGACAACAAGCAGCTTCTCTGT...TAC', SingleLetterAlphabet()), id='rc_MT072667.1', name='<unknown name>', description='reverse complement', dbxrefs=[]), SeqRecord(seq=Seq('CAATCATAAACGGATTATAGACGTAATCAAATCCAATAGAATGATGCCAACAGG...TTA', SingleLetterAlphabet()), id='rc_MT066159.1', name='<unknown name>', description='reverse complement', dbxrefs=[]), SeqRecord(seq=Seq('AGGTCCTTTAGTAAGGTCAGTCTCAGTCCAACATTTTGCTTCAGACATAAAAAC...TCA', SingleLetterAlphabet()), id='rc_MT050416.1', name='<unknown name>', description='reverse complement', dbxrefs=[]), SeqRecord(seq=Seq('TTTACCTTTCGGTCACACCCGGACAAAACCTAGATGTGCTGATGATCGGCTGCA...GTA', SingleLetterAlphabet()), id='rc_MT161607.1', name='<unknown name>', description='reverse complement', dbxrefs=[])]


Adding a conditional check that keeps only the records shorter than 700 bases (longer records are dropped)

In [117]:
# Same reverse-complement records as before, restricted to inputs shorter than 700.
records = [
    rec.reverse_complement(id="rc_" + rec.id, description="reverse complement")
    for rec in SeqIO.parse("sarscov2.fasta", "fasta")
    if len(rec) < 700
]
# Last expression of the cell: the notebook displays how many records passed the filter.
len(records)

4

Writing the reverse complement to a file

In [118]:
# Save the reverse-complement records as FASTA; the displayed value is SeqIO.write's return.
SeqIO.write(records, "rev_comp.fasta", "fasta")

4

### 5.5.4  Getting your SeqRecord objects as formatted strings

Loading a bunch of SeqRecord objects from  GenBank file, and creating a string containing the records in FASTA format

In [123]:
from Bio import SeqIO
try:
    from StringIO import StringIO ## for Python 2
except ImportError:
    from io import StringIO ## for Python 3

# Write the GenBank records in FASTA format into an in-memory text buffer instead of a file.
records = SeqIO.parse("sarscov2.gbk", "genbank")
out_handle = StringIO()
SeqIO.write(records, out_handle, "fasta")
# getvalue() returns everything written to the buffer as a single string.
fasta_data = out_handle.getvalue()
print(fasta_data)

>MT081068.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/HS_194/human/2020/CHN nucleocapsid phosphoprotein (N) gene, complete cds
ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCC
TCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGT
CGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGC
AAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCA
GATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAA
ATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGA
CTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAAT
ACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAA
CTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGT
CAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGC
AGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCT
TTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAA
CAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAA
AAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTC

In [122]:
from Bio import SeqIO

# Lazily keep only records longer than 100, then write them in tab-separated format.
records = filter(
    lambda rec: len(rec) > 100,
    SeqIO.parse("sarscov2.gbk", "genbank"),
)

SeqIO.write(records, "sarscov2.tab", "tab")

5