# Installation

Check Python version

In [1]:
# Report the version of the Python interpreter running this kernel.
from platform import python_version

kernel_python = python_version()
kernel_python

'3.9.7'

Install BioPython.

Note:
- only need to run this line for the first run

In [2]:
# Uncomment and run once to install Biopython; not needed on later runs.
#pip install biopython

Import `Bio` and view version

In [3]:
# Confirm Biopython is importable and print the installed release.
import Bio

biopython_version = Bio.__version__
print(biopython_version)

1.80


In [4]:
#from Bio.Seq import Seq
from Bio import SeqIO, SearchIO, Entrez
#from Bio.Seq import Seq
#from Bio.SeqUtils import GC, molecular_weight
from Bio.Blast import NCBIWWW
#from Bio.Data import CodonTable
import Bio.Data as bd
import Bio.SeqUtils as bsu
import Bio.Seq as bseq
import numpy as np

# BioPython

`Bio.SeqIO` can read, write and index multiple sequence file formats. For more detail, click [here](https://biopython.org/wiki/SeqIO)

* Provides a simple uniform interface to input and output assorted sequence file formats (including multiple sequence alignments)
* Will only deal with sequences as `SeqRecord` objects
* File formats:
    * `abi`
    * `cif-atoms`
    * `clustal`
    * `fasta`
    * `pdb-seqres`
    * `pdb-atom`
    * `swiss`
* Note: When using `Bio.SeqIO` for alignments, make sure all the sequences are the same length (i.e. they need to include gaps)

View the complete sequence for [Drosophila X Virus Segment A](https://www.ncbi.nlm.nih.gov/gene/993338) using `Bio.SeqIO`.

Note:
- Organism: Drosophila X virus
- Molecule type: genomic RNA
- Gene: 1 to 3099
- Locus tag: DxvsAgp1
- For more detail, click [here](https://www.ncbi.nlm.nih.gov/nuccore/NC_004177.1?report=genbank&from=108&to=3206)

In [5]:
# Read the single-record FASTA file for segment A and print its sequence.
gene_fasta = "drosophila_x_virus_segment_A_complete_sequence.fasta"
gene_record = SeqIO.read(gene_fasta, "fasta")
print(gene_record.seq)

ATGAATACGACAAACGAATACTTGAAAACTCTTTTAAACCCAGCACAATTTATCTCAGACATTCCTGATGATATAATGATCCGACACGTAAACAGCGCCCAGACCATCACCTACAACTTGAAGTCAGGGGCCTCTGGCACCGGCCTGATCGTGGTCTATCCAAACACCCCGTCGAGTATTAGCGGCTTCCATTACATATGGGATTCCGCTACCTCGAATTGGGTGTTTGATCAGTACATCTACACAGCTCAGGAGTTGAAGGACTCATATGACTATGGCAGACTGATTTCAGGCTCGCTAAGCATTAAGTCCAGCACCTTACCTGCGGGTGTTTATGCACTGAATGGCACATTCAATGCAGTCTGGTTCCAAGGGACCTTGAGTGAAGTGTCTGACTACTCTTACGATAGGATCCTGTCAATAACATCCAATCCTCTGGATAAGGTTGGAAATGTGTTGGTTGGAGACGGCATAGAGGTTCTAAGCCTGCCGCAGGGGTTCAACAACCCCTACGTTAGGCTGGGTGACAAGTCACCGTCCACTCTATCCTCTCCAACCCACATAACCAACACTTCCCAGAACTTGGCTACGGGAGGTGCATACATGATCCCAGTAACCACAGTTCCTGGGCAAGGATTCCATAACAAGGAATTCAGCATTAATGTGGACTCCGTAGGGCCAGTTGACATCTTGTGGTCTGGTCAAATGACTATGCAGGACGAATGGACTGTAACTGCAAATTATCAACCATTGAACATCTCTGGCACGCTAATTGCAAACAGTCAGCGAACCCTAACATGGTCCAACACTGGTGTATCCAATGGCAGCCACTACATGAACATGAACAACCTTAATGTCTCCCTTTTCCATGAGAATCCACCACCTGAACCCGTTGCCGCCATAAAAATAAACATCAATTATGGAAACAACACCAATGGTGACAGCTCGTTCAGTGTGGACTCATCATTTACCATCAATGTCATTGGGGGCGCCACCATTG

In [6]:
# Length of the segment A record, reported in base pairs.
gene_length_bp = len(gene_record)
print("Sequence length (bp)", gene_length_bp)

Sequence length (bp) 3099


View the complete amino acid sequence for Drosophila X Virus Polyprotein using Bio.SeqIO

Note:
- Chromosome: Segment A
- Region 2 to 442: Birnavirus VP2 protein
- Region 44 to 702: Birnavirus VP4 protein
- Region 734 to 983: Birnavirus VP3 protein
- For more detail, click [here](https://www.ncbi.nlm.nih.gov/protein/1545998)

In [7]:
# Read the single-record FASTA file for the polyprotein and print its sequence.
polyprotein_fasta = "polyprotein_Drosophila_X_virus_sequence.fasta"
polyprotein_record = SeqIO.read(polyprotein_fasta, "fasta")
print(polyprotein_record.seq)

MNTTNEYLKTLLNPAQFISDIPDDIMIRHVNSAQTITYNLKSGASGTGLIVVYPNTPSSISGFHYIWDSATSNWVFDQYIYTAQELKDSYDYGRLISGSLSIKSSTLPAGVYALNGTFNAVWFQGTLSEVSDYSYDRILSITSNPLDKVGNVLVGDGIEVLSLPQGFNNPYVRLGDKSPSTLSSPTHITNTSQNLATGGAYMIPVTTVPGQGFHNKEFSINVDSVGPVDILWSGQMTMQDEWTVTANYQPLNISGTLIANSQRTLTWSNTGVSNGSHYMNMNNLNVSLFHENPPPEPVAAIKININYGNNTNGDSSFSVDSSFTINVIGGATIGVNSPTVGVGYQGVAEGTAITISGINNYELVPNPDLQKNLPMTYGTCDPHDLTYIKYILSNREQLGLRSVMTLADYNRMKMYMHVLTNYHVDEREASSFDFWQLLKQIKNVAVPLAATLAPQFAPIIGAADGLANAILGDSASGRPVGNSASGMPISMSRRLRNAYSADSPLGEEHWLPNENENFNKFDIIYDVSHSSMALFPVIMMEHDKVIPSDPEELYIAVSLTESLRKQIPNLNDMPYYEMGGHRVYNSVSSNVRSGNFLRSDYILLPCYQLLEGRLASSTSPNKVTGTSHQLAIYAADDLLKSGVLGKAPFAAFTGSVVGSSVGEVFGINLKLQLTDSLGIPLLGNSPGLVQVKTLTSLDKKIKDMGDVKRRTPKQTLPHWTAGSASMNPFMNTNPFLEELDQPIPSNAAKPISEETRDLFLSDGQTIPSSQEKIATIHEYLLEHKELEEAMFSLISQGRGRSLINMVVKSALNIETQSREVTGERRQRLERKLRNLENQGIYVDESKIMSRGRISKEDTELAMRIARKNQKDAKLRRIYSNNASIQESYTVDDFVSYWMEQESLPTGIQIAMWLKGDDWSQPIPPRVQRRHYDSYIMMLGPSPTQEQADAVKDLVDDIYDRNQGKGPSQEQARELSHAVRRLISHSLVNQPATAPRVPPRR

In [8]:
# The polyprotein record holds an amino-acid sequence, so its length is a
# residue count (aa), not base pairs (bp).
print("Sequence length (aa)", len(polyprotein_record))

Sequence length (bp) 1032


## `SeqRecord` class

- The only class of object returned by `SeqIO`.
- The information that can be extracted from a `SeqRecord` object depends on the file format (e.g. FASTA, GenBank).
- View all the information held in this object via: `print(record)`

In [15]:
# Iterate over every record in the FASTA file and print its summary.
from Bio import SeqIO

polyprotein_fasta = "polyprotein_Drosophila_X_virus_sequence.fasta"
for polyprotein_record in SeqIO.parse(polyprotein_fasta, "fasta"):
    print(polyprotein_record)

ID: AAB16798.1
Name: AAB16798.1
Description: AAB16798.1 polyprotein [Drosophila X virus]
Number of features: 0
Seq('MNTTNEYLKTLLNPAQFISDIPDDIMIRHVNSAQTITYNLKSGASGTGLIVVYP...DIV')


View the type of information held in this object using the `dir()` function. Note: the `dir()` function returns all properties and methods of the specified object, without the values:

In [16]:
# List the attribute and method names available on the record (names only, no values).
dir(polyprotein_record)

['__add__',
 '__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'count',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'islower',
 'isupper',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

Print the information held in `polyprotein_record`

In [17]:
# Print the main SeqRecord attributes, one per line.
print("ID", polyprotein_record.id)
print("Name", polyprotein_record.name)
print("Description", polyprotein_record.description)
print("Sequence", polyprotein_record.seq)
# `features` is a list, so report its length to match the label
# (empty for this FASTA record).
print("Number of features", len(polyprotein_record.features))

ID AAB16798.1
Name AAB16798.1
Description AAB16798.1 polyprotein [Drosophila X virus]
Sequence MNTTNEYLKTLLNPAQFISDIPDDIMIRHVNSAQTITYNLKSGASGTGLIVVYPNTPSSISGFHYIWDSATSNWVFDQYIYTAQELKDSYDYGRLISGSLSIKSSTLPAGVYALNGTFNAVWFQGTLSEVSDYSYDRILSITSNPLDKVGNVLVGDGIEVLSLPQGFNNPYVRLGDKSPSTLSSPTHITNTSQNLATGGAYMIPVTTVPGQGFHNKEFSINVDSVGPVDILWSGQMTMQDEWTVTANYQPLNISGTLIANSQRTLTWSNTGVSNGSHYMNMNNLNVSLFHENPPPEPVAAIKININYGNNTNGDSSFSVDSSFTINVIGGATIGVNSPTVGVGYQGVAEGTAITISGINNYELVPNPDLQKNLPMTYGTCDPHDLTYIKYILSNREQLGLRSVMTLADYNRMKMYMHVLTNYHVDEREASSFDFWQLLKQIKNVAVPLAATLAPQFAPIIGAADGLANAILGDSASGRPVGNSASGMPISMSRRLRNAYSADSPLGEEHWLPNENENFNKFDIIYDVSHSSMALFPVIMMEHDKVIPSDPEELYIAVSLTESLRKQIPNLNDMPYYEMGGHRVYNSVSSNVRSGNFLRSDYILLPCYQLLEGRLASSTSPNKVTGTSHQLAIYAADDLLKSGVLGKAPFAAFTGSVVGSSVGEVFGINLKLQLTDSLGIPLLGNSPGLVQVKTLTSLDKKIKDMGDVKRRTPKQTLPHWTAGSASMNPFMNTNPFLEELDQPIPSNAAKPISEETRDLFLSDGQTIPSSQEKIATIHEYLLEHKELEEAMFSLISQGRGRSLINMVVKSALNIETQSREVTGERRQRLERKLRNLENQGIYVDESKIMSRGRISKEDTELAMRIARKNQKDAKLRRIYSNNASIQESYTVDDFVSYWMEQESLPT

## Other applications of `Bio.SeqIO`:
- parse
- convert file format
- generate random subsequences
- filter by sequence length
- write sequence output

In [9]:
# Bare expression: the notebook displays the record's repr as the cell output.
polyprotein_record

SeqRecord(seq=Seq('MNTTNEYLKTLLNPAQFISDIPDDIMIRHVNSAQTITYNLKSGASGTGLIVVYP...DIV'), id='AAB16798.1', name='AAB16798.1', description='AAB16798.1 polyprotein [Drosophila X virus]', dbxrefs=[])

# `Bio.Blast`

In [18]:
from Bio.Blast import NCBIWWW

# PyRosetta

Introduction to Python in Jupyter Notebooks: [link](https://nbviewer.org/github/jckantor/CBE20255/blob/master/notebooks/00.01-Getting-Started-with-Jupyter-Notebooks-and-Python.ipynb)

In [22]:
!pip install pyrosettacolabsetup
import pyrosettacolabsetup; pyrosettacolabsetup.install_pyrosetta()
import pyrosetta; pyrosetta.init()



ModuleNotFoundError: No module named 'google.colab'