# Running instructions

## Change working directory and activate the virtual environment

In [None]:
# Move into the fhub_gene_survived data directory so that the relative file
# names used by the commands below (aligned.fna, variants.vcf, consensus.fna)
# resolve against it.
# NOTE(review): this path is hard-coded under ~/workspace, while the Python
# cell further down builds the same sub-path from $GENOMICS_DATA_BASE —
# confirm both point at the same location.
cd ~/workspace/genome_data/annotations/e_coli/gene_sequences/fhub_gene_survived
# Activate the .venv_gwas virtual environment of alfred-data-analysis.
source ~/workspace/alfred-data-analysis/.venv_gwas/bin/activate

## Obtain variants from the multiple sequence alignment file

In [None]:
# Run snp-sites on the alignment aligned.fna and write the result to
# variants.vcf (-o). Per the snp-sites usage text, -v selects VCF output.
snp-sites -v -o variants.vcf aligned.fna

## Obtain consensus sequence from the multiple sequence alignment file

In [None]:
# Run snp-sites with -r on aligned.fna and write the result to consensus.fna (-o).
# NOTE(review): the snp-sites usage text describes -r as "output internal
# pseudo reference sequence" — confirm that this is the consensus intended here.
snp-sites -r -o consensus.fna aligned.fna

In [1]:
import os
from pathlib import Path

from Bio import AlignIO
from Bio.Align import AlignInfo


# Directory holding the alignment, rooted at the GENOMICS_DATA_BASE
# environment variable (a KeyError is raised if it is not set).
dataDir = (
    Path(os.environ['GENOMICS_DATA_BASE'])
    / 'annotations'
    / 'e_coli'
    / 'gene_sequences'
    / 'fhub_gene_survived'
)

# Read the single multiple sequence alignment stored in FASTA format.
alignment = AlignIO.read(dataDir / "aligned.fna", "fasta")

# Summarise the alignment and derive a consensus from it, using a 0.45
# threshold and 'N' as the ambiguous symbol.
# NOTE(review): the recorded output of this cell looks like a Biopython
# deprecation notice pointing at Bio.motifs — consider migrating, but check
# that tie and threshold handling give the same sequence first.
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus(threshold=0.45, ambiguous='N')

>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> from Bio.Align import MultipleSeqAlignment
>>> from Bio.Align.AlignInfo import SummaryInfo
>>> msa = MultipleSeqAlignment([SeqRecord(Seq('ACGT')),
...                             SeqRecord(Seq('ATGT')),
...                             SeqRecord(Seq('ATGT'))])
>>> summary = SummaryInfo(msa)
>>> dumb_consensus = summary.dumb_consensus(ambiguous='N')
>>> print(dumb_consensus)
ANGT
>>> alignment = msa.alignment
>>> from Bio.motifs import Motif
>>> motif = Motif('ACGT', alignment)
>>> print(motif.consensus)
ATGT
>>> print(motif.degenerate_consensus)
AYGT
>>> counts = motif.counts
>>> consensus = counts.calculate_consensus(identity=0.7)
>>> print(consensus)
ANGT

If your multiple sequence alignment object was obtained using Bio.AlignIO, then you can obtain a new-style Alignment object directly by using Bio.Align.read instead of Bio.AlignIO.read, or Bio.Align.parse instead of Bio.AlignIO.parse.


In [2]:
# Number of positions in the consensus that carry the ambiguous symbol 'N'.
print(consensus.count('N'))

0


In [3]:
# Print the consensus nucleotide sequence in full.
print(consensus)

GTGAGTAAACGAATTGCGCTTTTTCCGGTGTTATTGCTGGCGCTGCTGGTGGTTGCTGCTGCGGCGTTGACCTGGATGAACTTCTCGCAGGCGCTGCCGCGCAGCCAGTGGGCGCAGGCCGCCTGGTCGCCGGATATTGACGTCATCGAGCAGATGATTTTTCACTACAGCTTGTTGCCGCGTCTGGCGATTTCGCTGCTGGTGGGCGCGGGCCTGGGGCTGGTGGGCGTGCTGTTTCAGCAAGTGCTGCGTAACCCGCTGGCGGAACCGACGACGTTGGGTGTTGCAACAGGCGCGCAACTGGGGATTACCGTCACCACGCTCTGGGCGATCCCCGGCGCGATGGCGAGCCAGTTTGCTGCGCTGGCAGGGGCTTGTGTTGTTGGCTTAATCGTCTTTGGCGTCGCGTGGGGGAAACGGCTTTCGCCGGTAACGCTGATCCTCGCGGGGCTGGTAGTGAGCCTTTATTGCGGCGCAATCAATCAGTTACTGGTTATCTTCCATCATGACCAACTGCAAAGCATGTTCCTGTGGAGCACCGGAACGCTGACGCAAACCGACTGGGGCGGCGTTGAGCGTTTATGGCCGCAGCTGCTGGGCGGCGTGATGCTGACGTTATTGCTACTTCGCCCGTTAACTCTGATGGGGCTTGATGATGGCGTGGCGCGCAATCTCGGGCTGGCCTTGTCGCTCGCTCGTCTGGCGGCGTTGTCGCTGGCGATTGTCATCAGTGCGCTGCTGGTGAACGCGGTGGGGATTATCGGCTTTATCGGTTTGTTCGCGCCACTGCTGGCGAAAATGCTGGGGGCGCGGCGTTTGTTGCCACGGTTGATGCTGGCATCGCTGATTGGTGCGTTAATTCTGTGGCTTTCTGATCAAATCATCCTCTGGCTGACTCGCGTGTGGATGGAAGTGTCCACCGGTTCGGTCACTGCGTTGATCGGTGCGCCGCTGCTACTGTGGCTGCTGCCGCGTTTACGCAGCATTAGCGCGCCGGATA

In [4]:
# Length of the consensus sequence.
print(len(consensus))

1983


### Translate the nucleotide sequence to an amino acid sequence

In [5]:
from Bio.Seq import Seq


# Wrap the consensus in a Seq object and translate it to protein.
dna_seq = Seq(consensus)

# The codon table is spelled out explicitly; "Standard" is the library default.
# NOTE(review): the consensus begins with GTG, which this call renders as 'V'.
# For a bacterial coding sequence, translate(table=11, cds=True) would report
# the start codon as 'M' and drop the trailing stop — confirm which is wanted.
protein_seq = dna_seq.translate(table="Standard")

# Leave the Seq as the cell's last expression so the notebook displays it.
protein_seq

Seq('VSKRIALFPVLLLALLVVAAAALTWMNFSQALPRSQWAQAAWSPDIDVIEQMIF...SR*')

In [6]:
# Show the complete protein sequence as a plain string; the Seq repr displayed
# by the previous cell is abbreviated with '...'.
str(protein_seq)

'VSKRIALFPVLLLALLVVAAAALTWMNFSQALPRSQWAQAAWSPDIDVIEQMIFHYSLLPRLAISLLVGAGLGLVGVLFQQVLRNPLAEPTTLGVATGAQLGITVTTLWAIPGAMASQFAALAGACVVGLIVFGVAWGKRLSPVTLILAGLVVSLYCGAINQLLVIFHHDQLQSMFLWSTGTLTQTDWGGVERLWPQLLGGVMLTLLLLRPLTLMGLDDGVARNLGLALSLARLAALSLAIVISALLVNAVGIIGFIGLFAPLLAKMLGARRLLPRLMLASLIGALILWLSDQIILWLTRVWMEVSTGSVTALIGAPLLLWLLPRLRSISAPDMKVNDRVATERQHVLAFALAGGVLLLMAVVVALSFGRDAHGWTWASGALLEDLMPWRWPRIMAALFAGVMLAVAGCIIQRLTGNPMASPEVLGISSGAAFGVVLMLFLVPGNAFGWLLPAGSLGAAVTLLIIMIAAGRGGFSPHRMLLAGMALSTAFTMLLMMLQASGDPRMAQVLTWISGSTYNATDAQVWRTGIVMVILLAITPLCRRWLTILPLGGDTARAVGMALTPTRIALLLLAACLTATATMTIGPLSFVGLMAPHIARMMGFRRTMPHIVISALVGGLLLVFADWCGRMVLFPFQIPAGLLSTFIGAPYFIYLLRKQSR*'