# Basic sequence analysis

## Analyzing DNA, RNA and protein sequences

In [1]:
from Bio import SeqIO
from Bio.SeqUtils import *

### Transcription, reverse-complement and translation

In [2]:
# Load the human HBB entry and derive its DNA, RNA and protein strings.
HBB_HUMAN_FASTA = './resources/HBB-human.fasta'

# 0-based, end-exclusive slice of the sequence that gets translated.
# It starts at the ATG found at index 50 and spans 444 nt (148 codons);
# NOTE(review): presumably the full coding sequence including its stop codon - confirm.
CDS_START, CDS_END = 50, 494

# Open the file ourselves so it is closed even though reading stops at the
# first match (an early exit from SeqIO.parse on a filename leaves it open).
with open(HBB_HUMAN_FASTA) as fasta_handle:
    record = SeqIO.parse(handle=fasta_handle, format='fasta')
    # keep the first entry whose description mentions 'Homo'
    element = next(
        (entry for entry in record if 'Homo' in entry.description),
        None,
    )

# Fail loudly instead of carrying on with an unbound or unrelated record.
if element is None:
    raise ValueError(f"no 'Homo' record found in {HBB_HUMAN_FASTA}")

print(str(element.seq))

# sequence manipulation: every string is derived from the same selected record
dna = str(element.seq)
rna = str(element.seq.transcribe())
pro = str(element.seq[CDS_START:CDS_END].translate())

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


### GC content

In [3]:
# Percentage of G and C bases in the DNA string.
# NOTE(review): GC() is reported as deprecated in recent Biopython releases in
# favour of gc_fraction() - confirm against the installed version.
gc_percent = GC(dna)
print(gc_percent)

51.27388535031847


### Search for motifs

In [4]:
# Locate every occurrence of the ATG motif in the DNA string.
# The result is the search pattern followed by the 0-based start indices.
atg_hits = nt_search(dna, 'ATG')
print(atg_hits)

['ATG', 50, 114, 207, 215, 240, 270, 468, 570]


### Molecular weight

In [5]:
# Molecular weight of the DNA string, then of its RNA transcript.
for sequence, molecule in ((dna, 'DNA'), (rna, 'RNA')):
    print(molecular_weight(sequence, molecule))

194074.49929999988
201779.6802999993


In [6]:
# molecular_weight(pro, 'protein') raises an error while the translation still
# carries its STOP codon symbol, so translate again with an empty stop symbol.
cds = element.seq[50:494]
pro = str(cds.translate(stop_symbol=''))
protein_weight = molecular_weight(pro, 'protein')
print(protein_weight)

15998.206399999986


### Six-frame translation

In [7]:
# Text report of the DNA string translated in all six reading frames.
six_frame_report = six_frame_translations(dna)
print(six_frame_report)

GC_Frame: a:139 t:167 g:165 c:157 
Sequence: acatttgctt ... ttcattgcaa, 628 nt, 51.27 %GC


1/1
  I  C  F  *  H  N  C  V  H  *  Q  P  Q  T  D  T  M  V  H  L
 H  L  L  L  T  Q  L  C  S  L  A  T  S  N  R  H  H  G  A  S
T  F  A  S  D  T  T  V  F  T  S  N  L  K  Q  T  P  W  C  I
acatttgcttctgacacaactgtgttcactagcaacctcaaacagacaccatggtgcatc   45 %
tgtaaacgaagactgtgttgacacaagtgatcgttggagtttgtctgtggtaccacgtag
C  K  S  R  V  C  S  H  E  S  A  V  E  F  L  C  W  P  A  D 
 M  Q  K  Q  C  L  Q  T  *  *  C  G  *  V  S  V  M  T  C  R
  N  A  E  S  V  V  T  N  V  L  L  R  L  C  V  G  H  H  M  Q

61/21
  T  P  E  E  K  S  A  V  T  A  L  W  G  K  V  N  V  D  E  V
 D  S  *  G  E  V  C  R  Y  C  P  V  G  Q  G  E  R  G  *  S
*  L  L  R  R  S  L  P  L  L  P  C  G  A  R  *  T  W  M  K
tgactcctgaggagaagtctgccgttactgccctgtggggcaaggtgaacgtggatgaag   56 %
actgaggactcctcttcagacggcaatgacgggacaccccgttccacttgcacctacttc
S  E  Q  P  S  T  Q  R  *  Q  G  T  P  C  P  S  R  P  H  L 
 V  G  S  S  F  D  A  T  V  A  R  H  P

### Amino-acid representations

In [8]:
# Protein sequence written with three-letter amino-acid codes.
three_letter = seq3(pro)
print(three_letter)

MetValHisLeuThrProGluGluLysSerAlaValThrAlaLeuTrpGlyLysValAsnValAspGluValGlyGlyGluAlaLeuGlyArgLeuLeuValValTyrProTrpThrGlnArgPhePheGluSerPheGlyAspLeuSerThrProAspAlaValMetGlyAsnProLysValLysAlaHisGlyLysLysValLeuGlyAlaPheSerAspGlyLeuAlaHisLeuAspAsnLeuLysGlyThrPheAlaThrLeuSerGluLeuHisCysAspLysLeuHisValAspProGluAsnPheArgLeuLeuGlyAsnValLeuValCysValLeuAlaHisHisPheGlyLysGluPheThrProProValGlnAlaAlaTyrGlnLysValValAlaGlyValAlaAsnAlaLeuAlaHisLysTyrHis


In [9]:
# Round trip: three-letter codes converted back to the one-letter abbreviation.
three_letter_codes = seq3(pro)
print(seq1(three_letter_codes))

MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH


## Working with restriction sites

See the [Biopython Restriction cookbook](http://biopython.org/DIST/docs/cookbook/Restriction.html) for more details.

In [12]:
from Bio import Restriction

# Recognition sequence of the EcoRI restriction enzyme.
ecori = Restriction.EcoRI
ecori.site

'GAATTC'

In [13]:
# Pick the first 'Homo' entry of the multi-species file and look for EcoRI sites.
HBB_FASTA = './resources/HBB.fasta'

# Open the file ourselves so it is closed even though reading stops at the
# first match (an early exit from SeqIO.parse on a filename leaves it open).
with open(HBB_FASTA) as fasta_handle:
    record = SeqIO.parse(handle=fasta_handle, format='fasta')
    element = next(
        (entry for entry in record if 'Homo' in entry.description),
        None,
    )

# Without this guard a missing match would leave `dna` holding whatever an
# earlier cell stored in it, and the failure would surface later as a
# confusing error on `dna.seq`.
if element is None:
    raise ValueError(f"no 'Homo' record found in {HBB_FASTA}")

dna = element

# Restriction positions use a different convention from nt_search: this call
# reports 415 for the site whose motif nt_search finds at 0-based index 413.
Restriction.EcoRI.search(dna.seq)

[415]

In [14]:
# Cut the selected sequence with EcoRI and display the resulting fragments.
ecori_fragments = Restriction.EcoRI.catalyse(dna.seq)
ecori_fragments

(Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG...AAG'),
 Seq('AATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTA...CAA'))

In [15]:
# Length of each fragment produced by the EcoRI digest.
list(map(len, Restriction.EcoRI.catalyse(dna.seq)))

[414, 214]

## Exercise

Use `nt_search` to get the locations of a sequence within another sequence.

>Can you extend your code for assignment #10 so that it reports the EcoRI site locations in each sequence?

>Can you calculate the GC content for all the sequences in HBB.fasta?

In [10]:
# For every sequence in HBB.fasta, report its GC percentage and the 0-based
# start index of each EcoRI recognition site ('+'-separated, empty if none).

# NOTE(review): keep the motif in upper case - the correction cell upper-cases
# it explicitly, so nt_search presumably does not normalise case itself; confirm.
EcoRI = "GAATTC"

record = SeqIO.parse('./resources/HBB.fasta', 'fasta')

for entry in record:
    gc_percent = GC(entry.seq)
    # nt_search returns [pattern, index, index, ...]; drop the leading pattern
    site_indices = nt_search(str(entry.seq), EcoRI)[1:]
    located = '+'.join(map(str, site_indices))
    print(f"%GC = {gc_percent:.2f} for sequence {entry.name}; EcoRI located at {located}")


%GC = 54.95 for sequence NM_001168847.1; EcoRI located at 363
%GC = 50.32 for sequence NM_033234.1; EcoRI located at 410
%GC = 51.27 for sequence NM_000518.5; EcoRI located at 413
%GC = 47.23 for sequence NM_001304110.1; EcoRI located at 
%GC = 50.81 for sequence NM_001303935.1; EcoRI located at 
%GC = 41.27 for sequence NM_001303868.1; EcoRI located at 
%GC = 49.84 for sequence NM_001303858.1; EcoRI located at 
%GC = 52.33 for sequence NM_001246752.1; EcoRI located at 
%GC = 46.72 for sequence NM_001086273.2; EcoRI located at 
%GC = 56.25 for sequence NM_001144841.1; EcoRI located at 
%GC = 55.94 for sequence NM_001097648.1; EcoRI located at 357
%GC = 48.66 for sequence NM_173917.2; EcoRI located at 409
%GC = 48.74 for sequence NM_131020.3; EcoRI located at 
%GC = 50.49 for sequence NM_001314043.1; EcoRI located at 628
%GC = 50.49 for sequence NM_001082260.3; EcoRI located at 628
%GC = 52.65 for sequence NM_001283367.1; EcoRI located at 413
%GC = 55.41 for sequence NM_001164018.1; Eco

### Correction

In [11]:
# For each sequence: number of EcoRI fragments, site search result and fragment lengths.
EcoRI = "gaattc".upper()
# Same motif with '|' marking where the cut falls (between G and AATTC).
addCutter = "g|aattc".upper()

for entry in SeqIO.parse("./resources/HBB.fasta", 'fasta'):
    sequence = str(entry.seq)
    # [pattern, 0-based start index of each site, ...]
    site_hits = nt_search(sequence, EcoRI)
    # mark every cut position, then split there to obtain the fragments
    pieces = sequence.replace(EcoRI, addCutter).split("|")
    fragment_lengths = [len(piece) for piece in pieces]
    print(entry.name, len(pieces), site_hits, fragment_lengths, sep='\t\t')

NM_001168847.1		2		['GAATTC', 363]		[364, 80]
NM_033234.1		2		['GAATTC', 410]		[411, 209]
NM_000518.5		2		['GAATTC', 413]		[414, 214]
NM_001304110.1		1		['GAATTC']		[595]
NM_001303935.1		1		['GAATTC']		[618]
NM_001303868.1		1		['GAATTC']		[1289]
NM_001303858.1		1		['GAATTC']		[620]
NM_001246752.1		1		['GAATTC']		[600]
NM_001086273.2		1		['GAATTC']		[640]
NM_001144841.1		1		['GAATTC']		[496]
NM_001097648.1		2		['GAATTC', 357]		[358, 80]
NM_173917.2		2		['GAATTC', 409]		[410, 223]
NM_131020.3		1		['GAATTC']		[716]
NM_001314043.1		2		['GAATTC', 628]		[629, 193]
NM_001082260.3		2		['GAATTC', 628]		[629, 193]
NM_001283367.1		2		['GAATTC', 413]		[414, 190]
NM_001164018.1		1		['GAATTC']		[444]
NM_001123666.1		1		['GAATTC']		[747]
NM_001329918.1		2		['GAATTC', 403]		[404, 212]
NM_001304883.1		1		['GAATTC']		[645]
NM_001279263.1		2		['GAATTC', 434]		[435, 191]
NM_001201019.1		1		['GAATTC']		[607]
NM_001164428.1		2		['GAATTC', 372]		[373, 98]
NM_001304885.1		1		['GAATTC']		[644]
