# Sequence manipulation

In [1]:
from Bio import SeqIO

In [2]:
record = SeqIO.parse(
    handle = './resources/HBB-human.fasta',
    format='fasta',
)

# look at each element and pick up the one for the gene of interest
for element in record:
    if 'HBB' in element.description:
        dna = element
        break

print(dna)

ID: NM_000518.5
Name: NM_000518.5
Description: NM_000518.5 Homo sapiens hemoglobin subunit beta (HBB), mRNA
Number of features: 0
Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG...CAA')


## Sequence manipulation using string objects

### Slice, count and find

The [Seq object](https://biopython.org/docs/1.75/api/Bio.Seq.html) provides a number of string like methods (such as count, find, split and strip), which are alphabet aware where appropriate.

In [3]:
# print method returns the string object
print(dna.seq)

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


In [4]:
dna.seq

Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG...CAA')

In [5]:
str(dna.seq)

'ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA'

In [6]:
# slicing
dna[50:494] # coordinates of the ORF

SeqRecord(seq=Seq('ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAG...TAA'), id='NM_000518.5', name='NM_000518.5', description='NM_000518.5 Homo sapiens hemoglobin subunit beta (HBB), mRNA', dbxrefs=[])

In [7]:
dna.seq[50:494]

Seq('ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAG...TAA')

In [8]:
str(dna.seq)[50:494]

'ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAA'

### Counting and finding motifs

In [9]:
# counting, e.g. ATG codons
dna.seq.count('ATG')

8

In [10]:
str(dna.seq).count('ATG')

8

In [11]:
# find patterns
dna.seq.find('ATG')

50

In [12]:
string = "ATGATGATGATGC"
print(string.find("A"))
print(string.find("T"))
print(string.find("G"))
print(string.find("C"))

0
1
2
12


Python string method find() determines if string str occurs in string, so it's not the most optimal method for recurrent search, i.e. if a pattern appears more than once.

### More sequence manipulations using string methods

In [13]:
# length of a sequence
len(dna)

628

In [14]:
len(dna.seq)

628

In [15]:
# lowercase
dna.seq.lower()

Seq('acatttgcttctgacacaactgtgttcactagcaacctcaaacagacaccatgg...caa')

In [16]:
dna.seq.upper()

Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG...CAA')

In [17]:
# replace one or group of characters
dna.seq.replace('T', 'U') # equivalent to transcription

Seq('ACAUUUGCUUCUGACACAACUGUGUUCACUAGCAACCUCAAACAGACACCAUGG...CAA')

In [18]:
# split and REMOVE
split = dna.seq.split('ATG') # list
print(len(split))

9


In [19]:
split

[Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACC'),
 Seq('GTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTG...TGG'),
 Seq('AAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGA...CTG'),
 Seq('CTGTT'),
 Seq('GGCAACCCTAAGGTGAAGGCTC'),
 Seq('GCAAGAAAGTGCTCGGTGCCTTTAGTG'),
 Seq('GCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGC...CTA'),
 Seq('CCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAA...ATT'),
 Seq('AAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA')]

In [20]:
# join all item from the list
join = ''.join(split) # doesn't work on Seq objects

TypeError: sequence item 0: expected str instance, Seq found

In [21]:
split = str(dna.seq).split('ATG') # list
join = '\t'.join(split)
print(join)

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACC	GTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGG	AAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTG	CTGTT	GGCAACCCTAAGGTGAAGGCTC	GCAAGAAAGTGCTCGGTGCCTTTAGTG	GCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTA	CCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATT	AAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


In [24]:
# stripping space and tabulations
add_plus = '+'*3+str(dna.seq)
print(add_plus)
str1 = add_plus.strip('+')
print(str1)

+++ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTC

In [25]:
add_polyA = str1 + 'A'*10
print(add_polyA)
str2 = add_polyA.strip('A')
print(str2)

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAAAAAAAAAAAA
CATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTC

## Exercices

`BamHI = "GGATCC"`

Produce the result in the following format
"sequenceName\tcount"
Like this:
`NM_001168847.1 1`

>Can you print out BamHI occurrence times in each sequence record in the HBB.fasta file?

In [26]:
BamHI = "GGATCC"

record = SeqIO.parse(
    handle = './resources/HBB.fasta',
    format='fasta',
)

# look at each element and pick up the one for the gene of interest
for element in record:
    print(element.name, element.seq.count(BamHI), sep='\t')

NM_001168847.1	1
NM_033234.1	1
NM_000518.5	1
NM_001304110.1	1
NM_001303935.1	2
NM_001303868.1	1
NM_001303858.1	1
NM_001246752.1	1
NM_001086273.2	0
NM_001144841.1	1
NM_001097648.1	1
NM_173917.2	1
NM_131020.3	1
NM_001314043.1	1
NM_001082260.3	1
NM_001283367.1	1
NM_001164018.1	1
NM_001123666.1	1
NM_001329918.1	1
NM_001304883.1	1
NM_001279263.1	0
NM_001201019.1	1
NM_001164428.1	1
NM_001304885.1	1


EcoRI = "gaattc"; the enzyme cuts the sequences as "g|aattc".

addCutter = "g|aattc"

You have to remember that python is case-sensitive.

>Can you cut the sequences in the HBB.fasta file and print the number of fragments for each sequence after the cut and the length of each fragment?

In [27]:
EcoRI = "GAATTC"

record = SeqIO.parse(
    handle = './resources/HBB.fasta',
    format='fasta',
)

# look at each element, replace the EcoRI pattern and cut
for element in record:
    new_seq = element.seq.replace(EcoRI, "G|AATTC") # replace method not in place!
    fragments = new_seq.split('|')
    print(f"sequence name: {element.name} gives {len(fragments)} fragments after EcoRI of lengths {[len(x) for x in fragments]}")

sequence name: NM_001168847.1 gives 2 fragments after EcoRI of lengths [364, 80]
sequence name: NM_033234.1 gives 2 fragments after EcoRI of lengths [411, 209]
sequence name: NM_000518.5 gives 2 fragments after EcoRI of lengths [414, 214]
sequence name: NM_001304110.1 gives 1 fragments after EcoRI of lengths [595]
sequence name: NM_001303935.1 gives 1 fragments after EcoRI of lengths [618]
sequence name: NM_001303868.1 gives 1 fragments after EcoRI of lengths [1289]
sequence name: NM_001303858.1 gives 1 fragments after EcoRI of lengths [620]
sequence name: NM_001246752.1 gives 1 fragments after EcoRI of lengths [600]
sequence name: NM_001086273.2 gives 1 fragments after EcoRI of lengths [640]
sequence name: NM_001144841.1 gives 1 fragments after EcoRI of lengths [496]
sequence name: NM_001097648.1 gives 2 fragments after EcoRI of lengths [358, 80]
sequence name: NM_173917.2 gives 2 fragments after EcoRI of lengths [410, 223]
sequence name: NM_131020.3 gives 1 fragments after EcoRI of l

### Working with restriction sites

See more details [there](http://biopython.org/DIST/docs/cookbook/Restriction.html).

In [28]:
from Bio import Restriction

Restriction.EcoRI.site

'GAATTC'

In [29]:
record = SeqIO.parse(
    handle = './resources/HBB.fasta',
    format='fasta',
)

for element in record:
    if 'Homo' in element.description:
        dna = element
        break

In [30]:
Restriction.EcoRI.search(dna.seq)

[415]

In [31]:
dna.seq.find(EcoRI)

413

In [32]:
replacement = dna.seq.replace(EcoRI, r'G|AATTC')
print(replacement)

ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAG|AATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


In [33]:
Restriction.EcoRI.catalyse(dna.seq)

(Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG...AAG'),
 Seq('AATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTA...CAA'))

In [34]:
[len(seq) for seq in Restriction.EcoRI.catalyse(dna.seq)]

[414, 214]