In [7]:
# all the accession names arrive as one string, separated by a comma and a space
accessions_string = 'xkn59438, yhdck2, eihd39d9, chdsye847, hedle3455, xjhd53e, 45da, de37dp'
separator = ', '
# break the string on that separator to get one list entry per accession
accessions = accessions_string.split(separator)
accessions

['xkn59438',
 'yhdck2',
 'eihd39d9',
 'chdsye847',
 'hedle3455',
 'xjhd53e',
 '45da',
 'de37dp']

In [4]:
import re

In [5]:
# contains the number 5
# re.search looks through the whole string, so a 5 in any position counts
for accession in filter(lambda name: re.search(r"5", name), accessions):
    print(accession)

xkn59438
hedle3455
xjhd53e
45da


In [6]:
# filter from the accessions list and put into a new list:
# keep only the accessions that contain a 5
new_list = [accession for accession in accessions if re.search(r"5", accession)]
print(new_list)

['xkn59438', 'hedle3455', 'xjhd53e', '45da']


In [8]:
# contains the letters d or e
# (d|e) is an alternation: either branch, d or e, may match
d_or_e = re.compile(r"(d|e)")
for accession in accessions:
    if d_or_e.search(accession):
        print(accession)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [9]:
# contains d or e, written as two separate searches:
# an accession is printed as soon as either pattern is found in it
for accession in accessions:
    if any(re.search(letter, accession) for letter in (r"d", r"e")):
        print(accession)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [10]:
# contains d or e, written with a character class:
# [de] matches one character that is either d or e, so a separate
# search for "e" adds nothing and is not needed
for accession in accessions:
    if re.search(r"[de]", accession):
        print(accession)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [12]:
# contains the letters d and e in that order with a single letter between them
# note: . accepts any character in the gap (other than a newline), not only letters
d_gap_e = re.compile(r"d.e")
for accession in accessions:
    if d_gap_e.search(accession):
        print(accession)

hedle3455


In [15]:
# contains the letters d and e in that order
# wrong - [de]{2} means two characters in a row that are each d or e,
# so besides de it will also match ee, dd and ed
for accession in accessions:
    if re.search(r"[de]{2}", accession) is None:
        continue
    print(accession)

hedle3455
de37dp


In [16]:
# (de|ed) spells out the two-letter combinations explicitly:
# it matches d and e directly next to each other, in either order
adjacent_d_e = [accession for accession in accessions if re.search(r"(de|ed)", accession)]
for accession in adjacent_d_e:
    print(accession)

hedle3455
de37dp


In [17]:
# start with x or y
# ^ pins the character class to the very start of the string
starts_with_xy = re.compile(r"^[xy]")
for accession in accessions:
    if starts_with_xy.search(accession):
        print(accession)

xkn59438
yhdck2
xjhd53e


In [18]:
# re.match only ever matches at the beginning of the string,
# so the pattern needs no ^ anchor here
for accession in accessions:
    if re.match(r"[xy]", accession) is not None:
        print(accession)

xkn59438
yhdck2
xjhd53e


In [23]:
# start with x or y and end with e
# . stands for any character and * for "0 or more" of it,
# so .* covers whatever (possibly nothing) lies between the first letter and the last;
# $ ties the final e to the end of the string
xy_to_e = re.compile(r"[xy].*e$")
for accession in accessions:
    if xy_to_e.match(accession):
        print(accession)

xjhd53e


In [24]:
# with re.search the start of the string has to be anchored explicitly with ^
xy_to_e_hits = [accession for accession in accessions if re.search(r"^[xy].*e$", accession)]
for accession in xy_to_e_hits:
    print(accession)

xjhd53e


In [25]:
# contains three or more numbers in a row
# finding three digits in a row is enough: any longer run contains such a stretch too
for accession in accessions:
    if re.search(r"[0123456789]{3}", accession) is not None:
        print(accession)

xkn59438
chdsye847
hedle3455


In [27]:
# [0-9] is the range form of the digit character class
for accession in filter(lambda name: re.search(r"[0-9]{3}", name), accessions):
    print(accession)

xkn59438
chdsye847
hedle3455


In [31]:
# insist on *only* 3 numbers
# the run of three digits must be bordered on each side by a non-digit
# or by the start (^) / end ($) of the string
exactly_three_digits = re.compile(r"(^|[^0-9])[0-9]{3}([^0-9]|$)")
for accession in accessions:
    if exactly_three_digits.search(accession):
        print(accession)

chdsye847


In [32]:
# end with a d followed by either a, r or p
# (a|r|p) lists the allowed last letters as alternatives; $ marks the end of the string
ends_with_d_arp = [accession for accession in accessions if re.search(r"d(a|r|p)$", accession)]
for accession in ends_with_d_arp:
    print(accession)

45da
de37dp


In [33]:
# the allowed last letters written as a character class instead of an alternation
for accession in accessions:
    if re.search(r"d[arp]$", accession) is None:
        continue
    print(accession)

45da
de37dp


### Double digest

In [5]:
import requests
# save from the web to re_dna.txt
with requests.get('https://raw.githubusercontent.com/pvanheus/biocourse2020_python/master/re_dna.txt') as response:
    # stop on an HTTP error instead of saving the error page as if it were DNA
    response.raise_for_status()
    # the with-block closes the file, so the text is fully written
    # before any later cell opens re_dna.txt for reading
    with open('re_dna.txt', 'w') as dna_file:
        dna_file.write(response.text)

In [6]:
# open re_dna.txt and read its contents
# (the with-block closes the file as soon as it has been read)
with open('re_dna.txt') as dna_file:
    dna = dna_file.read()
dna = dna.rstrip()  # strip trailing whitespace, i.e. the final '\n'
# the dna variable is ready for use
dna

'ATGGCAATAACCCCCCGTTTCTACTTCTAGAGGAGAAAAGTATTGACATGAGCGCTCCCGGCACAAGGGCCAAAGAAGTCTCCAATTTCTTATTTCCGAATGACATGCGTCTCCTTGCGGGTAAATCACCGACCGCAATTCATAGAAGCCTGGGGGAACAGATAGGTCTAATTAGCTTAAGAGAGTAAATCCTGGGATCATTCAGTAGTAACCATAAACTTACGCTGGGGCTTCTTCGGCGGATTTTTACAGTTACCAACCAGGAGATTTGAAGTAAATCAGTTGAGGATTTAGCCGCGCTATCCGGTAATCTCCAAATTAAAACATACCGTTCCATGAAGGCTAGAATTACTTACCGGCCTTTTCCATGCCTGCGCTATACCCCCCCACTCTCCCGCTTATCCGTCCGAGCGGAGGCAGTGCGATCCTCCGTTAAGATATTCTTACGTGTGACGTAGCTATGTATTTTGCAGAGCTGGCGAACGCGTTGAACACTTCACAGATGGTAGGGATTCGGGTAAAGGGCGTATAATTGGGGACTAACATAGGCGTAGACTACGATGGCGCCAACTCAATCGCAGCTCGAGCGCCCTGAATAACGTACTCATCTCAACTCATTCTCGGCAATCTACCGAGCGACTCGATTATCAACGGCTGTCTAGCAGTTCTAATCTTTTGCCAGCATCGTAATAGCCTCCAAGAGATTGATGATAGCTATCGGCACAGAACTGAGACGGCGCCGATGGATAGCGGACTTTCGGTCAACCACAATTCCCCACGGGACAGGTCCTGCGGTGCGCATCACTCTGAATGTACAAGCAACCCAAGTGGGCCGAGCCTGGACTCAGCTGGTTCCTGCGTGAGCTCGAGACTCGGGATGACAGCTCTTTAAACATAGAGCGGGGGCGTCGAACGGTCGAGAAAGTCATAGTACCTCGGGTACCAACTTACTCAGGTTATTGCTTGAAGCTGTACTATTTTAGGGGGGGAGCGCTGAAG

In [2]:
# Solution 1
# AbcI cuts at ANT*AAT
# AbcII cuts at GCRW*TG
# R means A or G
# W means A or T
# as regular expressions (the * only marks the cut position and is left out):
#   ANT*AAT -> "A[ATCG]TAAT"
#   GCRW*TG -> "GC[AG][AT]TG"

import re

# open and read the file; the with-block closes it again once it has been read
with open('re_dna.txt', 'r') as my_dna_file:
    dna = my_dna_file.read().rstrip('\n')

# find positions at which enzymes cut
# one list stores the cut positions of both enzymes; 0 stands for the start of the sequence
cut_positions = [0]

# AbcI cuts 3 bases into its recognition site
for match in re.finditer(r"A[ATCG]TAAT", dna):
    print ("AbcI cuts at", match.start()+3)
    cut_positions.append(match.start()+3)

# AbcII cuts 4 bases into its recognition site
for match in re.finditer(r"GC[AG][AT]TG", dna):
    print ("AbcII cuts at", match.start()+4)
    cut_positions.append(match.start()+4)
print (cut_positions)

# add the length of the dna sequence to the list so the last fragment has an end point
# sort the list in numerical order
cut_positions.append(len(dna))
sort_cut_positions = sorted(cut_positions)
print (sort_cut_positions)

# calculate the distance between neighbouring cut sites (i.e. the fragment length)
fragment_no = 0

# the loop bound comes from the sorted list, the same list that is indexed below
for fragment in range(1, len(sort_cut_positions)):
    fragment_length = sort_cut_positions[fragment] - sort_cut_positions[fragment-1]
    fragment_no +=1
    print("Fragment", fragment_no, "is", fragment_length, "bp long!")


AbcI cuts at 1143
AbcI cuts at 1628
AbcII cuts at 488
AbcII cuts at 1577
[0, 1143, 1628, 488, 1577]
[0, 488, 1143, 1577, 1628, 2012]
Fragment 1 is 488 bp long!
Fragment 2 is 655 bp long!
Fragment 3 is 434 bp long!
Fragment 4 is 51 bp long!
Fragment 5 is 384 bp long!


In [3]:
# textbook solution
# cut positions for AbcI (3 bases into each match) and AbcII (4 bases into each match)
abc1_cuts = [match.start() + 3 for match in re.finditer(r"A[ATGC]TAAT", dna)]
abc2_cuts = [match.start() + 4 for match in re.finditer(r"GC[AG][AT]TG", dna)]
# bracket the cut sites with the start (0) and the final position (len(dna))
all_cuts = [0] + abc1_cuts + abc2_cuts + [len(dna)]
sorted_cuts = sorted(all_cuts)
print(sorted_cuts)

# every fragment lies between two neighbouring positions of the sorted list
for previous_cut_position, this_cut_position in zip(sorted_cuts, sorted_cuts[1:]):
    fragment_size = this_cut_position - previous_cut_position
    print("one fragment size is " + str(fragment_size))

[0, 488, 1143, 1577, 1628, 2012]
one fragment size is 488
one fragment size is 655
one fragment size is 434
one fragment size is 51
one fragment size is 384


In [4]:
# Peter's solution

def make_dna_regex(enzyme):
    """Turn a recognition site written with ambiguity codes into a regex pattern.

    Each ambiguity code (R, Y, S, W, K, M, B, D, H, V, N) is replaced by a
    character class listing the bases it stands for; every other character
    of enzyme is copied into the pattern unchanged.

    Returns the pattern as a string.
    """
    base_classes = dict(
        R='[AG]',
        Y='[CT]',
        S='[GC]',
        W='[AT]',
        K='[GT]',
        M='[AC]',
        B='[CGT]',
        D='[AGT]',
        H='[ACT]',
        V='[ACG]',
        N='[ACTG]',
    )
    # characters without an entry in the table stand for themselves
    return ''.join(base_classes.get(symbol, symbol) for symbol in enzyme)

# sanity checks: ambiguity codes are expanded, everything else (the * included) is copied as-is
for site, expected_pattern in (('ANT*AAT', 'A[ACTG]T*AAT'), ('GCRW*TG', 'GC[AG][AT]*TG')):
    assert make_dna_regex(site) == expected_pattern

def cut(dna, enzyme):
    """Cut dna at every occurrence of an enzyme's recognition site.

    dna    -- the sequence to cut, as a string
    enzyme -- the recognition site with a '*' marking where the enzyme cuts,
              e.g. 'ANT*AAT'; ambiguity codes are expanded by make_dna_regex()

    Returns the list of fragments in sequence order. If the site does not
    occur, the list holds dna as its only fragment. Sites are located with
    re.finditer, which reports non-overlapping matches only.

    Raises ValueError if enzyme contains no '*'.
    """
    offset = enzyme.find('*')
    if offset == -1:
        # without this check the -1 returned by find() would be used as the cut offset
        raise ValueError("enzyme must mark its cut position with '*': " + repr(enzyme))
    # the * is only a marker, so it is removed before building the pattern
    pattern = make_dna_regex(enzyme.replace('*', ''))
    fragments = []
    start = 0
    for match in re.finditer(pattern, dna):
        # the cut falls offset characters after the start of the matched site
        cut_position = match.start() + offset
        fragments.append(dna[start:cut_position])
        start = cut_position
    # whatever follows the last cut is the final fragment
    fragments.append(dna[start:])
    return fragments

# quick check of cut() on a short example sequence
assert cut('TCCGGATTAATCCAT', 'ANT*AAT') == ['TCCGGATT', 'AATCCAT']

# double digest: cut with AbcI first, then cut each resulting fragment with AbcII
fragments = cut(dna, 'ANT*AAT')
double_digest = [piece for fragment in fragments for piece in cut(fragment, 'GCRW*TG')]
# report the length of every final fragment, in sequence order
for piece in double_digest:
    print(len(piece))

488
655
434
51
384
