In [3]:
# Sample accession codes to practise regular expressions on.
accessions = [
    "xkn59438",
    "yhdck2",
    "eihd39d9",
    "chdsye847",
    "hedle3455",
    "xjhd53e",
    "45da",
    "de37dp",
]

In [7]:
# The accession codes again, this time parsed out of one comma-separated string.
accessions_string = (
    'xkn59438, yhdck2, eihd39d9, chdsye847, '
    'hedle3455, xjhd53e, 45da, de37dp'
)
# entries are separated by a comma followed by a single space
accessions = str.split(accessions_string, ', ')
accessions  # last expression in the cell, so the notebook displays the list

['xkn59438',
 'yhdck2',
 'eihd39d9',
 'chdsye847',
 'hedle3455',
 'xjhd53e',
 '45da',
 'de37dp']

In [4]:
import re

In [5]:
# Print every accession that has the digit 5 anywhere in it.
for accession in accessions:
    # re.search scans the whole string and returns None when nothing matches
    if re.search(r"5", accession) is None:
        continue
    print(accession)

xkn59438
hedle3455
xjhd53e
45da


In [6]:
# Keep only the accessions that contain a 5, collecting them in a new list
# rather than printing them one at a time.
new_list = [accession for accession in accessions if re.search(r"5", accession)]
print(new_list)

['xkn59438', 'hedle3455', 'xjhd53e', '45da']


In [8]:
# Alternation: the group (d|e) matches either a d or an e, so every accession
# containing at least one of those two letters is printed.
for accession in accessions:
    if re.search(r"(d|e)", accession) is not None:
        print(accession)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [9]:
# Same question asked with two separate one-letter searches:
# print the accession as soon as either of them finds a match.
for accession in accessions:
    if any(re.search(letter, accession) for letter in (r"d", r"e")):
        print(accession)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [10]:
# Character class: [de] matches one character that is either d or e, so a
# single search covers both letters. An additional `or re.search(r"e", ...)`
# could never change the outcome, because any string containing an e already
# satisfies [de].
for accession in accessions:
    if re.search(r"[de]", accession):
        print(accession)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [12]:
# contains the letters d and e in that order with exactly one character between them
for accession in accessions:
    # "." stands for any single character (not only a letter), except a newline
    if not re.search(r"d.e", accession):
        continue
    print(accession)

hedle3455


In [15]:
# contains the letters d and e in that order
# (first attempt - the pattern below is too permissive)
for accession in accessions:
    # wrong - [de]{2} accepts any two consecutive characters taken from the
    # set {d, e}, so it will also match "ee" or "dd"
    if re.search(r"[de]{2}", accession) is not None:
        print(accession)

hedle3455
de37dp


In [16]:
# Spell out the allowed two-letter combinations explicitly:
# (de|ed) matches "de" or "ed", i.e. a d and an e side by side in either order.
# NOTE(review): if "d and e in that order" is meant literally, "ed" should not
# match and the letters need not be adjacent; r"d.*e" would express that.
# Confirm the intended meaning before changing the pattern.
for accession in accessions:
    if re.search(r"(de|ed)", accession) is None:
        continue
    print(accession)

hedle3455
de37dp


In [17]:
# start with x or y
for accession in accessions:
    # ^ anchors the character class [xy] to the start of the string
    if re.search(r"^[xy]", accession) is not None:
        print(accession)

xkn59438
yhdck2
xjhd53e


In [18]:
# re.match only tries the pattern at the beginning of the string,
# so no ^ anchor is needed to test the first character.
for accession in accessions:
    if not re.match(r"[xy]", accession):
        continue
    print(accession)

xkn59438
yhdck2
xjhd53e


In [23]:
# start with x or y and end with e
for accession in accessions:
    # [xy]  the first character (re.match starts at the beginning of the string)
    # .*    "." is any character and "*" means 0 or more: anything or nothing
    # e$    an e in the final position
    if re.match(r"[xy].*e$", accession) is not None:
        print(accession)

xjhd53e


In [24]:
# With re.search the pattern may match anywhere, so an explicit ^ is needed
# to pin [xy] to the start of the string; $ pins the e to the end.
for accession in accessions:
    if re.search(r"^[xy].*e$", accession) is None:
        continue
    print(accession)

xjhd53e


In [25]:
# contains three or more numbers in a row
for accession in accessions:
    # finding 3 consecutive digits is enough: any longer run contains such a triple
    if re.search(r"[0123456789]{3}", accession) is not None:
        print(accession)

xkn59438
chdsye847
hedle3455


In [27]:
# Three digits in a row again, with the digit set written as the range 0-9
# instead of listing all ten characters.
for accession in accessions:
    if not re.search(r"[0-9]{3}", accession):
        continue
    print(accession)

xkn59438
chdsye847
hedle3455


In [31]:
# insist on *only* 3 numbers: a run of exactly three digits
for accession in accessions:
    # (^|[^0-9])  the run is preceded by the start of the string or a non-digit
    # [0-9]{3}    the three digits themselves
    # ([^0-9]|$)  the run is followed by a non-digit or the end of the string
    if re.search(r"(^|[^0-9])[0-9]{3}([^0-9]|$)", accession) is not None:
        print(accession)

chdsye847


In [32]:
# end with a d followed by either a, r or p
for accession in accessions:
    # (a|r|p) lists the three allowed final letters; $ ties them to the end
    if re.search(r"d(a|r|p)$", accession) is None:
        continue
    print(accession)

45da
de37dp


In [33]:
# Ends with d plus one of a, r or p - the final letter is written as the
# character class [arp] instead of an alternation group.
for accession in accessions:
    if re.search(r"d[arp]$", accession) is not None:
        print(accession)

45da
de37dp


In [5]:
import requests
# save from the web to re_dna.txt
# timeout: give up instead of hanging forever if the server never answers
with requests.get('https://raw.githubusercontent.com/pvanheus/biocourse2020_python/master/re_dna.txt', timeout=30) as response:
    # stop on an HTTP error (404, 500, ...) rather than saving the error page
    # to disk as if it were the DNA sequence
    response.raise_for_status()
    # the with-block closes the file, which guarantees the text is flushed to
    # disk before the next cell reads it back
    with open('re_dna.txt', 'w') as dna_file:
        dna_file.write(response.text)

In [6]:
# open re_dna.txt and read its contents
# (the with-block closes the file handle as soon as the read is finished)
with open('re_dna.txt') as dna_file:
    dna = dna_file.read()
dna = dna.rstrip()  # remove trailing whitespace, i.e. the final '\n'
# the dna variable is ready for use
dna

'ATGGCAATAACCCCCCGTTTCTACTTCTAGAGGAGAAAAGTATTGACATGAGCGCTCCCGGCACAAGGGCCAAAGAAGTCTCCAATTTCTTATTTCCGAATGACATGCGTCTCCTTGCGGGTAAATCACCGACCGCAATTCATAGAAGCCTGGGGGAACAGATAGGTCTAATTAGCTTAAGAGAGTAAATCCTGGGATCATTCAGTAGTAACCATAAACTTACGCTGGGGCTTCTTCGGCGGATTTTTACAGTTACCAACCAGGAGATTTGAAGTAAATCAGTTGAGGATTTAGCCGCGCTATCCGGTAATCTCCAAATTAAAACATACCGTTCCATGAAGGCTAGAATTACTTACCGGCCTTTTCCATGCCTGCGCTATACCCCCCCACTCTCCCGCTTATCCGTCCGAGCGGAGGCAGTGCGATCCTCCGTTAAGATATTCTTACGTGTGACGTAGCTATGTATTTTGCAGAGCTGGCGAACGCGTTGAACACTTCACAGATGGTAGGGATTCGGGTAAAGGGCGTATAATTGGGGACTAACATAGGCGTAGACTACGATGGCGCCAACTCAATCGCAGCTCGAGCGCCCTGAATAACGTACTCATCTCAACTCATTCTCGGCAATCTACCGAGCGACTCGATTATCAACGGCTGTCTAGCAGTTCTAATCTTTTGCCAGCATCGTAATAGCCTCCAAGAGATTGATGATAGCTATCGGCACAGAACTGAGACGGCGCCGATGGATAGCGGACTTTCGGTCAACCACAATTCCCCACGGGACAGGTCCTGCGGTGCGCATCACTCTGAATGTACAAGCAACCCAAGTGGGCCGAGCCTGGACTCAGCTGGTTCCTGCGTGAGCTCGAGACTCGGGATGACAGCTCTTTAAACATAGAGCGGGGGCGTCGAACGGTCGAGAAAGTCATAGTACCTCGGGTACCAACTTACTCAGGTTATTGCTTGAAGCTGTACTATTTTAGGGGGGGAGCGCTGAAG