In [1]:
import re

In [2]:
# Show the built-in documentation for re.search (signature and summary)
help(re.search)

Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a Match object, or None if no match was found.



In [3]:
# In a normal string literal, \t is a tab and \n is a newline
print("\t\n")

	



In [4]:
# raw string: the r prefix disables escape processing, so the
# backslash sequences are printed literally instead of as tab/newline
print(r"\t\n")

\t\n


In [5]:
# Scan the sequence for the GAATTC site; re.search returns a Match or None.
dna = "ATCGCGAATTCAC"
site_hit = re.search(r"GAATTC", dna)
if site_hit is not None:
    print("restriction site found!")

restriction site found!


In [6]:
# A string without the site: re.search returns None, so nothing is printed.
dna = "MICKEYMOUSE"
if re.search(r"GAATTC", dna) is not None:
    print("restriction site found!")

In [7]:
dna = "ATCGCGAATTCAC"
if dna.find("GAATTC") != -1:
    print("restriction site found!")

restriction site found!


In [13]:
# Two alternative sites, each tried with its own search.
dna = "ATCGCGGACCCAC"
if any(re.search(site, dna) for site in (r"GGACC", r"GGTCC")):
    print("restriction site found!")

restriction site found!


In [16]:
# regex groups with ( ) and alternatives with |
dna = "ATCGCGGACCCAC"
either_site = re.compile(r"GG(A|T)CC")
if either_site.search(dna):
    print("restriction site found!")

restriction site found!


In [17]:
# A backslash before ( makes it match a literal parenthesis
# instead of opening a group.
dna = "This (is interesting"
literal_paren = re.search(r"This \(is", dna)
if literal_paren is not None:
    print("string found!")

string found!


In [18]:
# In a normal string literal a backslash must be escaped as \\ to print a single \
print("this is a backslash \\")

this is a backslash \


In [19]:
# Alternation over all four bases in the middle position.
dna = "ATCGCGGACCCAC"
any_base_site = re.compile(r"GG(A|T|C|G)CC")
if any_base_site.search(dna) is not None:
    print("restriction site found!")

restriction site found!


In [21]:
# Here two bases sit between GG and CC; the group matches exactly one
# character, so the search fails.
dna = "ATCGCGGAACCCAC"
print("restriction site found!" if re.search(r"GG(A|T|C|G)CC", dna) else "not found")

not found


In [23]:
# regex character classes with [ ]
dna = "ATCGCGGACCCAC"
found = re.search(r"GG[ACTG]CC", dna) is not None
print("restriction site found!" if found else "not found")

restriction site found!


In [24]:
# negated character class: [^CG] matches any single character except C or G
dna = "ATCGCGGACCCAC"
site_re = re.compile(r"GG[^CG]CC")
message = "restriction site found!" if site_re.search(dna) else "not found"
print(message)

restriction site found!


In [25]:
# a negated class also accepts characters that are not bases at all, such as Y
dna = "ATCGCGGYCCCAC"
if re.search(r"GG[^CG]CC", dna) is None:
    print("not found")
else:
    print("restriction site found!")

restriction site found!


In [26]:
# regex match any character with .
dna = "ATCGCGGYCCCAC"
outcome = {True: "restriction site found!", False: "not found"}
print(outcome[re.search(r"GG.CC", dna) is not None])

restriction site found!


### Learning more about regular expressions

Test your regular expressions at [Debuggex](https://www.debuggex.com/)

Check your skills with [Regex Golf](https://alf.nu/RegexGolf)

The Official Regex Docs are [here](https://docs.python.org/3/library/re.html#regular-expression-syntax)

[W3Schools on Regex](https://www.w3schools.com/python/python_regex.asp)

In [33]:
# regex match 1 or 0 with ?  (the A is optional, so both sequences match)
for dna in ("GGACCCAC", "GGCCCAC"):
    if re.search(r"GGA?CC", dna):
        print("restriction site found!")
    else:
        print("not found")

restriction site found!
restriction site found!


In [34]:
# regex match 1 or 0 with ?  applied to a whole group:
# (AG|T) may appear once or not at all
optional_group = re.compile(r"GG(AG|T)?CC")
for dna in ("GGAGCCCAC", "GGCCCAC"):
    print("restriction site found!" if optional_group.search(dna) else "not found")

restriction site found!
restriction site found!


In [37]:
# regex match at least 1 time with +
for dna in ("GGACCCAC", "GGCCCAC", "GGAACCCAC"):
    hit = re.search(r"GGA+CC", dna)
    print("not found" if hit is None else "restriction site found!")

restriction site found!
not found
restriction site found!


In [42]:
# regex match at least 1 time with +
# (AG)+ repeats the whole group; in AG+ the + applies to the G only.
checks = (
    (r"GG(AG)+CC", "GGAGCCCAC"),
    (r"GG(AG)+CC", "GGCCCAC"),
    (r"GG(AG)+CC", "GGAGAGCCCAC"),
    (r"GGAG+CC", "GGAGAGCCCAC"),
)
for site_re, dna in checks:
    if re.search(site_re, dna):
        print("restriction site found!")
    else:
        print("not found")

restriction site found!
not found
restriction site found!
not found


In [44]:
# regex zero or more of a thing - *
pattern = r"GGGA*TTT"
for dna in ("GGGATTT", "GGGTTT", "GGGAAATTT"):
    if re.search(pattern, dna) is not None:
        print("it matches")


it matches
it matches
it matches


In [46]:
# regex exactly N times - {N}
# A{3} is shorthand for writing AAA out in full, so both patterns match.
dna = "GGGAAATTT"
pattern = r"GGGA{3}TTT"
for candidate in (pattern, r"GGGAAATTT"):
    if re.search(candidate, dna):
        print("it matches")

it matches
it matches


In [51]:
# regex between N and M times - {N,M}
pattern = r"GGGA{3,5}TTT"
for dna in ("GGGAAATTT", "GGGAAAATTT", "GGGAAAAATTT"):
    if re.search(pattern, dna):
        print("it matches")

# six A's exceeds the maximum of five allowed by {3,5}
dna = "GGGAAAAAATTT"
print("it matches" if re.search(pattern, dna) else "it doesn't match")

it matches
it matches
it matches
it doesn't match


In [58]:
# regex - string starts with: ^
pattern = r"^GG[ACTG]*"
dna = "GGGAAATTT"
if re.search(pattern, dna) is not None:
    print("it matches")

# a leading A means GG is no longer at the start of the string
dna = "AGGGAAATTT"
anchored = re.search(pattern, dna)
print("it doesn't match" if anchored is None else "it matches")

it matches
it doesn't match


In [63]:
# regex - re.match is like putting a ^ in your pattern:
# it only tries the pattern at the start of the string, while re.search scans all of it
dna = "AGGGAAATTT"
pattern = r"GG[ACTG]*"

for finder in (re.match, re.search):
    print("it matches" if finder(pattern, dna) else "it doesn't match")

it doesn't match
it matches


In [70]:
# regex - string ends with: $
# Each pair is (pattern, sequence); the last pattern anchors both ends.
cases = (
    (r"GG[ACTG]*TT$", "GGGAAATTT"),
    (r"GG[ACTG]*TT$", "GGGAAATTAT"),
    (r"GG[ACTG]*TT", "GGGAAATTAT"),
    (r"^GG[ACTG]*TT$", "GGGAAATT"),
)
for pattern, dna in cases:
    print("it matches" if re.search(pattern, dna) else "it doesn't match")

it matches
it doesn't match
it matches
it matches


In [88]:
# regex character class ranges with [ ] and -
# Each pair is (pattern, text). A leading ^ inside [ ] negates the class,
# and a - placed first inside [ ] is a literal hyphen.
cases = (
    (r"^[A-Z]*$", "HELLO"),
    (r"^[A-Z]*$", "hello"),
    (r"^[A-Z ]*$", "HELLO WORLD"),
    (r"^[^A-Z ]*$", "HELLO WORLD"),
    (r"^[^A-Z]*$", "hello world"),
    (r"^[^A-Z]*$", "Hello world"),
    (r"^[A-Za-z ]*$", "Hello world"),
    (r"^[A-Za-z ]*$", "Hello world!"),
    (r"^[-A-Za-z ]*$", "Hello-world"),
)
for pattern, mystring in cases:
    if re.search(pattern, mystring) is None:
        print("it doesn't match")
    else:
        print("it matches")
    

it matches
it doesn't match
it matches
it doesn't match
it matches
it doesn't match
it matches
it doesn't match
it matches


### The match object and groups

In [93]:
dna = "ATGACGTACGTACGACTG"
# A successful search yields a Match object, which is truthy ...
m = re.search(r"GA[ATCG]{3}AC", dna)
print(m, bool(m), sep="\n")
# ... while a failed search yields None, which is falsy.
m2 = re.search(r"Gold", dna)
print(m2, type(m2), bool(m2), sep="\n")

<re.Match object; span=(2, 9), match='GACGTAC'>
True
None
<class 'NoneType'>
False


In [94]:
# Match.group() with no argument returns the text that matched.
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA[ATCG]{3}AC", dna)
whole_match = m.group()
print(whole_match)

GACGTAC


In [96]:
# A longer pattern searched against the same `dna` sequence as the previous cell.
pattern = r"GA[ATGC]{3}AC[ATGC]{2}AC"
m = re.search(pattern, dna)
print(m, m.group(), sep="\n")

<re.Match object; span=(2, 13), match='GACGTACGTAC'>
GACGTACGTAC


In [101]:
# Capture groups: each ( ) in the pattern is numbered from 1,
# and group(0) is the whole match.
dna = "ATGACGTACGTACGACTG"  # repeated here so this cell runs on its own
pattern = r"GA([ATGC]{3})AC([ATGC]{2})AC"
m = re.search(pattern, dna)
print(m.group(0))
print(m.group(1))
print(m.group(2))
# The pattern has only two groups, so asking for a third raises IndexError.
# Catch it and display the message so the notebook still runs top to bottom.
try:
    print(m.group(3))
except IndexError as err:
    print("IndexError:", err)


GACGTACGTAC
CGT
GT


IndexError: no such group

In [104]:
def parse_id(id):
    """Split an identifier of the form ``<id>.<primer>`` into its two parts.

    The pattern looks, anywhere in the string (it is not anchored), for:
    one or more letters/digits, a literal dot, then an uppercase letter,
    a lowercase letter and one or two digits.

    Args:
        id: The string to parse. The name shadows the built-in ``id()``;
            it is kept so existing keyword callers keep working.

    Returns:
        A ``(id_part, primer_part)`` tuple of strings, or ``None`` when the
        string does not contain the expected pattern.
    """
    id_pattern = r"([A-Za-z0-9]+)[.]([A-Z][a-z][0-9]{1,2})"
    match = re.search(id_pattern, id)
    if match is None:
        return None
    return (match.group(1), match.group(2))


my_id = "a752.Ab7"
# Pass my_id here: the bare name `id` at module level is the built-in
# function, which makes re.search raise TypeError on a fresh kernel.
info = parse_id(my_id)
print(info[0])
print(info[1])

a752
Ab7


In [109]:
# Match positions: span() gives (start, end) of the whole match, usable as a slice;
# start(1) gives where the first capture group begins.
dna = "ATGACGTACGTACGACTG"
pattern = r"GA([ATGC]{3})AC([ATGC]{2})AC"
m = re.search(pattern, dna)
start, end = m.span()
print(start)
print(end)
print(dna[start:end])
print(m.group())
print(m.start(1))

2
13
GACGTACGTAC
GACGTACGTAC
4
