### P4B Chapter 7: Regular expressions

A [Regular Expression](https://en.wikipedia.org/wiki/Regular_expression), also known as a *regexp* or *regex*, is a string of characters that defines a text search pattern.

In [1]:
import re

In [2]:
# re.search returns a Match object that reports where the motif GAATTC was found
# (its span) and the text that matched.
dna = "ATCGCGAATTCAC"
re.search(r"GAATTC", dna)

<re.Match object; span=(5, 11), match='GAATTC'>

In [3]:
# The plain string method find() answers the same question with just the start index.
dna = "ATCGCGAATTCAC"
dna.find("GAATTC")

5

In [4]:
# str.find signals "not found" with -1.
dna.find("GAAATTC")

-1

In [7]:
# re.search returns None when nothing matches; print() makes that visible,
# because a bare None is not echoed as cell output.
print(re.search(r"GAAATTC", dna))

None


In [8]:
# A failed search yields None, so testing for None decides which message to print.
dna = "ATCGCGAATTCAC"
site_missing = re.search(r"GAATTC", dna) is None
if site_missing:
    print("site not found in", dna)
else:
    print("restriction site found")

restriction site found


In [9]:
# Same check with str.find, which reports a miss as -1.
site_index = dna.find("GAATTC")
if site_index == -1:
    print("site not found in", dna)
else:
    print("restriction site found")

restriction site found


In [10]:
# AvaII motif: GGACC or GGTCC.
# With str.find each variant has to be tested separately.
dna = "ATCGCGAATTCAC"
avaii_variants = ("GGACC", "GGTCC")
if any(dna.find(variant) != -1 for variant in avaii_variants):
    print("restriction site found")

In [11]:
# (A|T) is an alternation, so a single pattern covers both GGACC and GGTCC.
dna = "ATCGCGAATTCAC"
avaii_match = re.search(r"GG(A|T)CC", dna)
if avaii_match is not None:
    print("restriction site found")

In [13]:
# This sequence contains GGACC, one of the two variants the alternation accepts.
dna = "ATCGCGGACCATTCAC"
avaii_match = re.search(r"GG(A|T)CC", dna)
if avaii_match is not None:
    print("restriction site found")

restriction site found


In [14]:
# Bare expression, so the notebook displays the Match object: which variant matched and where.
re.search(r"GG(A|T)CC", dna)

<re.Match object; span=(5, 10), match='GGACC'>

In [15]:
# The spaces in the pattern are literal characters, so they are part of the match.
date = "14 August 1985"
re.search(r" (August|July) ", date)

<re.Match object; span=(2, 10), match=' August '>

In [17]:
# The same pattern also accepts the second alternative, July.
date = "14 July 1985"
re.search(r" (August|July) ", date)

<re.Match object; span=(2, 8), match=' July '>

In [18]:
# February is not one of the listed alternatives, so the search returns None.
date = "14 February 1985"
print(re.search(r" (August|July) ", date))

None


In [19]:
# An alternation can list more than two options; adding February makes the pattern match.
date = "14 February 1985"
print(re.search(r" (August|July|February) ", date))

<re.Match object; span=(2, 12), match=' February '>


In [20]:
# Without the surrounding spaces in the pattern, only the month name itself is matched.
date = "14 July 1985"
re.search(r"(August|July)", date)

<re.Match object; span=(3, 7), match='July'>

In [21]:
# BisI motif: GCNGC, where N is any one of A, C, T or G.
# N is spelled out here as a four-way alternation.
dna = "ATCGCGAATTCAC"
bisi_match = re.search(r"GC(A|G|T|C)GC", dna)
if bisi_match is not None:
    print("restriction site found")

In [22]:
# BisI motif: GCNGC, where N is any one of A, C, T or G.
# This sequence contains GCGGC, so the alternation finds a site.
dna = "ATCGCGGCTTCAC"
bisi_match = re.search(r"GC(A|G|T|C)GC", dna)
if bisi_match is not None:
    print("restriction site found")

restriction site found


In [23]:
# BisI motif: GCNGC, where N is any one of A, C, T or G.
# The character class [AGTC] matches any single listed character and is a
# shorter way to write the alternation (A|G|T|C).
dna = "ATCGCGGCTTCAC"
bisi_match = re.search(r"GC[AGTC]GC", dna)
if bisi_match is not None:
    print("restriction site found")

restriction site found


In [24]:
# [0-9] is a character range: it matches one digit, here the first one in the string.
date = "17 March 1995"
re.search(r"[0-9]", date)

<re.Match object; span=(0, 1), match='1'>

In [25]:
# This cell fails on purpose: a range must run from low to high, so [9-0] is
# rejected with "bad character range".
date = "17 March 1995"
re.search(r"[9-0]", date)

error: bad character range 9-0 at position 1

In [26]:
# [a-z] covers lowercase letters only, so the capital "M" is skipped and "a" is the first match.
date = "17 March 1995"
re.search(r"[a-z]", date)

<re.Match object; span=(4, 5), match='a'>

In [27]:
# "not" character class: a leading ^ inside the brackets negates the class,
# so [^0-9] matches the first character that is not a digit (the space).
date = "17 March 1995"
re.search(r"[^0-9]", date)

<re.Match object; span=(2, 3), match=' '>

#### Quantifiers

In [29]:
# * means 0 or more
# The whole string consists of bases, so the match covers all 13 characters.
dna = "ATCGCGAATTCAC"
re.search(r"[GATC]*", dna)

<re.Match object; span=(0, 13), match='ATCGCGAATTCAC'>

In [31]:
# Because * also accepts zero repetitions, the search succeeds immediately at
# position 0 with an empty match and never reaches the sequence.
dna = "the dna is: ATCGCGAATTCAC"
re.search(r"[GATC]*", dna)

<re.Match object; span=(0, 0), match=''>

In [32]:
# * means 0 or more
# The match stops at the first character outside [GATC] (the space).
dna = "ATCGCGAATTCAC are bases"
re.search(r"[GATC]*", dna)

<re.Match object; span=(0, 13), match='ATCGCGAATTCAC'>

In [33]:
# + means 1 or more
# An empty match is no longer allowed, so the search moves on to the actual sequence.
dna = "the dna is: ATCGCGAATTCAC"
re.search(r"[GATC]+", dna)

<re.Match object; span=(12, 25), match='ATCGCGAATTCAC'>

In [34]:
# glob: *.pdb - anything followed by .pdb
# glob: * - anything
# regular expressions: "." matches any character except a newline, so ".*" is
# the counterpart of glob's "*".
mystring = "Happy Birthday"
# Raw string for consistency with the other patterns in this notebook.
re.search(r".*", mystring)

<re.Match object; span=(0, 14), match='Happy Birthday'>

In [35]:
# + is greedy: it takes the whole run of digits ("18"), not just the first digit.
date = "18 October 2003"
re.search(r"[0-9]+", date)

<re.Match object; span=(0, 2), match='18'>

In [36]:
# search scans forward through the string, so text before the digits does not prevent a match.
date = "The date is 18 October 2003"
re.search(r"[0-9]+", date)

<re.Match object; span=(12, 14), match='18'>

In [37]:
# With * the empty match at position 0 is accepted first, so the digits are never reached.
date = "The date is 18 October 2003"
re.search(r"[0-9]*", date)

<re.Match object; span=(0, 0), match=''>

In [38]:
# When the string starts with digits, * matches them just as + would.
date = "18 October 2003"
re.search(r"[0-9]*", date)

<re.Match object; span=(0, 2), match='18'>

In [40]:
# A quantifier applies only to the item right before it: GG, then any number of A, then TT.
dna = "GGGATTT"
re.search(r"GGA*TT", dna)

<re.Match object; span=(1, 6), match='GGATT'>

In [41]:
# Two A's between GG and TT are matched as well.
dna = "GGGAATTT"
re.search(r"GGA*TT", dna)

<re.Match object; span=(1, 7), match='GGAATT'>

In [42]:
# Zero A's is also acceptable for *.
dna = "GGGTTT"
re.search(r"GGA*TT", dna)

<re.Match object; span=(1, 5), match='GGTT'>

In [44]:
# + needs at least one A, so a sequence with no A between GG and TT does not match.
dna = "GGGTTT"
print(re.search(r"GGA+TT", dna))

None


In [45]:
# + has no upper limit: three A's in a row still match.
dna = "GGGAAATTT"
print(re.search(r"GGA+TT", dna))

<re.Match object; span=(1, 8), match='GGAAATT'>


In [46]:
# ? means 0 or 1: a single A matches.
dna = "GGGATTT"
print(re.search(r"GGA?TT", dna))

<re.Match object; span=(1, 6), match='GGATT'>


In [47]:
# ? allows at most one A, so two A's make the match fail.
dna = "GGGAATTT"
print(re.search(r"GGA?TT", dna))

None


In [48]:
# {1,3} means between 1 and 3 repetitions; two A's are within range.
dna = "GGGAATTT"
print(re.search(r"GGA{1,3}TT", dna))

<re.Match object; span=(1, 7), match='GGAATT'>


In [49]:
# Four A's exceed the upper bound of 3, so there is no match.
dna = "GGGAAAATTT"
print(re.search(r"GGA{1,3}TT", dna))

None


In [50]:
# Three A's sit exactly on the upper bound and still match.
dna = "GGGAAATTT"
print(re.search(r"GGA{1,3}TT", dna))

<re.Match object; span=(1, 8), match='GGAAATT'>


#### Positions

In [51]:
# Unanchored pattern: the first run of digits is matched wherever it occurs.
date = "14 May 2007"
print(re.search(r"[0-9]+", date))

<re.Match object; span=(0, 2), match='14'>


In [52]:
# Still unanchored: the digits are found even though other text comes first.
date = "Date: 14 May 2007"
print(re.search(r"[0-9]+", date))

<re.Match object; span=(6, 8), match='14'>


In [53]:
# ^ anchors to the start of the string
# The string starts with "Date: ", not with digits, so there is no match.
date = "Date: 14 May 2007"
print(re.search(r"^[0-9]+", date))

None


In [54]:
# The string starts with digits, so the start-anchored pattern matches.
date = "14 May 2007"
print(re.search(r"^[0-9]+", date))

<re.Match object; span=(0, 2), match='14'>


In [55]:
# $ anchors to the end of the string
# so the trailing 2007 is matched rather than the leading 14.
date = "14 May 2007"
print(re.search(r"[0-9]+$", date))

<re.Match object; span=(7, 11), match='2007'>


In [56]:
# The string no longer ends in digits, so the end-anchored pattern fails.
date = "14 May 2007 is the date"
print(re.search(r"[0-9]+$", date))

None


In [59]:
# Pattern: ATG at the very start, then 30-1000 bases, then a run of 5-10 A's.
mrna = "ATGGGCGGAGGCGAACGGACCGGACCGAAAGGCAGGAAAAAAA"
# Raw string for consistency with the other patterns in this notebook.
print(re.search(r"^ATG[ATGC]{30,1000}A{5,10}", mrna))

<re.Match object; span=(0, 43), match='ATGGGCGGAGGCGAACGGACCGGACCGAAAGGCAGGAAAAAAA'>


In [None]:
# Start codons, with the usage counts quoted for them:
#   AUG (3542/4284), GUG 14% (612), UUG 3% (103),
#   and one or two others (e.g. an AUU and possibly a CUG)
# written as DNA: ATG, GTG, TTG, ATT, CTG
# Stop codons: TAG, TGA and TAA
# Sample sequence defined here so the cell does not depend on a `dna` left over
# from an earlier cell: start codon + 30 bases + stop codon, with two extra
# bases on each side.
dna = "CC" + "ATG" + "GCA" * 10 + "TAA" + "CC"
# Anchored with ^ and $: the whole string must be start codon, 30-1000 bases, stop codon.
anchored = re.search(r"^(ATG|GTG|TTG|ATT|CTG)[ATCG]{30,1000}(TAG|TGA|TAA)$", dna)
# Unanchored: the same structure may sit anywhere inside the string.
unanchored = re.search(r"(ATG|GTG|TTG|ATT|CTG)[ATCG]{30,1000}(TAG|TGA|TAA)", dna)
# A cell only displays its last expression, so print both results explicitly.
print(anchored)
print(unanchored)
          

#### Extracting the match

In [61]:
# Keep the Match object in a variable so the matched text can be pulled out with group().
mrna = "ATGGGCGGAGGCGAACGGACCGGACCGAAAGGCAGGAAAAAAA"
# Raw string for consistency with the other patterns in this notebook.
match = re.search(r"^ATG[ATGC]{30,1000}A{5,10}", mrna)
print(match)
# re.search returns None on failure, so check before calling group().
if match:
    print("match:", match.group())

<re.Match object; span=(0, 43), match='ATGGGCGGAGGCGAACGGACCGGACCGAAAGGCAGGAAAAAAA'>
match: ATGGGCGGAGGCGAACGGACCGGACCGAAAGGCAGGAAAAAAA


In [63]:
# Pull the trailing run of digits out of the date; group() returns the matched text.
date = "20 April 2017"
match = re.search(r"[0-9]+$", date)
if match is None:
    print("did not match")
else:
    year = match.group()
    print("year:", year)

year: 2017


In [66]:
# The string does not end in digits here, so the search yields None and the
# fallback message is printed instead of calling group().
date = "20 April 2017 is the date"
match = re.search(r"[0-9]+$", date)
if match is None:
    print("did not match")
else:
    year = match.group()
    print("year:", year)

did not match


In [65]:
# This cell fails on purpose: the pattern does not match, so match is None and
# calling .group() on it raises AttributeError. Always test the result first.
date = "20 April 2017 is the date"
match = re.search(r"[0-9]+$", date)
year = match.group()
print("year:", year)


AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# The three pairs of parentheses capture the two numbers and the word between
# them as groups 1-3. group() with no argument returns the whole match
# ("20 April 2017"), so the year has to be read from group 3.
date = "20 April 2017 is the date"
match = re.search(r"([0-9]+) ([^ ]+) ([0-9]+)", date)
if match is not None:
    year = match.group(3)
    print("year:", year)
else:
    print("did not match")