In [11]:
import re

In [12]:
# Sample utterances to try the patterns on, grouped by their opening phrase.
texts = [
    # "have you ..." questions
    "have you ever read a book before",
    "have you been to africa",
    "have you ever seen two thousand one a space odyssey",
    "have you ever had ice cream",
    # everything else
    "do you know any musical instruments",
    "how do you say cat in spanish",
    "can you recommend a book",
    "how do you cook french toast",
    "how do you play dungeons and dragons",
    "what should i eat for dinner",
]


In [13]:
# match()
# Two equivalent ways to run the same pattern.
# Patterns are written as raw strings (r"...") so that "\w" reaches the regex
# engine untouched instead of being parsed as a Python string escape.

# 1) Compile once, then call .match() on the compiled pattern object.
pat = re.compile(r"have you (\w+) (\w+)")
m = pat.match("have you ever read a book before")
print(m.group(0))

# 2) Module-level re.match(): no explicit compile step needed.
m = re.match(r"have you (\w+) (\w+)", "have you ever read a book before")
print(m.group(0))

have you ever read
have you ever read


In [14]:
# Compiled-pattern form of match(); group(0) is the whole matched text.
# Raw string keeps "\w" from being treated as a Python string escape.
pat = re.compile(r"have you (\w+) (\w+)")
m = pat.match("have you ever read a book before")
print(m.group(0))

have you ever read


In [15]:
# re.match() versus re.search()
# match() only succeeds when the pattern matches at the very start of the
# string. The leading "hi " prevents that, so match() returns None and
# calling .group() on None raises AttributeError.
pat = re.compile(r"have you (\w+) (\w+)")
m = pat.match("hi have you ever read a book before")
try:
    print(m.group(0))
except AttributeError as err:
    # The error is the point of this cell: show it without halting the notebook.
    print(f"AttributeError: {err}")

AttributeError: ignored

In [16]:
# search()
# Unlike match(), search() scans forward through the string, so the pattern
# is found even though the text starts with "hi ".
pat = re.compile(r"have you (\w+) (\w+)")
m = pat.search("hi have you ever read a book before")
print(m.group(0))

have you ever read


In [18]:
# match groups
pat = re.compile(r"have you (\w+) (\w+)")
m = pat.search("hi have you ever read a book before")
print(m.group(0))   # group 0 is the full matched text
print(m.group(1))   # group 1 is the leftmost group in the pattern
print(m.group(2))   # group 2 is the second group
try:
    # The pattern defines only two groups, so asking for a third is an error.
    print(m.group(3))
except IndexError as err:
    # Show the error without halting the notebook.
    print(f"IndexError: {err}")

have you ever read
ever
read


IndexError: ignored

In [19]:
# nested groups
# Groups are numbered by the position of their opening parenthesis, so the
# outer group is 1 and the two groups nested inside it are 2 and 3.
pat = re.compile(r"have you ((\w+) (\w+))")
m = pat.match("have you ever read a book before")
print(m.group(0))   # group 0 is the full matched text
print(m.group(1))   # outer group: both words
print(m.group(2))   # first inner group
print(m.group(3))   # second inner group

have you ever read
ever read
ever
read


In [21]:
# non-capturing groups
# Sometimes we need parentheses for grouping but never want to retrieve the
# text they matched. (?:...) groups without capturing, so it gets no number.
pat = re.compile(r"have you (?:\w+) (\w+)")
m = pat.match("have you ever read a book before")
print(m.group(0))   # group 0 is the full matched text
print(m.group(1))   # "read" -- the only capturing group
try:
    # There is no group 2: the (?:...) group was not counted.
    print(m.group(2))
except IndexError as err:
    # Show the error without halting the notebook.
    print(f"IndexError: {err}")

have you ever read
read


IndexError: ignored

In [22]:
# named capture groups
pat = re.compile("(?:have you) (?P<have_you>\w+)")
m = pat.search("hi have you ever read a book before")
print(m["have_you"])

ever


In [23]:
# named capture groups with a backreference
# "quote" captures the opening quote character (single or double) and
# (?P=quote) requires the very same character to close the phrase.
# A triple-quoted raw string lets the pattern contain both quote characters
# and "\w" without any Python-level escaping.
pat = re.compile(r"""(?P<quote>['"])(?P<quoted_phrase>(\w+ )+\w+)(?P=quote)""")
m = pat.search('have you ever seen "the land before time" before?')
print(m.group(0))
print(m["quoted_phrase"])
print(m.groupdict())   # only the named groups appear here

"the land before time"
the land before time
{'quote': '"', 'quoted_phrase': 'the land before time'}


In [57]:
# extracting multiple matches -- what match() alone gives us
# The text contains two "have you ..." questions, but match() returns a
# single Match object covering only the first one.
pat = re.compile(r"have you (\w+) (\w+)")
m = pat.match("have you ever read a book before have you been to africa")
print(m.group(0))   # group 0 is the full matched text
print(m.group(1))   # group 1 is the leftmost group in the pattern
print(m.group(2))
print(m.span())         # (start, end) offsets of the match in the text
print(m.groupdict())    # empty: this pattern has no named groups


have you ever read
ever
read
(0, 18)
{}


In [58]:
# extracting multiple matches
pat = re.compile(r"have you (\w+) (\w+)")
sentence = "have you ever read a book before have you been to africa"

# findall(): returns a list with one tuple of captured groups per match.
print("find all:", pat.findall(sentence))

# finditer(): yields a full Match object per match, so group(0) and the
# numbered groups are all available. enumerate() consumes the iterator
# directly; there is no need to build a list first.
for j, m in enumerate(pat.finditer(sentence)):
    print("-"*20, f"\nMatch {j}")
    print(f"type={type(m)}")
    print(m.group(0))   # group 0 is the full matched text
    print(m.group(1))   # group 1 is the leftmost group in the pattern
    print(m.group(2))

find all: [('ever', 'read'), ('been', 'to')]
-------------------- 
Match 0
type=<class 're.Match'>
have you ever read
ever
read
-------------------- 
Match 1
type=<class 're.Match'>
have you been to
been
to


In [29]:
# findall()
# With capture groups in the pattern, findall() yields plain tuples of the
# captured strings rather than Match objects, so the words are read by
# index and there is no .group() to call.
# The pattern is compiled here so this cell does not rely on a `pat` left
# over from a different cell.
pat = re.compile(r"have you (\w+) (\w+)")
for j, m in enumerate(pat.findall("have you ever read a book before have you been to africa")):
    word1, word2 = m
    print("-"*20, f"\nMatch {j}")
    print(f"word1={word1}  word2={word2}")

-------------------- 
Match 0
word1=ever  word2=read
-------------------- 
Match 1
word1=been  word2=to


# Lookahead Assertions
Some of the information and examples in this section are adapted from the
Python Regular Expression HOWTO: https://docs.python.org/3/howto/regex.html
See that page for more detail.


 - `(?=...)` Positive lookahead assertion. This succeeds if the contained regular expression, represented here by ..., successfully matches at the current location, and fails otherwise. But, once the contained expression has been tried, the matching engine doesn’t advance at all; the rest of the pattern is tried right where the assertion started.
 - `(?!...)` Negative lookahead assertion. This is the opposite of the positive assertion; it succeeds if the contained expression doesn’t match at the current position in the string.


In [55]:
def run_pat(pat, text):
    """Print the pattern, then every match of it found in ``text``.

    The pattern is echoed on its own line (preceded by a blank line),
    followed by one numbered line per match showing the full matched
    substring. Nothing is returned.
    """
    print(f"\npattern: '{pat}'")
    compiled = re.compile(pat)
    for index, match in enumerate(compiled.finditer(text)):
        print(f"match {index}: '{match.group()}'")

text = "a113222b11122c111211122c"

# Baseline without lookaround. A bare "1" matches every 1 in the text.
# Adding context ("b1") narrows it to the 1 right after the "b", but the
# context character is returned as part of the match.
for plain_pattern in (r"1", "b1"):
    run_pat(plain_pattern, text)
    print("-"*10)

# Same limitation with a character class as context: each match includes
# the leading word character (e.g. 'b11') even if only the '11' is wanted.
run_pat(r"\w11", text)


pattern: '1'
match 0: '1'
match 1: '1'
match 2: '1'
match 3: '1'
match 4: '1'
match 5: '1'
match 6: '1'
match 7: '1'
match 8: '1'
match 9: '1'
match 10: '1'
----------

pattern: 'b1'
match 0: 'b1'
----------

pattern: '\w11'
match 0: 'a11'
match 1: 'b11'
match 2: 'c11'
match 3: '211'


In [36]:
# positive lookahead
# Match a run of one or more word characters that is immediately followed
# by '112'. The '112' itself is checked but not consumed. Because \w+ is
# greedy, the run extends as far as it can while still being followed by '112'.
run_pat(r"\w+(?=112)", text)



pattern: '\w+(?=112)'
match 0: 'a113222b11122c11121'


In [47]:
# Match a single word character that is immediately followed by '112',
# then consume up to four more word characters. The lookahead does not
# advance the position, so those following characters start with the '112'.
print("text:", text)
run_pat(r"\w(?=112)\w{0,4}", text)

text: a113222b11122c111211122c

pattern: '\w(?=112)\w{0,4}'
match 0: '11122'
match 1: '11121'


In [46]:
print("text:", text)
# negative lookahead
# Match a single word character that is NOT immediately followed by '112'.
run_pat(r"\w(?!112)", text)
# Same condition on the first character, then consume up to four more
# word characters ({,4} means zero to four).
run_pat(r"\w(?!112)\w{,4}", text)

text: a113222b11122c111211122c

pattern: '\w(?!112)'
match 0: 'a'
match 1: '1'
match 2: '1'
match 3: '3'
match 4: '2'
match 5: '2'
match 6: '2'
match 7: 'b'
match 8: '1'
match 9: '1'
match 10: '2'
match 11: '2'
match 12: 'c'
match 13: '1'
match 14: '1'
match 15: '2'
match 16: '1'
match 17: '1'
match 18: '2'
match 19: '2'
match 20: 'c'

pattern: '\w(?!112)\w{,4}'
match 0: 'a1132'
match 1: '22b11'
match 2: '122c1'
match 3: '11211'
match 4: '122c'


# Lookbehind Assertions
- `(?<=foo)` Asserts that what immediately precedes the current position in the string **is foo**.
- `(?<!foo)` Asserts that what immediately precedes the current position in the string **is not foo**.  

In [56]:
text = "a113222b11122c111211122c"

# Each pattern is run in turn against the same text.
lookbehind_patterns = (
    "(?<=12)2",    # positive lookbehind: a '2', but only right after '12'
    "(?<=b)111",   # positive lookbehind: '111', but only right after 'b'
    "(?<!b)111",   # negative lookbehind: '111', but only when NOT right after 'b'
)
for lookbehind_pattern in lookbehind_patterns:
    run_pat(lookbehind_pattern, text)


pattern: '(?<=12)2'
match 0: '2'
match 1: '2'

pattern: '(?<=b)111'
match 0: '111'

pattern: '(?<!b)111'
match 0: '111'
match 1: '111'
