# Regular Expressions (Regex)

Advanced Pattern Matching

- https://en.wikipedia.org/wiki/Regular_expression
- https://docs.python.org/3/library/re.html

In [2]:
import re

In [6]:
# A pattern with no special characters matches that literal text.
pattern = r'is my number'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Search anywhere in the string for a match
# group(0) returns the text of the entire match.
match, match.group(0)

(<_sre.SRE_Match object; span=(5, 17), match='is my number'>, 'is my number')

In [7]:
pattern = r'^is my number'  # ^ anchors the match to the beginning of the string
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
print(match)  # None: the string begins with 'This', not 'is my number'

None


In [8]:
# $ anchors the match to the end of the string.
# NB. The unescaped . matches any character (except a newline), not just a period;
# write \. to match a literal period only.
pattern = r'ring.$'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(42, 47), match='ring.'>

In [9]:
pattern = r'elephant'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
print(match)  # None is returned if no match is found

None


In [9]:
# * matches zero or more repetitions of the preceding character.
# It applies only to the A here, so 'GC' followed by no A at all still matches.
pattern = r'GCA*'
string = 'ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
match = re.search(pattern, string)  # NB. search returns only the first (leftmost) match
match, match.group(0)

(<re.Match object; span=(2, 4), match='GC'>, 'GC')

In [4]:
# + matches one or more repetitions of the preceding character,
# so at least one A must follow 'GC'.
pattern = r'GCA+'
string = '123456ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
match = re.search(pattern, string)
match

<re.Match object; span=(10, 13), match='GCA'>

In [12]:
# {m,n} matches between m and n repetitions of the preceding character
# ({n} on its own matches exactly n repetitions).
# Here: 'GC' followed by one or two A's.
pattern = r'GCA{1,2}'
string = 'ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
match = re.search(pattern, string)
match

<re.Match object; span=(4, 7), match='GCA'>

In [13]:
# [] matches any single character from the set listed inside the brackets.
# This set holds the letters a-z and A-Z, a literal [ (escaped as \[) and a space.
pattern = r'[a-zA-Z\[ ]+'  # + repeats the set, so a run of such characters is matched
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<re.Match object; span=(0, 18), match='This is my number '>

In [15]:
# [0-9] is the set of digits; + matches a run of one or more of them.
pattern = r'[0-9]+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(18, 21), match='111'>

In [16]:
# A - placed last inside [] is a literal dash rather than a range,
# so this matches a run of digits and dashes.
pattern = r'[0-9-]+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(18, 30), match='111-456-7890'>

In [24]:
pattern = r'[0-9]*'
# NB. * also allows zero repetitions, so this pattern can match the empty string.
# The string does not start with a digit, so re.search succeeds immediately with an
# empty match at position 0 and never reaches the digits further along.
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 0), match=''>

In [26]:
pattern = r'[0-9]*'
# Same pattern as before, but this string starts with a digit.
# The match still begins at position 0, but * is greedy and consumes the '3',
# so the result is '3' rather than the empty string.
string = '3This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 1), match='3'>

In [25]:
pattern = r'[0-9]+'
# NB. + requires at least one digit, so the empty string is NOT a possible match.
# re.search therefore scans forward until it finds the first run of digits.
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(18, 21), match='111'>

In [27]:
pattern = r'[a-zA-Z ]*'
# NB. The same effect with letters: the string starts with a digit, which is not in
# the set, so * matches zero characters and re.search returns an empty match at
# position 0.
string = '111-456-7890 This is my number... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 0), match=''>

In [4]:
# \w is shorthand for a word character: [a-zA-Z0-9_]
# (for str patterns it also matches non-ASCII letters and digits unless re.ASCII is set).
# A space is not a word character, so the match stops at the first space.
pattern = r'\w+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 4), match='This'>

In [5]:
# Shorthand classes can be used inside []: this set is \w (word characters) plus a space.
# The match runs until the first character outside the set, the '-'.
pattern = r'[\w ]+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 21), match='This is my number 111'>

In [6]:
# \W is shorthand for the inverse of \w, i.e. [^a-zA-Z0-9_].
# Inside [], a leading ^ negates the set.
pattern = r'\W+'
string = ' This_is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches the leading space; the _ in 'This_is' is a word character
match

<_sre.SRE_Match object; span=(0, 1), match=' '>

In [7]:
pattern = r'\d+'  # \d is shorthand for a digit, [0-9]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(18, 21), match='111'>

In [8]:
pattern = r'\D+'  # \D is shorthand for the inverse of \d, i.e. any non-digit [^0-9]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 18), match='This is my number '>

In [9]:
pattern = r'\s+'  # \s is shorthand for a whitespace character, [ \t\n\r\f\v]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(4, 5), match=' '>

In [10]:
pattern = r'\S+'  # \S is shorthand for the inverse of \s, i.e. any non-whitespace [^ \t\n\r\f\v]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 4), match='This'>

In [11]:
# Find all tokens in string...
# Unlike search, findall returns a list of every non-overlapping match.
pattern = r'\S+'  # A token is a run of one or more non-whitespace characters
string = 'This is my number 111-456-7890... Gimme a ring.'
re.findall(pattern, string)

['This', 'is', 'my', 'number', '111-456-7890...', 'Gimme', 'a', 'ring.']

In [14]:
pattern = r'(\d{3})-(\d{3})-(\d{4})'  # () creates a group
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
# groups() returns all captured groups as a tuple; group(0) is the whole match
# and group(n) is the n-th parenthesised group, counting from 1.
match.groups(), match.group(0), match.group(1), match.group(2)

(('111', '456', '7890'), '111-456-7890', '111', '456')

In [15]:
# How will we parse out phone numbers in several different formats?
# One candidate per line: dashes, no separator, dots, a doubled dot,
# and a parenthesised area code followed by a space.
PHONE_NUMBERS = """
111-456-7890
2228901234
333.456.3847
333..456.3847
(444) 456-7890
"""

In [16]:
def find_numbers(pattern, numbers):
    """Print the captured groups for each phone number in a multi-line string.

    Args:
        pattern: A compiled regular expression (the result of ``re.compile``).
        numbers: Text holding one candidate phone number per line.

    Each non-blank line is stripped of surrounding whitespace and tested with
    ``pattern.match``. The line is then printed followed by ``=>`` and either
    the tuple of captured groups or ``No match``. Nothing is returned.
    """
    # splitlines() handles any line-ending style. Blank lines are skipped so that
    # empty input or stray empty lines do not print a spurious ' => No match'.
    for line in numbers.splitlines():
        number_line = line.strip()
        if not number_line:
            continue
        # just like search but only matches at beginning of string
        match = pattern.match(number_line)
        if match:
            print(number_line, '=>', match.groups())
        else:
            print(number_line, '=>', 'No match')

In [17]:
# We can compile a pattern for faster repeated use.
# A compiled pattern has its own search and match methods, which take only the string.
pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => No match
333.456.3847 => No match
333..456.3847 => No match
(444) 456-7890 => No match


In [18]:
# ? matches 0 or 1 occurrences of the preceding element,
# so each dash between the digit groups is now optional.
pattern = re.compile(r'(\d{3})-?(\d{3})-?(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => ('222', '890', '1234')
333.456.3847 => No match
333..456.3847 => No match
(444) 456-7890 => No match


In [19]:
# [-.] matches a single dash or dot (both are literal inside []).
# Followed by ?, it accepts zero or one separator, so a doubled '..' still fails.
pattern = re.compile(r'(\d{3})[-.]?(\d{3})[-.]?(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => ('222', '890', '1234')
333.456.3847 => ('333', '456', '3847')
333..456.3847 => No match
(444) 456-7890 => No match


In [20]:
# \D matches a non-digit character. * matches 0 or more repetitions of it.
# The separators are wrapped in () here, so they show up as extra captured groups.
pattern = re.compile(r'(\d{3})(\D*)(\d{3})(\D*)(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '-', '456', '-', '7890')
2228901234 => ('222', '', '890', '', '1234')
333.456.3847 => ('333', '.', '456', '.', '3847')
333..456.3847 => ('333', '..', '456', '.', '3847')
(444) 456-7890 => No match


In [22]:
# \D matches a non-digit character. * matches 0 or more repetitions of it.
# The () around the separators are removed: \D* still consumes them,
# but they are no longer captured as groups.
pattern = re.compile(r'(\d{3})\D*(\d{3})\D*(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => ('222', '890', '1234')
333.456.3847 => ('333', '456', '3847')
333..456.3847 => ('333', '456', '3847')
(444) 456-7890 => No match


In [21]:
# Putting it all together.
# The leading \D* skips any non-digits before the first digit group,
# such as the '(' in '(444) 456-7890'.
pattern = re.compile(r'\D*(\d{3})\D*(\d{3})\D*(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => ('222', '890', '1234')
333.456.3847 => ('333', '456', '3847')
333..456.3847 => ('333', '456', '3847')
(444) 456-7890 => ('444', '456', '7890')


In [24]:
# Flags
# With re.VERBOSE, whitespace inside the pattern is ignored and everything after
# a # is treated as a comment, so the pattern can be laid out over several lines.
# (?P<name>...) creates a named group that can be read back with match.group('name').
pattern = r'''
    \D*                      # Non-numbers before phone number
    (?P<area_code>\d{3})     # Area code
    \D*                      # Non-numbers
    (?P<prefix>\d{3})        # Prefix
    \D*                      # Non-numbers
    (?P<line_number>\d{4})   # Last four numbers of phone number
'''

# Defined here so this cell does not depend on whichever earlier cell last
# assigned `string`, and still works after Restart Kernel -> Run All.
string = 'This is my number 111-456-7890... Gimme a ring.'

# Flags are combined with the bitwise OR operator |.
flags = (
    re.IGNORECASE |  # Case-insensitive matching (no effect on this digits-only pattern)
    re.VERBOSE  # Allow whitespace and comments inside the pattern
)
# re.match only matches at the beginning of the string; the leading \D* absorbs the text before the digits.
match = re.match(pattern, string, flags=flags)

print(match.groups())
print('area_code: ', match.group('area_code'))
print('prefix: ', match.group('prefix'))
print('line_number: ', match.group('line_number'))

('111', '456', '7890')
area_code:  111
prefix:  456
line_number:  7890
