# Regular Expressions (Regex)

Advanced Pattern Matching

- https://en.wikipedia.org/wiki/Regular_expression
- https://docs.python.org/3/library/re.html

In [None]:
import re

In [None]:
# re.search scans the string and returns a Match object for the first place
# the pattern matches; this pattern is plain literal text.
pattern = r'is my number'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Search anywhere in the string for a match
match

In [None]:
pattern = r'^is my number'  # ^ matches the beginning of the string
string = 'This is my number 111-456-7890... Gimme a ring.'
# The string begins with 'This', not 'is', so the anchored pattern cannot match.
match = re.search(pattern, string)
print(match)  # Prints None

In [None]:
# $ matches the end of the string.
# The period is escaped (\.) so that it matches a literal '.'; an unescaped
# '.' matches any character and would also accept e.g. 'ringo' at the end.
pattern = r'ring\.$'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

In [None]:
pattern = r'elephant'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
print(match)  # None is returned if no match is found

In [None]:
# * matches zero or more repetitions of the preceding character
pattern = r'GCA*'
string = 'ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
# NB. search returns the first (leftmost) match. Here that is the 'GC' of the
# first 'GCG': A* is satisfied by zero A's.
match = re.search(pattern, string)
match

In [None]:
# + matches one or more repetitions of the preceding character
pattern = r'GCA+'
string = '123456ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
# The first 'GC' is followed by 'G', so the match is the 'GCA' right after it.
match = re.search(pattern, string)
match

In [None]:
# {m,n} matches from m to n repetitions of the preceding character
# ({n} on its own matches exactly n repetitions)
pattern = r'GCA{1,2}'
string = 'ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
match = re.search(pattern, string)  # First match is 'GCA' (one A, then 'T')
match

In [None]:
# [] matches any single character from the set listed inside the brackets.
# This set holds the ASCII letters, a literal '[' (written \[) and the space.
pattern = r'[a-zA-Z\[ ]+'  # + after the set matches several characters at a time
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches 'This is my number ' (stops at the first digit)
match

In [None]:
pattern = r'[0-9]+'  # One or more characters in the range 0-9
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # First run of digits: '111'
match

In [None]:
pattern = r'[0-9]*'
# NB. * also allows zero repetitions, so this pattern can match the empty string.
# re.search returns the leftmost match: the string does not start with a digit,
# so the result is the empty string at position 0, not the digits further on.
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

In [None]:
pattern = r'[a-zA-Z ]*'
# NB. * also allows zero repetitions, so this pattern can match the empty string.
# re.search returns the leftmost match: the string starts with a digit, so the
# result is the empty string at position 0, not the words further on.
string = '111-456-7890 This is my number... Gimme a ring.'
match = re.search(pattern, string)
match

In [None]:
# \w is shorthand for [a-zA-Z0-9_]; for str patterns it also matches other
# Unicode word characters unless the re.ASCII flag is used.
pattern = r'\w+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches 'This'
match

In [None]:
pattern = r'\W+'  # \W is shorthand for the inverse of \w, i.e. [^a-zA-Z0-9_]
string = ' This_is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches the leading space ('_' is a word character)
match

In [None]:
# \d is shorthand for [0-9]; for str patterns it also matches other Unicode
# decimal digits unless the re.ASCII flag is used.
pattern = r'\d+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches '111'
match

In [None]:
pattern = r'\D+'  # \D is shorthand for the inverse of \d, i.e. [^0-9]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches 'This is my number ' (everything before the first digit)
match

In [None]:
# \s is shorthand for whitespace, [ \t\n\r\f\v]; for str patterns it also
# matches other Unicode whitespace unless the re.ASCII flag is used.
pattern = r'\s+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches the first space (after 'This')
match

In [None]:
pattern = r'\S+'  # \S is shorthand for the inverse of \s, i.e. [^ \t\n\r\f\v]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches 'This'
match

In [None]:
# Find all tokens in string...
pattern = r'\S+'  # A token is a run of one or more non-whitespace characters
string = 'This is my number 111-456-7890... Gimme a ring.'
# findall returns a list of every non-overlapping match, scanning left to right.
# Punctuation stays attached to its token, e.g. '111-456-7890...' and 'ring.'.
re.findall(pattern, string)

In [None]:
pattern = r'(\d{3})-(\d{3})-(\d{4})'  # () creates a group
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
# groups() returns the text captured by each group as a tuple: ('111', '456', '7890')
match.groups()

In [None]:
# How will we parse out phone numbers in several different formats?
# One sample number per line: dash-separated, no separators, dot-separated,
# a doubled dot, and a parenthesized area code.
PHONE_NUMBERS = """
111-456-7890
2228901234
333.456.3847
333..456.3847
(444) 456-7890
"""

In [None]:
def find_numbers(pattern, numbers):
    """Print, for each line of *numbers*, the groups captured by *pattern*.

    Args:
        pattern: A compiled regular expression, or an uncompiled pattern
            (str or bytes), which is compiled once before the loop.
        numbers: Text with one candidate phone number per line. Whitespace
            around the whole text and around each line is ignored.

    Prints ``<line> => <tuple of groups>`` when the pattern matches at the
    start of the line and ``<line> => No match`` otherwise. Returns None.
    """
    # Accept plain pattern strings too, so callers need not compile first.
    if isinstance(pattern, (str, bytes)):
        pattern = re.compile(pattern)
    for line in numbers.strip().split('\n'):
        number_line = line.strip()
        # match() is just like search() but only matches at the beginning of the string
        match = pattern.match(number_line)
        result = match.groups() if match else 'No match'
        print(number_line, '=>', result)

In [None]:
# We can compile a pattern once and reuse it, which can be faster when it is used many times.
# With a compiled pattern we call its own search and match methods.
pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)  # Only the dash-separated number matches

In [None]:
# ? matches 0 or 1 instances of the preceding character, so each dash is optional.
pattern = re.compile(r'(\d{3})-?(\d{3})-?(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)  # Matches the dash-separated and the unseparated number

In [None]:
# \D matches a non-digit character. * matches 0 or more instances of the preceding element.
# The separators are captured as groups too. The '(444) ...' line still fails
# because match() needs three digits at the very start of the line.
pattern = re.compile(r'(\d{3})(\D*)(\d{3})(\D*)(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

In [None]:
# Putting it all together.
# A leading \D* skips anything before the area code (such as '('), and the
# separators are no longer captured, so every sample line yields three groups.
pattern = re.compile(r'\D*(\d{3})\D*(\d{3})\D*(\d{4})')
find_numbers(pattern, PHONE_NUMBERS)

In [None]:
# Flags
# With re.VERBOSE, whitespace inside the pattern is ignored and everything
# after a '#' is a comment, so the pattern can be laid out over several lines.
# (?P<name>...) creates a named group, retrievable with match.group('name').
pattern = r'''
    \D*                      # Non-numbers before phone number
    (?P<area_code>\d{3})     # Area code
    \D*                      # Non-numbers
    (?P<prefix>\d{3})        # Prefix
    \D*                      # Non-numbers
    (?P<line_number>\d{4})   # Last four numbers of phone number
'''

# Flags are combined with the bitwise OR operator.
flags = (
    re.IGNORECASE |  # Case-insensitive matching (shown for illustration; this pattern has no letters to match)
    re.VERBOSE  # Allow whitespace and comments inside the pattern
)
# Define the input here, as the other cells do, so this cell does not depend
# on whichever earlier cell last assigned `string`.
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.match(pattern, string, flags=flags)

print(match.groups())
print(match.group('area_code'))
print(match.group('prefix'))
print(match.group('line_number'))