# Regex in Python

In [35]:
text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

'''

sentence = 'Start a sentence and then bring it to an end'


In [3]:
import re

In [4]:
print('\tTab')

	Tab


In [5]:
print(r'\tTab')

\tTab


In [8]:
# Compile method
pattern = re.compile(r'abc')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

In [None]:
# Search for period
pattern = re.compile(r'\.')

pattern = re.compile(r'coreyms\.com')

In [None]:
# Search for digits
pattern = re.compile(r'\d')

# Not a digit
pattern = re.compile(r'\D')

In [12]:
# Start and end of sentence
pattern = re.compile(r'^Start')
pattern = re.compile(r'$end')

matches = pattern.finditer(sentence)

for match in matches:
    print(match)

<re.Match object; span=(0, 5), match='Start'>


In [16]:
def find_matches(pattern, text_to_search):
    matches = pattern.finditer(text_to_search)
    
    for match in matches:
        print(match)

In [17]:
# Look for sequence of digits
pattern = re.compile(r'\d\d\d')
find_matches(pattern, text_to_search)

<re.Match object; span=(55, 58), match='123'>
<re.Match object; span=(58, 61), match='456'>
<re.Match object; span=(61, 64), match='789'>
<re.Match object; span=(151, 154), match='321'>
<re.Match object; span=(155, 158), match='555'>
<re.Match object; span=(159, 162), match='432'>
<re.Match object; span=(164, 167), match='123'>
<re.Match object; span=(168, 171), match='555'>
<re.Match object; span=(172, 175), match='123'>
<re.Match object; span=(177, 180), match='123'>
<re.Match object; span=(181, 184), match='555'>
<re.Match object; span=(185, 188), match='123'>
<re.Match object; span=(190, 193), match='800'>
<re.Match object; span=(194, 197), match='555'>
<re.Match object; span=(198, 201), match='123'>
<re.Match object; span=(203, 206), match='900'>
<re.Match object; span=(207, 210), match='555'>
<re.Match object; span=(211, 214), match='123'>


In [21]:
# Look for phone number
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
find_matches(pattern, text_to_search)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [22]:
# Look for phone number with - or . separator
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')
find_matches(pattern, text_to_search)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [24]:
# Look for 800 and 900 numbers
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')
find_matches(pattern, text_to_search)

<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [30]:
# Look for phone numbers in file
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d')

with open('data.txt', 'r') as f:
    contents = f.read()
    
    find_matches(pattern, contents)

In [29]:
# Look for 800, 900 phone numbers in file
pattern = re.compile(r'[89]\d\d.\d\d\d.\d\d\d\d')

with open('data.txt', 'r') as f:
    contents = f.read()
    
    find_matches(pattern, contents)

In [28]:
# Range in brackets in character set
pattern = re.compile(r'[a-zA-Z]')

# Match NOT the characters in character set
pattern = re.compile(r'[^a-zA-Z]')
find_matches(pattern, text_to_search)

In [36]:
# Match anything that is not a "b"
my_text = """
cat
mat
pat
bat
sat
"""
pattern = re.compile(r'[^b]at')
find_matches(pattern, my_text)

<re.Match object; span=(1, 4), match='cat'>
<re.Match object; span=(5, 8), match='mat'>
<re.Match object; span=(9, 12), match='pat'>
<re.Match object; span=(17, 20), match='sat'>


## Quantifiers

In [37]:
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
# Quantifier to match multiple characters at a time
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
find_matches(pattern, text_to_search)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [38]:
# Mr *
pattern = re.compile(r'Mr\.')
find_matches(pattern, text_to_search)

<re.Match object; span=(216, 219), match='Mr.'>
<re.Match object; span=(260, 263), match='Mr.'>


In [40]:
pattern = re.compile(r'Mr\.?')
find_matches(pattern, text_to_search)

<re.Match object; span=(216, 219), match='Mr.'>
<re.Match object; span=(228, 230), match='Mr'>
<re.Match object; span=(246, 248), match='Mr'>
<re.Match object; span=(260, 263), match='Mr.'>


In [41]:
# Upto first letter of name
pattern = re.compile(r'Mr\.?\s[A-Z]')
find_matches(pattern, text_to_search)

<re.Match object; span=(216, 221), match='Mr. S'>
<re.Match object; span=(228, 232), match='Mr S'>
<re.Match object; span=(260, 265), match='Mr. T'>


In [43]:
# Upto first name
pattern = re.compile(r'Mr\.?\s[A-Z]\w*')
find_matches(pattern, text_to_search)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(260, 265), match='Mr. T'>


## Groups

In [44]:
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')
find_matches(pattern, text_to_search)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


### Emails

In [45]:
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
core-321-schafer@my-work.net
'''

In [56]:
pattern = re.compile(r'[a-zA-Z.0-9-]+@[a-zA-Z-]+\.(com|edu|net)')
find_matches(pattern, emails)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 82), match='core-321-schafer@my-work.net'>


### URLs

In [59]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

In [64]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
find_matches(pattern, urls)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [71]:
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(0))

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


In [73]:
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(1))

www.
None
None
www.


In [74]:
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(2))

google
coreyms
youtube
nasa


In [75]:
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(3))

.com
.com
.com
.gov


## Substitutions

In [76]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# Backreferences to groups
subbed_urls = pattern.sub(r'\2\3', urls)

print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



## Find Methods

In [78]:
# Find all
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')

# Returns only the groups
matches = pattern.findall(text_to_search)

for match in matches:
    print(match)

Mr
Mr
Ms
Mrs
Mr


In [81]:
# Match
sentence = 'Start a sentence and then bring it to an end'

pattern = re.compile(r'Start')

matches = pattern.match(sentence)

print(matches)

<re.Match object; span=(0, 5), match='Start'>


In [82]:
# Search
matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(0, 5), match='Start'>


## Flags

In [83]:
# Ignore case
pattern = re.compile(r'start', re.IGNORECASE)
# pattern = re.compile(r'start', re.I)
matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(0, 5), match='Start'>
