# REGULAR EXPRESSIONS

Basic pattern matching examples using Python Regular Expressions.

## Text Inputs

In [1]:
with open("data.txt", "r") as f:
    TEXT = f.read()
    
TEXT2 = """
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com

321--555-4321
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
mat
pat
bat
"""

## Code Snippets

In [2]:
import re

In [3]:
def print_matches(matches, text):
    for index, match in enumerate(matches):
        if index == 5:
            break
        
        start = match.span(0)[0]
        end = match.span(0)[1]
        
        print(f"--- RESULT {index + 1} ---")
        print(f"OBJECT: {match}")
        print(f"INDICES: {match.span(0)}")
        print(f"STRING: {text[start:end]}")
        print("\n")

### A. Phone Patterns

In [4]:
pattern = re.compile(r"\d{3}.\d{3}.\d{4}")
matches = pattern.finditer(TEXT)
print_matches(matches, TEXT)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(12, 24), match='615-555-7164'>
INDICES: (12, 24)
STRING: 615-555-7164


--- RESULT 2 ---
OBJECT: <re.Match object; span=(102, 114), match='800-555-5669'>
INDICES: (102, 114)
STRING: 800-555-5669


--- RESULT 3 ---
OBJECT: <re.Match object; span=(191, 203), match='560-555-5153'>
INDICES: (191, 203)
STRING: 560-555-5153


--- RESULT 4 ---
OBJECT: <re.Match object; span=(281, 293), match='900-555-9340'>
INDICES: (281, 293)
STRING: 900-555-9340


--- RESULT 5 ---
OBJECT: <re.Match object; span=(378, 390), match='714-555-7405'>
INDICES: (378, 390)
STRING: 714-555-7405




In [5]:
pattern = re.compile(r"\d{3}[-.]\d{3}[-.]\d{3}")
matches = pattern.finditer(TEXT)
print_matches(matches, TEXT)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(12, 23), match='615-555-716'>
INDICES: (12, 23)
STRING: 615-555-716


--- RESULT 2 ---
OBJECT: <re.Match object; span=(102, 113), match='800-555-566'>
INDICES: (102, 113)
STRING: 800-555-566


--- RESULT 3 ---
OBJECT: <re.Match object; span=(191, 202), match='560-555-515'>
INDICES: (191, 202)
STRING: 560-555-515


--- RESULT 4 ---
OBJECT: <re.Match object; span=(281, 292), match='900-555-934'>
INDICES: (281, 292)
STRING: 900-555-934


--- RESULT 5 ---
OBJECT: <re.Match object; span=(378, 389), match='714-555-740'>
INDICES: (378, 389)
STRING: 714-555-740




In [6]:
pattern = re.compile(r"[89]00.\d{3}.\d{4}")
matches = pattern.finditer(TEXT)
print_matches(matches, TEXT)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(102, 114), match='800-555-5669'>
INDICES: (102, 114)
STRING: 800-555-5669


--- RESULT 2 ---
OBJECT: <re.Match object; span=(281, 293), match='900-555-9340'>
INDICES: (281, 293)
STRING: 900-555-9340


--- RESULT 3 ---
OBJECT: <re.Match object; span=(467, 479), match='800-555-6771'>
INDICES: (467, 479)
STRING: 800-555-6771


--- RESULT 4 ---
OBJECT: <re.Match object; span=(1091, 1103), match='900-555-3205'>
INDICES: (1091, 1103)
STRING: 900-555-3205


--- RESULT 5 ---
OBJECT: <re.Match object; span=(1439, 1451), match='800-555-6089'>
INDICES: (1439, 1451)
STRING: 800-555-6089




### B. Digit Range Pattern

In [7]:
pattern = re.compile(r"[1-5]")
matches = pattern.finditer(TEXT)
print_matches(matches, TEXT)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(13, 14), match='1'>
INDICES: (13, 14)
STRING: 1


--- RESULT 2 ---
OBJECT: <re.Match object; span=(14, 15), match='5'>
INDICES: (14, 15)
STRING: 5


--- RESULT 3 ---
OBJECT: <re.Match object; span=(16, 17), match='5'>
INDICES: (16, 17)
STRING: 5


--- RESULT 4 ---
OBJECT: <re.Match object; span=(17, 18), match='5'>
INDICES: (17, 18)
STRING: 5


--- RESULT 5 ---
OBJECT: <re.Match object; span=(18, 19), match='5'>
INDICES: (18, 19)
STRING: 5




### C. Letter Range Pattern

In [8]:
pattern = re.compile(r"[a-zA-Z]")
matches = pattern.finditer(TEXT)
print_matches(matches, TEXT)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(0, 1), match='D'>
INDICES: (0, 1)
STRING: D


--- RESULT 2 ---
OBJECT: <re.Match object; span=(1, 2), match='a'>
INDICES: (1, 2)
STRING: a


--- RESULT 3 ---
OBJECT: <re.Match object; span=(2, 3), match='v'>
INDICES: (2, 3)
STRING: v


--- RESULT 4 ---
OBJECT: <re.Match object; span=(3, 4), match='e'>
INDICES: (3, 4)
STRING: e


--- RESULT 5 ---
OBJECT: <re.Match object; span=(5, 6), match='M'>
INDICES: (5, 6)
STRING: M




### D. Negate Range Patterns

In [9]:
pattern = re.compile(r"[^a-zA-Z]")
matches = pattern.finditer(TEXT)
print_matches(matches, TEXT)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(4, 5), match=' '>
INDICES: (4, 5)
STRING:  


--- RESULT 2 ---
OBJECT: <re.Match object; span=(11, 12), match='\n'>
INDICES: (11, 12)
STRING: 



--- RESULT 3 ---
OBJECT: <re.Match object; span=(12, 13), match='6'>
INDICES: (12, 13)
STRING: 6


--- RESULT 4 ---
OBJECT: <re.Match object; span=(13, 14), match='1'>
INDICES: (13, 14)
STRING: 1


--- RESULT 5 ---
OBJECT: <re.Match object; span=(14, 15), match='5'>
INDICES: (14, 15)
STRING: 5




In [10]:
pattern = re.compile(r"[^b]at")
matches = pattern.finditer(TEXT2)
print_matches(matches, TEXT2)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(286, 289), match='cat'>
INDICES: (286, 289)
STRING: cat


--- RESULT 2 ---
OBJECT: <re.Match object; span=(290, 293), match='mat'>
INDICES: (290, 293)
STRING: mat


--- RESULT 3 ---
OBJECT: <re.Match object; span=(294, 297), match='pat'>
INDICES: (294, 297)
STRING: pat




### E. Name Patterns

In [11]:
pattern = re.compile(r"Mr\.?\s[A-Z]\w*")
matches = pattern.finditer(TEXT2)
print_matches(matches, TEXT2)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(235, 246), match='Mr. Schafer'>
INDICES: (235, 246)
STRING: Mr. Schafer


--- RESULT 2 ---
OBJECT: <re.Match object; span=(247, 255), match='Mr Smith'>
INDICES: (247, 255)
STRING: Mr Smith


--- RESULT 3 ---
OBJECT: <re.Match object; span=(279, 284), match='Mr. T'>
INDICES: (279, 284)
STRING: Mr. T




In [12]:
pattern = re.compile(r"M(r|s|rs)\.?\s[A-Z]\w*")
matches = pattern.finditer(TEXT2)
print_matches(matches, TEXT2)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(235, 246), match='Mr. Schafer'>
INDICES: (235, 246)
STRING: Mr. Schafer


--- RESULT 2 ---
OBJECT: <re.Match object; span=(247, 255), match='Mr Smith'>
INDICES: (247, 255)
STRING: Mr Smith


--- RESULT 3 ---
OBJECT: <re.Match object; span=(256, 264), match='Ms Davis'>
INDICES: (256, 264)
STRING: Ms Davis


--- RESULT 4 ---
OBJECT: <re.Match object; span=(265, 278), match='Mrs. Robinson'>
INDICES: (265, 278)
STRING: Mrs. Robinson


--- RESULT 5 ---
OBJECT: <re.Match object; span=(279, 284), match='Mr. T'>
INDICES: (279, 284)
STRING: Mr. T




## Other Examples

### A. Emails

In [13]:
emails = """
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
"""

pattern = re.compile(r"[a-zA-Z0-9.-]+@[a-zA-Z-]+\.(com|edu|net)+")
matches = pattern.finditer(emails)
# also try: [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

print_matches(matches, emails)

--- RESULT 1 ---
OBJECT: <re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
INDICES: (1, 24)
STRING: CoreyMSchafer@gmail.com


--- RESULT 2 ---
OBJECT: <re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
INDICES: (25, 53)
STRING: corey.schafer@university.edu


--- RESULT 3 ---
OBJECT: <re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>
INDICES: (54, 83)
STRING: corey-321-schafer@my-work.net




### B. URLs

In [14]:
urls = """
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
"""

# group 0 is the entire URL
# group 1 is the www. subdomain
# group 2 is the host name
# group 3 is the extension

pattern = re.compile(r"https?://(www\.)?(\w+)(\.\w+)")
subbed_urls = pattern.sub(r"\2\3", urls)
print(f"SUBBED URLs: {subbed_urls}")

matches = pattern.finditer(urls)
for match in matches:
    print(f"GROUP 0: {match.group(0)}")
    print(f"GROUP 1: {match.group(1)}")
    print(f"GROUP 2: {match.group(2)}")
    print(f"GROUP 3: {match.group(3)}")
    print("\n")

SUBBED URLs: 
google.com
coreyms.com
youtube.com
nasa.gov

GROUP 0: https://www.google.com
GROUP 1: www.
GROUP 2: google
GROUP 3: .com


GROUP 0: http://coreyms.com
GROUP 1: None
GROUP 2: coreyms
GROUP 3: .com


GROUP 0: https://youtube.com
GROUP 1: None
GROUP 2: youtube
GROUP 3: .com


GROUP 0: https://www.nasa.gov
GROUP 1: www.
GROUP 2: nasa
GROUP 3: .gov


