# Chapter 7 (Pattern Matching with Regular Expressions)

In [1]:
import re

In [2]:
# Finding Patterns of Text Without Regular Expressions (Checking mobile phone pattern manually) e.g 021-768-4444 
def isPhoneNumber(text):
    if len(text) != 12:
        return False
    for i in range(0, 3):
        if not text[i].isdecimal():
            return False
    if text[3] != '-':
        return False
    for i in range(4, 7):
        if not text[i].isdecimal():
            return False
    if text[7] != '-':
        return False
    for i in range(8, 12):
        if not text[i].isdecimal():
            return False
    return True

print('415-555-4242 is a phone number:')
print(isPhoneNumber('415-555-4242'))
print('Moshi moshi is a phone number:')
print(isPhoneNumber('Moshi moshi'))


415-555-4242 is a phone number:
True
Moshi moshi is a phone number:
False


In [2]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
for i in range(len(message)):
    chunk = message[i:i+12]
    if isPhoneNumber(chunk):
        print('Phone number found: ' + chunk)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


In [244]:
# Finding Patterns of Text with Regular Expressions
# Creating Regex Objects
import re
# phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# phoneNumRegex = re.compile(r'^\d\d\d-\d\d\d-\d\d\d\d$')
phoneNumRegex = re.compile(r'(?<!\d)\d{3}\-\d{3}\-\d{4}(?!\d)')
# phoneNumRegex = re.compile(r'\b\d\d\d-\d\d\d-\d\d\d\d\b')
# phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d\b')

In [247]:
# Matching Regex Objects 
mo = phoneNumRegex.search('My number is 451-555-4242')
mo

<re.Match object; span=(13, 25), match='451-555-4242'>

In [248]:
if mo is not None:
    print('Phone number found: ' + mo.group())
else:
    print("Pattern is not matched")


Phone number found: 451-555-4242


In [252]:
# Matching Multiple Groups with the Pipe
# The | character is called a pipe. You can use it anywhere you want to match one
# of many expressions. For example, the regular expression r'Batman|Tina Fey'
# will match either 'Batman' or 'Tina Fey'.

heroRegex = re.compile (r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey.')
mo1.group()

'Batman'

In [257]:
# You can find all matching occurrences with the findall() 
mo1 = heroRegex.findall('Batman and Tina Fey. and smith')
mo1

['Batman', 'Tina Fey']

In [262]:
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
mo.group()

'Batmobile'

In [260]:
mo.group(1)

'mobile'

In [265]:
# Optional Matching with the Question Mark
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()


'Batwoman'

In [273]:
mo1 = batRegex.search('The Adventures of Batwoman')
mo1.group()

'Batwoman'

In [274]:
# Matching Zero or More with the Star
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()


'Batman'

In [280]:
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [281]:
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

In [291]:
# Matching One or More with the Plus
# + (or plus) means “match one or more.”

batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwowowoman')
if mo1 is not None:
    print("Match found:", mo1.group())
else:
    print("Match not found")

Match found: Batwowowoman


In [303]:
# Matching Specific Repetitions with Curly Brackets
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
mo1.group()


'HaHaHa'

In [308]:
# The findall() Method
# While search() will return a Match object of the first matched text
# in the searched string, the findall() method will return the strings of every
# match in the searched string.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('Cell: 415-555-9999 Work: 212-555-0000')
mo.group()

'415-555-9999'

In [310]:
# findall() will not return a Match object but a list of strings—
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [311]:
# Making Your Own Character Classes
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [313]:
# By placing a caret character (^) just after the character class’s opening
# bracket, you can make a negative character class. A negative character class
# will match all the characters that are not in the character class. 

consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

# Now, instead of matching every vowel, we’re matching every character
# that isn’t a vowel.

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

In [318]:
# You can also use the caret symbol (^) at the start of a regex to indicate that
# a match must occur at the beginning of the searched text. 

beginsWithHello = re.compile(r'^Hello')
matchObject = beginsWithHello.search('Hello world!')
matchObject.group()

'Hello'

In [319]:
beginsWithHello.search('He said hello.') == None

True

In [320]:
# The r'\d$' regular expression string matches strings that end with a
# numeric character from 0 to 9. 
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 42')

<re.Match object; span=(16, 17), match='2'>

In [321]:
# The r'^\d+$' regular expression string matches strings that both begin
# and end with one or more numeric characters.

wholeStringIsNum = re.compile(r'^\d+$')
wholeStringIsNum.search('1234567890')

<re.Match object; span=(0, 10), match='1234567890'>

In [322]:
wholeStringIsNum.search('12345xyz67890') == None

True

In [15]:
# The Wildcard Character
# The . (or dot) character in a regular expression is called a wildcard and will
# match any character except for a newline.
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

In [23]:
# Matching Everything with Dot-Star
# Remember that the
# dot character means “any single character except the newline,”

nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
mo.group(1)

'Al'

In [24]:
mo.group(2)

'Sweigart'

In [31]:
# Matching Newlines with the Dot Character
# The dot-star will match everything except a newline. By passing re.DOTALL as
# the second argument to re.compile(), you can make the dot character match
# all characters, including the newline character

noNewlineRegex = re.compile('.*')
matchObject = noNewlineRegex.search('Serve the public trust.\nProtect the innocent. \nUphold the law.')
matchObject.group()

'Serve the public trust.'

In [35]:
newlineRegex = re.compile('.*', re.DOTALL)
matchObject = newlineRegex.search('Serve the public trust.\nProtect the innocent. \nUphold the law.')
matchObject.group()

'Serve the public trust.\nProtect the innocent. \nUphold the law.'

In [36]:
# Case-Insensitive Matching
# >>> regex1 = re.compile('RoboCop')
# >>> regex2 = re.compile('ROBOCOP')
# >>> regex3 = re.compile('robOcop')
# >>> regex4 = re.compile('RobocOp')
# To make your regex case-insensitive, you can pass re.IGNORECASE or re.I as a second argument to re.compile(). 
robocop = re.compile(r'robocop', re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [38]:
robocop.search('ROBOCOP protects the innocent.').group()

'ROBOCOP'

In [39]:
robocop.search('Al, why does your programming book talk about robocop so much?').group()

'robocop'

In [40]:
# Substituting Strings with the sub() Method
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [41]:
stringTest = 'Agent Alice gave the secret documents to Agent Bob.'
stringTest.replace('Agent', 'CENSORED ')

'CENSORED  Alice gave the secret documents to CENSORED  Bob.'

In [4]:
# Managing Complex Regexes
# 1-Create a Regex for Phone Numbers
phoneRegex = re.compile(r'''(
 (\d{3}|\(\d{3}\))? # area code
 (\s|-|\.)? # separator
 (\d{3}) # first 3 digits
 (\s|-|\.) # separator
 (\d{4}) # last 4 digits
 (\s*(ext|x|ext.)\s*(\d{2,5}))? # extension
 )''', re.VERBOSE)

# 2-Create a Regex for Email Addresses
emailRegex = re.compile(r'''(
 [a-zA-Z0-9._%+-]+ # username
 @ # @ symbol
 [a-zA-Z0-9.-]+ # domain name
 (\.[a-zA-Z]{2,4}) # dot-something
 )''', re.VERBOSE)


In [8]:
# Extract username from email using python
# This solution is not feasible because its time and space complexity is much more than the solution using regex
emails = ['m.shaheerkhan199@gmail.com', 'm.areebkhan125@gmail.com','ravishankar.malhi1@gmail.com']
usernames = list()
for email in emails:
    username = ''
    for char in email:
        if char == '@':
            break
        else:
            username+=char
        
    usernames.append(username)

print(usernames)
            

['m.shaheerkhan199', 'm.areebkhan125', 'ravishankar.malhi1']


In [53]:
# Extract username from email using python re module
import re

usernames = list()
emails = ['m.shaheerkhan199@gmail.com', 'm.areebkhan125@gmail.com','ravishankar.malhi1@gmail.com']
# username_regex = re.compile(r'\w.?\w{,}')
username_regex = re.compile(r'([a-zA-Z0-9_.]+)')
for email in emails:
    match_object = username_regex.search(email)
    usernames.append(match_object.group())
print(usernames)

['m.shaheerkhan199', 'm.areebkhan125', 'ravishankar.malhi1']


In [20]:
# Strong Password Checker
import re

password_regex = re.compile(r'[(A-Z){2}(a-z){2}(0-9){2}($%&!#){2}]{8,}')
if password_regex.match('testpassword'):
    print("Your password is strong")
else:
    print("Your password is weak")


Your password is strong
