In [2]:
def isPhoneNumber(text):
    """Return True if text has exactly the form DDD-DDD-DDDD, else False.

    The string must be 12 characters long: three decimal digits, a hyphen,
    three decimal digits, a hyphen, and four decimal digits.
    """
    if len(text) != 12:
        return False
    # (start, stop) slices that must consist only of decimal digits; the last
    # slice covers the final four digits as well.
    for start, stop in ((0, 3), (4, 7), (8, 12)):
        if not text[start:stop].isdecimal():
            return False
    # Both separators must be hyphens.
    return text[3] == '-' and text[7] == '-'

print('Is 415-555-4242 a phone number?')
print(isPhoneNumber('415-555-4242'))
print('Is Moshi moshi a phone number?')
print(isPhoneNumber('Moshi moshi'))

Is 415-555-4242 a phone number?
True
Is Moshi moshi a phone number?
False


In [3]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
# Slide a 12-character window over the message and report every window that
# isPhoneNumber accepts.
for start in range(len(message)):
    chunk = message[start:start + 12]
    if not isPhoneNumber(chunk):
        continue
    print('Phone number found: ' + chunk)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


In [4]:
import re

In [5]:
phoneNumberRegEx = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # a raw string is important here, or else every backslash would itself have to be escaped

In [9]:
# search() returns a Match object describing the first occurrence of the pattern in the string
matchOutput = phoneNumberRegEx.search('My number is 415-555-4242.')
matchOutput

<re.Match object; span=(13, 25), match='415-555-4242'>

In [12]:
# group() with no argument returns the matched text; here it is passed to print() as a separate argument
print('Phone number found:', matchOutput.group())

Phone number found: 415-555-4242


In [13]:
# same output, built with %-style string formatting
print('Phone number found: %s' % matchOutput.group())

Phone number found: 415-555-4242


In [14]:
# same output, built with an f-string
print(f'Phone number found: {matchOutput.group()}')

Phone number found: 415-555-4242


In [15]:
phoneNumberRegEx = re.compile(r'\d{3}-\d{3}-\d{4}') # {n} repeats the preceding item n times, so this matches the same text as \d\d\d-\d\d\d-\d\d\d\d
phoneNumberRegEx.search('My number is 415-555-4242.').group()

'415-555-4242'

In [18]:
# Capture groups: each parenthesised part of the pattern can be read back on its own.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
# Groups 1 and 2 are the parenthesised parts; group 0 (or no argument) is the whole match.
print(mo.group(1), mo.group(2), mo.group(0), sep='\n')
mo.group()

415
555-4242
415-555-4242


'415-555-4242'

In [19]:
mo.groups() # tuple with the text of every capture group; only meaningful when the pattern uses capture groups

('415', '555-4242')

In [20]:
areaCode, mainNumber = mo.groups() # unpack the two-element tuple into separate names

In [23]:
# the pipe (|) means "or": the pattern matches either alternative,
# and search() returns whichever one occurs first in the text
heroRegex = re.compile (r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey')
mo2 = heroRegex.search('Tina Fey and Batman')

In [24]:
# each search reports the alternative that appears first in its own string
print(mo1.group())
print(mo2.group())

Batman
Tina Fey


In [28]:
# A pipe inside parentheses chooses between suffixes that share the common prefix "Bat".
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
# Whole match twice (no argument and group 0), then only the captured suffix.
print(mo.group(), mo.group(0), mo.group(1), sep='\n')

Batmobile
Batmobile
mobile


In [29]:
# Optional matching: ? means the preceding group may appear zero times or once.
batRegex = re.compile(r'Bat(wo)?man')
mo1, mo2 = (
    batRegex.search(title)
    for title in ('The Adventures of Batman', 'The Adventures of Batwoman')
)
print(mo1.group(), mo2.group(), sep='\n')

Batman
Batwoman


In [34]:
# Star: * means the preceding group may appear zero or more times.
batRegex = re.compile(r'Bat(wo)*man')
mo1, mo2, mo3 = (
    batRegex.search(title)
    for title in (
        'The Adventures of Batman',
        'The Adventures of Batwoman',
        'The Adventures of Batwowowowoman',
    )
)
print(mo1.group(), mo2.group(), mo3.group(), sep='\n')

Batman
Batwoman
Batwowowowoman


In [35]:
# Plus: + means the preceding group must appear one or more times,
# so plain "Batman" (no "wo" at all) does not match.
batRegex = re.compile(r'Bat(wo)+man')
mo1, mo2, mo3 = (
    batRegex.search(title)
    for title in (
        'The Adventures of Batman',
        'The Adventures of Batwoman',
        'The Adventures of Batwowowowoman',
    )
)
print(mo1 is None)
print(mo2.group(), mo3.group(), sep='\n')

True
Batwoman
Batwowowowoman


In [38]:
# {n} repeats the preceding group exactly n times, so these describe the same strings:
#(Ha){3}
#(Ha)(Ha)(Ha)
# {m,n} repeats it between m and n times, so these describe the same strings:
#(Ha){3,5}
#(Ha)(Ha)(Ha)(Ha)?(Ha)?
#((Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha)(Ha))
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHa'

In [39]:
# Greedy matching: {3,5} takes as many repetitions as it can (all five "Ha" here)
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [40]:
# Non-greedy (lazy) matching: a ? after the {3,5} quantifier makes it take as
# few repetitions as possible (three "Ha" here).
# Stored under its own name so the greedy regex is not overwritten by a
# pattern that behaves the opposite way.
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo1 = nongreedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHa'

In [41]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
# with no groups in the pattern, findall() returns a list of the matched strings
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [42]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
# with groups in the pattern, findall() returns a list of tuples holding one string per group
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

In [43]:
# Shorthand character classes:
# \d any digit 0 to 9
# \D any character that is not a digit 0 to 9
# \w any letter, digit, or the underscore character
# \W any character that is not a letter, digit, or the underscore character
# \s any space, tab, or newline character
# \S any character that is not a space, tab, or newline
# \d+\s\w+ : one or more digits, one whitespace character, then one or more word characters
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

In [44]:
# define your own character class with []: it matches any single one of the listed characters
# (similar to "or" with |, but for individual characters instead of whole regexes)
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [45]:
# negative character class: a caret (^) right after the opening bracket
# matches every character that is NOT listed
vowelRegex = re.compile(r'[^aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

In [46]:
# outside a character class, a caret (^) anchors the match to the start of the string
beginsWithHello = re.compile(r'^Hello')
beginsWithHello.search('Hello, world!')

<re.Match object; span=(0, 5), match='Hello'>

In [47]:
# no match: "hello" is neither at the start of the string nor capitalised
beginsWithHello.search('He said hello.') is None

True

In [48]:
# a dollar sign ($) anchors the match to the end of the string
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 42')

<re.Match object; span=(16, 17), match='2'>

In [49]:
# no match: the string ends with a period, not a digit
endsWithNumber.search('Your number is forty two.') is None

True

In [51]:
# ^ and $ together force the entire string to match: one or more digits and nothing else
wholeStringIsNum = re.compile(r'^\d+$')
wholeStringIsNum.search('1234567890')

<re.Match object; span=(0, 10), match='1234567890'>

In [52]:
# no match: letters in the middle break the all-digits requirement
wholeStringIsNum.search('12345xyz67890') is None

True

In [53]:
# no match: a space is not a digit either
wholeStringIsNum.search('12 34567890') is None

True

In [54]:
# the dot (.) is a wildcard that matches any single character except a newline;
# it matches exactly one character, which is why "flat" yields "lat"
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

In [57]:
# .* greedily matches any run of characters (except newlines);
# each (.*) group captures one of the two names.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')

# whole match (twice), then the first-name and last-name groups
print(mo.group(), mo.group(0), mo.group(1), mo.group(2), sep='\n')

First Name: Al Last Name: Sweigart
First Name: Al Last Name: Sweigart
Al
Sweigart


In [58]:
# .*? matches everything in non-greedy fashion: it stops at the first '>' it can
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [59]:
# the greedy version runs on to the last '>' in the string
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

In [61]:
# without re.DOTALL the dot does not match a newline, so .* stops at the end of the first line
noNewlineRegex = re.compile('.*')
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

In [62]:
# pass re.DOTALL so the dot wildcard also matches newline characters
newlineRegex = re.compile('.*', re.DOTALL)
newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

In [64]:
# a caret that is not the first character inside [] is an ordinary character, so '^' itself is matched
vowelRegex = re.compile(r'[aeiou^AEIOU]')
vowelRegex.findall('RoboCop eats baby food. ^ BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', '^', 'A', 'O', 'O']

In [65]:
# for comparison: a leading caret negates the class, so everything except vowels is matched
vowelRegex = re.compile(r'[^aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

In [66]:
# case-insensitive matching: the flag has to be passed as the second argument to compile()
robocop = re.compile(r'robocop', re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [67]:
# sub(replacement, text) returns the text with every match replaced by the replacement string
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [261]:
# reuse matched text in the replacement: \1, \2, ... stand for the text of capture group 1, 2, ...
# here group 1 is only the first letter of the agent's name
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

In [129]:
# Use the verbose flag with a multi-line raw string to manage a large regular
# expression: whitespace and comments inside the pattern are ignored.
# The dot in "ext." is escaped (and optional) so that only "ext", "ext." or "x"
# introduce an extension; an unescaped dot would accept any character there.
# Group numbering is unchanged: the extension digits are still the ninth group.
phoneRegex = re.compile(r'''(
(\d{3}|\(\d{3}\))?               # area code
(\s|-|\.)?                       # separator
(\d{3})                            # first 3 digits
(\s|-|\.)                        # separator
(\d{4})                            # last 4 digits
(\s*(ext\.?|x)\s*(\d{2,5}))?       # extension
)''', re.VERBOSE)

In [130]:
# the whole number, including the "ext. 315" extension, is matched
phoneRegex.search('218-234-7564 ext. 315')

<re.Match object; span=(0, 21), match='218-234-7564 ext. 315'>

In [71]:
# combine several flags with the bitwise-or operator, written as a pipe (|)
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)

In [74]:
# Raw string: \w and \. are regex escapes, not Python string escapes, and are
# reported as invalid escape sequences when written in a normal string literal.
emailRegex = re.compile(r'\w+@\w+\.\w{2,4}', re.IGNORECASE)
emailRegex.search("My email is andy@huhaaha.com.")

<re.Match object; span=(12, 28), match='andy@huhaaha.com'>

In [153]:
# Verbose e-mail pattern. Written as a raw string because \. and \s are regex
# escapes, not Python string escapes (a normal literal reports them as invalid
# escape sequences). Group 1 is the full address; group 2 is the last domain part.
emailRegex = re.compile(r'''(
[a-z0-9._%+-]+       # username we do not need to escape in character matcher
@                    # @ symbol
[a-z0-9-]+           # site
(\.[a-z]{2,4}){1,2})  # domain capture like .gov.in or .com
(?:\s|$)''', re.IGNORECASE | re.VERBOSE)  # (?:...) is a non-capturing group, i.e. it must match but is not captured

In [147]:
# a hyphenated site and a two-part domain (.ac.in) are both accepted
emailRegex.search("My email is f2011@bits-pilani.ac.in")

<re.Match object; span=(12, 35), match='f2011@bits-pilani.ac.in'>

In [154]:
import pyperclip
text = str(pyperclip.paste()) # get text from clipboard
matches = []
# findall() yields one tuple of groups per phone number found
for groups in phoneRegex.findall(text):
    # indexes 1, 3 and 5 hold the area code, the first 3 digits and the last 4 digits respectively
    # NOTE(review): the area code is optional in phoneRegex, so groups[1] can be '' and the joined number would then start with '-' — confirm whether that is acceptable
    phoneNum = '-'.join([groups[1], groups[3], groups[5]])
    if groups[8] != '': # groups are numbered by their opening parenthesis (outer before inner), so the extension digits end up at index 8
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
# each e-mail tuple starts with the full address at index 0
for groups in emailRegex.findall(text):
    matches.append(groups[0])
matches

['800-420-7240',
 '415-863-9900',
 '415-863-9950',
 'info@nostarch.com',
 'media@nostarch.com',
 'academic@nostarch.com',
 'info@nostarch.co.in']

In [150]:
"""
800-420-7240
415-863-9900
415-863-9950
info@nostarch.com
media@nostarch.com
academic@nostarch.com
info@nostarch.co.in
"""

'\n800-420-7240\n415-863-9900\n415-863-9950\ninfo@nostarch.com\nmedia@nostarch.com\nacademic@nostarch.com\ninfo@nostarch.com\n'

In [155]:
# emailRegex has two capture groups, so each result is a (full address, last domain part) tuple
emailRegex.findall(text)

[('info@nostarch.com', '.com'),
 ('media@nostarch.com', '.com'),
 ('academic@nostarch.com', '.com'),
 ('info@nostarch.co.in', '.in')]

In [168]:
def checkDate(text):
    """Return True if the first DD-MM-YYYY or DD/MM/YYYY date found in text is a real date.

    The pattern accepts days 01-31, months 01-12 and years 1000-2999. The
    day is then checked against the length of that month, using the
    Gregorian leap-year rule for February. Returns False when no date-like
    substring is found.
    """
    dateRegex = re.compile(r'(0[1-9]|[12][0-9]|3[01])[-/]([0][1-9]|1[0-2])[-/]([12]\d{3})')
    mo = dateRegex.search(text)
    if mo is None:
        return False
    day, month, year = (int(part) for part in mo.groups())
    if month == 2:
        # leap years: divisible by 4 but not by 100, or divisible by 400
        isLeapYear = (year % 4 == 0 and year % 100 != 0) or year % 400 == 0
        return day <= (29 if isLeapYear else 28)
    if month in (4, 6, 9, 11):
        # April, June, September and November have 30 days
        return day <= 30
    return day <= 31

In [179]:
# February never has 30 days, not even in the leap year 2000
checkDate('30-02-2000')

False

In [262]:
def isStrongPassword(text):
    """Return True if text is at least 8 characters long and contains at
    least one digit, one uppercase letter (A-Z) and one lowercase letter (a-z).

    Each character class is searched for independently, so the order of the
    characters does not matter and characters separated by a newline are
    still found (a single pattern built from '.*' cannot cross newlines).
    """
    if len(text) < 8:
        return False
    requiredClasses = (r'[0-9]', r'[A-Z]', r'[a-z]')
    return all(re.search(pattern, text) is not None for pattern in requiredClasses)

In [263]:
# 8 characters with uppercase letters, a lowercase letter and digits
isStrongPassword('QWERy123')

True

In [32]:
def stripReg(text):
    """Return text with leading and trailing whitespace removed.

    A single substitution deletes the whitespace run at the start of the
    string and the whitespace run at the end of the string.
    """
    return re.sub(r'^\s*|\s*$', '', text)

In [33]:
# one leading and two trailing spaces are removed; inner spaces stay
stripReg(' This is a sample text .  ')

'This is a sample text .'

In [6]:
# show the documentation of the built-in str.strip for reference
help('test'.strip)

Help on built-in function strip:

strip(chars=None, /) method of builtins.str instance
    Return a copy of the string with leading and trailing whitespace removed.
    
    If chars is given and not None, remove characters in chars instead.



In [34]:
def stripReg2(text):
    """Return text with leading and trailing whitespace removed.

    Works in two passes: first the trailing whitespace is deleted, then the
    leading whitespace of that result.
    """
    withoutTrailing = re.sub(r'\s*$', '', text)
    return re.sub(r'^\s*', '', withoutTrailing)

In [35]:
# same input as for stripReg, same result
stripReg2(' This is a sample text .  ')

'This is a sample text .'

In [43]:
def stripReg3(text):
    """Return text with leading and trailing whitespace removed.

    Instead of deleting whitespace, this searches for the span that runs
    from the first non-whitespace character to the last one. Returns ''
    when text contains no non-whitespace character.
    """
    # \S(?:.*\S)? also matches a single non-whitespace character, and
    # re.DOTALL lets the span cross embedded newlines.
    mo = re.search(r'\S(?:.*\S)?', text, re.DOTALL)
    return mo.group() if mo is not None else ''

In [45]:
# same input as for stripReg and stripReg2, same result
stripReg3(' This is a sample text .  ')

'This is a sample text .'