In [1]:
### Automate The Boring Stuff With Python
### https://automatetheboringstuff.com/chapter7/

In [None]:
### RegEx Basics

In [1]:
import re #Import Regular Expression modules

In [6]:
message = "Call me 415-555-1011 tomorrow or at 415-555-9999"

In [4]:
phoneNumRegEx = re.compile(r"\d\d\d-\d\d\d-\d\d\d\d")

In [5]:
mo = phoneNumRegEx.search(message)

In [9]:
print(mo.group()) 

415-555-1011


In [12]:
print(phoneNumRegEx.findall(message)) #Find all patterns

['415-555-1011', '415-555-9999']


In [None]:
### RegEx Group and Pipes

In [4]:
phoneNumRegEx = re.compile(r"(\d\d\d)-(\d\d\d-\d\d\d\d)") #This will create groups within the match object

In [7]:
mo = phoneNumRegEx.search(message)

In [8]:
mo.group(1) #Returns the first bracket group

'415'

In [9]:
mo.group(2) #Returns the second bracket group

'555-1011'

In [None]:
##Piping

In [10]:
batRegex = re.compile(r"Bat(man|mobile|copter|bat)")

In [11]:
mo = batRegex.search("Batmobile Lost A Wheel")

In [12]:
mo.group()

'Batmobile'

In [None]:
##Repetition and Greedy/NonGreedy Matching

In [None]:
#? 

In [13]:
batRegex = re.compile(r"Bat(wo)?man") #Match zero or one time

In [17]:
mo = batRegex.search("The Adventures of Batwoman") #(wo)? allows at most one "wo"; a name like "Batwowoman" would make search() return None

In [18]:
mo.group()

'Batwoman'

In [19]:
phoneRegex = re.compile(r"(\d\d\d-)?\d\d\d-\d\d\d\d")

In [20]:
phoneRegex.search("MY phone number is 415-555-1234. Call me tomorrow")

<re.Match object; span=(19, 31), match='415-555-1234'>

In [21]:
phoneRegex.search("MY phone number is 555-1234. Call me tomorrow") #Would still have a match

<re.Match object; span=(19, 27), match='555-1234'>

In [None]:
### To match a literal question mark, escape it with a backslash: "\?"

In [None]:
#*

In [22]:
batRegex = re.compile(r"Bat(wo)*man") #Can appear zero or more time

In [23]:
batRegex.search("The Adventures of Batman")

<re.Match object; span=(18, 24), match='Batman'>

In [24]:
batRegex.search("The Adventures of Batwoman")

<re.Match object; span=(18, 26), match='Batwoman'>

In [25]:
batRegex.search("The Adventures of Batwowowowowowoman")

<re.Match object; span=(18, 36), match='Batwowowowowowoman'>

In [None]:
### To match a literal asterisk, escape it with a backslash: "\*"

In [None]:
# +

In [26]:
batRegex = re.compile(r"Bat(wo)+man") #Can appear one or more time

In [27]:
batRegex.search("The Adventures of Batman") #Shows none

In [28]:
batRegex.search("The Adventures of Batwoman")

<re.Match object; span=(18, 26), match='Batwoman'>

In [None]:
### To match a literal plus sign, escape it with a backslash: "\+"

In [29]:
regex = re.compile(r"\+\*\?")

In [34]:
regex.search("I learned about +*? regex syntax") 

<re.Match object; span=(16, 19), match='+*?'>

In [None]:
# {x} matches exactly the amount of repetition

In [35]:
haRegex = re.compile(r"(Ha){3}") #Match Ha three times

In [36]:
haRegex.search("HaHaHa")

<re.Match object; span=(0, 6), match='HaHaHa'>

In [39]:
phoneRegex = re.compile(r"((\d\d\d-)?\d\d\d-\d\d\d\d(,)?){3}")

In [40]:
phoneRegex.search("My numbers are 415-555-1234,555-4242,212-555-0000")

<re.Match object; span=(15, 49), match='415-555-1234,555-4242,212-555-0000'>

In [None]:
# {x,y} at least x, at most y

In [42]:
haRegex = re.compile(r"(Ha){3,5}") #Minimum of 3 and maximum of 5 repetitions

In [44]:
haRegex.search("HaHaHa")

<re.Match object; span=(0, 6), match='HaHaHa'>

In [45]:
haRegex.search("HaHaHaHaHa")

<re.Match object; span=(0, 10), match='HaHaHaHaHa'>

In [48]:
haRegex.search("HaHaHaHaHaHaHaHa") #Shows only 5 ha

<re.Match object; span=(0, 10), match='HaHaHaHaHa'>

In [47]:
haRegex=re.compile(r"(Ha){3,}") #This is the same as three or more

In [49]:
digitRegex = re.compile(r"(\d){3,5}")

In [53]:
digitRegex.search("1234567890") #Greedy Matching, match the longest string possible

<re.Match object; span=(0, 5), match='12345'>

In [52]:
digitRegex = re.compile(r"(\d){3,5}?") #Non Greedy Matching, matches the shortest possible string

In [None]:
### Character Classes and the findall() method

In [2]:
phoneRegEx = re.compile(r"\d\d\d-\d\d\d-\d\d\d\d")

In [3]:
phoneRegEx.findall(message) #Find all matches of the RegEx object in the given string
#findall() must be passed the string to search (here the `message` defined in an earlier cell)
#Returns a list of plain strings
#when the pattern has zero or one group

['415-555-1011', '415-555-9999']

In [None]:
phoneRegEx = re.compile(r"(\d\d\d)-(\d\d\d-\d\d\d\d)")

In [None]:
phoneRegEx.findall(message) #Find all matches of the RegEx object in the given string
#findall() must be passed the string to search (here the `message` defined in an earlier cell)
#Returns a list of tuples, one tuple per match with one string per group,
#when the pattern has two or more groups

In [None]:
phoneRegEx = re.compile(r"((\d\d\d)-(\d\d\d-\d\d\d\d))")

In [None]:
phoneRegEx.findall(message) #Three groups in the pattern, so this returns a list of 3-element tuples (one tuple per match)

In [None]:
xmasRegex = re.compile(r"\d+\s\w+") #One or more digits, one whitespace character, then one or more word characters
#\s matches any whitespace character (space, tab, newline)
#\w matches any word character (letter, digit or underscore)

In [None]:
#Make your own character class

In [8]:
doublevowelRegex = re.compile(r"[aeiouAEIOU]{2}") #Create your class by using square brackets

In [13]:
doublevowelRegex.findall("Robocop eats baby food")

['ea', 'oo']

In [12]:
doublevowelRegex = re.compile(r"[^aeiouAEIOU]") #Putting a caret does the opposite thing, negative matching

In [None]:
#Caret/Dollar Characters

In [15]:
beginsWithHelloRegex = re.compile(r"^Hello") #The caret anchors the match to the start of the string, so the text must begin with Hello

In [16]:
beginsWithHelloRegex.search("Hello There!")

<re.Match object; span=(0, 5), match='Hello'>

In [17]:
beginsWithHelloRegex.search("He said Hello There!") #This wont work

In [19]:
endsWithWorldRegex = re.compile(r"world!$") #The dollar sign anchors the match to the end of the string, so the text must end with world!

In [21]:
endsWithWorldRegex.search("Hello world!")

<re.Match object; span=(6, 12), match='world!'>

In [22]:
allDigitsRegex = re.compile(r"^\d+$") #Digit character, one or more, needs to begin and end with the pattern between caret and dollar

In [24]:
allDigitsRegex.search("5344653427856346782563427856342")

<re.Match object; span=(0, 31), match='5344653427856346782563427856342'>

In [None]:
# . this stands for any single character except for the newline 

In [25]:
atRegex = re.compile(r".at")

In [28]:
atRegex.findall("The cat in the hat sat on the flat mat")

['cat', 'hat', 'sat', 'lat', 'mat']

In [27]:
atRegex = re.compile(r".{1,2}at") #This will include white space character as well

In [None]:
#We can combine . and * together: .* matches any text at all (zero or more of any character except newline)

In [29]:
nameRegex = re.compile(r"First Name: (.*) Last Name: (.*)") #Two groups, so findall() returns a list of tuples of strings

In [32]:
nameRegex.findall("First Name: Al Last Name: Sweigart")  #Just matches ANYTHING

[('Al', 'Sweigart')]

In [None]:
#.* is greedy: it tries to match as much text as possible

In [None]:
#Non Greedy .*?

In [33]:
serve = "<To serve humans> for dinner.>"

In [37]:
nongreedy = re.compile(r"<(.*?)>")

In [38]:
nongreedy.findall(serve)

['To serve humans']

In [39]:
greedy = re.compile(r"<(.*)>")

In [40]:
greedy.findall(serve)

['To serve humans> for dinner.']

In [41]:
prime = "Serve the public trust.\nProtect the innocent.\nUpload the law."

In [42]:
dotStar = re.compile(r".*") #Reaches until the \n

In [44]:
dotStar = re.compile(r".*",re.DOTALL) #This will include the newline as well

In [45]:
dotStar.search(prime)

<re.Match object; span=(0, 61), match='Serve the public trust.\nProtect the innocent.\nU>

In [46]:
vowelRegex = re.compile(r"[aeiou]")

In [49]:
vowelRegex.findall("Al, why does your programming book talk about RoboCop so much?")

['o',
 'e',
 'o',
 'u',
 'o',
 'a',
 'i',
 'o',
 'o',
 'a',
 'a',
 'o',
 'u',
 'o',
 'o',
 'o',
 'o',
 'u']

In [None]:
#It doesnt show the A in the Al

In [48]:
vowelRegex = re.compile(r"[aeiou]",re.I) #re.I means IGNORECASE so it makes it case insensitive

In [None]:
#Sub and Verbose Method

In [50]:
namesRegex = re.compile(r"Agent \w+")

In [54]:
namesRegex.findall("Agent Alice gave the secret documents to Agent Bob")

['Agent Alice', 'Agent Bob']

In [52]:
namesRegex.sub("REDACTED","Agent Alice gave the secret documents to Agent Bob")

'REDACTED gave the secret documents to REDACTED'

In [53]:
namesRegex = re.compile(r"Agent (\w)\w*") #Group 1 captures the first character of the name; \w* matches zero or more further word characters

In [None]:
#This will return the group and not the entire string

In [55]:
namesRegex.sub(r"Agent \1****","Agent Alice gave the secret documents to Agent Bob")

'Agent A**** gave the secret documents to Agent B****'

In [None]:
#A backslash followed by a number (e.g. \1) stands for the text matched by that numbered group

In [None]:
#Verbose Mode

In [56]:
re.compile(r"""\d\d\d
-
\d\d\d
-
\d\d\d\d""", re.VERBOSE)

re.compile(r'\d\d\d-\d\d\d-\d\d\d\d', re.UNICODE)

In [57]:
#Create a Phone and Email Scraper
import pyperclip

In [71]:
# Create a regex for phone numbers.
# The previous pattern had unbalanced parentheses and failed to compile; it
# also could not match a number without an area code, because the first
# separator was mandatory. The area code and its separator are now optional
# together, and the extension is optional as a unit.
# Relies on the `re` module imported in an earlier cell.
phoneRegex = re.compile(r"""
# Accepted forms: 415-555-0000, (415) 555-0000, 555-0000 ext 12345, ext. 12345, x12345
(                               # group 1: the whole phone number
    (                           # area code plus its separator (optional)
        (\d\d\d|\(\d\d\d\))     # area code, bare or wrapped in parentheses
        (\s|-)                  # first separator
    )?
    \d\d\d                      # first 3 digits
    -                           # separator
    \d\d\d\d                    # last 4 digits
    (                           # extension (optional)
        \s*(ext\.?\s*|x)        # extension word part: ext, ext. or x
        (\d{2,5})               # extension number part
    )?
)
""", re.VERBOSE)
#TODO: Create a regex for email addresses
#TODO: Get the text off the clipboard
#TODO: Extract the email/phone from this text
#TODO: Copy the extracted email/phone to the clipboard