In [1]:
import re

In [2]:
# Helper function that reports the outcome of a single-match regex call
def single_match_result(res):
    """Describe the result of a single-match call such as re.match or re.search.

    Parameters
    ----------
    res : match object or None
        The value returned by the regex call.

    Returns
    -------
    str or None
        The message "No matches found for your query!" when ``res`` is None.
        Otherwise the match details are printed and None is returned.
    """
    if res is not None:
        # The matched text is concatenated onto its label
        print("Match found: " + res.group())

        # Remaining details, printed in order: start/end indices, span,
        # the compiled pattern and the string that was searched
        details = (
            ("Match start: {}", res.start()),
            ("Match end: {}", res.end()),
            ("Match indices span: {}", res.span()),
            ("Query string: {}", res.re),
            ("Original string: {}", res.string),
        )
        for template, value in details:
            print(template.format(value))
        return None

    # Nothing matched: the message is returned rather than printed
    return ("No matches found for your query!")


## Search for a single match

### re.match
Check if a string starts with a given pattern




In [3]:
string = "Regular expressions are difficult but awesomely awesome"

prefix_pattern = "Regular"   # found at the very beginning of the string
shifted_pattern = "egular"   # present in the string, but only from index 1

# re.match only succeeds when the pattern matches at the start of the string
found = re.match(prefix_pattern, string)
not_found = re.match(shifted_pattern, string)

In [4]:
# Print the details of the successful match
single_match_result(found)

Match found: Regular
Match start: 0
Match end: 7
Match indices span: (0, 7)
Query string: re.compile('Regular')
Original string: Regular expressions are difficult but awesomely awesome


In [5]:
# not_found is None, so the helper returns its "no matches" message, shown as the cell's output
single_match_result(not_found)

'No matches found for your query!'

In [6]:
# Using the result in an if statement: a successful match is truthy,
# so it can serve directly as the condition
if found:
    print ("Found !!!!")

Found !!!!


In [7]:
# Check for a failed match: the result is None, which can be tested explicitly
if not_found is None:
    print("Not found !! ")

# Or rely on None being falsy
if not not_found:
    print("Not found !!")

Not found !! 
Not found !!


### re.search 
Searches for a query anywhere in the string

In [8]:
# Let's look for words derived from "awesome" (the stem followed by optional lowercase letters)

string = "Regular expressions are difficult but awesomely awesome"

derived_word_pattern = "awesome[a-z]*"
absent_pattern = "not found"

# Unlike re.match, re.search scans the whole string and stops at the first hit
found = re.search(derived_word_pattern, string)

not_found = re.search(absent_pattern, string)

In [9]:
# Print the details of the first occurrence located by re.search
single_match_result(found)

Match found: awesomely
Match start: 38
Match end: 47
Match indices span: (38, 47)
Query string: re.compile('awesome[a-z]*')
Original string: Regular expressions are difficult but awesomely awesome


In [10]:
# not_found is None, so the helper returns its "no matches" message, shown as the cell's output
single_match_result(not_found)

'No matches found for your query!'

## Search for multiple matches

### re.findall

Returns a list of all the non-overlapping substrings that match the pattern

In [11]:
string = "Regular expressions are difficult but awesomely awesome and awesomely awesome"

# re.findall collects every non-overlapping match into a list of strings
found = re.findall("awesome[a-z]*", string)
# When nothing matches, findall gives an empty list (not None, as re.match / re.search do)
not_found = re.findall("not found", string)

In [12]:
# Collect the starting index of every match
# Important: re.finditer returns an iterator that can only be consumed once, so in a Jupyter
# notebook re-create it in each cell before iterating on found_start_indices
found_start_indices = re.finditer("awesome[a-z]*", string)

indices = [occurrence.start() for occurrence in found_start_indices]
indices

[38, 48, 60, 70]

In [13]:
# Report the matched text and the span of every occurrence
# Important: re.finditer returns an iterator that can only be consumed once, so in a Jupyter
# notebook re-create it in each cell before iterating on found_start_indices
found_start_indices = re.finditer("awesome[a-z]*", string)

for occurrence in found_start_indices:
    print("{} span : {}".format(occurrence.group(), occurrence.span()))

awesomely span : (38, 47)
awesome span : (48, 55)
awesomely span : (60, 69)
awesome span : (70, 77)


## String substitution

### re.sub

Substitutes all occurrences of a pattern with a replacement string

In [14]:
string = "Regular expressions are difficult but awesomely awesome and awesomely awesome"

# Every occurrence of the pattern is replaced; keyword arguments spell out each role
substitute = re.sub(pattern="awesome", repl="intersting", string=string)
substitute

'Regular expressions are difficult but interstingly intersting and interstingly intersting'

In [15]:
# re.subn performs the same replacement as re.sub but returns a pair:
# (new string, number of substitutions made)
substitute_n = re.subn("awesome", "intersting", string)
new_string, substitution_count = substitute_n

print("The new string is: {}".format(new_string))
print("The number of subtitutions is: {}".format(substitution_count))

The new string is: Regular expressions are difficult but interstingly intersting and interstingly intersting
The number of subtitutions is: 4


In [16]:
# Let's swap the two words of every "awesomely awesome" pair, whatever its letter case

string = "Regular expressions are difficult but awesomely awesome and awesomely awesome and moreover AWESOMELY Awesome"

# Each word is captured in its own group; the replacement emits the groups in reverse order
pair_pattern = r"(awesomely) (awesome)"
swapped_order = r"\2 \1"

switch_words = re.sub(pair_pattern, swapped_order, string, flags=re.IGNORECASE)
switch_words

'Regular expressions are difficult but awesome awesomely and awesome awesomely and moreover Awesome AWESOMELY'

In [17]:
# Same swap as before, but limited to the first 2 occurrences

string = "Regular expression are difficult but awesomely awesome and awesomely awesome and moreover AWESOMELY Awesome"

# Each word is captured in its own group; the replacement emits the groups in reverse order
pair_pattern = r"(awesomely) (awesome)"
swapped_order = r"\2 \1"

# count caps the number of substitutions, so later pairs are left untouched
switch_words = re.sub(pair_pattern, swapped_order, string, count=2, flags=re.IGNORECASE)
switch_words

'Regular expression are difficult but awesome awesomely and awesome awesomely and moreover AWESOMELY Awesome'

## Matching subgroups

In [18]:
# Let's find matches for: 2 or 3 digits, followed by 1 or 2 non-digits, then 1 or more digits
string = "123aa45678a9859a56sS42aaa31a832a8"

regex = re.compile(r"(((\d{2,3})(\D{1,2}))(\d+))")

# Scan the string once and keep the match objects, so that every group can be
# reported without running the search again for each line of output
matches = list(regex.finditer(string))

# The regex contains several nested groups, numbered by their opening parenthesis:
# Group 1: the whole expression (2 or 3 digits, 1 or 2 non-digits, then 1 or more digits)
# Group 2: 2 or 3 digits followed by 1 or 2 non-digits
# Group 3: the leading 2 or 3 digits
# Group 4: the 1 or 2 non-digits that follow the leading digits
# Group 5: the 1 or more digits that close the expression
group_labels = ["First", "Second", "Third", "Fourth", "Fifth"]

for group_number, label in enumerate(group_labels, start=1):
    print("{} group: {}".format(label, [m.group(group_number) for m in matches]))
    print("{} group span: {}".format(label, [m.span(group_number) for m in matches]))
    print("-"*40)


#from each detected group let's get the digit part (group 2)


First group: ['123aa45678', '859a56', '31a832']
First group span: [(0, 10), (12, 18), (25, 31)]
----------------------------------------
Second group: ['123aa', '859a', '31a']
Second group span: [(0, 5), (12, 16), (25, 28)]
----------------------------------------
Third group: ['123', '859', '31']
Third group span: [(0, 3), (12, 15), (25, 27)]
----------------------------------------
Fourth group: ['aa', 'a', 'a']
Fourth group span: [(3, 5), (15, 16), (27, 28)]
----------------------------------------
Fifth group: ['45678', '56', '832']
Fifth group span: [(5, 10), (16, 18), (28, 31)]
----------------------------------------


## Conditional matching

In [5]:
# Check that a string starts and ends with a digit and contains at least
# 2 uppercase letters and at least 3 lowercase letters.
# Each content requirement is a lookahead: it inspects the rest of the string without
# consuming it. The repetition has to sit INSIDE the lookahead: (?:.*[A-Z]){2} means
# "an uppercase letter, twice". Repeating the lookahead itself, as in (?=.*[A-Z]){2},
# only re-checks the same position and therefore requires a single letter.
conditional_regex = re.compile(r"^[0-9](?=(?:.*[A-Z]){2})(?=(?:.*[a-z]){3}).*[0-9]$")

string = "12 Hc3c66pH666"
print(conditional_regex.search(string).group())

string = "1HHH9965P5p9cc5555555555"
print(conditional_regex.search(string).group())

# This string does not start with a digit, so search returns None;
# calling .group() on that result would raise AttributeError
string = "HHH9965P5p9cc5555555555"
assert conditional_regex.search(string) is None

12 Hc3c66pH666
1HHH9965P5p9cc5555555555


In [19]:
# Let's find all the matches for 2 or 3 digits followed by 1 or 2 word characters
# (\w covers letters, digits and the underscore)
string = "123aa45678a9859a56sS42aaa31a832a8"
# The lookahead itself matches an empty string; group 1 inside it captures the text
regex = re.compile(r"(?=(\d{2,3}\w{1,2}))")

# Keep the match objects so that they can be walked through twice
lookahead_matches = list(regex.finditer(string))

indices = [match.start() for match in lookahead_matches]
print("Starting indices are {}: ".format(indices))

for match in lookahead_matches:
    print("Found matches: {} - spans: {}".format(match.group(1), match.span(1)))

Starting indices are [0, 1, 5, 6, 7, 8, 11, 12, 13, 16, 20, 25, 28, 29]: 
Found matches: 123aa - spans: (0, 5)
Found matches: 23aa - spans: (1, 5)
Found matches: 45678 - spans: (5, 10)
Found matches: 5678a - spans: (6, 11)
Found matches: 678a9 - spans: (7, 12)
Found matches: 78a9 - spans: (8, 12)
Found matches: 9859a - spans: (11, 16)
Found matches: 859a5 - spans: (12, 17)
Found matches: 59a5 - spans: (13, 17)
Found matches: 56sS - spans: (16, 20)
Found matches: 42aa - spans: (20, 24)
Found matches: 31a8 - spans: (25, 29)
Found matches: 832a8 - spans: (28, 33)
Found matches: 32a8 - spans: (29, 33)


## Find overlapping matches

In [20]:
# Let's find all the runs of 3 consecutive digits, including overlapping ones
string = "123a456789859a5642aaa31a832a8"

# The lookahead itself matches an empty string, so the scan resumes one character
# further each time; group 1 inside the lookahead captures the 3 digits
regex = re.compile(r"(?=(\d{3}))")

# Scan once and keep the match objects so that they can be walked through twice
matches = list(regex.finditer(string))

indices = [match.start() for match in matches]

print("Starting indices are {}: ".format(indices))

for match in matches:
    print("Found matches: {} - spans: {}".format(match.group(1), match.span(1)))
    

Starting indices are [0, 4, 5, 6, 7, 8, 9, 10, 14, 15, 24]: 
Found matches: 123 - spans: (0, 3)
Found matches: 456 - spans: (4, 7)
Found matches: 567 - spans: (5, 8)
Found matches: 678 - spans: (6, 9)
Found matches: 789 - spans: (7, 10)
Found matches: 898 - spans: (8, 11)
Found matches: 985 - spans: (9, 12)
Found matches: 859 - spans: (10, 13)
Found matches: 564 - spans: (14, 17)
Found matches: 642 - spans: (15, 18)
Found matches: 832 - spans: (24, 27)
