# Regular Expression

* A regular expression (or regex) specifies a set of strings that match it
* The functions in Python's `re` module let you check whether a particular string matches a given regular expression

### REGEX functions
0. Literals
1. re.compile()
2. re.match()
3. re.search()
4. re.findall()
5. re.finditer()
6. re.fullmatch()
7. re.split()
8. re.sub()
9. re.subn()
10. re.escape()

In [None]:
##  Load example book

In [37]:
# Read the entire example book into memory as a single string.
enc = 'utf-8'
with open("miracle_in_the_andes.txt", mode="r", encoding=enc) as f:
    book = f.read()

### 0) REGEX LITERALS
+  Match the actual character

In [40]:
import re

def myfunc(string):
    """Return every occurrence of the literal text 'chapter' in ``string``.

    The pattern is compiled with ``re.I``, so the search ignores case; each
    hit is returned spelled exactly as it appears in the input.
    """
    return re.compile(r'chapter', re.I).findall(string)

myfunc(book)

['Chapter',
 'Chapter',
 'Chapter',
 'Chapter',
 'Chapter',
 'Chapter',
 'Chapter',
 'Chapter',
 'Chapter',
 'Chapter',
 'chapter',
 'Chapter']

### 1)  re.compile()

re.compile(pattern, flags=0)
+  compile a regular expression pattern into a pattern object; saved in a variable, it can be reused several times in a single program
+  As good practice, I always compile my pattern even if it is one-time use

In [43]:
import re

def myfunc(string):
    """Report whether ``string`` begins with a chapter heading.

    The pattern is compiled first and the compiled object's ``match()`` is
    then applied to ``string``. Prints the value returned by ``match()``
    (a Match object or ``None``) followed by a one-line verdict.
    """
    # 'chapter', one whitespace character, then a run of word characters;
    # re.I makes the pattern case-insensitive.
    chapter_re = re.compile(r'chapter\s\w+', re.I)
    result = chapter_re.match(string)
    print(result)
    print("Match found" if result else "Match not found")

myfunc(book)

<re.Match object; span=(0, 9), match='Chapter 1'>
Match found


### 2)  re.match()

re.match(pattern, string, flags=0)              
+  if zero or more characters at the beginning of string match the regular expression pattern, **return a corresponding Match.**
+  return None if the string does not match the pattern; note that this is different from a zero-length match.
+  even in MULTILINE mode, re.match() only matches at the beginning of the string, not at the beginning of each line
+  to locate anywhere in string, use re.search() instead
+  anchored at start of '^pattern'
+  the match object's start() method returns the starting position of the match in the string, and end() returns the end position.

In [46]:
import re
#print(book)
def myfunc(string):
    """Check whether ``string`` starts with a chapter heading using ``match()``.

    Prints the value returned by ``match()`` and then either "Match found"
    or "Match not found".
    """
    # Other patterns (e.g. r'before') can be substituted here to experiment;
    # re.I makes the comparison case-insensitive.
    found = re.compile(r'chapter\s\w+', re.I).match(string)
    print(found)
    if not found:
        print("Match not found")
        return
    print("Match found")

myfunc(book)

<re.Match object; span=(0, 9), match='Chapter 1'>
Match found


### 3)  re.search()

re.search(pattern, string, flags=0)
+  scan through string looking for the first location where this regular expression produces a match, and **return a corresponding Match.**
+  return None if no position in the string matches the pattern
+  not anchored at 'pattern'
+  the match object's start() method returns the starting position of the match in the string, and end() returns the end position.

In [49]:
import re

def myfunc(string):
    """Look for the first chapter heading anywhere in ``string`` using ``search()``.

    Prints the value returned by ``search()`` (a Match object or ``None``)
    and then either "Match found" or "Match not found".
    """
    # Other patterns (e.g. r'before' or r'\s+before\s+') can be substituted
    # here to experiment; re.I makes the comparison case-insensitive.
    hit = re.compile(r'chapter\s\w+', re.I).search(string)
    print(hit)
    if hit is None:
        print("Match not found")
    else:
        print("Match found")

myfunc(book)

<re.Match object; span=(0, 9), match='Chapter 1'>
Match found


### 4)  re.findall()

re.findall(pattern, string, flags=0)
+  return all the non-overlapping matches of a pattern in the string
+  and it **returns these matches as a list of strings or a list of tuples**
+  if multiple groups are present, return a list of tuples of strings matching the groups
+  non-capturing groups do not affect the form of the result

In [52]:
import re

def myfunc(string):
    """Return a list of all chapter headings found in ``string``.

    A heading is 'chapter' (any case), one whitespace character, and a run
    of word characters. The pattern has no groups, so each list element is
    the full matched text.
    """
    # Other patterns (e.g. r'before') can be substituted here to experiment.
    return re.compile(r'chapter\s\w+', flags=re.I).findall(string)

myfunc(book)

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

### 5) re.finditer()

re.finditer(pattern, string, flags=0)
+  works the same as re.findall() except that it **returns an iterator yielding Match objects** over all non-overlapping matches
+  the string is scanned left-to-right, and matches are returned in the order found
+  empty matches are included in the result

In [55]:
import re

def myfunc(string):
    """Print each chapter heading found in ``string``, one per line.

    Iterates over the Match objects yielded by ``finditer()`` and prints the
    full text of every match; nothing is printed when there are no matches.
    """
    # Other patterns (e.g. r'before') can be substituted here to experiment.
    chapter_heading = re.compile(r'chapter\s\w+', re.I)
    for hit in chapter_heading.finditer(string):
        print(hit[0])  # hit[0] is the whole match, same as hit.group()

myfunc(book)

Chapter 1
Chapter 2
Chapter 3
Chapter 4
Chapter 5
Chapter 6
Chapter 7
Chapter 8
Chapter 9
Chapter 10


### 6) re.fullmatch()

re.fullmatch(pattern, string, flags=0)
+  If the whole string matches the regular expression pattern, **return a corresponding Match.**
+  Return None if the string does not match the pattern  
+  anchored at start and end of '^pattern$'

In [98]:
import re

def myfunc(string):
    """Print ``string`` if all of it matches '<word> chapter <word>'.

    The pattern is a run of word characters, one whitespace character,
    'chapter' (any case), one whitespace character, and another run of word
    characters. ``fullmatch()`` succeeds only when the entire string fits.

    Prints the matched text on success, or "Match not found" otherwise.
    """
    pattern = re.compile(r'\w+\schapter\s\w+', re.I)
    array = pattern.fullmatch(string)
    # fullmatch() returns None when the string does not match in its
    # entirety; guard before calling .group() to avoid an AttributeError.
    if array is None:
        print("Match not found")
        return
    print(array.group())

myfunc('before Chapter ends')

before Chapter ends


### 7) re.split()

re.split(pattern, string, maxsplit=0, flags=0)
+  split the string at occurrences of pattern
+  If capturing parentheses are used in pattern, then the text of all groups in the pattern are also returned as part of the resulting list.


In [67]:
import re

# Split on runs of non-word characters; the delimiters themselves are dropped.
print(re.split(r'\W+','Words, words, words.'))
# Limit to a single split. maxsplit is passed by keyword because passing it
# positionally is deprecated since Python 3.13.
print(re.split(r'\W+','Words, words, words.',maxsplit=1))
# A capturing group around the delimiter keeps the delimiters in the result.
print(re.split(r'(\W+)','Words, words, words.'))
print(re.split(r'(\d+)','On 12th Jan 2016, at 11:02 AM'))
print(re.split(r'(\W+)','On 12th Jan 2016, at 11:02 AM'))
# \W* can match the empty string, so the string is also split at empty
# matches between adjacent characters.
print(re.split(r'\W*','...Words, words, words.'))
print(re.split(r'(\W*)','...Words, words, words.'))

['Words', 'words', 'words', '']
['Words', 'words, words.']
['Words', ', ', 'words', ', ', 'words', '.', '']
['On ', '12', 'th Jan ', '2016', ', at ', '11', ':', '02', ' AM']
['On', ' ', '12th', ' ', 'Jan', ' ', '2016', ', ', 'at', ' ', '11', ':', '02', ' ', 'AM']
['', '', 'W', 'o', 'r', 'd', 's', '', 'w', 'o', 'r', 'd', 's', '', 'w', 'o', 'r', 'd', 's', '', '']
['', '...', '', '', 'W', '', 'o', '', 'r', '', 'd', '', 's', ', ', '', '', 'w', '', 'o', '', 'r', '', 'd', '', 's', ', ', '', '', 'w', '', 'o', '', 'r', '', 'd', '', 's', '.', '', '', '']


### 8)  re.sub()

re.sub(pattern, repl, string, count=0, flags=0)
+  return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl
+  count is the maximum number of pattern occurrences to be replaced; count must be a non-negative integer.
+  Uses:\
Remove unnecessary characters\
Convert the case of characters in a string\
Standardize formats to prepare data for analysis\
Correct spelling errors\
Replace specific words with synonyms\
Check for valid patterns\
Clean input to ensure data integrity and prevent errors

In [70]:
# reformat string
import re

def myfunc(string):
    """Print ``string`` with every non-digit character removed."""
    # \D+ matches one or more consecutive non-digit characters.
    non_digits = re.compile(r'\D+')
    print(non_digits.sub('', string))

myfunc("(212)-456-7890")

2124567890


In [72]:
# replace whitespace
import re

# Replace every whitespace character in the sentence with an underscore.
string = "Jessa knows testing and machine learning"
pattern = r'\s'  # matches exactly one whitespace character
array = re.compile(pattern).sub("_", string)
print(array)

Jessa_knows_testing_and_machine_learning


In [74]:
# remove trailing and leading whitespace
import re

# Remove whitespace from both ends of the string while leaving inner spaces alone.
string = "   Jessa Knows Testing And Machine Learning   \t\n"
# ^\s+ covers the leading whitespace run, \s+$ the trailing one.
pattern = r'^\s+|\s+$'
array = re.compile(pattern).sub("", string)
print(array)

Jessa Knows Testing And Machine Learning


### 9) re.subn()

re.subn(pattern, repl, string, count=0, flags=0)

+  Perform the same operation as sub(), but return a tuple (new_string, number_of_subs_made)

In [77]:
# reformat string
import re

def myfunc(string):
    """Print the ``(new_string, count)`` tuple from stripping non-digits.

    ``subn()`` removes every run of non-digit characters from ``string`` and
    also reports how many replacements were made.
    """
    # \D+ matches one or more consecutive non-digit characters.
    non_digits = re.compile(r'\D+')
    outcome = non_digits.subn('', string)
    print(outcome)

myfunc("(212)-456-7890")

('2124567890', 3)


In [79]:
# replace whitespace
import re

# Replace every whitespace character with an underscore and count the replacements.
string = "Jessa knows testing and machine learning"
pattern = r'\s'  # matches exactly one whitespace character
array = re.compile(pattern).subn("_", string)  # (new_string, number_of_subs_made)
print(array)

('Jessa_knows_testing_and_machine_learning', 5)


In [81]:
# remove trailing and leading whitespace
import re

# Remove whitespace from both ends of the string and count the replacements.
string = "   Jessa Knows Testing And Machine Learning   \t\n"
# ^\s+ covers the leading whitespace run, \s+$ the trailing one.
pattern = r'^\s+|\s+$'
array = re.compile(pattern).subn("", string)  # (new_string, number_of_subs_made)
print(array)

('Jessa Knows Testing And Machine Learning', 2)


### 10)    re.escape()

re.escape(pattern)
+  Escape special characters in pattern.
+  This is useful when we need to match a string that contains special characters such as punctuation or regex operators, which would otherwise be interpreted as part of the regular expression syntax.

In [84]:
import re

# Escape the regex-special characters in a URL so it can be used as a literal pattern.
text = 'https://www.github.com'
escaped_text = re.escape(text)
print(escaped_text)

https://www\.github\.com
