
#### Metacharacters are characters that are interpreted in a special way by a RegEx engine. Here's a list of metacharacters:

#### [] . ^ $ * + ? {} () \ |


### [] - Square brackets
Square brackets specify a set of characters you wish to match.


Here, [abc] will match if the string you are trying to match contains any of the characters a, b, or c.

In [14]:
# Character-class demo: '[abc]' matches any single 'a', 'b' or 'c'.
import re

pattern = '[abc]'
test_string = 'abc de ca'
# findall() collects every non-overlapping match, in order of appearance.
result = re.findall(pattern, test_string)

# An empty list means the pattern did not match anywhere in the string.
if not result:
    print("Search unsuccessful.")
else:
    print("Search successful.", result)

Search successful. ['a', 'b', 'c', 'c', 'a']


In [80]:
import re

# Patterns are written as raw strings (r'...') so the backslash reaches the
# regex engine unchanged instead of being read as a string escape.

# \d matches a decimal digit; for ASCII text it is equivalent to [0-9].
print(re.findall(r'\d', "I went to him at 11 A.M. on 4th July 1886"))

# \w matches one word character; for ASCII text it is equivalent to [a-zA-Z0-9_].
print(re.findall(r'\w', "He said * in some_lang."))

# \w+ matches a run of one or more word characters.
p = re.compile(r'\w+')
print(p.findall("I went to him at 11 A.M., he said *** in some_language."))

# \W matches one non-word character (anything that \w does not match).
p = re.compile(r'\W')
print(p.findall("he said *** in some_language."))
  

['1', '1', '4', '1', '8', '8', '6']
['H', 'e', 's', 'a', 'i', 'd', 'i', 'n', 's', 'o', 'm', 'e', '_', 'l', 'a', 'n', 'g']
['I', 'went', 'to', 'him', 'at', '11', 'A', 'M', 'he', 'said', 'in', 'some_language']
[' ', ' ', '*', '*', '*', ' ', ' ', '.']


In [19]:
from re import split

# Patterns are raw strings (r'...') so the backslash is passed to the regex
# engine unchanged instead of being treated as a string escape.

# \W+ matches a run of one or more non-word characters, so split() cuts the
# string at every such run (here ', ', ' , ' and single spaces).
print(split(r'\W+', 'Words, words , Words'))
# The apostrophe is a non-word character too, hence "Word's" -> 'Word', 's'.
print(split(r'\W+', "Word's words Words"))

# ':', ' ' and ',' are all non-word characters, so each is a split point.
print(split(r'\W+', 'On 12th Jan 2016, at 11:02 AM'))

# \d+ matches a run of one or more digits.
# Splitting occurs at '12', '2016', '11' and '02' only.
print(split(r'\d+', 'On 12th Jan 2016, at 11:02 AM'))

['Words', 'words', 'Words']
['Word', 's', 'words', 'Words']
['On', '12th', 'Jan', '2016', 'at', '11', '02', 'AM']
['On ', 'th Jan ', ', at ', ':', ' AM']


In [20]:
import re

# With IGNORECASE the pattern 'ub' matches both the 'ub' in "Subject" and the
# 'Ub' in "Uber", so both occurrences are replaced by '~*'.
print(re.sub('ub', '~*', 'Subject has Uber booked already',
             flags=re.IGNORECASE))

# Without the flag matching is case-sensitive: the 'Ub' in "Uber" is not replaced.
print(re.sub('ub', '~*', 'Subject has Uber booked already'))

# count=1 caps the number of replacements at one, even though IGNORECASE
# would otherwise allow two matches.
print(re.sub('ub', '~*', 'Subject has Uber booked already',
             count=1, flags=re.IGNORECASE))

# The r prefix makes the pattern a raw string literal (backslashes are kept
# as written); \s matches one whitespace character on each side of "AND".
print(re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam',
             flags=re.IGNORECASE))


S~*ject has ~*er booked already
S~*ject has Uber booked already
S~*ject has Uber booked already
Baked Beans & Spam


In [21]:
import re

# subn() performs the same substitution as sub() but returns a
# (new_string, number_of_replacements) tuple.
print(re.subn('ub', '~*', 'Subject has Uber booked already'))

t = re.subn('ub', '~*', 'Subject has Uber booked already',
            flags=re.IGNORECASE)
print(t)
print(len(t))  # the tuple always has two items: the string and the count

# The first item is the same string sub() would have returned.
print(t[0])


('S~*ject has Uber booked already', 1)
('S~*ject has ~*er booked already', 2)
2
S~*ject has ~*er booked already


In [32]:
import re

# Named `text` rather than `str` so the built-in str type is not shadowed.
text = 'an example word:cat!!'
# Raw string so that \w reaches the regex engine unchanged.
match = re.search(r'word:\w+', text)
# search() returns None when nothing matches, so test the result before use.
if match:
    print('found', match.group())  # 'found word:cat'
else:
    print('did not find')

found word:cat


In [48]:
import re

# 'pi+' is a 'p' followed by one or more 'i'. search() reports the leftmost
# match and '+' is greedy, so the result is "piii" (not the later "pii").
match = re.search(r'pi+', 'dfd piiigipiig dffd')
if match is not None:
    print(match.group())

piii


In [58]:
# \d\s\d* is one digit, exactly one whitespace character, then zero or more
# digits. Only one whitespace character is allowed, so the match stops at
# "1 2" and does not reach the "3".
match = re.search(r'\d\s\d*', 'xx1 2   3xx')
if match is not None:
    print(match.group())


1 2


In [72]:
import re

# Named `text` rather than `str` so the built-in str type is not shadowed.
text = 'purple alic2e-b@google.com monkey dishwasher'
# [\w-]+ covers the user name (word characters and '-');
# [\w.]+ covers the host (word characters and '.').
match = re.search(r'[\w-]+@[\w.]+', text)
if match:
    print(match.group())  # 'alic2e-b@google.com'

alic2e-b@google.com


In [76]:
 str = 'purple alice-b@google.com monkey dishwasher'
match = re.search(r'([\w.-]+)@([\w.-]+)', str)
if match:
    print(match.group())  ## 'alice-b@google.com' (the whole match)
    print(match.group(1) ) ## 'alice-b' (the username, group 1)
    print(match.group(2) ) ## 'google.com' (the host, group 2)

alice-b@google.com
alice-b
google.com


In [78]:
import re

# Suppose we have a text with many email addresses.
# Named `text` rather than `str` so the built-in str type is not shadowed.
text = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'

# re.findall() returns a list of all the email strings it finds.
emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)  # ['alice@google.com', 'bob@abc.com']
for email in emails:
    # do something with each found email string
    print(email)

alice@google.com
bob@abc.com
