Adapted from [RexEgg](http://www.rexegg.com/regex-python.html) 

In [1]:
import re
# import regex if you want to use a more advanced version of re.
# The regex module has many advanced
# features for regex lovers. http://pypi.python.org/pypi/regex

## A simple example

### The three parts of a regular expression

* pattern = defines the pattern to be used
* subject = the string to be processed
* regex = the compiled pattern object, which can efficiently match the pattern against a string

In [2]:
# The text we are going to search.
subject = 'apple:green:3 banana:yellow:5'

# Three colon-separated capture groups: two runs of word characters
# followed by a run of digits. Raw string, so the backslashes reach the
# regex engine untouched.
pattern = r'(\w+):(\w+):(\d+)'

# Compile once; the cells of this first example all reuse the compiled object.
regex = re.compile(pattern)

### The six main tasks we're likely to have

In [3]:
# Task 1: Is there a match?
# search() scans the subject and returns a match object (truthy) for the
# first hit, or None when the pattern does not occur anywhere.
print("*** Is there a Match? ***")
print("Yes" if regex.search(subject) else "No")

*** Is there a Match? ***
Yes


In [5]:
# Task 2: How many matches are there?
# findall() returns one list entry per non-overlapping match, so the
# length of that list is the number of matches.
matches = regex.findall(subject)
match_count = len(matches)
print("\n" + "*** Number of Matches ***")
print(match_count)


*** Number of Matches ***
2


In [7]:
# Task 3: What is the first match?
print("\n" + "*** First Match ***")
match = regex.search(subject)
# group(0) is the whole match; groups 1-3 are numbered by the order of the
# opening parentheses in the pattern.
if match is not None:
    for label, group_index in (
        ("Overall match: ", 0),
        ("Group 1 : ", 1),
        ("Group 2 : ", 2),
        ("Group 3 : ", 3),
    ):
        print(label, match.group(group_index))


*** First Match ***
Overall match:  apple:green:3
Group 1 :  apple
Group 2 :  green
Group 3 :  3


In [8]:
	
# Task 4: What are all the matches?
print("\n" + "*** All Matches ***\n")
print("------ Method 1: finditer ------\n")
# finditer() yields one match object per match, so both the overall match
# (group 0) and the individual capture groups are available.
group_labels = ("Overall match: ", "Group 1 : ", "Group 2 : ", "Group 3 : ")
for match in regex.finditer(subject):
    print("--- Start of Match ---")
    # The position in group_labels doubles as the group number.
    for group_index, label in enumerate(group_labels):
        print(label, match.group(group_index))
    print("--- End of Match---\n")


*** All Matches ***

------ Method 1: finditer ------

--- Start of Match ---
Overall match:  apple:green:3
Group 1 :  apple
Group 2 :  green
Group 3 :  3
--- End of Match---

--- Start of Match ---
Overall match:  banana:yellow:5
Group 1 :  banana
Group 2 :  yellow
Group 3 :  5
--- End of Match---



In [8]:
print("\n------ Method 2: findall ------\n")
# When the pattern has capture groups, findall() returns only the groups,
# not the overall match. Wrapping the whole pattern in an extra pair of
# parentheses makes the overall match Group 1 and shifts every original
# group number up by one.
wrappedpattern = "(" + pattern + ")"
wrappedregex = re.compile(wrappedpattern)
matches = wrappedregex.findall(subject)
# Each entry is a tuple: (overall match, original group 1, 2, 3).
labels = ("Overall Match: ", "Group 1: ", "Group 2: ", "Group 3: ")
for match in matches:
    print("--- Start of Match ---")
    for label, text in zip(labels, match):
        print(label, text)
    print("--- End of Match---\n")


------ Method 2: findall ------

--- Start of Match ---
Overall Match:  apple:green:3
Group 1:  apple
Group 2:  green
Group 3:  3
--- End of Match---

--- Start of Match ---
Overall Match:  banana:yellow:5
Group 1:  banana
Group 2:  yellow
Group 3:  5
--- End of Match---



In [9]:
# Task 5: Replace the matches
# Simple replacement: emit the three captured groups in reverse order.
def reversegroups(m):
    """Return groups 3, 2 and 1 of match object ``m`` joined by colons."""
    return ":".join(m.group(3, 2, 1))


print("\n" + "*** Replacements ***")
print("Let's reverse the groups")
# sub() calls reversegroups once per match and substitutes its return value.
replaced = regex.sub(reversegroups, subject)
print(replaced)


*** Replacements ***
Let's reverse the groups
3:green:apple 5:yellow:banana


In [11]:
# Task 6: Split
print(subject)
print("\n" + "*** Splits ***")
# The delimiter pattern matches either a colon or a single whitespace character.
splits = re.split(r":|\s", subject)
for split in splits:
    print(split)

apple:green:3 banana:yellow:5

*** Splits ***
apple
green
3
banana
yellow
5


### A more sophisticated example
The same six tasks, applied to a trickier pattern.

In [38]:
# The pattern has three alternatives. The first two match a {braced} span
# and a double-quoted "Tarzan<digits>" without capturing anything; only the
# third alternative, a bare Tarzan<digits>, captures into Group 1.
subject = 'Jane"" ""Tarzan12"" Tarzan11@Tarzan22 {4 Tarzan34}'
regex = re.compile(r'{[^}]+}|"Tarzan\d+"|(Tarzan\d+)')

# With a single capture group, findall() returns Group 1 for every match,
# which is an empty string whenever one of the uncaptured alternatives
# matched. Filtering out the empty strings leaves just the Group 1 captures.
matches = list(filter(None, regex.findall(subject)))
matches

['Tarzan11', 'Tarzan22']

In [17]:
# Task 1: Is there a match?
# `matches` holds the Group 1 captures; a non-empty list means at least one hit.
print("*** Is there a Match? ***")
print("Yes" if matches else "No")

*** Is there a Match? ***
Yes


In [18]:
# Task 2: How many matches are there?
# Every entry in `matches` is one Group 1 capture, so its length is the count.
match_count = len(matches)
print("\n" + "*** Number of Matches ***")
print(match_count)


*** Number of Matches ***
2


In [19]:
# Task 3: What is the first match?
print("\n" + "*** First Match ***")
# Guard against an empty list so indexing cannot raise IndexError.
if matches:
    print(matches[0])


*** First Match ***
Tarzan11


In [20]:
# Task 4: What are all the matches?
print("\n" + "*** Matches ***")
# Looping over an empty list prints nothing, so no emptiness check is needed.
for match in matches:
    print(match)


*** Matches ***
Tarzan11
Tarzan22


In [16]:
# Task 5: Replace the matches
def myreplacement(m):
    """Return the replacement text for match object ``m``.

    A match that captured Group 1 is replaced by "Superman". Any other
    match is returned unchanged via group(0), so it is left as it was.
    """
    return "Superman" if m.group(1) else m.group(0)


# sub() calls myreplacement once per match and substitutes its return value.
replaced = regex.sub(myreplacement, subject)
print("\n" + "*** Replacements ***")
print(replaced)


*** Replacements ***
Jane"" ""Tarzan12"" Superman@Superman {4 Tarzan34}


In [17]:
# Task 6: Split
# Cut the subject at every Group 1 match (the same matches Task 5 replaces).
# Slicing by match position, instead of splitting the replaced string on a
# marker word, stays correct even when the marker text already occurs
# somewhere in the subject, and does not depend on which marker was chosen.
splits = []
piece_start = 0
for m in regex.finditer(subject):
    if m.group(1):  # ignore matches made by the uncaptured alternatives
        splits.append(subject[piece_start:m.start()])
        piece_start = m.end()
splits.append(subject[piece_start:])  # text after the last Group 1 match
print("\n" + "*** Splits ***")
for split in splits:
    print(split)


*** Splits ***
Jane"" ""Tarzan12"" 
@
 {4 Tarzan34}


### More info

* The regular expression cheat-sheet
* [Regex tutorial on Tutorialspoint](https://www.tutorialspoint.com/python/python_reg_expressions.htm)
* [The O'Reilly book on regular expressions](https://www.amazon.com/dp/0596528124?tag=onamazon-20)

If you are going to work with a lot of unstructured data, you will be a heavy user of regular expressions, and it will be worth your time to learn the many features of the regular expression language.