## Python RegEx

In [None]:
#A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.
#RegEx can be used to check if a string contains the specified search pattern.

## RegEx Module
Python has a built-in package called re, which can be used to work with Regular Expressions.

Import the re module:

In [1]:
import re

In [7]:
# Anchored search: "^The" pins the match to the start of the string,
# "Spain$" pins it to the end, and ".*" spans whatever lies in between.
txt = "The rain in Spain"
pattern = "^The.*Spain$"
x = re.search(pattern, txt)

In [8]:
# Show the result of the re.search() call from the previous cell.
print(x)

<re.Match object; span=(0, 17), match='The rain in Spain'>


In [9]:
#RegEx Functions
#The re module offers a set of functions that allows us to search a string for a match:

#Function	Description
#findall	Returns a list containing all matches
#search	Returns a Match object if there is a match anywhere in the string
#split	Returns a list where the string has been split at each match
#sub	Replaces one or many matches with a string

## Metacharacters
Metacharacters are characters with a special meaning:



In [10]:
# [] -- a set of characters
# "[a-m]" matches any single lower-case letter from "a" through "m";
# findall() collects every such letter in the order it appears.
pattern = "[a-m]"
txt = "The rain in Spain"
x = re.findall(pattern, txt)
print(x)

['h', 'e', 'a', 'i', 'i', 'a', 'i']


In [11]:
# \ -- signals a special sequence (can also be used to escape special characters)
# Example: find all digit characters.
txt = "That will be 59 dollars"

# The pattern is a raw string so the backslash reaches the regex engine
# untouched. In a plain string literal "\d" is an invalid escape sequence,
# which Python reports with a warning at compile time.
x = re.findall(r"\d", txt)
print(x)

['5', '9']


In [12]:
# . -- any character (except the newline character)
# "he..o": the letters "he", then exactly two characters of any kind, then an "o".
pattern = "he..o"
txt = "hello planet"
x = re.findall(pattern, txt)
print(x)

['hello']


In [13]:
# ^ -- starts with
# findall() yields a non-empty (truthy) list only when the text begins with "hello".
txt = "hello planet"
x = re.findall("^hello", txt)
print("Yes, the string starts with 'hello'" if x else "No match")

Yes, the string starts with 'hello'


In [14]:
# $ -- ends with
# The "$" anchor only lets "planet" match at the very end of the text.
txt = "hello planet"
x = re.findall("planet$", txt)
verdict = "Yes, the string ends with 'planet'" if x else "No match"
print(verdict)


Yes, the string ends with 'planet'


In [15]:
# * -- zero or more occurrences
# "he.*o": the letters "he", then any run of characters (possibly empty), then an "o".
pattern = "he.*o"
txt = "hello planet"
x = re.findall(pattern, txt)
print(x)


['hello']


In [16]:
# + -- one or more occurrences
# "he.+o": the letters "he", then at least one character of any kind, then an "o".
pattern = "he.+o"
txt = "hello planet"
x = re.findall(pattern, txt)
print(x)


['hello']


In [17]:
# ? -- zero or one occurrences
# "he.?o": the letters "he", then at most one character of any kind, then an "o".
# Expect no match here: two characters ("ll") sit between "he" and the "o",
# and ".?" allows zero or one, never two.
pattern = "he.?o"
txt = "hello planet"
x = re.findall(pattern, txt)
print(x)

[]


In [18]:
# {} -- exactly the specified number of occurrences
# "he.{2}o": the letters "he", then exactly 2 characters of any kind, then an "o".
pattern = "he.{2}o"
txt = "hello planet"
x = re.findall(pattern, txt)
print(x)


['hello']


In [19]:
# | -- either/or
# Alternation: the pattern matches wherever "falls" or "stays" occurs in the text.
txt = "The rain in Spain falls mainly in the plain!"
x = re.findall("falls|stays", txt)
print(x)

# A non-empty list is truthy, so it doubles as the "found something" flag.
print("Yes, there is at least one match!" if x else "No match")

['falls']
Yes, there is at least one match!


In [20]:
# The findall() function
# findall() returns a list of every match, in the order the matches are found.
# When nothing matches, the list is empty.
#
# Example: print a list of all occurrences of "ai".
pattern = "ai"
txt = "The rain in Spain"
x = re.findall(pattern, txt)
print(x)

['ai', 'ai']


In [21]:
# findall() with a pattern that never occurs: the result is an empty list.
pattern = "Portugal"
txt = "The rain in Spain"
x = re.findall(pattern, txt)
print(x)

[]


In [22]:
# The search() function
# search() scans the string and returns a Match object if there is a match.
# If the pattern occurs more than once, only the first occurrence is returned.
#
# Example: locate the first white-space character in the string.
txt = "The rain in Spain"

# Raw string so the backslash reaches the regex engine untouched; in a plain
# string literal "\s" is an invalid escape sequence and triggers a warning.
x = re.search(r"\s", txt)

print("The first white-space character is located in position:", x.start())

The first white-space character is located in position: 3


In [23]:
# The split() function
# split() returns a list where the string has been split at each match.
#
# Example: split at each white-space character.
txt = "The rain in Spain"

# Raw string so the backslash reaches the regex engine untouched; in a plain
# string literal "\s" is an invalid escape sequence and triggers a warning.
x = re.split(r"\s", txt)
print(x)


['The', 'rain', 'in', 'Spain']


In [24]:
# You can control the number of splits by specifying the maxsplit parameter.
#
# Example: split the string only at the first occurrence.
txt = "The rain in Spain"

# Raw string avoids the invalid "\s" escape in a plain literal. maxsplit is
# passed by keyword: it reads more clearly than a bare 1, and passing it
# positionally is deprecated in recent Python versions.
x = re.split(r"\s", txt, maxsplit=1)
print(x)

['The', 'rain in Spain']


## The sub() Function
The sub() function replaces the matches with the text of your choice:

Example
Replace every white-space character with the number 9:



In [25]:
# Replace every white-space character in the text with the character "9".
txt = "The rain in Spain"

# Raw string so the backslash reaches the regex engine untouched; in a plain
# string literal "\s" is an invalid escape sequence and triggers a warning.
x = re.sub(r"\s", "9", txt)
print(x)

The9rain9in9Spain


In [26]:
# You can control the number of replacements by specifying the count parameter.
#
# Example: replace only the first 2 occurrences.
txt = "The rain in Spain"

# Raw string avoids the invalid "\s" escape in a plain literal. count is
# passed by keyword: it reads more clearly than a bare 2, and passing it
# positionally is deprecated in recent Python versions.
x = re.sub(r"\s", "9", txt, count=2)
print(x)

The9rain9in Spain


In [27]:
# Find a word that begins with an upper-case "S" and print where it sits.
# r"\bS\w+": a word boundary, the letter "S", then one or more word characters.
# span() gives the (start, end) positions of the match as a tuple.
word_pattern = r"\bS\w+"
txt = "The rain in Spain"
x = re.search(word_pattern, txt)
print(x.span())

(12, 17)


In [28]:
# Find a word that begins with an upper-case "S" and print the word itself.
# group() returns the part of the string where the match was found.
word_pattern = r"\bS\w+"
txt = "The rain in Spain"
x = re.search(word_pattern, txt)
print(x.group())

Spain


In [29]:
# The Match object keeps the original input: .string is the text that was
# passed into the search function.
word_pattern = r"\bS\w+"
txt = "The rain in Spain"
x = re.search(word_pattern, txt)
print(x.string)

The rain in Spain
