## Regular Expression Functions

In [1]:
import re

### Find All:

In [2]:
txt = "The rain in Spain"

# Collect every non-overlapping occurrence of "ai" as a list of strings.
x = re.compile("ai").findall(txt)
print(x)

['ai', 'ai']


## Search

In [3]:
txt = "The rain in Spain"

# Raw string: "\s" inside a plain literal is an invalid escape sequence,
# which Python reports with a warning when the cell is compiled.
x = re.search(r"\s", txt)

# search() returns None when nothing matches; this sample text contains a
# space, so calling .start() on the result is safe here.
print("The first white-space character is located in position:", x.start())

The first white-space character is located in position: 3


In [4]:
txt = "The rain in Spain"

# search() yields None when the pattern occurs nowhere in the text.
x = re.compile("Portugal").search(txt)

print(x)

None


## Split

In [5]:
txt = "The rain in Spain"

# Split at every white-space character.
# Raw string keeps the backslash in "\s" from being read as a (invalid)
# string escape.
x = re.split(r"\s", txt)

print(x)

['The', 'rain', 'in', 'Spain']


In [6]:
txt = "The rain in Spain"

# Split only at the first white-space character.
# maxsplit is passed by keyword: it is clearer, and passing it positionally
# is deprecated in newer Python versions.
x = re.split(r"\s", txt, maxsplit=1)

print(x)

['The', 'rain in Spain']


## Sub

In [7]:
txt = "The rain in Spain"

# Replace every white-space character with the digit 9.
# Raw string keeps the backslash in "\s" from being read as a (invalid)
# string escape.
x = re.sub(r"\s", "9", txt)

print(x)

The9rain9in9Spain


In [8]:
txt = "The rain in Spain"

# Replace only the first two white-space characters with the digit 9.
# count is passed by keyword: it is clearer, and passing it positionally
# is deprecated in newer Python versions.
x = re.sub(r"\s", "9", txt, count=2)

print(x)

The9rain9in Spain


## Match Object
Do a search that will return a Match Object:

In [9]:
txt = "The rain in Spain"

# search() stops at the first occurrence and wraps it in a Match object.
x = re.compile("ai").search(txt)

print(x)

<re.Match object; span=(5, 7), match='ai'>


The Match object has properties and methods used to retrieve information about the search, and the result:

+ `.span()` returns a tuple containing the start and end positions of the match.
+ `.string` returns the string passed into the function.
+ `.group()` returns the part of the string where there was a match.

In [10]:
txt = "The rain in Spain"

# First word that begins with an upper-case "S".
x = re.compile(r"\bS\w+").search(txt)

# (start, end) positions of that match.
print(x.span())

(12, 17)


In [11]:
txt = "The rain in Spain"

# First word that begins with an upper-case "S".
x = re.compile(r"\bS\w+").search(txt)

# The full string that was handed to the search.
print(x.string)

The rain in Spain


In [12]:
txt = "The rain in Spain"

# First word that begins with an upper-case "S".
x = re.compile(r"\bS\w+").search(txt)

# The exact text that matched.
print(x.group())

Spain


## Metacharacters

Metacharacters are characters with a special meaning

## A set of characters - `[]`

In [13]:
txt = "The rain in Spain"

# Find every ASCII letter, upper or lower case.
# The range is written as A-Z plus a-z on purpose: "[A-z]" would also match
# the characters that sit between "Z" and "a" in ASCII ([ \ ] ^ _ and `).
x = re.findall("[A-Za-z]", txt)

print(x)

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']


In [14]:
txt = "The rain in Spain"

# Only lower-case letters a through z are collected.
x = re.compile("[a-z]").findall(txt)

print(x)

['h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'p', 'a', 'i', 'n']


## Signals a special sequence (can also be used to escape special characters) - `\`

**More details** in the next section.

In [15]:
txt = "That will be 59 dollars"

# Find all digit characters.
# Raw string keeps the backslash in "\d" from being read as a (invalid)
# string escape.
x = re.findall(r"\d", txt)

print(x)

['5', '9']


## Any character (except newline character) - `.`

In [16]:
txt = "hello world"

# Search for a sequence that starts with "he",
# followed by four (any) characters, and a "w":
x = re.findall("he....w", txt)

print(x)

['hello w']


In [17]:
# Match "he", then any two characters, then an "o".
x = re.compile("he..o").findall(txt)

print(x)

['hello']


## Starts with - `^`

In [18]:
txt = "hello world"

# "^" anchors the pattern to the start of the string, so the result list
# is non-empty only when the text begins with 'hello'.
x = re.compile("^hello").findall(txt)

print("Yes, the string starts with 'hello'" if x else "No match")

Yes, the string starts with 'hello'


## Ends with - `$`

In [19]:
txt = "hello world"

# "$" anchors the pattern to the end of the string, so the result list
# is non-empty only when the text finishes with 'world'.
x = re.compile("world$").findall(txt)

print("Yes, the string ends with 'world'" if x else "No match")

Yes, the string ends with 'world'


## Zero or more occurrences - `*`

In [20]:
txt = "The rain in Spain falls mainly in the plain!"

# "x*" allows zero or more "x" characters after "ai",
# so a bare "ai" already counts as a match.
x = re.compile("aix*").findall(txt)

print(x)

if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

['ai', 'ai', 'ai', 'ai']
Yes, there is at least one match!


## One or more occurrences - `+`

In [21]:
txt = "The rain in Spain falls mainly in the plain!"

# "x+" demands at least one "x" right after "ai";
# the sample text has none, so nothing is found.
x = re.compile("aix+").findall(txt)

print(x)

if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

[]
No match


In [22]:
# "ai" followed by one or more "n" characters.
x = re.compile("ain+").findall(txt)

print(x)

print("Yes, there is at least one match!" if x else "No match")

['ain', 'ain', 'ain', 'ain']
Yes, there is at least one match!


## Exactly the specified number of occurrences - `{}`

In [23]:
txt = "The rain in Spain falls mainly in the plain!"

# "l{2}" requires exactly two consecutive "l" characters after the "a".
x = re.compile("al{2}").findall(txt)

print(x)

print("Yes, there is at least one match!" if x else "No match")

['all']
Yes, there is at least one match!


## Either or - `|`

In [24]:
txt = "The rain in Spain falls mainly in the plain!"

# "|" separates alternatives: either word counts as a match.
x = re.compile("falls|stays").findall(txt)

print(x)

if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

['falls']
Yes, there is at least one match!


## Capture and group - `()`

# Special Sequences

A special sequence is a `\` followed by one of the characters in the list below, and has a special meaning.

## `\A`

Returns a match if the specified characters are at the beginning of the string

In [25]:
txt = "The rain in Spain"

# Check if the string starts with "The".
# Raw string keeps the backslash in "\A" from being read as a (invalid)
# string escape.
x = re.findall(r"\AThe", txt)

print(x)

if x:
    print("Yes, there is a match!")
else:
    print("No match")

['The']
Yes, there is a match!


In [26]:
txt = "The rain in Spain"

# Check if the string starts with "rain" (it does not, so no match).
# Raw string keeps the backslash in "\A" from being read as a (invalid)
# string escape.
x = re.findall(r"\Arain", txt)

print(x)

if x:
    print("Yes, there is a match!")
else:
    print("No match")

[]
No match


## `\b`

Returns a match where the specified characters are at the beginning or at the end of a word

In [27]:
txt = "The rain in Spain"

# Check if "Spa" is present at the beginning of a WORD:
x = re.findall(r"\bSpa", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['Spa']
Yes, there is at least one match!


In [28]:
txt = "The rain in Spain"

# Does any WORD begin with "ain"? ("\b" marks a word boundary.)
x = re.compile(r"\bain").findall(txt)

print(x)

print("Yes, there is at least one match!" if x else "No match")

[]
No match


In [29]:
# Does any WORD finish with "ain"? ("\b" marks a word boundary.)
x = re.compile(r"ain\b").findall(txt)

print(x)

print("Yes, there is at least one match!" if x else "No match")

['ain', 'ain']
Yes, there is at least one match!


In [30]:
# Check if "ra" is present at the end of a WORD:
x = re.findall(r"ra\b", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[]
No match


## `\B`

Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word

In [31]:
txt = "The rain in Spain"

# Find "ain" wherever it does NOT sit at the beginning of a word.
x = re.compile(r"\Bain").findall(txt)

print(x)

if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

['ain', 'ain']
Yes, there is at least one match!


In [32]:
# Find "ain" wherever it does NOT sit at the end of a word.
x = re.compile(r"ain\B").findall(txt)

print(x)

if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

[]
No match


## `\d`

Returns a match where the string contains digits (numbers from 0-9)

In [33]:
txt = "The rain in Spain"

# Check if the string contains
# any digits (numbers from 0-9).
# Raw string keeps the backslash in "\d" from being read as a (invalid)
# string escape.
x = re.findall(r"\d", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[]
No match


## `\D`

Returns a match at every character that is NOT a digit

In [34]:
txt = "The rain in Spain"

# Return a match at every non-digit character.
# Raw string keeps the backslash in "\D" from being read as a (invalid)
# string escape.
x = re.findall(r"\D", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', ' ', 'r', 'a', 'i', 'n', ' ', 'i', 'n', ' ', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


## `\s`

Returns a match where the string contains a white space character

In [35]:
# Return a match at every white-space character.
# Raw string keeps the backslash in "\s" from being read as a (invalid)
# string escape.
x = re.findall(r"\s", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[' ', ' ', ' ']
Yes, there is at least one match!


## `\S`

Returns a match at every character that is NOT a white space character

In [36]:
# Return a match at every NON white-space character.
# Raw string keeps the backslash in "\S" from being read as a (invalid)
# string escape.
x = re.findall(r"\S", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


## `\w`

Returns a match at every word character (letters a-z and A-Z, digits 0-9, and the underscore _ character)

In [37]:
txt = "The rain in Spain"

# Return a match at every word character
# (letters, digits from 0-9,
# and the underscore _ character).
# Raw string keeps the backslash in "\w" from being read as a (invalid)
# string escape.
x = re.findall(r"\w", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


## `\W`

Returns a match at every character that is NOT a word character

In [38]:
txt = "The rain in Spain"

# Return a match at every NON word character
# (anything that is not a letter, digit, or underscore,
# such as "!", "?", or white-space).
# Raw string keeps the backslash in "\W" from being read as a (invalid)
# string escape.
x = re.findall(r"\W", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[' ', ' ', ' ']
Yes, there is at least one match!


## `\Z`

Returns a match if the specified characters are at the end of the string	

In [39]:
txt = "The rain in Spain"

# Check if the string ends with "Spain".
# Raw string keeps the backslash in "\Z" from being read as a (invalid)
# string escape.
x = re.findall(r"Spain\Z", txt)

print(x)

if x:
    print("Yes, there is a match!")
else:
    print("No match")

['Spain']
Yes, there is a match!
