# Python RegEx
A Regular Expression (RegEx) is a sequence of characters that defines a search pattern.

In [2]:
# Example: check whether a string starts with 'a', ends with 's',
# and has exactly three characters in between (five in total).

import re

# '^' anchors the start, each '.' stands for one arbitrary character
# (except a newline), and '$' anchors the end.
pattern = '^a...s$'
test_string = 'abyss'

# re.match() gives back a match object on success and None on failure,
# so the result can be used directly as a truth value.
result = re.match(pattern, test_string)

print("Search successful." if result else "Search unsuccessful.")


Search successful.


### Specify Pattern Using RegEx

To specify regular expressions, metacharacters are used. In the above example, ^, . and $ are metacharacters.

Metacharacters are characters that are interpreted in a special way by a RegEx engine. Here's a list of metacharacters:

[] . ^ $ * + ? {} () \ |

In [3]:
# [] - Square brackets
# Square brackets describe a set of characters; any one of them may match.
# Note that re.match() only tries the pattern at the very start of the
# string, so this reports just the leading 't' of 'testing'.
print(re.match('[t]', 'testing test string'))

# . - Period
# Stands for any single character except a newline ('\n').
# Here '.at' picks out 'cat', 'sat' and 'mat'.
pattern = r'.at'
text = 'The cat sat on the mat'
matches = re.findall(pattern, text)
print(matches)

# The examples from here down to "Group" all search the same sentence,
# so it is assigned once and reused.
text = 'The quick brown fox jumps over the lazy dog'

# ^ - Caret
# Anchors the pattern to the start of the string.
# Only the leading 'The' is found.
pattern = r'^The'
matches = re.findall(pattern, text)
print(matches)

# $ - Dollar
# Anchors the pattern to the end of the string.
# Only the trailing 'dog' is found.
pattern = r'dog$'
matches = re.findall(pattern, text)
print(matches)

# * - Star
# Zero or more repetitions of the pattern to its left ('', 'a', 'aa', ...).
# Because zero repetitions count as a match, findall() also reports an
# empty string at every position that does not hold an 'a'.
pattern = r'a*'
matches = re.findall(pattern, text)
print(matches)

# + - Plus
# One or more repetitions of the pattern to its left ('a', 'aa', ...).
# An empty string never matches, so only the single 'a' is reported.
pattern = r'a+'
matches = re.findall(pattern, text)
print(matches)

# ? - Question mark
# Zero or one occurrence of the pattern to its left ('' or 'a').
# As with the star, positions without an 'a' produce empty matches.
pattern = r'a?'
matches = re.findall(pattern, text)
print(matches)

# {} - Braces
# {n,m} means at least n and at most m repetitions of the pattern to its left.
# 'a{1,6}' would accept a run of one to six 'a' characters; the sentence
# only contains a single 'a'.
pattern = r'a{1,6}'
matches = re.findall(pattern, text)
print(matches)

# | - Alternation
# The vertical bar works as an "or" between alternatives.
# Both 'The' and 'the' are found.
pattern = r'The|the'
matches = re.findall(pattern, text)
print(matches)

# () - Group
# Parentheses group sub-patterns; here: 'The' or 'the', followed by ' quick'.
# When a pattern contains a capturing group, findall() returns the text of
# the group rather than the whole match, hence 'The' and not 'The quick'.
pattern = r'(The|the) quick'
matches = re.findall(pattern, text)
print(matches)

# \ - Backslash
# Escapes a character, including every metacharacter, so that it is taken
# literally. Here '\.' finds the literal period at the end of the sentence.
pattern = r'\.'
text = 'The quick brown fox jumps over the lazy dog.'
matches = re.findall(pattern, text)
print(matches)

<re.Match object; span=(0, 1), match='t'>
['cat', 'sat', 'mat']
['The']
['dog']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'a', '', '', '', '', '', '', '']
['a']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'a', '', '', '', '', '', '', '']
['a']
['The', 'the']
['The']
['.']


### Special Sequences

Special sequences make commonly used patterns easier to write.

In [4]:
# \A - Matches only at the start of the string.
# Finds the leading 'The'.
pattern = r'\AThe'
text = 'The quick brown fox jumps over the lazy dog'
matches = re.findall(pattern, text)
print(matches)

# \b - Matches the empty string at a word boundary (start or end of a word).
# 'at\b' finds 'at' where it ends a word: once each in 'cat', 'sat', 'mat'.
pattern = r'at\b'
text = 'The cat sat on the mat'
matches = re.findall(pattern, text)
print(matches)

# \B - Opposite of \b: matches only where there is NO word boundary.
# 'at\B' needs 'at' to be followed by another word character. Every 'at'
# in this sentence ends its word, so the result is an empty list.
pattern = r'at\B'
matches = re.findall(pattern, text)
print(matches)

# The remaining character-class examples all search the same sentence.
text = 'The cat has 9 lives'

# \d - Matches any decimal digit.
# Same as [0-9] when the re.ASCII flag is used; without it, str patterns
# also accept decimal digits from other scripts.
pattern = r'\d'
matches = re.findall(pattern, text)
print(matches)

# \D - Matches any character that is not a decimal digit (opposite of \d).
# Returns every character of the sentence except '9'.
pattern = r'\D'
matches = re.findall(pattern, text)
print(matches)

# \s - Matches any whitespace character, e.g. [ \t\n\r\f\v].
# Returns the four spaces of the sentence.
pattern = r'\s'
matches = re.findall(pattern, text)
print(matches)

# \S - Matches any non-whitespace character (opposite of \s).
# Returns every character of the sentence except the spaces.
pattern = r'\S'
matches = re.findall(pattern, text)
print(matches)

# \w - Matches any word character: letters, digits and the underscore.
# Same as [a-zA-Z0-9_] when the re.ASCII flag is used; without it, str
# patterns also accept word characters from other scripts.
pattern = r'\w'
matches = re.findall(pattern, text)
print(matches)

# \W - Matches any character that is not a word character (opposite of \w).
# In this sentence only the spaces qualify.
pattern = r'\W'
matches = re.findall(pattern, text)
print(matches)

# \Z - Matches only at the end of the string.
# Finds the trailing 'dog'.
pattern = r'dog\Z'
text = 'The quick brown fox jumps over the lazy dog'
matches = re.findall(pattern, text)
print(matches)

['The']
['at', 'at', 'at']
[]
['9']
['T', 'h', 'e', ' ', 'c', 'a', 't', ' ', 'h', 'a', 's', ' ', ' ', 'l', 'i', 'v', 'e', 's']
[' ', ' ', ' ', ' ']
['T', 'h', 'e', 'c', 'a', 't', 'h', 'a', 's', '9', 'l', 'i', 'v', 'e', 's']
['T', 'h', 'e', 'c', 'a', 't', 'h', 'a', 's', '9', 'l', 'i', 'v', 'e', 's']
[' ', ' ', ' ', ' ']
['dog']


### RegEx Functions

In [5]:
# Every example in this cell works on the same sentence.
text = 'The cat has 9 lives'

# re.findall(pattern, string)
# Collects all non-overlapping matches into a list of strings.
pattern = r'\d+'
matches = re.findall(pattern, text)
print(matches)  # Output: ['9']

# re.split(pattern, string)
# Cuts the string at every match and returns the pieces as a list.
pattern = r'\s+'
splits = re.split(pattern, text)
print(splits)  # Output: ['The', 'cat', 'has', '9', 'lives']

# re.sub(pattern, replace, string)
# Returns a copy of the string in which every match is replaced by the
# content of the replace variable.
pattern = r'\d+'
replace = 'five'
new_text = re.sub(pattern, replace, text)
print(new_text)  # Output: 'The cat has five lives'

# re.search(pattern, string)
# Scans the string for the first location where the pattern matches and
# returns a match object for it. It returns None when nothing matches, so
# check the result before calling methods on it if a miss is possible.
pattern = r'cat'
match = re.search(pattern, text)
print(match.group())  # Output: 'cat'

# match.group()
# Returns the part of the string that was matched.
print(match.group())  # Output: 'cat'

# match.start(), match.end() and match.span()
# start() is the index where the matched substring begins, end() is the
# index just past its last character, and span() is the tuple (start, end).
print(match.start())  # Output: 4
print(match.end())  # Output: 7
print(match.span())  # Output: (4, 7)

['9']
['The', 'cat', 'has', '9', 'lives']
The cat has five lives
cat
cat
4
7
(4, 7)
